1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
| import re
|
| import numpy as np
| import pytest
|
| from pandas import (
| DataFrame,
| Series,
| date_range,
| )
| import pandas._testing as tm
|
|
@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
def test_duplicated_with_misspelled_column_name(subset):
    """``duplicated`` raises ``KeyError`` naming the labels missing from the frame.

    The frame only has upper-case columns, so the lower-case ``"a"`` in every
    parametrized ``subset`` is the one label that cannot be found.
    """
    # GH 19730
    df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
    # Match only up to ``dtype=`` so the check does not depend on how the
    # Index dtype is rendered (e.g. 'object' vs. a dedicated string dtype).
    msg = re.escape("Index(['a'], dtype=")

    with pytest.raises(KeyError, match=msg):
        df.duplicated(subset)
|
|
@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes():
    """``duplicated`` completes on a 100 x 30000 frame and returns a bool Series."""
    # gh-21524
    # Given the wide dataframe with a lot of columns
    # with different (important!) values
    # A single seeded generator keeps the data reproducible while still
    # producing different values for every column.
    rng = np.random.default_rng(2)
    data = {f"col_{i:02d}": rng.integers(0, 1000, 30000) for i in range(100)}
    df = DataFrame(data).T
    result = df.duplicated()

    # Then duplicated produces a bool Series as a result and doesn't fail
    # during calculation. The actual values don't matter here, though usually
    # they are all False in this case
    assert isinstance(result, Series)
    assert result.dtype == np.bool_
|
|
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_keep(keep, expected):
    """Check the duplicate mask produced for each ``keep`` option.

    Rows 0/4 and rows 1/2 of the frame are identical; row 3 is unique, so it
    is never flagged regardless of ``keep``.
    """
    df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]})

    result = df.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
|
|
# ``raises=AssertionError`` limits the expected failure to the value mismatch
# reported by ``assert_series_equal``; any other exception is a real failure.
@pytest.mark.xfail(
    reason="GH#21720; nan/None falsely considered equal",
    raises=AssertionError,
)
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_nan_none(keep, expected):
    """Expected masks when ``np.nan`` and ``None`` count as distinct values.

    The expected Series never flag row 3 (``None``) as a duplicate of the
    ``np.nan`` rows 0 and 4. Marked xfail; see the reason on the marker.
    """
    df = DataFrame({"C": [np.nan, 3, 3, None, np.nan], "x": 1}, dtype=object)

    result = df.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
|
|
@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"])
def test_duplicated_subset(subset, keep):
    """``duplicated(subset=...)`` matches ``duplicated()`` on the selected columns.

    ``subset`` is handed to ``duplicated`` exactly as parametrized (``None``,
    a list of labels, or a single label) so that each accepted form is
    exercised; only the column selection for ``expected`` is normalized.

    NOTE(review): ``keep`` is not defined in this file - presumably a shared
    fixture supplying the ``keep`` values; confirm in the test configuration.
    """
    df = DataFrame(
        {
            "A": [0, 1, 1, 2, 0],
            "B": ["a", "b", "b", "c", "a"],
            "C": [np.nan, 3, 3, None, np.nan],
        }
    )

    if subset is None:
        columns = list(df.columns)
    elif isinstance(subset, str):
        # need to have a DataFrame, not a Series
        # -> select columns with singleton list, not string
        columns = [subset]
    else:
        columns = subset

    expected = df[columns].duplicated(keep=keep)
    result = df.duplicated(keep=keep, subset=subset)
    tm.assert_series_equal(result, expected)
|
|
def test_duplicated_on_empty_frame():
    """``duplicated`` on a frame without rows yields an empty boolean mask."""
    # GH 25184

    df = DataFrame(columns=["a", "b"])
    dupes = df.duplicated("a")

    # Check the mask itself, not only its effect when used for indexing.
    assert dupes.dtype == np.bool_
    assert len(dupes) == 0

    result = df[dupes]
    expected = df.copy()
    tm.assert_frame_equal(result, expected)
|
|
def test_frame_datetime64_duplicated():
    """Distinct daily dates are never reported as duplicates.

    Checked both through ``DataFrame.duplicated`` on a column subset and
    through ``Series.duplicated`` on the date column alone.
    """
    dates = date_range("2010-07-01", end="2010-08-05")

    tst = DataFrame({"symbol": "AAA", "date": dates})
    result = tst.duplicated(["date", "symbol"])
    # ``~`` is the logical negation of a boolean mask; unary minus is an
    # arithmetic operator and is rejected by NumPy for boolean arrays.
    assert (~result).all()

    tst = DataFrame({"date": dates})
    # Select the column by key rather than by attribute access.
    result = tst["date"].duplicated()
    assert (~result).all()
|
|