1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
| import numpy as np
| import pytest
|
| import pandas as pd
| from pandas import (
| DataFrame,
| Index,
| date_range,
| )
| import pandas._testing as tm
|
|
| @pytest.mark.parametrize("func", ["ffill", "bfill"])
| def test_groupby_column_index_name_lost_fill_funcs(func):
| # GH: 29764 groupby loses index sometimes
| df = DataFrame(
| [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
| columns=Index(["type", "a", "b"], name="idx"),
| )
| df_grouped = df.groupby(["type"])[["a", "b"]]
| result = getattr(df_grouped, func)().columns
| expected = Index(["a", "b"], name="idx")
| tm.assert_index_equal(result, expected)
|
|
| @pytest.mark.parametrize("func", ["ffill", "bfill"])
| def test_groupby_fill_duplicate_column_names(func):
| # GH: 25610 ValueError with duplicate column names
| df1 = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
| df2 = DataFrame({"field1": [1, np.nan, 4]})
| df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"])
| expected = DataFrame(
| [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
| )
| result = getattr(df_grouped, func)()
| tm.assert_frame_equal(result, expected)
|
|
| def test_ffill_missing_arguments():
| # GH 14955
| df = DataFrame({"a": [1, 2], "b": [1, 1]})
| with pytest.raises(ValueError, match="Must specify a fill"):
| df.groupby("b").fillna()
|
|
| @pytest.mark.parametrize(
| "method, expected", [("ffill", [None, "a", "a"]), ("bfill", ["a", "a", None])]
| )
| def test_fillna_with_string_dtype(method, expected):
| # GH 40250
| df = DataFrame({"a": pd.array([None, "a", None], dtype="string"), "b": [0, 0, 0]})
| grp = df.groupby("b")
| result = grp.fillna(method=method)
| expected = DataFrame({"a": pd.array(expected, dtype="string")})
| tm.assert_frame_equal(result, expected)
|
|
| def test_fill_consistency():
| # GH9221
| # pass thru keyword arguments to the generated wrapper
| # are set if the passed kw is None (only)
| df = DataFrame(
| index=pd.MultiIndex.from_product(
| [["value1", "value2"], date_range("2014-01-01", "2014-01-06")]
| ),
| columns=Index(["1", "2"], name="id"),
| )
| df["1"] = [
| np.nan,
| 1,
| np.nan,
| np.nan,
| 11,
| np.nan,
| np.nan,
| 2,
| np.nan,
| np.nan,
| 22,
| np.nan,
| ]
| df["2"] = [
| np.nan,
| 3,
| np.nan,
| np.nan,
| 33,
| np.nan,
| np.nan,
| 4,
| np.nan,
| np.nan,
| 44,
| np.nan,
| ]
|
| expected = df.groupby(level=0, axis=0).fillna(method="ffill")
| result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T
| tm.assert_frame_equal(result, expected)
|
|
| @pytest.mark.parametrize("method", ["ffill", "bfill"])
| @pytest.mark.parametrize("dropna", [True, False])
| @pytest.mark.parametrize("has_nan_group", [True, False])
| def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
| # GH 34725
|
| df_without_nan_rows = DataFrame([(1, 0.1), (2, 0.2)])
|
| ridx = [-1, 0, -1, -1, 1, -1]
| df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
|
| group_b = np.nan if has_nan_group else "b"
| df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)
|
| grouped = df.groupby(by="group_col", dropna=dropna)
| result = getattr(grouped, method)(limit=None)
|
| expected_rows = {
| ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
| ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
| ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
| ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
| ("bfill", True, True): [0, 0, -1, -1, -1, -1],
| ("bfill", True, False): [0, 0, -1, 1, 1, -1],
| ("bfill", False, True): [0, 0, -1, 1, 1, -1],
| ("bfill", False, False): [0, 0, -1, 1, 1, -1],
| }
|
| ridx = expected_rows.get((method, dropna, has_nan_group))
| expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
| # columns are a 'take' on df.columns, which are object dtype
| expected.columns = expected.columns.astype(object)
|
| tm.assert_frame_equal(result, expected)
|
|
| @pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
| @pytest.mark.parametrize("func", ["first", "last", "max", "min"])
| def test_min_count(func, min_count, value):
| # GH#37821
| df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3})
| result = getattr(df.groupby("a"), func)(min_count=min_count)
| expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
| tm.assert_frame_equal(result, expected)
|
|
| def test_indices_with_missing():
| # GH 9304
| df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
| g = df.groupby(["a", "b"])
| result = g.indices
| expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
| assert result == expected
|
|