import numpy as np import pytest from pandas.core.dtypes.dtypes import ExtensionDtype import pandas as pd from pandas import ( DataFrame, Timestamp, ) import pandas._testing as tm from pandas.core.arrays import ExtensionArray class DummyDtype(ExtensionDtype): type = int def __init__(self, numeric) -> None: self._numeric = numeric @property def name(self): return "Dummy" @property def _is_numeric(self): return self._numeric class DummyArray(ExtensionArray): def __init__(self, data, dtype) -> None: self.data = data self._dtype = dtype def __array__(self, dtype): return self.data @property def dtype(self): return self._dtype def __len__(self) -> int: return len(self.data) def __getitem__(self, item): pass def copy(self): return self class TestSelectDtypes: def test_select_dtypes_include_using_list_like(self): df = DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.Categorical(list("abc")), "g": pd.date_range("20130101", periods=3), "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), "i": pd.date_range("20130101", periods=3, tz="CET"), "j": pd.period_range("2013-01", periods=3, freq="M"), "k": pd.timedelta_range("1 day", periods=3), } ) ri = df.select_dtypes(include=[np.number]) ei = df[["b", "c", "d", "k"]] tm.assert_frame_equal(ri, ei) ri = df.select_dtypes(include=[np.number], exclude=["timedelta"]) ei = df[["b", "c", "d"]] tm.assert_frame_equal(ri, ei) ri = df.select_dtypes(include=[np.number, "category"], exclude=["timedelta"]) ei = df[["b", "c", "d", "f"]] tm.assert_frame_equal(ri, ei) ri = df.select_dtypes(include=["datetime"]) ei = df[["g"]] tm.assert_frame_equal(ri, ei) ri = df.select_dtypes(include=["datetime64"]) ei = df[["g"]] tm.assert_frame_equal(ri, ei) ri = df.select_dtypes(include=["datetimetz"]) ei = df[["h", "i"]] tm.assert_frame_equal(ri, ei) with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include=["period"]) def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], } ) re = df.select_dtypes(exclude=[np.number]) ee = df[["a", "e"]] tm.assert_frame_equal(re, ee) def test_select_dtypes_exclude_include_using_list_like(self): df = DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6, dtype="u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.date_range("now", periods=3).values, } ) exclude = (np.datetime64,) include = np.bool_, "integer" r = df.select_dtypes(include=include, exclude=exclude) e = df[["b", "c", "e"]] tm.assert_frame_equal(r, e) exclude = ("datetime",) include = "bool", "int64", "int32" r = df.select_dtypes(include=include, exclude=exclude) e = df[["b", "e"]] tm.assert_frame_equal(r, e) @pytest.mark.parametrize( "include", [(np.bool_, "int"), (np.bool_, "integer"), ("bool", int)] ) def test_select_dtypes_exclude_include_int(self, include): # Fix select_dtypes(include='int') for Windows, FYI #36596 df = DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6, dtype="int32"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.date_range("now", periods=3).values, } ) exclude = (np.datetime64,) result = df.select_dtypes(include=include, exclude=exclude) expected = df[["b", "c", "e"]] tm.assert_frame_equal(result, expected) def test_select_dtypes_include_using_scalars(self): df = DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.Categorical(list("abc")), "g": pd.date_range("20130101", periods=3), "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), "i": pd.date_range("20130101", periods=3, tz="CET"), "j": pd.period_range("2013-01", periods=3, freq="M"), "k": pd.timedelta_range("1 day", periods=3), } ) ri = df.select_dtypes(include=np.number) ei = df[["b", "c", "d", "k"]] tm.assert_frame_equal(ri, ei) ri = df.select_dtypes(include="datetime") ei = df[["g"]] tm.assert_frame_equal(ri, ei) ri = df.select_dtypes(include="datetime64") ei = df[["g"]] tm.assert_frame_equal(ri, ei) ri = df.select_dtypes(include="category") ei = df[["f"]] tm.assert_frame_equal(ri, ei) with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include="period") def test_select_dtypes_exclude_using_scalars(self): df = DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.Categorical(list("abc")), "g": pd.date_range("20130101", periods=3), "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), "i": pd.date_range("20130101", periods=3, tz="CET"), "j": pd.period_range("2013-01", periods=3, freq="M"), "k": pd.timedelta_range("1 day", periods=3), } ) ri = df.select_dtypes(exclude=np.number) ei = df[["a", "e", "f", "g", "h", "i", "j"]] tm.assert_frame_equal(ri, ei) ri = df.select_dtypes(exclude="category") ei = df[["a", "b", "c", "d", "e", "g", "h", "i", "j", "k"]] tm.assert_frame_equal(ri, ei) with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(exclude="period") def test_select_dtypes_include_exclude_using_scalars(self): df = DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.Categorical(list("abc")), "g": pd.date_range("20130101", periods=3), "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), "i": pd.date_range("20130101", periods=3, tz="CET"), "j": pd.period_range("2013-01", periods=3, freq="M"), "k": pd.timedelta_range("1 day", periods=3), } ) ri = df.select_dtypes(include=np.number, exclude="floating") ei = df[["b", "c", "k"]] tm.assert_frame_equal(ri, ei) def test_select_dtypes_include_exclude_mixed_scalars_lists(self): df = DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.Categorical(list("abc")), "g": pd.date_range("20130101", periods=3), "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), "i": pd.date_range("20130101", periods=3, tz="CET"), "j": pd.period_range("2013-01", periods=3, freq="M"), "k": pd.timedelta_range("1 day", periods=3), } ) ri = df.select_dtypes(include=np.number, exclude=["floating", "timedelta"]) ei = df[["b", "c"]] tm.assert_frame_equal(ri, ei) ri = df.select_dtypes(include=[np.number, "category"], exclude="floating") ei = df[["b", "c", "f", "k"]] tm.assert_frame_equal(ri, ei) def test_select_dtypes_duplicate_columns(self): # GH20839 df = DataFrame( { "a": ["a", "b", "c"], "b": [1, 2, 3], "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.date_range("now", periods=3).values, } ) df.columns = ["a", "a", "b", "b", "b", "c"] expected = DataFrame( {"a": list(range(1, 4)), "b": np.arange(3, 6).astype("u1")} ) result = df.select_dtypes(include=[np.number], exclude=["floating"]) tm.assert_frame_equal(result, expected) def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): df = DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.date_range("now", periods=3).values, } ) df["g"] = df.f.diff() assert not hasattr(np, "u8") r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) e = df[["a", "b"]] tm.assert_frame_equal(r, e) r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) e = df[["a", "b", "g"]] tm.assert_frame_equal(r, e) def test_select_dtypes_empty(self): df = DataFrame({"a": list("abc"), "b": list(range(1, 4))}) msg = "at least one of include or exclude must be nonempty" with pytest.raises(ValueError, match=msg): df.select_dtypes() def test_select_dtypes_bad_datetime64(self): df = DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.date_range("now", periods=3).values, } ) with pytest.raises(ValueError, match=".+ is too specific"): df.select_dtypes(include=["datetime64[D]"]) with pytest.raises(ValueError, match=".+ is too specific"): df.select_dtypes(exclude=["datetime64[as]"]) def test_select_dtypes_datetime_with_tz(self): df2 = DataFrame( { "A": Timestamp("20130102", tz="US/Eastern"), "B": Timestamp("20130603", tz="CET"), }, index=range(5), ) df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) result = df3.select_dtypes(include=["datetime64[ns]"]) expected = df3.reindex(columns=[]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "dtype", [str, "str", np.string_, "S1", "unicode", np.unicode_, "U1"] ) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg): df = DataFrame( { "a": list("abc"), "g": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.date_range("now", periods=3).values, } ) msg = "string dtypes are not allowed" kwargs = {arg: [dtype]} with pytest.raises(TypeError, match=msg): df.select_dtypes(**kwargs) def test_select_dtypes_bad_arg_raises(self): df = DataFrame( { "a": list("abc"), "g": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.date_range("now", periods=3).values, } ) msg = "data type.*not understood" with pytest.raises(TypeError, match=msg): df.select_dtypes(["blargy, blarg, blarg"]) def test_select_dtypes_typecodes(self): # GH 11990 df = tm.makeCustomDataframe(30, 3, data_gen_f=lambda x, y: np.random.random()) expected = df FLOAT_TYPES = list(np.typecodes["AllFloat"]) tm.assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected) @pytest.mark.parametrize( "arr,expected", ( (np.array([1, 2], dtype=np.int32), True), (pd.array([1, 2], dtype="Int32"), True), (DummyArray([1, 2], dtype=DummyDtype(numeric=True)), True), (DummyArray([1, 2], dtype=DummyDtype(numeric=False)), False), ), ) def test_select_dtypes_numeric(self, arr, expected): # GH 35340 df = DataFrame(arr) is_selected = df.select_dtypes(np.number).shape == df.shape assert is_selected == expected def test_select_dtypes_numeric_nullable_string(self, nullable_string_dtype): arr = pd.array(["a", "b"], dtype=nullable_string_dtype) df = DataFrame(arr) is_selected = df.select_dtypes(np.number).shape == df.shape assert not is_selected @pytest.mark.parametrize( "expected, float_dtypes", [ [ DataFrame( {"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)} ).astype(dtype={"A": float, "B": np.float64, "C": np.float32}), float, ], [ DataFrame( {"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)} ).astype(dtype={"A": float, "B": np.float64, "C": np.float32}), "float", ], [DataFrame({"C": range(10, 7, -1)}, dtype=np.float32), np.float32], [ DataFrame({"A": range(3), "B": range(5, 8)}).astype( dtype={"A": float, "B": np.float64} ), np.float64, ], ], ) def test_select_dtypes_float_dtype(self, expected, float_dtypes): # GH#42452 dtype_dict = {"A": float, "B": np.float64, "C": np.float32} df = DataFrame( {"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)}, ) df = df.astype(dtype_dict) result = df.select_dtypes(include=float_dtypes) tm.assert_frame_equal(result, expected) def test_np_bool_ea_boolean_include_number(self): # GH 46870 df = DataFrame( { "a": [1, 2, 3], "b": pd.Series([True, False, True], dtype="boolean"), "c": np.array([True, False, True]), "d": pd.Categorical([True, False, True]), "e": pd.arrays.SparseArray([True, False, True]), } ) result = df.select_dtypes(include="number") expected = DataFrame({"a": [1, 2, 3]}) tm.assert_frame_equal(result, expected) def test_select_dtypes_no_view(self): # https://github.com/pandas-dev/pandas/issues/48090 # result of this method is not a view on the original dataframe df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() result = df.select_dtypes(include=["number"]) result.iloc[0, 0] = 0 tm.assert_frame_equal(df, df_orig)