""" Tests dtype specification during parsing for all of the parsers defined in parsers.py """ from io import StringIO import os import numpy as np import pytest from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import ( Categorical, DataFrame, Timestamp, ) import pandas._testing as tm xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @xfail_pyarrow @pytest.mark.parametrize( "dtype", [ "category", CategoricalDtype(), {"a": "category", "b": "category", "c": CategoricalDtype()}, ], ) def test_categorical_dtype(all_parsers, dtype): # see gh-10153 parser = all_parsers data = """a,b,c 1,a,3.4 1,a,3.4 2,b,4.5""" expected = DataFrame( { "a": Categorical(["1", "1", "2"]), "b": Categorical(["a", "a", "b"]), "c": Categorical(["3.4", "3.4", "4.5"]), } ) actual = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(actual, expected) @skip_pyarrow # Flaky @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 parser = all_parsers data = """a,b,c 1,a,3.4 1,a,3.4 2,b,4.5""" expected = DataFrame( {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} ) actual = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(actual, expected) @xfail_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers data = """a,b,c 1,b,3.4 1,b,3.4 2,a,4.5""" expected = DataFrame( { "a": Categorical(["1", "1", "2"]), "b": Categorical(["b", "b", "a"]), "c": Categorical(["3.4", "3.4", "4.5"]), } ) actual = parser.read_csv(StringIO(data), dtype="category") tm.assert_frame_equal(actual, expected) @xfail_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers data = """a,b,c 1,b,3.4 1,nan,3.4 2,a,4.5""" expected = DataFrame( { "a": Categorical(["1", "1", "2"]), "b": Categorical(["b", np.nan, "a"]), "c": Categorical(["3.4", "3.4", "4.5"]), } ) actual = parser.read_csv(StringIO(data), dtype="category") tm.assert_frame_equal(actual, expected) @xfail_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 parser = all_parsers data = np.sort([str(i) for i in range(524289)]) expected = DataFrame({"a": Categorical(data, ordered=True)}) actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") actual["a"] = actual["a"].cat.reorder_categories( np.sort(actual.a.cat.categories), ordered=True ) tm.assert_frame_equal(actual, expected) def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers encoding = "utf-16" sep = "\t" expected = parser.read_csv(pth, sep=sep, encoding=encoding) expected = expected.apply(Categorical) actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") tm.assert_frame_equal(actual, expected) @xfail_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers data = """a,b 1,a 1,b 1,b 2,c""" expecteds = [ DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), ] with parser.read_csv( StringIO(data), dtype={"b": "category"}, chunksize=2 ) as actuals: for actual, expected in zip(actuals, expecteds): tm.assert_frame_equal(actual, expected) @xfail_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers data = """a,b 1,a 1,b 1,b 2,c""" cats = ["a", "b", "c"] expecteds = [ DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), DataFrame( {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, index=[2, 3], ), ] dtype = CategoricalDtype(cats) with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: for actual, expected in zip(actuals, expecteds): tm.assert_frame_equal(actual, expected) def test_categorical_dtype_latin1(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "unicode_series.csv") parser = all_parsers encoding = "latin-1" expected = parser.read_csv(pth, header=None, encoding=encoding) expected[1] = Categorical(expected[1]) actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) tm.assert_frame_equal(actual, expected) @pytest.mark.parametrize("ordered", [False, True]) @pytest.mark.parametrize( "categories", [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], ) def test_categorical_category_dtype(all_parsers, categories, ordered): parser = all_parsers data = """a,b 1,a 1,b 1,b 2,c""" expected = DataFrame( { "a": [1, 1, 1, 2], "b": Categorical( ["a", "b", "b", "c"], categories=categories, ordered=ordered ), } ) dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) def test_categorical_category_dtype_unsorted(all_parsers): parser = all_parsers data = """a,b 1,a 1,b 1,b 2,c""" dtype = CategoricalDtype(["c", "b", "a"]) expected = DataFrame( { "a": [1, 1, 1, 2], "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), } ) result = parser.read_csv(StringIO(data), dtype={"b": dtype}) tm.assert_frame_equal(result, expected) def test_categorical_coerces_numeric(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype([1, 2, 3])} data = "b\n1\n1\n2\n3" expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) @skip_pyarrow # Flaky def test_categorical_coerces_datetime(all_parsers): parser = all_parsers dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) dtype = {"b": CategoricalDtype(dti)} data = "b\n2017-01-01\n2018-01-01\n2019-01-01" expected = DataFrame({"b": Categorical(dtype["b"].categories)}) result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) def test_categorical_coerces_timestamp(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype([Timestamp("2014")])} data = "b\n2014-01-01\n2014-01-01" expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} data = "b\n1H\n2H\n3H" expected = DataFrame({"b": Categorical(dtype["b"].categories)}) result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "data", [ "b\nTrue\nFalse\nNA\nFalse", "b\ntrue\nfalse\nNA\nfalse", "b\nTRUE\nFALSE\nNA\nFALSE", "b\nTrue\nFalse\nNA\nFALSE", ], ) def test_categorical_dtype_coerces_boolean(all_parsers, data): # see gh-20498 parser = all_parsers dtype = {"b": CategoricalDtype([False, True])} expected = DataFrame({"b": Categorical([True, False, None, False])}) result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) def test_categorical_unexpected_categories(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} data = "b\nd\na\nc\nd" # Unexpected c expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected)