from collections import abc
import email
from email.parser import Parser

import numpy as np
import pytest

from pandas import (
    CategoricalDtype,
    DataFrame,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm


class TestDataFrameToRecords:
    def test_to_records_timeseries(self):
        index = date_range("1/1/2000", periods=10)
        df = DataFrame(np.random.randn(10, 3), index=index, columns=["a", "b", "c"])

        result = df.to_records()
        assert result["index"].dtype == "M8[ns]"

        result = df.to_records(index=False)

    def test_to_records_dt64(self):
        df = DataFrame(
            [["one", "two", "three"], ["four", "five", "six"]],
            index=date_range("2012-01-01", "2012-01-02"),
        )

        expected = df.index.values[0]
        result = df.to_records()["index"][0]
        assert expected == result

    def test_to_records_dt64tz_column(self):
        # GH#32535 don't lose tz in to_records
        df = DataFrame({"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")})

        result = df.to_records()

        assert result.dtype["A"] == object
        val = result[0][1]
        assert isinstance(val, Timestamp)
        assert val == df.loc[0, "A"]

    def test_to_records_with_multindex(self):
        # GH#3189
        index = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        data = np.zeros((8, 4))
        df = DataFrame(data, index=index)

        r = df.to_records(index=True)["level_0"]
        assert "bar" in r
        assert "one" not in r

    def test_to_records_with_Mapping_type(self):
        abc.Mapping.register(email.message.Message)

        headers = Parser().parsestr(
            "From: <user@example.com>\n"
            "To: <someone_else@example.com>\n"
            "Subject: Test message\n"
            "\n"
            "Body would go here\n"
        )

        frame = DataFrame.from_records([headers])
        all(x in frame for x in ["Type", "Subject", "From"])

    def test_to_records_floats(self):
        df = DataFrame(np.random.rand(10, 10))
        df.to_records()

    def test_to_records_index_name(self):
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = "X"
        rs = df.to_records()
        assert "X" in rs.dtype.fields

        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert "index" in rs.dtype.fields

        df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
        df.index.names = ["A", None]
        result = df.to_records()
        expected = np.rec.fromarrays(
            [np.array(["a", "a", "b"]), np.array(["x", "y", "z"])]
            + [np.asarray(df.iloc[:, i]) for i in range(3)],
            dtype={
                "names": ["A", "level_1", "0", "1", "2"],
                "formats": [
                    "O",
                    "O",
                    f"{tm.ENDIAN}f8",
                    f"{tm.ENDIAN}f8",
                    f"{tm.ENDIAN}f8",
                ],
            },
        )
        tm.assert_numpy_array_equal(result, expected)

    def test_to_records_with_unicode_index(self):
        # GH#13172
        # unicode_literals conflict with to_records
        result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
        expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
        tm.assert_almost_equal(result, expected)

    def test_to_records_index_dtype(self):
        # GH 47263: consistent data types for Index and MultiIndex
        df = DataFrame(
            {
                1: date_range("2022-01-01", periods=2),
                2: date_range("2022-01-01", periods=2),
                3: date_range("2022-01-01", periods=2),
            }
        )

        expected = np.rec.array(
            [
                ("2022-01-01", "2022-01-01", "2022-01-01"),
                ("2022-01-02", "2022-01-02", "2022-01-02"),
            ],
            dtype=[
                ("1", f"{tm.ENDIAN}M8[ns]"),
                ("2", f"{tm.ENDIAN}M8[ns]"),
                ("3", f"{tm.ENDIAN}M8[ns]"),
            ],
        )

        result = df.to_records(index=False)
        tm.assert_almost_equal(result, expected)

        result = df.set_index(1).to_records(index=True)
        tm.assert_almost_equal(result, expected)

        result = df.set_index([1, 2]).to_records(index=True)
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_unicode_column_names(self):
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue GH#11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        result = DataFrame(data={"accented_name_é": [1.0]}).to_records()

        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        expected = np.rec.array(
            [(0, 1.0)],
            dtype={"names": ["index", "accented_name_é"], "formats": ["=i8", "=f8"]},
        )
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_categorical(self):
        # GH#8626

        # dict creation
        df = DataFrame({"A": list("abc")}, dtype="category")
        expected = Series(list("abc"), dtype="category", name="A")
        tm.assert_series_equal(df["A"], expected)

        # list-like creation
        df = DataFrame(list("abc"), dtype="category")
        expected = Series(list("abc"), dtype="category", name=0)
        tm.assert_series_equal(df[0], expected)

        # to record array
        # this coerces
        result = df.to_records()
        expected = np.rec.array(
            [(0, "a"), (1, "b"), (2, "c")], dtype=[("index", "=i8"), ("0", "O")]
        )
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "kwargs,expected",
        [
            # No dtypes --> default to array dtypes.
            (
                {},
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Should have no effect in this case.
            (
                {"index": True},
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Column dtype applied across the board. Index unaffected.
            (
                {"column_dtypes": f"{tm.ENDIAN}U4"},
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", f"{tm.ENDIAN}U4"),
                        ("B", f"{tm.ENDIAN}U4"),
                        ("C", f"{tm.ENDIAN}U4"),
                    ],
                ),
            ),
            # Index dtype applied across the board. Columns unaffected.
            (
                {"index_dtypes": f"{tm.ENDIAN}U1"},
                np.rec.array(
                    [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}U1"),
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Pass in a type instance.
            (
                {"column_dtypes": str},
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", f"{tm.ENDIAN}U"),
                        ("B", f"{tm.ENDIAN}U"),
                        ("C", f"{tm.ENDIAN}U"),
                    ],
                ),
            ),
            # Pass in a dtype instance.
            (
                {"column_dtypes": np.dtype("unicode")},
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", f"{tm.ENDIAN}U"),
                        ("B", f"{tm.ENDIAN}U"),
                        ("C", f"{tm.ENDIAN}U"),
                    ],
                ),
            ),
            # Pass in a dictionary (name-only).
            (
                {
                    "column_dtypes": {
                        "A": np.int8,
                        "B": np.float32,
                        "C": f"{tm.ENDIAN}U2",
                    }
                },
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", "i1"),
                        ("B", f"{tm.ENDIAN}f4"),
                        ("C", f"{tm.ENDIAN}U2"),
                    ],
                ),
            ),
            # Pass in a dictionary (indices-only).
            (
                {"index_dtypes": {0: "int16"}},
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[
                        ("index", "i2"),
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Ignore index mappings if index is not True.
            (
                {"index": False, "index_dtypes": f"{tm.ENDIAN}U2"},
                np.rec.array(
                    [(1, 0.2, "a"), (2, 1.5, "bc")],
                    dtype=[
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Non-existent names / indices in mapping should not error.
            (
                {"index_dtypes": {0: "int16", "not-there": "float32"}},
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[
                        ("index", "i2"),
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Names / indices not in mapping default to array dtype.
            (
                {"column_dtypes": {"A": np.int8, "B": np.float32}},
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", "i1"),
                        ("B", f"{tm.ENDIAN}f4"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Names / indices not in dtype mapping default to array dtype.
            (
                {"column_dtypes": {"A": np.dtype("int8"), "B": np.dtype("float32")}},
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", "i1"),
                        ("B", f"{tm.ENDIAN}f4"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Mixture of everything.
            (
                {
                    "column_dtypes": {"A": np.int8, "B": np.float32},
                    "index_dtypes": f"{tm.ENDIAN}U2",
                },
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}U2"),
                        ("A", "i1"),
                        ("B", f"{tm.ENDIAN}f4"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Invalid dtype values.
            (
                {"index": False, "column_dtypes": []},
                (ValueError, "Invalid dtype \\[\\] specified for column A"),
            ),
            (
                {"index": False, "column_dtypes": {"A": "int32", "B": 5}},
                (ValueError, "Invalid dtype 5 specified for column B"),
            ),
            # Numpy can't handle EA types, so check error is raised
            (
                {
                    "index": False,
                    "column_dtypes": {"A": "int32", "B": CategoricalDtype(["a", "b"])},
                },
                (ValueError, "Invalid dtype category specified for column B"),
            ),
            # Check that bad types raise
            (
                {"index": False, "column_dtypes": {"A": "int32", "B": "foo"}},
                (TypeError, "data type [\"']foo[\"'] not understood"),
            ),
        ],
    )
    def test_to_records_dtype(self, kwargs, expected):
        # see GH#18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if not isinstance(expected, np.recarray):
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "df,kwargs,expected",
        [
            # MultiIndex in the index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("abc")
                ).set_index(["a", "b"]),
                {"column_dtypes": "float64", "index_dtypes": {0: "int32", 1: "int8"}},
                np.rec.array(
                    [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                    dtype=[
                        ("a", f"{tm.ENDIAN}i4"),
                        ("b", "i1"),
                        ("c", f"{tm.ENDIAN}f8"),
                    ],
                ),
            ),
            # MultiIndex in the columns.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples(
                        [("a", "d"), ("b", "e"), ("c", "f")]
                    ),
                ),
                {
                    "column_dtypes": {0: f"{tm.ENDIAN}U1", 2: "float32"},
                    "index_dtypes": "float32",
                },
                np.rec.array(
                    [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0), (2.0, "7", 8, 9.0)],
                    dtype=[
                        ("index", f"{tm.ENDIAN}f4"),
                        ("('a', 'd')", f"{tm.ENDIAN}U1"),
                        ("('b', 'e')", f"{tm.ENDIAN}i8"),
                        ("('c', 'f')", f"{tm.ENDIAN}f4"),
                    ],
                ),
            ),
            # MultiIndex in both the columns and index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples(
                        [("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")
                    ),
                    index=MultiIndex.from_tuples(
                        [("d", -4), ("d", -5), ("f", -6)], names=list("cd")
                    ),
                ),
                {
                    "column_dtypes": "float64",
                    "index_dtypes": {0: f"{tm.ENDIAN}U2", 1: "int8"},
                },
                np.rec.array(
                    [
                        ("d", -4, 1.0, 2.0, 3.0),
                        ("d", -5, 4.0, 5.0, 6.0),
                        ("f", -6, 7, 8, 9.0),
                    ],
                    dtype=[
                        ("c", f"{tm.ENDIAN}U2"),
                        ("d", "i1"),
                        ("('a', 'd')", f"{tm.ENDIAN}f8"),
                        ("('b', 'e')", f"{tm.ENDIAN}f8"),
                        ("('c', 'f')", f"{tm.ENDIAN}f8"),
                    ],
                ),
            ),
        ],
    )
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        # see GH#18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)

    def test_to_records_dict_like(self):
        # see GH#18146
        class DictLike:
            def __init__(self, **kwargs) -> None:
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d.__getitem__(key)

            def __contains__(self, key) -> bool:
                return key in self.d

            def keys(self):
                return self.d.keys()

        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        dtype_mappings = {
            "column_dtypes": DictLike(**{"A": np.int8, "B": np.float32}),
            "index_dtypes": f"{tm.ENDIAN}U2",
        }

        result = df.to_records(**dtype_mappings)
        expected = np.rec.array(
            [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
            dtype=[
                ("index", f"{tm.ENDIAN}U2"),
                ("A", "i1"),
                ("B", f"{tm.ENDIAN}f4"),
                ("C", "O"),
            ],
        )
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
    def test_to_records_datetimeindex_with_tz(self, tz):
        # GH#13937
        dr = date_range("2016-01-01", periods=10, freq="S", tz=tz)

        df = DataFrame({"datetime": dr}, index=dr)

        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)