# fume-manage-python.git

from collections import abc
import email
from email.parser import Parser
 
import numpy as np
import pytest
 
from pandas import (
    CategoricalDtype,
    DataFrame,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm
 
 
class TestDataFrameToRecords:
    def test_to_records_timeseries(self):
        index = date_range("1/1/2000", periods=10)
        df = DataFrame(np.random.randn(10, 3), index=index, columns=["a", "b", "c"])
 
        result = df.to_records()
        assert result["index"].dtype == "M8[ns]"
 
        result = df.to_records(index=False)
 
    def test_to_records_dt64(self):
        df = DataFrame(
            [["one", "two", "three"], ["four", "five", "six"]],
            index=date_range("2012-01-01", "2012-01-02"),
        )
 
        expected = df.index.values[0]
        result = df.to_records()["index"][0]
        assert expected == result
 
    def test_to_records_dt64tz_column(self):
        # GH#32535 dont less tz in to_records
        df = DataFrame({"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")})
 
        result = df.to_records()
 
        assert result.dtype["A"] == object
        val = result[0][1]
        assert isinstance(val, Timestamp)
        assert val == df.loc[0, "A"]
 
    def test_to_records_with_multindex(self):
        # GH#3189
        index = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        data = np.zeros((8, 4))
        df = DataFrame(data, index=index)
        r = df.to_records(index=True)["level_0"]
        assert "bar" in r
        assert "one" not in r
 
    def test_to_records_with_Mapping_type(self):
        abc.Mapping.register(email.message.Message)
 
        headers = Parser().parsestr(
            "From: <user@example.com>\n"
            "To: <someone_else@example.com>\n"
            "Subject: Test message\n"
            "\n"
            "Body would go here\n"
        )
 
        frame = DataFrame.from_records([headers])
        all(x in frame for x in ["Type", "Subject", "From"])
 
    def test_to_records_floats(self):
        df = DataFrame(np.random.rand(10, 10))
        df.to_records()
 
    def test_to_records_index_name(self):
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = "X"
        rs = df.to_records()
        assert "X" in rs.dtype.fields
 
        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert "index" in rs.dtype.fields
 
        df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
        df.index.names = ["A", None]
        result = df.to_records()
        expected = np.rec.fromarrays(
            [np.array(["a", "a", "b"]), np.array(["x", "y", "z"])]
            + [np.asarray(df.iloc[:, i]) for i in range(3)],
            dtype={
                "names": ["A", "level_1", "0", "1", "2"],
                "formats": [
                    "O",
                    "O",
                    f"{tm.ENDIAN}f8",
                    f"{tm.ENDIAN}f8",
                    f"{tm.ENDIAN}f8",
                ],
            },
        )
        tm.assert_numpy_array_equal(result, expected)
 
    def test_to_records_with_unicode_index(self):
        # GH#13172
        # unicode_literals conflict with to_records
        result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
        expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
        tm.assert_almost_equal(result, expected)
 
    def test_to_records_index_dtype(self):
        # GH 47263: consistent data types for Index and MultiIndex
        df = DataFrame(
            {
                1: date_range("2022-01-01", periods=2),
                2: date_range("2022-01-01", periods=2),
                3: date_range("2022-01-01", periods=2),
            }
        )
 
        expected = np.rec.array(
            [
                ("2022-01-01", "2022-01-01", "2022-01-01"),
                ("2022-01-02", "2022-01-02", "2022-01-02"),
            ],
            dtype=[
                ("1", f"{tm.ENDIAN}M8[ns]"),
                ("2", f"{tm.ENDIAN}M8[ns]"),
                ("3", f"{tm.ENDIAN}M8[ns]"),
            ],
        )
 
        result = df.to_records(index=False)
        tm.assert_almost_equal(result, expected)
 
        result = df.set_index(1).to_records(index=True)
        tm.assert_almost_equal(result, expected)
 
        result = df.set_index([1, 2]).to_records(index=True)
        tm.assert_almost_equal(result, expected)
 
    def test_to_records_with_unicode_column_names(self):
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue GH#11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        result = DataFrame(data={"accented_name_é": [1.0]}).to_records()
 
        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        expected = np.rec.array(
            [(0, 1.0)],
            dtype={"names": ["index", "accented_name_é"], "formats": ["=i8", "=f8"]},
        )
        tm.assert_almost_equal(result, expected)
 
    def test_to_records_with_categorical(self):
        # GH#8626
 
        # dict creation
        df = DataFrame({"A": list("abc")}, dtype="category")
        expected = Series(list("abc"), dtype="category", name="A")
        tm.assert_series_equal(df["A"], expected)
 
        # list-like creation
        df = DataFrame(list("abc"), dtype="category")
        expected = Series(list("abc"), dtype="category", name=0)
        tm.assert_series_equal(df[0], expected)
 
        # to record array
        # this coerces
        result = df.to_records()
        expected = np.rec.array(
            [(0, "a"), (1, "b"), (2, "c")], dtype=[("index", "=i8"), ("0", "O")]
        )
        tm.assert_almost_equal(result, expected)
 
    @pytest.mark.parametrize(
        "kwargs,expected",
        [
            # No dtypes --> default to array dtypes.
            (
                {},
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Should have no effect in this case.
            (
                {"index": True},
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Column dtype applied across the board. Index unaffected.
            (
                {"column_dtypes": f"{tm.ENDIAN}U4"},
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", f"{tm.ENDIAN}U4"),
                        ("B", f"{tm.ENDIAN}U4"),
                        ("C", f"{tm.ENDIAN}U4"),
                    ],
                ),
            ),
            # Index dtype applied across the board. Columns unaffected.
            (
                {"index_dtypes": f"{tm.ENDIAN}U1"},
                np.rec.array(
                    [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}U1"),
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Pass in a type instance.
            (
                {"column_dtypes": str},
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", f"{tm.ENDIAN}U"),
                        ("B", f"{tm.ENDIAN}U"),
                        ("C", f"{tm.ENDIAN}U"),
                    ],
                ),
            ),
            # Pass in a dtype instance.
            (
                {"column_dtypes": np.dtype("unicode")},
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", f"{tm.ENDIAN}U"),
                        ("B", f"{tm.ENDIAN}U"),
                        ("C", f"{tm.ENDIAN}U"),
                    ],
                ),
            ),
            # Pass in a dictionary (name-only).
            (
                {
                    "column_dtypes": {
                        "A": np.int8,
                        "B": np.float32,
                        "C": f"{tm.ENDIAN}U2",
                    }
                },
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", "i1"),
                        ("B", f"{tm.ENDIAN}f4"),
                        ("C", f"{tm.ENDIAN}U2"),
                    ],
                ),
            ),
            # Pass in a dictionary (indices-only).
            (
                {"index_dtypes": {0: "int16"}},
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[
                        ("index", "i2"),
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Ignore index mappings if index is not True.
            (
                {"index": False, "index_dtypes": f"{tm.ENDIAN}U2"},
                np.rec.array(
                    [(1, 0.2, "a"), (2, 1.5, "bc")],
                    dtype=[
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Non-existent names / indices in mapping should not error.
            (
                {"index_dtypes": {0: "int16", "not-there": "float32"}},
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[
                        ("index", "i2"),
                        ("A", f"{tm.ENDIAN}i8"),
                        ("B", f"{tm.ENDIAN}f8"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Names / indices not in mapping default to array dtype.
            (
                {"column_dtypes": {"A": np.int8, "B": np.float32}},
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", "i1"),
                        ("B", f"{tm.ENDIAN}f4"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Names / indices not in dtype mapping default to array dtype.
            (
                {"column_dtypes": {"A": np.dtype("int8"), "B": np.dtype("float32")}},
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}i8"),
                        ("A", "i1"),
                        ("B", f"{tm.ENDIAN}f4"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Mixture of everything.
            (
                {
                    "column_dtypes": {"A": np.int8, "B": np.float32},
                    "index_dtypes": f"{tm.ENDIAN}U2",
                },
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[
                        ("index", f"{tm.ENDIAN}U2"),
                        ("A", "i1"),
                        ("B", f"{tm.ENDIAN}f4"),
                        ("C", "O"),
                    ],
                ),
            ),
            # Invalid dtype values.
            (
                {"index": False, "column_dtypes": []},
                (ValueError, "Invalid dtype \\[\\] specified for column A"),
            ),
            (
                {"index": False, "column_dtypes": {"A": "int32", "B": 5}},
                (ValueError, "Invalid dtype 5 specified for column B"),
            ),
            # Numpy can't handle EA types, so check error is raised
            (
                {
                    "index": False,
                    "column_dtypes": {"A": "int32", "B": CategoricalDtype(["a", "b"])},
                },
                (ValueError, "Invalid dtype category specified for column B"),
            ),
            # Check that bad types raise
            (
                {"index": False, "column_dtypes": {"A": "int32", "B": "foo"}},
                (TypeError, "data type [\"']foo[\"'] not understood"),
            ),
        ],
    )
    def test_to_records_dtype(self, kwargs, expected):
        """Check ``to_records`` against each parametrized ``kwargs``/``expected`` pair.

        ``expected`` is either the ``np.recarray`` that the call must produce,
        or an ``(exception type, match pattern)`` tuple for keyword
        combinations that must raise.
        """
        # see GH#18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        # Anything that is not a recarray is an (exception, pattern) pair.
        if not isinstance(expected, np.recarray):
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)
 
    @pytest.mark.parametrize(
        "df,kwargs,expected",
        [
            # MultiIndex in the index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("abc")
                ).set_index(["a", "b"]),
                {"column_dtypes": "float64", "index_dtypes": {0: "int32", 1: "int8"}},
                np.rec.array(
                    [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                    dtype=[
                        ("a", f"{tm.ENDIAN}i4"),
                        ("b", "i1"),
                        ("c", f"{tm.ENDIAN}f8"),
                    ],
                ),
            ),
            # MultiIndex in the columns.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples(
                        [("a", "d"), ("b", "e"), ("c", "f")]
                    ),
                ),
                {
                    "column_dtypes": {0: f"{tm.ENDIAN}U1", 2: "float32"},
                    "index_dtypes": "float32",
                },
                np.rec.array(
                    [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0), (2.0, "7", 8, 9.0)],
                    dtype=[
                        ("index", f"{tm.ENDIAN}f4"),
                        ("('a', 'd')", f"{tm.ENDIAN}U1"),
                        ("('b', 'e')", f"{tm.ENDIAN}i8"),
                        ("('c', 'f')", f"{tm.ENDIAN}f4"),
                    ],
                ),
            ),
            # MultiIndex in both the columns and index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples(
                        [("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")
                    ),
                    index=MultiIndex.from_tuples(
                        [("d", -4), ("d", -5), ("f", -6)], names=list("cd")
                    ),
                ),
                {
                    "column_dtypes": "float64",
                    "index_dtypes": {0: f"{tm.ENDIAN}U2", 1: "int8"},
                },
                np.rec.array(
                    [
                        ("d", -4, 1.0, 2.0, 3.0),
                        ("d", -5, 4.0, 5.0, 6.0),
                        ("f", -6, 7, 8, 9.0),
                    ],
                    dtype=[
                        ("c", f"{tm.ENDIAN}U2"),
                        ("d", "i1"),
                        ("('a', 'd')", f"{tm.ENDIAN}f8"),
                        ("('b', 'e')", f"{tm.ENDIAN}f8"),
                        ("('c', 'f')", f"{tm.ENDIAN}f8"),
                    ],
                ),
            ),
        ],
    )
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        """Check dtype keyword handling when the index and/or columns are a MultiIndex.

        Each case supplies the frame, the keyword arguments for
        ``to_records`` and the record array the call must produce.
        """
        # see GH#18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)
 
    def test_to_records_dict_like(self):
        # see GH#18146
        class DictLike:
            def __init__(self, **kwargs) -> None:
                self.d = kwargs.copy()
 
            def __getitem__(self, key):
                return self.d.__getitem__(key)
 
            def __contains__(self, key) -> bool:
                return key in self.d
 
            def keys(self):
                return self.d.keys()
 
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
 
        dtype_mappings = {
            "column_dtypes": DictLike(**{"A": np.int8, "B": np.float32}),
            "index_dtypes": f"{tm.ENDIAN}U2",
        }
 
        result = df.to_records(**dtype_mappings)
        expected = np.rec.array(
            [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
            dtype=[
                ("index", f"{tm.ENDIAN}U2"),
                ("A", "i1"),
                ("B", f"{tm.ENDIAN}f4"),
                ("C", "O"),
            ],
        )
        tm.assert_almost_equal(result, expected)
 
    @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
    def test_to_records_datetimeindex_with_tz(self, tz):
        # GH#13937
        dr = date_range("2016-01-01", periods=10, freq="S", tz=tz)
 
        df = DataFrame({"datetime": dr}, index=dr)
 
        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()
 
        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)