"""
Tests for ``DataFrame.astype`` (plus a few Series/Index/array cross-checks).
"""
import re

import numpy as np
import pytest

from pandas.compat import pa_version_under7p0
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    Categorical,
    CategoricalDtype,
    DataFrame,
    DatetimeTZDtype,
    Index,
    Interval,
    IntervalDtype,
    NaT,
    Series,
    Timedelta,
    Timestamp,
    concat,
    date_range,
    option_context,
)
import pandas._testing as tm


def _check_cast(df, v):
    """
    Check if all dtypes of df are equal to v

    Parameters
    ----------
    df : DataFrame
        Frame whose columns are inspected.
    v : str
        Expected ``dtype.name`` of every column.

    Raises
    ------
    AssertionError
        If any column's ``dtype.name`` differs from ``v``.
    """
    assert all(s.dtype.name == v for _, s in df.items())


class TestAstype:
    """Tests calling ``astype`` on frames of various dtypes."""

    def test_astype_float(self, float_frame):
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        casted = float_frame.astype(np.int32)
        expected = DataFrame(
            float_frame.values.astype(np.int32),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        # a column of numeric strings is cast along with the floats
        float_frame["foo"] = "5"
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

    def test_astype_mixed_float(self, mixed_float_frame):
        # mixed casting
        casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float32")
        _check_cast(casted, "float32")

        casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16")
        _check_cast(casted, "float16")

    def test_astype_mixed_type(self, mixed_type_frame):
        # mixed casting
        mn = mixed_type_frame._get_numeric_data().copy()
        mn["little_float"] = np.array(12345.0, dtype="float16")
        mn["big_float"] = np.array(123456789101112.0, dtype="float64")

        casted = mn.astype("float64")
        _check_cast(casted, "float64")

        casted = mn.astype("int64")
        _check_cast(casted, "int64")

        casted = mn.reindex(columns=["little_float"]).astype("float16")
        _check_cast(casted, "float16")

        casted = mn.astype("float32")
        _check_cast(casted, "float32")

        casted = mn.astype("int32")
        _check_cast(casted, "int32")

        # to object
        casted = mn.astype("O")
        _check_cast(casted, "object")

    def test_astype_with_exclude_string(self, float_frame):
        df = float_frame.copy()
        expected = float_frame.astype(int)
        df["string"] = "foo"
        casted = df.astype(int, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

        df = float_frame.copy()
        expected = float_frame.astype(np.int32)
        df["string"] = "foo"
        casted = df.astype(np.int32, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

    def test_astype_with_view_float(self, float_frame):
        # Smoke test: the casts only need to run; results are not verified.
        # this is the only real reason to do it this way
        tf = np.round(float_frame).astype(np.int32)
        tf.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        tf = float_frame.astype(np.float64)
        tf.astype(np.int64, copy=False)

    def test_astype_with_view_mixed_float(self, mixed_float_frame):
        # Smoke test: the casts only need to run; results are not verified.
        tf = mixed_float_frame.reindex(columns=["A", "B", "C"])

        tf.astype(np.int64)
        tf.astype(np.float32)

    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        # see GH#14265
        #
        # Check NaN and inf --> raise error when converting to int.
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        with pytest.raises(ValueError, match=msg):
            df.astype(dtype)

    def test_astype_str(self):
        # see GH#9757
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        result = df.astype(str)

        expected = DataFrame(
            {
                "a": [str(Timestamp(x)._date_repr) for x in a._values],
                "b": [str(Timestamp(x)) for x in b._values],
                "c": [Timedelta(x)._repr_base() for x in c._values],
                "d": [str(x) for x in d._values],
                "e": [str(x) for x in e._values],
            }
        )

        tm.assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        # see GH#11302
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
        result = DataFrame([np.nan]).astype(str)
        expected = DataFrame(["nan"])

        tm.assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(str)

        val = "1.1234567890123457"
        expected = DataFrame([val])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        # GH7271 & GH16717
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(["1.0", "2", "3.14", "4", "5.4"])
        df = DataFrame({"a": a, "b": b, "c": c, "d": d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({"b": "str", "d": "float32"})
        result = df.astype(dt1)
        expected = DataFrame(
            {
                "a": a,
                "b": Series(["0", "1", "2", "3", "4"]),
                "c": c,
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
            }
        )
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
        result = df.astype(dt2)
        expected = DataFrame(
            {
                "a": a,
                "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
                "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
            }
        )
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str})
        tm.assert_frame_equal(df.astype(dt3), df.astype(str))
        tm.assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({"b": str, 2: str})
        dt5 = dtype_class({"e": str})
        msg_frame = (
            "Only a column name can be used for the key in a dtype mappings argument. "
            "'{}' not found in columns."
        )
        with pytest.raises(KeyError, match=msg_frame.format(2)):
            df.astype(dt4)
        with pytest.raises(KeyError, match=msg_frame.format("e")):
            df.astype(dt5)
        tm.assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

        # GH#16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({}) if dtype_class is dict else dtype_class({}, dtype=object)
        equiv = df.astype(dt7)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        a1 = Series([1, 2, 3, 4, 5], name="a")
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
        a2 = Series([0, 1, 2, 3, 4], name="a")
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
        b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b")
        a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
        expected = concat([a1_str, b_str, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

        # a dict key applies to every column carrying that label
        result = df.astype({"a": "str"})
        expected = concat([a1_str, b, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_astype_duplicate_col_series_arg(self):
        # GH#44417
        vals = np.random.randn(3, 4)
        df = DataFrame(vals, columns=["A", "B", "C", "A"])
        dtypes = df.dtypes
        dtypes.iloc[0] = str
        dtypes.iloc[2] = "Float64"

        result = df.astype(dtypes)
        expected = DataFrame(
            {
                0: vals[:, 0].astype(str),
                1: vals[:, 1],
                2: pd.array(vals[:, 2], dtype="Float64"),
                3: vals[:, 3],
            }
        )
        expected.columns = df.columns
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype",
        [
            "category",
            CategoricalDtype(),
            CategoricalDtype(ordered=True),
            CategoricalDtype(ordered=False),
            CategoricalDtype(categories=list("abcdef")),
            CategoricalDtype(categories=list("edba"), ordered=False),
            CategoricalDtype(categories=list("edcb"), ordered=True),
        ],
        ids=repr,
    )
    def test_astype_categorical(self, dtype):
        # GH#18099
        d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(v, dtype=dtype) for k, v in d.items()})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
    def test_astype_categoricaldtype_class_raises(self, cls):
        # passing the dtype class itself (not an instance) must raise
        df = DataFrame({"A": ["a", "a", "b", "c"]})
        xpr = f"Expected an instance of {cls.__name__}"
        with pytest.raises(TypeError, match=xpr):
            df.astype({"A": cls})

        with pytest.raises(TypeError, match=xpr):
            df["A"].astype(cls)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes(self, dtype):
        # GH#22578
        df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])

        expected1 = DataFrame(
            {
                "a": pd.array([1, 3, 5], dtype=dtype),
                "b": pd.array([2, 4, 6], dtype=dtype),
            }
        )
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
        tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)

        df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])
        df["b"] = df["b"].astype(dtype)
        expected2 = DataFrame(
            {"a": [1.0, 3.0, 5.0], "b": pd.array([2, 4, 6], dtype=dtype)}
        )
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes_1d(self, dtype):
        # GH#22578
        df = DataFrame({"a": [1.0, 2.0, 3.0]})

        expected1 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

        df = DataFrame({"a": [1.0, 2.0, 3.0]})
        df["a"] = df["a"].astype(dtype)
        expected2 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["category", "Int64"])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        # GH#24704
        a1 = Series([0, np.nan, 4], name="a")
        a2 = Series([np.nan, 3, 5], name="a")
        df = concat([a1, a2], axis=1)

        result = df.astype(dtype)
        expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"]
    )
    def test_astype_column_metadata(self, dtype):
        # GH#19920
        columns = Index([100, 200, 300], dtype=np.uint64, name="foo")
        df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
        df = df.astype(dtype)
        tm.assert_index_equal(df.columns, columns)

    @pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
    def test_astype_from_object_to_datetime_unit(self, unit):
        vals = [
            ["2015-01-01", "2015-01-02", "2015-01-03"],
            ["2017-01-01", "2017-01-02", "2017-02-03"],
        ]
        df = DataFrame(vals, dtype=object)
        with pytest.raises(TypeError, match="Cannot cast"):
            df.astype(f"M8[{unit}]")

    @pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
    def test_astype_from_object_to_timedelta_unit(self, unit):
        vals = [
            ["1 Day", "2 Days", "3 Days"],
            ["4 Days", "5 Days", "6 Days"],
        ]
        df = DataFrame(vals, dtype=object)
        msg = (
            r"Cannot convert from timedelta64\[ns\] to timedelta64\[.*\]. "
            "Supported resolutions are 's', 'ms', 'us', 'ns'"
        )
        with pytest.raises(ValueError, match=msg):
            # TODO: this is ValueError while for DatetimeArray it is TypeError;
            #  get these consistent
            df.astype(f"m8[{unit}]")

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_from_datetimelike_to_object(self, dtype, unit):
        # tests astype to object dtype
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(object)
        assert (result.dtypes == object).all()

        if dtype.startswith("M8"):
            assert result.iloc[0, 0] == Timestamp(1, unit=unit)
        else:
            assert result.iloc[0, 0] == Timedelta(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        # tests all units from numeric origination
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetime_unit(self, unit):
        # tests all units from datetime origination
        # GH#19223
        dtype = f"M8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        ser = df.iloc[:, 0]
        idx = Index(ser)
        dta = ser._values

        if unit in ["ns", "us", "ms", "s"]:
            # GH#48928
            result = df.astype(dtype)
        else:
            # we use the nearest supported dtype (i.e. M8[s])
            msg = rf"Cannot cast DatetimeArray to dtype datetime64\[{unit}\]"
            with pytest.raises(TypeError, match=msg):
                df.astype(dtype)

            with pytest.raises(TypeError, match=msg):
                ser.astype(dtype)

            with pytest.raises(TypeError, match=msg.replace("Array", "Index")):
                idx.astype(dtype)

            with pytest.raises(TypeError, match=msg):
                dta.astype(dtype)

            return

        exp_df = DataFrame(arr.astype(dtype))
        assert (exp_df.dtypes == dtype).all()
        tm.assert_frame_equal(result, exp_df)

        res_ser = ser.astype(dtype)
        exp_ser = exp_df.iloc[:, 0]
        assert exp_ser.dtype == dtype
        tm.assert_series_equal(res_ser, exp_ser)

        exp_dta = exp_ser._values

        res_index = idx.astype(dtype)
        exp_index = Index(exp_ser)
        assert exp_index.dtype == dtype
        tm.assert_index_equal(res_index, exp_index)

        res_dta = dta.astype(dtype)
        assert exp_dta.dtype == dtype
        tm.assert_extension_array_equal(res_dta, exp_dta)

    @pytest.mark.parametrize("unit", ["ns"])
    def test_astype_to_timedelta_unit_ns(self, unit):
        # preserve the timedelta conversion
        # GH#19223
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
    def test_astype_to_timedelta_unit(self, unit):
        # coerce to float
        # GH#19223 until 2.0 used to coerce to float
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        ser = df.iloc[:, 0]
        tdi = Index(ser)
        tda = tdi._values

        if unit in ["us", "ms", "s"]:
            assert (df.dtypes == dtype).all()
            result = df.astype(dtype)
        else:
            # We get the nearest supported unit, i.e. "s"
            assert (df.dtypes == "m8[s]").all()

            msg = (
                rf"Cannot convert from timedelta64\[s\] to timedelta64\[{unit}\]. "
                "Supported resolutions are 's', 'ms', 'us', 'ns'"
            )
            with pytest.raises(ValueError, match=msg):
                df.astype(dtype)
            with pytest.raises(ValueError, match=msg):
                ser.astype(dtype)
            with pytest.raises(ValueError, match=msg):
                tdi.astype(dtype)
            with pytest.raises(ValueError, match=msg):
                tda.astype(dtype)

            return

        result = df.astype(dtype)
        # The conversion is a no-op, so we just get a copy
        expected = df
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_incorrect_datetimelike(self, unit):
        # trying to astype a m to a M, or vice-versa
        # GH#19224
        dtype = f"M8[{unit}]"
        other = f"m8[{unit}]"

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        msg = "|".join(
            [
                # BlockManager path
                rf"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]",
                # ArrayManager path
                "cannot astype a datetimelike from "
                rf"\[datetime64\[ns\]\] to \[timedelta64\[{unit}\]\]",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            df.astype(other)

        msg = "|".join(
            [
                # BlockManager path
                rf"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]",
                # ArrayManager path
                "cannot astype a timedelta from "
                rf"\[timedelta64\[ns\]\] to \[datetime64\[{unit}\]\]",
            ]
        )
        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError, match=msg):
            df.astype(dtype)

    def test_astype_arg_for_errors(self):
        # GH#14878

        df = DataFrame([1, 2, 3])

        msg = (
            "Expected value of kwarg 'errors' to be one of "
            "['raise', 'ignore']. Supplied value is 'True'"
        )
        with pytest.raises(ValueError, match=re.escape(msg)):
            df.astype(np.float64, errors=True)

        # a valid ``errors`` value must not raise
        df.astype(np.int8, errors="ignore")

    def test_astype_invalid_conversion(self):
        # GH#47571
        df = DataFrame({"a": [1, 2, "text"], "b": [1, 2, 3]})

        msg = (
            "invalid literal for int() with base 10: 'text': "
            "Error while type casting for column 'a'"
        )

        with pytest.raises(ValueError, match=re.escape(msg)):
            df.astype({"a": int})

    def test_astype_arg_for_errors_dictlist(self):
        # GH#25905
        df = DataFrame(
            [
                {"a": "1", "b": "16.5%", "c": "test"},
                {"a": "2.2", "b": "15.3", "c": "another_test"},
            ]
        )
        expected = DataFrame(
            [
                {"a": 1.0, "b": "16.5%", "c": "test"},
                {"a": 2.2, "b": "15.3", "c": "another_test"},
            ]
        )
        type_dict = {"a": "float64", "b": "float64", "c": "object"}

        result = df.astype(dtype=type_dict, errors="ignore")

        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz(self, timezone_frame):
        # astype
        expected = np.array(
            [
                [
                    Timestamp("2013-01-01 00:00:00"),
                    Timestamp("2013-01-02 00:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
                ],
            ],
            dtype=object,
        ).T
        expected = DataFrame(
            expected,
            index=timezone_frame.index,
            columns=timezone_frame.columns,
            dtype=object,
        )
        result = timezone_frame.astype(object)
        tm.assert_frame_equal(result, expected)

        msg = "Cannot use .astype to convert from timezone-aware dtype to timezone-"
        with pytest.raises(TypeError, match=msg):
            # dt64tz->dt64 deprecated
            timezone_frame.astype("datetime64[ns]")

    def test_astype_dt64tz_to_str(self, timezone_frame):
        # str formatting
        result = timezone_frame.astype(str)
        expected = DataFrame(
            [
                [
                    "2013-01-01",
                    "2013-01-01 00:00:00-05:00",
                    "2013-01-01 00:00:00+01:00",
                ],
                ["2013-01-02", "NaT", "NaT"],
                [
                    "2013-01-03",
                    "2013-01-03 00:00:00-05:00",
                    "2013-01-03 00:00:00+01:00",
                ],
            ],
            columns=timezone_frame.columns,
        )
        tm.assert_frame_equal(result, expected)

        with option_context("display.max_columns", 20):
            result = str(timezone_frame)
            assert (
                "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00"
            ) in result
            # NaT is right-aligned to the 25-character width of the tz-aware
            # values shown in rows 0 and 2; spelled with explicit widths so the
            # padding cannot be lost to whitespace normalization.
            assert f"1 2013-01-02 {'NaT':>25} {'NaT':>25}" in result
            assert (
                "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00"
            ) in result

    def test_astype_empty_dtype_dict(self):
        # issue mentioned further down in the following issue's thread
        # https://github.com/pandas-dev/pandas/issues/33113
        df = DataFrame()
        result = df.astype({})
        tm.assert_frame_equal(result, df)
        assert result is not df

    @pytest.mark.parametrize(
        "data, dtype",
        [
            (["x", "y", "z"], "string[python]"),
            pytest.param(
                ["x", "y", "z"],
                "string[pyarrow]",
                marks=td.skip_if_no("pyarrow"),
            ),
            (["x", "y", "z"], "category"),
            (3 * [Timestamp("2020-01-01", tz="UTC")], None),
            (3 * [Interval(0, 1)], None),
        ],
    )
    @pytest.mark.parametrize("errors", ["raise", "ignore"])
    def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
        # https://github.com/pandas-dev/pandas/issues/35471
        df = DataFrame(Series(data, dtype=dtype))
        if errors == "ignore":
            expected = df
            result = df.astype(float, errors=errors)
            tm.assert_frame_equal(result, expected)
        else:
            msg = "(Cannot cast)|(could not convert)"
            with pytest.raises((ValueError, TypeError), match=msg):
                df.astype(float, errors=errors)

    def test_astype_tz_conversion(self):
        # GH 35973
        val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")}
        df = DataFrame(val)
        result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"})

        # copy so that building the expectation does not mutate the input frame
        expected = df.copy()
        expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin")
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"])
    def test_astype_tz_object_conversion(self, tz):
        # GH 35973
        val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")}
        expected = DataFrame(val)

        # convert expected to object dtype from other tz str (independently tested)
        result = expected.astype({"tz": f"datetime64[ns, {tz}]"})
        result = result.astype({"tz": "object"})

        # do real test: object dtype to a specified tz, different from construction tz.
        result = result.astype({"tz": "datetime64[ns, Europe/London]"})
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
        # GH#41409
        tz = tz_naive_fixture

        dti = date_range("2016-01-01", periods=3, tz=tz)
        dta = dti._data
        dta[0] = NaT

        obj = frame_or_series(dta)
        result = obj.astype("string")

        # Check that Series/DataFrame.astype matches DatetimeArray.astype
        expected = frame_or_series(dta.astype("string"))
        tm.assert_equal(result, expected)

        item = result.iloc[0]
        if frame_or_series is DataFrame:
            item = item.iloc[0]
        assert item is pd.NA

        # For non-NA values, we should match what we get for non-EA str
        alt = obj.astype(str)
        assert np.all(alt.iloc[1:] == result.iloc[1:])

    def test_astype_td64_to_string(self, frame_or_series):
        # GH#41409
        tdi = pd.timedelta_range("1 Day", periods=3)
        obj = frame_or_series(tdi)

        expected = frame_or_series(["1 days", "2 days", "3 days"], dtype="string")
        result = obj.astype("string")
        tm.assert_equal(result, expected)

    def test_astype_bytes(self):
        # GH#39474
        result = DataFrame(["foo", "bar", "baz"]).astype(bytes)
        assert result.dtypes[0] == np.dtype("S3")

    @pytest.mark.parametrize(
        "index_slice",
        [
            np.s_[:2, :2],
            np.s_[:1, :2],
            np.s_[:2, :1],
            np.s_[::2, ::2],
            np.s_[::1, ::2],
            np.s_[::2, ::1],
        ],
    )
    def test_astype_noncontiguous(self, index_slice):
        # GH#42396
        data = np.arange(16).reshape(4, 4)
        df = DataFrame(data)

        result = df.iloc[index_slice].astype("int16")
        expected = df.iloc[index_slice]
        # only the values are compared; the dtypes differ by construction
        tm.assert_frame_equal(result, expected, check_dtype=False)

    def test_astype_retain_attrs(self, any_numpy_dtype):
        # GH#44414
        df = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]})
        df.attrs["Location"] = "Michigan"

        result = df.astype({"a": any_numpy_dtype}).attrs
        expected = df.attrs

        tm.assert_dict_equal(expected, result)


class TestAstypeCategorical:
    """Tests for ``astype`` to and from categorical columns."""

    def test_astype_from_categorical3(self):
        df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], "vals": [1, 2, 3, 4, 5, 6]})
        cats = Categorical([1, 2, 3, 4, 5, 6])
        exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
        df["cats"] = df["cats"].astype("category")
        tm.assert_frame_equal(exp_df, df)

    def test_astype_from_categorical4(self):
        df = DataFrame(
            {"cats": ["a", "b", "b", "a", "a", "d"], "vals": [1, 2, 3, 4, 5, 6]}
        )
        cats = Categorical(["a", "b", "b", "a", "a", "d"])
        exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
        df["cats"] = df["cats"].astype("category")
        tm.assert_frame_equal(exp_df, df)

    def test_categorical_astype_to_int(self, any_int_dtype):
        # GH#39402

        df = DataFrame(data={"col1": pd.array([2.0, 1.0, 3.0])})
        df.col1 = df.col1.astype("category")
        df.col1 = df.col1.astype(any_int_dtype)
        expected = DataFrame({"col1": pd.array([2, 1, 3], dtype=any_int_dtype)})
        tm.assert_frame_equal(df, expected)

    def test_astype_categorical_to_string_missing(self):
        # https://github.com/pandas-dev/pandas/issues/41797
        df = DataFrame(["a", "b", np.nan])
        expected = df.astype(str)
        cat = df.astype("category")
        result = cat.astype(str)
        tm.assert_frame_equal(result, expected)


class IntegerArrayNoCopy(pd.core.arrays.IntegerArray):
    """IntegerArray subclass whose ``copy`` always fails."""

    # GH 42501

    def copy(self):
        # Raise explicitly instead of ``assert False``: asserts are stripped
        # under ``python -O``, which would silently disable this guard.
        raise AssertionError("IntegerArrayNoCopy.copy() must not be called")


class Int16DtypeNoCopy(pd.Int16Dtype):
    """Int16 dtype whose array type is ``IntegerArrayNoCopy``."""

    # GH 42501

    @classmethod
    def construct_array_type(cls):
        return IntegerArrayNoCopy


def test_frame_astype_no_copy():
    # GH 42501
    df = DataFrame({"a": [1, 4, None, 5], "b": [6, 7, 8, 9]}, dtype=object)
    result = df.astype({"a": Int16DtypeNoCopy()}, copy=False)

    assert result.a.dtype == pd.Int16Dtype()
    # the untouched column must still share its buffer with the input
    assert np.shares_memory(df.b.values, result.b.values)


@pytest.mark.skipif(pa_version_under7p0, reason="pyarrow is required for this test")
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_astype_copies(dtype):
    # GH#50984
    df = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
    result = df.astype("int64[pyarrow]", copy=True)
    # mutating the input afterwards must not leak into the result
    df.iloc[0, 0] = 100
    expected = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
def test_astype_to_string_not_modifying_input(string_storage, val):
    # GH#51073
    df = DataFrame({"a": ["a", "b", val]})
    expected = df.copy()
    with option_context("mode.string_storage", string_storage):
        df.astype("string", copy=False)
    tm.assert_frame_equal(df, expected)