"""Tests for the copy/view behaviour of ``astype`` and ``convert_dtypes``."""
import numpy as np
import pytest

from pandas.compat import pa_version_under7p0
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    DataFrame,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array


def test_astype_single_dtype(using_copy_on_write):
    """Cast a mixed int/float frame to float64 and check which columns share.

    Column "c" already holds floats: it is asserted to share memory with the
    source only under copy-on-write. Column "a" is asserted never to share.
    Writes on either side must not be visible on the other.
    """
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5})
    snapshot = df.copy()
    converted = df.astype("float64")

    c_shared = np.shares_memory(get_array(converted, "c"), get_array(df, "c"))
    a_shared = np.shares_memory(get_array(converted, "a"), get_array(df, "a"))
    if not using_copy_on_write:
        assert not c_shared
        assert not a_shared
    else:
        assert c_shared
        assert not a_shared

    # writing into the result must detach column "c" from the parent
    converted.iloc[0, 2] = 5.5
    if using_copy_on_write:
        c_still_shared = np.shares_memory(
            get_array(converted, "c"), get_array(df, "c")
        )
        assert not c_still_shared
    tm.assert_frame_equal(df, snapshot)

    # writing into the parent must leave a fresh result untouched as well
    converted = df.astype("float64")
    df.iloc[0, 2] = 5.5
    expected = snapshot.astype("float64")
    tm.assert_frame_equal(converted, expected)


@pytest.mark.parametrize("dtype", ["int64", "Int64"])
@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"])
def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype):
    """Casts between 64-bit integer flavours share memory only under CoW.

    Also checks that a write on either the result or the parent is not
    propagated to the other object.
    """
    wants_pyarrow = new_dtype == "int64[pyarrow]"
    if pa_version_under7p0 and wants_pyarrow:
        pytest.skip("pyarrow not installed")

    df = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
    snapshot = df.copy()
    converted = df.astype(new_dtype)

    shared = np.shares_memory(get_array(converted, "a"), get_array(df, "a"))
    if not using_copy_on_write:
        assert not shared
    else:
        assert shared

    # writing into the result must detach it from the parent
    converted.iloc[0, 0] = 10
    if using_copy_on_write:
        still_shared = np.shares_memory(
            get_array(converted, "a"), get_array(df, "a")
        )
        assert not still_shared
    tm.assert_frame_equal(df, snapshot)

    # writing into the parent must leave a fresh result untouched as well
    converted = df.astype(new_dtype)
    df.iloc[0, 0] = 100
    expected = snapshot.astype(new_dtype)
    tm.assert_frame_equal(converted, expected)


@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"])
def test_astype_different_target_dtype(using_copy_on_write, dtype):
    """Casting to a genuinely different dtype never shares memory.

    Under copy-on-write the result's first block must additionally report
    that it holds no reference to another object.
    """
    wants_pyarrow = dtype == "int32[pyarrow]"
    if pa_version_under7p0 and wants_pyarrow:
        pytest.skip("pyarrow not installed")

    df = DataFrame({"a": [1, 2, 3]})
    snapshot = df.copy()
    converted = df.astype(dtype)

    shared = np.shares_memory(get_array(converted, "a"), get_array(df, "a"))
    assert not shared
    if using_copy_on_write:
        assert converted._mgr._has_no_reference(0)

    converted.iloc[0, 0] = 5
    tm.assert_frame_equal(df, snapshot)

    # writing into the parent must leave a fresh result untouched as well
    converted = df.astype(dtype)
    df.iloc[0, 0] = 100
    expected = snapshot.astype(dtype)
    tm.assert_frame_equal(converted, expected)


@td.skip_array_manager_invalid_test
def test_astype_numpy_to_ea():
    """With the CoW option enabled, int64 -> Int64 shares the data buffer."""
    ser = Series([1, 2, 3])
    with pd.option_context("mode.copy_on_write", True):
        result = ser.astype("Int64")
    source_values = get_array(ser)
    result_values = get_array(result)
    assert np.shares_memory(source_values, result_values)


@pytest.mark.parametrize(
    "dtype, new_dtype", [("object", "string"), ("string", "object")]
)
def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype):
    """object <-> string casts share memory only under CoW.

    A write into the result must not alter the source frame.
    """
    df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
    snapshot = df.copy()
    converted = df.astype(new_dtype)

    shared = np.shares_memory(get_array(converted, "a"), get_array(df, "a"))
    if not using_copy_on_write:
        assert not shared
    else:
        assert shared

    converted.iloc[0, 0] = "x"
    tm.assert_frame_equal(df, snapshot)


@pytest.mark.parametrize(
    "dtype, new_dtype", [("object", "string"), ("string", "object")]
)
def test_astype_string_and_object_update_original(
    using_copy_on_write, dtype, new_dtype
):
    """object <-> string casts: a write into the source leaves the result alone."""
    df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
    converted = df.astype(new_dtype)
    expected = converted.copy()

    shared = np.shares_memory(get_array(converted, "a"), get_array(df, "a"))
    if not using_copy_on_write:
        assert not shared
    else:
        assert shared

    df.iloc[0, 0] = "x"
    tm.assert_frame_equal(converted, expected)


def test_astype_dict_dtypes(using_copy_on_write):
    """Cast with a per-column dtype mapping and check sharing column by column.

    "a" (int -> float) is asserted never to share; "b" (not in the mapping)
    and "c" (already float64) are asserted to share only under CoW.
    """
    df = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")}
    )
    snapshot = df.copy()
    converted = df.astype({"a": "float64", "c": "float64"})

    c_shared = np.shares_memory(get_array(converted, "c"), get_array(df, "c"))
    b_shared = np.shares_memory(get_array(converted, "b"), get_array(df, "b"))
    a_shared = np.shares_memory(get_array(converted, "a"), get_array(df, "a"))
    if not using_copy_on_write:
        assert not c_shared
        assert not b_shared
        assert not a_shared
    else:
        assert c_shared
        assert b_shared
        assert not a_shared

    # writing into the result must detach the touched column from the parent
    converted.iloc[0, 2] = 5.5
    if using_copy_on_write:
        c_still_shared = np.shares_memory(
            get_array(converted, "c"), get_array(df, "c")
        )
        assert not c_still_shared

    converted.iloc[0, 1] = 10
    if using_copy_on_write:
        b_still_shared = np.shares_memory(
            get_array(converted, "b"), get_array(df, "b")
        )
        assert not b_still_shared
    tm.assert_frame_equal(df, snapshot)


def test_astype_different_datetime_resos(using_copy_on_write):
    """Changing the datetime resolution (to ms) must not share memory."""
    df = DataFrame({"a": date_range("2019-12-31", periods=2, freq="D")})
    result = df.astype("datetime64[ms]")

    shared = np.shares_memory(get_array(df, "a"), get_array(result, "a"))
    assert not shared
    if using_copy_on_write:
        assert result._mgr._has_no_reference(0)


def test_astype_different_timezones(using_copy_on_write):
    """Under CoW, a timezone-only change keeps a reference and shares memory."""
    days = date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")
    df = DataFrame({"a": days})
    result = df.astype("datetime64[ns, Europe/Berlin]")
    if using_copy_on_write:
        assert not result._mgr._has_no_reference(0)
        shared = np.shares_memory(get_array(df, "a"), get_array(result, "a"))
        assert shared


def test_astype_different_timezones_different_reso(using_copy_on_write):
    """Under CoW, changing timezone and resolution yields an unshared result."""
    days = date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")
    df = DataFrame({"a": days})
    result = df.astype("datetime64[ms, Europe/Berlin]")
    if using_copy_on_write:
        assert result._mgr._has_no_reference(0)
        shared = np.shares_memory(get_array(df, "a"), get_array(result, "a"))
        assert not shared


@pytest.mark.skipif(pa_version_under7p0, reason="pyarrow not installed")
def test_astype_arrow_timestamp(using_copy_on_write):
    """Under CoW, M8[ns] -> arrow timestamp keeps a reference but no shared buffer."""
    stamp = Timestamp("2020-01-01 01:01:01.000001")
    df = DataFrame({"a": [stamp, stamp]}, dtype="M8[ns]")
    result = df.astype("timestamp[ns][pyarrow]")
    if using_copy_on_write:
        assert not result._mgr._has_no_reference(0)
        # TODO(CoW): arrow is not setting copy=False in the Series constructor
        # under the hood
        arrow_values = get_array(result, "a")._data
        assert not np.shares_memory(get_array(df, "a"), arrow_values)


def test_convert_dtypes_infer_objects(using_copy_on_write):
    """convert_dtypes with every conversion disabled shares only under CoW.

    A write into the result must not alter the source Series.
    """
    ser = Series(["a", "b", "c"])
    snapshot = ser.copy()
    disabled = {
        "convert_integer": False,
        "convert_boolean": False,
        "convert_floating": False,
        "convert_string": False,
    }
    result = ser.convert_dtypes(**disabled)

    shared = np.shares_memory(get_array(ser), get_array(result))
    if not using_copy_on_write:
        assert not shared
    else:
        assert shared

    result.iloc[0] = "x"
    tm.assert_series_equal(ser, snapshot)


def test_convert_dtypes(using_copy_on_write):
    """Every column of a convert_dtypes result shares memory only under CoW.

    A write into the result must not alter the source frame.
    """
    df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
    snapshot = df.copy()
    converted = df.convert_dtypes()

    for column in ("a", "b", "c", "d"):
        shared = np.shares_memory(
            get_array(converted, column), get_array(df, column)
        )
        if not using_copy_on_write:
            assert not shared
        else:
            assert shared

    converted.iloc[0, 0] = "x"
    tm.assert_frame_equal(df, snapshot)