"""
|
Routines for casting.
|
"""
|
|
from __future__ import annotations
|
|
import datetime as dt
|
import functools
|
from typing import (
|
TYPE_CHECKING,
|
Any,
|
Literal,
|
Sized,
|
TypeVar,
|
cast,
|
overload,
|
)
|
import warnings
|
|
import numpy as np
|
|
from pandas._libs import lib
|
from pandas._libs.missing import (
|
NA,
|
NAType,
|
checknull,
|
)
|
from pandas._libs.tslibs import (
|
NaT,
|
OutOfBoundsDatetime,
|
OutOfBoundsTimedelta,
|
Timedelta,
|
Timestamp,
|
get_unit_from_dtype,
|
is_supported_unit,
|
)
|
from pandas._libs.tslibs.timedeltas import array_to_timedelta64
|
from pandas._typing import (
|
ArrayLike,
|
Dtype,
|
DtypeObj,
|
NumpyIndexT,
|
Scalar,
|
npt,
|
)
|
from pandas.errors import (
|
IntCastingNaNError,
|
LossySetitemError,
|
)
|
|
from pandas.core.dtypes.common import (
|
ensure_int8,
|
ensure_int16,
|
ensure_int32,
|
ensure_int64,
|
ensure_object,
|
ensure_str,
|
is_bool,
|
is_bool_dtype,
|
is_complex,
|
is_complex_dtype,
|
is_datetime64_dtype,
|
is_extension_array_dtype,
|
is_float,
|
is_float_dtype,
|
is_integer,
|
is_integer_dtype,
|
is_numeric_dtype,
|
is_object_dtype,
|
is_scalar,
|
is_signed_integer_dtype,
|
is_string_dtype,
|
is_timedelta64_dtype,
|
is_unsigned_integer_dtype,
|
pandas_dtype as pandas_dtype_func,
|
)
|
from pandas.core.dtypes.dtypes import (
|
BaseMaskedDtype,
|
CategoricalDtype,
|
DatetimeTZDtype,
|
ExtensionDtype,
|
IntervalDtype,
|
PandasExtensionDtype,
|
PeriodDtype,
|
)
|
from pandas.core.dtypes.generic import (
|
ABCExtensionArray,
|
ABCIndex,
|
ABCSeries,
|
)
|
from pandas.core.dtypes.inference import is_list_like
|
from pandas.core.dtypes.missing import (
|
is_valid_na_for_dtype,
|
isna,
|
na_value_for_dtype,
|
notna,
|
)
|
|
if TYPE_CHECKING:
|
from pandas import Index
|
from pandas.core.arrays import (
|
Categorical,
|
DatetimeArray,
|
ExtensionArray,
|
IntervalArray,
|
PeriodArray,
|
TimedeltaArray,
|
)
|
|
|
# Largest value representable by each signed integer width.
_int8_max, _int16_max, _int32_max, _int64_max = (
    np.iinfo(int_type).max for int_type in (np.int8, np.int16, np.int32, np.int64)
)

# Object dtype instance that the functions in this module compare against.
_dtype_obj = np.dtype("object")

# Type variable for annotations that accept and return an ndarray (sub)type.
NumpyArrayT = TypeVar("NumpyArrayT", bound=np.ndarray)
|
|
|
def maybe_convert_platform(
    values: list | tuple | range | np.ndarray | ExtensionArray,
) -> ArrayLike:
    """
    Attempt a platform conversion of ``values``.

    Lists, tuples and ranges are first turned into a 1-D object array.
    Anything else is used as-is; the caller is responsible for ensuring it is
    an np.ndarray or ExtensionArray. Object-dtype data is then passed through
    ``lib.maybe_convert_objects``.

    Parameters
    ----------
    values : list, tuple, range, np.ndarray or ExtensionArray

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    converted: ArrayLike = (
        construct_1d_object_array_from_listlike(values)
        if isinstance(values, (list, tuple, range))
        else values
    )

    if converted.dtype != _dtype_obj:
        # Nothing to infer for non-object data.
        return converted

    return lib.maybe_convert_objects(cast(np.ndarray, converted))
|
|
|
def is_nested_object(obj) -> bool:
    """
    Check whether ``obj`` is a nested object, e.g. a Series with 1 or more
    Series elements.

    Every element is inspected, so this is not necessarily performant.

    Parameters
    ----------
    obj : object

    Returns
    -------
    bool
        True only for an object-dtype Series holding at least one Series.
    """
    if not isinstance(obj, ABCSeries):
        return False
    if not is_object_dtype(obj.dtype):
        return False

    for element in obj._values:
        if isinstance(element, ABCSeries):
            return True
    return False
|
|
|
def maybe_box_datetimelike(value: Scalar, dtype: Dtype | None = None) -> Scalar:
    """
    Box a datetime-like scalar as Timestamp or Timedelta, unless the
    requested dtype is object.

    Parameters
    ----------
    value : scalar
    dtype : Dtype, optional
        When this compares equal to object dtype, ``value`` is returned
        untouched.

    Returns
    -------
    scalar
        Timestamp for datetime64/datetime input, Timedelta for
        timedelta64/timedelta input, otherwise ``value`` itself.
    """
    if dtype == _dtype_obj:
        return value

    if isinstance(value, (np.datetime64, dt.datetime)):
        return Timestamp(value)

    if isinstance(value, (np.timedelta64, dt.timedelta)):
        return Timedelta(value)

    return value
|
|
|
def maybe_box_native(value: Scalar | None | NAType) -> Scalar | None | NAType:
    """
    Convert a scalar to the corresponding Python native type where possible.

    Parameters
    ----------
    value : scalar

    Returns
    -------
    scalar
        ``float``, ``int`` or ``bool`` for float, integer or boolean scalars;
        the result of ``maybe_box_datetimelike`` for np.datetime64 and
        np.timedelta64; ``None`` for ``NA``; otherwise ``value`` unchanged.
    """
    if is_float(value):
        # mypy cannot see that is_float narrows the union to something float()
        # accepts.
        return float(value)  # type: ignore[arg-type]

    if is_integer(value):
        # Same narrowing limitation as for float() above.
        return int(value)  # type: ignore[arg-type]

    if is_bool(value):
        return bool(value)

    if isinstance(value, (np.datetime64, np.timedelta64)):
        return maybe_box_datetimelike(value)

    if value is NA:
        return None

    return value
|
|
|
def _maybe_unbox_datetimelike(value: Scalar, dtype: DtypeObj) -> Scalar:
    """
    Unbox a Timestamp or Timedelta to datetime64 or timedelta64 so it can be
    set into a numpy array. Setting the boxed object directly would risk
    dropping nanoseconds.

    Raises
    ------
    TypeError
        If a tz-aware Timestamp is given for a dtype that is not a
        DatetimeTZDtype, or if the datetime/timedelta kinds of the value and
        ``dtype`` are mismatched.

    Notes
    -----
    Caller is responsible for checking dtype.kind in ["m", "M"]
    """
    unboxed = value

    if is_valid_na_for_dtype(value, dtype):
        # GH#36541: can't fill array directly with pd.NaT
        # > np.empty(10, dtype="datetime64[ns]").fill(pd.NaT)
        # ValueError: cannot convert float NaN to integer
        unboxed = dtype.type("NaT", "ns")
    elif isinstance(value, Timedelta):
        unboxed = value.to_timedelta64()
    elif isinstance(value, Timestamp):
        if value.tz is None:
            unboxed = value.to_datetime64()
        elif not isinstance(dtype, DatetimeTZDtype):
            raise TypeError("Cannot unbox tzaware Timestamp to tznaive dtype")
        # A tz-aware Timestamp paired with a DatetimeTZDtype is left boxed.

    _disallow_mismatched_datetimelike(unboxed, dtype)
    return unboxed
|
|
|
def _disallow_mismatched_datetimelike(value, dtype: DtypeObj):
    """
    Reject datetime64 data for a timedelta64 dtype and vice-versa.

    numpy allows np.array(dt64values, dtype="timedelta64[ns]") and the
    reverse, but we do not want to allow this, so it is checked explicitly.

    Raises
    ------
    TypeError
        If ``value`` has a dtype of kind "m" while ``dtype`` has kind "M",
        or the other way around.
    """
    value_dtype = getattr(value, "dtype", None)
    if value_dtype is None:
        # No dtype on the value, so there is nothing to compare.
        return

    value_kind = value_dtype.kind
    if value_kind not in ("m", "M"):
        return

    if {value_kind, dtype.kind} == {"m", "M"}:
        raise TypeError(f"Cannot cast {repr(value)} to {dtype}")
|
|
|
@overload
def maybe_downcast_to_dtype(result: np.ndarray, dtype: str | np.dtype) -> np.ndarray:
    ...


@overload
def maybe_downcast_to_dtype(result: ExtensionArray, dtype: str | np.dtype) -> ArrayLike:
    ...


def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLike:
    """
    Try to cast ``result`` to the specified dtype, e.g. converting back to
    bool/int, or an astype of float64->float32.

    Parameters
    ----------
    result : ndarray or ExtensionArray
    dtype : str or np.dtype
        The string "infer" derives the target from the values via
        ``lib.infer_dtype``; any other string is passed to ``np.dtype``.

    Returns
    -------
    ndarray or ExtensionArray
        The converted values, or ``result`` itself when no branch applies.
    """
    round_first = False

    if isinstance(dtype, str):
        if dtype == "infer":
            # Target dtype for each inferred kind; anything else becomes object.
            # TODO: complex? what if result is already non-object?
            inferred_targets = {
                "boolean": "bool",
                "integer": "int64",
                "datetime64": "datetime64[ns]",
                "timedelta": "timedelta64[ns]",
                "timedelta64": "timedelta64[ns]",
                # floating values are tried as int64
                "floating": "int64",
            }
            inferred_type = lib.infer_dtype(result, skipna=False)
            dtype = inferred_targets.get(inferred_type, "object")

            if inferred_type == "floating" and issubclass(
                result.dtype.type, np.number
            ):
                round_first = True

        dtype = np.dtype(dtype)

    if not isinstance(dtype, np.dtype):
        # enforce our signature annotation
        raise TypeError(dtype)  # pragma: no cover

    downcast = maybe_downcast_numeric(result, dtype, round_first)
    if downcast is not result:
        return downcast

    target_kind = dtype.kind

    # a datetimelike
    # GH12821, iNaT is cast to float
    if target_kind in ["M", "m"] and result.dtype.kind in ["i", "f"]:
        return result.astype(dtype)

    if target_kind == "m" and result.dtype == _dtype_obj:
        # test_where_downcast_to_td64
        return array_to_timedelta64(cast(np.ndarray, result))

    if dtype == np.dtype("M8[ns]") and result.dtype == _dtype_obj:
        as_ndarray = cast(np.ndarray, result)
        return np.asarray(maybe_cast_to_datetime(as_ndarray, dtype=dtype))

    return result
|
|
|
@overload
def maybe_downcast_numeric(
    result: np.ndarray, dtype: np.dtype, do_round: bool = False
) -> np.ndarray:
    ...


@overload
def maybe_downcast_numeric(
    result: ExtensionArray, dtype: DtypeObj, do_round: bool = False
) -> ArrayLike:
    ...


def maybe_downcast_numeric(
    result: ArrayLike, dtype: DtypeObj, do_round: bool = False
) -> ArrayLike:
    """
    Subset of maybe_downcast_to_dtype restricted to numeric dtypes.

    The cast is only returned when comparing the cast values against the
    originals shows they are still equal (within a tolerance for floats).

    Parameters
    ----------
    result : ndarray or ExtensionArray
    dtype : np.dtype or ExtensionDtype
    do_round : bool
        Round the values before casting to a bool/integer dtype.

    Returns
    -------
    ndarray or ExtensionArray
        The downcast values, or ``result`` itself if no cast was accepted.
    """
    if not isinstance(dtype, np.dtype) or not isinstance(result.dtype, np.dtype):
        # e.g. SparseDtype has no itemsize attr
        return result

    source_dtype = result.dtype

    def rounded(values):
        # Rounding only happens when the caller asked for it.
        return values.round() if do_round else values

    same_kind = dtype.kind == source_dtype.kind
    if same_kind and source_dtype.itemsize <= dtype.itemsize and result.size:
        # don't allow upcasts here (except if empty)
        return result

    if is_bool_dtype(dtype) or is_integer_dtype(dtype):
        if not result.size:
            # if we don't have any elements, just astype it
            return rounded(result).astype(dtype)

        # do a test on the first element, if it fails then we are done
        first = result.ravel()[0]

        if isna(np.array([first])).any():
            # if we have any nulls, then we are done
            return result

        if not isinstance(first, (np.integer, np.floating, int, float, bool)):
            # a comparable, e.g. a Decimal may slip in here
            return result

        if (
            issubclass(source_dtype.type, (np.object_, np.number))
            and notna(result).all()
        ):
            candidate = rounded(result).astype(dtype)
            if candidate.dtype.kind == "O" or source_dtype.kind == "O":
                # np.allclose may raise TypeError on object-dtype
                unchanged = (candidate == result).all()
            else:
                unchanged = np.allclose(candidate, result, rtol=0)
            if unchanged:
                return candidate

    elif (
        issubclass(dtype.type, np.floating)
        and not is_bool_dtype(source_dtype)
        and not is_string_dtype(source_dtype)
    ):
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", "overflow encountered in cast", RuntimeWarning
            )
            candidate = result.astype(dtype)

        # Adjust tolerances based on floating point size
        tolerance_by_itemsize = {4: 5e-4, 8: 5e-8, 16: 5e-16}
        atol = tolerance_by_itemsize.get(candidate.dtype.itemsize, 0.0)

        # Check downcast float values are still equal within 7 digits when
        # converting from float64 to float32
        if np.allclose(candidate, result, equal_nan=True, rtol=0.0, atol=atol):
            return candidate

    elif same_kind and dtype.kind == "c":
        candidate = result.astype(dtype)

        if np.array_equal(candidate, result, equal_nan=True):
            # TODO: use tolerance like we do for float?
            return candidate

    return result
|
|
|
def maybe_upcast_numeric_to_64bit(arr: NumpyIndexT) -> NumpyIndexT:
    """
    Upcast an int/uint/float array narrower than 64 bit to the 64 bit dtype
    of the same family.

    Parameters
    ----------
    arr : ndarray or ExtensionArray

    Returns
    -------
    ndarray or ExtensionArray
        A 64 bit copy when an upcast applies, otherwise ``arr`` itself.
    """
    current = arr.dtype

    # (family check, 64 bit member of that family), checked in this order.
    families = (
        (is_signed_integer_dtype, np.int64),
        (is_unsigned_integer_dtype, np.uint64),
        (is_float_dtype, np.float64),
    )
    for belongs_to_family, wide in families:
        if belongs_to_family(current) and current != wide:
            return arr.astype(wide)

    return arr
|
|
|
def maybe_cast_pointwise_result(
    result: ArrayLike,
    dtype: DtypeObj,
    numeric_only: bool = False,
    same_dtype: bool = True,
) -> ArrayLike:
    """
    Try casting the result of a pointwise operation back to the original
    dtype if appropriate.

    Parameters
    ----------
    result : array-like
        Result to cast.
    dtype : np.dtype or ExtensionDtype
        Dtype of the input from which result was calculated.
    numeric_only : bool, default False
        Whether to cast only numerics or datetimes as well.
    same_dtype : bool, default True
        Specify dtype when calling _from_sequence

    Returns
    -------
    result : array-like
        result maybe casted to the dtype.
    """
    assert not is_scalar(result)

    if isinstance(dtype, ExtensionDtype):
        if isinstance(dtype, (CategoricalDtype, DatetimeTZDtype)):
            # TODO: avoid this special-casing
            # We have to special case categorical so as not to upcast
            # things like counts back to categorical
            return result

        array_cls = dtype.construct_array_type()
        # Omitting the dtype is the same as passing the default of None.
        requested = dtype if same_dtype else None
        return maybe_cast_to_extension_array(array_cls, result, dtype=requested)

    if not numeric_only or is_numeric_dtype(dtype):
        return maybe_downcast_to_dtype(result, dtype)

    return result
|
|
|
def maybe_cast_to_extension_array(
    cls: type[ExtensionArray], obj: ArrayLike, dtype: ExtensionDtype | None = None
) -> ArrayLike:
    """
    Call ``cls._from_sequence``, returning ``obj`` unchanged on Exception.

    Parameters
    ----------
    cls : class, subclass of ExtensionArray
    obj : arraylike
        Values to pass to cls._from_sequence
    dtype : ExtensionDtype, optional

    Returns
    -------
    ExtensionArray or obj
    """
    from pandas.core.arrays.string_ import BaseStringArray

    assert isinstance(cls, type), f"must pass a type: {cls}"
    assert issubclass(
        cls, ABCExtensionArray
    ), f"must pass a subclass of ExtensionArray: {cls}"

    # Everything can be converted to StringArrays, but we may not want to convert
    if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string":
        return obj

    try:
        return cls._from_sequence(obj, dtype=dtype)
    except Exception:
        # We can't predict what downstream EA constructors may raise
        return obj
|
|
|
@overload
def ensure_dtype_can_hold_na(dtype: np.dtype) -> np.dtype:
    ...


@overload
def ensure_dtype_can_hold_na(dtype: ExtensionDtype) -> ExtensionDtype:
    ...


def ensure_dtype_can_hold_na(dtype: DtypeObj) -> DtypeObj:
    """
    If we have a dtype that cannot hold NA values, find the best match that can.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype

    Returns
    -------
    np.dtype or ExtensionDtype
        ``dtype`` itself when no change is needed; float64 for integer
        dtypes; object for bool; a float64-subtype IntervalDtype for
        IntervalDtype; object for any other ExtensionDtype that cannot hold NA.
    """
    if not isinstance(dtype, ExtensionDtype):
        kind = dtype.kind
        if kind == "b":
            return _dtype_obj
        if kind in ["i", "u"]:
            return np.dtype(np.float64)
        return dtype

    if dtype._can_hold_na:
        return dtype

    if isinstance(dtype, IntervalDtype):
        # TODO(GH#45349): don't special-case IntervalDtype, allow
        #  overriding instead of returning object below.
        return IntervalDtype(np.float64, closed=dtype.closed)

    return _dtype_obj
|
|
|
# One canonical missing-value instance per scalar type. maybe_promote swaps
# non-singleton NaN/NaT fill values for these before its cached lookup.
_canonical_nans = {
    np.datetime64: np.datetime64("NaT", "ns"),
    np.timedelta64: np.timedelta64("NaT", "ns"),
    # type(np.nan) is the builtin float
    float: np.nan,
}
|
|
|
def maybe_promote(dtype: np.dtype, fill_value=np.nan):
    """
    Find the minimal dtype that can hold both the given dtype and fill_value.

    Parameters
    ----------
    dtype : np.dtype
    fill_value : scalar, default np.nan

    Returns
    -------
    dtype
        Upcasted from dtype argument if necessary.
    fill_value
        Upcasted from fill_value argument if necessary.

    Raises
    ------
    ValueError
        If fill_value is a non-scalar and dtype is not object.
    """
    original_fill = fill_value

    if checknull(fill_value):
        # https://github.com/pandas-dev/pandas/pull/39692#issuecomment-1441051740
        # avoid cache misses with NaN/NaT values that are not singletons
        fill_value = _canonical_nans.get(type(fill_value), fill_value)

    # For performance the implementation in _maybe_promote is reached through
    # an lru_cache wrapper. That cannot work for non-hashable arguments, so
    # a TypeError falls back to calling the implementation directly.
    try:
        # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type
        # "Type[Any]"; expected "Hashable"  [arg-type]
        promoted = _maybe_promote_cached(
            dtype, fill_value, type(fill_value)  # type: ignore[arg-type]
        )
    except TypeError:
        # if fill_value is not hashable (required for caching)
        promoted = _maybe_promote(dtype, fill_value)

    dtype, fill_value = promoted

    if dtype == _dtype_obj and original_fill is not None:
        # GH#51592 restore our potentially non-canonical fill_value
        fill_value = original_fill

    return dtype, fill_value
|
|
|
@functools.lru_cache(maxsize=128)
def _maybe_promote_cached(dtype, fill_value, fill_value_type):
    """
    Cached wrapper around ``_maybe_promote``.

    ``fill_value_type`` is not used in the body. It is only part of the cache
    key, so that fill values such as 1 and True get separate cache entries.
    """
    return _maybe_promote(dtype, fill_value=fill_value)
|
|
|
def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
|
# The actual implementation of the function, use `maybe_promote` above for
|
# a cached version.
|
if not is_scalar(fill_value):
|
# with object dtype there is nothing to promote, and the user can
|
# pass pretty much any weird fill_value they like
|
if not is_object_dtype(dtype):
|
# with object dtype there is nothing to promote, and the user can
|
# pass pretty much any weird fill_value they like
|
raise ValueError("fill_value must be a scalar")
|
dtype = _dtype_obj
|
return dtype, fill_value
|
|
kinds = ["i", "u", "f", "c", "m", "M"]
|
if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in kinds:
|
dtype = ensure_dtype_can_hold_na(dtype)
|
fv = na_value_for_dtype(dtype)
|
return dtype, fv
|
|
elif isinstance(dtype, CategoricalDtype):
|
if fill_value in dtype.categories or isna(fill_value):
|
return dtype, fill_value
|
else:
|
return object, ensure_object(fill_value)
|
|
elif isna(fill_value):
|
dtype = _dtype_obj
|
if fill_value is None:
|
# but we retain e.g. pd.NA
|
fill_value = np.nan
|
return dtype, fill_value
|
|
# returns tuple of (dtype, fill_value)
|
if issubclass(dtype.type, np.datetime64):
|
inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
|
if inferred == dtype:
|
return dtype, fv
|
|
from pandas.core.arrays import DatetimeArray
|
|
dta = DatetimeArray._from_sequence([], dtype="M8[ns]")
|
try:
|
fv = dta._validate_setitem_value(fill_value)
|
return dta.dtype, fv
|
except (ValueError, TypeError):
|
return _dtype_obj, fill_value
|
|
elif issubclass(dtype.type, np.timedelta64):
|
inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
|
if inferred == dtype:
|
return dtype, fv
|
|
return np.dtype("object"), fill_value
|
|
elif is_float(fill_value):
|
if issubclass(dtype.type, np.bool_):
|
dtype = np.dtype(np.object_)
|
|
elif issubclass(dtype.type, np.integer):
|
dtype = np.dtype(np.float64)
|
|
elif dtype.kind == "f":
|
mst = np.min_scalar_type(fill_value)
|
if mst > dtype:
|
# e.g. mst is np.float64 and dtype is np.float32
|
dtype = mst
|
|
elif dtype.kind == "c":
|
mst = np.min_scalar_type(fill_value)
|
dtype = np.promote_types(dtype, mst)
|
|
elif is_bool(fill_value):
|
if not issubclass(dtype.type, np.bool_):
|
dtype = np.dtype(np.object_)
|
|
elif is_integer(fill_value):
|
if issubclass(dtype.type, np.bool_):
|
dtype = np.dtype(np.object_)
|
|
elif issubclass(dtype.type, np.integer):
|
if not np.can_cast(fill_value, dtype):
|
# upcast to prevent overflow
|
mst = np.min_scalar_type(fill_value)
|
dtype = np.promote_types(dtype, mst)
|
if dtype.kind == "f":
|
# Case where we disagree with numpy
|
dtype = np.dtype(np.object_)
|
|
elif is_complex(fill_value):
|
if issubclass(dtype.type, np.bool_):
|
dtype = np.dtype(np.object_)
|
|
elif issubclass(dtype.type, (np.integer, np.floating)):
|
mst = np.min_scalar_type(fill_value)
|
dtype = np.promote_types(dtype, mst)
|
|
elif dtype.kind == "c":
|
mst = np.min_scalar_type(fill_value)
|
if mst > dtype:
|
# e.g. mst is np.complex128 and dtype is np.complex64
|
dtype = mst
|
|
else:
|
dtype = np.dtype(np.object_)
|
|
# in case we have a string that looked like a number
|
if issubclass(dtype.type, (bytes, str)):
|
dtype = np.dtype(np.object_)
|
|
fill_value = _ensure_dtype_type(fill_value, dtype)
|
return dtype, fill_value
|
|
|
def _ensure_dtype_type(value, dtype: np.dtype):
    """
    Ensure that the given value is an instance of the given dtype's scalar
    type.

    e.g. if the dtype is complex64, the result should be an np.complex64
    instance as opposed to a python complex object.

    Parameters
    ----------
    value : object
    dtype : np.dtype

    Returns
    -------
    object
        ``value`` itself for object dtype, otherwise ``dtype.type(value)``.
    """
    # Object dtype is the exception for which we do _not_ cast to a numpy type.
    # Note: before we get here we have already excluded isna(value)
    return value if dtype == _dtype_obj else dtype.type(value)
|
|
|
def infer_dtype_from(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
    """
    Interpret the dtype from a scalar or array.

    List-like input is handed to ``infer_dtype_from_array``, everything else
    to ``infer_dtype_from_scalar``.

    Parameters
    ----------
    val : object
    pandas_dtype : bool, default False
        whether to infer dtype including pandas extension types.
        If False, scalar/array belongs to pandas extension types is inferred as
        object

    Returns
    -------
    tuple of (dtype, value)
        Whatever the selected helper returns.
    """
    infer = infer_dtype_from_array if is_list_like(val) else infer_dtype_from_scalar
    return infer(val, pandas_dtype=pandas_dtype)
|
|
|
def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
    """
    Interpret the dtype from a scalar.

    Parameters
    ----------
    val : object
        A scalar, or a zero-dimensional ndarray.
    pandas_dtype : bool, default False
        whether to infer dtype including pandas extension types.
        If False, scalar belongs to pandas extension types is inferred as
        object

    Returns
    -------
    tuple of (dtype, value)
        The inferred dtype (object when nothing more specific applies) and
        the value, possibly converted: zero-dim arrays are unpacked, and
        tz-naive datetimes and timedeltas become nanosecond
        datetime64/timedelta64.

    Raises
    ------
    ValueError
        If ``val`` is an ndarray with ``ndim != 0``.
    """
    dtype: DtypeObj = _dtype_obj

    # a 1-element ndarray
    if isinstance(val, np.ndarray):
        if val.ndim != 0:
            msg = "invalid ndarray passed to infer_dtype_from_scalar"
            raise ValueError(msg)

        dtype = val.dtype
        val = lib.item_from_zerodim(val)

    elif isinstance(val, str):
        # If we create an empty array using a string to infer the dtype,
        # NumPy will only allocate one character per entry, so this is kind
        # of bad. Alternately we could use np.repeat instead of np.empty,
        # but then you still don't want things coming out as np.str_!
        dtype = _dtype_obj

    elif isinstance(val, (np.datetime64, dt.datetime)):
        try:
            val = Timestamp(val)
            if val is not NaT:
                val = val.as_unit("ns")
        except OutOfBoundsDatetime:
            return _dtype_obj, val

        if val is NaT or val.tz is None:
            val = val.to_datetime64()
            dtype = val.dtype
            # TODO: test with datetime(2920, 10, 1) based on test_replace_dtypes
        else:
            if pandas_dtype:
                dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
            else:
                # return datetimetz as object
                return _dtype_obj, val

    elif isinstance(val, (np.timedelta64, dt.timedelta)):
        try:
            val = Timedelta(val)
        except (OutOfBoundsTimedelta, OverflowError):
            dtype = _dtype_obj
        else:
            dtype = np.dtype("m8[ns]")
            val = np.timedelta64(val.value, "ns")

    elif is_bool(val):
        dtype = np.dtype(np.bool_)

    elif is_integer(val):
        if isinstance(val, np.integer):
            dtype = np.dtype(type(val))
        else:
            dtype = np.dtype(np.int64)

        try:
            np.array(val, dtype=dtype)
        except OverflowError:
            # The value does not fit; let numpy choose a dtype instead.
            dtype = np.array(val).dtype

    elif is_float(val):
        if isinstance(val, np.floating):
            dtype = np.dtype(type(val))
        else:
            dtype = np.dtype(np.float64)

    elif is_complex(val):
        # np.complex_ was removed in NumPy 2.0; np.complex128 is the type it
        # aliased and exists in every NumPy version.
        dtype = np.dtype(np.complex128)

    elif pandas_dtype:
        if lib.is_period(val):
            dtype = PeriodDtype(freq=val.freq)
        elif lib.is_interval(val):
            subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
            dtype = IntervalDtype(subtype=subtype, closed=val.closed)

    return dtype, val
|
|
|
def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]:
    """
    Convert datetimelike-keyed dicts to a Timestamp-keyed dict.

    Every key goes through ``maybe_box_datetimelike``; values are carried
    over untouched.

    Parameters
    ----------
    d: dict-like object

    Returns
    -------
    dict
    """
    boxed: dict[Scalar, Scalar] = {}
    for key, value in d.items():
        boxed[maybe_box_datetimelike(key)] = value
    return boxed
|
|
|
def infer_dtype_from_array(
    arr, pandas_dtype: bool = False
) -> tuple[DtypeObj, ArrayLike]:
    """
    Infer the dtype from an array.

    Parameters
    ----------
    arr : array
    pandas_dtype : bool, default False
        whether to infer dtype including pandas extension types.
        If False, array belongs to pandas extension types
        is inferred as object

    Returns
    -------
    tuple (numpy-compat/pandas-compat dtype, array)

    Raises
    ------
    TypeError
        If ``arr`` is not list-like.

    Notes
    -----
    if pandas_dtype=False. these infer to numpy dtypes
    exactly with the exception that mixed / object dtypes
    are not coerced by stringifying or conversion

    if pandas_dtype=True. datetime64tz-aware/categorical
    types will retain their character.

    Examples
    --------
    >>> np.asarray([1, '1'])
    array(['1', '1'], dtype='<U21')

    >>> infer_dtype_from_array([1, '1'])
    (dtype('O'), [1, '1'])
    """
    if isinstance(arr, np.ndarray):
        return arr.dtype, arr

    if not is_list_like(arr):
        raise TypeError("'arr' must be list-like")

    if pandas_dtype and is_extension_array_dtype(arr):
        return arr.dtype, arr

    if isinstance(arr, ABCSeries):
        return arr.dtype, np.asarray(arr)

    # don't force numpy coerce with nan's
    kind = lib.infer_dtype(arr, skipna=False)
    if kind in ["string", "bytes", "mixed", "mixed-integer"]:
        # Returned as given so numpy does not stringify or coerce the values.
        return (np.dtype(np.object_), arr)

    as_array = np.asarray(arr)
    return as_array.dtype, as_array
|
|
|
def _maybe_infer_dtype_type(element):
    """
    Try to infer an object's dtype, for use in arithmetic ops.

    Uses `element.dtype` if that's available.
    Otherwise list-like objects are cast to a NumPy array,
    and from there the array's dtype is used.

    Parameters
    ----------
    element : object
        Possibly has a `.dtype` attribute, and possibly is list-like.

    Returns
    -------
    tipo : dtype or None
        None when ``element`` has no ``dtype`` and is not list-like.

    Examples
    --------
    >>> from collections import namedtuple
    >>> Foo = namedtuple("Foo", "dtype")
    >>> _maybe_infer_dtype_type(Foo(np.dtype("i8")))
    dtype('int64')
    """
    if hasattr(element, "dtype"):
        return element.dtype

    if is_list_like(element):
        return np.asarray(element).dtype

    return None
|
|
|
def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:
    """
    Reject string-like dtypes for ``DataFrame.select_dtypes()``; 'object'
    should be used instead.

    Parameters
    ----------
    dtype_set : set

    Raises
    ------
    TypeError
        If ``dtype_set`` contains the numpy bytes or unicode scalar type.
    """
    # error: Argument 1 to <set> has incompatible type "Type[generic]"; expected
    # "Union[dtype[Any], ExtensionDtype, None]"
    # error: Argument 2 to <set> has incompatible type "Type[generic]"; expected
    # "Union[dtype[Any], ExtensionDtype, None]"
    string_types = {
        np.dtype("S").type,  # type: ignore[arg-type]
        np.dtype("<U").type,  # type: ignore[arg-type]
    }
    if dtype_set & string_types:
        raise TypeError("string dtypes are not allowed, use 'object' instead")
|
|
|
def coerce_indexer_dtype(indexer, categories) -> np.ndarray:
    """
    Coerce the indexer input array to the smallest dtype possible.

    The integer width is chosen from ``len(categories)``: the first of
    int8, int16 and int32 whose maximum is strictly greater than that
    length, falling back to int64.
    """
    n_categories = len(categories)

    candidates = (
        (_int8_max, ensure_int8),
        (_int16_max, ensure_int16),
        (_int32_max, ensure_int32),
    )
    for upper_bound, ensure in candidates:
        if n_categories < upper_bound:
            return ensure(indexer)

    return ensure_int64(indexer)
|
|
|
def convert_dtypes(
    input_array: ArrayLike,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
    convert_floating: bool = True,
    infer_objects: bool = False,
    dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable",
) -> DtypeObj:
    """
    Find the best possible dtype for the values, and optionally,
    a dtype supporting ``pd.NA``.

    Only the dtype is computed here; the array itself is not converted.

    Parameters
    ----------
    input_array : ExtensionArray or np.ndarray
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, defaults True
        Whether object dtypes should be converted to ``BooleanDtypes()``.
    convert_floating : bool, defaults True
        Whether, if possible, conversion can be done to floating extension types.
        If `convert_integer` is also True, preference will be give to integer
        dtypes if the floats can be faithfully casted to integers.
    infer_objects : bool, defaults False
        Whether to also infer objects to float/int if possible. Is only hit if the
        object array contains pd.NA.
    dtype_backend : str, default "numpy_nullable"
        Nullable dtype implementation to use.

        * "numpy_nullable" returns numpy-backed nullable types
        * "pyarrow" returns pyarrow-backed nullable types using ``ArrowDtype``

    Returns
    -------
    np.dtype, or ExtensionDtype
    """
    inferred_dtype: str | DtypeObj
    source_dtype = input_array.dtype

    def non_missing_are_integral():
        # TODO: de-dup with maybe_cast_to_integer_array?
        present = input_array[notna(input_array)]
        return (present.astype(int) == present).all()

    any_conversion = (
        convert_string or convert_integer or convert_boolean or convert_floating
    )

    if not (any_conversion and isinstance(input_array, np.ndarray)):
        inferred_dtype = source_dtype
    else:
        source_is_object = is_object_dtype(source_dtype)
        source_is_integer = is_integer_dtype(source_dtype)

        # For object data start from the inferred kind (a str), otherwise
        # from the array's own dtype.
        if source_is_object:
            inferred_dtype = lib.infer_dtype(input_array)
        else:
            inferred_dtype = source_dtype

        if is_string_dtype(inferred_dtype):
            keep_source = not convert_string or inferred_dtype == "bytes"
            inferred_dtype = (
                source_dtype if keep_source else pandas_dtype_func("string")
            )

        if convert_integer:
            nullable_int64 = pandas_dtype_func("Int64")

            if source_is_integer:
                from pandas.core.arrays.integer import INT_STR_TO_DTYPE

                inferred_dtype = INT_STR_TO_DTYPE.get(
                    source_dtype.name, nullable_int64
                )
            elif is_numeric_dtype(source_dtype):
                if non_missing_are_integral():
                    inferred_dtype = nullable_int64
                else:
                    inferred_dtype = source_dtype
            elif (
                infer_objects
                and source_is_object
                and isinstance(inferred_dtype, str)
                and inferred_dtype == "integer"
            ):
                inferred_dtype = nullable_int64

        if convert_floating:
            if not source_is_integer and is_numeric_dtype(source_dtype):
                from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE

                nullable_float: DtypeObj = FLOAT_STR_TO_DTYPE.get(
                    source_dtype.name, pandas_dtype_func("Float64")
                )
                # if we could also convert to integer, check if all floats
                # are actually integers
                if convert_integer and non_missing_are_integral():
                    inferred_dtype = pandas_dtype_func("Int64")
                else:
                    inferred_dtype = nullable_float
            elif (
                infer_objects
                and source_is_object
                and isinstance(inferred_dtype, str)
                and inferred_dtype == "mixed-integer-float"
            ):
                inferred_dtype = pandas_dtype_func("Float64")

        if convert_boolean:
            if is_bool_dtype(source_dtype) or (
                isinstance(inferred_dtype, str) and inferred_dtype == "boolean"
            ):
                inferred_dtype = pandas_dtype_func("boolean")

        if isinstance(inferred_dtype, str):
            # If we couldn't do anything else, then we retain the dtype
            inferred_dtype = source_dtype

    if dtype_backend == "pyarrow":
        from pandas.core.arrays.arrow.array import to_pyarrow_type
        from pandas.core.arrays.arrow.dtype import ArrowDtype
        from pandas.core.arrays.string_ import StringDtype

        # Pick the dtype that to_pyarrow_type is asked to translate.
        if isinstance(inferred_dtype, PandasExtensionDtype):
            base_dtype = inferred_dtype.base
        elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)):
            base_dtype = inferred_dtype.numpy_dtype
        elif isinstance(inferred_dtype, StringDtype):
            base_dtype = np.dtype(str)
        else:
            # error: Incompatible types in assignment (expression has type
            # "Union[str, Any, dtype[Any], ExtensionDtype]",
            # variable has type "Union[dtype[Any], ExtensionDtype, None]")
            base_dtype = inferred_dtype  # type: ignore[assignment]

        pa_type = to_pyarrow_type(base_dtype)
        if pa_type is not None:
            inferred_dtype = ArrowDtype(pa_type)

    # error: Incompatible return value type (got "Union[str, Union[dtype[Any],
    # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]")
    return inferred_dtype  # type: ignore[return-value]
|
|
|
def maybe_infer_to_datetimelike(
    value: npt.NDArray[np.object_],
) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray:
    """
    Attempt datetime-like inference on a 1-dimensional object-dtype ndarray.

    Non-empty input is handed to ``lib.maybe_convert_objects`` with period,
    interval, timedelta and datetime conversion switched on and numeric
    conversion switched off.  Empty input is returned as-is.

    Parameters
    ----------
    value : np.ndarray[object]
        One-dimensional array to inspect.

    Returns
    -------
    np.ndarray, DatetimeArray, TimedeltaArray, PeriodArray, or IntervalArray

    Raises
    ------
    TypeError
        If ``value`` is not an ndarray of object dtype.
    ValueError
        If ``value`` is not 1-dimensional.
    """
    is_object_ndarray = isinstance(value, np.ndarray) and value.dtype == object
    if not is_object_ndarray:
        # Caller is responsible for passing only ndarray[object]
        raise TypeError(type(value))  # pragma: no cover
    if value.ndim != 1:
        # Caller is responsible
        raise ValueError(value.ndim)  # pragma: no cover

    if len(value) == 0:
        # Nothing to infer from.
        return value

    converted = lib.maybe_convert_objects(
        value,
        # Here we do not convert numeric dtypes, as if we wanted that,
        #  numpy would have done it for us.
        convert_numeric=False,
        convert_period=True,
        convert_interval=True,
        convert_timedelta=True,
        convert_datetime=True,
        dtype_if_all_nat=np.dtype("M8[ns]"),
    )
    # error: Incompatible return value type (got "Union[ExtensionArray,
    # ndarray[Any, Any]]", expected "Union[ndarray[Any, Any], DatetimeArray,
    # TimedeltaArray, PeriodArray, IntervalArray]")
    return converted  # type: ignore[return-value]
|
|
|
def maybe_cast_to_datetime(
    value: np.ndarray | list, dtype: np.dtype
) -> ExtensionArray | np.ndarray:
    """
    Construct a TimedeltaArray or DatetimeArray from a list-like.

    The dtype is first checked by ``_ensure_nanosecond_dtype``; the values are
    then passed to ``TimedeltaArray._from_sequence`` for timedelta64 dtypes and
    to ``DatetimeArray._from_sequence`` otherwise.

    Caller is responsible for handling ExtensionDtype cases and non dt64/td64
    cases.

    Raises
    ------
    TypeError
        If ``value`` is not list-like.
    ValueError
        Re-raised from ``DatetimeArray._from_sequence``; the tz-aware-data /
        tz-naive-dtype mismatch gets a Series-specific message.
    """
    from pandas.core.arrays.datetimes import DatetimeArray
    from pandas.core.arrays.timedeltas import TimedeltaArray

    assert dtype.kind in ["m", "M"]
    if not is_list_like(value):
        raise TypeError("value must be listlike")

    # TODO: _from_sequence would raise ValueError in cases where
    #  _ensure_nanosecond_dtype raises TypeError
    _ensure_nanosecond_dtype(dtype)

    if is_timedelta64_dtype(dtype):
        return TimedeltaArray._from_sequence(value, dtype=dtype)

    try:
        return DatetimeArray._from_sequence(value, dtype=dtype)
    except ValueError as err:
        if "cannot supply both a tz and a timezone-naive dtype" not in str(err):
            raise
        # We can give a Series-specific exception message.
        raise ValueError(
            "Cannot convert timezone-aware data to "
            "timezone-naive dtype. Use "
            "pd.Series(values).dt.tz_localize(None) instead."
        ) from err
|
|
|
def _ensure_nanosecond_dtype(dtype: DtypeObj) -> None:
|
"""
|
Convert dtypes with granularity less than nanosecond to nanosecond
|
|
>>> _ensure_nanosecond_dtype(np.dtype("M8[us]"))
|
|
>>> _ensure_nanosecond_dtype(np.dtype("M8[D]"))
|
Traceback (most recent call last):
|
...
|
TypeError: dtype=datetime64[D] is not supported. Supported resolutions are 's', 'ms', 'us', and 'ns'
|
|
>>> _ensure_nanosecond_dtype(np.dtype("m8[ps]"))
|
Traceback (most recent call last):
|
...
|
TypeError: dtype=timedelta64[ps] is not supported. Supported resolutions are 's', 'ms', 'us', and 'ns'
|
""" # noqa:E501
|
msg = (
|
f"The '{dtype.name}' dtype has no unit. "
|
f"Please pass in '{dtype.name}[ns]' instead."
|
)
|
|
# unpack e.g. SparseDtype
|
dtype = getattr(dtype, "subtype", dtype)
|
|
if not isinstance(dtype, np.dtype):
|
# i.e. datetime64tz
|
pass
|
|
elif dtype.kind in ["m", "M"]:
|
reso = get_unit_from_dtype(dtype)
|
if not is_supported_unit(reso):
|
# pre-2.0 we would silently swap in nanos for lower-resolutions,
|
# raise for above-nano resolutions
|
if dtype.name in ["datetime64", "timedelta64"]:
|
raise ValueError(msg)
|
# TODO: ValueError or TypeError? existing test
|
# test_constructor_generic_timestamp_bad_frequency expects TypeError
|
raise TypeError(
|
f"dtype={dtype} is not supported. Supported resolutions are 's', "
|
"'ms', 'us', and 'ns'"
|
)
|
|
|
# TODO: other value-dependent functions to standardize here include
|
# Index._find_common_type_compat
|
def find_result_type(left: ArrayLike, right: Any) -> DtypeObj:
    """
    Determine the dtype resulting from an operation between two objects.

    Unlike find_common_type, this inspects the objects themselves rather than
    only their dtypes, which matters when ``right`` has no ``dtype`` at all.

    Parameters
    ----------
    left : np.ndarray or ExtensionArray
    right : Any

    Returns
    -------
    np.dtype or ExtensionDtype

    See Also
    --------
    find_common_type
    numpy.result_type
    """
    if (
        isinstance(left, np.ndarray)
        and left.dtype.kind in ("i", "u", "c")
        and (lib.is_integer(right) or lib.is_float(right))
    ):
        # e.g. with int8 dtype and right=512, we want to end up with
        # np.int16, whereas infer_dtype_from(512) gives np.int64,
        # which will make us upcast too far.
        if lib.is_float(right) and right.is_integer():
            # left is never float-kind inside this branch, so an
            # integer-valued float can be treated as a plain int.
            right = int(right)

        return np.result_type(left, right)

    if is_valid_na_for_dtype(right, left.dtype):
        # e.g. IntervalDtype[int] and None/np.nan
        return ensure_dtype_can_hold_na(left.dtype)

    right_dtype, _ = infer_dtype_from(right, pandas_dtype=True)
    return find_common_type([left.dtype, right_dtype])
|
|
|
def common_dtype_categorical_compat(
    objs: list[Index | ArrayLike], dtype: DtypeObj
) -> DtypeObj:
    """
    Adjust a find_common_type result for Categoricals that contain NAs.

    When ``dtype`` is a numpy integer dtype and any categorical-dtyped entry
    of ``objs`` holds missing values, float64 is returned instead; in every
    other case ``dtype`` is returned untouched.

    Parameters
    ----------
    objs : list[np.ndarray | ExtensionArray | Index]
    dtype : np.dtype or ExtensionDtype

    Returns
    -------
    np.dtype or ExtensionDtype
    """
    # GH#38240

    # TODO: more generally, could do `not can_hold_na(dtype)`
    is_numpy_integer = isinstance(dtype, np.dtype) and dtype.kind in ["i", "u"]
    if not is_numpy_integer:
        return dtype

    for candidate in objs:
        # Go through getattr so that we don't accidentally allow e.g. a
        # plain "categorical" str here
        if not isinstance(getattr(candidate, "dtype", None), CategoricalDtype):
            continue

        if isinstance(candidate, ABCIndex):
            # This check may already be cached
            contains_na = candidate.hasnans
        else:
            # Categorical
            contains_na = cast("Categorical", candidate)._hasna

        if contains_na:
            # see test_union_int_categorical_with_nan
            return np.dtype(np.float64)

    return dtype
|
|
|
@overload
def find_common_type(types: list[np.dtype]) -> np.dtype:
    ...


@overload
def find_common_type(types: list[ExtensionDtype]) -> DtypeObj:
    ...


@overload
def find_common_type(types: list[DtypeObj]) -> DtypeObj:
    ...


def find_common_type(types):
    """
    Find a common data type among the given dtypes.

    Parameters
    ----------
    types : list of dtypes

    Returns
    -------
    pandas extension or numpy dtype

    Raises
    ------
    ValueError
        If ``types`` is empty.

    See Also
    --------
    numpy.result_type
    """
    if not types:
        raise ValueError("no types given")

    first = types[0]

    # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2)
    # => object
    if lib.dtypes_all_equal(list(types)):
        return first

    # get unique types (dict.fromkeys is used as order-preserving set())
    types = list(dict.fromkeys(types).keys())

    if any(isinstance(t, ExtensionDtype) for t in types):
        # The first extension dtype that knows a common dtype wins.
        for t in types:
            if isinstance(t, ExtensionDtype):
                res = t._get_common_dtype(types)
                if res is not None:
                    return res
        return np.dtype("object")

    # take lowest unit
    if all(is_datetime64_dtype(t) for t in types):
        return np.dtype(max(types))
    if all(is_timedelta64_dtype(t) for t in types):
        return np.dtype(max(types))

    # don't mix bool / int or float or complex
    # this is different from numpy, which casts bool with float/int as int
    has_bools = any(is_bool_dtype(t) for t in types)
    if has_bools:
        for t in types:
            if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t):
                return np.dtype("object")

    # np.find_common_type is deprecated since NumPy 1.25 and removed in
    # NumPy 2.0, so promote with np.result_type instead.
    try:
        common = np.result_type(*types)
    except TypeError:
        # No valid promotion exists between these dtypes.
        return np.dtype("object")

    if common.kind in "mMSU":
        # np.find_common_type fell back to object whenever distinct dtypes
        # involved datetimes, timedeltas or strings; np.result_type would
        # instead promote them (e.g. int64 + "U3" -> "U21"), so keep the
        # object fallback explicitly.
        return np.dtype("object")
    return common
|
|
|
def construct_2d_arraylike_from_scalar(
    value: Scalar, length: int, width: int, dtype: np.dtype, copy: bool
) -> np.ndarray:
    """
    Build a ``(length, width)`` ndarray filled with a single scalar.

    Raises
    ------
    TypeError
        If numpy cannot coerce ``value`` to ``dtype``.
    ValueError
        If ``value`` coerces to something that is not zero-dimensional.
    """
    target_shape = (length, width)

    if dtype.kind in ["m", "M"]:
        value = _maybe_box_and_unbox_datetimelike(value, dtype)
    elif dtype == _dtype_obj and isinstance(value, (np.timedelta64, np.datetime64)):
        # calling np.array below would cast to pytimedelta/pydatetime
        filled = np.empty(target_shape, dtype=object)
        filled.fill(value)
        return filled

    # Attempt to coerce to a numpy array
    try:
        scalar_arr = np.array(value, dtype=dtype, copy=copy)
    except (ValueError, TypeError) as err:
        raise TypeError(
            f"DataFrame constructor called with incompatible data and dtype: {err}"
        ) from err

    if scalar_arr.ndim != 0:
        raise ValueError("DataFrame constructor not properly called!")

    return np.full(target_shape, scalar_arr)
|
|
|
def construct_1d_arraylike_from_scalar(
    value: Scalar, length: int, dtype: DtypeObj | None
) -> ArrayLike:
    """
    Create a 1-dimensional array of the given length filled with one scalar.

    Parameters
    ----------
    value : scalar value
    length : int
    dtype : pandas_dtype or np.dtype, or None
        When None, the dtype is inferred from ``value``.

    Returns
    -------
    np.ndarray / pandas type of length, filled with value
    """
    if dtype is None:
        try:
            dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True)
        except OutOfBoundsDatetime:
            dtype = _dtype_obj

    if isinstance(dtype, ExtensionDtype):
        array_cls = dtype.construct_array_type()
        # Build a one-element (or empty) array and stretch it to length.
        seed = [] if length == 0 else [value]
        return array_cls._from_sequence(seed, dtype=dtype).repeat(length)

    if length and is_integer_dtype(dtype) and isna(value):
        # coerce if we have nan for an integer dtype
        dtype = np.dtype("float64")
    elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
        # coerce to object dtype so that numpy takes our string as a
        # scalar value
        dtype = np.dtype("object")
        if not isna(value):
            value = ensure_str(value)
    elif dtype.kind in ["M", "m"]:
        value = _maybe_box_and_unbox_datetimelike(value, dtype)

    result = np.empty(length, dtype=dtype)
    if length:
        # GH 47391: numpy > 1.24 will raise filling np.nan into int dtypes
        result.fill(value)
    return result
|
|
|
def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj):
    """
    Box ``datetime.datetime`` values, then unbox the result for ``dtype``.

    Caller is responsible for checking dtype.kind in ["m", "M"].
    """
    # Only datetime.datetime instances are boxed; we dont want to box dt64,
    # in particular datetime64("NaT")
    boxed = (
        maybe_box_datetimelike(value, dtype)
        if isinstance(value, dt.datetime)
        else value
    )
    return _maybe_unbox_datetimelike(boxed, dtype)
|
|
|
def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
    """
    Pack a list-like into a 1-dimensional object-dtype ndarray.

    Each item of ``values`` becomes one element of the result, including
    items that are themselves list-like.

    Parameters
    ----------
    values : any iterable which has a len()

    Raises
    ------
    TypeError
        * If `values` does not have a len()

    Returns
    -------
    1-dimensional numpy array of dtype object
    """
    # numpy would read nested lists as extra dimensions if asked to build
    # the array directly, so allocate the 1D container first and then
    # assign into it.
    n_items = len(values)
    packed = np.empty(n_items, dtype="object")
    packed[:] = values
    return packed
|
|
|
def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray:
    """
    Takes any dtype and returns the casted version, raising for when data is
    incompatible with integer/unsigned integer dtypes.

    Parameters
    ----------
    arr : np.ndarray or list
        The array to cast.
    dtype : np.dtype
        The integer dtype to cast the array to.

    Returns
    -------
    ndarray
        Array of integer or unsigned integer dtype.

    Raises
    ------
    OverflowError
        The dtype is incompatible with the data, e.g. negative values for an
        unsigned dtype.
    IntCastingNaNError
        The data are floats containing non-finite values (NA or inf).
    ValueError
        Loss of precision has occurred during casting.
    TypeError
        The data are datetime64/timedelta64 values.

    Examples
    --------
    If you try to coerce negative values to unsigned integers, it raises:

    >>> pd.Series([-1], dtype="uint64")
    Traceback (most recent call last):
        ...
    OverflowError: Trying to coerce negative values to unsigned integers

    Also, if you try to coerce float values to integers, it raises:

    >>> maybe_cast_to_integer_array([1, 2, 3.5], dtype=np.dtype("int64"))
    Traceback (most recent call last):
        ...
    ValueError: Trying to coerce float values to integers
    """
    assert is_integer_dtype(dtype)

    try:
        if not isinstance(arr, np.ndarray):
            with warnings.catch_warnings():
                # We already disallow dtype=uint w/ negative numbers
                # (test_constructor_coercion_signed_to_unsigned) so safe to ignore.
                warnings.filterwarnings(
                    "ignore",
                    "NumPy will stop allowing conversion of out-of-bound Python int",
                    DeprecationWarning,
                )
                # np.asarray copies only when needed.  The former
                # np.array(..., copy=False) spelling raises on NumPy >= 2.0
                # whenever a copy cannot be avoided, which is always the case
                # when converting a list.
                casted = np.asarray(arr, dtype=dtype)
        else:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=RuntimeWarning)
                casted = arr.astype(dtype, copy=False)
    except OverflowError as err:
        raise OverflowError(
            "The elements provided in the data cannot all be "
            f"casted to the dtype {dtype}"
        ) from err

    if isinstance(arr, np.ndarray) and arr.dtype == dtype:
        # avoid expensive array_equal check
        return casted

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        if np.array_equal(arr, casted):
            return casted

    # The cast was not lossless; everything below only decides which
    # exception describes the failure best.

    # We do this casting to allow for proper
    # data and dtype checking.
    #
    # We didn't do this earlier because NumPy
    # doesn't handle `uint64` correctly.
    arr = np.asarray(arr)

    if np.issubdtype(arr.dtype, str):
        # Strings never compare equal to integers above, so compare their
        # round-tripped text instead.
        if (casted.astype(str) == arr).all():
            return casted
        raise ValueError(f"string values cannot be losslessly cast to {dtype}")

    if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
        raise OverflowError("Trying to coerce negative values to unsigned integers")

    if is_float_dtype(arr.dtype):
        if not np.isfinite(arr).all():
            raise IntCastingNaNError(
                "Cannot convert non-finite values (NA or inf) to integer"
            )
        raise ValueError("Trying to coerce float values to integers")
    if is_object_dtype(arr.dtype):
        raise ValueError("Trying to coerce float values to integers")

    if casted.dtype < arr.dtype:
        # GH#41734 e.g. [1, 200, 923442] and dtype="int8" -> overflows
        raise ValueError(
            f"Values are too large to be losslessly converted to {dtype}. "
            f"To cast anyway, use pd.Series(values).astype({dtype})"
        )

    if arr.dtype.kind in ["m", "M"]:
        # test_constructor_maskedarray_nonfloat
        raise TypeError(
            f"Constructing a Series or DataFrame from {arr.dtype} values and "
            f"dtype={dtype} is not supported. Use values.view({dtype}) instead."
        )

    # No known cases that get here, but raising explicitly to cover our bases.
    raise ValueError(f"values cannot be losslessly cast to {dtype}")
|
|
|
def can_hold_element(arr: ArrayLike, element: Any) -> bool:
    """
    Report whether ``element`` can be set into ``arr`` inplace.

    Plain numpy dtypes are checked with np_can_hold_element.  Period,
    Interval, tz-aware datetime and numpy datetime64/timedelta64 backed arrays
    are checked through their own ``_validate_setitem_value``.  Any other
    extension array is always reported as able to hold the element.

    Parameters
    ----------
    arr : np.ndarray or ExtensionArray
    element : Any

    Returns
    -------
    bool
    """
    dtype = arr.dtype

    is_plain_numpy = isinstance(dtype, np.dtype) and dtype.kind not in ["m", "M"]
    if is_plain_numpy:
        try:
            np_can_hold_element(dtype, element)
        except (TypeError, LossySetitemError):
            return False
        return True

    if not isinstance(dtype, (PeriodDtype, IntervalDtype, DatetimeTZDtype, np.dtype)):
        # This is technically incorrect, but maintains the behavior of
        # ExtensionBlock._can_hold_element
        return True

    # np.dtype here catches datetime64ns and timedelta64ns; we assume
    # in this case that we have DatetimeArray/TimedeltaArray
    datetimelike_arr = cast(
        "PeriodArray | DatetimeArray | TimedeltaArray | IntervalArray", arr
    )
    try:
        datetimelike_arr._validate_setitem_value(element)
    except (ValueError, TypeError):
        # TODO: re-use _catch_deprecated_value_error to ensure we are
        #  strict about what exceptions we allow through here.
        return False
    return True
|
|
|
def np_can_hold_element(dtype: np.dtype, element: Any) -> Any:
    """
    Raise if we cannot losslessly set this element into an ndarray with this dtype.

    Specifically about places where we disagree with numpy.  i.e. there are
    cases where numpy will raise in doing the setitem that we do not check
    for here, e.g. setting str "X" into a numeric ndarray.

    The checks are grouped by ``dtype.kind``: object first, then integer
    ("i"/"u"), float ("f"), complex ("c"), bool ("b"), bytes ("S") and
    void ("V").

    Parameters
    ----------
    dtype : np.dtype
        dtype of the ndarray the element would be set into.
    element : Any
        Scalar, range, ndarray or array-like candidate value.

    Returns
    -------
    Any
        The element, potentially cast to the dtype.

    Raises
    ------
    LossySetitemError
        If we cannot losslessly store this element with this dtype.
    NotImplementedError
        If ``dtype.kind`` is not one of the kinds handled here.
    """
    # object dtype: the element is returned as-is, no checks are performed
    if dtype == _dtype_obj:
        return element

    # NOTE(review): tipo is presumably the dtype of an array-like element and
    # None for scalars -- confirm against _maybe_infer_dtype_type
    tipo = _maybe_infer_dtype_type(element)

    # ---- integer / unsigned integer target ----
    if dtype.kind in ["i", "u"]:
        if isinstance(element, range):
            if _dtype_can_hold_range(element, dtype):
                return element
            raise LossySetitemError

        if is_integer(element) or (is_float(element) and element.is_integer()):
            # e.g. test_setitem_series_int8 if we have a python int 1
            #  tipo may be np.int32, despite the fact that it will fit
            #  in smaller int dtypes.
            info = np.iinfo(dtype)
            if info.min <= element <= info.max:
                return dtype.type(element)
            raise LossySetitemError

        if tipo is not None:
            if tipo.kind not in ["i", "u"]:
                if isinstance(element, np.ndarray) and element.dtype.kind == "f":
                    # If all can be losslessly cast to integers, then we can hold them
                    with np.errstate(invalid="ignore"):
                        # We check afterwards if cast was losslessly, so no need to show
                        # the warning
                        casted = element.astype(dtype)
                    comp = casted == element
                    if comp.all():
                        # Return the casted values bc they can be passed to
                        #  np.putmask, whereas the raw values cannot.
                        #  see TestSetitemFloatNDarrayIntoIntegerSeries
                        return casted
                    raise LossySetitemError

                # Anything other than integer we cannot hold
                raise LossySetitemError
            if (
                dtype.kind == "u"
                and isinstance(element, np.ndarray)
                and element.dtype.kind == "i"
            ):
                # signed ndarray into unsigned target: accept only if the
                # values survive the round trip
                # see test_where_uint64
                casted = element.astype(dtype)
                if (casted == element).all():
                    # TODO: faster to check (element >=0).all()?  potential
                    #  itemsize issues there?
                    return casted
                raise LossySetitemError
            if dtype.itemsize < tipo.itemsize:
                # element's integer type is wider than the target
                raise LossySetitemError
            if not isinstance(tipo, np.dtype):
                # i.e. nullable IntegerDtype; we can put this into an ndarray
                #  losslessly iff it has no NAs
                if element._hasna:
                    raise LossySetitemError
                return element

            return element

        raise LossySetitemError

    # ---- float target ----
    if dtype.kind == "f":
        if lib.is_integer(element) or lib.is_float(element):
            casted = dtype.type(element)
            # NaN never compares equal to itself, hence the separate check
            if np.isnan(casted) or casted == element:
                return casted
            # otherwise e.g. overflow see TestCoercionFloat32
            raise LossySetitemError

        if tipo is not None:
            # TODO: itemsize check?
            if tipo.kind not in ["f", "i", "u"]:
                # Anything other than float/integer we cannot hold
                raise LossySetitemError
            if not isinstance(tipo, np.dtype):
                # i.e. nullable IntegerDtype or FloatingDtype;
                #  we can put this into an ndarray losslessly iff it has no NAs
                if element._hasna:
                    raise LossySetitemError
                return element
            elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind:
                # wider or different-kind source: ndarrays must round-trip
                # exactly; anything else falls through and is returned as-is
                if isinstance(element, np.ndarray):
                    # e.g. TestDataFrameIndexingWhere::test_where_alignment
                    casted = element.astype(dtype)
                    if np.array_equal(casted, element, equal_nan=True):
                        return casted
                    raise LossySetitemError

            return element

        raise LossySetitemError

    # ---- complex target ----
    if dtype.kind == "c":
        if lib.is_integer(element) or lib.is_complex(element) or lib.is_float(element):
            if np.isnan(element):
                # see test_where_complex GH#6345
                return dtype.type(element)

            with warnings.catch_warnings():
                # silence any warning emitted by the cast; losslessness is
                # verified by the comparison below
                warnings.filterwarnings("ignore")
                casted = dtype.type(element)
            if casted == element:
                return casted
            # otherwise e.g. overflow see test_32878_complex_itemsize
            raise LossySetitemError

        if tipo is not None:
            if tipo.kind in ["c", "f", "i", "u"]:
                return element
            raise LossySetitemError
        raise LossySetitemError

    # ---- bool target ----
    if dtype.kind == "b":
        if tipo is not None:
            if tipo.kind == "b":
                if not isinstance(tipo, np.dtype):
                    # i.e. we have a BooleanArray
                    if element._hasna:
                        # i.e. there are pd.NA elements
                        raise LossySetitemError
                return element
            raise LossySetitemError
        if lib.is_bool(element):
            return element
        raise LossySetitemError

    # ---- fixed-width bytes target ----
    if dtype.kind == "S":
        # TODO: test tests.frame.methods.test_replace tests get here,
        #  need more targeted tests.  xref phofl has a PR about this
        if tipo is not None:
            if tipo.kind == "S" and tipo.itemsize <= dtype.itemsize:
                return element
            raise LossySetitemError
        if isinstance(element, bytes) and len(element) <= dtype.itemsize:
            return element
        raise LossySetitemError

    if dtype.kind == "V":
        # i.e. np.void, which cannot hold _anything_
        raise LossySetitemError

    # any dtype kind not handled above
    raise NotImplementedError(dtype)
|
|
|
def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool:
|
"""
|
_maybe_infer_dtype_type infers to int64 (and float64 for very large endpoints),
|
but in many cases a range can be held by a smaller integer dtype.
|
Check if this is one of those cases.
|
"""
|
if not len(rng):
|
return True
|
return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype)
|