fume-manage-python.git

"""
Testing that we work in the downstream packages
"""
import importlib
import subprocess
import sys
 
import numpy as np
import pytest
 
from pandas.errors import IntCastingNaNError
import pandas.util._test_decorators as td
 
import pandas as pd
from pandas import (
    DataFrame,
    Series,
)
import pandas._testing as tm
 
 
def import_module(name):
    # we *only* want to skip if the module is truly not available
    # and NOT just an actual import error because of pandas changes
 
    try:
        return importlib.import_module(name)
    except ModuleNotFoundError:
        pytest.skip(f"skipping as {name} not available")
 
 
@pytest.fixture
def df():
    return DataFrame({"A": [1, 2, 3]})
 
 
def test_dask(df):
    # dask sets "compute.use_numexpr" to False, so catch the current value
    # and ensure to reset it afterwards to avoid impacting other tests
    olduse = pd.get_option("compute.use_numexpr")
 
    try:
        toolz = import_module("toolz")  # noqa:F841
        dask = import_module("dask")  # noqa:F841
 
        import dask.dataframe as dd
 
        ddf = dd.from_pandas(df, npartitions=3)
        assert ddf.A is not None
        assert ddf.compute() is not None
    finally:
        pd.set_option("compute.use_numexpr", olduse)
 
 
def test_dask_ufunc():
    # dask sets "compute.use_numexpr" to False, so catch the current value
    # and ensure to reset it afterwards to avoid impacting other tests
    olduse = pd.get_option("compute.use_numexpr")
 
    try:
        dask = import_module("dask")  # noqa:F841
        import dask.array as da
        import dask.dataframe as dd
 
        s = Series([1.5, 2.3, 3.7, 4.0])
        ds = dd.from_pandas(s, npartitions=2)
 
        result = da.fix(ds).compute()
        expected = np.fix(s)
        tm.assert_series_equal(result, expected)
    finally:
        pd.set_option("compute.use_numexpr", olduse)
 
 
@td.skip_if_no("dask")
def test_construct_dask_float_array_int_dtype_match_ndarray():
    # GH#40110 make sure we treat a float-dtype dask array with the same
    #  rules we would for an ndarray
    import dask.dataframe as dd
 
    arr = np.array([1, 2.5, 3])
    darr = dd.from_array(arr)
 
    res = Series(darr)
    expected = Series(arr)
    tm.assert_series_equal(res, expected)
 
    # GH#49599 in 2.0 we raise instead of silently ignoring the dtype
    msg = "Trying to coerce float values to integers"
    with pytest.raises(ValueError, match=msg):
        Series(darr, dtype="i8")
 
    msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
    arr[2] = np.nan
    with pytest.raises(IntCastingNaNError, match=msg):
        Series(darr, dtype="i8")
    # which is the same as we get with a numpy input
    with pytest.raises(IntCastingNaNError, match=msg):
        Series(arr, dtype="i8")
 
 
def test_xarray(df):
    xarray = import_module("xarray")  # noqa:F841
 
    assert df.to_xarray() is not None
 
 
@td.skip_if_no("cftime")
@td.skip_if_no("xarray", "0.21.0")
def test_xarray_cftimeindex_nearest():
    # https://github.com/pydata/xarray/issues/3751
    import cftime
    import xarray
 
    times = xarray.cftime_range("0001", periods=2)
    key = cftime.DatetimeGregorian(2000, 1, 1)
    result = times.get_indexer([key], method="nearest")
    expected = 1
    assert result == expected
 
 
def test_oo_optimizable():
    # GH 21071
    subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"])
 
 
def test_oo_optimized_datetime_index_unpickle():
    # GH 42866
    subprocess.check_call(
        [
            sys.executable,
            "-OO",
            "-c",
            (
                "import pandas as pd, pickle; "
                "pickle.loads(pickle.dumps(pd.date_range('2021-01-01', periods=1)))"
            ),
        ]
    )
 
 
@pytest.mark.network
@tm.network
def test_statsmodels():
    statsmodels = import_module("statsmodels")  # noqa:F841
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
 
    df = sm.datasets.get_rdataset("Guerry", "HistData").data
    smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=df).fit()
 
 
def test_scikit_learn():
    sklearn = import_module("sklearn")  # noqa:F841
    from sklearn import (
        datasets,
        svm,
    )
 
    digits = datasets.load_digits()
    clf = svm.SVC(gamma=0.001, C=100.0)
    clf.fit(digits.data[:-1], digits.target[:-1])
    clf.predict(digits.data[-1:])
 
 
@pytest.mark.network
@tm.network
def test_seaborn():
    seaborn = import_module("seaborn")
    tips = seaborn.load_dataset("tips")
    seaborn.stripplot(x="day", y="total_bill", data=tips)
 
 
def test_pandas_gbq():
    # Older versions import from non-public, non-existent pandas funcs
    pytest.importorskip("pandas_gbq", minversion="0.10.0")
    pandas_gbq = import_module("pandas_gbq")  # noqa:F841
 
 
@pytest.mark.network
@tm.network
@pytest.mark.xfail(
    raises=ValueError,
    reason="The Quandl API key must be provided either through the api_key "
    "variable or through the environmental variable QUANDL_API_KEY",
)
def test_pandas_datareader():
    pandas_datareader = import_module("pandas_datareader")
    pandas_datareader.DataReader("F", "quandl", "2017-01-01", "2017-02-01")
 
 
def test_pyarrow(df):
    pyarrow = import_module("pyarrow")
    table = pyarrow.Table.from_pandas(df)
    result = table.to_pandas()
    tm.assert_frame_equal(result, df)
 
 
def test_yaml_dump(df):
    # GH#42748
    yaml = import_module("yaml")
 
    dumped = yaml.dump(df)
 
    loaded = yaml.load(dumped, Loader=yaml.Loader)
    tm.assert_frame_equal(df, loaded)
 
    loaded2 = yaml.load(dumped, Loader=yaml.UnsafeLoader)
    tm.assert_frame_equal(df, loaded2)
 
 
def test_missing_required_dependency():
    # GH 23868
    # To ensure proper isolation, we pass these flags
    # -S : disable site-packages
    # -s : disable user site-packages
    # -E : disable PYTHON* env vars, especially PYTHONPATH
    # https://github.com/MacPython/pandas-wheels/pull/50
 
    pyexe = sys.executable.replace("\\", "/")
 
    # We skip this test if pandas is installed as a site package. We first
    # import the package normally and check the path to the module before
    # executing the test which imports pandas with site packages disabled.
    call = [pyexe, "-c", "import pandas;print(pandas.__file__)"]
    output = subprocess.check_output(call).decode()
    if "site-packages" in output:
        pytest.skip("pandas installed as site package")
 
    # This test will fail if pandas is installed as a site package. The flags
    # prevent pandas being imported and the test will report Failed: DID NOT
    # RAISE <class 'subprocess.CalledProcessError'>
    call = [pyexe, "-sSE", "-c", "import pandas"]
 
    msg = (
        rf"Command '\['{pyexe}', '-sSE', '-c', 'import pandas'\]' "
        "returned non-zero exit status 1."
    )
 
    with pytest.raises(subprocess.CalledProcessError, match=msg) as exc:
        subprocess.check_output(call, stderr=subprocess.STDOUT)
 
    output = exc.value.stdout.decode()
    for name in ["numpy", "pytz", "dateutil"]:
        assert name in output
 
 
def test_frame_setitem_dask_array_into_new_col():
    # GH#47128
 
    # dask sets "compute.use_numexpr" to False, so catch the current value
    # and ensure to reset it afterwards to avoid impacting other tests
    olduse = pd.get_option("compute.use_numexpr")
 
    try:
        dask = import_module("dask")  # noqa:F841
 
        import dask.array as da
 
        dda = da.array([1, 2])
        df = DataFrame({"a": ["a", "b"]})
        df["b"] = dda
        df["c"] = dda
        df.loc[[False, True], "b"] = 100
        result = df.loc[[1], :]
        expected = DataFrame({"a": ["b"], "b": [100], "c": [2]}, index=[1])
        tm.assert_frame_equal(result, expected)
    finally:
        pd.set_option("compute.use_numexpr", olduse)