"""
|
self-contained to write legacy storage pickle files
|
|
To use this script. Create an environment where you want
|
generate pickles, say its for 0.20.3, with your pandas clone
|
in ~/pandas
|
|
. activate pandas_0.20.3
|
cd ~/pandas/pandas
|
|
$ python -m tests.io.generate_legacy_storage_files \
|
tests/io/data/legacy_pickle/0.20.3/ pickle
|
|
This script generates a storage file for the current arch, system,
|
and python version
|
pandas version: 0.20.3
|
output dir : pandas/pandas/tests/io/data/legacy_pickle/0.20.3/
|
storage format: pickle
|
created pickle file: 0.20.3_x86_64_darwin_3.5.2.pickle
|
|
The idea here is you are using the *current* version of the
|
generate_legacy_storage_files with an *older* version of pandas to
|
generate a pickle file. We will then check this file into a current
|
branch, and test using test_pickle.py. This will load the *older*
|
pickles and test versus the current data that is generated
|
(with main). These are then compared.
|
|
If we have cases where we changed the signature (e.g. we renamed
|
offset -> freq in Timestamp). Then we have to conditionally execute
|
in the generate_legacy_storage_files.py to make it
|
run under the older AND the newer version.
|
|
"""

from datetime import timedelta
import os
import pickle
import platform as pl
import sys

import numpy as np

import pandas
from pandas import (
    Categorical,
    DataFrame,
    Index,
    MultiIndex,
    NaT,
    Period,
    RangeIndex,
    Series,
    Timestamp,
    bdate_range,
    date_range,
    interval_range,
    period_range,
    timedelta_range,
)
from pandas.arrays import SparseArray

from pandas.tseries.offsets import (
    FY5253,
    BusinessDay,
    BusinessHour,
    CustomBusinessDay,
    DateOffset,
    Day,
    Easter,
    Hour,
    LastWeekOfMonth,
    Minute,
    MonthBegin,
    MonthEnd,
    QuarterBegin,
    QuarterEnd,
    SemiMonthBegin,
    SemiMonthEnd,
    Week,
    WeekOfMonth,
    YearBegin,
    YearEnd,
)
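

# ----------------------------------------------------------------------
# Sparse fixture builders: each returns a SparseArray-backed object whose
# pickled form should stay loadable across pandas versions.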


def _create_sp_series():
    nan = np.nan

    # nan-based
    arr = np.arange(15, dtype=np.float64)
    arr[7:12] = nan
    arr[-1:] = nan

    bseries = Series(SparseArray(arr, kind="block"))
    bseries.name = "bseries"
    return bseries


def _create_sp_tsseries():
    nan = np.nan

    # nan-based
    arr = np.arange(15, dtype=np.float64)
    arr[7:12] = nan
    arr[-1:] = nan

    date_index = bdate_range("1/1/2011", periods=len(arr))
    bseries = Series(SparseArray(arr, kind="block"), index=date_index)
    bseries.name = "btsseries"
    return bseries


def _create_sp_frame():
    nan = np.nan

    data = {
        "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
        "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
        "C": np.arange(10).astype(np.int64),
        "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
    }

    dates = bdate_range("1/1/2011", periods=10)
    return DataFrame(data, index=dates).apply(SparseArray)
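

# ----------------------------------------------------------------------
# The master fixture: create_data returns a {category: {name: object}}
# mapping covering the types whose pickle format we want to keep readable.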


def create_data():
    """Create the pickle data."""
    data = {
        "A": [0.0, 1.0, 2.0, 3.0, np.nan],
        "B": [0, 1, 0, 1, 0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": date_range("1/1/2009", periods=5),
        "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
    }

    scalars = {"timestamp": Timestamp("20130101"), "period": Period("2012", "M")}

    index = {
        "int": Index(np.arange(10)),
        "date": date_range("20130101", periods=10),
        "period": period_range("2013-01-01", freq="M", periods=10),
        "float": Index(np.arange(10, dtype=np.float64)),
        "uint": Index(np.arange(10, dtype=np.uint64)),
        "timedelta": timedelta_range("00:00:00", freq="30min", periods=10),
    }

    index["range"] = RangeIndex(10)

    index["interval"] = interval_range(0, periods=10)

    mi = {
        "reg2": MultiIndex.from_tuples(
            tuple(
                zip(
                    *[
                        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                        ["one", "two", "one", "two", "one", "two", "one", "two"],
                    ]
                )
            ),
            names=["first", "second"],
        )
    }

    series = {
        "float": Series(data["A"]),
        "int": Series(data["B"]),
        "mixed": Series(data["E"]),
        "ts": Series(
            np.arange(10).astype(np.int64), index=date_range("20130101", periods=10)
        ),
        "mi": Series(
            np.arange(5).astype(np.float64),
            index=MultiIndex.from_tuples(
                tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"]
            ),
        ),
        "dup": Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]),
        "cat": Series(Categorical(["foo", "bar", "baz"])),
        "dt": Series(date_range("20130101", periods=5)),
        "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")),
        "period": Series([Period("2000Q1")] * 5),
    }

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")
    frame = {
        "float": DataFrame({"A": series["float"], "B": series["float"] + 1}),
        "int": DataFrame({"A": series["int"], "B": series["int"] + 1}),
        "mixed": DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
        "mi": DataFrame(
            {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)},
            index=MultiIndex.from_tuples(
                tuple(
                    zip(
                        *[
                            ["bar", "bar", "baz", "baz", "baz"],
                            ["one", "two", "one", "two", "three"],
                        ]
                    )
                ),
                names=["first", "second"],
            ),
        ),
        "dup": DataFrame(
            np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"]
        ),
        "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
        "cat_and_float": DataFrame(
            {
                "A": Categorical(["foo", "bar", "baz"]),
                "B": np.arange(3).astype(np.int64),
            }
        ),
        "mixed_dup": mixed_dup_df,
        "dt_mixed_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        ),
        "dt_mixed2_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
                "C": Timestamp("20130603", tz="UTC"),
            },
            index=range(5),
        ),
    }

    # Categoricals of increasing cardinality; key names reflect the
    # intended width of the underlying codes
    cat = {
        "int8": Categorical(list("abcdefg")),
        "int16": Categorical(np.arange(1000)),
        "int32": Categorical(np.arange(10000)),
    }

    timestamp = {
        "normal": Timestamp("2011-01-01"),
        "nat": NaT,
        "tz": Timestamp("2011-01-01", tz="US/Eastern"),
    }

    # one instance of each DateOffset subclass we promise to round-trip
    off = {
        "DateOffset": DateOffset(years=1),
        "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
        "BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
        "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
        "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
        "SemiMonthBegin": SemiMonthBegin(day_of_month=9),
        "SemiMonthEnd": SemiMonthEnd(day_of_month=24),
        "MonthBegin": MonthBegin(1),
        "MonthEnd": MonthEnd(1),
        "QuarterBegin": QuarterBegin(1),
        "QuarterEnd": QuarterEnd(1),
        "Day": Day(1),
        "YearBegin": YearBegin(1),
        "YearEnd": YearEnd(1),
        "Week": Week(1),
        "Week_Tues": Week(2, normalize=False, weekday=1),
        "WeekOfMonth": WeekOfMonth(week=3, weekday=4),
        "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
        "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        "Easter": Easter(),
        "Hour": Hour(1),
        "Minute": Minute(1),
    }

    return {
        "series": series,
        "frame": frame,
        "index": index,
        "scalars": scalars,
        "mi": mi,
        "sp_series": {"float": _create_sp_series(), "ts": _create_sp_tsseries()},
        "sp_frame": {"float": _create_sp_frame()},
        "cat": cat,
        "timestamp": timestamp,
        "offsets": off,
    }


def create_pickle_data():
    """Return the data to be written to legacy pickle files."""
    return create_data()
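

# The mapping returned above is two levels deep; shape only (values depend
# on the pandas version doing the generating):
#
#   {"series": {"float": Series(...), "int": Series(...), ...},
#    "frame": {"float": DataFrame(...), ...},
#    ...}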


def platform_name():
    return "_".join(
        [
            str(pandas.__version__),
            str(pl.machine()),
            str(pl.system().lower()),
            str(pl.python_version()),
        ]
    )
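

# platform_name() yields e.g. "0.20.3_x86_64_darwin_3.5.2" (the run shown
# in the module docstring); it becomes the stem of the pickle file name.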


def write_legacy_pickles(output_dir):
    version = pandas.__version__

    print(
        "This script generates a storage file for the current arch, system, "
        "and python version"
    )
    print(f"  pandas version: {version}")
    print(f"  output dir    : {output_dir}")
    print("  storage format: pickle")

    pth = f"{platform_name()}.pickle"

    # write with the interpreter's default pickle protocol (not
    # HIGHEST_PROTOCOL)
    with open(os.path.join(output_dir, pth), "wb") as fh:
        pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)

    print(f"created pickle file: {pth}")
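

# Direct use from a Python prompt, equivalent to the CLI shown in the
# module docstring (the output directory must already exist):
#
#   write_legacy_pickles(output_dir="tests/io/data/legacy_pickle/0.20.3/")
#
# And the consuming side, for orientation (test_pickle.py does this with
# proper comparison machinery; the path is illustrative):
#
#   import pandas as pd
#   legacy = pd.read_pickle(
#       "tests/io/data/legacy_pickle/0.20.3/"
#       "0.20.3_x86_64_darwin_3.5.2.pickle"
#   )
#   assert legacy.keys() == create_pickle_data().keys()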


def write_legacy_file():
    # force our cwd to be the first searched
    sys.path.insert(0, ".")

    if not 3 <= len(sys.argv) <= 4:
        sys.exit(
            "Specify output directory and storage type: generate_legacy_"
            "storage_files.py <output_dir> <storage_type>"
        )

    output_dir = str(sys.argv[1])
    storage_type = str(sys.argv[2])

    if storage_type == "pickle":
        write_legacy_pickles(output_dir=output_dir)
    else:
        sys.exit("storage_type must be one of {'pickle'}")


if __name__ == "__main__":
    write_legacy_file()