diff --git a/docs/advanced/serial.rst b/docs/advanced/serial.rst new file mode 100644 index 0000000000..8c0f837467 --- /dev/null +++ b/docs/advanced/serial.rst @@ -0,0 +1,42 @@ +.. _serial: + +================================= +Serialization +================================= +.. currentmodule:: qlib + +Introduction +=================== +``Qlib`` supports dumping the state of ``DataHandler``, ``DataSet``, ``Processor`` and ``Model``, etc. to disk and reloading them. + +Serializable Class +======================== + +``Qlib`` provides a base class ``qlib.utils.serial.Serializable``, whose state can be dumped into or loaded from disk in `pickle` format. +When users dump the state of a ``Serializable`` instance, the attributes of the instance whose name **does not** start with `_` will be saved on the disk. + +Example +========================== +``Qlib``'s serializable classes include ``DataHandler``, ``DataSet``, ``Processor`` and ``Model``, etc., which are subclasses of ``qlib.utils.serial.Serializable``. +Specifically, ``qlib.data.dataset.DatasetH`` is one of them. Users can serialize ``DatasetH`` as follows. + +.. code-block:: Python + + ##=============dump dataset============= + dataset.to_pickle(path="dataset.pkl") # dataset is an instance of qlib.data.dataset.DatasetH + + ##=============reload dataset============= + with open("dataset.pkl", "rb") as file_dataset: + dataset = pickle.load(file_dataset) + +.. note:: + Only the state of ``DatasetH`` should be saved on the disk, such as some `mean` and `variance` used for data normalization, etc. + + After reloading the ``DatasetH``, users need to reinitialize it. It means that users can reset some states of ``DatasetH`` or ``QlibDataHandler`` such as `instruments`, `start_time`, `end_time` and `segments`, etc., and generate new data according to the states (data is not state and should not be saved on the disk). + +A more detailed example is in this `link `_. 
+ + +API +=================== +Please refer to `Serializable API <../reference/api.html#module-qlib.utils.serial.Serializable>`_. diff --git a/docs/component/data.rst b/docs/component/data.rst index dd32c5cd84..3e9586bf4b 100644 --- a/docs/component/data.rst +++ b/docs/component/data.rst @@ -31,7 +31,7 @@ Qlib Format Data We've specially designed a data structure to manage financial data, please refer to the `File storage design section in Qlib paper `_ for detailed information. Such data will be stored with filename suffix `.bin` (We'll call them `.bin` file, `.bin` format, or qlib format). `.bin` file is designed for scientific computing on finance data. -``Qlib`` provides two different off-the-shelf dataset, which can be accessed through this `link `_: +``Qlib`` provides two different off-the-shelf datasets, which can be accessed through this `link `_: ======================== ================= ================ Dataset US Market China Market @@ -41,6 +41,7 @@ Alpha360 √ √ Alpha158 √ √ ======================== ================= ================ +Also, ``Qlib`` provides a high-frequency dataset. Users can run a high-frequency dataset example through this `link `_. Qlib Format Dataset -------------------- diff --git a/docs/index.rst b/docs/index.rst index 15a36b4892..3fa35fc60d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -49,6 +49,7 @@ Document Structure Building Formulaic Alphas Online & Offline mode + Serialization .. toctree:: :maxdepth: 3 diff --git a/docs/reference/api.rst b/docs/reference/api.rst index f21a9f518a..3167d8a622 100644 --- a/docs/reference/api.rst +++ b/docs/reference/api.rst @@ -152,4 +152,14 @@ Recorder Record Template -------------------- .. automodule:: qlib.workflow.record_temp + :members: + + +Utils +==================== + +Serializable +-------------------- + +.. 
automodule:: qlib.utils.serial.Serializable :members: \ No newline at end of file diff --git a/examples/highfreq/README.md b/examples/highfreq/README.md new file mode 100644 index 0000000000..30c2e19db9 --- /dev/null +++ b/examples/highfreq/README.md @@ -0,0 +1,28 @@ +# High-Frequency Dataset + +This dataset is an example for RL high-frequency trading. + +## Get High-Frequency Data + +Get high-frequency data by running the following command: +```bash + python workflow.py get_data +``` + +## Dump & Reload & Reinitialize the Dataset + + +The High-Frequency Dataset is implemented as `qlib.data.dataset.DatasetH` in `workflow.py`. `DatasetH` is a subclass of [`qlib.utils.serial.Serializable`](https://qlib.readthedocs.io/en/latest/advanced/serial.html), whose state can be dumped into or loaded from disk in `pickle` format. + +### About Reinitialization + +After reloading `Dataset` from disk, `Qlib` also supports reinitializing the dataset. It means that users can reset some states of `Dataset` or `DataHandler` such as `instruments`, `start_time`, `end_time` and `segments`, etc., and generate new data according to the states. + +The example is given in `workflow.py`; users can run the code as follows. 
+ +### Run the Code + +Run the example by running the following command: +```bash + python workflow.py dump_and_load_dataset +``` \ No newline at end of file diff --git a/examples/highfreq/__init__.py b/examples/highfreq/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/highfreq/workflow.py b/examples/highfreq/workflow.py index 6649079d83..ff3d3c5522 100644 --- a/examples/highfreq/workflow.py +++ b/examples/highfreq/workflow.py @@ -9,7 +9,7 @@ import pickle import numpy as np import pandas as pd -from qlib.config import HIGH_FREQ_CONFIG +from qlib.config import REG_CN, HIGH_FREQ_CONFIG from qlib.contrib.model.gbdt import LGBModel from qlib.contrib.data.handler import Alpha158 from qlib.contrib.strategy.strategy import TopkDropoutStrategy @@ -123,8 +123,7 @@ def get_data(self): backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"]) print(backtest_train, backtest_test) - del xtrain, xtest - del backtest_train, backtest_test + return def dump_and_load_dataset(self): """dump and load dataset state on disk""" @@ -146,18 +145,39 @@ def dump_and_load_dataset(self): dataset_backtest = pickle.load(file_dataset_backtest) self._prepare_calender_cache() - ##=============reload_dataset============= - dataset.init(init_type=DataHandlerLP.IT_LS) - dataset_backtest.init() + ##=============reinit dataset============= + dataset.init( + handler_kwargs={ + "init_type": DataHandlerLP.IT_LS, + "start_time": "2021-01-19 00:00:00", + "end_time": "2021-01-25 16:00:00", + }, + segment_kwargs={ + "test": ( + "2021-01-19 00:00:00", + "2021-01-25 16:00:00", + ), + }, + ) + dataset_backtest.init( + handler_kwargs={ + "start_time": "2021-01-19 00:00:00", + "end_time": "2021-01-25 16:00:00", + }, + segment_kwargs={ + "test": ( + "2021-01-19 00:00:00", + "2021-01-25 16:00:00", + ), + }, + ) ##=============get data============= - xtrain, xtest = dataset.prepare(["train", "test"]) - backtest_train, backtest_test = 
dataset_backtest.prepare(["train", "test"]) + xtest = dataset.prepare(["test"]) + backtest_test = dataset_backtest.prepare(["test"]) - print(xtrain, xtest) - print(backtest_train, backtest_test) - del xtrain, xtest - del backtest_train, backtest_test + print(xtest, backtest_test) + return if __name__ == "__main__": diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index 6b98baf8f3..8ff8c1210a 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -87,9 +87,42 @@ def __init__(self, handler: Union[dict, DataHandler], segments: dict): """ super().__init__(handler, segments) - def init(self, **kwargs): - """Initialize the DatasetH, Only parameters belonging to handler.init will be passed in""" - self.handler.init(**kwargs) + def init(self, handler_kwargs: dict = None, segment_kwargs: dict = None): + """ + Initialize the DatasetH + + Parameters + ---------- + handler_kwargs : dict + Config of DataHandler, which could include the following arguments: + + - arguments of DataHandler.conf_data, such as 'instruments', 'start_time' and 'end_time'. + + - arguments of DataHandler.init, such as 'enable_cache', etc. 
+ + segment_kwargs : dict + Config of segments, which is the same as 'segments' in DatasetH.setup_data + + """ + if handler_kwargs: + if not isinstance(handler_kwargs, dict): + raise TypeError(f"param handler_kwargs must be type dict, not {type(handler_kwargs)}") + kwargs_init = {} + kwargs_conf_data = {} + conf_data_arg = {"instruments", "start_time", "end_time"} + for k, v in handler_kwargs.items(): + if k in conf_data_arg: + kwargs_conf_data.update({k: v}) + else: + kwargs_init.update({k: v}) + + self.handler.conf_data(**kwargs_conf_data) + self.handler.init(**kwargs_init) + + if segment_kwargs: + if not isinstance(segment_kwargs, dict): + raise TypeError(f"param segment_kwargs must be type dict, not {type(segment_kwargs)}") + self.segments = segment_kwargs.copy() def setup_data(self, handler: Union[dict, DataHandler], segments: dict): """