diff --git a/scripts/data_collector/br_index/README.md b/scripts/data_collector/br_index/README.md new file mode 100644 index 0000000000..ca31e3f7a5 --- /dev/null +++ b/scripts/data_collector/br_index/README.md @@ -0,0 +1,61 @@ +# iBOVESPA History Companies Collection + +## Requirements + +- Install the libs from the file `requirements.txt` + + ```bash + pip install -r requirements.txt + ``` +- `requirements.txt` file was generated using python3.8 + +## For the ibovespa (IBOV) index, we have: + +
 + +### Method `get_new_companies` + +#### Index start date + +- The ibovespa index started on 2 January 1968 ([wiki](https://en.wikipedia.org/wiki/%C3%8Dndice_Bovespa)). In order to use this start date in our `bench_start_date(self)` method, two conditions must be satisfied: + 1) APIs used to download Brazilian stocks (B3) historical prices must keep track of such historic data since 2 January 1968 + + 2) Some website or API must provide, from that date, the historic index composition. In other words, the companies used to build the index. + + As a consequence, the method `bench_start_date(self)` inside `collector.py` was implemented using `pd.Timestamp("2003-01-03")` due to two reasons: + + 1) The earliest ibov composition that has been found was from the first quarter of 2003. More information about such composition can be seen in the sections below. + + 2) Yahoo Finance, one of the libraries used to download symbols' historic prices, keeps track from this date forward. + +- Within the `get_new_companies` method, logic was implemented to get, for each ibovespa component stock, the start date that Yahoo Finance keeps track of. + +#### Code Logic + +The code does a web scraping of the B3 [website](https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBOV?language=pt-br), which keeps track of the ibovespa stocks composition on the current day. + +Other approaches, such as `request` and `Beautiful Soup`, could have been used. However, the website shows the table with the stocks with some delay, since it uses a script inside of it to obtain such compositions. +Alternatively, `selenium` was used to download the stocks' composition in order to overcome this problem. + +Furthermore, the data downloaded from the selenium script was preprocessed so it could be saved into the `csv` format established by `scripts/data_collector/index.py`. +
 + +### Method `get_changes` + +No suitable data source that keeps track of ibovespa's historic stocks composition has been found, except for this [repository](https://github.com/igor17400/IBOV-HCI), which provides such information and has been used; however, it only provides the data from the 1st quarter of 2003 to the 3rd quarter of 2021. + +With that reference, the index's composition can be compared quarter by quarter and year by year and then generate a file that keeps track of which stocks have been removed and which have been added each quarter and year. + +
+ +### Collector Data + +```bash +# parse instruments, using in qlib/instruments. +python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments + +# parse new companies +python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies +``` + diff --git a/scripts/data_collector/br_index/collector.py b/scripts/data_collector/br_index/collector.py new file mode 100644 index 0000000000..bbb012b5c9 --- /dev/null +++ b/scripts/data_collector/br_index/collector.py @@ -0,0 +1,277 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +from functools import partial +import sys +from pathlib import Path +import importlib +import datetime + +import fire +import pandas as pd +from tqdm import tqdm +from loguru import logger + +CUR_DIR = Path(__file__).resolve().parent +sys.path.append(str(CUR_DIR.parent.parent)) + +from data_collector.index import IndexBase +from data_collector.utils import get_instruments + +quarter_dict = {"1Q": "01-03", "2Q": "05-01", "3Q": "09-01"} + + +class IBOVIndex(IndexBase): + + ibov_index_composition = "https://raw.githubusercontent.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv" + years_4_month_periods = [] + + def __init__( + self, + index_name: str, + qlib_dir: [str, Path] = None, + freq: str = "day", + request_retry: int = 5, + retry_sleep: int = 3, + ): + super(IBOVIndex, self).__init__( + index_name=index_name, qlib_dir=qlib_dir, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep + ) + + self.today: datetime = datetime.date.today() + self.current_4_month_period = self.get_current_4_month_period(self.today.month) + self.year = str(self.today.year) + self.years_4_month_periods = self.get_four_month_period() + + @property + def bench_start_date(self) -> pd.Timestamp: + """ + The ibovespa index started on 2 January 1968 (wiki), however, + no suitable data source that keeps track of ibovespa's history + stocks composition has 
been found. Except from the repo indicated + in README. Which keeps track of such information starting from + the first quarter of 2003 + """ + return pd.Timestamp("2003-01-03") + + def get_current_4_month_period(self, current_month: int): + """ + This function is used to calculated what is the current + four month period for the current month. For example, + If the current month is August 8, its four month period + is 2Q. + + OBS: In english Q is used to represent *quarter* + which means a three month period. However, in + portuguese we use Q to represent a four month period. + In other words, + + Jan, Feb, Mar, Apr: 1Q + May, Jun, Jul, Aug: 2Q + Sep, Oct, Nov, Dez: 3Q + + Parameters + ---------- + month : int + Current month (1 <= month <= 12) + + Returns + ------- + current_4m_period:str + Current Four Month Period (1Q or 2Q or 3Q) + """ + if current_month < 5: + return "1Q" + if current_month < 9: + return "2Q" + if current_month <= 12: + return "3Q" + else: + return -1 + + def get_four_month_period(self): + """ + The ibovespa index is updated every four months. 
+ Therefore, we will represent each time period as 2003_1Q + which means 2003 first four mount period (Jan, Feb, Mar, Apr) + """ + four_months_period = ["1Q", "2Q", "3Q"] + init_year = 2003 + now = datetime.datetime.now() + current_year = now.year + current_month = now.month + for year in [item for item in range(init_year, current_year)]: + for el in four_months_period: + self.years_4_month_periods.append(str(year)+"_"+el) + # For current year the logic must be a little different + current_4_month_period = self.get_current_4_month_period(current_month) + for i in range(int(current_4_month_period[0])): + self.years_4_month_periods.append(str(current_year) + "_" + str(i+1) + "Q") + return self.years_4_month_periods + + + def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame: + """formatting the datetime in an instrument + + Parameters + ---------- + inst_df: pd.DataFrame + inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD] + + Returns + ------- + inst_df: pd.DataFrame + + """ + logger.info("Formatting Datetime") + if self.freq != "day": + inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply( + lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=23, minutes=59)).strftime("%Y-%m-%d %H:%M:%S") + ) + else: + inst_df[self.START_DATE_FIELD] = inst_df[self.START_DATE_FIELD].apply( + lambda x: (pd.Timestamp(x)).strftime("%Y-%m-%d") + ) + + inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply( + lambda x: (pd.Timestamp(x)).strftime("%Y-%m-%d") + ) + return inst_df + + def format_quarter(self, cell: str): + """ + Parameters + ---------- + cell: str + It must be on the format 2003_1Q --> years_4_month_periods + + Returns + ---------- + date: str + Returns date in format 2003-03-01 + """ + cell_split = cell.split("_") + return cell_split[0] + "-" + quarter_dict[cell_split[1]] + + def get_changes(self): + """ + Access the index historic composition and compare it quarter + by quarter and year by year in order to 
generate a file that + keeps track of which stocks have been removed and which have + been added. + + The Dataframe used as reference will provided the index + composition for each year an quarter: + pd.DataFrame: + symbol + SH600000 + SH600001 + . + . + . + + Parameters + ---------- + self: is used to represent the instance of the class. + + Returns + ---------- + pd.DataFrame: + symbol date type + SH600000 2019-11-11 add + SH600001 2020-11-10 remove + dtypes: + symbol: str + date: pd.Timestamp + type: str, value from ["add", "remove"] + """ + logger.info("Getting companies changes in {} index ...".format(self.index_name)) + + try: + df_changes_list = [] + for i in tqdm(range(len(self.years_4_month_periods) - 1)): + df = pd.read_csv(self.ibov_index_composition.format(self.years_4_month_periods[i]), on_bad_lines="skip")["symbol"] + df_ = pd.read_csv(self.ibov_index_composition.format(self.years_4_month_periods[i + 1]), on_bad_lines="skip")["symbol"] + + ## Remove Dataframe + remove_date = self.years_4_month_periods[i].split("_")[0] + "-" + quarter_dict[self.years_4_month_periods[i].split("_")[1]] + list_remove = list(df[~df.isin(df_)]) + df_removed = pd.DataFrame( + { + "date": len(list_remove) * [remove_date], + "type": len(list_remove) * ["remove"], + "symbol": list_remove, + } + ) + + ## Add Dataframe + add_date = self.years_4_month_periods[i + 1].split("_")[0] + "-" + quarter_dict[self.years_4_month_periods[i + 1].split("_")[1]] + list_add = list(df_[~df_.isin(df)]) + df_added = pd.DataFrame( + {"date": len(list_add) * [add_date], "type": len(list_add) * ["add"], "symbol": list_add} + ) + + df_changes_list.append(pd.concat([df_added, df_removed], sort=False)) + df = pd.concat(df_changes_list).reset_index(drop=True) + df["symbol"] = df["symbol"].astype(str) + ".SA" + + return df + + except Exception as E: + logger.error("An error occured while downloading 2008 index composition - {}".format(E)) + + def get_new_companies(self): + """ + Get latest index 
composition. + The repo indicated on README has implemented a script + to get the latest index composition from B3 website using + selenium. Therefore, this method will download the file + containing such composition + + Parameters + ---------- + self: is used to represent the instance of the class. + + Returns + ---------- + pd.DataFrame: + symbol start_date end_date + RRRP3 2020-11-13 2022-03-02 + ALPA4 2008-01-02 2022-03-02 + dtypes: + symbol: str + start_date: pd.Timestamp + end_date: pd.Timestamp + """ + logger.info("Getting new companies in {} index ...".format(self.index_name)) + + try: + ## Get index composition + + df_index = pd.read_csv( + self.ibov_index_composition.format(self.year + "_" + self.current_4_month_period), on_bad_lines="skip" + ) + df_date_first_added = pd.read_csv( + self.ibov_index_composition.format("date_first_added_" + self.year + "_" + self.current_4_month_period), + on_bad_lines="skip", + ) + df = df_index.merge(df_date_first_added, on="symbol")[["symbol", "Date First Added"]] + df[self.START_DATE_FIELD] = df["Date First Added"].map(self.format_quarter) + + # end_date will be our current quarter + 1, since the IBOV index updates itself every quarter + df[self.END_DATE_FIELD] = self.year + "-" + quarter_dict[self.current_4_month_period] + df = df[["symbol", self.START_DATE_FIELD, self.END_DATE_FIELD]] + df["symbol"] = df["symbol"].astype(str) + ".SA" + + return df + + except Exception as E: + logger.error("An error occured while getting new companies - {}".format(E)) + + def filter_df(self, df: pd.DataFrame) -> pd.DataFrame: + if "Código" in df.columns: + return df.loc[:, ["Código"]].copy() + + + +if __name__ == "__main__": + fire.Fire(partial(get_instruments, market_index="br_index" )) diff --git a/scripts/data_collector/br_index/requirements.txt b/scripts/data_collector/br_index/requirements.txt new file mode 100644 index 0000000000..c77e932879 --- /dev/null +++ b/scripts/data_collector/br_index/requirements.txt @@ -0,0 +1,34 @@ 
+async-generator==1.10 +attrs==21.4.0 +certifi==2021.10.8 +cffi==1.15.0 +charset-normalizer==2.0.12 +cryptography==36.0.1 +fire==0.4.0 +h11==0.13.0 +idna==3.3 +loguru==0.6.0 +lxml==4.8.0 +multitasking==0.0.10 +numpy==1.22.2 +outcome==1.1.0 +pandas==1.4.1 +pycoingecko==2.2.0 +pycparser==2.21 +pyOpenSSL==22.0.0 +PySocks==1.7.1 +python-dateutil==2.8.2 +pytz==2021.3 +requests==2.27.1 +requests-futures==1.0.0 +six==1.16.0 +sniffio==1.2.0 +sortedcontainers==2.4.0 +termcolor==1.1.0 +tqdm==4.63.0 +trio==0.20.0 +trio-websocket==0.9.2 +urllib3==1.26.8 +wget==3.2 +wsproto==1.1.0 +yahooquery==2.2.15 diff --git a/scripts/data_collector/cn_index/collector.py b/scripts/data_collector/cn_index/collector.py index e5970c256d..0fdfc658b4 100644 --- a/scripts/data_collector/cn_index/collector.py +++ b/scripts/data_collector/cn_index/collector.py @@ -21,6 +21,7 @@ from data_collector.index import IndexBase from data_collector.utils import get_calendar_list, get_trading_date_by_shift, deco_retry +from data_collector.utils import get_instruments NEW_COMPANIES_URL = "https://csi-web-dev.oss-cn-shanghai-finance-1-pub.aliyuncs.com/static/html/csindex/public/uploads/file/autofile/cons/{index_code}cons.xls" @@ -315,7 +316,7 @@ def get_new_companies(self) -> pd.DataFrame: return df -class CSI300(CSIIndex): +class CSI300Index(CSIIndex): @property def index_code(self): return "000300" @@ -458,46 +459,5 @@ def get_new_companies(self) -> pd.DataFrame: return df -def get_instruments( - qlib_dir: str, - index_name: str, - method: str = "parse_instruments", - freq: str = "day", - request_retry: int = 5, - retry_sleep: int = 3, -): - """ - - Parameters - ---------- - qlib_dir: str - qlib data dir, default "Path(__file__).parent/qlib_data" - index_name: str - index name, value from ["csi100", "csi300"] - method: str - method, value from ["parse_instruments", "save_new_companies"] - freq: str - freq, value from ["day", "1min"] - request_retry: int - request retry, by default 5 - retry_sleep: int - 
request sleep, by default 3 - - Examples - ------- - # parse instruments - $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments - - # parse new companies - $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies - - """ - _cur_module = importlib.import_module("data_collector.cn_index.collector") - obj = getattr(_cur_module, f"{index_name.upper()}")( - qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep - ) - getattr(obj, method)() - - if __name__ == "__main__": fire.Fire(get_instruments) diff --git a/scripts/data_collector/index.py b/scripts/data_collector/index.py index 497c199482..a23614b413 100644 --- a/scripts/data_collector/index.py +++ b/scripts/data_collector/index.py @@ -19,7 +19,7 @@ class IndexBase: SYMBOL_FIELD_NAME = "symbol" DATE_FIELD_NAME = "date" START_DATE_FIELD = "start_date" - END_DATE_FIELD = "end_ate" + END_DATE_FIELD = "end_date" CHANGE_TYPE_FIELD = "type" INSTRUMENTS_COLUMNS = [SYMBOL_FIELD_NAME, START_DATE_FIELD, END_DATE_FIELD] REMOVE = "remove" diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py index 576b3c32ae..06c48f8f62 100644 --- a/scripts/data_collector/us_index/collector.py +++ b/scripts/data_collector/us_index/collector.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. 
import abc +from functools import partial import sys import importlib from pathlib import Path @@ -20,6 +21,7 @@ from data_collector.index import IndexBase from data_collector.utils import deco_retry, get_calendar_list, get_trading_date_by_shift +from data_collector.utils import get_instruments WIKI_URL = "https://en.wikipedia.org/wiki" @@ -269,46 +271,6 @@ def parse_instruments(self): logger.warning(f"No suitable data source has been found!") -def get_instruments( - qlib_dir: str, - index_name: str, - method: str = "parse_instruments", - freq: str = "day", - request_retry: int = 5, - retry_sleep: int = 3, -): - """ - - Parameters - ---------- - qlib_dir: str - qlib data dir, default "Path(__file__).parent/qlib_data" - index_name: str - index name, value from ["SP500", "NASDAQ100", "DJIA", "SP400"] - method: str - method, value from ["parse_instruments", "save_new_companies"] - freq: str - freq, value from ["day", "1min"] - request_retry: int - request retry, by default 5 - retry_sleep: int - request sleep, by default 3 - - Examples - ------- - # parse instruments - $ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments - - # parse new companies - $ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies - - """ - _cur_module = importlib.import_module("data_collector.us_index.collector") - obj = getattr(_cur_module, f"{index_name.upper()}Index")( - qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep - ) - getattr(obj, method)() - if __name__ == "__main__": - fire.Fire(get_instruments) + fire.Fire(partial(get_instruments, market_index="us_index")) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 1814b75eae..7ef1cdf959 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. 
import re +import importlib import time import bisect import pickle @@ -19,6 +20,7 @@ from tqdm import tqdm from functools import partial from concurrent.futures import ProcessPoolExecutor +from bs4 import BeautifulSoup HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}" @@ -34,6 +36,7 @@ # NOTE: Use the time series of ^GSPC(SP500) as the sequence of all stocks "US_ALL": "^GSPC", "IN_ALL": "^NSEI", + "BR_ALL": "^BVSP", } _BENCH_CALENDAR_LIST = None @@ -41,6 +44,7 @@ _HS_SYMBOLS = None _US_SYMBOLS = None _IN_SYMBOLS = None +_BR_SYMBOLS = None _EN_FUND_SYMBOLS = None _CALENDAR_MAP = {} @@ -69,7 +73,9 @@ def _get_calendar(url): calendar = _CALENDAR_MAP.get(bench_code, None) if calendar is None: - if bench_code.startswith("US_") or bench_code.startswith("IN_"): + if bench_code.startswith("US_") or bench_code.startswith("IN_") or bench_code.startswith("BR_"): + print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code])) + print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")) df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max") calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist() else: @@ -345,6 +351,57 @@ def _format(s_): return _IN_SYMBOLS +def get_br_stock_symbols(qlib_data_path: [str, Path] = None) -> list: + """get Brazil(B3) stock symbols + + Returns + ------- + B3 stock symbols + """ + global _BR_SYMBOLS + + @deco_retry + def _get_ibovespa(): + _symbols = [] + url = "https://www.fundamentus.com.br/detalhes.php?papel=" + + # Request + agent = {"User-Agent": "Mozilla/5.0"} + page = requests.get(url, headers=agent) + + # BeautifulSoup + soup = BeautifulSoup(page.content, "html.parser") + tbody = soup.find("tbody") + + children = tbody.findChildren("a", recursive=True) + for child in children: + _symbols.append(str(child).split('"')[-1].split(">")[1].split("<")[0]) + + return _symbols + + if _BR_SYMBOLS is None: + _all_symbols = _get_ibovespa() 
+ if qlib_data_path is not None: + for _index in ["ibov"]: + ins_df = pd.read_csv( + Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"), + sep="\t", + names=["symbol", "start_date", "end_date"], + ) + _all_symbols += ins_df["symbol"].unique().tolist() + + def _format(s_): + s_ = s_.strip() + s_ = s_.strip("$") + s_ = s_.strip("*") + s_ = s_ + ".SA" + return s_ + + _BR_SYMBOLS = sorted(set(map(_format, _all_symbols))) + + return _BR_SYMBOLS + + def get_en_fund_symbols(qlib_data_path: [str, Path] = None) -> list: """get en fund symbols @@ -502,6 +559,50 @@ def generate_minutes_calendar_from_daily( return pd.Index(sorted(set(np.hstack(res)))) +def get_instruments( + qlib_dir: str, + index_name: str, + method: str = "parse_instruments", + freq: str = "day", + request_retry: int = 5, + retry_sleep: int = 3, + market_index: str = "cn_index" +): + """ + + Parameters + ---------- + qlib_dir: str + qlib data dir, default "Path(__file__).parent/qlib_data" + index_name: str + index name, value from ["csi100", "csi300"] + method: str + method, value from ["parse_instruments", "save_new_companies"] + freq: str + freq, value from ["day", "1min"] + request_retry: int + request retry, by default 5 + retry_sleep: int + request sleep, by default 3 + market_index: str + Where the files to obtain the index are located, + for example data_collector.cn_index.collector + + Examples + ------- + # parse instruments + $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments + + # parse new companies + $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies + + """ + _cur_module = importlib.import_module("data_collector.{}.collector".format(market_index)) + obj = getattr(_cur_module, f"{index_name.upper()}Index")( + qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep + ) + getattr(obj, method)() + if __name__ == "__main__": - assert 
len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM + assert len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM \ No newline at end of file diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md index 71f2b75f8e..3ce9bae7f6 100644 --- a/scripts/data_collector/yahoo/README.md +++ b/scripts/data_collector/yahoo/README.md @@ -66,7 +66,7 @@ pip install -r requirements.txt - `source_dir`: save the directory - `interval`: `1d` or `1min`, by default `1d` > **due to the limitation of the *YahooFinance API*, only the last month's data is available in `1min`** - - `region`: `CN` or `US` or `IN`, by default `CN` + - `region`: `CN` or `US` or `IN` or `BR`, by default `CN` - `delay`: `time.sleep(delay)`, by default *0.5* - `start`: start datetime, by default *"2000-01-01"*; *closed interval(including start)* - `end`: end datetime, by default `pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))`; *open interval(excluding end)* @@ -80,14 +80,21 @@ pip install -r requirements.txt python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region CN # cn 1min data python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_data_1min --delay 1 --interval 1min --region CN + # us 1d data python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region US # us 1min data python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_data_1min --delay 1 --interval 1min --region US + # in 1d data python collector.py download_data --source_dir ~/.qlib/stock_data/source/in_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region IN # in 1min data python collector.py download_data --source_dir ~/.qlib/stock_data/source/in_data_1min --delay 1 --interval 1min --region IN + + # br 1d data + python collector.py download_data --source_dir 
~/.qlib/stock_data/source/br_data --start 2003-01-03 --end 2022-03-01 --delay 1 --interval 1d --region BR + # br 1min data + python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data_1min --delay 1 --interval 1min --region BR ``` 2. normalize data: `python scripts/data_collector/yahoo/collector.py normalize_data` @@ -116,8 +123,15 @@ pip install -r requirements.txt ```bash # normalize 1d cn python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_data --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d + # normalize 1min cn python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/cn_data --source_dir ~/.qlib/stock_data/source/cn_data_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min + + # normalize 1d br + python scripts/data_collector/yahoo/collector.py normalize_data --source_dir ~/.qlib/stock_data/source/br_data --normalize_dir ~/.qlib/stock_data/source/br_1d_nor --region BR --interval 1d + + # normalize 1min br + python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/br_data --source_dir ~/.qlib/stock_data/source/br_data_1min --normalize_dir ~/.qlib/stock_data/source/br_1min_nor --region BR --interval 1min ``` 3. dump data: `python scripts/dump_bin.py dump_all` diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index e99a30d2a6..d57a3057b8 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. import abc +from re import I import sys import copy import time @@ -35,6 +36,7 @@ get_hs_stock_symbols, get_us_stock_symbols, get_in_stock_symbols, + get_br_stock_symbols, generate_minutes_calendar_from_daily, ) @@ -42,6 +44,8 @@ class YahooCollector(BaseCollector): + retry = 5 # Configuration attribute. How many times will it try to re-request the data if the network fails. 
+ def __init__( self, save_dir: [str, Path], @@ -146,7 +150,7 @@ def _show_logging_func(): def get_data( self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp ) -> pd.DataFrame: - @deco_retry(retry_sleep=self.delay) + @deco_retry(retry_sleep=self.delay, retry=self.retry) def _get_simple(start_, end_): self.sleep() _remote_interval = "1m" if interval == self.INTERVAL_1min else interval @@ -311,6 +315,55 @@ class YahooCollectorIN1min(YahooCollectorIN): pass +class YahooCollectorBR(YahooCollector, ABC): + def retry(cls): + """" + The reason to use retry=2 is due to the fact that + Yahoo Finance unfortunately does not keep track of some + Brazilian stocks. + + Therefore, the decorator deco_retry with retry argument + set to 5 will keep trying to get the stock data up to 5 times, + which makes the code to download Brazilians stocks very slow. + + In future, this may change, but for now + I suggest to leave retry argument to 1 or 2 in + order to improve download speed. 
+ + To achieve this goal an abstract attribute (retry) + was added into YahooCollectorBR base class + """ + raise NotImplementedError + + def get_instrument_list(self): + logger.info("get BR stock symbols......") + symbols = get_br_stock_symbols() + [ + "^BVSP", + ] + logger.info(f"get {len(symbols)} symbols.") + return symbols + + def download_index_data(self): + pass + + def normalize_symbol(self, symbol): + return code_to_fname(symbol).upper() + + @property + def _timezone(self): + return "Brazil/East" + + +class YahooCollectorBR1d(YahooCollectorBR): + retry = 2 + pass + + +class YahooCollectorBR1min(YahooCollectorBR): + retry = 2 + pass + + class YahooNormalize(BaseNormalize): COLUMNS = ["open", "close", "high", "low", "volume"] DAILY_FORMAT = "%Y-%m-%d" @@ -833,6 +886,29 @@ def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]: return get_calendar_list("ALL") +class YahooNormalizeBR: + def _get_calendar_list(self) -> Iterable[pd.Timestamp]: + return get_calendar_list("BR_ALL") + + +class YahooNormalizeBR1d(YahooNormalizeBR, YahooNormalize1d): + pass + + +class YahooNormalizeBR1min(YahooNormalizeBR, YahooNormalize1minOffline): + CALC_PAUSED_NUM = False + + def _get_calendar_list(self) -> Iterable[pd.Timestamp]: + # TODO: support 1min + raise ValueError("Does not support 1min") + + def _get_1d_calendar_list(self): + return get_calendar_list("BR_ALL") + + def symbol_to_yahoo(self, symbol): + return fname_to_code(symbol) + + class Run(BaseRun): def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_CN): """ @@ -848,7 +924,7 @@ def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval= interval: str freq, value from [1min, 1d], default 1d region: str - region, value from ["CN", "US"], default "CN" + region, value from ["CN", "US", "BR"], default "CN" """ super().__init__(source_dir, normalize_dir, max_workers, interval) self.region = region diff --git 
a/scripts/data_collector/yahoo/requirements.txt b/scripts/data_collector/yahoo/requirements.txt index 61422c7ab6..1a58eda1f6 100644 --- a/scripts/data_collector/yahoo/requirements.txt +++ b/scripts/data_collector/yahoo/requirements.txt @@ -7,3 +7,6 @@ tqdm lxml yahooquery joblib +beautifulsoup4 +bs4 +soupsieve \ No newline at end of file