Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ibovespa index support #990

Merged
merged 20 commits into from
Apr 6, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
3419ec9
feat: download ibovespa index historic composition
igor17400 Mar 17, 2022
c2f933b
fix: typo error instead of end_date, it was written end_ate
igor17400 Mar 17, 2022
09b8ad9
feat: adds support for downloading stocks historic prices from Brazil…
igor17400 Mar 17, 2022
77107f3
fix: code formatted with black.
igor17400 Mar 19, 2022
3aaf1df
wip: Creating code logic for brazils stock market data normalization
igor17400 Mar 20, 2022
9ceb592
docs: brazils stock market data normalization code documentation
igor17400 Mar 23, 2022
d1b73b3
fix: code formatted the with black
igor17400 Mar 24, 2022
cc0e126
docs: fixed typo
igor17400 Mar 29, 2022
95938ea
docs: more info about python version used to generate requirements.tx…
igor17400 Mar 30, 2022
b0aafa2
docs: added BeautifulSoup requirements
igor17400 Apr 1, 2022
592559a
feat: removed debug prints
igor17400 Apr 1, 2022
92aa003
feat: added ibov_index_composition variable as a class attribute of I…
igor17400 Apr 1, 2022
4903845
feat: added increment to generate the four month period used by the i…
igor17400 Apr 2, 2022
6db33ef
refactor: Added get_instruments() method inside utils.py for better c…
igor17400 Apr 2, 2022
ae6380a
refactor: improve brazils stocks download speed
igor17400 Apr 2, 2022
1d80c4c
fix: added __main__ at the bottom of the script
igor17400 Apr 2, 2022
dc72c6b
refactor: changed interface inside each index
igor17400 Apr 2, 2022
6cc96cc
refactor: implemented class interface retry into YahooCollectorBR
igor17400 Apr 2, 2022
1cbfb5c
docs: added BR as a possible region into the documentation
igor17400 Apr 3, 2022
c313804
refactor: make retry attribute part of the interface
igor17400 Apr 3, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions scripts/data_collector/br_index/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ With that reference, the index's composition can be compared quarter by quarter

```bash
# parse instruments (used in qlib/instruments).
python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments
python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments --market_index br_index
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image


# parse new companies
python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies
python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies --market_index br_index
```

187 changes: 73 additions & 114 deletions scripts/data_collector/br_index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,74 +14,16 @@
sys.path.append(str(CUR_DIR.parent.parent))

from data_collector.index import IndexBase

IBOV_INDEX_COMPOSITION = "https://github.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv"

YEAR_QUARTER = [
"2003_1Q",
"2003_2Q",
"2003_3Q",
"2004_1Q",
"2004_2Q",
"2004_3Q",
"2005_1Q",
"2005_2Q",
"2005_3Q",
"2006_1Q",
"2006_2Q",
"2006_3Q",
"2007_1Q",
"2007_2Q",
"2007_3Q",
"2008_1Q",
"2008_2Q",
"2008_3Q",
"2009_1Q",
"2009_2Q",
"2009_3Q",
"2010_1Q",
"2010_2Q",
"2010_3Q",
"2011_1Q",
"2011_2Q",
"2011_3Q",
"2012_1Q",
"2012_2Q",
"2012_3Q",
"2013_1Q",
"2013_2Q",
"2013_3Q",
"2014_1Q",
"2014_2Q",
"2014_3Q",
"2015_1Q",
"2015_2Q",
"2015_3Q",
"2016_1Q",
"2016_2Q",
"2016_3Q",
"2017_1Q",
"2017_2Q",
"2017_3Q",
"2018_1Q",
"2018_2Q",
"2018_3Q",
"2019_1Q",
"2019_2Q",
"2019_3Q",
"2020_1Q",
"2020_2Q",
"2020_3Q",
"2021_1Q",
"2021_2Q",
"2021_3Q",
"2022_1Q",
]
from data_collector.utils import get_instruments

quarter_dict = {"1Q": "01-03", "2Q": "05-01", "3Q": "09-01"}


class IBOVIndex(IndexBase):

ibov_index_composition = "https://github.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv"
years_4_month_periods = []

def __init__(
self,
index_name: str,
Expand All @@ -94,9 +36,10 @@ def __init__(
index_name=index_name, qlib_dir=qlib_dir, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
)

self.today = datetime.date.today()
self.quarter = str(pd.Timestamp(self.today).quarter)
self.today: datetime = datetime.date.today()
self.current_4_month_period = self.get_current_4_month_period(self.today.month)
self.year = str(self.today.year)
self.years_4_month_periods = self.get_four_month_period()

@property
def bench_start_date(self) -> pd.Timestamp:
Expand All @@ -109,6 +52,62 @@ def bench_start_date(self) -> pd.Timestamp:
"""
return pd.Timestamp("2003-01-03")

def get_current_4_month_period(self, current_month: int) -> str:
    """
    Return the label of the four-month period that contains a month.

    For example, if the current month is August (8), its four-month
    period is 2Q.

    OBS: In English, Q is used to represent *quarter*, which means a
    three-month period. However, in Portuguese we use Q to represent
    a four-month period. In other words,

    Jan, Feb, Mar, Apr: 1Q
    May, Jun, Jul, Aug: 2Q
    Sep, Oct, Nov, Dec: 3Q

    Parameters
    ----------
    current_month : int
        Current month (1 <= current_month <= 12)

    Returns
    -------
    current_4m_period : str
        Current four-month period ("1Q", "2Q" or "3Q")

    Raises
    ------
    ValueError
        If ``current_month`` is outside the range 1..12.
    """
    # Fail loudly on an invalid month: returning a sentinel such as -1
    # would only surface later as an unrelated TypeError when the result
    # is indexed or concatenated with a string.
    if not 1 <= current_month <= 12:
        raise ValueError(f"current_month must be between 1 and 12, got {current_month!r}")
    if current_month < 5:
        return "1Q"
    if current_month < 9:
        return "2Q"
    return "3Q"

def get_four_month_period(self):
    """
    Build the list of four-month period labels from 2003 up to today.

    The ibovespa index is updated every four months. Therefore, each
    time period is represented as e.g. ``2003_1Q``, which means the
    first four-month period of 2003 (Jan, Feb, Mar, Apr).

    Every year from 2003 up to (but excluding) the current year
    contributes all three periods; the current year contributes only
    the periods up to and including the one containing the current
    month.

    The result is also stored in ``self.years_4_month_periods``.

    Returns
    -------
    years_4_month_periods : list of str
        Labels in chronological order, e.g.
        ``["2003_1Q", "2003_2Q", "2003_3Q", "2004_1Q", ...]``
    """
    four_months_period = ["1Q", "2Q", "3Q"]
    init_year = 2003
    now = datetime.datetime.now()
    current_year = now.year

    # Build into a fresh local list. Appending to
    # self.years_4_month_periods directly would mutate whatever list the
    # attribute currently resolves to (possibly one shared at class
    # level), so repeated calls would accumulate duplicate labels.
    periods = []
    for year in range(init_year, current_year):
        for label in four_months_period:
            periods.append(f"{year}_{label}")

    # For the current year only the periods that have already started
    # are included.
    current_4_month_period = self.get_current_4_month_period(now.month)
    for number in range(1, int(current_4_month_period[0]) + 1):
        periods.append(f"{current_year}_{number}Q")

    self.years_4_month_periods = periods
    return periods


def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
"""formatting the datetime in an instrument

Expand Down Expand Up @@ -142,7 +141,7 @@ def format_quarter(self, cell: str):
Parameters
----------
cell: str
It must be on the format 2003_1Q --> year_quarter
It must be on the format 2003_1Q --> years_4_month_periods

Returns
----------
Expand Down Expand Up @@ -188,12 +187,12 @@ def get_changes(self):

try:
df_changes_list = []
for i in tqdm(range(len(YEAR_QUARTER) - 1)):
df = pd.read_csv(IBOV_INDEX_COMPOSITION.format(YEAR_QUARTER[i]), on_bad_lines="skip")["symbol"]
df_ = pd.read_csv(IBOV_INDEX_COMPOSITION.format(YEAR_QUARTER[i + 1]), on_bad_lines="skip")["symbol"]
for i in tqdm(range(len(self.years_4_month_periods) - 1)):
df = pd.read_csv(self.ibov_index_composition.format(self.years_4_month_periods[i]), on_bad_lines="skip")["symbol"]
df_ = pd.read_csv(self.ibov_index_composition.format(self.years_4_month_periods[i + 1]), on_bad_lines="skip")["symbol"]

## Remove Dataframe
remove_date = YEAR_QUARTER[i].split("_")[0] + "-" + quarter_dict[YEAR_QUARTER[i].split("_")[1]]
remove_date = self.years_4_month_periods[i].split("_")[0] + "-" + quarter_dict[self.years_4_month_periods[i].split("_")[1]]
list_remove = list(df[~df.isin(df_)])
df_removed = pd.DataFrame(
{
Expand All @@ -204,7 +203,7 @@ def get_changes(self):
)

## Add Dataframe
add_date = YEAR_QUARTER[i + 1].split("_")[0] + "-" + quarter_dict[YEAR_QUARTER[i + 1].split("_")[1]]
add_date = self.years_4_month_periods[i + 1].split("_")[0] + "-" + quarter_dict[self.years_4_month_periods[i + 1].split("_")[1]]
list_add = list(df_[~df_.isin(df)])
df_added = pd.DataFrame(
{"date": len(list_add) * [add_date], "type": len(list_add) * ["add"], "symbol": list_add}
Expand Down Expand Up @@ -248,17 +247,17 @@ def get_new_companies(self):
## Get index composition

df_index = pd.read_csv(
IBOV_INDEX_COMPOSITION.format(self.year + "_" + self.quarter + "Q"), on_bad_lines="skip"
self.ibov_index_composition.format(self.year + "_" + self.current_4_month_period), on_bad_lines="skip"
)
df_date_first_added = pd.read_csv(
IBOV_INDEX_COMPOSITION.format("date_first_added_" + self.year + "_" + self.quarter + "Q"),
self.ibov_index_composition.format("date_first_added_" + self.year + "_" + self.current_4_month_period),
on_bad_lines="skip",
)
df = df_index.merge(df_date_first_added, on="symbol")[["symbol", "Date First Added"]]
df[self.START_DATE_FIELD] = df["Date First Added"].map(self.format_quarter)

# end_date will be our current quarter + 1, since the IBOV index updates itself every quarter
df[self.END_DATE_FIELD] = self.year + "-" + quarter_dict[str(int(self.quarter) + 1) + "Q"]
df[self.END_DATE_FIELD] = self.year + "-" + quarter_dict[self.current_4_month_period]
df = df[["symbol", self.START_DATE_FIELD, self.END_DATE_FIELD]]
df["symbol"] = df["symbol"].astype(str) + ".SA"

Expand All @@ -272,46 +271,6 @@ def filter_df(self, df: pd.DataFrame) -> pd.DataFrame:
return df.loc[:, ["Código"]].copy()


def get_instruments(
    qlib_dir: str,
    index_name: str,
    method: str = "parse_instruments",
    freq: str = "day",
    request_retry: int = 5,
    retry_sleep: int = 3,
):
    """Instantiate the requested index collector and run one of its methods.

    The collector class is looked up by name (``<INDEX_NAME>Index``) in the
    ``data_collector.br_index.collector`` module.

    Parameters
    ----------
    qlib_dir: str
        qlib data dir
    index_name: str
        index name, value from ["IBOV"]
    method: str
        name of the collector method to call,
        value from ["parse_instruments", "save_new_companies"]
    freq: str
        freq, value from ["day", "1min"]
    request_retry: int
        request retry, by default 5
    retry_sleep: int
        request sleep, by default 3

    Examples
    -------
        # parse instruments
        $ python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments

        # parse new companies
        $ python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies

    """
    collector_module = importlib.import_module("data_collector.br_index.collector")
    # The class name is derived from the upper-cased index name.
    index_cls = getattr(collector_module, f"{index_name.upper()}Index")
    index_obj = index_cls(
        qlib_dir=qlib_dir,
        index_name=index_name,
        freq=freq,
        request_retry=request_retry,
        retry_sleep=retry_sleep,
    )
    action = getattr(index_obj, method)
    action()


if __name__ == "__main__":
fire.Fire(get_instruments)
44 changes: 2 additions & 42 deletions scripts/data_collector/cn_index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from data_collector.index import IndexBase
from data_collector.utils import get_calendar_list, get_trading_date_by_shift, deco_retry
from data_collector.utils import get_instruments


NEW_COMPANIES_URL = "https://csi-web-dev.oss-cn-shanghai-finance-1-pub.aliyuncs.com/static/html/csindex/public/uploads/file/autofile/cons/{index_code}cons.xls"
Expand Down Expand Up @@ -315,7 +316,7 @@ def get_new_companies(self) -> pd.DataFrame:
return df


class CSI300(CSIIndex):
class CSI300Index(CSIIndex):
@property
def index_code(self):
return "000300"
Expand Down Expand Up @@ -458,46 +459,5 @@ def get_new_companies(self) -> pd.DataFrame:
return df


def get_instruments(
    qlib_dir: str,
    index_name: str,
    method: str = "parse_instruments",
    freq: str = "day",
    request_retry: int = 5,
    retry_sleep: int = 3,
):
    """Instantiate the requested index collector and run one of its methods.

    The collector class is looked up by its upper-cased index name in the
    ``data_collector.cn_index.collector`` module.

    Parameters
    ----------
    qlib_dir: str
        qlib data dir
    index_name: str
        index name, value from ["csi100", "csi300"]
    method: str
        name of the collector method to call,
        value from ["parse_instruments", "save_new_companies"]
    freq: str
        freq, value from ["day", "1min"]
    request_retry: int
        request retry, by default 5
    retry_sleep: int
        request sleep, by default 3

    Examples
    -------
        # parse instruments
        $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments

        # parse new companies
        $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies

    """
    collector_module = importlib.import_module("data_collector.cn_index.collector")
    # The class is named exactly like the upper-cased index name.
    index_cls = getattr(collector_module, f"{index_name.upper()}")
    index_obj = index_cls(
        qlib_dir=qlib_dir,
        index_name=index_name,
        freq=freq,
        request_retry=request_retry,
        retry_sleep=retry_sleep,
    )
    action = getattr(index_obj, method)
    action()


if __name__ == "__main__":
fire.Fire(get_instruments)
4 changes: 2 additions & 2 deletions scripts/data_collector/us_index/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ pip install -r requirements.txt

```bash
# parse instruments (used in qlib/instruments).
python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments
python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments --market_index us_index

# parse new companies
python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies
python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies --market_index us_index

# index_name support: SP500, NASDAQ100, DJIA, SP400
# help
Expand Down
Loading