Skip to content

Commit 9ff3dfe

Browse files
committed
support collecting yahoo 1min data
1 parent 896716f commit 9ff3dfe

File tree

4 files changed

+639
-277
lines changed

4 files changed

+639
-277
lines changed

docs/component/data.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
136136
- `volume`
137137
The trading volume
138138
- `factor`
139-
The Restoration factor
139+
The restoration (adjustment) factor, defined as ``factor = adjusted-close / close``. For the definition of the adjusted price, see `split adjusted <https://www.investopedia.com/terms/s/splitadjusted.asp>`_
140140

141141
In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended.
142142

scripts/data_collector/utils.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import time
66
import bisect
77
import pickle
8+
import random
89
import requests
910
import functools
1011
from pathlib import Path
@@ -17,6 +18,7 @@
1718
HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"
1819

1920
CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20991231"
21+
SZSE_CALENDAR_URL = "http://www.szse.cn/api/report/exchange/onepersistenthour/monthList?month={month}&random={random}"
2022

2123
CALENDAR_BENCH_URL_MAP = {
2224
"CSI300": CALENDAR_URL_BASE.format(market=1, bench_code="000300"),
@@ -63,7 +65,29 @@ def _get_calendar(url):
6365
df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
6466
calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist()
6567
else:
66-
calendar = _get_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
68+
if bench_code.upper() == "ALL":
69+
70+
@deco_retry
71+
def _get_calendar(month):
72+
_cal = []
73+
try:
74+
resp = requests.get(SZSE_CALENDAR_URL.format(month=month, random=random.random())).json()
75+
for _r in resp["data"]:
76+
if int(_r["jybz"]):
77+
_cal.append(pd.Timestamp(_r["jyrq"]))
78+
except Exception as e:
79+
raise ValueError(f"{month}-->{e}")
80+
return _cal
81+
82+
month_range = pd.date_range(start="2000-01", end=pd.Timestamp.now() + pd.Timedelta(days=31), freq="M")
83+
calendar = []
84+
for _m in month_range:
85+
cal = _get_calendar(_m.strftime("%Y-%m"))
86+
if cal:
87+
calendar += cal
88+
calendar = list(filter(lambda x: x <= pd.Timestamp.now(), calendar))
89+
else:
90+
calendar = _get_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
6791
_CALENDAR_MAP[bench_code] = calendar
6892
logger.info(f"end of get calendar list: {bench_code}.")
6993
return calendar

scripts/data_collector/yahoo/README.md

Lines changed: 65 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,29 +18,87 @@ pip install -r requirements.txt
1818

1919
## Collector Data
2020

21-
### Download data and Normalize data
21+
22+
### CN Data
23+
24+
#### 1d
25+
2226
```bash
23-
python collector.py collector_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d --normalize_dir ~/.qlib/stock_data/normalize
27+
28+
# download from yahoo finance
29+
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1d --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
30+
31+
# normalize
32+
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1d --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d
33+
34+
# dump data
35+
cd qlib/scripts
36+
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_cn_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
37+
38+
# using the dumped data (the lines below are Python; run them in a Python interpreter, not in bash)
39+
import qlib
40+
from qlib.data import D
41+
42+
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_cn_1d", region="CN")
43+
df = D.features(D.instruments("all"), ["$close"], freq="day")
44+
2445
```
2546

26-
### Download Data
47+
#### 1min
2748

2849
```bash
29-
python collector.py download_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
50+
51+
# download from yahoo finance
52+
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1min --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1min
53+
54+
# normalize
55+
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min
56+
57+
# dump data
58+
cd qlib/scripts
59+
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/stock_data/source/qlib_cn_1min --freq 1min --exclude_fields date,adjclose,dividends,splits,symbol
60+
61+
# using
62+
# the lines below are Python; run them in a Python interpreter, not in bash
import qlib
63+
from qlib.data import D
64+
65+
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_cn_1min", region="CN")
66+
df = D.features(D.instruments("all"), ["$close"], freq="1min")
67+
3068
```
3169

32-
### Normalize Data
70+
### US Data
71+
72+
#### 1d
3373

3474
```bash
35-
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source --normalize_dir ~/.qlib/stock_data/normalize --region CN
75+
76+
# download from yahoo finance
77+
python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_1d --region US --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
78+
79+
# normalize
80+
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/us_1d --normalize_dir ~/.qlib/stock_data/source/us_1d_nor --region US --interval 1d
81+
82+
# dump data
83+
cd qlib/scripts
84+
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/us_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_us_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
85+
86+
# using
87+
# the lines below are Python; run them in a Python interpreter, not in bash
import qlib
88+
from qlib.data import D
89+
90+
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_us_1d", region="US")
91+
df = D.features(D.instruments("all"), ["$close"], freq="day")
92+
3693
```
3794

95+
3896
### Help
3997
```bash
4098
python collector.py collector_data --help
4199
```
42100

43101
## Parameters
44102

45-
- interval: 1m or 1d
103+
- interval: 1min or 1d
46104
- region: CN or US

0 commit comments

Comments
 (0)