Commit 155f803

fix get data error (#1793)

* fix get data error
* fix get v0 data error
* optimize get_data code
* fix pylint error
* add comments

1 parent 6302101 commit 155f803

File tree

2 files changed: +47 -26 lines changed


examples/orderbook_data/README.md

Lines changed: 3 additions & 2 deletions
@@ -16,7 +16,7 @@ Current version of script with default value tries to connect localhost **via de

 Run following command to install necessary libraries
 ```
-pip install pytest coverage
+pip install pytest coverage gdown
 pip install arctic # NOTE: pip may fail to resolve the right package dependency !!! Please make sure the dependency are satisfied.
 ```


@@ -27,7 +27,8 @@ pip install arctic # NOTE: pip may fail to resolve the right package dependency

 2. Please follow following steps to download example data
 ```bash
 cd examples/orderbook_data/
-python ../../scripts/get_data.py download_data --target_dir . --file_name highfreq_orderbook_example_data.zip
+gdown https://drive.google.com/uc?id=15nZF7tFT_eKVZAcMFL1qPS4jGyJflH7e # Proxies may be necessary here.
+python ../../scripts/get_data.py _unzip --file_path highfreq_orderbook_example_data.zip --target_dir .
 ```

 3. Please import the example data to your mongo db
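
The new step 2 fetches the example archive with gdown and then unzips it through scripts/get_data.py, which exposes the methods of the `GetData` class (changed in the next file) as subcommands. As a minimal sketch, assuming the archive has already been downloaded into the current directory, the same unzip step can also be done directly from Python:

```python
from qlib.tests.data import GetData

# Equivalent of: python ../../scripts/get_data.py _unzip --file_path highfreq_orderbook_example_data.zip --target_dir .
# Note: delete_old defaults to True, so existing qlib data directories under target_dir are removed first.
GetData._unzip(file_path="highfreq_orderbook_example_data.zip", target_dir=".")
```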

qlib/tests/data.py

Lines changed: 44 additions & 24 deletions
@@ -12,15 +12,11 @@
 from tqdm import tqdm
 from pathlib import Path
 from loguru import logger
-from cryptography.fernet import Fernet
 from qlib.utils import exists_qlib_data


 class GetData:
-    REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data"
-    # "?" is not included in the token.
-    TOKEN = b"gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy"
-    KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA="
+    REMOTE_URL = "https://github.com/SunsetWolf/qlib_dataset/releases/download"

     def __init__(self, delete_zip_file=False):
         """
@@ -33,9 +29,45 @@ def __init__(self, delete_zip_file=False):
         self.delete_zip_file = delete_zip_file

     def merge_remote_url(self, file_name: str):
-        fernet = Fernet(self.KEY)
-        token = fernet.decrypt(self.TOKEN).decode()
-        return f"{self.REMOTE_URL}/{file_name}?{token}"
+        """
+        Generate download links.
+
+        Parameters
+        ----------
+        file_name: str
+            The name of the file to be downloaded.
+            The file name can be accompanied by a version number, (e.g.: v2/qlib_data_simple_cn_1d_latest.zip),
+            if no version number is attached, it will be downloaded from v0 by default.
+        """
+        return f"{self.REMOTE_URL}/{file_name}" if "/" in file_name else f"{self.REMOTE_URL}/v0/{file_name}"
+
+    def download(self, url: str, target_path: [Path, str]):
+        """
+        Download a file from the specified url.
+
+        Parameters
+        ----------
+        url: str
+            The url of the data.
+        target_path: str
+            The location where the data is saved, including the file name.
+        """
+        file_name = str(target_path).rsplit("/", maxsplit=1)[-1]
+        resp = requests.get(url, stream=True, timeout=60)
+        resp.raise_for_status()
+        if resp.status_code != 200:
+            raise requests.exceptions.HTTPError()
+
+        chunk_size = 1024
+        logger.warning(
+            f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)"
+        )
+        logger.info(f"{os.path.basename(file_name)} downloading......")
+        with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar:
+            with target_path.open("wb") as fp:
+                for chunk in resp.iter_content(chunk_size=chunk_size):
+                    fp.write(chunk)
+                    p_bar.update(chunk_size)

     def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True):
         """
@@ -70,21 +102,7 @@ def download_data(self, file_name: str, target_dir: [Path, str], delete_old: boo
         target_path = target_dir.joinpath(_target_file_name)

         url = self.merge_remote_url(file_name)
-        resp = requests.get(url, stream=True, timeout=60)
-        resp.raise_for_status()
-        if resp.status_code != 200:
-            raise requests.exceptions.HTTPError()
-
-        chunk_size = 1024
-        logger.warning(
-            f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)"
-        )
-        logger.info(f"{os.path.basename(file_name)} downloading......")
-        with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar:
-            with target_path.open("wb") as fp:
-                for chunk in resp.iter_content(chunk_size=chunk_size):
-                    fp.write(chunk)
-                    p_bar.update(chunk_size)
+        self.download(url=url, target_path=target_path)

         self._unzip(target_path, target_dir, delete_old)
         if self.delete_zip_file:
@@ -99,7 +117,9 @@ def check_dataset(self, file_name: str):
         return status

     @staticmethod
-    def _unzip(file_path: Path, target_dir: Path, delete_old: bool = True):
+    def _unzip(file_path: [Path, str], target_dir: [Path, str], delete_old: bool = True):
+        file_path = Path(file_path)
+        target_dir = Path(target_dir)
         if delete_old:
             logger.warning(
                 f"will delete the old qlib data directory(features, instruments, calendars, features_cache, dataset_cache): {target_dir}"
