This repository was archived by the owner on Feb 16, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
216 lines (164 loc) · 6.71 KB
/
utils.py
File metadata and controls
216 lines (164 loc) · 6.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""
Copyright (C) 2022 J. S. Grewal <rg_public@proton.me>
Title: utils.py
Versioning:
python 3.10
black 22.10
isort 5.10
Description:
Responsible for conducting tests on all user inputs and providing the
dataframe to array conversion method.
"""
from os import PathLike
from typing import Dict, List, Union
import numpy as np
import numpy.typing as npt
import pandas as pd
# np.float_ was removed in NumPy 2.0; np.float64 is the identical scalar type
# and is available on every NumPy version
NDArrayFloat = npt.NDArray[np.float64]

# default assertion errors
td: str = "variable must be of type dict"
ti: str = "variable must be of type int"
tl: str = "variable must be of type list"
ts: str = "variable must be of type str"
def market_data_tests(
    start: str,
    end: str,
    SAVE_SINGLES: int,
    stooq: Dict[str, list],
    path: Union[str, bytes, PathLike],
    path_singles: Union[str, bytes, PathLike],
    price_type: str,
    price_type_singles: str,
) -> None:
    """
    Conduct tests on all user inputs prior to scraping historical financial data
    from Stooq.

    Parameters:
        start: start date in YYYY-MM-DD format
        end: end date in YYYY-MM-DD format, must be later than start
        SAVE_SINGLES: binary of whether to save single asset data
        stooq: market data to download from Stooq, every value being a
            two-element list of a string and a non-empty list of unique asset
            strings
        path: directory to save data
        path_singles: directory to save singles data
        price_type: type of market price to utilise
        price_type_singles: type of market price to utilise for singles

    Raises:
        AssertionError: if any input fails validation
        ValueError: if a date component is not numeric
    """
    date_format = "data format must be YYYY-MM-DD"
    relative_dir = "file path must be in a sub-directory relative to main.py"
    valid_prices = ("Open", "High", "Low", "Close")
    path_types = (str, bytes, PathLike)

    assert isinstance(start, str), ts
    # the length check stops short strings from raising IndexError on indexing
    assert len(start) == 10 and start[4] == start[7] == "-", date_format
    assert isinstance(end, str), ts
    assert len(end) == 10 and end[4] == end[7] == "-", date_format

    y_s, m_s, d_s = start[:4], start[5:7], start[8:]
    y_e, m_e, d_e = end[:4], end[5:7], end[8:]

    # int() raises ValueError for any non-numeric component
    year_s, month_s, day_s = int(y_s), int(m_s), int(d_s)
    year_e, month_e, day_e = int(y_e), int(m_e), int(d_e)

    assert 1900 < year_s, "start year should be post 1900"
    assert 0 < month_s <= 12, "only 12 months in year"
    assert 0 < day_s <= 31, "maximum 31 days per month"

    assert 1900 < year_e, "end year should be post 1900"
    assert 0 < month_e <= 12, "only 12 months in year"
    assert 0 < day_e <= 31, "maximum 31 days per month"

    # zero-padded YYYYMMDD integers are ordered exactly like the dates
    assert int(y_e + m_e + d_e) > int(
        y_s + m_s + d_s
    ), "end date must exceed start date"

    assert isinstance(SAVE_SINGLES, int), ti
    assert SAVE_SINGLES in (0, 1), "must be 0 or 1"

    assert isinstance(stooq, dict), td

    for name, mkt in stooq.items():
        assert isinstance(name, str), ts
        assert isinstance(mkt, list), tl
        assert len(mkt) == 2, "mkt must be of length 2"
        assert isinstance(mkt[0], str), ts
        assert isinstance(mkt[1], list), tl
        assert len(mkt[1]) >= 1, "mkt must contain at least one asset"
        # type check first so unhashable entries fail here, not inside set()
        assert all(isinstance(a, str) for a in mkt[1]), ts
        assert len(mkt[1]) == len(set(mkt[1])), "mkt must contain only unique elements"

    # NOTE(review): only str paths can satisfy the prefix/suffix checks below;
    # bytes never compare equal to "./" and PathLike objects are not
    # subscriptable
    assert isinstance(path, path_types)
    assert path[0:2] == "./" and path[-1] == "/", relative_dir

    assert isinstance(path_singles, path_types)
    assert path_singles[0:2] == "./" and path_singles[-1] == "/", relative_dir

    assert isinstance(price_type, str)
    assert (
        price_type.capitalize() in valid_prices
    ), "price_type must be one of Open, High, Low, or Close"

    assert isinstance(price_type_singles, str)
    assert (
        price_type_singles.capitalize() in valid_prices
    ), "price_type_singles must be one of Open, High, Low, or Close"

    print("Market Import Tests: Passed")
def dataframe_to_array(market_data: pd.DataFrame, price_type: str) -> NDArrayFloat:
    """
    Build a cleaned numpy array holding one price type for every asset.

    Parameters:
        market_data: raw dataframe generated by pandas_datareader from remote source
        price_type: 'Open', 'High', 'Low', or 'Close' prices for the time step

    Returns:
        prices: cleaned array of asset prices of a given type, one row per
            remaining date and one column per asset
    """
    column_key = str(price_type).capitalize()

    # keep only the rows in which every asset has a value
    frame = market_data[column_key].dropna()

    # flip newest-first data so that the earliest data point sits at index 0
    first_stamp, last_stamp = frame.index[0], frame.index[-1]
    if first_stamp > last_stamp:
        frame = frame[::-1]

    n_days = frame.index.shape[0]
    n_assets = frame.columns.shape[0]
    prices = np.empty((n_days, n_assets), dtype=np.float64)

    # copy each asset series into its own column
    for col, asset in enumerate(frame.columns):
        prices[:, col] = frame[str(asset)]

    return prices
def extract_name_descriptor(
    names: Union[pd.Index, pd.DataFrame, List[str]]
) -> List[str]:
    """
    Convert Stooq asset names to labels compatible with file saving.

    Parameters:
        names: Stooq lookup names for assets; any iterable of strings is accepted
            (iterating a dataframe yields its column labels)

    Returns:
        descriptors: labels used for file saving, with every "^" removed and
            every "." replaced by "-"
    """
    # a single translation table applies both substitutions in one pass
    table = str.maketrans(".", "-", "^")

    # a new list is always built, so the caller's names are never modified
    return [name.translate(table) for name in names]
def single_asset_histories(
    market_data: pd.DataFrame,
    price_type_singles: str,
    path_singles: Union[str, bytes, PathLike],
) -> None:
    """
    Save historical data for each singular asset included in the datasets obtained
    from Stooq. Note that each asset array may not be temporally in sync due to the
    removal of missing values.

    For every asset three files named "stooq_<label>" are written to path_singles:
    a pickle and a CSV of the series as selected (missing values included) and a
    .npy array of shape (n_days, 1) with missing values removed.

    Parameters:
        market_data: raw dataframe generated by pandas_datareader from remote source
        price_type_singles: "Open", "High", "Low", or "Close" prices for the time step
        path_singles: location to save data
    """
    # local import keeps this block self-contained; the file only imports
    # PathLike from os at module level
    import os

    market = market_data[str(price_type_singles).capitalize()]

    # format time ordering if needed (earliest data point is at index 0)
    if market.index[0] > market.index[-1]:
        market = market[::-1]

    # normalise str, bytes and PathLike directories to str so that file names
    # can be joined on regardless of type or of a trailing separator
    directory = os.fsdecode(path_singles)

    names = market.keys()
    descriptors = extract_name_descriptor(names)

    for item, label in zip(names, descriptors):
        asset = market[item]
        stem = os.path.join(directory, "stooq_" + label)

        # saved before cleaning, so these files keep the missing values
        asset.to_pickle(stem + ".pkl")
        asset.to_csv(stem + ".csv")

        # remove all dates with missing values
        asset = asset.dropna()
        n_days = asset.index.shape[0]

        history = np.empty((n_days, 1), dtype=np.float64)
        history[:, 0] = asset

        # np.save appends the ".npy" extension itself
        np.save(stem, history)

        print(f"{label}: days = {n_days}")