-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathloader.py
More file actions
207 lines (173 loc) · 8.49 KB
/
Copy pathloader.py
File metadata and controls
207 lines (173 loc) · 8.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import pandas as pd
import yfinance as yf
from typing import Union
def load_data(tickers: Union[str, list[str]], start_date = '1995-01-01') -> dict[str, dict[str, any]]:
    """
    Fetch info, price history and split records for one or more tickers via yfinance.

    A ticker ends up in the result only if all three requests succeed. As soon as
    one of them raises, the error is printed and the ticker is left out entirely.

    :param tickers: a single ticker symbol or a list of ticker symbols
    :param start_date: start date passed to the price-history request
    :return: dictionary keyed by ticker.
        Each value is a dictionary with the keys 'info', 'historical_data', 'splits'
    """
    # accept a bare string as a one-element list
    symbols = [tickers] if isinstance(tickers, str) else tickers

    stock_data = {}
    for ticker in symbols:
        stock = yf.Ticker(ticker)

        # general information about the stock
        try:
            stock_info = stock.info
        except Exception as e:
            print(f"Could not get info for {ticker}.")
            print(f'Error: {e}')
            continue  # nothing is stored for this ticker

        # price history from start_date onwards
        try:
            historical_data = stock.history(start=start_date)
        except Exception as e:
            print(f"Could not get historical data for {ticker}.")
            print(f'Error: {e}')
            continue

        # stock splits
        try:
            splits = stock.splits
        except Exception as e:
            print(f"Could not get splits for {ticker}.")
            print(f'Error: {e}')
            continue

        # all three requests succeeded: keep the ticker
        stock_data[ticker] = {
            'info': stock_info,
            'historical_data': historical_data,
            'splits': splits,
        }
    return stock_data
def round_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Round the OHLC columns to a number of decimals that depends on the price level.

    Values above 10 keep 2 decimals, values above 1 keep 3 decimals and all other
    values keep 4 decimals. The columns are overwritten on the dataframe that was
    passed in, and that same dataframe is returned.

    :param df: dataframe with 'Open', 'High', 'Low' and 'Close' columns
    :return: the input dataframe with rounded price columns
    """
    def _round_price(price):
        # cheaper prices keep more decimals
        if price > 10:
            return round(price, 2)
        if price > 1:
            return round(price, 3)
        return round(price, 4)

    for name in ('Open', 'High', 'Low', 'Close'):
        df[name] = df[name].apply(_round_price)
    return df
def remove_typos_and_missing_data(df: pd.DataFrame, ticker: str) -> pd.DataFrame:
    """
    Drop rows whose OHLC prices are missing or inconsistent.

    A row is kept only if all four prices are present, all are greater than zero,
    High is not below Low, Open or Close, and Low is not above Open or Close.
    When rows were dropped, a one-line summary is printed.

    :param df: dataframe with historical data
    :param ticker: ticker of the stock, used only in the printed summary
    :return: dataframe containing only the rows that passed every check
    """
    price_columns = ['Open', 'High', 'Low', 'Close']
    initial_rows = len(df)

    # discard rows where at least one of the four prices is NaN
    df = df.dropna(subset=price_columns, how='any')

    # keep rows in which every price is strictly positive
    df = df[(df[price_columns] > 0).all(axis=1)]

    # High must bound the other prices from above, Low from below
    ordering_rules = (
        ('High', 'Low'),
        ('High', 'Open'),
        ('High', 'Close'),
        ('Open', 'Low'),
        ('Close', 'Low'),
    )
    for upper, lower in ordering_rules:
        df = df[df[upper] >= df[lower]]

    final_rows = len(df)
    if initial_rows != final_rows:
        print(f'From {ticker} dataset were removed {initial_rows - final_rows} rows with missing or incorrect data')
    return df
def check_prices_volumes_excursion(df: pd.DataFrame, min_volume: float = 1000) -> tuple[list, list, float]:
    """
    Check for dates with same OHLC prices, dates with low volume, and compute the price excursion.

    The excursion is computed on the per-row mean of Open, High, Low and Close:
    (highest mean - lowest mean) / lowest mean * 100.

    :param df: dataframe with historical data ('Open', 'High', 'Low', 'Close', 'Volume' columns)
    :param min_volume: rows with a volume strictly below this value are reported as low volume
    :return: tuple of three items: list of index labels where the four prices are equal,
        list of index labels with low volume, excursion of the average price in percent
    """
    # rows where Open, High, Low and Close are all the same value
    same_price_mask = (
        (df['Open'] == df['High'])
        & (df['High'] == df['Low'])
        & (df['Low'] == df['Close'])
    )
    same_price_dates = df[same_price_mask].index.tolist()

    # rows with volume below the threshold
    low_volume_dates = df[df['Volume'] < min_volume].index.tolist()

    # per-row average of the four prices
    avg_price = df[['Open', 'High', 'Low', 'Close']].mean(axis=1)

    # relative distance between the highest and lowest average price of the whole history
    max_avg_price = avg_price.max()
    min_avg_price = avg_price.min()
    excursion = (max_avg_price - min_avg_price) / min_avg_price * 100

    return same_price_dates, low_volume_dates, excursion
def identify_anomalies(df: pd.DataFrame, threshold1: float = 0.35, threshold2: float = 0.50) -> dict[str, list]:
    """
    Identify anomalies in the stock data: cases where variations between prices are too high.

    Three relative variations are checked:
    - open against the close of the previous row (checked from the second row on,
      because the first row has no previous close)
    - high against low of the same row (checked on every row)
    - close against open of the same row (checked on every row)

    :param df: dataframe with historical data ('Open', 'High', 'Low', 'Close' columns)
    :param threshold1: maximum accepted relative difference between open and previous close,
        and between close and open of the same row
    :param threshold2: maximum accepted relative difference between high and low of the same row
    :return: dictionary with anomalies.
        Keys are 'Open-pClose Anomalies', 'High-Low Anomalies', 'Close-Open Anomalies'
        Values are lists of tuples with the index label and the two prices compared
    """
    anomalies = {
        'Open-pClose Anomalies': [],
        'High-Low Anomalies': [],
        'Close-Open Anomalies': []
    }

    # pull each column out once instead of building a row object per iteration
    rows = zip(
        df.index,
        df['Open'].to_numpy(),
        df['High'].to_numpy(),
        df['Low'].to_numpy(),
        df['Close'].to_numpy(),
    )

    previous_close = None  # stays None only while processing the first row
    for date, current_open, current_high, current_low, current_close in rows:
        # gap between this open and the PREVIOUS close exceeds threshold1
        if previous_close is not None:
            if abs(current_open - previous_close) / previous_close > threshold1:
                anomalies['Open-pClose Anomalies'].append((date, current_open, previous_close))

        # high-low excursion within the row exceeds threshold2
        if (current_high - current_low) / current_low > threshold2:
            anomalies['High-Low Anomalies'].append((date, current_high, current_low))

        # move between the CURRENT open and close exceeds threshold1
        if abs(current_close - current_open) / current_open > threshold1:
            anomalies['Close-Open Anomalies'].append((date, current_close, current_open))

        previous_close = current_close

    return anomalies
def check_clean_data(stock_data: dict[str, dict[str, any]], verbose: bool = False) -> dict[str, dict[str, any]]:
    """
    Clean, round and inspect the historical data of every ticker.

    For each ticker that has non-empty historical data, the data is passed through
    remove_typos_and_missing_data and round_values, the result is stored back under
    'historical_data', and the findings of check_prices_volumes_excursion and
    identify_anomalies are printed. Tickers without historical data are left as they are.

    :param stock_data: dictionary with stock data; it is updated in place
    :param verbose: if True, also print the dates / tuples behind each finding
    :return: the same dictionary that was passed in.
        keys are tickers and values are dictionaries with keys 'info', 'historical_data', 'splits'
    """
    for ticker, data in stock_data.items():
        historical_data = data.get('historical_data')
        if historical_data is None or historical_data.empty:
            continue  # nothing to clean for this ticker

        # clean first, then round, and store the result back on the ticker entry
        rounded_data = round_values(remove_typos_and_missing_data(historical_data, ticker))
        data['historical_data'] = rounded_data

        # gather the findings
        same_price_dates, low_volume_dates, avg_excursion = check_prices_volumes_excursion(rounded_data)
        anomalies = identify_anomalies(rounded_data)
        num_anomalies = sum(len(found) for found in anomalies.values())

        # report the findings
        if same_price_dates:
            print(f"Ticker: {ticker} has the same OHLC prices on {len(same_price_dates)} dates")
            if verbose:
                print(same_price_dates)

        if low_volume_dates:
            print(f"Ticker: {ticker} has low volume on {len(low_volume_dates)} dates")
            if verbose:
                print(low_volume_dates)

        if avg_excursion < 100:
            print(f"Ticker: {ticker} has an average price excursion of less than 100%: {avg_excursion:.2f}%")

        if num_anomalies:
            print(f"Ticker: {ticker} has {num_anomalies} anomalies:")
            for key, value in anomalies.items():
                if not value:
                    continue
                print(f" {key}: {len(value)}")
                if verbose:
                    print(value)

    return stock_data