From ea66d6b981554f9d0262038aef2106dda7138316 Mon Sep 17 00:00:00 2001 From: raphf6 <97462273+raphf6@users.noreply.github.com> Date: Sun, 18 Dec 2022 23:12:39 +0000 Subject: [PATCH 1/5] In daily.py, Fixed the issue where the stores were being returned from Yahoo! Finance as encrypted due to a recent change in their API (around 2 days ago). Fix decodes the data in decrypt_cryptojs_aes() and makes small changes to _read_one_data() to parse it properly later. --- pandas_datareader/yahoo/daily.py | 107 +++++++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 14 deletions(-) diff --git a/pandas_datareader/yahoo/daily.py b/pandas_datareader/yahoo/daily.py index 0e8a8a7e..81035dad 100644 --- a/pandas_datareader/yahoo/daily.py +++ b/pandas_datareader/yahoo/daily.py @@ -4,6 +4,11 @@ import re import time +import hashlib +from base64 import b64decode +from Crypto.Cipher import AES +from Crypto.Util.Padding import unpad + from pandas import DataFrame, isnull, notnull, to_datetime from pandas_datareader._utils import RemoteDataError @@ -11,6 +16,74 @@ from pandas_datareader.yahoo.headers import DEFAULT_HEADERS +def decrypt_cryptojs_aes(data): + encrypted_stores = data['context']['dispatcher']['stores'] + _cs = data["_cs"] + _cr = data["_cr"] + + _cr = b"".join(int.to_bytes(i, length=4, byteorder="big", signed=True) for i in json.loads(_cr)["words"]) + password = hashlib.pbkdf2_hmac("sha1", _cs.encode("utf8"), _cr, 1, dklen=32).hex() + + encrypted_stores = b64decode(encrypted_stores) + assert encrypted_stores[0:8] == b"Salted__" + salt = encrypted_stores[8:16] + encrypted_stores = encrypted_stores[16:] + + def EVPKDF( + password, + salt, + keySize=32, + ivSize=16, + iterations=1, + hashAlgorithm="md5", + ) -> tuple: + """OpenSSL EVP Key Derivation Function + Args: + password (Union[str, bytes, bytearray]): Password to generate key from. + salt (Union[bytes, bytearray]): Salt to use. + keySize (int, optional): Output key length in bytes. Defaults to 32. + ivSize (int, optional): Output Initialization Vector (IV) length in bytes. Defaults to 16. + iterations (int, optional): Number of iterations to perform. Defaults to 1. + hashAlgorithm (str, optional): Hash algorithm to use for the KDF. Defaults to 'md5'. + Returns: + key, iv: Derived key and Initialization Vector (IV) bytes. + Taken from: https://gist.github.com/rafiibrahim8/0cd0f8c46896cafef6486cb1a50a16d3 + OpenSSL original code: https://github.com/openssl/openssl/blob/master/crypto/evp/evp_key.c#L78 + """ + + assert iterations > 0, "Iterations can not be less than 1." + + if isinstance(password, str): + password = password.encode("utf-8") + + final_length = keySize + ivSize + key_iv = b"" + block = None + + while len(key_iv) < final_length: + hasher = hashlib.new(hashAlgorithm) + if block: + hasher.update(block) + hasher.update(password) + hasher.update(salt) + block = hasher.digest() + for _ in range(1, iterations): + block = hashlib.new(hashAlgorithm, block).digest() + key_iv += block + + key, iv = key_iv[:keySize], key_iv[keySize:final_length] + return key, iv + + key, iv = EVPKDF(password, salt, keySize=32, ivSize=16, iterations=1, hashAlgorithm="md5") + + cipher = AES.new(key, AES.MODE_CBC, iv=iv) + plaintext = cipher.decrypt(encrypted_stores) + plaintext = unpad(plaintext, 16, style="pkcs7") + decoded_stores = json.loads(plaintext) + + return decoded_stores + + class YahooDailyReader(_DailyBaseReader): """ Returns DataFrame of with historical over date range, @@ -56,19 +129,19 @@ class YahooDailyReader(_DailyBaseReader): """ def __init__( - self, - symbols=None, - start=None, - end=None, - retry_count=3, - pause=0.1, - session=None, - adjust_price=False, - ret_index=False, - chunksize=1, - interval="d", - get_actions=False, - adjust_dividends=True, + self, + symbols=None, + start=None, + end=None, + retry_count=3, + pause=0.1, + session=None, + adjust_price=False, + ret_index=False, + chunksize=1, + interval="d", + get_actions=False, + adjust_dividends=True, ): super().__init__( symbols=symbols, @@ -150,7 +223,13 @@ def _read_one_data(self, url, params): ptrn = r"root\.App\.main = (.*?);\n}\(this\)\);" try: j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1)) - data = j["context"]["dispatcher"]["stores"]["HistoricalPriceStore"] + + if "_cs" in j and "_cr" in j: + new_j = decrypt_cryptojs_aes(j) # returns j["context"]["dispatcher"]["stores"] + # from old code + + data = new_j['HistoricalPriceStore'] + except KeyError: msg = "No data fetched for symbol {} using {}" raise RemoteDataError(msg.format(symbol, self.__class__.__name__)) From a1e0c776c2389564137787a428bdb93a955c9de6 Mon Sep 17 00:00:00 2001 From: raphi6 <69864267+raphi6@users.noreply.github.com> Date: Sun, 25 Dec 2022 00:10:50 +0000 Subject: [PATCH 2/5] Update requirements.txt Updating with recommendations from @satoshi --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 25c6f68a..aa4a8be1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ lxml pandas>=0.23 requests>=2.19.0 +pycryptodome>=3.16.0 +packaging>=22.0 From 805ef9d04b21f96f5a1810df1c1cb905d87c9e70 Mon Sep 17 00:00:00 2001 From: rapha Date: Wed, 28 Dec 2022 23:46:53 +0000 Subject: [PATCH 3/5] Refactor: running black pandas-datareader and reformatted 2 files. --- pandas_datareader/tests/io/test_jsdmx.py | 2 +- pandas_datareader/tests/yahoo/test_options.py | 2 +- pandas_datareader/yahoo/daily.py | 55 +++++++++++-------- 3 files changed, 33 insertions(+), 26 deletions(-) diff --git a/pandas_datareader/tests/io/test_jsdmx.py b/pandas_datareader/tests/io/test_jsdmx.py index 066cc159..b95b0108 100644 --- a/pandas_datareader/tests/io/test_jsdmx.py +++ b/pandas_datareader/tests/io/test_jsdmx.py @@ -170,7 +170,7 @@ def test_quartervalue(dirpath): "2011-10-01", ], dtype="datetime64[ns]", - name=u"Period", + name="Period", freq=None, ) tm.assert_index_equal(result.index, expected) diff --git a/pandas_datareader/tests/yahoo/test_options.py b/pandas_datareader/tests/yahoo/test_options.py index d37839d8..17f0f74b 100644 --- a/pandas_datareader/tests/yahoo/test_options.py +++ b/pandas_datareader/tests/yahoo/test_options.py @@ -100,7 +100,7 @@ def assert_option_result(self, df): ] ) tm.assert_index_equal(df.columns, exp_columns) - assert df.index.names == [u"Strike", u"Expiry", u"Type", u"Symbol"] + assert df.index.names == ["Strike", "Expiry", "Type", "Symbol"] dtypes = [ np.dtype(x) diff --git a/pandas_datareader/yahoo/daily.py b/pandas_datareader/yahoo/daily.py index 81035dad..dfbdd043 100644 --- a/pandas_datareader/yahoo/daily.py +++ b/pandas_datareader/yahoo/daily.py @@ -17,11 +17,14 @@ def decrypt_cryptojs_aes(data): - encrypted_stores = data['context']['dispatcher']['stores'] + encrypted_stores = data["context"]["dispatcher"]["stores"] _cs = data["_cs"] _cr = data["_cr"] - _cr = b"".join(int.to_bytes(i, length=4, byteorder="big", signed=True) for i in json.loads(_cr)["words"]) + _cr = b"".join( + int.to_bytes(i, length=4, byteorder="big", signed=True) + for i in json.loads(_cr)["words"] + ) password = hashlib.pbkdf2_hmac("sha1", _cs.encode("utf8"), _cr, 1, dklen=32).hex() encrypted_stores = b64decode(encrypted_stores) @@ -30,12 +33,12 @@ def decrypt_cryptojs_aes(data): encrypted_stores = encrypted_stores[16:] def EVPKDF( - password, - salt, - keySize=32, - ivSize=16, - iterations=1, - hashAlgorithm="md5", + password, + salt, + keySize=32, + ivSize=16, + iterations=1, + hashAlgorithm="md5", ) -> tuple: """OpenSSL EVP Key Derivation Function Args: @@ -74,7 +77,9 @@ def EVPKDF( key, iv = key_iv[:keySize], key_iv[keySize:final_length] return key, iv - key, iv = EVPKDF(password, salt, keySize=32, ivSize=16, iterations=1, hashAlgorithm="md5") + key, iv = EVPKDF( + password, salt, keySize=32, ivSize=16, iterations=1, hashAlgorithm="md5" + ) cipher = AES.new(key, AES.MODE_CBC, iv=iv) plaintext = cipher.decrypt(encrypted_stores) @@ -129,19 +134,19 @@ class YahooDailyReader(_DailyBaseReader): """ def __init__( - self, - symbols=None, - start=None, - end=None, - retry_count=3, - pause=0.1, - session=None, - adjust_price=False, - ret_index=False, - chunksize=1, - interval="d", - get_actions=False, - adjust_dividends=True, + self, + symbols=None, + start=None, + end=None, + retry_count=3, + pause=0.1, + session=None, + adjust_price=False, + ret_index=False, + chunksize=1, + interval="d", + get_actions=False, + adjust_dividends=True, ): super().__init__( symbols=symbols, @@ -225,10 +230,12 @@ def _read_one_data(self, url, params): j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1)) if "_cs" in j and "_cr" in j: - new_j = decrypt_cryptojs_aes(j) # returns j["context"]["dispatcher"]["stores"] + new_j = decrypt_cryptojs_aes( + j + ) # returns j["context"]["dispatcher"]["stores"] # from old code - data = new_j['HistoricalPriceStore'] + data = new_j["HistoricalPriceStore"] except KeyError: msg = "No data fetched for symbol {} using {}" From 075a6ddb16e03789ee5d71d1a70ed73aed53b7ce Mon Sep 17 00:00:00 2001 From: rapha Date: Thu, 29 Dec 2022 02:18:36 +0000 Subject: [PATCH 4/5] Updated docs/source/whatsnew/vLATEST.txt --- docs/source/whatsnew/v0.10.0.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/whatsnew/v0.10.0.txt b/docs/source/whatsnew/v0.10.0.txt index fb464ceb..9fc7c33a 100644 --- a/docs/source/whatsnew/v0.10.0.txt +++ b/docs/source/whatsnew/v0.10.0.txt @@ -11,6 +11,7 @@ Bug Fixes - Fixed Yahoo readers which now require headers - Fixed other reader - Improved compatibility with pandas +- Decoding stores from Yahoo that were encrypted Contributors ~~~~~~~~~~~~ @@ -26,6 +27,7 @@ Thanks to all of the contributors for the 0.10.0 release (based on git log): - Lukas Halim - Simon Garisch - Dmitry Alekseev +- Raphael Frach These lists of names are automatically generated based on git log, and may not be complete. \ No newline at end of file From 87dda3f297df8f4b3253c6f2d5006b5ac43a9150 Mon Sep 17 00:00:00 2001 From: hellc Date: Sun, 15 Jan 2023 23:15:17 +0300 Subject: [PATCH 5/5] Decryption issue fix --- pandas_datareader/yahoo/daily.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/pandas_datareader/yahoo/daily.py b/pandas_datareader/yahoo/daily.py index dfbdd043..96308cfb 100644 --- a/pandas_datareader/yahoo/daily.py +++ b/pandas_datareader/yahoo/daily.py @@ -18,16 +18,11 @@ def decrypt_cryptojs_aes(data): encrypted_stores = data["context"]["dispatcher"]["stores"] - _cs = data["_cs"] - _cr = data["_cr"] - - _cr = b"".join( - int.to_bytes(i, length=4, byteorder="big", signed=True) - for i in json.loads(_cr)["words"] - ) - password = hashlib.pbkdf2_hmac("sha1", _cs.encode("utf8"), _cr, 1, dklen=32).hex() + password_key = next(key for key in data.keys() if key not in ["context", "plugins"]) + password = data[password_key] encrypted_stores = b64decode(encrypted_stores) + assert encrypted_stores[0:8] == b"Salted__" salt = encrypted_stores[8:16] encrypted_stores = encrypted_stores[16:] @@ -229,11 +224,9 @@ def _read_one_data(self, url, params): try: j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1)) - if "_cs" in j and "_cr" in j: - new_j = decrypt_cryptojs_aes( - j - ) # returns j["context"]["dispatcher"]["stores"] - # from old code + new_j = decrypt_cryptojs_aes( + j + ) data = new_j["HistoricalPriceStore"]