Skip to content

Commit

Permalink
chg: Use bz2 dump instead of straight json
Browse files: browse the repository at this point in the history
  • Loading branch information
Rafiot committed Mar 11, 2024
1 parent a47106d commit 059c0e1
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 78 deletions.
26 changes: 19 additions & 7 deletions bin/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
from __future__ import annotations

import gzip
import bz2
import json
import logging
import logging.config
import shutil

from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
Expand Down Expand Up @@ -38,11 +40,12 @@ def __init__(self, loglevel: int=logging.INFO):
phishtank_api_key = get_config('generic', 'phishtank_api_key')
self.expire_urls = get_config('generic', 'expire_urls')
self.fetch_freq = get_config('generic', 'dump_fetch_frequency')
self.useragent = get_config('generic', 'phishtank_useragent')

if phishtank_api_key:
self.json_db_url = f'https://data.phishtank.com/data/{phishtank_api_key}/online-valid.json'
self.json_db_url = f'https://data.phishtank.com/data/{phishtank_api_key}/online-valid.json.bz2'
else:
self.json_db_url = 'https://data.phishtank.com/data/online-valid.json'
self.json_db_url = 'https://data.phishtank.com/data/online-valid.json.bz2'

def _to_run_forever(self) -> None:
if to_import := self._fetch():
Expand Down Expand Up @@ -70,13 +73,22 @@ def _fetch(self) -> Path | None:
# response = requests.head(self.json_db_url)

self.logger.info('Fetching new file...')
headers = {'user-agent': 'phishtank/phishtank-lookup (Lookyloo)'}
headers = {'user-agent': self.useragent}
response = requests.get(self.json_db_url, headers=headers)
self.logger.info('Fetching done.')
dest_file = self.data_dir / f'{datetime.now().isoformat()}.json'
with dest_file.open('w') as f:
json.dump(response.json(), f)
return dest_file
if content := response.content:
try:
json_response = json.loads(bz2.decompress(content))
except Exception as e:
self.logger.error(f'Error while reading bz2 file from {self.json_db_url}: {e}')
return None
dest_file = self.data_dir / f'{datetime.now().isoformat()}.json'
with dest_file.open('w') as f:
json.dump(json_response, f)
return dest_file
else:
self.logger.error('JSON received from Phishtank is empty.')
return None

def _import(self, to_import: Path) -> None:
'''Import a dump
Expand Down
Loading

0 comments on commit 059c0e1

Please sign in to comment.