Migrate to selenium #41

Open · wants to merge 8 commits into master

5 changes: 5 additions & 0 deletions README.md
@@ -1,3 +1,8 @@
# Goodreads Snarfer

Forked from Goodreads Scraper. <b>This README will be out of date for a while.</b>
The original README.md contents follow:

# Goodreads Scraper

These Python scripts can be used to collect book reviews and metadata from Goodreads.
14 changes: 14 additions & 0 deletions cli/get_web_driver.py
@@ -0,0 +1,14 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chromium.webdriver import ChromiumDriver


def get_web_driver(browser_name: str) -> ChromiumDriver:
    if browser_name == 'chrome':
        # Selenium 4 expects the driver path to be wrapped in a Service object.
        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    elif browser_name == 'edge':
        driver = webdriver.Edge()
    else:
        raise ValueError(f"Unsupported browser {browser_name!r}: please select 'chrome' or 'edge'")

    return driver
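
A usage sketch (not part of this diff): the helper returns a live driver, so the caller is responsible for quitting it. The URL is only an example.

    from get_web_driver import get_web_driver

    driver = get_web_driver('edge')
    try:
        driver.get('https://www.goodreads.com')
        print(driver.title)
    finally:
        # Always release the browser process, even if scraping fails.
        driver.quit()
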
219 changes: 138 additions & 81 deletions get_books.py → cli/main_get_books.py
@@ -1,18 +1,23 @@
import argparse
from datetime import datetime
from typing import AnyStr
import json
import os
import re
import time

from urllib.request import urlopen
from urllib.error import HTTPError
from get_web_driver import get_web_driver
import bs4
import pandas as pd
from selenium.webdriver.chromium.webdriver import ChromiumDriver
from selenium.webdriver.common.by import By

from dataclasses import dataclass


def get_all_lists(soup):
lists = []
list_count_dict = {}

@@ -26,7 +31,6 @@ def get_all_lists(soup):

i = 0
while soup.find('a', {'class': 'next_page'}) and i <= 10:

time.sleep(2)
next_url = 'https://www.goodreads.com' + soup.find('a', {'class': 'next_page'})['href']
source = urlopen(next_url)
@@ -49,17 +53,16 @@ def get_all_lists(soup):


def get_shelves(soup):

shelf_count_dict = {}

if soup.find('a', text='See top shelves…'):

# Find shelves text.
shelves_url = soup.find('a', text='See top shelves…')['href']
source = urlopen('https://www.goodreads.com' + shelves_url)
soup = bs4.BeautifulSoup(source, 'lxml')
shelves = [' '.join(node.text.strip().split()) for node in soup.find_all('div', {'class': 'shelfStat'})]

# Format shelves text.
shelf_count_dict = {}
for _shelf in shelves:
@@ -80,42 +83,46 @@ def get_genres(soup):
return genres


def get_series_name(soup):
def get_series_name(soup, driver):
series = soup.find(id="bookSeries").find("a")
if series:
series_name = re.search(r'\((.*?)\)', series.text).group(1)
return series_name
else:
if not series:
return ""
# TODO: the page layout has changed; this will no longer work
series_name = re.search(r'\((.*?)\)', series.text).group(1)
return series_name


def get_series_uri(soup):
def get_series_uri(soup, driver):
series = soup.find(id="bookSeries").find("a")
if series:
series_uri = series.get("href")
return series_uri
else:
if not series:
return ""
series_uri = series.get("href")
return series_uri


def get_top_5_other_editions(soup):
other_editions = []
for div in soup.findAll('div', {'class': 'otherEdition'}):
other_editions.append(div.find('a')['href'])
return other_editions

def get_isbn(soup):
try:
isbn = re.findall(r'nisbn: [0-9]{10}' , str(soup))[0].split()[1]
return isbn
except:
return "isbn not found"

def get_isbn13(soup):
try:
isbn13 = re.findall(r'nisbn13: [0-9]{13}' , str(soup))[0].split()[1]
return isbn13
except:
return "isbn13 not found"
def get_isbn(soup, driver) -> str | None:
# try:
# isbn = re.findall(r'nisbn: [0-9]{10}', str(soup))[0].split()[1]
# return isbn
# except:
# raise RuntimeError("isbn not found")
return None


def get_isbn13(soup, driver) -> str | None:
# try:
# isbn13 = re.findall(r'nisbn13: [0-9]{13}', str(soup))[0].split()[1]
# return isbn13
# except:
# return "isbn13 not found"
return None
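
The two stubs above return None because the old 'nisbn' text is no longer present in the page source. One possible replacement, sketched here and not part of this diff, reads the schema.org JSON-LD block that the redesigned book pages embed; whether that block is always present and carries an 'isbn' field is an assumption.

    import json

    import bs4
    from selenium.webdriver.chromium.webdriver import ChromiumDriver


    def get_isbn_from_json_ld(driver: ChromiumDriver) -> str | None:
        # Parse the rendered page source instead of a separate urlopen() fetch.
        soup = bs4.BeautifulSoup(driver.page_source, 'html.parser')
        script = soup.find('script', {'type': 'application/ld+json'})
        if script is None or not script.string:
            return None
        try:
            metadata = json.loads(script.string)
        except json.JSONDecodeError:
            return None
        if not isinstance(metadata, dict):
            return None
        return metadata.get('isbn')  # assumed key; may be absent for some editions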


def get_rating_distribution(soup):
@@ -126,7 +133,7 @@ def get_rating_distribution(soup):
'4 Stars': distribution[1],
'3 Stars': distribution[2],
'2 Stars': distribution[3],
'1 Star': distribution[4]}
return distribution_dict


@@ -138,89 +145,139 @@ def get_num_pages(soup):


def get_year_first_published(soup):
year_first_published = soup.find('nobr', attrs={'class': 'greyText'})
if year_first_published:
year_first_published = year_first_published.string
return re.search('([0-9]{3,4})', year_first_published).group(1)
else:
return ''

def get_id(bookid):
pattern = re.compile("([^.-]+)")
return pattern.search(bookid).group()

def get_cover_image_uri(soup):
series = soup.find('img', id='coverImage')
if series:
series_uri = series.get('src')
return series_uri
else:
return ""

def scrape_book(book_id):
url = 'https://www.goodreads.com/book/show/' + book_id
BOOK_ID_PATTERN = re.compile("([^.-]+)")


def get_id_group(book_id: str) -> AnyStr:
return BOOK_ID_PATTERN.search(book_id).group()


def get_cover_image_uri(driver: ChromiumDriver) -> str | None:
el = driver.find_element(By.CSS_SELECTOR, "img[class='ResponsiveImage']")
src = el.get_attribute('src') if el is not None else None

return src


def get_book_title(driver: ChromiumDriver) -> str:
el = driver.find_element(By.CSS_SELECTOR, "h1[data-testid='bookTitle']")
title = el.text if el is not None else None

return title
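
Note that find_element raises NoSuchElementException rather than returning None, so the `if el is not None` guards in the two helpers above never trigger. A more defensive variant (a sketch, not part of this diff) waits for the client-rendered element and converts a timeout into None:

    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.chromium.webdriver import ChromiumDriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait


    def get_book_title_waiting(driver: ChromiumDriver, timeout: float = 10.0) -> str | None:
        try:
            # Wait for the React-rendered title instead of assuming it is already in the DOM.
            el = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "h1[data-testid='bookTitle']")))
        except TimeoutException:
            return None
        return el.text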


def scrape_book(book_id: str, driver: ChromiumDriver):
url = f'https://www.goodreads.com/book/show/{book_id}'

driver.get(url)
source = urlopen(url)
soup = bs4.BeautifulSoup(source, 'html.parser')

time.sleep(2)

return {'book_id_title': book_id,
'book_id': get_id(book_id),
'cover_image_uri': get_cover_image_uri(soup),
'book_title': ' '.join(soup.find('h1', {'id': 'bookTitle'}).text.split()),
"book_series": get_series_name(soup),
"book_series_uri": get_series_uri(soup),
book_id_title = book_id
book_id_group = get_id_group(book_id)
cover_image_uri = get_cover_image_uri(driver)
book_title = get_book_title(driver)

return {'book_id_title': book_id_title,
'book_id': book_id_group,
'cover_image_uri': cover_image_uri,
'book_title': book_title,
"book_series": get_series_name(soup, driver),
"book_series_uri": get_series_uri(soup, driver),
'top_5_other_editions': get_top_5_other_editions(soup),
'isbn': get_isbn(soup),
'isbn13': get_isbn13(soup),
'isbn': get_isbn(soup, driver),
'isbn13': get_isbn13(soup, driver),
'year_first_published': get_year_first_published(soup),
'authorlink': soup.find('a', {'class': 'authorName'})['href'],
'author': ' '.join(soup.find('span', {'itemprop': 'name'}).text.split()),
'num_pages': get_num_pages(soup),
'genres': get_genres(soup),
'shelves': get_shelves(soup),
'lists': get_all_lists(soup),
'num_ratings': soup.find('meta', {'itemprop': 'ratingCount'})['content'].strip(),
'num_reviews': soup.find('meta', {'itemprop': 'reviewCount'})['content'].strip(),
'average_rating': soup.find('span', {'itemprop': 'ratingValue'}).text.strip(),
'rating_distribution': get_rating_distribution(soup)}


def condense_books(books_directory_path):
books = []

# Load every json file in the directory whose name contains "book-metadata" and condense them into a single list
for file_name in os.listdir(books_directory_path):
if file_name.endswith('.json') and not file_name.startswith(
'.') and file_name != "all_books.json" and "book-metadata" in file_name:
_book = json.load(
open(books_directory_path + '/' + file_name, 'r')) # , encoding='utf-8', errors='ignore'))
books.append(_book)

return books
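
An equivalent sketch using pathlib and a context manager (the open() calls above leave closing the file handles to the garbage collector):

    import json
    from pathlib import Path


    def condense_books_pathlib(books_directory_path: str) -> list[dict]:
        books = []
        for path in sorted(Path(books_directory_path).glob('*book-metadata*.json')):
            # Skip hidden files and the combined output file.
            if path.name.startswith('.') or path.name == 'all_books.json':
                continue
            with path.open('r', encoding='utf-8') as f:
                books.append(json.load(f))
        return books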

def main():

start_time = datetime.now()
script_name = os.path.basename(__file__)

def get_arg_parse() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser()
parser.add_argument('--book_ids_path', type=str)
parser.add_argument('--output_directory_path', type=str)
parser.add_argument('--format', type=str, action="store", default="json",
dest="format", choices=["json", "csv"],
help="set file output format")

return parser
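
The TODO in main() below notes that the browser is still hard-coded to 'edge'. Wiring it through the parser could look like this sketch; the flag name and choices are assumptions, not part of this diff:

    import argparse


    def get_arg_parse_with_browser() -> argparse.ArgumentParser:
        parser = argparse.ArgumentParser()
        parser.add_argument('--book_ids_path', type=str)
        parser.add_argument('--output_directory_path', type=str)
        parser.add_argument('--format', type=str, action="store", default="json",
                            dest="format", choices=["json", "csv"],
                            help="set file output format")
        # Hypothetical flag so main() can call get_web_driver(args.browser)
        # instead of the hard-coded 'edge'.
        parser.add_argument('--browser', type=str, action="store", default="edge",
                            dest="browser", choices=["chrome", "edge"],
                            help="browser to drive with selenium")
        return parser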


@dataclass
class BooksToScrape:
books_to_scrape: list[str]
books_already_scraped: list[str]
book_ids: list[str]


def get_books_to_scrape(book_ids_path: str, output_directory: str) -> BooksToScrape:
# TODO: this could probably be done more simply with sets
with open(book_ids_path, 'r') as book_ids_file:
book_ids = [line.strip() for line in book_ids_file if line.strip()]
books_already_scraped = [file_name.replace('_book-metadata.json', '') for file_name in
os.listdir(output_directory) if
file_name.endswith('.json') and not file_name.startswith('all_books')]
books_to_scrape = [book_id for book_id in book_ids if book_id not in books_already_scraped]

return BooksToScrape(books_to_scrape=books_to_scrape, books_already_scraped=books_already_scraped, book_ids=book_ids)
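
A sketch of the set-based variant the TODO hints at, keeping the original id order while making the membership test O(1); it assumes the BooksToScrape dataclass defined above:

    import os


    def get_books_to_scrape_with_sets(book_ids_path: str, output_directory: str) -> BooksToScrape:
        with open(book_ids_path, 'r') as book_ids_file:
            book_ids = [line.strip() for line in book_ids_file if line.strip()]
        already_scraped = {
            file_name.replace('_book-metadata.json', '')
            for file_name in os.listdir(output_directory)
            if file_name.endswith('.json') and not file_name.startswith('all_books')
        }
        # The set gives O(1) lookups; the list comprehension preserves input order.
        books_to_scrape = [book_id for book_id in book_ids if book_id not in already_scraped]
        return BooksToScrape(books_to_scrape=books_to_scrape,
                             books_already_scraped=sorted(already_scraped),
                             book_ids=book_ids)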


def main():
start_time = datetime.now()
script_name = os.path.basename(__file__)

parser = get_arg_parse()
args = parser.parse_args()

book_ids = [line.strip() for line in open(args.book_ids_path, 'r') if line.strip()]
books_already_scraped = [file_name.replace('_book-metadata.json', '') for file_name in os.listdir(args.output_directory_path) if file_name.endswith('.json') and not file_name.startswith('all_books')]
books_to_scrape = [book_id for book_id in book_ids if book_id not in books_already_scraped]
condensed_books_path = args.output_directory_path + '/all_books'
# TODO: make browser name an arg
driver = get_web_driver('edge')

output_directory_path = args.output_directory_path
book_ids_path = args.book_ids_path

scrape_info = get_books_to_scrape(book_ids_path, output_directory_path)

for i, book_id in enumerate(books_to_scrape):
condensed_books_path = f'{output_directory_path}/all_books'

for i, book_id in enumerate(scrape_info.books_to_scrape):
try:
print(str(datetime.now()) + ' ' + script_name + ': Scraping ' + book_id + '...')
print(str(datetime.now()) + ' ' + script_name + ': #' + str(i+1+len(books_already_scraped)) + ' out of ' + str(len(book_ids)) + ' books')
print(str(datetime.now()) + ' ' + script_name + ': #' + str(
i + 1 + len(scrape_info.books_already_scraped)) + ' out of ' + str(len(scrape_info.book_ids)) + ' books')

book = scrape_book(book_id)
book = scrape_book(book_id, driver)
# Add book metadata to file name to be more specific
json.dump(book, open(args.output_directory_path + '/' + book_id + '_book-metadata.json', 'w'))

@@ -230,17 +287,17 @@ def main():
print(e)
exit(0)


books = condense_books(args.output_directory_path)
if args.format == 'json':
json.dump(books, open(f"{condensed_books_path}.json", 'w'))
json.dump(books, open(f"{condensed_books_path}.json", 'w'), indent=4)
elif args.format == 'csv':
json.dump(books, open(f"{condensed_books_path}.json", 'w'))
json.dump(books, open(f"{condensed_books_path}.json", 'w'), indent=4)
book_df = pd.read_json(f"{condensed_books_path}.json")
book_df.to_csv(f"{condensed_books_path}.csv", index=False, encoding='utf-8')


print(
str(datetime.now()) + ' ' + script_name + f':\n\n🎉 Success! All book metadata scraped. 🎉\n\nMetadata files have been output to /{args.output_directory_path}\nGoodreads scraping run time = ⏰ ' + str(
datetime.now() - start_time) + ' ⏰')
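
The csv branch above writes the json file and immediately reads it back with pandas; a direct conversion from the in-memory list is a possible simplification (a sketch, not part of this diff):

    import pandas as pd


    def write_books_csv(books: list[dict], condensed_books_path: str) -> None:
        # Build the DataFrame straight from the condensed list and skip the json round-trip.
        book_df = pd.DataFrame(books)
        book_df.to_csv(f"{condensed_books_path}.csv", index=False, encoding='utf-8')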


if __name__ == '__main__':