From 8d90a51566e68a1cf2486e9f39d79045a038e530 Mon Sep 17 00:00:00 2001 From: Murali Krishna Date: Sat, 18 Jul 2020 11:16:30 +0530 Subject: [PATCH 1/4] HackerEarth scraper has similarly been reworked to be functional; Bugfixes for CodeChef scraper --- scrapers/codechef.py | 6 +++--- scrapers/hackerearth.py | 41 ++++++++++++++++++++++++++++++++--------- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/scrapers/codechef.py b/scrapers/codechef.py index 86c661a..98fdc2b 100644 --- a/scrapers/codechef.py +++ b/scrapers/codechef.py @@ -55,6 +55,7 @@ def get_rankings(site, contest_code): if page == total_pages-1: # Reached Last Page break driver.get(site + f'&page={page+2}') # go to next page + return scraped_scoreboard def scrape(contest_codes): scoreboard_base_url:str = "https://www.codechef.com/rankings" @@ -70,11 +71,10 @@ def scrape(contest_codes): easy_points=100000 # Initial value set to points per problem for Division in divisions: # Build the scraped scoreboard - divisions[Division].problems = get_problems(f"{site_url}/{contest_code}{Division}") - divisions[Division].scraped_scoreboard = get_rankings(f"{scoreboard_base_url}/{contest_code}{Division}{scoreboard_filter_query}", contest_code) + divisions[Division] = divisions[Division]._replace(problems = get_problems(f"{site_url}/{contest_code}{Division}")) + divisions[Division] = divisions[Division]._replace(scraped_scoreboard = get_rankings(f"{scoreboard_base_url}/{contest_code}{Division}{scoreboard_filter_query}", contest_code)) easy_points=len(divisions['B'].problems-divisions['A'].problems)*easy_points # Points to add to div-A participants - for i in range(len(divisions['A'].scraped_scoreboard)): # Add easy points to all div-A participants divisions['A'].scraped_scoreboard[i]=divisions['A'].scraped_scoreboard[i][0],divisions['A'].scraped_scoreboard[i][1]+easy_points diff --git a/scrapers/hackerearth.py b/scrapers/hackerearth.py index c82670f..1e47651 100644 --- a/scrapers/hackerearth.py +++ b/scrapers/hackerearth.py @@ -1,13 +1,18 @@ import requests +import sys +import os +sys.path.append( # Add absolute path of utils to sys.path + os.path.join( os.path.dirname( os.path.realpath( __file__ )), + '../../student-ratings' )) from bs4 import BeautifulSoup from tinydb import TinyDB, where from database.db_tools import DB_FILE, HACKEREARTH -import logging +from utils.log import * # 0 - event_id # 1 - page number -leaderboard_base_url = 'https://www.hackerearth.com/AJAX/feed/newsfeed/icpc-leaderboard/event/{0}/{1}/' - +API = 'https://www.hackerearth.com/AJAX/feed/newsfeed/icpc-leaderboard/event/{0}/{1}/' +leaderboard_base_url = 'https://www.hackerearth.com/challenges/competitive/{}/leaderboard/' def get_handles(html_doc): soup = BeautifulSoup(html_doc, 'html.parser') @@ -28,24 +33,42 @@ def get_leaderboard(event_id): leaderboard = [] while True: - r = requests.get(leaderboard_base_url.format(event_id, page_num)) - handles = get_handles(r.text) + + handles = get_handles(requests.get(API.format(event_id, page_num)).text) # url returns last page for page_num greater than last page number - if leaderboard[-len(handles):] == handles: + if leaderboard[-len(handles):] == handles or page_num > 10: break leaderboard.extend(handles) + info(f'Handles retrieved for page {page_num}') page_num += 1 - logging.info(page_num) return leaderboard +def get_contest_IDs(contest_codes): + contest_IDs = dict() + for contest_code in contest_codes: + # Gets the ICPC contest ID from the url; We use this ID to send the request + contest_IDs[contest_code] = 
BeautifulSoup(requests.get(leaderboard_base_url.format(contest_code)).text, 'html.parser').find('div', class_='event-id').text # + + return contest_IDs -if __name__ == "__main__": +def scrape(contest_codes): + contest_IDs = get_contest_IDs(contest_codes) + leaderboards = [] + for contest in contest_IDs: + info(f'HackerEarth contest {contest}:') + leaderboard = get_leaderboard(contest_IDs[contest]) + #contest_ranks_file = f'database/contest_ranks/hackerearth-{contest}.in' + leaderboards.append(leaderboard) + return leaderboards + +'''if __name__ == "__main__": logging.basicConfig(level='INFO') event_id = '814357' leaderboard = get_leaderboard(event_id) - print(*leaderboard, sep='\n') + print(*leaderboard, sep='\n')''' + """Uncomment to output mapped PES handles only with TinyDB(DB_FILE) as database: From f60efa959ff30b7c9dc5de5e4cc41542d8d3f651 Mon Sep 17 00:00:00 2001 From: Murali Krishna Date: Sat, 18 Jul 2020 20:04:58 +0530 Subject: [PATCH 2/4] Created a main file; It performs all the functionalities covered by executor.sh. Updated README and requirements --- .gitignore | 6 +- README.md | 28 +-- ...estsToProcess.in => contest_names_file.in} | 0 database/db_tools.py | 30 +-- ratings/processor.py | 56 +++--- requirements.txt | 10 +- run.py | 172 ++++++++++++++++++ scrapers/codechef.py | 4 - scrapers/hackerearth.py | 9 +- 9 files changed, 252 insertions(+), 63 deletions(-) rename database/{contestsToProcess.in => contest_names_file.in} (100%) create mode 100644 run.py diff --git a/.gitignore b/.gitignore index 6115d9f..9bad418 100644 --- a/.gitignore +++ b/.gitignore @@ -43,4 +43,8 @@ pip-delete-this-directory.txt .pytest_cache/ # Mypy (static type checking) -.mypy_cache/ \ No newline at end of file +.mypy_cache/ + +# Credentials for calendar +credentials.json +token.pickle \ No newline at end of file diff --git a/README.md b/README.md index bbd8faf..570e589 100644 --- a/README.md +++ b/README.md @@ -3,28 +3,28 @@ [![Build Status](https://travis-ci.com/varunvora/alcoding.svg?branch=master)](https://travis-ci.com/varunvora/alcoding) -Alcoding Club of [PES University](https://pes.edu/) maintains ratings of its students who are active in [competitive programming](https://en.wikipedia.org/wiki/Competitive_programming). This repository contains the ratings and the code which generates it. +Alcoding Club of [PES University](https://pes.edu/) maintains ratings of its students who are active in [competitive programming](https://en.wikipedia.org/wiki/Competitive_programming). This repository contains the ratings and the code that generates it. ## Purpose -An intra-college rating is maintained so that the club can identify good coders. The club will group these students and help them improve at competitive programming by organizing meet-ups, providing resources, arranging contests and develop a coding community in the University. +An intra-college rating is maintained to aid the club in identifying good coders. The club aims to help these students improve their competitive programming skills by organizing meet-ups, providing resources, arranging contests and developing a coding community in the University. ## Ratings -The ratings are calculated by students' performances in [specified contests](database/README.md). +The ratings are calculated using students' performances in [specified contests](database/README.md). ### Mechanism -A [rank list](database/contest_ranks) of registered students is generated at the end of each contest. 
A rating is computed from the rank list, which indicates their relative performance. The implementation is almost the same as [Codechef's Rating Mechanism](https://www.codechef.com/ratings) which is a modified version of [Elo rating system](https://en.wikipedia.org/wiki/Elo_rating_system). To avoid students from [protecting their ratings](https://en.wikipedia.org/wiki/Elo_rating_system#Game_activity_versus_protecting_one's_rating) and encourage participation, a decay rule is also added which decrements a student's rating by 1% if she does not take part in 5 consecutive contests.
+A [rank list](database/contest_ranks) of registered students is generated at the end of each contest. A rating, which indicates each student's relative performance, is computed from this rank list. The implementation is very similar to [Codechef's Rating Mechanism](https://www.codechef.com/ratings), which is a modified version of the [Elo rating system](https://en.wikipedia.org/wiki/Elo_rating_system). To prevent students from [protecting their ratings](https://en.wikipedia.org/wiki/Elo_rating_system#Game_activity_versus_protecting_one's_rating) and to encourage participation, a decay rule is also added: it decrements a student's rating by 1% if they do not take part in 5 consecutive rated contests.
 
 ### Verification
 
-The [code that generates the rating](ratings/processor.py) is open. Along with that we have provided [a script with which you can verify](executor.sh) that the displayed ratings are correct. This script resets all students' ratings, and computes the ratings after all the contest ranks are considered. You may [report an issue](https://github.com/varunvora/alcoding/issues) if you find any discrepancy.
+The [code that generates the rating](ratings/processor.py) is open. We also provide [a method with which you can verify](run.py) the displayed ratings. This method resets all students' ratings and recomputes every student's rating after considering all contest ranks. Please do [report an issue](https://github.com/pes-alcoding-club/student-ratings/issues) if you find any discrepancy.
 
 ## Calendar
 
-Alcoding Club maintains a [Google calendar for competitive programming](https://calendar.google.com/calendar?cid=N3RsZGt1dXEwcW1mOW9ub2Jxb3ByZ2Z1cDRAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ). Contests that are marked as "Rated" will be considered for these ratings.
+Alcoding Club maintains a [Google Calendar for competitive programming](https://calendar.google.com/calendar?cid=N3RsZGt1dXEwcW1mOW9ub2Jxb3ByZ2Z1cDRAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ). Contests that are marked "Rated" will be considered for these ratings.
 
 ## Contribute
 
-This project is still very small so there are no strict guidelines for contribution. For now we are following [PEP 8 -- Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/).
+At the moment, there are no strict guidelines for contribution. As a standard, we follow the [PEP 8 -- Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/).
 
-You can [report an issue](https://github.com/varunvora/alcoding/issues) if you find a bug or any other change you would like to make. You may also make a [pull request](https://github.com/varunvora/alcoding/pulls). It would be helpful if you use [our Github labels](https://github.com/varunvora/alcoding/labels) for all issues and pull requests. Be sure to clearly document and describe any issues or changes. 
+Feel free to [report an issue](https://github.com/pes-alcoding-club/student-ratings/issues) if you find a bug, or have any other change you would like to see. You may also create a [pull request](https://github.com/pes-alcoding-club/student-ratings/pulls). It would be helpful if you use [our Github labels](https://github.com/pes-alcoding-club/student-ratings/labels) for all issues and pull requests. Be sure to clearly document and describe any issues or changes. ## FAQ @@ -37,11 +37,11 @@ You can [report an issue](https://github.com/varunvora/alcoding/issues) if you f 1. Which contests are taken into account for rating? Contests in ['Competitive Programming PESU' Calendar](https://calendar.google.com/calendar?cid=N3RsZGt1dXEwcW1mOW9ub2Jxb3ByZ2Z1cDRAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) are considered for ratings. -1. How can I tell if these ratings are legitimate? +1. How can I tell whether these ratings are legitimate? - You can verify the ratings yourself by running [this script](executor.sh). It resets all students' ratings to default values and recomputes it for all contests so far in chronological order. - -1. How can I get the scoreboard only for some particular contest(s)? - - Clone this repository, open [executor.sh](executor.sh) and remove the contests you do not want the scoreboard for. Run this script and check [scoreboard.csv](scoreboard.csv). + You can verify the ratings yourself by calling the [make_scoreboard] function in [run.py](run.py). It resets all students' ratings to default values and recomputes it for all contests so far in chronological order. +1. How can I make a scoreboard for a few particular contests? + Firstly, clone this repository. + Create your own [contest_names_file.in](database/contest_names_file.in) and add the contest names in the format [platform]-[month]-[contest_code]. In [run.py](run.py), change the [contest_names_file_path] variable's value to your file's path. + Now call the [make_scoreboard] function in [run.py](run.py) with the required parameters and check [scoreboard.csv](scoreboard.csv). \ No newline at end of file diff --git a/database/contestsToProcess.in b/database/contest_names_file.in similarity index 100% rename from database/contestsToProcess.in rename to database/contest_names_file.in diff --git a/database/db_tools.py b/database/db_tools.py index c48796e..40c4874 100644 --- a/database/db_tools.py +++ b/database/db_tools.py @@ -1,7 +1,6 @@ import re import sys import csv -import logging from os import listdir from os.path import join from collections import Counter @@ -9,6 +8,7 @@ from typing import List, Set, Tuple, Dict, Callable, Any from tinydb import TinyDB, where from ratings import elo +from utils.log import * DB_FILE: str = 'database/db.json' CONTEST_RANKS_DIR: str = 'database/contest_ranks' @@ -57,7 +57,7 @@ def reset_database(db_file: str = DB_FILE) -> None: BEST: elo.DEFAULT_RATING, TIMES_PLAYED: 0, LAST_FIVE: 5}) - logging.info(f'Successfully reset database and stored in {db_file}') + info(f'Successfully reset database and stored in {db_file}') def get_site_name_from_file_name(file_name: str) -> str: @@ -68,8 +68,8 @@ def get_site_name_from_file_name(file_name: str) -> str: """ file_name_parts = file_name.split("-") if len(file_name_parts) < 2 or file_name_parts[0] not in SITES: - logging.error(f"Invalid filename '{file_name}' in contest ranks. File name convention is" - f"'site-contest-details.in'") + error(f"Invalid filename '{file_name}' in contest ranks. 
File name convention is" + f"'site-month-contestCode.in'") quit() return file_name_parts[0] @@ -144,7 +144,7 @@ def log_unmapped_handles(site_username_tuple_list: List[Tuple[str, str]]) -> Non log_unmapped_handles(site_handle_tuple_list) - logging.info('Mapped ') + info('Mapped usernames to SRNs') def remove_unmapped_handles_from_rank_file(file_name: str) -> None: @@ -152,10 +152,11 @@ def remove_unmapped_handles_from_rank_file(file_name: str) -> None: Removes unmapped handles from outdated rank files to reduce space and time it takes for the script to run """ - with open(join(CONTEST_RANKS_DIR, file_name), 'r') as rank_file: + with open(file_name, 'r') as rank_file: input_data: str = rank_file.read() - - with open(join(CONTEST_RANKS_DIR, file_name), 'w') as rank_file: + #print(len(input_data)) + count = 0 + with open(file_name, 'w') as rank_file: for user_name_line in input_data.split("\n"): check_occurrence_in_line: bool = False for user_name in user_name_line.split(): @@ -164,7 +165,10 @@ def remove_unmapped_handles_from_rank_file(file_name: str) -> None: rank_file.write(user_name + " ") if check_occurrence_in_line: rank_file.write("\n") - logging.info(f'Cleaned {file_name}') + count+=1 + #print(count) + loginfo = file_name.split('/')[2] + info(f'Cleaned {loginfo}') def export_to_csv(db_file: str = DB_FILE, scoreboard_file: str = SCOREBOARD_FILE) -> None: @@ -190,7 +194,7 @@ def export_to_csv(db_file: str = DB_FILE, scoreboard_file: str = SCOREBOARD_FILE wr = csv.writer(fp) wr.writerows(csv_table) - logging.info(f'Successfully exported database from {db_file} to {scoreboard_file}') + info(f'Successfully exported database from {db_file} to {scoreboard_file}') def prettify(db_file: str = DB_FILE) -> None: @@ -201,11 +205,11 @@ def prettify(db_file: str = DB_FILE) -> None: fp.write_back(fp.all()) -if __name__ == "__main__": +'''if __name__ == "__main__": # While executing this script, you can specify which function to execute func_str: str = sys.argv[1] try: func_obj: Callable = globals()[func_str] func_obj(*sys.argv[2:]) # Arguments to specified function can be passed - except KeyError: - logging.error(f'Provided invalid argument. No function {func_str}') + except KeyError:' + error(f'Provided invalid argument. 
No function {func_str}')''' \ No newline at end of file diff --git a/ratings/processor.py b/ratings/processor.py index 17c302b..0aaab6c 100644 --- a/ratings/processor.py +++ b/ratings/processor.py @@ -1,22 +1,23 @@ import sys -import logging from time import time from ratings import elo from database import db_tools as db from tinydb import TinyDB, where - +from utils.log import * class RatingProcessor: - def __init__(self, database: TinyDB, rank_file): + def __init__(self, database: TinyDB, rank_file_path): self.database: TinyDB = database self.N: int = 0 self.Cf: float = 0.0 self.Rb_Vb_list: list = [] self.usn_rank_dict: dict = {} + self.rank_file_path = rank_file_path + self.rank_file = open(rank_file_path) - self.read_contest_ranks(rank_file) # sets usn_rank_dict + self.read_contest_ranks(self.rank_file) # sets usn_rank_dict self.set_contest_details() # sets N, Cf and Rb_Vb_list self.process_competition() # uses the set attributes to compute new ratings @@ -35,9 +36,9 @@ def read_contest_ranks(self, rank_file) -> None: self.usn_rank_dict[usn] = current_rank same_rank_count += 1 else: - logging.info(f'Ignoring usn {usn}') + info(f'Ignoring SRN {usn}') current_rank += same_rank_count # ranks are not 1, 1, 1, 2 but 1, 1, 1, 4 - logging.debug(self.usn_rank_dict) + debug(self.usn_rank_dict) def set_contest_details(self) -> None: """ @@ -54,12 +55,12 @@ def set_contest_details(self) -> None: self.N = len(self.usn_rank_dict) self.Cf = elo.Cf(rating_list, vol_list, self.N) self.Rb_Vb_list = list(zip(rating_list, vol_list)) - logging.debug(f'Contest: {rank_file_path}\nPlayers: {self.N}\nCompetition Factor: {self.Cf}') + debug(f'Contest: {self.rank_file_path}\nPlayers: {self.N}\nCompetition Factor: {self.Cf}') @staticmethod def _decay_player(player_dict: dict) -> None: """ - Reduces ratings by 10% for those who have competed at least once + Reduces ratings by 1% for those who have competed at least once but have not taken part in the past 5 contests :param player_dict: dict with all details of a player """ @@ -76,7 +77,7 @@ def _decay_player(player_dict: dict) -> None: player_dict[db.RATING] = rating player_dict[db.LAST_FIVE] = max(1, last_five) - logging.debug('Successfully decayed ratings') + debug('Successfully decayed ratings') def _update_player(self, player_dict: dict, actual_rank: int) -> None: """ @@ -99,27 +100,27 @@ def _update_player(self, player_dict: dict, actual_rank: int) -> None: player_dict[db.BEST] = max(old_best, new_rating) player_dict[db.LAST_FIVE] = 5 - logging.debug('Successfully updated ratings') + debug('Successfully updated ratings') def process_competition(self) -> None: rows = self.database.all() for row in rows: - logging.debug(f'Before: {row}') + debug(f'Before: {row}') if row[db.USN] in self.usn_rank_dict: actual_rank = self.usn_rank_dict[row[db.USN]] self._update_player(row, actual_rank) else: self._decay_player(row) - logging.debug(f'After: {row}') + debug(f'After: {row}') self.database.write_back(rows) -def read_argv(argv_format_alert: str): - """ +"""def read_argv(argv_format_alert: str): + ''' :param argv_format_alert: An error message on what the command line arguments should be :return: rank file if argv is valid - """ + ''' try: assert len(sys.argv) == 2 rank_file = sys.argv[1] @@ -128,15 +129,28 @@ def read_argv(argv_format_alert: str): return rank_file except IOError or FileNotFoundError: - logging.error(f'Invalid file path for rank file: {rank_file}\n{argv_format_alert}') + error(f'Invalid file path for rank file: {rank_file}\n{argv_format_alert}') 
quit() except AssertionError: - logging.error(f'Invalid command line arguments.\n{argv_format_alert}') - quit() + error(f'Invalid command line arguments.\n{argv_format_alert}') + quit()""" +def process(rank_file_path): + start_time = time() + # Main logic starts here + database_obj = TinyDB(db.DB_FILE) + RatingProcessor(database_obj, rank_file_path) + database_obj.close() -if __name__ == "__main__": + duration = time()-start_time + debug(f'Updated ratings for {rank_file_path}') + if duration > 10: + critical(f'Ratings update for {rank_file_path} took {duration} seconds.\n' + f'Consider removing unnecessary handles or optimize ratings algorithm') + + +'''if __name__ == "__main__": start_time = time() argv_format = 'processor.py rank_file_path' @@ -149,7 +163,7 @@ def read_argv(argv_format_alert: str): database_obj.close() duration = time()-start_time - logging.debug(f'Updated ratings for {rank_file_path}') + debug(f'Updated ratings for {rank_file_path}') if duration > 10: logging.critical(f'Ratings update for {rank_file_path} took {duration} seconds.\n' - f'Consider removing unnecessary handles or optimize ratings algorithm') + f'Consider removing unnecessary handles or optimize ratings algorithm')''' diff --git a/requirements.txt b/requirements.txt index c731e28..3d32c79 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ -tinydb -requests -bs4 -selenium \ No newline at end of file +requests==2.22.0 +beautifulsoup4==4.9.1 +google_api_python_client==1.10.0 +google_auth_oauthlib==0.4.1 +selenium==3.141.0 +tinydb==3.15.2 diff --git a/run.py b/run.py new file mode 100644 index 0000000..8cb0b83 --- /dev/null +++ b/run.py @@ -0,0 +1,172 @@ +import datetime +import pickle +import os.path +from googleapiclient.discovery import build +from google_auth_oauthlib.flow import InstalledAppFlow +from google.auth.transport.requests import Request +import sys +import os +sys.path.append( # Add absolute path of utils to sys.path + os.path.join( os.path.dirname( os.path.realpath( __file__ )), + '../student-ratings' )) +from scrapers import codechef, hackerearth +from database import db_tools as tools +from ratings import processor +from pathlib import Path +from collections import defaultdict +from utils.log import * + +PATH_TO_RANK_FILES = 'database/contest_ranks/' # Change this path to 'database/[YOUR_CUSTOM_RANKS_DIR]' to calculate ratings for only a few contests +contest_names_file_path = 'database/contest_names_file.in' # Change this path to 'database/[YOUR_CUSTOM_CONTEST_NAMES_FILE.in]' and add required (supported) contests to calculate ratings for only those +SCOPES = ['https://www.googleapis.com/auth/calendar.readonly'] +months = ['jan', 'feb', 'march', 'april', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] + +class contest_details(): + def __init__(self, url): + self.website = str(url[0].split('.')[1]) # Fetch the platform name + self.contest_code = str(url[-1]) + self.file_name = self.make_file_name() + + def make_file_name(self): + if self.website == 'codechef': + if self.contest_code[0:5] == 'LTIME': + #Specific formula to determine month and year for Lunchtime based on numeric ID + month, year = months[(int(self.contest_code[5:]) + 4) % 12], int((int(self.contest_code[5:]) + 5) / 12) + 13 + return f'codechef-{month}-lunchtime-{year}.in' + #Specific formula to determine month and year for Cookoff based on numeric ID + elif self.contest_code[0:4]=="COOK": + month, year = months[(int(self.contest_code[4:]) + 6) % 12], int((int(self.contest_code[4:]) + 7) / 12) + 10 + 
return f'codechef-{month}-cookoff-{year}.in' + else: + month = self.contest_code[:-2].lower() + return f'codechef-{month}-long-{self.contest_code[-2:]}.in' + elif self.website == 'hackerearth': + return f'hackerearth-{self.contest_code}.in' + + def set_leaderboard(self, leaderboard): + self.leaderboard = leaderboard + + +def get_calendar_events(DAYS): + """This block of code is to allow OAuth""" + creds = None + if os.path.exists('token.pickle'): + with open('token.pickle', 'rb') as token: + creds = pickle.load(token) + if not creds or not creds.valid: + if creds and creds.expired and creds.refresh_token: + creds.refresh(Request()) + else: + flow = InstalledAppFlow.from_client_secrets_file( + 'credentials.json', SCOPES) + creds = flow.run_local_server(port=0) + with open('token.pickle', 'wb') as token: + pickle.dump(creds, token) + + service = build('calendar', 'v3', credentials=creds) + + """ Call the Calendar API + Z indicates UTC time, as Google requires + the input timezones to be consistent """ + now = datetime.utcnow().isoformat() + 'Z' + tmin = (datetime.utcnow() - timedelta(days=DAYS)).isoformat() + 'Z' + response = service.events().list(calendarId='7tldkuuq0qmf9onobqoprgfup4@group.calendar.google.com', timeMin=tmin, + timeMax=now, singleEvents=True, + orderBy='startTime').execute() + + calendar_response = response.get('items', []) + return calendar_response + + +def get_all_contests(DAYS): + calendar_response = get_calendar_events(DAYS) + contests = defaultdict(list) + contest_names_file = open(contest_names_file_path, 'r+') + existing_contests = list(contest_names_file.read().split('\n')) + if not calendar_response: + info('No upcoming contests found.') + contest_names_file.close() + return calendar_response + else: + for event in calendar_response: + try: + url = event['location'].replace('https://', '').split('/') + except: + error('The contest {} does not have an associated website and is hence ignored.'.format(event['summary'])) + continue + try: + url.remove('') # To remove any unexpected blank items caused by a trailing slash + except: + pass + + contest = contest_details(url) # Create a contest_details object for the contest + if contest.website not in ['codechef', 'hackerearth']: # Only codechef and hackerearth scrapers are compatible as of now + continue + if contest.file_name not in existing_contests: # Checks whether the contest has already been scraped + contest_names_file.write(contest.file_name+'\n') + contests[contest.website].append(contest) + else: + warn(f'{contest.file_name} already exists, ignoring; To re-scrape, delete the file and remove this entry.') + + contest_names_file.close() + return contests + + +""" The scrapers take in a list of contest id's at a go to avoid the overhead of repeatedly calling it. 
This means that the output + leaderboards have to be reverse mapped back to the contest_details objects; Since they are in a list and the order is preserved, + we use the index of the leaderboard and map it to the object of the same index """ +def scrape(DAYS=30): + contests = get_all_contests(DAYS) + if contests: + leaderboards = codechef.scrape(list(contest.contest_code for contest in contests['codechef'])) + assert len(leaderboards) == len(contests['codechef']) # Make sure the number of leaderboards is the same as number of contests + for i in range(len(leaderboards)): + contests['codechef'][i].set_leaderboard(leaderboards[i]) + + leaderboards = hackerearth.scrape(list(contest.contest_code for contest in contests['hackerearth'])) + assert len(leaderboards) == len(contests['hackerearth']) # Make sure the number of leaderboards is the same as number of contests + for i in range(len(leaderboards)): + contests['hackerearth'][i].set_leaderboard(leaderboards[i]) + + for platform in contests: + for contest in contests[platform]: + file_path = PATH_TO_RANK_FILES + contest.file_name + with open (file_path, 'w+') as rank_file: + for rank in contest.leaderboard: + rank_file.write(rank + '\n') + info('Wrote to {file_path}') + + +def recalculate(clean=False): + contest_names_file = open(contest_names_file_path, 'r') + contest_names = list(contest_names_file.read().split('\n')) + try: + contest_names.remove('') # Removes trailing newline in case the input file had it + except: + pass + info('Built list of files to process') + for contest in contest_names: + if clean: + """ Removes handles that couldn't be mapped to a USN + Usually required in a contest where we couldn't obtain handles of only required students """ + tools.remove_unmapped_handles_from_rank_file(f'{PATH_TO_RANK_FILES}{contest}') + processor.process(f'{PATH_TO_RANK_FILES}{contest}') # Call the processor for each contest + info(f'Processed contest: {contest}') + tools.export_to_csv() + tools.prettify() + contest_names_file.close() + + +def make_scoreboard(map_USN=True, clean=False): + tools.reset_database() + if map_USN: + tools.map_username_to_usn() + recalculate(clean) + + +def execute(DAYS=30, map_USN=True, clean=False): + scrape(DAYS=DAYS) + make_scoreboard(map_USN=map_USN, clean=clean) + +#execute(clean=True) +make_scoreboard(map_USN=True, clean=True) \ No newline at end of file diff --git a/scrapers/codechef.py b/scrapers/codechef.py index 98fdc2b..e85cf7c 100644 --- a/scrapers/codechef.py +++ b/scrapers/codechef.py @@ -3,9 +3,6 @@ from datetime import datetime import sys import os -sys.path.append( # Add absolute path of utils to sys.path - os.path.join( os.path.dirname( os.path.realpath( __file__ )), - '../../student-ratings' )) from utils import selenium_utils from utils.log import info @@ -17,7 +14,6 @@ division = namedtuple('division',['problems','scraped_scoreboard']) divisions: dict = {'A':division(set(), list()),'B':division(set(), list())} -month = ['jan', 'feb', 'march', 'april', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] def get_problems(site): diff --git a/scrapers/hackerearth.py b/scrapers/hackerearth.py index 1e47651..5bf62a8 100644 --- a/scrapers/hackerearth.py +++ b/scrapers/hackerearth.py @@ -1,9 +1,6 @@ import requests import sys import os -sys.path.append( # Add absolute path of utils to sys.path - os.path.join( os.path.dirname( os.path.realpath( __file__ )), - '../../student-ratings' )) from bs4 import BeautifulSoup from tinydb import TinyDB, where from database.db_tools import DB_FILE, 
HACKEREARTH @@ -36,11 +33,11 @@ def get_leaderboard(event_id): handles = get_handles(requests.get(API.format(event_id, page_num)).text) # url returns last page for page_num greater than last page number - if leaderboard[-len(handles):] == handles or page_num > 10: + if leaderboard[-len(handles):] == handles: break leaderboard.extend(handles) - info(f'Handles retrieved for page {page_num}') + debug(f'Handles retrieved for page {page_num}') page_num += 1 return leaderboard @@ -49,7 +46,7 @@ def get_contest_IDs(contest_codes): contest_IDs = dict() for contest_code in contest_codes: # Gets the ICPC contest ID from the url; We use this ID to send the request - contest_IDs[contest_code] = BeautifulSoup(requests.get(leaderboard_base_url.format(contest_code)).text, 'html.parser').find('div', class_='event-id').text # + contest_IDs[contest_code] = BeautifulSoup(requests.get(leaderboard_base_url.format(contest_code)).text, 'html.parser').find('div', class_='event-id').text return contest_IDs From 9e1cbd8d5dff39534c1f9838ab8468c2a19a790e Mon Sep 17 00:00:00 2001 From: Murali Krishna Date: Mon, 20 Jul 2020 21:35:32 +0530 Subject: [PATCH 3/4] Removed stray comment, changed namespace for log --- database/db_tools.py | 16 +++++++--------- ratings/processor.py | 24 ++++++++++++------------ run.py | 14 +++++++------- scrapers/codechef.py | 9 ++++----- scrapers/hackerearth.py | 10 +++++----- 5 files changed, 35 insertions(+), 38 deletions(-) diff --git a/database/db_tools.py b/database/db_tools.py index 40c4874..ee15c90 100644 --- a/database/db_tools.py +++ b/database/db_tools.py @@ -8,7 +8,7 @@ from typing import List, Set, Tuple, Dict, Callable, Any from tinydb import TinyDB, where from ratings import elo -from utils.log import * +from utils import log DB_FILE: str = 'database/db.json' CONTEST_RANKS_DIR: str = 'database/contest_ranks' @@ -57,7 +57,7 @@ def reset_database(db_file: str = DB_FILE) -> None: BEST: elo.DEFAULT_RATING, TIMES_PLAYED: 0, LAST_FIVE: 5}) - info(f'Successfully reset database and stored in {db_file}') + log.info(f'Successfully reset database and stored in {db_file}') def get_site_name_from_file_name(file_name: str) -> str: @@ -68,7 +68,7 @@ def get_site_name_from_file_name(file_name: str) -> str: """ file_name_parts = file_name.split("-") if len(file_name_parts) < 2 or file_name_parts[0] not in SITES: - error(f"Invalid filename '{file_name}' in contest ranks. File name convention is" + log.error(f"Invalid filename '{file_name}' in contest ranks. 
File name convention is" f"'site-month-contestCode.in'") quit() return file_name_parts[0] @@ -144,7 +144,7 @@ def log_unmapped_handles(site_username_tuple_list: List[Tuple[str, str]]) -> Non log_unmapped_handles(site_handle_tuple_list) - info('Mapped usernames to SRNs') + log.info('Mapped usernames to SRNs') def remove_unmapped_handles_from_rank_file(file_name: str) -> None: @@ -154,7 +154,6 @@ def remove_unmapped_handles_from_rank_file(file_name: str) -> None: """ with open(file_name, 'r') as rank_file: input_data: str = rank_file.read() - #print(len(input_data)) count = 0 with open(file_name, 'w') as rank_file: for user_name_line in input_data.split("\n"): @@ -166,9 +165,8 @@ def remove_unmapped_handles_from_rank_file(file_name: str) -> None: if check_occurrence_in_line: rank_file.write("\n") count+=1 - #print(count) loginfo = file_name.split('/')[2] - info(f'Cleaned {loginfo}') + log.info(f'Cleaned {loginfo}') def export_to_csv(db_file: str = DB_FILE, scoreboard_file: str = SCOREBOARD_FILE) -> None: @@ -194,7 +192,7 @@ def export_to_csv(db_file: str = DB_FILE, scoreboard_file: str = SCOREBOARD_FILE wr = csv.writer(fp) wr.writerows(csv_table) - info(f'Successfully exported database from {db_file} to {scoreboard_file}') + log.info(f'Successfully exported database from {db_file} to {scoreboard_file}') def prettify(db_file: str = DB_FILE) -> None: @@ -212,4 +210,4 @@ def prettify(db_file: str = DB_FILE) -> None: func_obj: Callable = globals()[func_str] func_obj(*sys.argv[2:]) # Arguments to specified function can be passed except KeyError:' - error(f'Provided invalid argument. No function {func_str}')''' \ No newline at end of file + log.error(f'Provided invalid argument. No function {func_str}')''' \ No newline at end of file diff --git a/ratings/processor.py b/ratings/processor.py index 0aaab6c..0893039 100644 --- a/ratings/processor.py +++ b/ratings/processor.py @@ -3,7 +3,7 @@ from ratings import elo from database import db_tools as db from tinydb import TinyDB, where -from utils.log import * +from utils import log class RatingProcessor: @@ -36,9 +36,9 @@ def read_contest_ranks(self, rank_file) -> None: self.usn_rank_dict[usn] = current_rank same_rank_count += 1 else: - info(f'Ignoring SRN {usn}') + log.info(f'Ignoring SRN {usn}') current_rank += same_rank_count # ranks are not 1, 1, 1, 2 but 1, 1, 1, 4 - debug(self.usn_rank_dict) + log.debug(self.usn_rank_dict) def set_contest_details(self) -> None: """ @@ -55,7 +55,7 @@ def set_contest_details(self) -> None: self.N = len(self.usn_rank_dict) self.Cf = elo.Cf(rating_list, vol_list, self.N) self.Rb_Vb_list = list(zip(rating_list, vol_list)) - debug(f'Contest: {self.rank_file_path}\nPlayers: {self.N}\nCompetition Factor: {self.Cf}') + log.debug(f'Contest: {self.rank_file_path}\nPlayers: {self.N}\nCompetition Factor: {self.Cf}') @staticmethod def _decay_player(player_dict: dict) -> None: @@ -77,7 +77,7 @@ def _decay_player(player_dict: dict) -> None: player_dict[db.RATING] = rating player_dict[db.LAST_FIVE] = max(1, last_five) - debug('Successfully decayed ratings') + log.debug('Successfully decayed ratings') def _update_player(self, player_dict: dict, actual_rank: int) -> None: """ @@ -100,19 +100,19 @@ def _update_player(self, player_dict: dict, actual_rank: int) -> None: player_dict[db.BEST] = max(old_best, new_rating) player_dict[db.LAST_FIVE] = 5 - debug('Successfully updated ratings') + log.debug('Successfully updated ratings') def process_competition(self) -> None: rows = self.database.all() for row in rows: - debug(f'Before: 
{row}')
+            log.debug(f'Before: {row}')
             if row[db.USN] in self.usn_rank_dict:
                 actual_rank = self.usn_rank_dict[row[db.USN]]
                 self._update_player(row, actual_rank)
             else:
                 self._decay_player(row)
-            debug(f'After: {row}')
+            log.debug(f'After: {row}')
 
         self.database.write_back(rows)
 
@@ -144,9 +144,9 @@ def process(rank_file_path):
     database_obj.close()
 
     duration = time()-start_time
-    debug(f'Updated ratings for {rank_file_path}')
+    log.debug(f'Updated ratings for {rank_file_path}')
     if duration > 10:
-        critical(f'Ratings update for {rank_file_path} took {duration} seconds.\n'
+        log.critical(f'Ratings update for {rank_file_path} took {duration} seconds.\n'
                  f'Consider removing unnecessary handles or optimize ratings algorithm')
 
 
@@ -163,7 +163,7 @@ def process(rank_file_path):
     database_obj.close()
 
     duration = time()-start_time
-    debug(f'Updated ratings for {rank_file_path}')
+    log.debug(f'Updated ratings for {rank_file_path}')
     if duration > 10:
-        logging.critical(f'Ratings update for {rank_file_path} took {duration} seconds.\n'
+        log.critical(f'Ratings update for {rank_file_path} took {duration} seconds.\n'
                  f'Consider removing unnecessary handles or optimize ratings algorithm')'''
diff --git a/run.py b/run.py
index e8389ad..e170edf 100644
--- a/run.py
+++ b/run.py
@@ -14,7 +14,7 @@
 from ratings import processor
 from pathlib import Path
 from collections import defaultdict
-from utils.log import *
+from utils import log
 
 PATH_TO_RANK_FILES = 'database/contest_ranks/' # Change this path to 'database/[YOUR_CUSTOM_RANKS_DIR]' to calculate ratings for only a few contests
 contest_names_file_path = 'database/contest_names_file.in' # Change this path to 'database/[YOUR_CUSTOM_CONTEST_NAMES_FILE.in]' and add required (supported) contests to calculate ratings for only those
@@ -85,7 +85,7 @@ def get_all_contests(DAYS):
     contest_names_file = open(contest_names_file_path, 'r+') # Contains list of all contests scraped till now
     existing_contests = list(contest_names_file.read().split('\n'))
     if not calendar_response:
-        error('No upcoming contests found.')
+        log.error('No upcoming contests found.')
         contest_names_file.close()
         return calendar_response
     else:
@@ -93,7 +93,7 @@
         try:
             url = event['location'].replace('https://', '').split('/') # Remove the https and make the parts of the url a list
         except:
-            error('The contest {} does not have an associated website and is hence ignored.'.format(event['summary']))
+            log.error('The contest {} does not have an associated website and is hence ignored.'.format(event['summary']))
             continue
         try:
             url.remove('') # To remove any unexpected blank items caused by a trailing slash
@@ -107,7 +107,7 @@
             contest_names_file.write(contest.file_name+'\n')
             contests[contest.website].append(contest)
         else:
-            warn(f'{contest.file_name} already exists, ignoring; To re-scrape, delete the file and remove this entry.')
+            log.warn(f'{contest.file_name} already exists, ignoring; To re-scrape, delete the file and remove this entry.')
 
     contest_names_file.close()
     return contests
@@ -138,7 +138,7 @@ def scrape(DAYS=30):
         with open (file_path, 'w+') as rank_file:
             for rank in contest.leaderboard:
                 rank_file.write(rank + '\n')
-        info('Wrote to {file_path}')
+        log.info(f'Wrote to {file_path}')
 
 
 def recalculate(clean=False): # Recalculates the ratings from ground-up; This is to ensure integrity and to allow for later joinees
     contest_names_file = open(contest_names_file_path, 'r')
     contest_names = list(contest_names_file.read().split('\n'))
     try:
         contest_names.remove('') # Removes 
trailing newline in case the input file had it except: pass - info('Built list of files to process') + log.info('Built list of files to process') for contest in contest_names: if clean: """ Removes handles that couldn't be mapped to a USN Usually required in a contest where we couldn't obtain handles of only required students, such as HackerEarth """ tools.remove_unmapped_handles_from_rank_file(f'{PATH_TO_RANK_FILES}{contest}') processor.process(f'{PATH_TO_RANK_FILES}{contest}') # Call the processor for each contest - info(f'Processed contest: {contest}') + log.info(f'Processed contest: {contest}') tools.export_to_csv() tools.prettify() contest_names_file.close() diff --git a/scrapers/codechef.py b/scrapers/codechef.py index 8f2046e..1a935aa 100644 --- a/scrapers/codechef.py +++ b/scrapers/codechef.py @@ -3,8 +3,7 @@ from datetime import datetime import sys import os -from utils import selenium_utils -from utils.log import info +from utils import selenium_utils, log driver = selenium_utils.make_driver() load_all = selenium_utils.load_all(driver) @@ -18,7 +17,7 @@ def get_problems(site): driver.get(site) - info(f'Initialised website: {site}') + log.info(f'Initialised website: {site}') problem_list = list(load(r'tbody', 'tag').text.split('\n'))[1::4] problems = set() for question in problem_list: @@ -27,7 +26,7 @@ def get_problems(site): def get_rankings(site, contest_code): driver.get(site) - info(f'Initialised website: {site}') + log.info(f'Initialised website: {site}') total_pages = int(load_all(r'jump', 'class')[-1].text) scraped_scoreboard = [] for page in range(total_pages): @@ -59,7 +58,7 @@ def scrape(contest_codes): scoreboard_filter_query:str = "?filterBy=Institution%3DPES%20University&itemsPerPage=100&order=asc&sortBy=rank" leaderboards = [] for contest_code in contest_codes: - info(f'Codechef contest {contest_code}:') + log.info(f'Codechef contest {contest_code}:') final_scoreboard:list=list() easy_points:int=100 # Points to add to division A participants assuming they can solve all easy div B problems diff --git a/scrapers/hackerearth.py b/scrapers/hackerearth.py index 82de558..74cf596 100644 --- a/scrapers/hackerearth.py +++ b/scrapers/hackerearth.py @@ -4,7 +4,7 @@ from bs4 import BeautifulSoup from tinydb import TinyDB, where from database.db_tools import DB_FILE, HACKEREARTH -from utils.log import * +from utils import log # 0 - event_id # 1 - page number @@ -15,13 +15,13 @@ def get_handles(html_doc): soup = BeautifulSoup(html_doc, 'html.parser') ''' -
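The HackerEarth pagination pattern these patches converge on is worth spelling out once in isolation: the AJAX leaderboard endpoint keeps serving its final page for any page number past the end, so the scraper cannot ask for a page count the way codechef.py reads the last pager button; instead it paginates until a fetch merely repeats the tail of what it has already collected. Below is a minimal, self-contained sketch of that loop. The endpoint URL and the repeat-tail stop condition are taken from the patches above; the body of get_handles is an assumption (the real selector is elided by the diff context), and the empty-page guard is an addition not present in the original code.

    import requests
    from bs4 import BeautifulSoup

    # Endpoint from scrapers/hackerearth.py: {0} is the numeric event id, {1} the page number.
    API = 'https://www.hackerearth.com/AJAX/feed/newsfeed/icpc-leaderboard/event/{0}/{1}/'

    def get_handles(html_doc):
        # Hypothetical parse step: the real selector lives in the elided body of
        # get_handles above; adjust it to whatever markup the AJAX response returns.
        soup = BeautifulSoup(html_doc, 'html.parser')
        return [node.text.strip() for node in soup.find_all('a')]

    def get_leaderboard(event_id):
        leaderboard, page_num = [], 1
        while True:
            handles = get_handles(requests.get(API.format(event_id, page_num)).text)
            # The endpoint serves its last page for any page_num past the end, so stop
            # once a fetch only repeats the tail we already have. The empty-page guard
            # also prevents an infinite loop: leaderboard[-0:] is the whole list, so
            # the repeat test could never fire once handles came back empty.
            if not handles or leaderboard[-len(handles):] == handles:
                break
            leaderboard.extend(handles)
            page_num += 1
        return leaderboard

This sentinel-based stop is the natural design when the server exposes no page count; codechef.py, by contrast, reads total_pages from the pager widget and loops a known number of times.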