diff --git a/.gitignore b/.gitignore index 6115d9f..9bad418 100644 --- a/.gitignore +++ b/.gitignore @@ -43,4 +43,8 @@ pip-delete-this-directory.txt .pytest_cache/ # Mypy (static type checking) -.mypy_cache/ \ No newline at end of file +.mypy_cache/ + +# Credentials for calendar +credentials.json +token.pickle \ No newline at end of file diff --git a/README.md b/README.md index bbd8faf..570e589 100644 --- a/README.md +++ b/README.md @@ -3,28 +3,28 @@ [![Build Status](https://travis-ci.com/varunvora/alcoding.svg?branch=master)](https://travis-ci.com/varunvora/alcoding) -Alcoding Club of [PES University](https://pes.edu/) maintains ratings of its students who are active in [competitive programming](https://en.wikipedia.org/wiki/Competitive_programming). This repository contains the ratings and the code which generates it. +Alcoding Club of [PES University](https://pes.edu/) maintains ratings of its students who are active in [competitive programming](https://en.wikipedia.org/wiki/Competitive_programming). This repository contains the ratings and the code that generates it. ## Purpose -An intra-college rating is maintained so that the club can identify good coders. The club will group these students and help them improve at competitive programming by organizing meet-ups, providing resources, arranging contests and develop a coding community in the University. +An intra-college rating is maintained to aid the club in identifying good coders. The club aims to help these students improve their competitive programming skills by organizing meet-ups, providing resources, arranging contests and developing a coding community in the University. ## Ratings -The ratings are calculated by students' performances in [specified contests](database/README.md). +The ratings are calculated using students' performances in [specified contests](database/README.md). ### Mechanism -A [rank list](database/contest_ranks) of registered students is generated at the end of each contest. 
A rating is computed from the rank list, which indicates their relative performance. The implementation is almost the same as [Codechef's Rating Mechanism](https://www.codechef.com/ratings) which is a modified version of [Elo rating system](https://en.wikipedia.org/wiki/Elo_rating_system). To avoid students from [protecting their ratings](https://en.wikipedia.org/wiki/Elo_rating_system#Game_activity_versus_protecting_one's_rating) and encourage participation, a decay rule is also added which decrements a student's rating by 1% if she does not take part in 5 consecutive contests. +A [rank list](database/contest_ranks) of registered students is generated at the end of each contest. A rating is computed from the rank list, which indicates their relative performance. The implementation is very similar to [Codechef's Rating Mechanism](https://www.codechef.com/ratings) which is a modified version of the [Elo rating system](https://en.wikipedia.org/wiki/Elo_rating_system). To prevent students from [protecting their ratings](https://en.wikipedia.org/wiki/Elo_rating_system#Game_activity_versus_protecting_one's_rating) and encourage participation, a decay rule, which decrements a student's rating by 1% if they do not take part in 5 consecutive rated contests, is also added. ### Verification -The [code that generates the rating](ratings/processor.py) is open. Along with that we have provided [a script with which you can verify](executor.sh) that the displayed ratings are correct. This script resets all students' ratings, and computes the ratings after all the contest ranks are considered. You may [report an issue](https://github.com/varunvora/alcoding/issues) if you find any discrepancy. +The [code that generates the rating](ratings/processor.py) is open. Further, we also provide [a method with which you can verify](run.py) the displayed ratings. This method resets all students' ratings, and recomputes the ratings of every student after considering all contest ranks. 
Please do [report an issue](https://github.com/pes-alcoding-club/student-ratings/issues) if you find any discrepancy. ## Calendar -Alcoding Club maintains a [Google calendar for competitive programming](https://calendar.google.com/calendar?cid=N3RsZGt1dXEwcW1mOW9ub2Jxb3ByZ2Z1cDRAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ). Contests that are marked as "Rated" will be considered for these ratings. +Alcoding Club maintains a [Google Calendar for competitive programming](https://calendar.google.com/calendar?cid=N3RsZGt1dXEwcW1mOW9ub2Jxb3ByZ2Z1cDRAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ). Contests that are marked "Rated" will be considered for these ratings. ## Contribute -This project is still very small so there are no strict guidelines for contribution. For now we are following [PEP 8 -- Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/). +At the moment, there are no strict guidelines for contribution. As a standard, we follow the [PEP 8 -- Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/). -You can [report an issue](https://github.com/varunvora/alcoding/issues) if you find a bug or any other change you would like to make. You may also make a [pull request](https://github.com/varunvora/alcoding/pulls). It would be helpful if you use [our Github labels](https://github.com/varunvora/alcoding/labels) for all issues and pull requests. Be sure to clearly document and describe any issues or changes. +Feel free to [report an issue](https://github.com/pes-alcoding-club/student-ratings/issues) if you find a bug, or have any other change you would like to see. You may also create a [pull request](https://github.com/pes-alcoding-club/student-ratings/pulls). It would be helpful if you use [our Github labels](https://github.com/pes-alcoding-club/student-ratings/labels) for all issues and pull requests. Be sure to clearly document and describe any issues or changes. 
## FAQ @@ -37,11 +37,11 @@ You can [report an issue](https://github.com/varunvora/alcoding/issues) if you f 1. Which contests are taken into account for rating? Contests in ['Competitive Programming PESU' Calendar](https://calendar.google.com/calendar?cid=N3RsZGt1dXEwcW1mOW9ub2Jxb3ByZ2Z1cDRAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) are considered for ratings. -1. How can I tell if these ratings are legitimate? +1. How can I tell whether these ratings are legitimate? - You can verify the ratings yourself by running [this script](executor.sh). It resets all students' ratings to default values and recomputes it for all contests so far in chronological order. - -1. How can I get the scoreboard only for some particular contest(s)? - - Clone this repository, open [executor.sh](executor.sh) and remove the contests you do not want the scoreboard for. Run this script and check [scoreboard.csv](scoreboard.csv). + You can verify the ratings yourself by calling the `make_scoreboard` function in [run.py](run.py). It resets all students' ratings to default values and recomputes them for all contests so far in chronological order. +1. How can I make a scoreboard for a few particular contests? + Firstly, clone this repository. + Create your own [contest_names_file.in](database/contest_names_file.in) and add the contest names in the format `[platform]-[month]-[contest_code]`. In [run.py](run.py), change the `contest_names_file_path` variable's value to your file's path. + Now call the `make_scoreboard` function in [run.py](run.py) with the required parameters and check [scoreboard.csv](scoreboard.csv). 
\ No newline at end of file diff --git a/database/contestsToProcess.in b/database/contest_names_file.in similarity index 100% rename from database/contestsToProcess.in rename to database/contest_names_file.in diff --git a/database/db_tools.py b/database/db_tools.py index c48796e..ee15c90 100644 --- a/database/db_tools.py +++ b/database/db_tools.py @@ -1,7 +1,6 @@ import re import sys import csv -import logging from os import listdir from os.path import join from collections import Counter @@ -9,6 +8,7 @@ from typing import List, Set, Tuple, Dict, Callable, Any from tinydb import TinyDB, where from ratings import elo +from utils import log DB_FILE: str = 'database/db.json' CONTEST_RANKS_DIR: str = 'database/contest_ranks' @@ -57,7 +57,7 @@ def reset_database(db_file: str = DB_FILE) -> None: BEST: elo.DEFAULT_RATING, TIMES_PLAYED: 0, LAST_FIVE: 5}) - logging.info(f'Successfully reset database and stored in {db_file}') + log.info(f'Successfully reset database and stored in {db_file}') def get_site_name_from_file_name(file_name: str) -> str: @@ -68,8 +68,8 @@ def get_site_name_from_file_name(file_name: str) -> str: """ file_name_parts = file_name.split("-") if len(file_name_parts) < 2 or file_name_parts[0] not in SITES: - logging.error(f"Invalid filename '{file_name}' in contest ranks. File name convention is" - f"'site-contest-details.in'") + log.error(f"Invalid filename '{file_name}' in contest ranks. 
File name convention is " + f"'site-month-contestCode.in'") quit() return file_name_parts[0] @@ -144,7 +144,7 @@ def log_unmapped_handles(site_username_tuple_list: List[Tuple[str, str]]) -> Non log_unmapped_handles(site_handle_tuple_list) - logging.info('Mapped ') + log.info('Mapped usernames to SRNs') def remove_unmapped_handles_from_rank_file(file_name: str) -> None: @@ -152,10 +152,10 @@ def remove_unmapped_handles_from_rank_file(file_name: str) -> None: Removes unmapped handles from outdated rank files to reduce space and time it takes for the script to run """ - with open(join(CONTEST_RANKS_DIR, file_name), 'r') as rank_file: + with open(file_name, 'r') as rank_file: input_data: str = rank_file.read() - - with open(join(CONTEST_RANKS_DIR, file_name), 'w') as rank_file: + count = 0 + with open(file_name, 'w') as rank_file: for user_name_line in input_data.split("\n"): check_occurrence_in_line: bool = False for user_name in user_name_line.split(): @@ -164,7 +164,9 @@ def remove_unmapped_handles_from_rank_file(file_name: str) -> None: rank_file.write(user_name + " ") if check_occurrence_in_line: rank_file.write("\n") - logging.info(f'Cleaned {file_name}') + count+=1 + loginfo = file_name.split('/')[-1] # last path component, regardless of directory depth + log.info(f'Cleaned {loginfo}') def export_to_csv(db_file: str = DB_FILE, scoreboard_file: str = SCOREBOARD_FILE) -> None: @@ -190,7 +192,7 @@ def export_to_csv(db_file: str = DB_FILE, scoreboard_file: str = SCOREBOARD_FILE wr = csv.writer(fp) wr.writerows(csv_table) - logging.info(f'Successfully exported database from {db_file} to {scoreboard_file}') + log.info(f'Successfully exported database from {db_file} to {scoreboard_file}') def prettify(db_file: str = DB_FILE) -> None: @@ -201,11 +203,11 @@ def prettify(db_file: str = DB_FILE) -> None: fp.write_back(fp.all()) -if __name__ == "__main__": +'''if __name__ == "__main__": # While executing this script, you can specify which function to execute func_str: str = sys.argv[1] try: func_obj: Callable = 
globals()[func_str] func_obj(*sys.argv[2:]) # Arguments to specified function can be passed - except KeyError: - logging.error(f'Provided invalid argument. No function {func_str}') + except KeyError:' + log.error(f'Provided invalid argument. No function {func_str}')''' \ No newline at end of file diff --git a/ratings/processor.py b/ratings/processor.py index 17c302b..0893039 100644 --- a/ratings/processor.py +++ b/ratings/processor.py @@ -1,22 +1,23 @@ import sys -import logging from time import time from ratings import elo from database import db_tools as db from tinydb import TinyDB, where - +from utils import log class RatingProcessor: - def __init__(self, database: TinyDB, rank_file): + def __init__(self, database: TinyDB, rank_file_path): self.database: TinyDB = database self.N: int = 0 self.Cf: float = 0.0 self.Rb_Vb_list: list = [] self.usn_rank_dict: dict = {} + self.rank_file_path = rank_file_path + self.rank_file = open(rank_file_path) - self.read_contest_ranks(rank_file) # sets usn_rank_dict + self.read_contest_ranks(self.rank_file) # sets usn_rank_dict self.set_contest_details() # sets N, Cf and Rb_Vb_list self.process_competition() # uses the set attributes to compute new ratings @@ -35,9 +36,9 @@ def read_contest_ranks(self, rank_file) -> None: self.usn_rank_dict[usn] = current_rank same_rank_count += 1 else: - logging.info(f'Ignoring usn {usn}') + log.info(f'Ignoring SRN {usn}') current_rank += same_rank_count # ranks are not 1, 1, 1, 2 but 1, 1, 1, 4 - logging.debug(self.usn_rank_dict) + log.debug(self.usn_rank_dict) def set_contest_details(self) -> None: """ @@ -54,12 +55,12 @@ def set_contest_details(self) -> None: self.N = len(self.usn_rank_dict) self.Cf = elo.Cf(rating_list, vol_list, self.N) self.Rb_Vb_list = list(zip(rating_list, vol_list)) - logging.debug(f'Contest: {rank_file_path}\nPlayers: {self.N}\nCompetition Factor: {self.Cf}') + log.debug(f'Contest: {self.rank_file_path}\nPlayers: {self.N}\nCompetition Factor: {self.Cf}') @staticmethod 
def _decay_player(player_dict: dict) -> None: """ - Reduces ratings by 10% for those who have competed at least once + Reduces ratings by 1% for those who have competed at least once but have not taken part in the past 5 contests :param player_dict: dict with all details of a player """ @@ -76,7 +77,7 @@ def _decay_player(player_dict: dict) -> None: player_dict[db.RATING] = rating player_dict[db.LAST_FIVE] = max(1, last_five) - logging.debug('Successfully decayed ratings') + log.debug('Successfully decayed ratings') def _update_player(self, player_dict: dict, actual_rank: int) -> None: """ @@ -99,27 +100,27 @@ def _update_player(self, player_dict: dict, actual_rank: int) -> None: player_dict[db.BEST] = max(old_best, new_rating) player_dict[db.LAST_FIVE] = 5 - logging.debug('Successfully updated ratings') + log.debug('Successfully updated ratings') def process_competition(self) -> None: rows = self.database.all() for row in rows: - logging.debug(f'Before: {row}') + log.debug(f'Before: {row}') if row[db.USN] in self.usn_rank_dict: actual_rank = self.usn_rank_dict[row[db.USN]] self._update_player(row, actual_rank) else: self._decay_player(row) - logging.debug(f'After: {row}') + log.debug(f'After: {row}') self.database.write_back(rows) -def read_argv(argv_format_alert: str): - """ +"""def read_argv(argv_format_alert: str): + ''' :param argv_format_alert: An error message on what the command line arguments should be :return: rank file if argv is valid - """ + ''' try: assert len(sys.argv) == 2 rank_file = sys.argv[1] @@ -128,15 +129,28 @@ def read_argv(argv_format_alert: str): return rank_file except IOError or FileNotFoundError: - logging.error(f'Invalid file path for rank file: {rank_file}\n{argv_format_alert}') + error(f'Invalid file path for rank file: {rank_file}\n{argv_format_alert}') quit() except AssertionError: - logging.error(f'Invalid command line arguments.\n{argv_format_alert}') - quit() + error(f'Invalid command line arguments.\n{argv_format_alert}') + 
quit()""" +def process(rank_file_path): + start_time = time() + # Main logic starts here + database_obj = TinyDB(db.DB_FILE) + RatingProcessor(database_obj, rank_file_path) + database_obj.close() -if __name__ == "__main__": + duration = time()-start_time + log.debug(f'Updated ratings for {rank_file_path}') + if duration > 10: + log.critical(f'Ratings update for {rank_file_path} took {duration} seconds.\n' + f'Consider removing unnecessary handles or optimize ratings algorithm') + + +'''if __name__ == "__main__": start_time = time() argv_format = 'processor.py rank_file_path' @@ -149,7 +163,7 @@ def read_argv(argv_format_alert: str): database_obj.close() duration = time()-start_time - logging.debug(f'Updated ratings for {rank_file_path}') + log.debug(f'Updated ratings for {rank_file_path}') if duration > 10: - logging.critical(f'Ratings update for {rank_file_path} took {duration} seconds.\n' - f'Consider removing unnecessary handles or optimize ratings algorithm') + logging.log.critical(f'Ratings update for {rank_file_path} took {duration} seconds.\n' + f'Consider removing unnecessary handles or optimize ratings algorithm')''' diff --git a/requirements.txt b/requirements.txt index c731e28..3d32c79 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ -tinydb -requests -bs4 -selenium \ No newline at end of file +requests==2.22.0 +beautifulsoup4==4.9.1 +google_api_python_client==1.10.0 +google_auth_oauthlib==0.4.1 +selenium==3.141.0 +tinydb==3.15.2 diff --git a/run.py b/run.py new file mode 100644 index 0000000..e170edf --- /dev/null +++ b/run.py @@ -0,0 +1,181 @@ +import datetime +import pickle +import os.path +from googleapiclient.discovery import build +from google_auth_oauthlib.flow import InstalledAppFlow +from google.auth.transport.requests import Request +import sys +import os +sys.path.append( # Add absolute path of utils to sys.path + os.path.join( os.path.dirname( os.path.realpath( __file__ )), + '../student-ratings' )) +from scrapers import 
codechef, hackerearth +from database import db_tools as tools +from ratings import processor +from pathlib import Path +from collections import defaultdict +from utils import log + +PATH_TO_RANK_FILES = 'database/contest_ranks/' # Change this path to 'database/[YOUR_CUSTOM_RANKS_DIR]' to calculate ratings for only a few contests +contest_names_file_path = 'database/contest_names_file.in' # Change this path to 'database/[YOUR_CUSTOM_CONTEST_NAMES_FILE.in]' and add required (supported) contests to calculate ratings for only those +SCOPES = ['https://www.googleapis.com/auth/calendar.readonly'] +months = ['jan', 'feb', 'march', 'april', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] + +# Objects of this class are made for each contest; This allows the association of name, website, etc to that particular contest +class contest_details(): + def __init__(self, url): + self.website = str(url[0].split('.')[1]) # Fetch the platform name + self.contest_code = str(url[-1]) + self.file_name = self.make_file_name() + + def make_file_name(self): + if self.website == 'codechef': + if self.contest_code[0:5] == 'LTIME': + # Specific formula to determine month and year for Lunchtime based on numeric ID + month, year = months[(int(self.contest_code[5:]) + 4) % 12], int((int(self.contest_code[5:]) + 5) / 12) + 13 + return f'codechef-{month}-lunchtime-{year}.in' + # Specific formula to determine month and year for Cookoff based on numeric ID + elif self.contest_code[0:4]=="COOK": + month, year = months[(int(self.contest_code[4:]) + 6) % 12], int((int(self.contest_code[4:]) + 7) / 12) + 10 + return f'codechef-{month}-cookoff-{year}.in' + else: + month = self.contest_code[:-2].lower() + return f'codechef-{month}-long-{self.contest_code[-2:]}.in' + elif self.website == 'hackerearth': + return f'hackerearth-{self.contest_code}.in' + + def set_leaderboard(self, leaderboard): + self.leaderboard = leaderboard + + +def get_calendar_events(DAYS): + #This block of code is to allow 
OAuth + creds = None + if os.path.exists('token.pickle'): + with open('token.pickle', 'rb') as token: + creds = pickle.load(token) + if not creds or not creds.valid: + if creds and creds.expired and creds.refresh_token: + creds.refresh(Request()) + else: + flow = InstalledAppFlow.from_client_secrets_file( + 'credentials.json', SCOPES) + creds = flow.run_local_server(port=0) + with open('token.pickle', 'wb') as token: + pickle.dump(creds, token) + + service = build('calendar', 'v3', credentials=creds) + + """ Call the Calendar API + Z indicates UTC time, as Google requires + the input timezones to be consistent """ + now = datetime.datetime.utcnow().isoformat() + 'Z' # 'datetime' is the module here, so the class must be qualified + tmin = (datetime.datetime.utcnow() - datetime.timedelta(days=DAYS)).isoformat() + 'Z' + response = service.events().list(calendarId='7tldkuuq0qmf9onobqoprgfup4@group.calendar.google.com', timeMin=tmin, + timeMax=now, singleEvents=True, + orderBy='startTime').execute() + + calendar_response = response.get('items', []) + return calendar_response + + +def get_all_contests(DAYS): + calendar_response = get_calendar_events(DAYS) # Gets all contest event in the last [DAYS] days + contests = defaultdict(list) + contest_names_file = open(contest_names_file_path, 'r+') # Contains list of all contests scraped till now + existing_contests = list(contest_names_file.read().split('\n')) + if not calendar_response: + log.error(f'No contests found in the last {DAYS} days.') + contest_names_file.close() + return calendar_response + else: + for event in calendar_response: + try: + url = event['location'].replace('https://', '').split('/') # Remove the https and make the parts of the url a list + except (KeyError, AttributeError): + log.error('The contest {} does not have an associated website and is hence ignored.'.format(event['summary'])) + continue + try: + url.remove('') # To remove any unexpected blank items caused by a trailing slash + except ValueError: + pass + + contest = contest_details(url) # Create a contest_details object for the contest + if contest.website not in ['codechef', 
'hackerearth']: # Only codechef and hackerearth scrapers are compatible as of now + continue + if contest.file_name not in existing_contests: # Checks whether the contest has already been scraped, if not writes it to scraped contests + contest_names_file.write(contest.file_name+'\n') + contests[contest.website].append(contest) + else: + log.warn(f'{contest.file_name} already exists, ignoring; To re-scrape, delete the file and remove this entry.') + + contest_names_file.close() + return contests + + +""" The scrapers take in a list of contest id's at a go to avoid the overhead of repeatedly calling it. This means that the output + leaderboards have to be reverse mapped back to the contest_details objects; Since they are in a list and the order is preserved, + we use the index of the leaderboard and map it to the object of the same index """ +def scrape(DAYS=30): + contests = get_all_contests(DAYS) # Returns a dict mapping platform name to a list of contest_details objects for the calendar's contest events + if contests: # If contests have been found + leaderboards = codechef.scrape(list(contest.contest_code for contest in contests['codechef'])) + assert len(leaderboards) == len(contests['codechef']) # Make sure the number of leaderboards is the same as number of contests + for i in range(len(leaderboards)): + contests['codechef'][i].set_leaderboard(leaderboards[i]) + + leaderboards = hackerearth.scrape(list(contest.contest_code for contest in contests['hackerearth'])) + assert len(leaderboards) == len(contests['hackerearth']) # Make sure the number of leaderboards is the same as number of contests + for i in range(len(leaderboards)): + contests['hackerearth'][i].set_leaderboard(leaderboards[i]) + + else: + return + + for platform in contests: + for contest in contests[platform]: + file_path = PATH_TO_RANK_FILES + contest.file_name + with open (file_path, 'w+') as rank_file: + for rank in contest.leaderboard: + rank_file.write(rank + '\n') + log.info(f'Wrote to {file_path}') + + +def 
recalculate(clean=False): # Recalculates the ratings from ground-up; This is to ensure integrity and to allow for later joinees + contest_names_file = open(contest_names_file_path, 'r') + contest_names = list(contest_names_file.read().split('\n')) + try: + contest_names.remove('') # Removes trailing newline in case the input file had it + except: + pass + log.info('Built list of files to process') + for contest in contest_names: + if clean: + """ Removes handles that couldn't be mapped to a USN + Usually required in a contest where we couldn't obtain handles of only required students, such as HackerEarth """ + tools.remove_unmapped_handles_from_rank_file(f'{PATH_TO_RANK_FILES}{contest}') + processor.process(f'{PATH_TO_RANK_FILES}{contest}') # Call the processor for each contest + log.info(f'Processed contest: {contest}') + tools.export_to_csv() + tools.prettify() + contest_names_file.close() + + +def make_scoreboard(map_USN=True, clean=False): + tools.reset_database() + if map_USN: + tools.map_username_to_usn() + recalculate(clean) + +''' [DAYS]: No of days to fetch calendar events from + [map_USN]: Whether to map usernames to USNs + [clean]: Whether to remove unmapped handles ''' +def execute(DAYS=30, map_USN=True, clean=False): # + scrape(DAYS=DAYS) + make_scoreboard(map_USN=map_USN, clean=clean) + + +""" Uncomment one of the two lines depending on requirement, or call your desired function yourself """ + +# execute(clean=True) +# make_scoreboard(map_USN=True, clean=True) \ No newline at end of file diff --git a/scrapers/README.md b/scrapers/README.md index 04f5b02..0173e7e 100644 --- a/scrapers/README.md +++ b/scrapers/README.md @@ -4,12 +4,4 @@ This module contains scripts responsible for scraping contest pages. #### Description -1. map_handle_to_usn.py: After a contest is scraped this one maps the handles to its respective USN so that even if the handle changes in the future, it does not affect the rank. -1. 
codejam.py: Scraper for scoreboard for Google Codejam for its new interface that was launched in 2018. -2. codechef.py: Scraper for Codechef contests. This one uses regular expression to fetch the score board from the raw web page. -3. hackearth.py: Scraper for Hackerearth contests. This scrapes the entire global leaderboard without any filter. - -#### Usage - -1. Modify the scoreboard url in google.py, Run it, make necessary changes in the scoreboard(Change default view to 20, so on...), Press enter to continue. -2. Copy the raw scoreboard from codechef to a file say `codechef-contest-year.in` and run `python3 codechef.py codechef-contest-year.in` +**UNDER PROGRESS** \ No newline at end of file diff --git a/scrapers/codechef.py b/scrapers/codechef.py index 98fdc2b..1a935aa 100644 --- a/scrapers/codechef.py +++ b/scrapers/codechef.py @@ -3,11 +3,7 @@ from datetime import datetime import sys import os -sys.path.append( # Add absolute path of utils to sys.path - os.path.join( os.path.dirname( os.path.realpath( __file__ )), - '../../student-ratings' )) -from utils import selenium_utils -from utils.log import info +from utils import selenium_utils, log driver = selenium_utils.make_driver() load_all = selenium_utils.load_all(driver) @@ -17,12 +13,11 @@ division = namedtuple('division',['problems','scraped_scoreboard']) divisions: dict = {'A':division(set(), list()),'B':division(set(), list())} -month = ['jan', 'feb', 'march', 'april', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] def get_problems(site): driver.get(site) - info(f'Initialised website: {site}') + log.info(f'Initialised website: {site}') problem_list = list(load(r'tbody', 'tag').text.split('\n'))[1::4] problems = set() for question in problem_list: @@ -31,7 +26,7 @@ def get_problems(site): def get_rankings(site, contest_code): driver.get(site) - info(f'Initialised website: {site}') + log.info(f'Initialised website: {site}') total_pages = int(load_all(r'jump', 'class')[-1].text) 
scraped_scoreboard = [] for page in range(total_pages): @@ -63,7 +58,7 @@ def scrape(contest_codes): scoreboard_filter_query:str = "?filterBy=Institution%3DPES%20University&itemsPerPage=100&order=asc&sortBy=rank" leaderboards = [] for contest_code in contest_codes: - info(f'Codechef contest {contest_code}:') + log.info(f'Codechef contest {contest_code}:') final_scoreboard:list=list() easy_points:int=100 # Points to add to division A participants assuming they can solve all easy div B problems @@ -84,7 +79,6 @@ def scrape(contest_codes): if final_scoreboard: # If scoreboard's not empty rank_list = [] - #contest_ranks_file = f'database/contest_ranks/{contest_name}' if contest_code[0:5]=="LTIME" or contest_code[0:4]=="COOK": rank_list = [x[0] for x in final_scoreboard] else: # Shared ranking possible for long contests. @@ -102,5 +96,4 @@ def scrape(contest_codes): shared_rank.append(user[0]) leaderboards.append(rank_list) return leaderboards - #info(f'Leaderboard written to {contest_ranks_file}') driver.close() \ No newline at end of file diff --git a/scrapers/hackerearth.py b/scrapers/hackerearth.py index 1e47651..a94a86d 100644 --- a/scrapers/hackerearth.py +++ b/scrapers/hackerearth.py @@ -1,13 +1,10 @@ import requests import sys import os -sys.path.append( # Add absolute path of utils to sys.path - os.path.join( os.path.dirname( os.path.realpath( __file__ )), - '../../student-ratings' )) from bs4 import BeautifulSoup from tinydb import TinyDB, where from database.db_tools import DB_FILE, HACKEREARTH -from utils.log import * +from utils import log # 0 - event_id # 1 - page number @@ -36,11 +33,11 @@ def get_leaderboard(event_id): handles = get_handles(requests.get(API.format(event_id, page_num)).text) # url returns last page for page_num greater than last page number - if leaderboard[-len(handles):] == handles or page_num > 10: + if leaderboard[-len(handles):] == handles: break leaderboard.extend(handles) - info(f'Handles retrieved for page {page_num}') + 
log.debug(f'Handles retrieved for page {page_num}') page_num += 1 return leaderboard @@ -49,7 +46,7 @@ def get_contest_IDs(contest_codes): contest_IDs = dict() for contest_code in contest_codes: # Gets the ICPC contest ID from the url; We use this ID to send the request - contest_IDs[contest_code] = BeautifulSoup(requests.get(leaderboard_base_url.format(contest_code)).text, 'html.parser').find('div', class_='event-id').text # + contest_IDs[contest_code] = BeautifulSoup(requests.get(leaderboard_base_url.format(contest_code)).text, 'html.parser').find('div', class_='event-id').text return contest_IDs @@ -57,19 +54,11 @@ def scrape(contest_codes): contest_IDs = get_contest_IDs(contest_codes) leaderboards = [] for contest in contest_IDs: - info(f'HackerEarth contest {contest}:') + log.info(f'HackerEarth contest {contest}:') leaderboard = get_leaderboard(contest_IDs[contest]) - #contest_ranks_file = f'database/contest_ranks/hackerearth-{contest}.in' leaderboards.append(leaderboard) return leaderboards -'''if __name__ == "__main__": - logging.basicConfig(level='INFO') - event_id = '814357' - leaderboard = get_leaderboard(event_id) - print(*leaderboard, sep='\n')''' - - """Uncomment to output mapped PES handles only with TinyDB(DB_FILE) as database: pes_hackerearth_users = {x[HACKEREARTH] for x in database.search(where(HACKEREARTH))} diff --git a/utils/log.py b/utils/log.py index eb5b639..5ce5da4 100644 --- a/utils/log.py +++ b/utils/log.py @@ -1,9 +1,8 @@ -from selenium import webdriver from datetime import datetime, timedelta import logging -from time import strftime, sleep +from time import strftime -logging.basicConfig(format='%(message)s', level='INFO', datefmt=strftime("%d/%m/%Y, %H:%M:%S")) +logging.basicConfig(format='%(message)s', level='DEBUG', datefmt=strftime("%d/%m/%Y, %H:%M:%S")) def info(message): logging.info(datetime.now().strftime("%d/%m/%Y, %H:%M:%S") + ': ' + message) @@ -12,6 +11,6 @@ def warn(message): def critical(message): 
logging.critical('CRITICAL: ' + datetime.now().strftime("%d/%m/%Y, %H:%M:%S") + ': ' + message) def debug(message): - logging.debug('DEBUGGING: ' + datetime.now().strftime("%d/%m/%Y, %H:%M:%S") + ': ' + str(message)) + logging.debug(datetime.now().strftime("%d/%m/%Y, %H:%M:%S") + ': ' + str(message)) def error(message): logging.error('ERROR: ' + datetime.now().strftime("%d/%m/%Y, %H:%M:%S") + ': ' + message)