From 40c966e7e7ff8d4198240cb261b1eac54e301755 Mon Sep 17 00:00:00 2001 From: Dolsy Smith Date: Thu, 24 Aug 2023 13:52:37 -0400 Subject: [PATCH] Added postprocessing tests --- course-utils/gw_bookstore_scraping.ipynb | 835 ------------------ course_utils/__init__.py | 0 {course-utils => course_utils}/autograder.py | 0 .../bookstore_scraper.py | 0 .../parsons_builder.py | 0 .../postprocessing.py | 60 +- publish.sh | 4 +- tests/__init__.py | 0 tests/test_postprocessing.ipynb | 122 +++ tests/test_postprocessing.py | 26 + .../homework/HW_1_from_code_to_data.ipynb | 82 +- 11 files changed, 260 insertions(+), 869 deletions(-) delete mode 100644 course-utils/gw_bookstore_scraping.ipynb create mode 100644 course_utils/__init__.py rename {course-utils => course_utils}/autograder.py (100%) rename {course-utils => course_utils}/bookstore_scraper.py (100%) rename {course-utils => course_utils}/parsons_builder.py (100%) rename {course-utils => course_utils}/postprocessing.py (77%) create mode 100644 tests/__init__.py create mode 100644 tests/test_postprocessing.ipynb create mode 100644 tests/test_postprocessing.py diff --git a/course-utils/gw_bookstore_scraping.ipynb b/course-utils/gw_bookstore_scraping.ipynb deleted file mode 100644 index c09c573..0000000 --- a/course-utils/gw_bookstore_scraping.ipynb +++ /dev/null @@ -1,835 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "from bs4 import BeautifulSoup\n", - "import re\n", - "from collections import defaultdict\n", - "from copy import deepcopy\n", - "import json\n", - "from datetime import datetime\n", - "#from scraping_config import config" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Scraping course info from GW Schedule of Classes" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "base_url = config['schedule_page_url']\n", - "params = {'campid': config['campus_id'],\n", - " 'termid': config['term_id']} # Summer 2023\n", - "# Get list of departments\n", - "dept_page = requests.get(base_url, params=params)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "soup = BeautifulSoup(dept_page.text)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "dept_links = soup.find_all(\"a\", href=re.compile(r'.+&subjId=.+'))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Extract dept. codes from list of departments\n", - "dept_codes = [d['href'].split('&subjId=')[-1] for d in dept_links]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Get each page (first page of results for each dept)\n", - "course_url = config['course_page_url']\n", - "course_pages = defaultdict(list)\n", - "for code in dept_codes:\n", - " params['subjid'] = code\n", - " page = requests.post(course_url, params=params)\n", - " course_pages[code].append(page.text)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Extract course & section numbers from first page\n", - "courses = []\n", - "for course_code, pages in course_pages.items():\n", - " soup_1 = BeautifulSoup(pages[0])\n", - " courses.extend([course for course in extract_course_info(soup_1)])\n", - " more_pages = list(get_more_results(soup_1, params, course_code))\n", - " if more_pages:\n", - " courses.extend([course for page in more_pages\n", - " for course in extract_course_info(BeautifulSoup(page))])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def extract_course_info(soup):\n", - " listings = soup.find_all('tr', class_=\"crseRow1\")\n", - " for listing in listings:\n", - " course = {'code': course_code}\n", - " info = listing.find_all('td')\n", - " # Course number should reside under the 3rd table element, in the tag\n", - " course['number'] = info[2].a.text.strip()\n", - " # Course section is in the fourth element\n", - " course['section'] = info[3].text.strip()\n", - " # Title and instructor are in the fifth and seventh elements\n", - " course['title'] = info[4].text.strip()\n", - " course['instructor'] = info[6].text.strip()\n", - " yield course" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Additional results for a given department may be on subsequent pages\n", - "# Identify any pages that have links to more results\n", - "def get_more_results(soup, params, course_code):\n", - " pages = {t.text for t in soup.find_all('a', href=re.compile('javascript:goToPage')) if t.text != '1'}\n", - " if pages:\n", - " for page in pages:\n", - " params['subjid'] = course_code\n", - " r = requests.post(course_url, \n", - " params=params, \n", - " headers={'Content-Type': 'application/x-www-form-urlencoded'}, \n", - " data=f\"pageNum={page}\")\n", - " if r. status_code == 200:\n", - " yield r.text" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Retrieving records from the GW Bookstore for each course" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [], - "source": [ - "bkst_base_url = config['bookstore_url']\n", - "bkst_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0',\n", - " 'Accept': 'application/json, text/plain, */*',\n", - " 'Content-Type': 'application/json'}" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [], - "source": [ - "def create_bkst_payload(course_data, term_id):\n", - " return {'bookstoreId': config['bookstore_id'],\n", - " 'courses':[{'courseDisplayName': course['number'],\n", - " 'departmentDisplayName': course['code'],\n", - " 'sectionDisplayName': course['section']}\n", - " for course in course_data],\n", - " 'termId': term_id}" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "bkst_data = []" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "courses_list = deepcopy(courses)" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [], - "source": [ - "with open('../data/gw-courses-mc-202302.json') as f:\n", - " courses = json.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [], - "source": [ - "from time import sleep" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Captcha detected\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Captcha detected\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;31m#break\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m360\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mbkst_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresp_j\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "while courses:\n", - " course = courses[0]\n", - " payload = create_bkst_payload([course], params['termid'])\n", - " r = requests.post(bkst_base_url, headers=bkst_headers, json=payload)\n", - " resp_j = r.json()\n", - " if 'blockScript' in resp_j:\n", - " print(\"Captcha detected\")\n", - " #break\n", - " sleep(360)\n", - " else:\n", - " bkst_data.append(resp_j)\n", - " courses.pop(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1152" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(courses)" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "98" - ] - }, - "execution_count": 123, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(bkst_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "metadata": {}, - "outputs": [], - "source": [ - "#with open(f'../data/gw-courses-mc-{params[\"termid\"]}.json', 'w') as f:\n", - "# json.dump(courses_list, f)\n", - "with open(f'../data/gw-books-mc-{params[\"termid\"]}-{datetime.now().isoformat()}.json', 'w') as f:\n", - " json.dump(bkst_data, f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Parsing GW Bookstore data" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "with open(f'../data/gw-books-mc-202302-2023-07-25T14:05:09.978808.json') as f:\n", - " bkst_data = json.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{1}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Length(s) of course section for each result\n", - "{len(c['courseSectionDTO']) for c in bkst_data}" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Courses with books have this key\n", - "with_books = [c for c in bkst_data if c['courseSectionDTO'][0].get('courseMaterialResultsList')]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['courseSectionStatus', 'termId', 'termName', 'termNumber', 'termStatus', 'termOpen', 'rentalReturnDate', 'programId', 'programName', 'campusId', 'campusName', 'institutionName', 'ddcsBreadCrumb', 'adoptionStatus', 'courseMaterialResultsList', 'division', 'divisionName', 'divisionDescriptorCode', 'department', 'departmentName', 'departmentDescriptorCode', 'course', 'courseName', 'courseDescriptorCode', 'section', 'sectionName', 'sectionDescriptorCode', 'courseId', 'displayFlag', 'checkAvailabilityLink', 'salesTrack', 'courseMinimum', 'includEDProgramName', 'includEDAdoptionNote', 'includEdDTO', 'instructor', 'sectionAdoptionDTO', 'rentalReturnDateDisplay'])" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "with_books[0]['courseSectionDTO'][0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'RECOMMENDED', 'REQUIRED'}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# This key contains an indication of which books are required vs. recommended\n", - "{k for book in with_books\n", - " for k in book['courseSectionDTO'][0]['sectionAdoptionDTO']['materialAdoptions'].keys()}" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'digitalItemDTOs', 'printItemDTOs'}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Keys for item types (e vs print)\n", - "{k for book in with_books\n", - " for r in book['courseSectionDTO'][0]['courseMaterialResultsList']\n", - " for k in r.keys() if k.endswith('ItemDTOs')}" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'BUY_NEW', 'BUY_USED', 'RENTAL_NEW', 'RENTAL_USED'}" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Keys for item subtypes (new. used, etc.)\n", - "{k for book in with_books\n", - " for r in book['courseSectionDTO'][0]['courseMaterialResultsList']\n", - " for k in r.get('printItemDTOs', {})\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# Keys for extraction from bookstore JSON\n", - "top_keys = ['storeId', # int\n", - " 'storeNumber', # str\n", - " 'storeDisplayName', # str\n", - " 'currency', # str\n", - " 'requirementTypeLabelMap', # dict\n", - " 'courseSectionDTO'] # list \n", - "course_section_keys = ['courseSectionStatus', # dict\n", - " 'termId', # str\n", - " 'termName', # str\n", - " 'termNumber', # str\n", - " 'termStatus', # str\n", - " 'termOpen', # bool\n", - " 'programId', # str\n", - " 'programName', # str\n", - " 'campusId', # str\n", - " 'campusName', # str\n", - " 'institutionName', # str\n", - " 'department', # str\n", - " 'course', # str\n", - " 'section', # str\n", - " 'courseId', # str\n", - " 'instructor', # str\n", - " 'courseMaterialResultsList'] # list of dicts\n", - "course_materials_keys = ['title', # str\n", - " 'edition', # str\n", - " 'author', # str\n", - " 'isbn', # str\n", - " 'materialType', # str\n", - " 'requirementType', # str\n", - " 'isPackage', # bool\n", - " 'publisherCode', # str\n", - " 'copyRightYear', # str\n", - " 'publisher', # str\n", - " 'priceRangeDisplay' # str\n", - " ,'digitalItemDTOs', # list\n", - " 'printItemDTOs'] # dict\n", - "# dict\n", - "print_item_keys = ['BUY_NEW', 'BUY_USED', 'RENTAL_NEW', 'RENTAL_USED']\n", - "# dict\n", - "print_item_subkeys = ['typeCondition', 'priceDisplay', 'inventoryStatusDB',\n", - " 'binding', 'priceNumeric', 'nonRentalChargesTotal', \n", - " 'nonRentalBreakageCharge', 'nonRentalRestockingFee'] \n", - "# list of dict\n", - "digital_item_keys = ['subscription', 'typeCondition', 'priceDisplay', 'priceNumeric'] # subscription is optional key" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def clean_course_material(material):\n", - " '''Reduces a dict of info about a specific course material'''\n", - " cleaned_material = {k: v for k,v in material.items() if k in course_materials_keys}\n", - " # clean the inner list or dict of items\n", - " for i, item in enumerate(cleaned_material.get('digitalItemDTOs', [])):\n", - " cleaned_item = clean_digital_item(item)\n", - " cleaned_material['digitalItemDTOs'][i] = cleaned_item\n", - " for k, v in cleaned_material.get('printItemDTOs', {}).items():\n", - " cleaned_material['printItemDTOs'][k] = clean_print_item(v)\n", - " return cleaned_material\n", - "def clean_print_item(item):\n", - " '''Reduces a dict of info about a specific print item for sale'''\n", - " return {k: v for k, v in item.items() if k in print_item_subkeys}\n", - "def clean_digital_item(item):\n", - " '''Reduces a dict of info about a specific digital item for sale'''\n", - " return {k: v for k,v in item.items() if k in digital_item_keys}" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "bkst_data_cleaned = []\n", - "for d in bkst_data:\n", - " d1 = {k: v for k,v in d.items() if k in top_keys} # Top level elements present in all records\n", - " # Reduce dictionaries in course section data\n", - " d1['courseSectionDTO'] = [{k: v for k,v in s.items() if k in course_section_keys} # elements present in all course-section blocks \n", - " for s in d1['courseSectionDTO']] \n", - " for i, section in enumerate(d1['courseSectionDTO']):\n", - " for j, material in enumerate(section.get('courseMaterialResultsList', [])):\n", - " d1['courseSectionDTO'][i]['courseMaterialResultsList'][j] = clean_course_material(material)\n", - " bkst_data_cleaned.append(d1)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "with_books_cleaned = [c for c in bkst_data_cleaned if c['courseSectionDTO'][0].get('courseMaterialResultsList')]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('../data/bookstore-data-cleaned.json', 'w') as f:\n", - " json.dump(bkst_data_cleaned, f)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "assert len(with_books) == len(with_books_cleaned)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Cleaning the data further" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('../data/bookstore-data-cleaned.json') as f:\n", - " bkst_data = json.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# This inner is list is always length 1, so we can reduce it to its inner dict\n", - "len([b for b in bkst_data if len(b['courseSectionDTO']) > 1])" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "bkst_data_cleaned2 = []\n", - "for b in bkst_data_cleaned:\n", - " b['courseSection'] = b['courseSectionDTO'][0]\n", - " del b['courseSectionDTO']\n", - " bkst_data_cleaned2.append(b)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "# Shortening key names for useful items and removing some extraneous keys\n", - "for b in bkst_data_cleaned2:\n", - " del b['courseSection']['courseSectionStatus']\n", - " section = b['courseSection']\n", - " if 'courseMaterialResultsList' in section:\n", - " section['courseMaterials'] = section['courseMaterialResultsList']\n", - " del section['courseMaterialResultsList']\n", - " for m in section['courseMaterials']:\n", - " if 'printItemDTOs' in m:\n", - " m['printItems'] = m['printItemDTOs']\n", - " del m['printItemDTOs']\n", - " if 'digitalItemDTOs' in m:\n", - " m['digitalItems'] = m['digitalItemDTOs']\n", - " del m['digitalItemDTOs']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('../data/bookstore-data-cleaned.json', 'w') as f:\n", - " json.dump(bkst_data_cleaned, f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Creating a simplified, uniform dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('../data/bookstore-data-cleaned.json') as f:\n", - " bkst_data = json.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "with_books = [b for b in bkst_data_cleaned2 if b['courseSection'].get('courseMaterials')]" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "course_keys = ['department', 'course', 'section', 'instructor', 'termName']\n", - "book_keys = ['title', 'author', 'edition', 'isbn', 'materialType', 'requirementType',\n", - " 'copyRightYear', 'publisher']\n", - "item_keys = ['typeCondition', 'priceDisplay']\n" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "simplified = []\n", - "for course in bkst_data_cleaned2:\n", - " course_data = {k: course['courseSection'].get(k) for k in course_keys} \n", - " books = []\n", - " for book in course['courseSection'].get('courseMaterials',[]):\n", - " book_data = {k: book.get(k) for k in book_keys}\n", - " for item in book.get('printItems', {}).values():\n", - " book_item = deepcopy(book_data)\n", - " book_item.update({k: item[k] for k in item_keys})\n", - " book_item['itemType'] = 'print'\n", - " books.append(book_item)\n", - " for item in book.get('digitalItems', []):\n", - " book_item = deepcopy(book_data)\n", - " book_item.update({k: item[k] for k in item_keys})\n", - " book_item['itemType'] = 'digital'\n", - " books.append(book_item)\n", - " course_data['texts'] = books\n", - " simplified.append(course_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "# Convert camel case to snake case \n", - "case_convert = re.compile(r'(?0])" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1203" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(simplified)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1182" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len({(c['department'], c['course'], c['section']) for c in simplified})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ENV", - "language": "python", - "name": "env" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/course_utils/__init__.py b/course_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/course-utils/autograder.py b/course_utils/autograder.py similarity index 100% rename from course-utils/autograder.py rename to course_utils/autograder.py diff --git a/course-utils/bookstore_scraper.py b/course_utils/bookstore_scraper.py similarity index 100% rename from course-utils/bookstore_scraper.py rename to course_utils/bookstore_scraper.py diff --git a/course-utils/parsons_builder.py b/course_utils/parsons_builder.py similarity index 100% rename from course-utils/parsons_builder.py rename to course_utils/parsons_builder.py diff --git a/course-utils/postprocessing.py b/course_utils/postprocessing.py similarity index 77% rename from course-utils/postprocessing.py rename to course_utils/postprocessing.py index e534c94..9123d03 100644 --- a/course-utils/postprocessing.py +++ b/course_utils/postprocessing.py @@ -34,6 +34,20 @@ def __init__(self, nb_file): :param nb_file: str or Path to an ipynb file. ''' self.nb_json = self.load_nb(nb_file) + self.data = self.nb_json['cells'] + + def __iter__(self): + # implements iteration protocol + self.index = -1 + return self + + def __next__(self): + # iterates through notebook cells + if self.index < len(self.data) - 1: + self.index += 1 + return self + else: + raise StopIteration def hide_tags(self): '''' @@ -47,9 +61,8 @@ def clear_outputs(self): ''' Clears output on cells -- assuming the student notebooks should be clean of all outputs. ''' - for cell in self.nb_json['cells']: - if cell['cell_type'] == 'code': - cell['outputs'] = [] + if self.data[self.index]['cell_type'] == 'code': + self.data[self.index]['outputs'] = [] return self def make_glossary_links(self): @@ -63,21 +76,30 @@ def term_expand(match_obj): term = match_obj.group(1) return f'[{term}]({GLOSSARY_URL}{term.replace(" ", "-")})' - for cell in self.nb_json['cells']: - if cell['cell_type'] == 'markdown': - for i, line in enumerate(cell['source']): - # Iterate over matches in line - new_line = re.sub(TERM_PATTERN, term_expand, line) - cell['source'][i] = new_line + if self.data[self.index]['cell_type'] == 'markdown': + for i, line in enumerate(self.data[self.index]['source']): + # Iterate over matches in line + new_line = re.sub(TERM_PATTERN, term_expand, line) + self.data[self.index]['source'][i] = new_line return self def ensure_hidden(self): ''' Ensures that cells using the Exercise2 Jupyter Notebook extension are hidden by default. ''' - for cell in self.nb_json['cells']: - if 'solution2' in cell['metadata']: - cell['metadata']['solution2'] = 'hidden' + if 'solution2' in self.data[self.index]['metadata']: + self.data[self.index]['metadata']['solution2'] = 'hidden' + return self + + def apply_hidden(self): + ''' + Toggles the visibility of cells with the hide-cell tag. + ''' + if 'hide-cell' in self.data[self.index]['metadata'].get('tags', []): + self.data[self.index]['metadata']['jupyter'] = {'source_hidden': True} + if self.data[self.index]['cell_type'] == 'code': + # Add comment that will be visible on toggled cell + self.data[self.index]['source'].insert(0, '#Click to see the solution.\n') return self def remove_directives(self): @@ -115,18 +137,15 @@ def remove_tagged_cells(self, tags=TAGS_TO_REMOVE): ''' :param tags: should be a Python set of tags. Any cells with any of these tags will be removed from the output notebook. ''' - output = [] - for cell in self.nb_json['cells']: - cell_tags = cell['metadata'].get('tags', []) - if not (tags & set(cell_tags)): - output.append(deepcopy(cell)) - self.nb_json['cells'] = output + self.data = [cell for cell in self.data + if not (tags & set(cell['metadata'].get('tags', [])))] return self def save_nb(self, nb_file): ''' Saves notebook json at provided path ''' + self.nb_json['cells'] = self.data with open(nb_file, 'w') as f: json.dump(self.nb_json, f) return self @@ -160,7 +179,10 @@ def main(nb_input, nb_output): logger.info(f'Processing notebook {in_}; saving output to {out}.') nb = Notebook(in_) - nb.remove_directives().remove_tagged_cells().make_glossary_links().ensure_hidden().clear_outputs().hide_tags().save_nb(out) + nb.remove_tagged_cells() + for cell in nb: + cell.make_glossary_links().apply_hidden().clear_outputs() + nb.hide_tags().save_nb(out) if __name__ == '__main__': main() \ No newline at end of file diff --git a/publish.sh b/publish.sh index e6cbaf3..ee13634 100755 --- a/publish.sh +++ b/publish.sh @@ -40,14 +40,14 @@ function update_hw_modules() { function main() { echo "Building Parsons Problems" - python ./course-utils/parsons_builder.py + python ./course_utils/parsons_builder.py echo "Building book from scratch" jupyter-book clean textbook/ jupyter-book build textbook/ echo "Cleaning up notebook formatting" - python ./course-utils/postprocessing.py + python ./course_utils/postprocessing.py echo "Publishing to GH pages" ghp-import -n -p -f textbook/_build/html diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_postprocessing.ipynb b/tests/test_postprocessing.ipynb new file mode 100644 index 0000000..edc3546 --- /dev/null +++ b/tests/test_postprocessing.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "39c26caf-db01-40d7-b62b-4973537b7b9c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "clear-outputs" + ] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "99.95" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# clear outputs\n", + "book_price = 99.95\n", + "num_students = 55\n", + "book_price" + ] + }, + { + "cell_type": "markdown", + "id": "ff62f5c5-3ad9-454c-ac42-7bd955e7c8d6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "term-directive" + ] + }, + "source": [ + "{term}`glossary term` should contain a link." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e41339fc-9bff-4560-b682-9169405f8199", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "hide-cell" + ] + }, + "outputs": [], + "source": [ + "print(\"This cell should be hidden\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "53658e40-ffe4-4c73-9368-374d8cf54abb", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "remove-cell" + ] + }, + "outputs": [ + { + "ename": "AssertionError", + "evalue": "This cell should have been removed.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#This cell should be removed.\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mThis cell should have been removed.\u001b[39m\u001b[38;5;124m'\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: This cell should have been removed." + ] + } + ], + "source": [ + "#This cell should be removed.\n", + "assert False, 'This cell should have been removed.'" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python Camp", + "language": "python", + "name": "env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/test_postprocessing.py b/tests/test_postprocessing.py new file mode 100644 index 0000000..1640e39 --- /dev/null +++ b/tests/test_postprocessing.py @@ -0,0 +1,26 @@ +from course_utils.postprocessing import Notebook +import unittest +import json + +class TestPostProcessing(unittest.TestCase): + + def setUp(self): + self.notebook = Notebook('tests/test_postprocessing.ipynb') + self.notebook.remove_tagged_cells() + for cell in self.notebook: + cell.make_glossary_links().apply_hidden().clear_outputs() + self.notebook.hide_tags() + + def testProcessing(self): + + self.assertEqual(self.notebook.data[0]['outputs'], [], + 'cell output not cleared') + self.assertRegex(self.notebook.data[1]['source'][0], r'\[.+\]\(https://gwu-libraries\.github.io/python-camp/glossary\.html#.+\)', + 'Markdown link missing or malformed') + self.assertEqual(self.notebook.data[2]['source'][0], '#Click to see the solution.\n', 'Hidden code cell missing initial comment.') + self.assertEqual(self.notebook.data[2]['metadata']['jupyter'], + {"source_hidden": True}, + 'source_hidden flag missing from hidden cell metadata') + cells_for_removal = [cell for cell in self.notebook.data if 'remove-cell' in cell['metadata'].get('tags', [])] + self.assertEqual(cells_for_removal, [], 'cells tagged for removal not removed') + diff --git a/textbook/notebooks/homework/HW_1_from_code_to_data.ipynb b/textbook/notebooks/homework/HW_1_from_code_to_data.ipynb index fa71dba..0e9cad9 100644 --- a/textbook/notebooks/homework/HW_1_from_code_to_data.ipynb +++ b/textbook/notebooks/homework/HW_1_from_code_to_data.ipynb @@ -292,7 +292,13 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "source": [ "````{admonition} Try it out!\n", ":class: try-it-out\n", @@ -323,8 +329,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, "solution2": "hidden", "tags": [ "hide-cell" @@ -380,7 +390,13 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "outputs": [], "source": [ "course = 'CHEM 1002 10'\n", @@ -389,26 +405,38 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "source": [ "What if we want to extract the department code, the course number, and the section number from the `course` variable? \n", "\n", "By counting characters, we can see the following:\n", "- The department code occupies the first four (4) index positions. With strings, the first position is labeled `0`, not `1`, so the first 4 characters would fall in positions `0`, `1`, `2`, and `3`.\n", + "- The course number occupies four more positions, but we also have to account for the intervening space: `4` (the space), then `5`, `6`, `7`, `8`.\n", + "\n", "\n", "|0|1|2|3|4|5|6|7|8|9|10|11|\n", - "|-|-|-|-|-|-|-|-|-|-|-|-|\n", + "|:-|:-|:-|:-|:-|:-|:-|:-|:-|:-|:-|:-|\n", "|C|H|E|M| |1|0|0|2| |1|0|\n", "\n", - "- The course number occupies four more positions, but we also have to account for the intervening space: `4` (the space), then `5`, `6`, `7`, `8`.\n", - "\n", "We can use this information to **slice** our `course` variable as follows:" ] }, { "cell_type": "code", "execution_count": 24, - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -428,7 +456,13 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "source": [ "````{admonition} Notes\n", ":class: notes\n", @@ -512,7 +546,13 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "source": [ "What if we don't know the exact position of the characters we want to extract? \n", "\n", @@ -656,8 +696,13 @@ { "cell_type": "markdown", "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, "solution2": "hidden", - "solution2_first": true + "solution2_first": true, + "tags": [] }, "source": [ "Now check your answer by expanding the hidden solution cell below." @@ -666,10 +711,17 @@ { "cell_type": "markdown", "metadata": { - "solution2": "hidden" + "editable": true, + "slideshow": { + "slide_type": "" + }, + "solution2": "hidden", + "tags": [] }, "source": [ - "````{toggle}\n", + "````{hint} Solution\n", + ":class: dropdown\n", + "\n", "If you ran the code `term.split()`, you should have gotten output like this:\n", "\n", "`['Summer', '2023']`\n", @@ -784,6 +836,10 @@ "cell_type": "code", "execution_count": 50, "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, "solution2": "hidden", "tags": [ "hide-cell"