From 40c966e7e7ff8d4198240cb261b1eac54e301755 Mon Sep 17 00:00:00 2001
From: Dolsy Smith <dsmith@gwu.edu>
Date: Thu, 24 Aug 2023 13:52:37 -0400
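Subject: [PATCH] Added postprocessing tests

Also renames course-utils/ to course_utils/ (adding __init__.py), removes
the gw_bookstore_scraping notebook, and refactors Notebook postprocessing
so that per-cell steps run while iterating over the notebook's cells.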

---
 course-utils/gw_bookstore_scraping.ipynb      | 835 ------------------
 course_utils/__init__.py                      |   0
 {course-utils => course_utils}/autograder.py  |   0
 .../bookstore_scraper.py                      |   0
 .../parsons_builder.py                        |   0
 .../postprocessing.py                         |  60 +-
 publish.sh                                    |   4 +-
 tests/__init__.py                             |   0
 tests/test_postprocessing.ipynb               | 122 +++
 tests/test_postprocessing.py                  |  26 +
 .../homework/HW_1_from_code_to_data.ipynb     |  82 +-
 11 files changed, 260 insertions(+), 869 deletions(-)
 delete mode 100644 course-utils/gw_bookstore_scraping.ipynb
 create mode 100644 course_utils/__init__.py
 rename {course-utils => course_utils}/autograder.py (100%)
 rename {course-utils => course_utils}/bookstore_scraper.py (100%)
 rename {course-utils => course_utils}/parsons_builder.py (100%)
 rename {course-utils => course_utils}/postprocessing.py (77%)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_postprocessing.ipynb
 create mode 100644 tests/test_postprocessing.py

diff --git a/course-utils/gw_bookstore_scraping.ipynb b/course-utils/gw_bookstore_scraping.ipynb
deleted file mode 100644
index c09c573..0000000
--- a/course-utils/gw_bookstore_scraping.ipynb
+++ /dev/null
@@ -1,835 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import requests\n",
-    "from bs4 import BeautifulSoup\n",
-    "import re\n",
-    "from collections import defaultdict\n",
-    "from copy import deepcopy\n",
-    "import json\n",
-    "from datetime import datetime\n",
-    "#from scraping_config import config"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Scraping course info from GW Schedule of Classes"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "base_url = config['schedule_page_url']\n",
-    "params = {'campid': config['campus_id'],\n",
-    "        'termid': config['term_id']} # Summer 2023\n",
-    "# Get list of departments\n",
-    "dept_page = requests.get(base_url, params=params)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "soup = BeautifulSoup(dept_page.text)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dept_links = soup.find_all(\"a\", href=re.compile(r'.+&subjId=.+'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Extract dept. codes from list of departments\n",
-    "dept_codes = [d['href'].split('&subjId=')[-1] for d in dept_links]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get each page (first page of results for each dept)\n",
-    "course_url = config['course_page_url']\n",
-    "course_pages = defaultdict(list)\n",
-    "for code in dept_codes:\n",
-    "    params['subjid'] = code\n",
-    "    page = requests.post(course_url, params=params)\n",
-    "    course_pages[code].append(page.text)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Extract course & section numbers from first page\n",
-    "courses = []\n",
-    "for course_code, pages in course_pages.items():\n",
-    "    soup_1 = BeautifulSoup(pages[0])\n",
-    "    courses.extend([course for course in extract_course_info(soup_1)])\n",
-    "    more_pages = list(get_more_results(soup_1, params, course_code))\n",
-    "    if more_pages:\n",
-    "        courses.extend([course for page in more_pages\n",
-    "                       for course in extract_course_info(BeautifulSoup(page))])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def extract_course_info(soup):\n",
-    "    listings = soup.find_all('tr', class_=\"crseRow1\")\n",
-    "    for listing in listings:\n",
-    "        course = {'code': course_code}\n",
-    "        info = listing.find_all('td')\n",
-    "        # Course number should reside under the 3rd table element, in the <a> tag\n",
-    "        course['number'] = info[2].a.text.strip()\n",
-    "        # Course section is in the fourth element\n",
-    "        course['section'] = info[3].text.strip()\n",
-    "        # Title and instructor are in the fifth and seventh elements\n",
-    "        course['title'] = info[4].text.strip()\n",
-    "        course['instructor'] = info[6].text.strip()\n",
-    "        yield course"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Additional results for a given department may be on subsequent pages\n",
-    "# Identify any pages that have links to more results\n",
-    "def get_more_results(soup, params, course_code):\n",
-    "    pages = {t.text for t in soup.find_all('a', href=re.compile('javascript:goToPage')) if t.text != '1'}\n",
-    "    if pages:\n",
-    "        for page in pages:\n",
-    "            params['subjid'] = course_code\n",
-    "            r = requests.post(course_url, \n",
-    "                              params=params, \n",
-    "                              headers={'Content-Type': 'application/x-www-form-urlencoded'}, \n",
-    "                              data=f\"pageNum={page}\")\n",
-    "            if r. status_code == 200:\n",
-    "                yield r.text"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Retrieving records from the GW Bookstore for each course"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 62,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "bkst_base_url = config['bookstore_url']\n",
-    "bkst_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0',\n",
-    "             'Accept': 'application/json, text/plain, */*',\n",
-    "             'Content-Type': 'application/json'}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 63,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def create_bkst_payload(course_data, term_id):\n",
-    "    return {'bookstoreId': config['bookstore_id'],\n",
-    "            'courses':[{'courseDisplayName': course['number'],\n",
-    "                       'departmentDisplayName': course['code'],\n",
-    "                       'sectionDisplayName': course['section']}\n",
-    "                      for course in course_data],\n",
-    "            'termId': term_id}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 64,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "bkst_data = []"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "courses_list = deepcopy(courses)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 65,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('../data/gw-courses-mc-202302.json') as f:\n",
-    "    courses = json.load(f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 107,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from time import sleep"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 121,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Captcha detected\n"
-     ]
-    },
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-121-2049f005d09f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      7\u001b[0m         \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Captcha detected\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m         \u001b[0;31m#break\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m         \u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m360\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     10\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     11\u001b[0m         \u001b[0mbkst_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresp_j\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
-     ]
-    }
-   ],
-   "source": [
-    "while courses:\n",
-    "    course = courses[0]\n",
-    "    payload = create_bkst_payload([course], params['termid'])\n",
-    "    r = requests.post(bkst_base_url, headers=bkst_headers, json=payload)\n",
-    "    resp_j = r.json()\n",
-    "    if 'blockScript' in resp_j:\n",
-    "        print(\"Captcha detected\")\n",
-    "        #break\n",
-    "        sleep(360)\n",
-    "    else:\n",
-    "        bkst_data.append(resp_j)\n",
-    "        courses.pop(0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 122,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1152"
-      ]
-     },
-     "execution_count": 122,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(courses)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 123,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "98"
-      ]
-     },
-     "execution_count": 123,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(bkst_data)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 124,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#with open(f'../data/gw-courses-mc-{params[\"termid\"]}.json', 'w') as f:\n",
-    "#    json.dump(courses_list, f)\n",
-    "with open(f'../data/gw-books-mc-{params[\"termid\"]}-{datetime.now().isoformat()}.json', 'w') as f:\n",
-    "    json.dump(bkst_data, f)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Parsing GW Bookstore data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open(f'../data/gw-books-mc-202302-2023-07-25T14:05:09.978808.json') as f:\n",
-    "    bkst_data = json.load(f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{1}"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Length(s) of course section for each result\n",
-    "{len(c['courseSectionDTO']) for c in bkst_data}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Courses with books have this key\n",
-    "with_books = [c for c in bkst_data if c['courseSectionDTO'][0].get('courseMaterialResultsList')]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "dict_keys(['courseSectionStatus', 'termId', 'termName', 'termNumber', 'termStatus', 'termOpen', 'rentalReturnDate', 'programId', 'programName', 'campusId', 'campusName', 'institutionName', 'ddcsBreadCrumb', 'adoptionStatus', 'courseMaterialResultsList', 'division', 'divisionName', 'divisionDescriptorCode', 'department', 'departmentName', 'departmentDescriptorCode', 'course', 'courseName', 'courseDescriptorCode', 'section', 'sectionName', 'sectionDescriptorCode', 'courseId', 'displayFlag', 'checkAvailabilityLink', 'salesTrack', 'courseMinimum', 'includEDProgramName', 'includEDAdoptionNote', 'includEdDTO', 'instructor', 'sectionAdoptionDTO', 'rentalReturnDateDisplay'])"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "with_books[0]['courseSectionDTO'][0].keys()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'RECOMMENDED', 'REQUIRED'}"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# This key contains an indication of which books are required vs. recommended\n",
-    "{k for book in with_books\n",
-    "     for k in book['courseSectionDTO'][0]['sectionAdoptionDTO']['materialAdoptions'].keys()}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'digitalItemDTOs', 'printItemDTOs'}"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Keys for item types (e vs print)\n",
-    "{k for book in with_books\n",
-    "    for r in book['courseSectionDTO'][0]['courseMaterialResultsList']\n",
-    "    for  k in r.keys() if k.endswith('ItemDTOs')}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'BUY_NEW', 'BUY_USED', 'RENTAL_NEW', 'RENTAL_USED'}"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Keys for item subtypes (new. used, etc.)\n",
-    "{k for book in with_books\n",
-    "    for r in book['courseSectionDTO'][0]['courseMaterialResultsList']\n",
-    "    for k in r.get('printItemDTOs', {})\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Keys for extraction from bookstore JSON\n",
-    "top_keys = ['storeId', # int\n",
-    "           'storeNumber', # str\n",
-    "           'storeDisplayName', # str\n",
-    "           'currency', # str\n",
-    "           'requirementTypeLabelMap', # dict\n",
-    "           'courseSectionDTO']  # list \n",
-    "course_section_keys = ['courseSectionStatus', # dict\n",
-    "                      'termId', # str\n",
-    "                       'termName', # str\n",
-    "                       'termNumber', # str\n",
-    "                       'termStatus', # str\n",
-    "                       'termOpen', # bool\n",
-    "                       'programId', # str\n",
-    "                       'programName', # str\n",
-    "                       'campusId', # str\n",
-    "                       'campusName', # str\n",
-    "                       'institutionName', # str\n",
-    "                       'department', # str\n",
-    "                       'course', # str\n",
-    "                       'section', # str\n",
-    "                       'courseId', # str\n",
-    "                       'instructor', # str\n",
-    "                       'courseMaterialResultsList'] # list of dicts\n",
-    "course_materials_keys = ['title', # str\n",
-    "                        'edition', # str\n",
-    "                         'author', # str\n",
-    "                         'isbn',  # str\n",
-    "                         'materialType', # str\n",
-    "                         'requirementType', # str\n",
-    "                         'isPackage', # bool\n",
-    "                         'publisherCode', # str\n",
-    "                         'copyRightYear', # str\n",
-    "                         'publisher', # str\n",
-    "                         'priceRangeDisplay' # str\n",
-    "                         ,'digitalItemDTOs', # list\n",
-    "                         'printItemDTOs']  # dict\n",
-    "# dict\n",
-    "print_item_keys = ['BUY_NEW', 'BUY_USED', 'RENTAL_NEW', 'RENTAL_USED']\n",
-    "# dict\n",
-    "print_item_subkeys = ['typeCondition', 'priceDisplay', 'inventoryStatusDB',\n",
-    "                     'binding', 'priceNumeric', 'nonRentalChargesTotal', \n",
-    "                      'nonRentalBreakageCharge', 'nonRentalRestockingFee'] \n",
-    "# list of dict\n",
-    "digital_item_keys = ['subscription', 'typeCondition', 'priceDisplay', 'priceNumeric'] # subscription is optional key"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def clean_course_material(material):\n",
-    "    '''Reduces a dict of info about a specific course material'''\n",
-    "    cleaned_material = {k: v for k,v in material.items() if k in course_materials_keys}\n",
-    "    # clean the inner list or dict of items\n",
-    "    for i, item in enumerate(cleaned_material.get('digitalItemDTOs', [])):\n",
-    "        cleaned_item = clean_digital_item(item)\n",
-    "        cleaned_material['digitalItemDTOs'][i] = cleaned_item\n",
-    "    for k, v in cleaned_material.get('printItemDTOs', {}).items():\n",
-    "        cleaned_material['printItemDTOs'][k] = clean_print_item(v)\n",
-    "    return cleaned_material\n",
-    "def clean_print_item(item):\n",
-    "    '''Reduces a dict of info about a specific print item for sale'''\n",
-    "    return {k: v for k, v in item.items() if k in print_item_subkeys}\n",
-    "def clean_digital_item(item):\n",
-    "    '''Reduces a dict of info about a specific digital item for sale'''\n",
-    "    return {k: v for k,v in item.items() if k in digital_item_keys}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "bkst_data_cleaned = []\n",
-    "for d in bkst_data:\n",
-    "    d1 = {k: v for k,v in d.items() if k in top_keys}  # Top level elements present in all records\n",
-    "    # Reduce dictionaries in course section data\n",
-    "    d1['courseSectionDTO'] = [{k: v  for k,v in s.items() if k in course_section_keys} # elements present in all course-section blocks \n",
-    "                               for s in d1['courseSectionDTO']]            \n",
-    "    for i, section in enumerate(d1['courseSectionDTO']):\n",
-    "        for j, material in enumerate(section.get('courseMaterialResultsList', [])):\n",
-    "            d1['courseSectionDTO'][i]['courseMaterialResultsList'][j] = clean_course_material(material)\n",
-    "    bkst_data_cleaned.append(d1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with_books_cleaned = [c for c in bkst_data_cleaned if c['courseSectionDTO'][0].get('courseMaterialResultsList')]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('../data/bookstore-data-cleaned.json', 'w') as f:\n",
-    "    json.dump(bkst_data_cleaned, f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "assert len(with_books) == len(with_books_cleaned)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Cleaning the data further"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('../data/bookstore-data-cleaned.json') as f:\n",
-    "    bkst_data = json.load(f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# This inner is list is always length 1, so we can reduce it to its inner dict\n",
-    "len([b for b in bkst_data if len(b['courseSectionDTO']) > 1])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "bkst_data_cleaned2 = []\n",
-    "for b in bkst_data_cleaned:\n",
-    "    b['courseSection'] = b['courseSectionDTO'][0]\n",
-    "    del b['courseSectionDTO']\n",
-    "    bkst_data_cleaned2.append(b)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Shortening key names for useful items and removing some extraneous keys\n",
-    "for b in bkst_data_cleaned2:\n",
-    "    del b['courseSection']['courseSectionStatus']\n",
-    "    section = b['courseSection']\n",
-    "    if 'courseMaterialResultsList' in section:\n",
-    "        section['courseMaterials'] = section['courseMaterialResultsList']\n",
-    "        del section['courseMaterialResultsList']\n",
-    "        for m in section['courseMaterials']:\n",
-    "            if 'printItemDTOs' in m:\n",
-    "                m['printItems'] = m['printItemDTOs']\n",
-    "                del m['printItemDTOs']\n",
-    "            if 'digitalItemDTOs' in m:\n",
-    "                m['digitalItems'] = m['digitalItemDTOs']\n",
-    "                del m['digitalItemDTOs']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('../data/bookstore-data-cleaned.json', 'w') as f:\n",
-    "    json.dump(bkst_data_cleaned, f)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Creating a simplified, uniform dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('../data/bookstore-data-cleaned.json') as f:\n",
-    "    bkst_data = json.load(f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with_books = [b for b in bkst_data_cleaned2 if b['courseSection'].get('courseMaterials')]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "course_keys = ['department', 'course', 'section', 'instructor', 'termName']\n",
-    "book_keys = ['title', 'author', 'edition', 'isbn', 'materialType', 'requirementType',\n",
-    "            'copyRightYear', 'publisher']\n",
-    "item_keys = ['typeCondition', 'priceDisplay']\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "simplified = []\n",
-    "for course in bkst_data_cleaned2:\n",
-    "    course_data = {k: course['courseSection'].get(k) for k in course_keys} \n",
-    "    books = []\n",
-    "    for book in course['courseSection'].get('courseMaterials',[]):\n",
-    "        book_data = {k: book.get(k) for k in book_keys}\n",
-    "        for item in book.get('printItems', {}).values():\n",
-    "            book_item = deepcopy(book_data)\n",
-    "            book_item.update({k: item[k] for k in item_keys})\n",
-    "            book_item['itemType'] = 'print'\n",
-    "            books.append(book_item)\n",
-    "        for item in book.get('digitalItems', []):\n",
-    "            book_item = deepcopy(book_data)\n",
-    "            book_item.update({k: item[k] for k in item_keys})\n",
-    "            book_item['itemType'] = 'digital'\n",
-    "            books.append(book_item)\n",
-    "    course_data['texts'] = books\n",
-    "    simplified.append(course_data)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Convert camel case to snake case \n",
-    "case_convert = re.compile(r'(?<!^)(?=[A-Z])')\n",
-    "def camel_to_snake(item):\n",
-    "    '''\n",
-    "    :param item: should be a dictionary\n",
-    "    Will recurse for nested lists of dicts\n",
-    "    '''\n",
-    "    if isinstance(item, dict):\n",
-    "        new_dict = {}\n",
-    "        for key, value in item.items():\n",
-    "            new_key = re.sub(case_convert, '_', key).lower()\n",
-    "            if isinstance(value, list):\n",
-    "                new_dict[new_key] = [camel_to_snake(v) for v in value]\n",
-    "            else:\n",
-    "                new_dict[new_key] = value\n",
-    "        return new_dict"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "simplified = [camel_to_snake(s) for s in simplified]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 35,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('../data/bookstore-data-simplified.json', 'w') as f:\n",
-    "    json.dump(simplified, f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "51"
-      ]
-     },
-     "execution_count": 38,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len([c for c in simplified if len(c['texts'])>0])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1203"
-      ]
-     },
-     "execution_count": 39,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(simplified)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1182"
-      ]
-     },
-     "execution_count": 42,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len({(c['department'], c['course'], c['section']) for c in simplified})"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "ENV",
-   "language": "python",
-   "name": "env"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.4"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/course_utils/__init__.py b/course_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/course-utils/autograder.py b/course_utils/autograder.py
similarity index 100%
rename from course-utils/autograder.py
rename to course_utils/autograder.py
diff --git a/course-utils/bookstore_scraper.py b/course_utils/bookstore_scraper.py
similarity index 100%
rename from course-utils/bookstore_scraper.py
rename to course_utils/bookstore_scraper.py
diff --git a/course-utils/parsons_builder.py b/course_utils/parsons_builder.py
similarity index 100%
rename from course-utils/parsons_builder.py
rename to course_utils/parsons_builder.py
diff --git a/course-utils/postprocessing.py b/course_utils/postprocessing.py
similarity index 77%
rename from course-utils/postprocessing.py
rename to course_utils/postprocessing.py
index e534c94..9123d03 100644
--- a/course-utils/postprocessing.py
+++ b/course_utils/postprocessing.py
@@ -34,6 +34,20 @@ def __init__(self, nb_file):
         :param nb_file: str or Path to an ipynb file.
         '''
         self.nb_json = self.load_nb(nb_file)
+        self.data = self.nb_json['cells']
+
+    def __iter__(self):
+        # resets the cell cursor; the Notebook acts as its own iterator
+        self.index = -1
+        return self
+
+    def __next__(self):
+        # advances the cursor and returns the Notebook itself (not the cell); per-cell methods use self.index
+        if self.index < len(self.data) - 1:
+            self.index += 1
+            return self
+        else:
+            raise StopIteration
     
     def hide_tags(self):
         ''''
@@ -47,9 +61,8 @@ def clear_outputs(self):
         '''
         Clears output on cells -- assuming the student notebooks should be clean of all outputs.
         '''
-        for cell in self.nb_json['cells']:
-            if  cell['cell_type'] == 'code':
-                cell['outputs'] = []
+        if self.data[self.index]['cell_type'] == 'code':
+            self.data[self.index]['outputs'] = []
         return self
 
     def make_glossary_links(self):
@@ -63,21 +76,30 @@ def term_expand(match_obj):
             term = match_obj.group(1)
             return f'[{term}]({GLOSSARY_URL}{term.replace(" ", "-")})'
 
-        for cell in self.nb_json['cells']:
-            if cell['cell_type'] == 'markdown':
-                for i, line in enumerate(cell['source']):
-                    # Iterate over matches in line
-                    new_line = re.sub(TERM_PATTERN, term_expand, line)
-                    cell['source'][i] = new_line
+        if self.data[self.index]['cell_type'] == 'markdown':
+            for i, line in enumerate(self.data[self.index]['source']):
+                # Iterate over matches in line
+                new_line = re.sub(TERM_PATTERN, term_expand, line)
+                self.data[self.index]['source'][i] = new_line
         return self
     
     def ensure_hidden(self):
         '''
         Ensures that cells using the Exercise2 Jupyter Notebook extension are hidden by default. 
         '''
-        for cell in self.nb_json['cells']:
-            if 'solution2' in cell['metadata']:
-                cell['metadata']['solution2'] = 'hidden'
+        if 'solution2' in self.data[self.index]['metadata']:
+            self.data[self.index]['metadata']['solution2'] = 'hidden'
+        return self
+
+    def apply_hidden(self):
+        '''
+        Hides the source of the current cell if it carries the hide-cell tag.
+        '''
+        if 'hide-cell' in self.data[self.index]['metadata'].get('tags', []): 
+            self.data[self.index]['metadata']['jupyter'] = {'source_hidden': True}
+            if self.data[self.index]['cell_type'] == 'code':
+                # Add comment that will be visible on toggled cell
+                self.data[self.index]['source'].insert(0, '#Click to see the solution.\n')
         return self
 
     def remove_directives(self):
@@ -115,18 +137,15 @@ def remove_tagged_cells(self, tags=TAGS_TO_REMOVE):
         '''
         :param tags: should be a Python set of tags. Any cells with any of these tags will be removed from the output notebook.
         '''
-        output = []
-        for cell in self.nb_json['cells']:
-            cell_tags = cell['metadata'].get('tags', [])
-            if not (tags & set(cell_tags)):
-                output.append(deepcopy(cell))
-        self.nb_json['cells'] = output
+        self.data = [cell for cell in self.data 
+                     if not (tags & set(cell['metadata'].get('tags', [])))]
         return self      
 
     def save_nb(self, nb_file):
         '''
         Saves notebook json at provided path
         '''          
+        self.nb_json['cells'] = self.data
         with open(nb_file, 'w') as f:
             json.dump(self.nb_json, f)
         return self
@@ -160,7 +179,10 @@ def main(nb_input, nb_output):
         
         logger.info(f'Processing notebook {in_}; saving output to {out}.')
         nb = Notebook(in_)
-        nb.remove_directives().remove_tagged_cells().make_glossary_links().ensure_hidden().clear_outputs().hide_tags().save_nb(out)
+        nb.remove_tagged_cells()
+        for cell in nb:
+            cell.make_glossary_links().apply_hidden().clear_outputs()
+        nb.hide_tags().save_nb(out)
 
 if __name__ == '__main__':
     main()
\ No newline at end of file
diff --git a/publish.sh b/publish.sh
index e6cbaf3..ee13634 100755
--- a/publish.sh
+++ b/publish.sh
@@ -40,14 +40,14 @@ function update_hw_modules() {
 function main() {
 
     echo "Building Parsons Problems"
-    python ./course-utils/parsons_builder.py
+    python ./course_utils/parsons_builder.py
 
     echo "Building book from scratch"
     jupyter-book clean textbook/
     jupyter-book build textbook/
 
     echo "Cleaning up notebook formatting"
-    python ./course-utils/postprocessing.py
+    python ./course_utils/postprocessing.py
 
     echo "Publishing to GH pages"
     ghp-import -n -p -f textbook/_build/html
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_postprocessing.ipynb b/tests/test_postprocessing.ipynb
new file mode 100644
index 0000000..edc3546
--- /dev/null
+++ b/tests/test_postprocessing.ipynb
@@ -0,0 +1,122 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "39c26caf-db01-40d7-b62b-4973537b7b9c",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": [
+     "clear-outputs"
+    ]
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "99.95"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# clear outputs\n",
+    "book_price = 99.95\n",
+    "num_students = 55\n",
+    "book_price"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ff62f5c5-3ad9-454c-ac42-7bd955e7c8d6",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": [
+     "term-directive"
+    ]
+   },
+   "source": [
+    "{term}`glossary term` should contain a link."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e41339fc-9bff-4560-b682-9169405f8199",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": [
+     "hide-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "print(\"This cell should be hidden\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "53658e40-ffe4-4c73-9368-374d8cf54abb",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [
+    {
+     "ename": "AssertionError",
+     "evalue": "This cell should have been removed.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m#This cell should be removed.\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mThis cell should have been removed.\u001b[39m\u001b[38;5;124m'\u001b[39m\n",
+      "\u001b[0;31mAssertionError\u001b[0m: This cell should have been removed."
+     ]
+    }
+   ],
+   "source": [
+    "#This cell should be removed.\n",
+    "assert False, 'This cell should have been removed.'"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python Camp",
+   "language": "python",
+   "name": "env"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/test_postprocessing.py b/tests/test_postprocessing.py
new file mode 100644
index 0000000..1640e39
--- /dev/null
+++ b/tests/test_postprocessing.py
@@ -0,0 +1,26 @@
+from course_utils.postprocessing import Notebook
+import unittest
+import json
+
+class TestPostProcessing(unittest.TestCase):
+    '''Checks the Notebook post-processing steps against the tests/test_postprocessing.ipynb fixture.'''
+    def setUp(self):  # same steps as postprocessing.main(), minus save_nb
+        self.notebook = Notebook('tests/test_postprocessing.ipynb')
+        self.notebook.remove_tagged_cells()
+        for cell in self.notebook:
+            cell.make_glossary_links().apply_hidden().clear_outputs()
+        self.notebook.hide_tags()
+    
+    def testProcessing(self):
+        '''Each assertion targets one fixture cell by its position once tagged cells are removed.'''
+        self.assertEqual(self.notebook.data[0]['outputs'], [], 
+                         'cell output not cleared')
+        self.assertRegex(self.notebook.data[1]['source'][0], r'\[.+\]\(https://gwu-libraries\.github\.io/python-camp/glossary\.html#.+\)', 
+                         'Markdown link missing or malformed')
+        self.assertEqual(self.notebook.data[2]['source'][0], '#Click to see the solution.\n', 'Hidden code cell missing initial comment.')
+        self.assertEqual(self.notebook.data[2]['metadata']['jupyter'], 
+                         {"source_hidden": True},
+                      'source_hidden flag missing from hidden cell metadata')
+        cells_for_removal = [cell for cell in self.notebook.data if 'remove-cell' in cell['metadata'].get('tags', [])]
+        self.assertEqual(cells_for_removal, [], 'cells tagged for removal not removed')
+
diff --git a/textbook/notebooks/homework/HW_1_from_code_to_data.ipynb b/textbook/notebooks/homework/HW_1_from_code_to_data.ipynb
index fa71dba..0e9cad9 100644
--- a/textbook/notebooks/homework/HW_1_from_code_to_data.ipynb
+++ b/textbook/notebooks/homework/HW_1_from_code_to_data.ipynb
@@ -292,7 +292,13 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
    "source": [
     "````{admonition} Try it out!\n",
     ":class: try-it-out\n",
@@ -323,8 +329,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
     "solution2": "hidden",
     "tags": [
      "hide-cell"
@@ -380,7 +390,13 @@
   {
    "cell_type": "code",
    "execution_count": 14,
-   "metadata": {},
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "course = 'CHEM 1002 10'\n",
@@ -389,26 +405,38 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
    "source": [
     "What if we want to extract the department code, the course number, and the section number from the `course` variable? \n",
     "\n",
     "By counting characters, we can see the following:\n",
     "- The department code occupies the first four (4) index positions. With strings, the first position is labeled `0`, not `1`, so the first 4 characters would fall in positions `0`, `1`, `2`, and `3`.\n",
+    "- The course number occupies four more positions, but we also have to account for the intervening space: `4` (the space), then `5`, `6`, `7`, `8`.\n",
+    "\n",
     "\n",
     "|0|1|2|3|4|5|6|7|8|9|10|11|\n",
-    "|-|-|-|-|-|-|-|-|-|-|-|-|\n",
+    "|:-|:-|:-|:-|:-|:-|:-|:-|:-|:-|:-|:-|\n",
     "|C|H|E|M| |1|0|0|2| |1|0|\n",
     "\n",
-    "- The course number occupies four more positions, but we also have to account for the intervening space: `4` (the space), then `5`, `6`, `7`, `8`.\n",
-    "\n",
     "We can use this information to **slice** our `course` variable as follows:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 24,
-   "metadata": {},
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -428,7 +456,13 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
    "source": [
     "````{admonition} Notes\n",
     ":class: notes\n",
@@ -512,7 +546,13 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
    "source": [
     "What if we don't know the exact position of the characters we want to extract? \n",
     "\n",
@@ -656,8 +696,13 @@
   {
    "cell_type": "markdown",
    "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
     "solution2": "hidden",
-    "solution2_first": true
+    "solution2_first": true,
+    "tags": []
    },
    "source": [
     "Now check your answer by expanding the hidden solution cell below."
@@ -666,10 +711,17 @@
   {
    "cell_type": "markdown",
    "metadata": {
-    "solution2": "hidden"
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "solution2": "hidden",
+    "tags": []
    },
    "source": [
-    "````{toggle}\n",
+    "````{hint} Solution\n",
+    ":class: dropdown\n",
+    "\n",
     "If you ran the code `term.split()`, you should have gotten output like this:\n",
     "\n",
     "`['Summer', '2023']`\n",
@@ -784,6 +836,10 @@
    "cell_type": "code",
    "execution_count": 50,
    "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
     "solution2": "hidden",
     "tags": [
      "hide-cell"