Included original description and further standardized tokens

gwu-libraries · Jan 25, 2025 · e9130ad · e9130ad
1 parent b8fd707
commit e9130ad
Show file tree

Hide file tree

Showing 3 changed files with 217 additions and 167 deletions.
diff --git a/text-as-data/course_bulletin.ipynb b/text-as-data/course_bulletin.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 1,
    "id": "281c90df-540e-4139-9434-aa4255d95f72",
    "metadata": {},
    "outputs": [],
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "id": "e97bf66a-71fc-4e9c-98cf-d10d116c8485",
    "metadata": {},
    "outputs": [],
@@ -34,7 +34,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
    "id": "e2bdf5d2-cf3a-4ba2-9f46-d836fdd07e01",
    "metadata": {},
    "outputs": [],
@@ -44,7 +44,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 6,
    "id": "b0221ab4-f5a1-4e35-b30f-2f749e88da3a",
    "metadata": {},
    "outputs": [],
@@ -54,7 +54,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 7,
    "id": "11bbdde9-26ec-4ceb-a935-211147105a8a",
    "metadata": {},
    "outputs": [],
@@ -68,7 +68,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 8,
    "id": "4ff4c238-0f7c-4292-b78c-153e724ad863",
    "metadata": {},
    "outputs": [],
@@ -83,27 +83,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 10,
    "id": "f423091c-d598-4518-af24-ecce58dcc97d",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Collecting en-core-web-sm==3.8.0\n",
-      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)\n",
-      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m17.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
-      "\u001b[?25hInstalling collected packages: en-core-web-sm\n",
-      "Successfully installed en-core-web-sm-3.8.0\n",
-      "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
-      "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
-      "You can now load the package via spacy.load('en_core_web_sm')\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import spacy\n",
     "#!python -m spacy download en_core_web_sm\n",
@@ -112,7 +95,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 11,
    "id": "727e803e-278e-41ce-8974-635b5df97ac1",
    "metadata": {},
    "outputs": [],
@@ -122,26 +105,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 44,
    "id": "32d622e6-6a3c-49a8-b898-ec0583c5e1cf",
    "metadata": {},
    "outputs": [],
    "source": [
-    "cleaned_courses = defaultdict(dict)\n",
+    "cleaned_courses = []\n",
     "i = 0\n",
     "for dept, dept_dict in courses.items():\n",
     "    for title, desc in dept_dict.items():\n",
-    "        desc_tokens = [t.text for t in docs[i] if not t.is_space and not t.is_punct]\n",
+    "        desc_tokens = [t.text.lower() for t in docs[i] if t.is_alpha]\n",
     "        if desc_tokens:\n",
     "            title = title.replace('\\xa0', ' ')\n",
-    "            cleaned_courses[dept][title] = desc_tokens\n",
+    "            desc = desc.replace('\\xa0', ' ')\n",
+    "            cleaned_courses.append({'dept': dept, 'title': title, 'desc': desc, 'tokens': desc_tokens})\n",
     "        i += 1"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
-   "id": "69aed025-42ba-4b37-89cd-bfab4a0c648b",
+   "execution_count": 45,
+   "id": "297a6b38-00da-455b-ae4d-b4e220cd3931",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -152,79 +136,148 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "id": "b16ce674-9b4e-4e8b-8240-8a4343591d05",
+   "execution_count": 13,
+   "id": "ad769062-acf7-4be8-90c2-7577d2e8e573",
    "metadata": {},
    "outputs": [],
    "source": [
-    "import json\n",
-    "with open('gw_bulletin.json') as f:\n",
-    "    cleaned_courses = json.load(f)"
+    "from random import sample"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "id": "ad769062-acf7-4be8-90c2-7577d2e8e573",
+   "execution_count": 20,
+   "id": "e2d53e37-7f6c-494b-acf7-609995603067",
    "metadata": {},
    "outputs": [],
    "source": [
-    "from random import sample"
+    "sample_keys = sample(list(cleaned_courses.keys()), k=10)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "e2d53e37-7f6c-494b-acf7-609995603067",
+   "execution_count": 21,
+   "id": "e306ed9c-13ff-4753-930e-6d67f9880140",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['School of Media and Public Affairs (SMPA)',\n",
+       " 'Corcoran Interaction Design (CIXD)',\n",
+       " 'Political Science (PSC)',\n",
+       " 'Computer Science (CSCI)',\n",
+       " 'Professional Studies Public Leadership (PSPL)',\n",
+       " 'Chinese (CHIN)',\n",
+       " 'Regulatory Affairs (RAFF)',\n",
+       " 'Hominid Paleobiology (HOMP)',\n",
+       " 'English (ENGL)',\n",
+       " 'Speech, Language, and Hearing Science (SLHS)']"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sample_keys"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "986ea482-381a-49ab-a6fa-21b771983836",
    "metadata": {},
    "outputs": [],
    "source": [
-    "sample_keys = sample(list(cleaned_courses.keys()), k=10)"
+    "num_docs = 100"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "id": "e306ed9c-13ff-4753-930e-6d67f9880140",
+   "execution_count": 56,
+   "id": "c241bb98-1d10-4940-ba49-970e7ee2d776",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample = [c for c in cleaned_courses if c['dept'] in sample_keys]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "a5fcf158-c533-4434-9b5a-2cc92ff98ca8",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['International Business (IBUS)',\n",
-       " 'Anatomy and Cell Biology (ANAT)',\n",
-       " 'Management (MGT)',\n",
-       " 'Sustainability (SUST)',\n",
-       " 'Health Services Management and Leadership (HSML)',\n",
-       " 'Counseling (CNSL)',\n",
-       " 'Sociology (SOC)',\n",
-       " 'Geology (GEOL)',\n",
-       " 'Speech, Language, and Hearing Science (SLHS)',\n",
-       " 'Business Administration (BADM)']"
+       "825"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 58,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "sample_keys"
+    "len(sample)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 24,
+   "id": "5aab7d7c-bb30-459d-893b-2be0335b8a25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from itertools import *\n",
+    "\n",
+    "def roundrobin(*iterables):\n",
+    "    \"Visit input iterables in a cycle until each is exhausted.\"\n",
+    "    # roundrobin('ABC', 'D', 'EF') → A D E B F C\n",
+    "    # Algorithm credited to George Sakkis\n",
+    "    iterators = map(iter, iterables)\n",
+    "    for num_active in range(len(iterables), 0, -1):\n",
+    "        iterators = cycle(islice(iterators, num_active))\n",
+    "        yield from map(next, iterators)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "6b3acb61-cfe2-456c-9d87-9c5d45a6d4bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "groups = [list(g) for k, g in groupby(sample, key=lambda x: x['dept'])]\n",
+    "sample = [c for c in roundrobin(*groups)][:100]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "id": "39555eea-d8f3-4e5f-adc9-e737788d9db6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample = sorted(sample, key=lambda x: x['dept'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
    "id": "b1eab6dc-3e72-49db-bf7c-02252e17f5e6",
    "metadata": {},
    "outputs": [],
    "source": [
-    "fieldnames = ['department', 'course', 'description']"
+    "fieldnames = ['department', 'course', 'description', 'tokens']"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 65,
    "id": "363ea68b-5af8-4fd0-9d39-bdfff3162499",
    "metadata": {},
    "outputs": [],
@@ -233,11 +286,8 @@
     "with open('gw_bulletin.csv', 'w') as f:\n",
     "    writer = DictWriter(f, fieldnames)\n",
     "    writer.writeheader()\n",
-    "    for dept, course_dict in cleaned_courses.items():\n",
-    "        if dept in sample_keys:\n",
-    "            for i, (title, desc) in enumerate(course_dict.items()):\n",
-    "                if i < 10:\n",
-    "                    writer.writerow(dict(zip(fieldnames, (dept, title, '|'.join(desc)))))"
+    "    for course in sample:\n",
+    "        writer.writerow(dict(zip(fieldnames, (course['dept'], course['title'], course['desc'], '|'.join(course['tokens'])))))"
    ]
   }
  ],