Skip to content

Commit

Permalink
Included original description and further standardized tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
dolsysmith committed Jan 25, 2025
1 parent b8fd707 commit e9130ad
Show file tree
Hide file tree
Showing 3 changed files with 217 additions and 167 deletions.
180 changes: 115 additions & 65 deletions text-as-data/course_bulletin.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 1,
"id": "281c90df-540e-4139-9434-aa4255d95f72",
"metadata": {},
"outputs": [],
Expand All @@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "e97bf66a-71fc-4e9c-98cf-d10d116c8485",
"metadata": {},
"outputs": [],
Expand All @@ -34,7 +34,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"id": "e2bdf5d2-cf3a-4ba2-9f46-d836fdd07e01",
"metadata": {},
"outputs": [],
Expand All @@ -44,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 6,
"id": "b0221ab4-f5a1-4e35-b30f-2f749e88da3a",
"metadata": {},
"outputs": [],
Expand All @@ -54,7 +54,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 7,
"id": "11bbdde9-26ec-4ceb-a935-211147105a8a",
"metadata": {},
"outputs": [],
Expand All @@ -68,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 8,
"id": "4ff4c238-0f7c-4292-b78c-153e724ad863",
"metadata": {},
"outputs": [],
Expand All @@ -83,27 +83,10 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 10,
"id": "f423091c-d598-4518-af24-ecce58dcc97d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting en-core-web-sm==3.8.0\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m17.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hInstalling collected packages: en-core-web-sm\n",
"Successfully installed en-core-web-sm-3.8.0\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
"You can now load the package via spacy.load('en_core_web_sm')\n"
]
}
],
"outputs": [],
"source": [
"import spacy\n",
"#!python -m spacy download en_core_web_sm\n",
Expand All @@ -112,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 11,
"id": "727e803e-278e-41ce-8974-635b5df97ac1",
"metadata": {},
"outputs": [],
Expand All @@ -122,26 +105,27 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 44,
"id": "32d622e6-6a3c-49a8-b898-ec0583c5e1cf",
"metadata": {},
"outputs": [],
"source": [
"cleaned_courses = defaultdict(dict)\n",
"cleaned_courses = []\n",
"i = 0\n",
"for dept, dept_dict in courses.items():\n",
" for title, desc in dept_dict.items():\n",
" desc_tokens = [t.text for t in docs[i] if not t.is_space and not t.is_punct]\n",
" desc_tokens = [t.text.lower() for t in docs[i] if t.is_alpha]\n",
" if desc_tokens:\n",
" title = title.replace('\\xa0', ' ')\n",
" cleaned_courses[dept][title] = desc_tokens\n",
" desc = desc.replace('\\xa0', ' ')\n",
" cleaned_courses.append({'dept': dept, 'title': title, 'desc': desc, 'tokens': desc_tokens})\n",
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "69aed025-42ba-4b37-89cd-bfab4a0c648b",
"execution_count": 45,
"id": "297a6b38-00da-455b-ae4d-b4e220cd3931",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -152,79 +136,148 @@
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b16ce674-9b4e-4e8b-8240-8a4343591d05",
"execution_count": 13,
"id": "ad769062-acf7-4be8-90c2-7577d2e8e573",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"with open('gw_bulletin.json') as f:\n",
" cleaned_courses = json.load(f)"
"from random import sample"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ad769062-acf7-4be8-90c2-7577d2e8e573",
"execution_count": 20,
"id": "e2d53e37-7f6c-494b-acf7-609995603067",
"metadata": {},
"outputs": [],
"source": [
"from random import sample"
"# cleaned_courses is a list of course dicts, so collect the distinct departments (first-seen order) before sampling.\n",
"sample_keys = sample(list(dict.fromkeys(c['dept'] for c in cleaned_courses)), k=10)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e2d53e37-7f6c-494b-acf7-609995603067",
"execution_count": 21,
"id": "e306ed9c-13ff-4753-930e-6d67f9880140",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['School of Media and Public Affairs (SMPA)',\n",
" 'Corcoran Interaction Design (CIXD)',\n",
" 'Political Science (PSC)',\n",
" 'Computer Science (CSCI)',\n",
" 'Professional Studies Public Leadership (PSPL)',\n",
" 'Chinese (CHIN)',\n",
" 'Regulatory Affairs (RAFF)',\n",
" 'Hominid Paleobiology (HOMP)',\n",
" 'English (ENGL)',\n",
" 'Speech, Language, and Hearing Science (SLHS)']"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_keys"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "986ea482-381a-49ab-a6fa-21b771983836",
"metadata": {},
"outputs": [],
"source": [
"sample_keys = sample(list(cleaned_courses.keys()), k=10)"
"num_docs = 100"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e306ed9c-13ff-4753-930e-6d67f9880140",
"execution_count": 56,
"id": "c241bb98-1d10-4940-ba49-970e7ee2d776",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): this rebinds `sample`, shadowing random.sample imported earlier, so the sample_keys cell cannot be re-run after this one.\n",
"sample = [c for c in cleaned_courses if c['dept'] in sample_keys]"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "a5fcf158-c533-4434-9b5a-2cc92ff98ca8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['International Business (IBUS)',\n",
" 'Anatomy and Cell Biology (ANAT)',\n",
" 'Management (MGT)',\n",
" 'Sustainability (SUST)',\n",
" 'Health Services Management and Leadership (HSML)',\n",
" 'Counseling (CNSL)',\n",
" 'Sociology (SOC)',\n",
" 'Geology (GEOL)',\n",
" 'Speech, Language, and Hearing Science (SLHS)',\n",
" 'Business Administration (BADM)']"
"825"
]
},
"execution_count": 7,
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_keys"
"len(sample)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 24,
"id": "5aab7d7c-bb30-459d-893b-2be0335b8a25",
"metadata": {},
"outputs": [],
"source": [
"from itertools import cycle, groupby, islice\n",
"\n",
"def roundrobin(*iterables):\n",
"    \"\"\"Visit input iterables in a cycle until each is exhausted.\n",
"\n",
"    Yields the first item of every iterable, then the next item of every\n",
"    iterable that still has one, and so on, e.g.\n",
"    roundrobin('ABC', 'D', 'EF') → A D E B F C\n",
"\n",
"    Recipe from the itertools documentation (algorithm credited to\n",
"    George Sakkis).\n",
"    \"\"\"\n",
"    iterators = map(iter, iterables)\n",
"    # Each pass ends when one iterator raises StopIteration; the next pass\n",
"    # rebuilds the cycle over the num_active iterators that remain.\n",
"    for num_active in range(len(iterables), 0, -1):\n",
"        iterators = cycle(islice(iterators, num_active))\n",
"        yield from map(next, iterators)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "6b3acb61-cfe2-456c-9d87-9c5d45a6d4bf",
"metadata": {},
"outputs": [],
"source": [
"# groupby only merges adjacent items; `sample` is still in bulletin order here, so each department is one contiguous run.\n",
"groups = [list(g) for _, g in groupby(sample, key=lambda x: x['dept'])]\n",
"# Interleave departments and keep the first num_docs courses so no single department dominates the sample.\n",
"sample = list(islice(roundrobin(*groups), num_docs))"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "39555eea-d8f3-4e5f-adc9-e737788d9db6",
"metadata": {},
"outputs": [],
"source": [
"sample = sorted(sample, key=lambda x: x['dept'])"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "b1eab6dc-3e72-49db-bf7c-02252e17f5e6",
"metadata": {},
"outputs": [],
"source": [
"fieldnames = ['department', 'course', 'description']"
"fieldnames = ['department', 'course', 'description', 'tokens']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 65,
"id": "363ea68b-5af8-4fd0-9d39-bdfff3162499",
"metadata": {},
"outputs": [],
Expand All @@ -233,11 +286,8 @@
"# newline='' lets the csv module control line endings (avoids blank rows on Windows); utf-8 keeps non-ASCII text intact.\n",
"with open('gw_bulletin.csv', 'w', newline='', encoding='utf-8') as f:\n",
"    writer = DictWriter(f, fieldnames)\n",
"    writer.writeheader()\n",
" for dept, course_dict in cleaned_courses.items():\n",
" if dept in sample_keys:\n",
" for i, (title, desc) in enumerate(course_dict.items()):\n",
" if i < 10:\n",
" writer.writerow(dict(zip(fieldnames, (dept, title, '|'.join(desc)))))"
" for course in sample:\n",
" writer.writerow(dict(zip(fieldnames, (course['dept'], course['title'], course['desc'], '|'.join(course['tokens'])))))"
]
}
],
Expand Down
Loading

0 comments on commit e9130ad

Please sign in to comment.