umdmajors.py
# Scrape the list of majors from the UMD admissions site and write it to a CSV.
# Written for Python 2 with the BeautifulSoup 3 package.
import urllib
from BeautifulSoup import BeautifulSoup
import csv

# Collected rows, starting with the CSV header.
all_rows = []
headers = [u'majorName', u'school', u'webAddress']
all_rows.append(headers)
# Fetch the majors page and parse it with BeautifulSoup.
link = "http://www.admissions.umd.edu/academics/Majors.php?m=1"
f = urllib.urlopen(link)
myfile = f.read()
soup = BeautifulSoup(myfile)

# The list of majors lives inside this content div; grab every link in it.
content = soup.find("div", {'class': 'chld3OneColContent'})
content1 = content.findAll('a', href=True)
for item in content1:
    row_content = []
    # Keep only absolute links whose text has at least two space-separated parts:
    # the first part is treated as the major name, the second (with parentheses
    # stripped) as the school, and the href as the web address.
    if item['href'].find('http') != -1 and len(item.contents[0].split(' ')) > 1:
        row_content.append(item.contents[0].split(' ')[0])
        row_content.append(item.contents[0].split(' ')[1].replace('(', '').replace(')', ''))
        row_content.append(item['href'])
        all_rows.append(row_content)
# Print the collected rows as a quick sanity check before writing the CSV;
# comment this line out if the output is too noisy.
print all_rows
# Write the rows to a CSV file (binary mode, as recommended for the Python 2 csv module).
handle = open('umdmajors.csv', 'wb')
outfile = csv.writer(handle)
outfile.writerows(all_rows)
handle.close()
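The script above targets Python 2 and BeautifulSoup 3, both of which are end of life. A minimal Python 3 sketch of the same scrape, assuming the requests and beautifulsoup4 packages are installed and that the page still serves its list of majors inside the chld3OneColContent div, might look like this (the filtering logic is carried over unchanged):

import csv

import requests
from bs4 import BeautifulSoup

URL = "http://www.admissions.umd.edu/academics/Majors.php?m=1"

# Collected rows, starting with the CSV header.
rows = [["majorName", "school", "webAddress"]]

# Fetch and parse the majors page.
html = requests.get(URL).text
soup = BeautifulSoup(html, "html.parser")
content = soup.find("div", class_="chld3OneColContent")

for item in content.find_all("a", href=True):
    parts = item.get_text(strip=True).split(" ")
    # Keep only absolute links whose text has at least two space-separated parts,
    # mirroring the filter in the Python 2 version above.
    if "http" in item["href"] and len(parts) > 1:
        school = parts[1].replace("(", "").replace(")", "")
        rows.append([parts[0], school, item["href"]])

# newline="" avoids the csv module inserting extra blank lines on Windows.
with open("umdmajors.csv", "w", newline="") as handle:
    csv.writer(handle).writerows(rows)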