-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoed-xml.py
102 lines (74 loc) · 2.62 KB
/
oed-xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
"""
oed-xml.py - OED XML Python Query
Copyright 2014 Sujeet Akula ([email protected])
Licensed under the Eiffel Forum License 2.
"""
from lxml import etree
from StringIO import StringIO
import contextlib, urllib, urllib2
from HTMLParser import HTMLParser
import re
#from urllib import urlopen, urlencode
# --- OED SRU endpoint -------------------------------------------------------
# Base address of the search/retrieve service; fetch() sends its request here.
oed = 'http://www.oed.com/srupage'

# The same request written out as a single URL with a '{}' slot for the term.
# Nothing else in this file refers to it.
oed_url = (oed + '?operation=searchRetrieve'
           '&query=cql.serverChoice+=+{}'
           '&maximumRecords=100'
           '&startRecord=1')

# Request parameter template; the '{}' in 'query' is filled in by search().
oed_req_vals = {
    'operation': 'searchRetrieve',
    'query': 'cql.serverChoice = {}',
    'maximumRecords': '100',
    'startRecord': '1',
}

# --- HTTP headers -----------------------------------------------------------
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# --- XML namespaces ---------------------------------------------------------
# '{namespace}' prefixes that search() prepends to tag names in find() calls.
srw = '{http://www.loc.gov/zing/srw/}'
sru_dc = '{info:srw/schema/1/dc-v1.1}'
dc = '{http://purl.org/dc/elements/1.1/}'

# --- Markup clean-up patterns (quick and dirty) -----------------------------
# Each pattern matches both the opening and the closing form of its tag.
# clean_desc() deletes the rm_* matches and replaces the sub_* matches.
rm_disp = re.compile(r'<[/]*display>')
rm_span = re.compile(r'<[/]*span[^>]*>')
sub_em = re.compile(r'<[/]*em>')
sub_strong = re.compile(r'<[/]*strong>')
# Shared HTMLParser instance; search() only uses its unescape() method, on the
# title and description text of each record.
hparse = HTMLParser()
def fetch(req_vals):
    """Send the request values to the OED SRU endpoint and return the reply.

    Parameters:
        req_vals: mapping of request parameter names to values; it is
            url-encoded and sent as the request data.

    Returns:
        The response body exactly as returned by read().

    Raises:
        Errors from urllib2.urlopen (for example urllib2.URLError) are not
        caught here and propagate to the caller.
    """
    # Passing a data argument makes urllib2 issue a POST instead of a GET.
    oed_req = urllib2.Request(oed, urllib.urlencode(req_vals), headers)
    # closing() guarantees the connection is released, even if read() raises.
    with contextlib.closing(urllib2.urlopen(oed_req)) as response:
        return response.read()
def search(query):
    """Look up a term on the OED SRU endpoint and collect its definitions.

    Parameters:
        query: the search term. It is percent-quoted and substituted into
            the 'cql.serverChoice = {}' request template.

    Returns:
        A (num_records, defs) tuple. num_records is the integer read from
        the response's numberOfRecords element; defs is a list of UTF-8
        encoded 'title :: description' strings, one for each element that
        carries recordData with a dc child.
        Returns None, after printing an error message, when the record
        count is missing from the response or is less than 1.
    """
    # Work on a copy: formatting the shared template in place would consume
    # its '{}' placeholder, so every later call would resend the first query.
    req_vals = dict(oed_req_vals)
    # NOTE(review): fetch() url-encodes these values again, so the term ends
    # up quoted twice -- confirm that this is what the server expects.
    req_vals['query'] = req_vals['query'].format(urllib.quote(query))

    root = etree.parse(StringIO(fetch(req_vals))).getroot()

    count_el = root.find(srw + 'numberOfRecords')
    if count_el is None:
        print('Error: Unknown number of records')
        return None
    num_records = int(count_el.text)
    if num_records < 1:
        print('Error: No records found')
        return None

    defs = []
    records = root.find(srw + 'records')
    if records is None:
        # A positive count without a records container: nothing to list.
        return (num_records, defs)

    # iter() visits the container and every descendant; only elements that
    # have a direct recordData child contribute a definition.
    for record in records.iter():
        rdata = record.find(srw + 'recordData')
        if rdata is None:
            continue
        data = rdata.find(sru_dc + 'dc')
        if data is None:
            # Record without a dc payload: skip it instead of crashing.
            continue
        # findtext() yields '' for a missing or empty element, so a record
        # lacking a title or description no longer raises.
        raw_title = data.findtext(dc + 'title', '')
        raw_desc = data.findtext(dc + 'description', '')
        title = hparse.unescape(raw_title).encode('utf-8')
        desc = hparse.unescape(clean_desc(raw_desc)).encode('utf-8')
        defs.append(title + ' :: ' + desc)
    return (num_records, defs)
def clean_desc(desc):
    """Reduce the markup in a description string to plain-text markers.

    Opening and closing display and span tags are removed; each em tag is
    replaced by '/' and each strong tag by '*'.

    Parameters:
        desc: the description text, or None (search() passes an element's
            .text, which is None when the element has no text).

    Returns:
        The cleaned string; '' when desc is None.
    """
    if desc is None:
        # The regex substitutions below raise TypeError on None.
        return ''
    # Applied in this order: removals first, then the marker substitutions.
    substitutions = (
        (rm_disp, ''),
        (rm_span, ''),
        (sub_em, '/'),
        (sub_strong, '*'),
    )
    for pattern, replacement in substitutions:
        desc = pattern.sub(replacement, desc)
    return desc
def test():
    """Run a sample search for 'troglodyte' and print what comes back.

    Prints the record count followed by one line per definition. When
    search() reports an error it returns None (having already printed the
    reason), and this function then prints nothing further.
    """
    query = 'troglodyte'
    result = search(query)
    if result is None:
        # Unpacking None would raise TypeError; search() already explained why.
        return
    num, defs = result
    print(str(num) + ' record(s) found.')
    for defn in defs:
        print(defn)
# Run the sample lookup only when this file is executed as a script.
if __name__ == '__main__':
    test()