-
Notifications
You must be signed in to change notification settings - Fork 136
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Python 3 version of timex library #26
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,366 @@ | ||
# Code for tagging temporal expressions in text | ||
# For details of the TIMEX format, see http://timex2.mitre.org/ | ||
# Converted to Python3 by Brian Hockenmaier in 2019 | ||
|
||
import re | ||
import string | ||
import os | ||
import sys | ||
from datetime import datetime, timedelta | ||
|
||
# Python3 version no longer requires eGenix.com mx Base Distribution | ||
# http://www.egenix.com/products/python/mxBase/ | ||
|
||
# Predefined strings. | ||
# Pattern fragments used to build the temporal-expression regexes.
#
# The fragments are assembled with implicit string concatenation instead of
# backslash continuations *inside* the literal: a continuation inside a string
# keeps the whitespace around the line break, which silently turned
# alternatives such as "eleven" or "october" into " eleven" / " october".
#
# In `numbers` the longer words are listed before the words they start with
# ("sixteen"/"sixty" before "six") so that a plain findall() picks the whole
# word rather than its prefix.
numbers = (r"(^a(?=\s)|"
           r"eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|"
           r"eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|"
           r"ninety|hundred|thousand|"
           r"one|two|three|four|five|six|seven|eight|nine|ten)")
day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
week_day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
month = ("(january|february|march|april|may|june|july|august|september|"
         "october|november|december)")
dmy = "(year|day|week|month)"
rel_day = "(today|yesterday|tomorrow|tonight|tonite)"
exp1 = "(before|after|earlier|later|ago)"
exp2 = "(this|next|last)"
iso = r"\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+"
year = r"((?<=\s)\d{4}|^\d{4})"
# "<number> <unit>(s) <before|after|...>", e.g. "twenty five days ago".
regxp1 = r"((\d+|(" + numbers + r"[-\s]?)+) " + dmy + "s? " + exp1 + ")"
# "<this|next|last> <unit|weekday|month>", e.g. "next october".
regxp2 = "(" + exp2 + " (" + dmy + "|" + week_day + "|" + month + "))"

reg1 = re.compile(regxp1, re.IGNORECASE)
reg2 = re.compile(regxp2, re.IGNORECASE)
reg3 = re.compile(rel_day, re.IGNORECASE)
reg4 = re.compile(iso)
reg5 = re.compile(year)
|
||
def tag(text):
    """Wrap every temporal expression found in *text* in <TIMEX2> tags.

    The module-level patterns reg1..reg5 are applied in turn (counted
    expressions such as "3 days ago", "this/next/last ..." phrases, relative
    days, ISO timestamps and bare four-digit years).  Returns the text with
    each match rewritten as ``<TIMEX2>match</TIMEX2>``.
    """
    timex_found = []

    # reg1 and reg2 contain nested groups, so findall() yields tuples; the
    # first element is the full matching string.
    for pattern in (reg1, reg2):
        timex_found.extend(
            groups[0] for groups in pattern.findall(text) if len(groups) > 1)

    # reg3 (today, tomorrow, ...), reg4 (ISO) and reg5 (year) yield plain
    # strings.
    for pattern in (reg3, reg4, reg5):
        timex_found.extend(pattern.findall(text))

    # Tag only temporal expressions which haven't been tagged.  A repeated
    # expression is handled by the first substitution (re.sub replaces every
    # occurrence), so duplicates are skipped.
    seen = set()
    for timex in timex_found:
        if timex in seen:
            continue
        seen.add(timex)
        # Escape the match so that characters such as the '.' in an ISO
        # timestamp are treated literally rather than as regex operators.
        text = re.sub(re.escape(timex) + '(?!</TIMEX2>)',
                      lambda found: '<TIMEX2>' + found.group(0) + '</TIMEX2>',
                      text)

    return text
|
||
# Lookup table from lower-case weekday name to its index, used to simplify
# the grounding task.
# [Mon..Sun] -> [0..6]
hashweekdays = {
    name: index
    for index, name in enumerate((
        'monday',
        'tuesday',
        'wednesday',
        'thursday',
        'friday',
        'saturday',
        'sunday',
    ))
}
|
||
# Lookup table from lower-case month name to its number, used to simplify
# the grounding task.
# [Jan..Dec] -> [1..12]
hashmonths = {
    name: number
    for number, name in enumerate((
        'january',
        'february',
        'march',
        'april',
        'may',
        'june',
        'july',
        'august',
        'september',
        'october',
        'november',
        'december',
    ), start=1)
}
|
||
# Hash number in words into the corresponding integer value | ||
# Value of every number word understood by hashnum().
_NUMBER_WORD_VALUES = {
    'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18,
    'nineteen': 19, 'twenty': 20, 'thirty': 30, 'forty': 40,
    'fifty': 50, 'sixty': 60, 'seventy': 70, 'eighty': 80,
    'ninety': 90, 'hundred': 100, 'thousand': 1000,
}

# Longest words first, so that "sixteen" and "sixty" are tried before their
# prefix "six" (likewise four/seven/eight/nine).  The trailing alternative
# accepts the article "a" as a stand-alone word.
_NUMBER_WORD_RE = re.compile(
    '(' + '|'.join(sorted(_NUMBER_WORD_VALUES, key=len, reverse=True))
    + r')|a\b',
    re.IGNORECASE)


def hashnum(number):
    """Return the integer value of the number word that *number* starts with.

    Matching is case-insensitive and anchored at the start of the string.
    The article "a" (as a whole word) counts as 1.  Returns None when
    *number* does not start with a known number word.
    """
    found = _NUMBER_WORD_RE.match(number)
    if found is None:
        return None
    word = found.group(1)
    if word is None:
        # Only the "a" alternative matched.
        return 1
    return _NUMBER_WORD_VALUES[word.lower()]
|
||
# Given a timex_tagged_text and a Date object set to base_date, | ||
# returns timex_grounded_text | ||
def _timex_iso_week(day):
    """Return the ``<year>W<week>`` label of the ISO week containing *day*.

    The ISO year is used so that year and week number always agree; around
    New Year the ISO week can belong to a different year than the calendar
    date.
    """
    iso_year, iso_week = day.isocalendar()[:2]
    return str(iso_year) + 'W' + str(iso_week)


def _timex_shift_month(day, months):
    """Return ``<year>-<month>`` for the month *months* months away from *day*."""
    # Count months from year 0 so that crossing a year boundary in either
    # direction is plain integer arithmetic.
    total = day.year * 12 + (day.month - 1) + months
    return str(total // 12) + '-' + str(total % 12 + 1)


def ground(tagged_text, base_date):
    """Resolve the <TIMEX2> expressions in *tagged_text* against *base_date*.

    tagged_text -- text containing ``<TIMEX2>...</TIMEX2>`` tags.
    base_date   -- the reference date; it must provide ``year``, ``month``,
                   ``weekday()`` and ``isocalendar()`` and support adding a
                   ``timedelta``.

    Returns a tuple ``(grounded_text, timex_list)``:
    grounded_text -- the text with every tag rewritten as
                     ``<TIMEX2 val="...">...</TIMEX2>``; the value is
                     'UNKNOWN' when the expression cannot be resolved.
    timex_list    -- one ``{"text": ..., "value": ...}`` dict per tagged
                     expression, so callers can get the timex values as a
                     list in addition to the tagged format.
    """
    # All patterns below are local names: an earlier version overwrote the
    # module-level `month` pattern here, which broke subsequent calls.

    # Find all identified timex and strip their tags.
    timex_regex = re.compile(r'<TIMEX2>.*?</TIMEX2>', re.DOTALL)
    timex_found = [re.sub(r'</?TIMEX2.*?>', '', raw)
                   for raw in timex_regex.findall(tagged_text)]
    timexList = []

    relative_shift = {'last': -1, 'this': 0, 'next': 1}
    past_words = ('ago', 'earlier', 'before')

    for timex_ori in timex_found:
        timex = timex_ori       # Working copy; timex_ori is kept for output
        timex_val = 'UNKNOWN'   # Default value

        # If numbers are given in words, hash them into corresponding numbers.
        # eg. twenty five days ago --> 25 days ago
        if re.search(numbers, timex, re.IGNORECASE):
            split_timex = re.split(r'\s(?=days?|months?|years?|weeks?)',
                                   timex, maxsplit=1, flags=re.IGNORECASE)
            if len(split_timex) == 2:
                value, unit = split_timex
                # The trailing space lets a lone "a" satisfy the
                # "a followed by whitespace" alternative of `numbers`.
                words = re.findall(numbers, value + ' ', re.IGNORECASE)
                amounts = [hashnum(word) for word in words]
                amounts = [amount for amount in amounts if amount is not None]
                if amounts:
                    timex = str(sum(amounts)) + ' ' + unit

        relative = re.match(r'(last|this|next) (\w+)', timex, re.IGNORECASE)
        counted = re.match(
            r'(\d+) (day|week|month|year)s? (ago|earlier|before|later|after)',
            timex, re.IGNORECASE)

        # If timex matches ISO format, remove 'time' and reorder 'date'
        if re.match(r'\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+', timex):
            date_parts = re.split(r'/|-', re.split(r'\s', timex)[0])
            timex_val = date_parts[2] + '-' + date_parts[1] + '-' + date_parts[0]

        # Specific dates: a bare four-digit year.  The pattern is anchored at
        # the end so "1000 days ago" is not mistaken for the year 1000.
        elif re.match(r'\d{4}$', timex):
            timex_val = str(timex)

        # Relative dates
        elif re.match(r'tonight|tonite|today', timex, re.IGNORECASE):
            timex_val = str(base_date)
        elif re.match(r'yesterday', timex, re.IGNORECASE):
            timex_val = str(base_date + timedelta(days=-1))
        elif re.match(r'tomorrow', timex, re.IGNORECASE):
            timex_val = str(base_date + timedelta(days=+1))

        # last/this/next <weekday | week | month name | month | year>
        elif relative is not None:
            shift = relative_shift[relative.group(1).lower()]
            target = relative.group(2).lower()
            if target in hashweekdays:
                # Weekday in the previous/current/following week, counted
                # from the Monday of the week that contains base_date.
                monday_of_base_week = \
                    base_date - timedelta(days=base_date.weekday())
                timex_val = str(monday_of_base_week + timedelta(
                    weeks=shift, days=hashweekdays[target]))
            elif target == 'week':
                timex_val = _timex_iso_week(base_date + timedelta(weeks=shift))
            elif target in hashmonths:
                # Named month in the previous/current/following year.
                timex_val = (str(base_date.year + shift) + '-'
                             + str(hashmonths[target]))
            elif target == 'month':
                timex_val = _timex_shift_month(base_date, shift)
            elif target == 'year':
                timex_val = str(base_date.year + shift)

        # <N> days/weeks/months/years ago|earlier|before|later|after
        elif counted is not None:
            offset = int(counted.group(1))
            unit = counted.group(2).lower()
            if counted.group(3).lower() in past_words:
                offset = -offset
            if unit == 'day':
                timex_val = str(base_date + timedelta(days=offset))
            elif unit == 'week':
                timex_val = _timex_iso_week(base_date + timedelta(weeks=offset))
            elif unit == 'month':
                timex_val = _timex_shift_month(base_date, offset)
            else:
                timex_val = str(base_date.year + offset)

        # Remove 'time' from timex_val.
        # For example, If timex_val = 2000-02-20 12:23:34.45, then
        # timex_val = 2000-02-20
        timex_val = re.sub(r'\s.*', '', timex_val)

        # Substitute tag+timex in the text with grounded tag+timex.  A literal
        # replace is used so characters in the timex are never interpreted as
        # regex operators.
        tagged_text = tagged_text.replace(
            '<TIMEX2>' + timex_ori + '</TIMEX2>',
            '<TIMEX2 val="' + timex_val + '">' + timex_ori + '</TIMEX2>')

        timexList.append({
            "text": timex_ori,
            "value": timex_val
        })

    return tagged_text, timexList
|
||
#### | ||
|
||
def demo():
    """Tag the first 10000 characters of an NLTK sample text and print them."""
    import nltk
    sample = nltk.corpus.abc.raw('rural.txt')
    tagged = tag(sample[:10000])
    print(tagged)


if __name__ == '__main__':
    demo()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new timedelta features of python3 allow us to remove the dependency on mx.DateTime