# Code for tagging temporal expressions in text
# For details of the TIMEX format, see http://timex2.mitre.org/
# Converted to Python3 by Brian Hockenmaier in 2019
#
# Provenance: this is the content of nltk_contrib/timex3.py as produced by a
# two-commit patch series by Brian Hockenmaier dated 2019-12-26:
#   [PATCH 1/2] duplicated timex file to create the python3 version
#   [PATCH 2/2] converted to python3 using timedelta feature and removed
#               dependency on eGenix.com mx Base code
# The patch text had lost its line structure and its <TIMEX2> markup; both
# are restored here and the defects left by the conversion are fixed.

import re
import string
import os
import sys
from datetime import datetime, timedelta

# Python3 version no longer requires eGenix.com mx Base Distribution
# http://www.egenix.com/products/python/mxBase/

# Predefined strings.
# Longer number words come first so that e.g. "fourteen" is not cut short by
# the "four" alternative when nothing follows to force backtracking.  The
# pieces are joined by implicit concatenation: a backslash continuation inside
# a single literal would embed the next line's indentation in the pattern.
numbers = (r"(\ba\b|eleven|twelve|thirteen|fourteen|fifteen|sixteen|"
           r"seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|"
           r"seventy|eighty|ninety|hundred|thousand|one|two|three|four|five|"
           r"six|seven|eight|nine|ten)")
day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
week_day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
month = ("(january|february|march|april|may|june|july|august|september|"
         "october|november|december)")
dmy = "(year|day|week|month)"
rel_day = "(today|yesterday|tomorrow|tonight|tonite)"
exp1 = "(before|after|earlier|later|ago)"
exp2 = "(this|next|last)"
iso = r"\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+"
# (?!\d) keeps the first four digits of a longer number from being a "year".
year = r"((?<=\s)\d{4}(?!\d)|^\d{4}(?!\d))"
regxp1 = r"((\d+|(" + numbers + r"[-\s]?)+) " + dmy + "s? " + exp1 + ")"
regxp2 = "(" + exp2 + " (" + dmy + "|" + week_day + "|" + month + "))"

reg1 = re.compile(regxp1, re.IGNORECASE)
reg2 = re.compile(regxp2, re.IGNORECASE)
reg3 = re.compile(rel_day, re.IGNORECASE)
reg4 = re.compile(iso)
reg5 = re.compile(year)

# Spans that already carry TIMEX2 markup (with or without a val attribute).
_TAGGED = re.compile(r'<TIMEX2\b[^>]*>.*?</TIMEX2>', re.DOTALL)

# Expressions tagged by tag() but not yet grounded by ground().
_UNGROUNDED = re.compile(r'<TIMEX2>(.*?)</TIMEX2>', re.DOTALL)

# "<count> <unit>[s] <direction>", e.g. "14 days ago".
_OFFSET = re.compile(
    r'(\d+) (day|week|month|year)s? (ago|earlier|before|later|after)',
    re.IGNORECASE)

# Signed step, in units, for each determiner accepted by exp2.
_SHIFT = {'last': -1, 'this': 0, 'next': 1}


def tag(text):
    """Wrap every temporal expression found in *text* in <TIMEX2> tags.

    The five module-level patterns are tried in order: counted offsets
    ("two days ago"), this/next/last expressions, relative days, ISO-like
    timestamps and four-digit years.  Spans that are already tagged are left
    alone, so calling tag() on its own output changes nothing.  When two
    matches overlap, the leftmost wins; on a tie the earlier pattern wins.

    Returns the tagged text.
    """
    # (start, priority, end, is_new); existing markup gets top priority so
    # nothing is ever tagged inside it.
    candidates = [(m.start(), -1, m.end(), False)
                  for m in _TAGGED.finditer(text)]
    for priority, pattern in enumerate((reg1, reg2, reg3, reg4, reg5)):
        for m in pattern.finditer(text):
            candidates.append((m.start(), priority, m.end(), True))
    candidates.sort()

    pieces = []
    cursor = 0
    for start, _priority, end, is_new in candidates:
        if start < cursor:
            # Overlaps something already emitted.
            continue
        if is_new:
            pieces.append(text[cursor:start])
            pieces.append('<TIMEX2>' + text[start:end] + '</TIMEX2>')
        else:
            pieces.append(text[cursor:end])
        cursor = end
    pieces.append(text[cursor:])
    return ''.join(pieces)


# Hash function for week days to simplify the grounding task.
# [Mon..Sun] -> [0..6]
hashweekdays = {
    'monday': 0,
    'tuesday': 1,
    'wednesday': 2,
    'thursday': 3,
    'friday': 4,
    'saturday': 5,
    'sunday': 6}

# Hash function for months to simplify the grounding task.
# [Jan..Dec] -> [1..12]
hashmonths = {
    'january': 1,
    'february': 2,
    'march': 3,
    'april': 4,
    'may': 5,
    'june': 6,
    'july': 7,
    'august': 8,
    'september': 9,
    'october': 10,
    'november': 11,
    'december': 12}

# Value of every number word accepted by the `numbers` pattern.
_NUMBER_VALUES = {
    'a': 1, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6,
    'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12,
    'thirteen': 13, 'fourteen': 14, 'fifteen': 15, 'sixteen': 16,
    'seventeen': 17, 'eighteen': 18, 'nineteen': 19, 'twenty': 20,
    'thirty': 30, 'forty': 40, 'fifty': 50, 'sixty': 60, 'seventy': 70,
    'eighty': 80, 'ninety': 90, 'hundred': 100, 'thousand': 1000}


# Hash number in words into the corresponding integer value
def hashnum(number):
    """Return the integer named by the leading word of *number*.

    The leading alphabetic word is looked up as a whole, case-insensitively,
    so "fourteen" is 14 rather than the 4 a prefix match on "four" would give.
    Returns None when the word is not a known number word.
    """
    leading = re.match(r'[A-Za-z]+', number)
    if leading is None:
        return None
    return _NUMBER_VALUES.get(leading.group(0).lower())


def _words_to_int(value):
    """Evaluate a run of number words such as "two hundred" or "twenty five"."""
    total = 0
    current = 0
    for word in re.findall(numbers, value, re.IGNORECASE):
        amount = hashnum(word)
        if amount is None:
            continue
        if amount == 100:
            # A bare "hundred" counts as one hundred.
            current = max(current, 1) * 100
        elif amount == 1000:
            total += max(current, 1) * 1000
            current = 0
        else:
            current += amount
    return total + current


def _digits_for_number_words(timex):
    """Rewrite e.g. "twenty five days ago" as "25 days ago"."""
    if not re.search(numbers, timex, re.IGNORECASE):
        return timex
    # maxsplit/flags must be keywords: passed positionally, re.IGNORECASE
    # would be taken as maxsplit.
    parts = re.split(r'\s(?=days?|months?|years?|weeks?)', timex,
                     maxsplit=1, flags=re.IGNORECASE)
    if len(parts) != 2:
        return timex
    value, unit = parts
    return str(_words_to_int(value)) + ' ' + unit


def _iso_week(moment):
    """Format *moment* as '<ISO year>W<ISO week>'."""
    # Year and week both come from isocalendar(): around New Year the calendar
    # year and the ISO year differ (2019-12-30 is week 1 of 2020).
    iso_year, iso_week_number = moment.isocalendar()[:2]
    return str(iso_year) + 'W' + str(iso_week_number)


def _shift_month(base_date, months):
    """Format the month *months* away from base_date as '<year>-<month>'."""
    total = base_date.year * 12 + (base_date.month - 1) + months
    new_year, zero_based_month = divmod(total, 12)
    return str(new_year) + '-' + str(zero_based_month + 1)


def _ground_determiner(which, unit, base_date):
    """Ground a "<last|this|next> <unit>" expression (both lower-cased)."""
    shift = _SHIFT[which]
    if unit in hashweekdays:
        # Weeks run Monday..Sunday: step to the Monday of the base week, then
        # to the requested weekday of the target week.
        monday = base_date - timedelta(days=base_date.weekday())
        return str(monday + timedelta(weeks=shift, days=hashweekdays[unit]))
    if unit == 'week':
        return _iso_week(base_date + timedelta(weeks=shift))
    if unit in hashmonths:
        # A named month refers to that month of the previous/current/next year.
        return str(base_date.year + shift) + '-' + str(hashmonths[unit])
    if unit == 'month':
        return _shift_month(base_date, shift)
    if unit == 'year':
        return str(base_date.year + shift)
    # e.g. "next day": tagged by reg2 but never had a grounding rule.
    return 'UNKNOWN'


def _ground_offset(count, unit, sign, base_date):
    """Ground a "<count> <unit>s ago/later" expression."""
    if unit == 'day':
        return str(base_date + timedelta(days=sign * count))
    if unit == 'week':
        return _iso_week(base_date + timedelta(weeks=sign * count))
    if unit == 'month':
        return _shift_month(base_date, sign * count)
    return str(base_date.year + sign * count)


def _ground_timex(timex, base_date):
    """Return the grounded value of one timex string, or 'UNKNOWN'."""
    # If numbers are given in words, hash them into corresponding numbers.
    # eg. twenty five days ago --> 25 days ago
    timex = _digits_for_number_words(timex)
    timex_val = 'UNKNOWN'  # Default value

    determiner = reg2.match(timex)
    offset = _OFFSET.match(timex)

    # If timex matches ISO format, remove 'time' and reorder 'date'
    if reg4.match(timex):
        # NOTE(review): the fields are reversed, i.e. day-first input is
        # assumed; a year-first date would come out back to front -- confirm
        # the intended input format.
        fields = re.split(r'/|-', re.split(r'\s', timex)[0])
        timex_val = fields[2] + '-' + fields[1] + '-' + fields[0]

    # Specific dates.  fullmatch, so "1000 years ago" is not taken as a year.
    elif re.fullmatch(r'\d{4}', timex):
        timex_val = timex

    # Relative dates
    elif re.match(r'tonight|tonite|today', timex, re.IGNORECASE):
        timex_val = str(base_date)
    elif re.match(r'yesterday', timex, re.IGNORECASE):
        timex_val = str(base_date + timedelta(days=-1))
    elif re.match(r'tomorrow', timex, re.IGNORECASE):
        timex_val = str(base_date + timedelta(days=+1))

    # last/this/next weekday, week, named month, month or year.
    elif determiner:
        # Lower-cased because the lookup tables use lower-case keys while the
        # patterns match case-insensitively.
        timex_val = _ground_determiner(determiner.group(2).lower(),
                                       determiner.group(3).lower(),
                                       base_date)

    # N days/weeks/months/years ago or later.
    elif offset:
        backwards = offset.group(3).lower() in ('ago', 'earlier', 'before')
        timex_val = _ground_offset(int(offset.group(1)),
                                   offset.group(2).lower(),
                                   -1 if backwards else 1,
                                   base_date)

    # Remove 'time' from timex_val.
    # For example, If timex_val = 2000-02-20 12:23:34.45, then
    # timex_val = 2000-02-20
    return re.sub(r'\s.*', '', timex_val)


# Given a timex_tagged_text and a Date object set to base_date,
# returns timex_grounded_text
def ground(tagged_text, base_date):
    """Add a val attribute to every <TIMEX2> expression in *tagged_text*.

    tagged_text -- text as returned by tag().
    base_date   -- datetime.date or datetime.datetime that relative
                   expressions ("yesterday", "next Friday") are resolved
                   against.

    Returns a (grounded_text, timex_list) tuple; timex_list holds one
    {"text": ..., "value": ...} dict per grounded expression, in text order.
    Expressions without a grounding rule get the value 'UNKNOWN'.
    """
    timexList = []

    def _grounded(match):
        timex_ori = match.group(1)
        timex_val = _ground_timex(timex_ori, base_date)
        timexList.append({
            "text": timex_ori,
            "value": timex_val
        })
        # Substitute tag+timex in the text with grounded tag+timex.  Built by
        # concatenation so nothing in the timex is read as a regex or a
        # replacement escape.
        return '<TIMEX2 val="' + timex_val + '">' + timex_ori + '</TIMEX2>'

    tagged_text = _UNGROUNDED.sub(_grounded, tagged_text)
    return tagged_text, timexList

####

def demo():
    """Tag the first 10000 characters of the NLTK 'abc' rural corpus."""
    import nltk
    text = nltk.corpus.abc.raw('rural.txt')[:10000]
    print(tag(text))

if __name__ == '__main__':
    demo()