Skip to content

Commit

Permalink
Organized folders, added some starter files, changed naming conventio…
Browse files Browse the repository at this point in the history
…n for lowest level of definition data structure from 'defs' to 'senses'
  • Loading branch information
itincknell committed Nov 21, 2023
1 parent a807405 commit 3d63557
Show file tree
Hide file tree
Showing 33 changed files with 337,397 additions and 338 deletions.
Binary file added .DS_Store
Binary file not shown.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ Definitions are made of standard python data structures
"partOfSpeech": string "verb", "noun" etc.,
"principleParts": string representing principal parts,
"simpleParts": simplified version of principal parts supported for Latin,
"defs": [list of def objects, typically displayed as an ordered list],
"senses": [list of word 'sense' objects, typically displayed as an ordered list],
"etymology": string containing etymology information
}</p>
<p>Defs:
<p>Senses:
{
"gloss": string containing a word sense you would find in a typical dictionary,
"tags": tags related to a specific word sense such as "Pre-classical" or "transitive"
Expand Down
Binary file added dumps_sorted/.DS_Store
Binary file not shown.
Binary file added dumps_unsorted/.DS_Store
Binary file not shown.
4 changes: 2 additions & 2 deletions src/create_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def create_word(current_dict,tags):
new_word['entries'][0], dummy = edit_entry.edit_entry(new_word['entries'][0],new_word)

# call word options, from here return
load_dict.change_path('dumps sorted')
load_dict.change_path('dumps_sorted')
if current_dict['language'] == 'Latin' or current_dict['language'] == "Ancient Greek":
wiki_dump = parser_shell.load_big_language(new_word['heading'][0],current_dict['language'])
else:
Expand Down Expand Up @@ -149,7 +149,7 @@ def create_entry(new_word):
else:
user_input['tags'].append(new_tag)
# assign to entry definitions list
entry['defs'] = [user_input]
entry['senses'] = [user_input]

# append new entry to new word, return True
new_word['entries'].append(entry)
Expand Down
11 changes: 11 additions & 0 deletions src/dict_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@


# Print Progress
def printpr(counter: int, modulo: int = 10000) -> None:
    """Print a lightweight progress indicator for a long-running loop.

    Intended to be called once per processed item with a running count.

    Args:
        counter: Running count of items processed so far.
        modulo: Interval between progress dots. A summary line is printed
            every ``modulo * 100`` items.

    Output (written to stdout and flushed immediately so it shows up while
    the loop is still running):
        * a "." with no trailing newline whenever ``counter`` is an exact
          multiple of ``modulo``;
        * `` {counter:,} lines parsed`` followed by a newline whenever
          ``counter`` is an exact multiple of ``modulo * 100``.

    Raises:
        ZeroDivisionError: if ``modulo`` is 0.
    """
    # One dot per `modulo` items, kept on the same line.
    if counter % modulo == 0:
        print(".", end='', flush=True)

    # Every 100 dots, close the line with a thousands-separated running total.
    if counter % (modulo * 100) == 0:
        print(f' {counter:,} lines parsed', flush=True)



107 changes: 55 additions & 52 deletions src/dictionary_LSJ.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@ def smart_join(text):
return s.replace(" .",".").replace(" ,",",").replace(" :",":")


def configure_parts(defs):
def configure_parts(senses):
count = 0

for i in defs[0]['gloss']:
for i in senses[0]['gloss']:
if i == "(":
count += 1

Expand All @@ -58,35 +58,35 @@ def configure_parts(defs):

if count < 0:
# Error too many )s, unbalanced parens
print(defs[0]['gloss'])
print(defs[1]['gloss'])
print(senses[0]['gloss'])
print(senses[1]['gloss'])
break

if count != 0:
parens = 0

for i in range(len(defs[1]['gloss'])):
if defs[1]['gloss'][i] == ")":
for i in range(len(senses[1]['gloss'])):
if senses[1]['gloss'][i] == ")":
parens += 1

if defs[1]['gloss'][i] == "(":
if senses[1]['gloss'][i] == "(":
parens -= 1

if parens == count:
break

if i < len(defs[1]['gloss']) - 1:
defs[0]['gloss'] = smart_join([defs[0]['gloss'],defs[1]['gloss'][: i + 1]])
defs[1]['gloss'] = defs[1]['gloss'][i + 1 :]
if i < len(senses[1]['gloss']) - 1:
senses[0]['gloss'] = smart_join([senses[0]['gloss'],senses[1]['gloss'][: i + 1]])
senses[1]['gloss'] = senses[1]['gloss'][i + 1 :]

for i in range(len(defs[1]['gloss'])):
if defs[1]['gloss'][i].isalpha() or defs[1]['gloss'][i] == "=":
for i in range(len(senses[1]['gloss'])):
if senses[1]['gloss'][i].isalpha() or senses[1]['gloss'][i] == "=":
break
defs[1]['gloss'] = defs[1]['gloss'][i:]
senses[1]['gloss'] = senses[1]['gloss'][i:]
else:
defs[0]['gloss'] += ")"
senses[0]['gloss'] += ")"

return defs
return senses


def process_entry(text):
Expand All @@ -95,7 +95,7 @@ def process_entry(text):
'tags':set(),
'entries':[]}

entry = {'defs':[],
entry = {'senses':[],
'partOfSpeech':'',
'principleParts':'',
'simpleParts':'',
Expand All @@ -108,17 +108,17 @@ def process_entry(text):
print("@"*5000 + f"\nheading \"{definition['heading']}\"")

while text != "":
text, defs = get_def(text)
text, senses = get_def(text)

for x in defs:
for x in senses:
if debug_print:
print(f"definition: {x['gloss']}")
entry['defs'].append(deepcopy(x))
entry['senses'].append(deepcopy(x))

if len(entry['defs']) > 1:
entry['defs'] = configure_parts(entry['defs'])
entry['simpleParts'] = entry["principleParts"] = entry['defs'][0]['gloss']
entry['defs'].pop(0)
if len(entry['senses']) > 1:
entry['senses'] = configure_parts(entry['senses'])
entry['simpleParts'] = entry["principleParts"] = entry['senses'][0]['gloss']
entry['senses'].pop(0)
else:
entry['simpleParts'] = entry["principleParts"] = definition['heading']

Expand All @@ -129,7 +129,7 @@ def process_entry(text):
def get_def(text):
m = 0
gloss = []
defs = []
senses = []
greek = False
candidate_tag = ''
while True:
Expand All @@ -147,7 +147,7 @@ def get_def(text):
if "<sense" in brac:
gloss = smart_join(gloss).strip(",. ")
if gloss != "":
defs.append({'gloss':gloss,'tags':[]})
senses.append({'gloss':gloss,'tags':[]})
gloss = []

if pull != "":
Expand All @@ -156,10 +156,10 @@ def get_def(text):
if brac == "</sense>" or text == "":
gloss = smart_join(gloss).strip(",. ")
if gloss != "":
defs.append({'gloss':gloss,'tags':[]})
senses.append({'gloss':gloss,'tags':[]})
break

return text, defs
return text, senses

def extract_dictionary(perseus, dictionary):
line_list = []
Expand Down Expand Up @@ -187,32 +187,35 @@ def LSJ(new_dictionary):
dictionary = {'file':'','definitions':[],"language":''}

for i in range(1,28):
with open('grc.lsj.perseus-eng' + str(i) + '.txt','r') as f:# ####
if progress_print:
print(f"Parsing '{'grc.lsj.perseus-eng' + str(i) + '.txt:' + chr(39):<28}",end='',flush=True)


line_list = []
ignition = False
counter = 0
for line in f.readlines():
if "<entryFree" in line:
ignition = True

if ignition:
line_list.append(line.strip(" \n\t"))

if "</entryFree" in line and ignition:
line_list.append(line.strip())
line_list = "".join(line_list)
dictionary['definitions'].append(process_entry(line_list))
ignition = False
line_list = []
counter += 1
try:
with open('grc.lsj.perseus-eng' + str(i) + '.txt','r') as f:# ####
if progress_print:
printpr(counter)

print(f' {counter:,} lines parsed',flush=True)
print(f"Parsing '{'grc.lsj.perseus-eng' + str(i) + '.txt:' + chr(39):<28}",end='',flush=True)


line_list = []
ignition = False
counter = 0
for line in f.readlines():
if "<entryFree" in line:
ignition = True

if ignition:
line_list.append(line.strip(" \n\t"))

if "</entryFree" in line and ignition:
line_list.append(line.strip())
line_list = "".join(line_list)
dictionary['definitions'].append(process_entry(line_list))
ignition = False
line_list = []
counter += 1
if progress_print:
printpr(counter)

print(f' {counter:,} lines parsed',flush=True)
except FileNotFoundError:
print(f"'{'grc.lsj.perseus-eng' + str(i) + '.txt'} not found in 'texts' directory")

new_dictionary['definitions'].extend(dictionary['definitions'])
return new_dictionary
Expand Down
24 changes: 12 additions & 12 deletions src/dictionary_Lewis.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,13 @@ def get_entry(i,debug_print):
entry['partOfSpeech'] = i['part_of_speech']
if not isinstance(i['main_notes'],str):
for item in i['main_notes']:
entry['defs'].append({'gloss':item,'tags':[]})
entry['senses'].append({'gloss':item,'tags':[]})
if debug_print:
print("\n\n\n\t\t\t" + "*"*1000 + f"\n\nLINE: {current_line_number()}")
print(f"entry['defs'] = {entry['defs']}")
print(f"entry['senses'] = {entry['senses']}")
exit()
else:
entry['defs'] = [{'gloss':i['main_notes'],'tags':[]}]
entry['senses'] = [{'gloss':i['main_notes'],'tags':[]}]

if 'alternative_orthography' in i:
entry['principleParts'] = i['alternative_orthography']
Expand Down Expand Up @@ -97,13 +97,13 @@ def get_senses(entry,sense):
for item in sense:
get_senses(entry,item)
else:
entry['defs'].append({'gloss':sense,'tags':[]})
entry['senses'].append({'gloss':sense,'tags':[]})
get_senses(entry,sense)

if debug_print:
print("\n\n\n\t\t\t" + "*"*1000 + f"\n\nLINE: {current_line_number()}")
print(f"entry with senses = ")
for x, item in enumerate(entry['defs']):
for x, item in enumerate(entry['senses']):
print(f"{x+1} tags: {item['tags']} gloss: {item['gloss']}")
print(f"senses limit reached ")
if counter_senses > 5:
Expand All @@ -117,7 +117,7 @@ def get_senses(entry,sense):
print("\n\n\n\t\t\t" + "*"*1000 + f"\n\nLINE: {current_line_number()}")
print(f"['greek_word'] == {i['greek_word']}")

entry['defs'].append({'gloss':i['greek_word'],'tags':['greek_word']})
entry['senses'].append({'gloss':i['greek_word'],'tags':['greek_word']})
print(f"entry == {entry}")
if counter_grk > 1:
print(f"Greek Word Break")
Expand All @@ -136,20 +136,20 @@ def get_senses(entry,sense):
#exit()
if 'principleParts' not in entry:

defs = entry['defs'][0]['gloss']
defs = defs.split()
senses = entry['senses'][0]['gloss']
senses = senses.split()
gloss = ''
tag = ''
flag = False
for x, q in enumerate(defs):
for x, q in enumerate(senses):
if debug_print:
print("\n\n\n\t\t\t" + "*"*1000 + f"\n\nLINE: {current_line_number()}")
print(f"defs = {defs}")
print(f"senses = {senses}")
print(f"q = {q}")
print(f"q[-1] = {q[-1]}")
if (x and q[0].isupper()) or q[0] == '(' or q[0].isnumeric():
break
if ((q[-1] == ',' and not flag) or (x == len(defs)-1 and not flag) or x == 0) and not (q[-1] == '.'):
if ((q[-1] == ',' and not flag) or (x == len(senses)-1 and not flag) or x == 0) and not (q[-1] == '.'):
gloss += q.strip(":") + ' '
else:
tag += q + ' '
Expand All @@ -176,7 +176,7 @@ def get_senses(entry,sense):
print(f"final {entry['principleParts']}",flush=True)
exit()
else:
for x in entry['defs']:
for x in entry['senses']:
if isinstance(x['gloss'],list):
print(f"Gloss = List {x['gloss']}",flush=True)
print(f"i = {i}")
Expand Down
Loading

0 comments on commit 3d63557

Please sign in to comment.