Organized folders, added some starter files, changed naming convention for lowest level of definition data structure from 'defs' to 'senses'
itincknell · Nov 21, 2023 · 3d63557 · 3d63557
1 parent a807405
commit 3d63557
Show file tree

Hide file tree

Showing 33 changed files with 337,397 additions and 338 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/README.md b/README.md
@@ -30,10 +30,10 @@ Definitions are made of standard python data structures
 "partOfSpeech": string "verb", "noun" etc.,
 "principleParts": string representing principle parts,
 "simpleParts": simplified version of principle parts supported for Latin,
-"defs": [list of def objects, typically displayed as an ordered list],
+"senses": [list of word 'sense' objects, typically displayed as an ordered list],
 "etymology": string containing etymology information
 }</p>
-<p>Defs:
+<p>Senses:
 {
 "gloss": string containing a word sense you would find in a typical dictionary,
 "tags": tags related to a specific word sense such as "Pre-classical" or "transitive"

diff --git a/dumps_sorted/.DS_Store b/dumps_sorted/.DS_Store
diff --git a/dumps_unsorted/.DS_Store b/dumps_unsorted/.DS_Store
diff --git a/src/create_word.py b/src/create_word.py
@@ -49,7 +49,7 @@ def create_word(current_dict,tags):
 	new_word['entries'][0], dummy = edit_entry.edit_entry(new_word['entries'][0],new_word)
 
 	# call word options, from here return
-	load_dict.change_path('dumps sorted')
+	load_dict.change_path('dumps_sorted')
 	if current_dict['language'] == 'Latin' or current_dict['language'] == "Ancient Greek":
 		wiki_dump = parser_shell.load_big_language(new_word['heading'][0],current_dict['language'])
 	else:
@@ -149,7 +149,7 @@ def create_entry(new_word):
 							else:
 								user_input['tags'].append(new_tag)
 						# assign to entry defintions list
-						entry['defs'] = [user_input]
+						entry['senses'] = [user_input]
 
 						# append new entry to new word, return True
 						new_word['entries'].append(entry)

diff --git a/src/dict_utilities.py b/src/dict_utilities.py
@@ -0,0 +1,11 @@
+
+
+# Print Progress
+def printpr(counter, modulo=10000):
+	if counter % modulo == 0:
+		print(".",end='',flush=True)
+	if counter % (modulo*100) == 0:
+		print(f' {counter:,} lines parsed',flush=True)
+
+
+
diff --git a/src/dictionary_LSJ.py b/src/dictionary_LSJ.py
@@ -46,10 +46,10 @@ def smart_join(text):
 	return s.replace(" .",".").replace(" ,",",").replace(" :",":")
 
 
-def configure_parts(defs):
+def configure_parts(senses):
 	count = 0
 
-	for i in defs[0]['gloss']:
+	for i in senses[0]['gloss']:
 		if i == "(":
 			count += 1
 
@@ -58,35 +58,35 @@ def configure_parts(defs):
 
 		if count < 0:
 			# Error too many )s, unbalanced parens
-			print(defs[0]['gloss'])
-			print(defs[1]['gloss'])
+			print(senses[0]['gloss'])
+			print(senses[1]['gloss'])
 			break
 
 	if count != 0:
 		parens = 0
 
-		for i in range(len(defs[1]['gloss'])):
-			if defs[1]['gloss'][i] == ")":
+		for i in range(len(senses[1]['gloss'])):
+			if senses[1]['gloss'][i] == ")":
 				parens += 1
 
-			if defs[1]['gloss'][i] == "(":
+			if senses[1]['gloss'][i] == "(":
 				parens -= 1
 
 			if parens == count:
 				break
 
-		if i < len(defs[1]['gloss']) - 1:
-			defs[0]['gloss'] = smart_join([defs[0]['gloss'],defs[1]['gloss'][: i + 1]])
-			defs[1]['gloss'] = defs[1]['gloss'][i + 1 :]
+		if i < len(senses[1]['gloss']) - 1:
+			senses[0]['gloss'] = smart_join([senses[0]['gloss'],senses[1]['gloss'][: i + 1]])
+			senses[1]['gloss'] = senses[1]['gloss'][i + 1 :]
 
-			for i in range(len(defs[1]['gloss'])):
-				if defs[1]['gloss'][i].isalpha() or defs[1]['gloss'][i] == "=":
+			for i in range(len(senses[1]['gloss'])):
+				if senses[1]['gloss'][i].isalpha() or senses[1]['gloss'][i] == "=":
 					break
-			defs[1]['gloss'] = defs[1]['gloss'][i:]
+			senses[1]['gloss'] = senses[1]['gloss'][i:]
 		else:
-			defs[0]['gloss'] += ")"
+			senses[0]['gloss'] += ")"
 
-	return defs
+	return senses
 
 
 def process_entry(text):
@@ -95,7 +95,7 @@ def process_entry(text):
 				'tags':set(),
 				'entries':[]}
 
-	entry = {'defs':[],
+	entry = {'senses':[],
 			'partOfSpeech':'',
 			'principleParts':'',
 			'simpleParts':'',
@@ -108,17 +108,17 @@ def process_entry(text):
 		print("@"*5000 + f"\nheading \"{definition['heading']}\"")
 
 	while text != "":
-		text, defs = get_def(text)
+		text, senses = get_def(text)
 
-		for x in defs:
+		for x in senses:
 			if debug_print:
 				print(f"definition: {x['gloss']}")
-			entry['defs'].append(deepcopy(x))
+			entry['senses'].append(deepcopy(x))
 
-	if len(entry['defs']) > 1:
-		entry['defs'] = configure_parts(entry['defs'])
-		entry['simpleParts'] = entry["principleParts"] = entry['defs'][0]['gloss']
-		entry['defs'].pop(0)
+	if len(entry['senses']) > 1:
+		entry['senses'] = configure_parts(entry['senses'])
+		entry['simpleParts'] = entry["principleParts"] = entry['senses'][0]['gloss']
+		entry['senses'].pop(0)
 	else:
 		entry['simpleParts'] = entry["principleParts"] = definition['heading']
 
@@ -129,7 +129,7 @@ def process_entry(text):
 def get_def(text):
 	m = 0
 	gloss = []
-	defs = []
+	senses = []
 	greek = False
 	candidate_tag = ''
 	while True:
@@ -147,7 +147,7 @@ def get_def(text):
 		if "<sense" in brac:
 			gloss = smart_join(gloss).strip(",. ")
 			if gloss != "":
-				defs.append({'gloss':gloss,'tags':[]})
+				senses.append({'gloss':gloss,'tags':[]})
 			gloss = []
 
 		if pull != "":
@@ -156,10 +156,10 @@ def get_def(text):
 		if brac == "</sense>" or text == "":
 			gloss = smart_join(gloss).strip(",. ")
 			if gloss != "":
-				defs.append({'gloss':gloss,'tags':[]})
+				senses.append({'gloss':gloss,'tags':[]})
 			break
 
-	return text, defs
+	return text, senses
 
 def extract_dictionary(perseus, dictionary):
 	line_list = []
@@ -187,32 +187,35 @@ def LSJ(new_dictionary):
 	dictionary = {'file':'','definitions':[],"language":''}
 
 	for i in range(1,28):
-		with open('grc.lsj.perseus-eng' + str(i) + '.txt','r') as f:# ####
-			if progress_print:
-				print(f"Parsing '{'grc.lsj.perseus-eng' + str(i) + '.txt:' + chr(39):<28}",end='',flush=True)
-
-
-			line_list = []
-			ignition = False
-			counter = 0
-			for line in f.readlines():
-				if "<entryFree" in line:
-					ignition = True
-
-				if ignition:
-					line_list.append(line.strip(" \n\t"))
-
-				if "</entryFree" in line and ignition:
-					line_list.append(line.strip())
-					line_list = "".join(line_list)
-					dictionary['definitions'].append(process_entry(line_list))
-					ignition = False
-					line_list = []
-				counter += 1
+		try:
+			with open('grc.lsj.perseus-eng' + str(i) + '.txt','r') as f:# ####
 				if progress_print:
-					printpr(counter)
-
-			print(f' {counter:,} lines parsed',flush=True)
+					print(f"Parsing '{'grc.lsj.perseus-eng' + str(i) + '.txt:' + chr(39):<28}",end='',flush=True)
+
+
+				line_list = []
+				ignition = False
+				counter = 0
+				for line in f.readlines():
+					if "<entryFree" in line:
+						ignition = True
+
+					if ignition:
+						line_list.append(line.strip(" \n\t"))
+
+					if "</entryFree" in line and ignition:
+						line_list.append(line.strip())
+						line_list = "".join(line_list)
+						dictionary['definitions'].append(process_entry(line_list))
+						ignition = False
+						line_list = []
+					counter += 1
+					if progress_print:
+						printpr(counter)
+
+				print(f' {counter:,} lines parsed',flush=True)
+		except FileNotFoundError:
+			print(f"'{'grc.lsj.perseus-eng' + str(i) + '.txt'} not found in 'texts' directory")
 
 	new_dictionary['definitions'].extend(dictionary['definitions'])
 	return new_dictionary

diff --git a/src/dictionary_Lewis.py b/src/dictionary_Lewis.py
@@ -42,13 +42,13 @@ def get_entry(i,debug_print):
 			entry['partOfSpeech'] = i['part_of_speech']
 			if not isinstance(i['main_notes'],str):
 				for item in i['main_notes']:
-					entry['defs'].append({'gloss':item,'tags':[]})
+					entry['senses'].append({'gloss':item,'tags':[]})
 				if debug_print:
 					print("\n\n\n\t\t\t" + "*"*1000 + f"\n\nLINE: {current_line_number()}") 
-					print(f"entry['defs'] = {entry['defs']}")
+					print(f"entry['senses'] = {entry['senses']}")
 					exit()
 			else:
-				entry['defs'] = [{'gloss':i['main_notes'],'tags':[]}]
+				entry['senses'] = [{'gloss':i['main_notes'],'tags':[]}]
 
 			if 'alternative_orthography' in i:
 				entry['principleParts'] = i['alternative_orthography']
@@ -97,13 +97,13 @@ def get_senses(entry,sense):
 							for item in sense:
 								get_senses(entry,item)
 						else:
-							entry['defs'].append({'gloss':sense,'tags':[]})
+							entry['senses'].append({'gloss':sense,'tags':[]})
 					get_senses(entry,sense)
 
 					if debug_print:
 						print("\n\n\n\t\t\t" + "*"*1000 + f"\n\nLINE: {current_line_number()}")  
 						print(f"entry with senses = ")
-						for x, item in enumerate(entry['defs']):
+						for x, item in enumerate(entry['senses']):
 							print(f"{x+1} tags: {item['tags']} gloss: {item['gloss']}")
 						print(f"senses limit reached ")
 						if counter_senses > 5:
@@ -117,7 +117,7 @@ def get_senses(entry,sense):
 					print("\n\n\n\t\t\t" + "*"*1000 + f"\n\nLINE: {current_line_number()}")  
 					print(f"['greek_word'] == {i['greek_word']}")
 
-					entry['defs'].append({'gloss':i['greek_word'],'tags':['greek_word']})
+					entry['senses'].append({'gloss':i['greek_word'],'tags':['greek_word']})
 					print(f"entry == {entry}")
 					if counter_grk > 1:
 						print(f"Greek Word Break")
@@ -136,20 +136,20 @@ def get_senses(entry,sense):
 				#exit()
 		if 'principleParts' not in entry:
 
-			defs = entry['defs'][0]['gloss']
-			defs = defs.split()
+			senses = entry['senses'][0]['gloss']
+			senses = senses.split()
 			gloss = ''
 			tag = ''
 			flag = False
-			for x, q in enumerate(defs):
+			for x, q in enumerate(senses):
 				if debug_print:
 					print("\n\n\n\t\t\t" + "*"*1000 + f"\n\nLINE: {current_line_number()}") 
-					print(f"defs = {defs}")
+					print(f"senses = {senses}")
 					print(f"q = {q}")
 					print(f"q[-1] = {q[-1]}")
 				if (x and q[0].isupper()) or q[0] == '(' or q[0].isnumeric():
 					break
-				if ((q[-1] == ',' and not flag) or (x == len(defs)-1 and not flag)  or x == 0) and not (q[-1] == '.'):
+				if ((q[-1] == ',' and not flag) or (x == len(senses)-1 and not flag)  or x == 0) and not (q[-1] == '.'):
 					gloss += q.strip(":") + ' '
 				else:
 					tag += q + ' '
@@ -176,7 +176,7 @@ def get_senses(entry,sense):
 						print(f"final {entry['principleParts']}",flush=True)
 						exit()
 					else:
-						for x in entry['defs']:
+						for x in entry['senses']:
 							if isinstance(x['gloss'],list):
 								print(f"Gloss = List {x['gloss']}",flush=True)
 								print(f"i = {i}")