forked from lexibank/mixtecansubgrouping
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlexibank_mixtecansubgrouping.py
140 lines (117 loc) · 5.19 KB
/
lexibank_mixtecansubgrouping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from pathlib import Path
import re
from pylexibank.dataset import Dataset as BaseDataset
from pylexibank.models import Language, Concept, Lexeme, Cognate
from clldutils.misc import slug
import attr
import lingpy
@attr.s
class CustomLanguage(Language):
    """Language row with three additional optional columns.

    Every extra column defaults to ``None`` when no value is supplied.
    """

    # Extra columns on top of the base ``Language`` fields.
    # NOTE(review): values presumably come from the dataset's language list
    # (rows are passed as keyword arguments in ``cmd_makecldf``) -- confirm.
    Location = attr.ib(default=None)
    SubGroup = attr.ib(default=None)
    Number = attr.ib(default=None)
@attr.s
class CustomConcept(Concept):
    """Concept row extended with a Spanish gloss and a concept number.

    Both extra columns default to ``None``.
    """

    # Filled from the ``SPANISH`` column of the concept list in
    # ``Dataset.cmd_makecldf``.
    Spanish_Gloss = attr.ib(default=None)

    # Filled from the ``NUMBER`` column of the concept list in
    # ``Dataset.cmd_makecldf``.
    Number = attr.ib(default=None)
@attr.s
class CustomLexeme(Lexeme):
    """Form row carrying loan information and partial-cognacy ID lists.

    The three scalar columns default to ``None``; the two partial-cognacy
    columns default to a fresh empty list and must be ``list`` instances.
    """

    # Optional scalar columns.
    Floating_Tone = attr.ib(default=None)
    Loan = attr.ib(default=None)
    Loan_Source = attr.ib(default=None)

    # Cognate IDs of the "broad" coding; ``cmd_makecldf`` passes the
    # whitespace-split ``cogids_broad`` value here.
    # NOTE(review): the ``separator`` metadata is presumably what the CLDF
    # writer uses to serialise the list -- confirm against pylexibank.
    Partial_Cognacy_Broad = attr.ib(
        factory=list,
        validator=attr.validators.instance_of(list),
        metadata={'separator': ' '},
    )

    # Cognate IDs of the "fine" coding; ``cmd_makecldf`` passes the
    # whitespace-split ``cogids_fine`` value here.
    Partial_Cognacy_Fine = attr.ib(
        factory=list,
        validator=attr.validators.instance_of(list),
        metadata={'separator': ' '},
    )
@attr.s
class CustomCognate(Cognate):
    """Cognate row that records its morpheme position and coding scheme(s)."""

    # Position of the judged morpheme inside the form; ``cmd_makecldf``
    # passes the 0-based index from ``enumerate`` over the cognate ID list.
    Morpheme_Index = attr.ib(default=None)

    # Coding labels for this judgement; ``cmd_makecldf`` passes
    # ``['broad']``, ``['fine']`` or ``['broad', 'fine']``.  Must be a list.
    Cognate_Coding = attr.ib(
        factory=list,
        validator=attr.validators.instance_of(list),
        metadata={'separator': ';'},
    )
class Dataset(BaseDataset):
    """Dataset definition for ``mixtecansubgrouping``.

    Wires the custom row classes into the base dataset and converts the raw
    wordlist ``sm3_mixtecan_cognates.tsv`` into languages, concepts, forms
    and cognate judgements via ``cmd_makecldf``.
    """

    dir = Path(__file__).parent
    id = 'mixtecansubgrouping'

    language_class = CustomLanguage
    concept_class = CustomConcept
    lexeme_class = CustomLexeme
    cognate_class = CustomCognate

    def cmd_makecldf(self, args):
        """Populate the writer in ``args`` from the raw wordlist.

        Steps, in order: load the wordlist, add sources, add languages
        (de-duplicated by name), add concepts, then add one form plus its
        cognate judgements for every wordlist row that has a known
        language, a known concept and a non-empty ``form``.  Problems found
        on the way are collected and printed, numbered and sorted, at the
        end.

        :param args: object exposing ``writer`` with the methods
            ``add_sources``, ``add_language``, ``add_concept``,
            ``add_form_with_segments`` and ``add_cognate``.
        """
        wordlist = lingpy.Wordlist(
            str(self.raw_dir / 'sm3_mixtecan_cognates.tsv'))
        args.writer.add_sources()

        # Languages: drop a trailing "_" + uppercase-letters suffix from ID
        # and Name, then register each resulting name only once.
        upper_suffix = re.compile('_[A-Z]+$')
        language_ids = {}
        for row in self.languages:
            row['ID'] = upper_suffix.sub('', row['ID'])
            row['Name'] = upper_suffix.sub('', row['Name'])
            if row['Name'] in language_ids:
                continue
            language_ids[row['Name']] = row['ID']
            args.writer.add_language(**row)

        # Concepts: the ID is "<NUMBER>_<slug of ENGLISH>"; the lookup table
        # is keyed by the English gloss, which is what the wordlist uses.
        concept_ids = {}
        for row in self.concepts:
            number = row['NUMBER']
            english = row['ENGLISH']
            concept_id = '{}_{}'.format(number, slug(english))
            args.writer.add_concept(
                ID=concept_id,
                Name=english,
                Number=number,
                Concepticon_ID=row['CONCEPTICON_ID'],
                Concepticon_Gloss=row['CONCEPTICON_GLOSS'],
                Spanish_Gloss=row['SPANISH'])
            concept_ids[english] = concept_id

        problems = set()
        for key in wordlist:
            doculect = wordlist[key, "doculect"]
            if doculect not in language_ids:
                problems.add("language missing {0}".format(doculect))
                continue

            gloss = wordlist[key, "concept"]
            if gloss not in concept_ids:
                problems.add("concept missing {0}".format(gloss))
                continue

            # Rows without a form are skipped without being reported.
            if not wordlist[key, 'form']:
                continue

            tokens = wordlist[key, "tokens"]
            # NOTE(review): ``.n`` is presumably the morpheme-level split of
            # the segments provided by lingpy -- confirm.
            morpheme_count = len(tokens.n)

            broad_string = wordlist[key, 'cogids_broad']
            broad_ids = broad_string.split()
            fine_string = wordlist[key, 'cogids_fine']
            fine_ids = fine_string.split()

            # Each coding is expected to give one cognate ID per morpheme;
            # a mismatch is reported but the row is still written.
            for id_string, ids in (
                    (broad_string, broad_ids), (fine_string, fine_ids)):
                if morpheme_count != len(ids):
                    problems.add("partial cognates: {0} / {1} / {2}".format(
                        key, str(tokens), id_string))

            lexeme = args.writer.add_form_with_segments(
                Local_ID=key,
                Language_ID=language_ids[doculect],
                Parameter_ID=concept_ids[gloss],
                Value=wordlist[key, 'value'],
                Form=wordlist[key, 'form'],
                Segments=tokens,
                Source=wordlist[key, 'source'],
                Partial_Cognacy_Broad=broad_ids,
                Partial_Cognacy_Fine=fine_ids)

            # All broad IDs come first; one that also occurs anywhere among
            # the fine IDs is labelled with both codings.
            judgements = [
                (position,
                 cogid,
                 ['broad', 'fine'] if cogid in fine_ids else ['broad'])
                for position, cogid in enumerate(broad_ids)
            ]
            # Then the fine IDs that occur nowhere among the broad IDs.
            judgements += [
                (position, cogid, ['fine'])
                for position, cogid in enumerate(fine_ids)
                if cogid not in broad_ids
            ]
            for position, cogid, coding in judgements:
                args.writer.add_cognate(
                    lexeme=lexeme,
                    Cognateset_ID=cogid,
                    Cognate_Coding=coding,
                    Morpheme_Index=position)

        # Report the collected problems, numbered from 1 in sorted order.
        for number, message in enumerate(sorted(problems), start=1):
            print("{0:4}".format(number), message)