-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathtokenize.xfst
89 lines (68 loc) · 2.76 KB
/
tokenize.xfst
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
! tokenize.xfst
!
! Released under the terms of the MIT License
! Copyright (c) 2011-2015 Çağrı Çöltekin <[email protected]>
!
! Permission is hereby granted, free of charge, to any person obtaining a
! copy of this software and associated documentation files (the "Software"),
! to deal in the Software without restriction, including without limitation
! the rights to use, copy, modify, merge, publish, distribute, sublicense,
! and/or sell copies of the Software, and to permit persons to whom the
! Software is furnished to do so, subject to the following conditions:
!
! The above copyright notice and this permission notice shall be included in
! all copies or substantial portions of the Software.
!
! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
! AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
! DEALINGS IN THE SOFTWARE.
!
!
! This FST is a *very* simple tokenizer that splits the given input into
! tokens and sentences.
! Whitespace symbols: the ASCII blank, tab, line feed and carriage return,
! then the no-break space (U+00A0), the Ogham space mark (U+1680), the
! U+2000-U+200A spaces, the line and paragraph separators (U+2028, U+2029),
! the narrow no-break space (U+202F), the medium mathematical space (U+205F)
! and the ideographic space (U+3000).
define WS [
    " "      | "\u0009" | "\u000a" | "\u000d" |
    "\u00a0" | "\u1680" |
    "\u2000" | "\u2001" | "\u2002" | "\u2003" | "\u2004" | "\u2005" |
    "\u2006" | "\u2007" | "\u2008" | "\u2009" | "\u200a" |
    "\u2028" | "\u2029" | "\u202f" | "\u205f" | "\u3000"
];
! Sentence-final punctuation. The three-period sequence "..." is quoted and
! is therefore one multi-character symbol, distinct from the single period
! and from the ellipsis character.
define SP [ "." | "..." | "…" | "?" | "!" ];
! Left (opening) punctuation: opening brackets, opening typographic quotes
! and guillemets, and the straight quotes (the single quote, the double
! quote and the two-character sequence '').
define LP [
    "(" | "[" | "{" |
    "“" | "‘" | "‹" | "«" |
    "'" | %" | "''"
];
! Right (closing) punctuation: the sentence-final marks in SP, the comma,
! semicolon and colon, closing brackets, closing typographic quotes and
! guillemets, and the straight double quote and ''.
! The characters that can serve as an apostrophe (see Apos) are deliberately
! left out so that they are not split off as closing quotes.
! The bracket row lists the closing brace "}", matching the opening "{" in LP.
define RP [
    SP | "," | ";" | ":" |
    ")" | "]" | "}" |
    "”" | "›" | "»" |
    %" | "''"
];
! Stand-alone symbols (arithmetic and comparison signs) that are treated as
! punctuation tokens.
define Sym [ "+" | "-" | "*" | "/" | "<" | ">" ];
! Any punctuation token: a stand-alone symbol, or left or right punctuation.
define Punct [ Sym | LP | RP ];
! Characters that can act as an apostrophe: the straight single quote and
! the right single quotation mark.
define Apos [ "'" | "’" ];
! Boundary markers: SB marks a sentence boundary and TB a token boundary.
! Each is a single multi-character symbol. The commented-out pair is an
! alternative that uses line feeds as the markers directly.
!define TB "\u000a";
!define SB "\u000a\u000a";
define SB [ "<sb>" ];
define TB [ "<tb>" ];
! Decimal digits. Zero is written %0 because an unescaped 0 stands for the
! empty string in these regular expressions.
define Digit [ 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | %0 ];
! Upper-case letters: the Turkish alphabet followed by the circumflexed
! vowels and the letters X, W and Q.
define CAP [A|B|C|Ç|D|E|F|G|Ğ|H|I|İ|J|K|L|M|N|O|Ö|P|R|S|Ş|T|U|Ü|V|Y|Z|Î|Û|Â|X|W|Q];
! Lower-case letters, listed in the same order as CAP. The pair I|İ in CAP
! corresponds to ı|i here (dotless ı first, then dotted i).
define LC [a|b|c|ç|d|e|f|g|ğ|h|ı|i|j|k|l|m|n|o|ö|p|r|s|ş|t|u|ü|v|y|z|î|û|â|x|w|q];
! Any single symbol that is neither whitespace nor punctuation
! (Punct is LP|RP|Sym).
define nonSym \[ WS | Punct ];
! A word: a run of nonSym symbols, optionally continued by any number of
! groups made of one joiner (period, hyphen or apostrophe) followed by
! another run of nonSym symbols.
define Word nonSym+ [ [ "." | "-" | Apos ] nonSym+ ]*;
! An initial: exactly one nonSym symbol (anything that is not whitespace,
! left/right punctuation or a stand-alone symbol) followed by a period.
define Initial nonSym ".";
! A number: one or more digits, optionally followed by any number of groups
! made of a period or comma and one or more further digits.
define Number Digit+ [ [ "." | "," ] Digit+ ]*;
! Abbreviations, read from the external word list abbr.txt. The file is
! resolved when this script is compiled, so it must be present then.
! NOTE(review): presumably one abbreviation per line; whether the entries
! include their trailing period cannot be told from this file - confirm.
define Abbr @txt"abbr.txt";
! T: the tokenizer, a cascade of rules composed in the order written.
!   1. Scanning left to right and taking the longest match, append the
!      token-boundary marker TB after every Word, Punct, Initial or Number.
!   2. Insert a TB in front of an apostrophe that is directly followed by a TB.
!   3. Turn a TB into the sentence-boundary marker SB when it directly follows
!      sentence-final punctuation (SP) or stands at the start of the string.
!   4. Turn an SB back into a TB when the preceding SP directly follows an
!      abbreviation (Abbr) or a run of digits.
!   5. Turn an SB back into a TB when it is directly followed by right
!      punctuation and the left context is LP ... SP.
!      NOTE(review): the `.*` in this context looks as if `?*` (any symbols)
!      was intended; confirm how the compiler interprets a bare `.` here.
!   6. Delete runs of whitespace.
!   7. Replace every TB by a single space.
!   8. Replace every SB by a line feed.
! The order matters: rules 3-5 rely on the markers inserted by rules 1-2, and
! the markers are only turned into real separators after the original
! whitespace has been removed.
define T [Word|Punct|Initial|Number] @-> ... TB
.o. Apos @-> TB ... || _ TB
.o. TB -> SB || [SP|.#.] _
.o. SB -> TB || [Abbr|Digit+] SP _
.o. SB -> TB || LP .* SP _ RP
.o. WS+ -> 0
.o. TB -> " "
.o. SB -> "\u000a";
! Compile the inverse of T onto the stack: in the resulting network the
! tokenized text is on the upper side and the raw input on the lower side.
regex T.i;
! Write the network on the stack to the binary file tokenize.fst.
save stack tokenize.fst