@@ -39,6 +39,7 @@ def get_arguments() -> argparse.Namespace:
39
39
'-ot' , '--output-type' , help = 'the output file type, Default = json' , type = str ,
40
40
choices = ['json' , 'csv' ], default = 'json'
41
41
)
42
+ parser .add_argument ('--csv-write-header' , help = 'write header in csv output' , action = 'store_true' )
42
43
parser .add_argument (
43
44
'--cefr-level' , help = 'minimum word\' s cefr level to consider, default = B1' , type = str ,
44
45
choices = ['A1' , 'A2' , 'B1' , 'B2' , 'C1' , 'C2' ], default = 'B1'
@@ -50,6 +51,24 @@ def get_arguments() -> argparse.Namespace:
50
51
parser .add_argument (
51
52
'-v' , '--verbose' , help = 'verbose mode' , default = False , action = 'store_true'
52
53
)
54
+
55
+ parser .add_argument (
56
+ '--disable-meaningful-words-filter' ,
57
+ help = 'by default we try to filter out meaningless words, this option disable it' ,
58
+ action = 'store_true'
59
+ )
60
+
61
+ parser .add_argument (
62
+ '--char-limiter-min-length' ,
63
+ help = 'minimum length of the word to consider, default = 3' ,
64
+ type = int , default = 3
65
+ )
66
+
67
+ parser .add_argument (
68
+ '--disable-char-limiter-filter' ,
69
+ help = 'by default we try to filter out words with too few characters, this option disable it' ,
70
+ action = 'store_true'
71
+ )
53
72
args = parser .parse_args ()
54
73
return args
55
74
@@ -89,13 +108,16 @@ def get_tokenizer(_: argparse.Namespace) -> Tokenizer:
89
108
90
109
91
110
def get_middlewares(args: argparse.Namespace) -> List[Middleware]:
    """Build the word-filtering middlewares selected by the CLI flags.

    Args:
        args: Parsed command-line arguments. This function reads
            ``char_limiter_min_length``, ``disable_char_limiter_filter``,
            ``disable_meaningful_words_filter``, ``cefr_level`` and
            ``ignore_unknown_cefr``.

    Returns:
        The enabled middlewares, in the fixed order ``Number``,
        ``CharLengthValidator``, ``MeaningfulWords``, ``CEFRLimiter``;
        entries whose disable flag is set are left out.
    """
    # Each entry pairs a middleware with the condition under which it is used.
    rules = [
        (Number(), True),
        (
            CharLengthValidator(min_length=args.char_limiter_min_length),
            not args.disable_char_limiter_filter,
        ),
        (MeaningfulWords(), not args.disable_meaningful_words_filter),
        # The CEFR limiter is always applied: no dedicated disable flag is
        # visible for it, and it must not be switched off as a side effect of
        # --disable-meaningful-words-filter.
        # NOTE(review): an earlier revision passed args.ignore_unknown_cefr
        # without negation -- confirm the intended polarity of filter_unknowns.
        (
            CEFRLimiter(
                min_cefr=args.cefr_level,
                filter_unknowns=not args.ignore_unknown_cefr,
            ),
            True,
        ),
    ]

    return [middleware for middleware, enabled in rules if enabled]
120
+
99
121
100
122
def get_definer (_ : argparse .Namespace ) -> Definer :
101
123
return MultiSourceDefinerWithStorage (
0 commit comments