From 4fc7ffd4facd07b13768f169ae10238fbbd51791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Wed, 26 Jun 2024 18:45:59 +0200 Subject: [PATCH 1/8] more comments. --- src/digit_string.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/digit_string.rs b/src/digit_string.rs index 9e80b8d..5952076 100644 --- a/src/digit_string.rs +++ b/src/digit_string.rs @@ -197,6 +197,7 @@ impl DigitString { Ok(()) }; } + // maybe subpart of a bigger number let mut padding_zeroes = self.buffer[(l - positions)..] .iter() .take_while(|&c| *c == b'0') From 16bf1c61357b1e923075f0978299a5ca4bf59cef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Wed, 26 Jun 2024 18:56:16 +0200 Subject: [PATCH 2/8] Various fixes in EN, DE, FR, ES. --- src/lang/de/mod.rs | 2 +- src/lang/de/vocabulary.rs | 2 +- src/lang/en/mod.rs | 3 ++- src/lang/es/mod.rs | 4 +++- src/lang/fr/mod.rs | 20 ++++++++++++++++++-- 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/lang/de/mod.rs b/src/lang/de/mod.rs index b6b3e51..c949b09 100644 --- a/src/lang/de/mod.rs +++ b/src/lang/de/mod.rs @@ -174,7 +174,7 @@ impl LangInterpretor for German { } } "tausend" | "tausendste" if b.is_range_free(3, 5) => b.shift(3), - "million" | "millionen" | "millionste" => b.shift(6), + "million" | "millionen" | "millionste" if b.is_range_free(6, 8) => b.shift(6), "milliarde" | "milliarden" | "milliardste" => b.shift(9), "billion" | "billionste" => b.shift(12), "und" => Err(Error::Incomplete), diff --git a/src/lang/de/vocabulary.rs b/src/lang/de/vocabulary.rs index ab3bae1..4fa9d14 100644 --- a/src/lang/de/vocabulary.rs +++ b/src/lang/de/vocabulary.rs @@ -1,5 +1,5 @@ use phf::{phf_set, Set}; pub static INSIGNIFICANT: Set<&'static str> = phf_set! { - "und", "so", "ach", "doch", "ja" + "und", "so", "ach", "doch", "ja", "ok" }; diff --git a/src/lang/en/mod.rs b/src/lang/en/mod.rs index 90d0e5a..1a49e46 100644 --- a/src/lang/en/mod.rs +++ b/src/lang/en/mod.rs @@ -86,7 +86,7 @@ impl LangInterpretor for English { } } "thousand" | "thousandth" if b.is_range_free(3, 5) => b.shift(3), - "million" | "millionth" => b.shift(6), + "million" | "millionth" if b.is_range_free(6, 8) => b.shift(6), "billion" | "billionth" => b.shift(9), "and" if b.len() >= 2 => Err(Error::Incomplete), @@ -346,6 +346,7 @@ mod tests { one hundred twenty point o five, one point two hundred thirty-six, one point two three six.", "12.99, 120.05, 120.05, 1.2 136, 1.236." ); + assert_replace_numbers!("I say point three", "I say point three"); } #[test] diff --git a/src/lang/es/mod.rs b/src/lang/es/mod.rs index e791b62..b9d8bf3 100644 --- a/src/lang/es/mod.rs +++ b/src/lang/es/mod.rs @@ -99,7 +99,9 @@ impl LangInterpretor for Spanish { "ochociento" | "ochocienta" | "octingentésimo" | "octingentésima" => b.put(b"800"), "noveciento" | "novecienta" | "noningentésimo" | "noningentésima" => b.put(b"900"), "mil" | "milésimo" | "milésima" if b.is_range_free(3, 5) => b.shift(3), - "millon" | "millón" | "millonésimo" | "millonésima" => b.shift(6), + "millon" | "millón" | "millonésimo" | "millonésima" if b.is_range_free(6, 8) => { + b.shift(6) + } "y" if b.len() >= 2 => Err(Error::Incomplete), _ => Err(Error::NaN), diff --git a/src/lang/fr/mod.rs b/src/lang/fr/mod.rs index bd5efe1..3c5c7b2 100644 --- a/src/lang/fr/mod.rs +++ b/src/lang/fr/mod.rs @@ -157,7 +157,7 @@ impl LangInterpretor for French { } "cent" | "centième" => { let peek = b.peek(2); - if (peek.len() == 1 || peek < b"20") && peek != b"1" { + if (peek.len() == 1 || peek < b"20") && peek != b"1" && peek != b"01" { b.shift(2) } else { Err(Error::Overlap) @@ -171,7 +171,7 @@ impl LangInterpretor for French { b.shift(3) } } - "million" | "millionième" => b.shift(6), + "million" | "millionième" if b.is_range_free(6, 8) => b.shift(6), "milliard" | "milliardième" => b.shift(9), "et" if b.len() >= 2 => Err(Error::Incomplete), @@ -290,16 +290,26 @@ mod tests { #[test] fn test_apply() { + assert_text2digits!( + "cinquante trois mille millions deux cent quarante-trois mille sept cent vingt-quatre", + "53000243724" + ); + assert_text2digits!( "cinquante trois mille millions deux cent quarante trois mille sept cent vingt quatre", "53000243724" ); + assert_text2digits!( + "cinquante et un million cinq cent soixante-dix-huit mille trois cent deux", + "51578302" + ); assert_text2digits!( "cinquante et un million cinq cent soixante dix huit mille trois cent deux", "51578302" ); + assert_text2digits!("quatre-vingt-cinq", "85"); assert_text2digits!("quatre vingt cinq", "85"); assert_text2digits!("quatre vingt un", "81"); @@ -328,17 +338,20 @@ mod tests { #[test] fn test_centuries() { + assert_text2digits!("dix neuf cent soixante-treize", "1973"); assert_text2digits!("dix neuf cent soixante treize", "1973"); } #[test] fn test_ordinals() { + assert_text2digits!("vingt-cinquième", "25ème"); assert_text2digits!("vingt cinquième", "25ème"); assert_text2digits!("vingt et unième", "21ème"); } #[test] fn test_fractions() { + assert_text2digits!("vingt-cinquièmes", "25èmes"); assert_text2digits!("vingt cinquièmes", "25èmes"); assert_text2digits!("vingt et unièmes", "21èmes"); } @@ -347,6 +360,7 @@ mod tests { fn test_zeroes() { assert_text2digits!("zéro", "0"); assert_text2digits!("zéro huit", "08"); + assert_text2digits!("zéro zéro cent vingt-cinq", "00125"); assert_text2digits!("zéro zéro cent vingt cinq", "00125"); assert_invalid!("cinq zéro"); assert_invalid!("cinquante zéro trois"); @@ -366,6 +380,7 @@ mod tests { assert_invalid!("vingt un"); assert_invalid!("zéro zéro trente quatre vingt"); assert_invalid!("quatre-vingt dix-huit"); + assert_invalid!("mille un cent"); } #[test] @@ -451,6 +466,7 @@ mod tests { "la densité moyenne est de zéro virgule cinq.", "la densité moyenne est de 0,5." ); + assert_replace_numbers!("Je dis virgule cinq", "Je dis virgule cinq"); } #[test] From ab7e919e3f2104b3e1f4f1816c75832f590becf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Wed, 26 Jun 2024 18:56:46 +0200 Subject: [PATCH 3/8] First iteration for IT. --- src/lang/it/mod.rs | 578 ++++++++++++++++++++++++++++++++++++++ src/lang/it/vocabulary.rs | 5 + src/lang/mod.rs | 9 +- 3 files changed, 591 insertions(+), 1 deletion(-) create mode 100644 src/lang/it/mod.rs create mode 100644 src/lang/it/vocabulary.rs diff --git a/src/lang/it/mod.rs b/src/lang/it/mod.rs new file mode 100644 index 0000000..5388bc8 --- /dev/null +++ b/src/lang/it/mod.rs @@ -0,0 +1,578 @@ +//! Italian number interpretor + +use crate::digit_string::DigitString; +use crate::error::Error; +use crate::tokenizer::WordSplitter; + +mod vocabulary; + +use super::{LangInterpretor, MorphologicalMarker}; +use vocabulary::INSIGNIFICANT; + +pub struct Italian { + word_splitter: WordSplitter, +} + +fn lemmatize(word: &str) -> &str { + let candidate = word.trim_end_matches(['o', 'a', 'e', 'i']); + if matches!( + candidate, + "prim" + | "second" + | "terz" + | "quart" + | "quint" + | "sest" + | "settim" + | "ottav" + | "ttav" + | "non" + | "decim" + ) || candidate.ends_with("esim") + { + candidate + } else { + word + } +} + +impl Default for Italian { + fn default() -> Self { + Self { + word_splitter: WordSplitter::new([ + "miliardesim", + "milionesim", + "bilionesim", + "cinquanta", + "centesim", + "millesim", + "miliardo", + "miliardi", + "quaranta", + "sessanta", + "settanta", + "milione", + "milioni", + "bilione", + "bilioni", + "ottanta", + "novanta", + "trenta", + "ttanta", + "cento", + "mille", + "venti", + "mila", + ]) + .unwrap(), + } + } +} + +impl Italian { + pub fn new() -> Self { + Default::default() + } +} + +impl LangInterpretor for Italian { + fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> { + let lemma = lemmatize(num_func); + if self.word_splitter.is_splittable(lemma) { + return match self.exec_group(self.word_splitter.split(lemma)) { + Ok(ds) => { + if ds.len() > 3 && ds.len() <= 6 && !b.is_range_free(3, 5) { + return Err(Error::Overlap); + } + b.put(&ds)?; + let marker = self.get_morph_marker(num_func); + if marker.is_ordinal() { + b.marker = marker; + b.freeze() + } + Ok(()) + } + Err(err) => Err(err), + }; + } + let status = match lemmatize(num_func) { + "zero" => b.put(b"0"), + "un" | "uno" | "una" | "unesim" if b.peek(2) != b"10" => b.put(b"1"), + "prim" if b.is_empty() => b.put(b"1"), + "due" | "duesim" if b.peek(2) != b"10" => b.put(b"2"), + "second" if b.is_empty() => b.put(b"2"), + "tre" | "tré" | "treesim" if b.peek(2) != b"10" => b.put(b"3"), + "terz" if b.is_empty() => b.put(b"3"), + "quattro" | "quattresim" if b.peek(2) != b"10" => b.put(b"4"), + "quart" if b.is_empty() => b.put(b"4"), + "cinque" | "cinquesim" if b.peek(2) != b"10" => b.put(b"5"), + "quint" if b.is_empty() => b.put(b"5"), + "sei" | "seiesim" if b.peek(2) != b"10" => b.put(b"6"), + "sest" if b.is_empty() => b.put(b"6"), + "sette" | "settesim" if b.peek(2) != b"10" => b.put(b"7"), + "settim" if b.is_empty() => b.put(b"7"), + "otto" | "tto" | "ottesim" | "ttesim" if b.peek(2) != b"10" => b.put(b"8"), + "ottav" if b.is_empty() => b.put(b"8"), + "nove" | "novesim" if b.peek(2) != b"10" => b.put(b"9"), + "non" if b.is_empty() => b.put(b"9"), + "dieci" | "decim" => b.put(b"10"), + "undici" | "undicesim" => b.put(b"11"), + "dodici" | "dodicesim" => b.put(b"12"), + "tredici" | "tredicesim" => b.put(b"13"), + "quattordici" | "quattordicesim" => b.put(b"14"), + "quindici" | "quindicesim" => b.put(b"15"), + "sedici" | "dedicesim" => b.put(b"16"), + "diciassette" | "diciassettesim" => b.put(b"17"), + "diciotto" | "diciottesim" => b.put(b"18"), + "diciannove" | "diciannovesim" => b.put(b"19"), + "venti" | "ventesim" => b.put(b"20"), + "ventuno" | "ventun" | "ventunesim" => b.put(b"21"), + "ventotto" | "ventottesim" => b.put(b"28"), + "trenta" | "trentesim" => b.put(b"30"), + "trentuno" | "trentun" | "trentunesim" => b.put(b"31"), + "trentotto" | "trentottesim" => b.put(b"38"), + "quaranta" | "quarantesim" => b.put(b"40"), + "quarantuno" | "quarantun" | "quarantunesim" => b.put(b"41"), + "quarantotto" | "quarantottesim" => b.put(b"48"), + "cinquanta" | "cinquantesim" => b.put(b"50"), + "cinquantuno" | "cinquantun" | "cinquantunesim" => b.put(b"51"), + "cinquantotto" | "cinquantottesim" => b.put(b"58"), + "sessanta" | "sessantesim" => b.put(b"60"), + "sessantuno" | "sessantun" | "sessantunesim" => b.put(b"61"), + "sessantotto" | "sessantottesim" => b.put(b"68"), + "settanta" | "settantesim" => b.put(b"70"), + "settantuno" | "settantun" | "settanunesim" => b.put(b"71"), + "settantotto" | "settantottesim" => b.put(b"78"), + "ottanta" | "ottantesim" | "ttanta" | "ttantesim" => b.put(b"80"), + "ottantuno" | "ottantun" | "ottantunesim" => b.put(b"81"), + "ottantotto" | "ottantottesim" => b.put(b"88"), + "novanta" | "novantesim" => b.put(b"90"), + "novantuno" | "novantun" | "novantunesim" => b.put(b"91"), + "novantotto" | "novantottesim" => b.put(b"98"), + "cento" | "centesim" => { + let peek = b.peek(2); + if (peek.len() == 1 || peek < b"10") && peek != b"1" && peek != b"01" { + b.shift(2) + } else { + Err(Error::Overlap) + } + } + "centuno" | "centun" | "centunesimo" => b.put(b"101"), + "mille" if b.is_range_free(3, 5) => b.put(b"1000"), + "mila" if b.is_range_free(3, 5) => { + let peek = b.peek(3); + if peek == b"1" || peek == b"001" || peek.is_empty() || peek == b"000" { + Err(Error::NaN) + } else { + b.shift(3) + } + } + "millesim" if b.is_range_free(3, 5) => { + let peek = b.peek(3); + if peek == b"1" || peek == b"001" { + Err(Error::NaN) + } else { + b.shift(3) + } + } + "milione" if b.is_range_free(6, 8) => { + if b.len() != 1 || b.peek(1) != b"1" { + Err(Error::NaN) + } else { + b.shift(6) + } + } + "milionesim" if b.is_range_free(6, 8) => { + if b.len() == 1 && b.peek(1) == b"1" { + Err(Error::NaN) + } else { + b.shift(6) + } + } + "milioni" if b.is_range_free(6, 8) => { + if b.is_empty() || b.len() == 1 && b.peek(1) == b"1" { + Err(Error::NaN) + } else { + b.shift(6) + } + } + "miliardo" => { + if b.len() != 1 || b.peek(1) != b"1" { + Err(Error::NaN) + } else { + b.shift(9) + } + } + "miliardesim" => { + if b.len() == 1 && b.peek(1) == b"1" { + Err(Error::NaN) + } else { + b.shift(9) + } + } + "miliardi" => { + if b.is_empty() || b.len() == 1 && b.peek(1) == b"1" { + Err(Error::NaN) + } else { + b.shift(9) + } + } + "bilione" => { + if b.len() != 1 || b.peek(1) != b"1" { + Err(Error::NaN) + } else { + b.shift(12) + } + } + "bilionesim" => { + if b.len() == 1 && b.peek(1) == b"1" { + Err(Error::NaN) + } else { + b.shift(12) + } + } + "bilioni" => { + if b.is_empty() || b.len() == 1 && b.peek(1) == b"1" { + Err(Error::NaN) + } else { + b.shift(12) + } + } + "e" if b.len() >= 2 => Err(Error::Incomplete), + _ => Err(Error::NaN), + }; + let marker = self.get_morph_marker(num_func); + if status.is_ok() && !marker.is_none() { + b.marker = marker; + b.freeze(); + } + status + } + fn apply_decimal(&self, decimal_func: &str, b: &mut DigitString) -> Result<(), Error> { + self.apply(decimal_func, b) + } + fn get_morph_marker(&self, word: &str) -> MorphologicalMarker { + let base = lemmatize(word); + // as we only lemmatized ordinals, we have a quick test + if base != word { + // word is guaranteed not to be empty + match word.chars().last().unwrap() { + 'o' | 'i' => MorphologicalMarker::Ordinal("º"), + 'a' | 'e' => MorphologicalMarker::Ordinal("ª"), + _ => MorphologicalMarker::None, + } + } else { + MorphologicalMarker::None + } + } + fn is_decimal_sep(&self, word: &str) -> bool { + word == "virgola" + } + fn format_and_value(&self, b: &DigitString) -> (String, f64) { + let repr = b.to_string(); + let val = repr.parse().unwrap(); + if let MorphologicalMarker::Ordinal(marker) = b.marker { + (format!("{}{}", b.to_string(), marker), val) + } else { + (repr, val) + } + } + fn format_decimal_and_value(&self, int: &DigitString, dec: &DigitString) -> (String, f64) { + let sint = int.to_string(); + let sdec = dec.to_string(); + let val = format!("{sint}.{sdec}").parse().unwrap(); + (format!("{sint},{sdec}"), val) + } + + fn is_linking(&self, word: &str) -> bool { + INSIGNIFICANT.contains(word) + } + + fn is_ambiguous(&self, _number: &str) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::word_to_digit::{replace_numbers, text2digits}; + + macro_rules! assert_text2digits { + ($text:expr, $res:expr) => { + let f = Italian::default(); + let res = text2digits($text, &f); + dbg!(&res); + assert!(res.is_ok()); + assert_eq!(res.unwrap(), $res) + }; + } + + macro_rules! assert_replace_numbers { + ($text:expr, $res:expr) => { + let f = Italian::default(); + assert_eq!(replace_numbers($text, &f, 10.0), $res) + }; + } + + macro_rules! assert_replace_all_numbers { + ($text:expr, $res:expr) => { + let f = Italian::default(); + assert_eq!(replace_numbers($text, &f, 0.0), $res) + }; + } + + macro_rules! assert_invalid { + ($text:expr) => { + let f = Italian::default(); + let res = text2digits($text, &f); + assert!(res.is_err()); + }; + } + + #[test] + fn test_basic() { + assert_text2digits!("due", "2"); + assert_text2digits!("dieci", "10"); + assert_text2digits!("dieci", "10"); + assert_text2digits!("tredici", "13"); + assert_text2digits!("diciassette", "17"); + assert_text2digits!("venti", "20"); + assert_text2digits!("ventuno", "21"); + assert_text2digits!("ventun", "21"); + assert_text2digits!("ventidue", "22"); + assert_text2digits!("ventisette", "27"); + assert_text2digits!("ventotto", "28"); + assert_text2digits!("trentotto", "38"); + assert_text2digits!("trentatré", "33"); + assert_text2digits!("trecentoquarantadue", "342"); + assert_text2digits!("millenove", "1009"); + assert_text2digits!("novecento", "900"); + assert_text2digits!("millenovecento", "1900"); + assert_text2digits!("millenovecentottantaquattro", "1984"); + assert_text2digits!("cento e uno", "101"); + assert_text2digits!("seicento", "600"); + assert_text2digits!("tremila", "3000"); + assert_text2digits!("tremilaseicento", "3600"); + assert_text2digits!("tremila e seicento", "3600"); + assert_text2digits!("milleuno", "1001"); + assert_text2digits!("novecentonovantanove", "999"); + assert_text2digits!("duemilatrecentoquarantacinque", "2345"); + assert_text2digits!("seicentomiladue", "600002"); + assert_text2digits!("settecentosessantacinquemila duecento", "765200"); + } + + #[test] + fn test_basic_invalid() { + assert_invalid!("duemille"); + assert_invalid!("unmille"); + assert_invalid!("unmila"); + } + + #[test] + fn test_apply() { + assert_text2digits!( + "cinquantatremila milioni duecentoquarantatremilasettecentoventiquattro", + "53000243724" + ); + + assert_text2digits!( + "cinquantuno milioni cinquecentosettantaottomilatrecentodue", + "51578302" + ); + + assert_text2digits!("ottantacinque", "85"); + + assert_text2digits!("ottantuno", "81"); + + assert_text2digits!("quindici", "15"); + + assert_text2digits!("settantacinquemila", "75000"); + assert_text2digits!("un miliardo venticinque milioni", "1025000000"); + } + + #[test] + fn test_apply_variants() { + assert_text2digits!("novantaotto", "98"); + assert_text2digits!("settantaotto", "78"); + assert_text2digits!("ottantaotto", "88"); + assert_text2digits!("ottantuno", "81"); + assert_text2digits!("ottanta", "80"); + assert_text2digits!("millenovecentoventi", "1920"); + } + + // #[test] + // fn test_centuries() { + // assert_text2digits!("millenovecentosettantatré", "1973"); + // // specific to saying "the seventies": + // assert_text2digits!("diciannove anni settanta", "1970"); + // // Middle-Ages and Renaissance centuries (from year 1000 to 1599) are often referred as + // // "the two-hundred" for the XIth, "the three hundred" for XIIth... + // // "il trecento": "the three hundred" / "XIV secolo": "the XIVth century" + // assert_text2digits!("il mille", "XI secolo"); + // assert_text2digits!("il millecento", "XII secolo"); + // assert_text2digits!("il duecento", "XIII secolo"); + // assert_text2digits!("il trecento", "XIV secolo"); + // assert_text2digits!("il quattrocento", "XV secolo"); + // assert_text2digits!("il cinquecento", "XVI secolo"); + // } + + #[test] + fn test_ordinals() { + assert_text2digits!("venticinquesimo", "25º"); + assert_text2digits!("ventiunesimo", "21º"); + assert_text2digits!("venticinquesimi", "25º"); + assert_text2digits!("ventunesimi", "21º"); + } + + #[test] + fn test_zeroes() { + assert_text2digits!("zero", "0"); + assert_text2digits!("zero otto", "08"); + assert_text2digits!("zero zero centoventicinque", "00125"); + assert_invalid!("cinque zero"); + assert_invalid!("cinquanta zero tre"); + assert_invalid!("cinquanta tre zero"); + assert_invalid!("dieci zero"); + } + + #[test] + fn test_invalid() { + // I've translated litteraly + assert_invalid!("mille mille duecento"); + assert_invalid!("sessanta quindici"); + assert_invalid!("quaranta dodici"); + assert_invalid!("sessanta e"); + assert_invalid!("dici due"); + assert_invalid!("dici primo"); + assert_invalid!("ventesimo cinque"); + assert_invalid!("venti uno"); + assert_invalid!("zero zero trenta quattro venti"); + assert_invalid!("ottanta diciotto"); + } + + #[test] + fn test_replace_numbers_integers() { + assert_replace_numbers!( + "venticinque mucche, dodici polli e centoventicinque kg di patate.", + "25 mucche, 12 polli e 125 kg di patate." + ); + assert_replace_numbers!("Milleduecentosessantasei chiodi.", "1266 chiodi."); + assert_replace_numbers!("Novantacinque = ottanta + quindici", "95 = 80 + 15"); + assert_replace_numbers!("uno due tre quattro venti quindici.", "1 2 3 4 20 15."); + assert_replace_numbers!("uno due tre novantacinque.", "1 2 3 95."); + assert_replace_numbers!( + "uno, due, tre, quattro, venti, quindici.", + "1, 2, 3, 4, 20, 15." + ); + assert_replace_numbers!("ventuno, trentuno.", "21, 31."); + assert_replace_numbers!("duecentomila quattordicimila", "200000 14000"); + assert_replace_numbers!("venti-uno", "venti-uno"); + assert_replace_numbers!("ventuno", "21"); + assert_replace_numbers!("venti uno", "20 1"); + assert_replace_numbers!("novanta cinque, settanta cinque", "90 5, 70 5"); + assert_replace_numbers!("novanta uno, settanta uno", "90 1, 70 1"); + } + + #[test] + fn test_replace_numbers_formal() { + assert_replace_numbers!( + "zero nove sessanta zero sei dodici ventuno", + "09 60 06 12 21" + ); + assert_replace_numbers!("zero uno millenovecentonovanta", "01 1990"); + assert_replace_numbers!("zero uno cento", "01 100"); + } + + #[test] + fn test_trente_et_onze() { + assert_replace_numbers!("cinquanta sessanta trenta e dodici", "50 60 30 e 12"); + } + + #[test] + fn test_replace_numbers_zero() { + assert_replace_numbers!("tredicimila zero novanta", "13000 090"); + assert_replace_numbers!("tredicimila zero ottanta", "13000 080"); + assert_replace_numbers!("zero", "zero"); + assert_replace_all_numbers!("zero", "0"); + assert_replace_numbers!("zero cinque", "05"); + assert_replace_numbers!("zero, cinque", "0, 5"); + assert_replace_numbers!("sette uno zero", "7 1 0"); + assert_replace_numbers!("Il vostro servizio è zero!", "Il vostro servizio è zero!"); + assert_replace_numbers!( + "a a uno tre sette tre tre sette cinque quattro zero c c", + "a a 1 3 7 3 3 7 5 4 0 c c" + ); + } + + #[test] + fn test_replace_numbers_ordinals() { + assert_replace_numbers!( + "Quinto secondo terzo ventunesimo centesimo milleduecentotrentesimo.", + "5º 2º 3º 21º 100º 1230º." + ); + assert_replace_numbers!("prima seconda", "1ª 2ª"); + assert_replace_numbers!("cinquecentounesimo", "501º"); + assert_replace_numbers!("cinquecento primi", "500 primi"); + assert_replace_numbers!("cinquecento primo", "500 primo"); + assert_replace_numbers!("primo secondo", "primo secondo"); + } + + #[test] + fn test_replace_numbers_decimals() { + assert_replace_numbers!( + "dodici virgola novantanove, centoventi virgola zero cinque, uno virgola duecentotrentasei, uno virgola due tre sei.", + "12,99, 120,05, 1,236, 1,2 3 6." + ); + assert_replace_numbers!("zero virgola centododici", "0,112"); + assert_replace_numbers!( + "la densità media è di zero virgola cinque.", + "la densità media è di 0,5." + ); + assert_replace_numbers!("Dico virgola cinque", "Dico virgola cinque"); + } + + #[test] + fn test_isolates() { + assert_replace_numbers!( + "Un articolo o un pronome non devono essere sostituiti.", + "Un articolo o un pronome non devono essere sostituiti." + ); + assert_replace_numbers!("Uno como l'altro.", "Uno como l'altro."); + // I'm not totally sure for this one... + assert_replace_all_numbers!( + "Un articolo o un pronome non devono essere sostituiti..", + "1 articolo o 1 pronome non devono essere sostituiti.." + ); + assert_replace_all_numbers!("Uno como l'altro.", "1 como l'altro."); + assert_replace_numbers!( + "Ma possiamo sostituire una sequenza: uno, due, tre.", + "Ma possiamo sostituire una sequenza: 1, 2, 3." + ); + assert_replace_numbers!( + "Il mio primo arriva prima del secondo e del terzo", + "Il mio primo arriva prima del secondo e del terzo" + ); + assert_replace_all_numbers!( + "Il mio primo arriva prima del secondo e del terzo", + "Il mio 1° arriva 1° del 2° e del 3°" + ); + assert_replace_numbers!("Una dodicesima prova", "Una 12° prova"); + assert_replace_numbers!("Primo, secondo, terzo", "1°, 2°, 3°"); + assert_replace_numbers!("un po' d'acqua", "un po' d'acqua"); + assert_replace_numbers!("un po' meno", "un po' meno"); + // assert_replace_numbers!("dodici é un po' di piu", "11 é un po' di piu"); + + assert_replace_all_numbers!("allogio nuovo", "allogio nuovo"); + assert_replace_all_numbers!("allogio nove", "allogio 9"); + assert_replace_all_numbers!("allogio nove due sette", "allogio 9 2 7"); + } + + #[test] + fn test_isolates_with_noise() { + assert_replace_numbers!( + "poi due e tre più cinque ehm sei poi sette e ancora otto meno quattro è ben tre", + "poi 2 e 3 più 5 ehm 6 poi 7 e ancora 8 meno 4 è ben 3" + ); + } +} diff --git a/src/lang/it/vocabulary.rs b/src/lang/it/vocabulary.rs new file mode 100644 index 0000000..a8c70ef --- /dev/null +++ b/src/lang/it/vocabulary.rs @@ -0,0 +1,5 @@ +use phf::{phf_set, Set}; + +pub static INSIGNIFICANT: Set<&'static str> = phf_set! { + "a" +}; diff --git a/src/lang/mod.rs b/src/lang/mod.rs index adc284a..15d32bc 100644 --- a/src/lang/mod.rs +++ b/src/lang/mod.rs @@ -26,6 +26,7 @@ mod de; mod en; mod es; mod fr; +mod it; use crate::digit_string::DigitString; @@ -35,6 +36,7 @@ pub use de::German; pub use en::English; pub use es::Spanish; pub use fr::French; +pub use it::Italian; /// Model the Morphological markers that differenciate ordinals or fractions from cardinals, /// and that must be retained on the digit form. @@ -124,6 +126,7 @@ pub enum Language { English(English), French(French), German(German), + Italian(Italian), Spanish(Spanish), } @@ -140,6 +143,10 @@ impl Language { Language::German(German::default()) } + pub fn italian() -> Self { + Language::Italian(Italian::default()) + } + pub fn spanish() -> Self { Language::Spanish(Spanish::default()) } @@ -212,5 +219,5 @@ macro_rules! delegate { } impl LangInterpretor for Language { - delegate!(French, English, German, Spanish); + delegate!(French, English, German, Italian, Spanish); } From 2ece052c0b88a2d35f40fb3c195d42f3dffc26e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Thu, 27 Jun 2024 18:06:17 +0200 Subject: [PATCH 4/8] Complete italian support. --- src/lang/de/vocabulary.rs | 2 +- src/lang/it/mod.rs | 34 +++++++++++++++++++--------------- src/lang/it/vocabulary.rs | 2 +- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/lang/de/vocabulary.rs b/src/lang/de/vocabulary.rs index 4fa9d14..fb5a8d2 100644 --- a/src/lang/de/vocabulary.rs +++ b/src/lang/de/vocabulary.rs @@ -1,5 +1,5 @@ use phf::{phf_set, Set}; pub static INSIGNIFICANT: Set<&'static str> = phf_set! { - "und", "so", "ach", "doch", "ja", "ok" + "aber", "ah", "äh", "ähm", "also", "gut", "auch", "denn", "doch", "dort", "eben", "eh", "halt", "ja", "mal", "sehen", "naja", "nun", "ok", "schon", "so", "genau", "und", "noch" }; diff --git a/src/lang/it/mod.rs b/src/lang/it/mod.rs index 5388bc8..46daefd 100644 --- a/src/lang/it/mod.rs +++ b/src/lang/it/mod.rs @@ -28,7 +28,8 @@ fn lemmatize(word: &str) -> &str { | "ttav" | "non" | "decim" - ) || candidate.ends_with("esim") + ) && word != "secondi" + || candidate.ends_with("esim") { candidate } else { @@ -97,7 +98,7 @@ impl LangInterpretor for Italian { } let status = match lemmatize(num_func) { "zero" => b.put(b"0"), - "un" | "uno" | "una" | "unesim" if b.peek(2) != b"10" => b.put(b"1"), + "un" | "uno" | "una" | "unesim" if b.is_free(2) => b.put(b"1"), "prim" if b.is_empty() => b.put(b"1"), "due" | "duesim" if b.peek(2) != b"10" => b.put(b"2"), "second" if b.is_empty() => b.put(b"2"), @@ -111,10 +112,10 @@ impl LangInterpretor for Italian { "sest" if b.is_empty() => b.put(b"6"), "sette" | "settesim" if b.peek(2) != b"10" => b.put(b"7"), "settim" if b.is_empty() => b.put(b"7"), - "otto" | "tto" | "ottesim" | "ttesim" if b.peek(2) != b"10" => b.put(b"8"), + "otto" | "tto" | "ottesim" | "ttesim" if b.is_free(2) => b.put(b"8"), "ottav" if b.is_empty() => b.put(b"8"), "nove" | "novesim" if b.peek(2) != b"10" => b.put(b"9"), - "non" if b.is_empty() => b.put(b"9"), + "non" if b.is_empty() && num_func != "non" => b.put(b"9"), "dieci" | "decim" => b.put(b"10"), "undici" | "undicesim" => b.put(b"11"), "dodici" | "dodicesim" => b.put(b"12"), @@ -377,7 +378,7 @@ mod tests { ); assert_text2digits!( - "cinquantuno milioni cinquecentosettantaottomilatrecentodue", + "cinquantuno milioni cinquecentosettantottomilatrecentodue", "51578302" ); @@ -393,9 +394,9 @@ mod tests { #[test] fn test_apply_variants() { - assert_text2digits!("novantaotto", "98"); - assert_text2digits!("settantaotto", "78"); - assert_text2digits!("ottantaotto", "88"); + assert_text2digits!("novantotto", "98"); + assert_text2digits!("settantotto", "78"); + assert_text2digits!("ottantotto", "88"); assert_text2digits!("ottantuno", "81"); assert_text2digits!("ottanta", "80"); assert_text2digits!("millenovecentoventi", "1920"); @@ -420,7 +421,7 @@ mod tests { #[test] fn test_ordinals() { assert_text2digits!("venticinquesimo", "25º"); - assert_text2digits!("ventiunesimo", "21º"); + assert_text2digits!("ventunesimo", "21º"); assert_text2digits!("venticinquesimi", "25º"); assert_text2digits!("ventunesimi", "21º"); } @@ -438,7 +439,7 @@ mod tests { #[test] fn test_invalid() { - // I've translated litteraly + assert_invalid!("ventiunesimo"); assert_invalid!("mille mille duecento"); assert_invalid!("sessanta quindici"); assert_invalid!("quaranta dodici"); @@ -447,8 +448,10 @@ mod tests { assert_invalid!("dici primo"); assert_invalid!("ventesimo cinque"); assert_invalid!("venti uno"); + assert_invalid!("venti otto"); assert_invalid!("zero zero trenta quattro venti"); assert_invalid!("ottanta diciotto"); + assert_invalid!("novantaotto"); } #[test] @@ -470,7 +473,7 @@ mod tests { assert_replace_numbers!("venti-uno", "venti-uno"); assert_replace_numbers!("ventuno", "21"); assert_replace_numbers!("venti uno", "20 1"); - assert_replace_numbers!("novanta cinque, settanta cinque", "90 5, 70 5"); + assert_replace_numbers!("novanta cinque, settanta cinque", "95, 75"); assert_replace_numbers!("novanta uno, settanta uno", "90 1, 70 1"); } @@ -515,7 +518,8 @@ mod tests { assert_replace_numbers!("cinquecentounesimo", "501º"); assert_replace_numbers!("cinquecento primi", "500 primi"); assert_replace_numbers!("cinquecento primo", "500 primo"); - assert_replace_numbers!("primo secondo", "primo secondo"); + assert_replace_numbers!("un secondo", "un secondo"); + assert_replace_numbers!("due secondi", "due secondi"); } #[test] @@ -555,10 +559,10 @@ mod tests { ); assert_replace_all_numbers!( "Il mio primo arriva prima del secondo e del terzo", - "Il mio 1° arriva 1° del 2° e del 3°" + "Il mio 1º arriva 1ª del 2º e del 3º" ); - assert_replace_numbers!("Una dodicesima prova", "Una 12° prova"); - assert_replace_numbers!("Primo, secondo, terzo", "1°, 2°, 3°"); + assert_replace_numbers!("Una dodicesima prova", "Una 12ª prova"); + assert_replace_numbers!("Primo, secondo, terzo", "1º, 2º, 3º"); assert_replace_numbers!("un po' d'acqua", "un po' d'acqua"); assert_replace_numbers!("un po' meno", "un po' meno"); // assert_replace_numbers!("dodici é un po' di piu", "11 é un po' di piu"); diff --git a/src/lang/it/vocabulary.rs b/src/lang/it/vocabulary.rs index a8c70ef..c9e3fad 100644 --- a/src/lang/it/vocabulary.rs +++ b/src/lang/it/vocabulary.rs @@ -1,5 +1,5 @@ use phf::{phf_set, Set}; pub static INSIGNIFICANT: Set<&'static str> = phf_set! { - "a" + "e", "ehm", "più", "poi", "ancora", "meno", "è", "ben" }; From a049e025497bfbd1e27964284e5396afaeb304b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Thu, 27 Jun 2024 18:06:33 +0200 Subject: [PATCH 5/8] Updated licence. --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index b379444..3e3cb3e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2021 Groupe Allo-Media +Copyright (c) 2021-2024 Groupe Allo-Media Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 5ddad968a8879cedd1a4d1e78d08e9c10e6d1ba9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Fri, 28 Jun 2024 10:56:00 +0200 Subject: [PATCH 6/8] Next version is 2.3.0 --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 215a3ed..8574e03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,10 @@ [package] name = "text2num" -version = "2.2.0" +version = "2.3.0" authors = ["Allo-Media "] edition = "2021" license = "MIT" -description = "Parse and convert numbers written in English, Spanish, German or French into their digit representation." +description = "Parse and convert numbers written in English, Spanish, German, Italian or French into their digit representation." keywords = ["NLP", "words-to-numbers"] categories = ["text-processing"] repository = "https://github.com/allo-media/text2num-rs" From 2d0591655e7dd9297fabd9f82b16a9b4a6065640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Fri, 28 Jun 2024 11:14:25 +0200 Subject: [PATCH 7/8] Removed point in Spanish abbreviated cardinals. --- src/lang/es/mod.rs | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/lang/es/mod.rs b/src/lang/es/mod.rs index b9d8bf3..54e5929 100644 --- a/src/lang/es/mod.rs +++ b/src/lang/es/mod.rs @@ -147,17 +147,17 @@ impl LangInterpretor for Spanish { "primer" => MorphologicalMarker::Ordinal(".ᵉʳ"), "primero" | "segundo" | "tercero" | "cuarto" | "quinto" | "sexto" | "séptimo" | "octavo" | "ctavo" | "noveno" => { - MorphologicalMarker::Ordinal(if is_plur { ".ᵒˢ" } else { ".º" }) + MorphologicalMarker::Ordinal(if is_plur { "ᵒˢ" } else { "º" }) } "primera" | "segunda" | "tercera" | "cuarta" | "quinta" | "sexta" | "séptima" | "octava" | "ctava" | "novena" => { - MorphologicalMarker::Ordinal(if is_plur { ".ᵃˢ" } else { ".ª" }) + MorphologicalMarker::Ordinal(if is_plur { "ᵃˢ" } else { "ª" }) } ord if ord.ends_with("imo") => { - MorphologicalMarker::Ordinal(if is_plur { ".ᵒˢ" } else { ".º" }) + MorphologicalMarker::Ordinal(if is_plur { "ᵒˢ" } else { "º" }) } ord if ord.ends_with("ima") => { - MorphologicalMarker::Ordinal(if is_plur { ".ᵃˢ" } else { ".ª" }) + MorphologicalMarker::Ordinal(if is_plur { "ᵃˢ" } else { "ª" }) } ord if ord.ends_with("avo") => MorphologicalMarker::Fraction("avo"), _ => MorphologicalMarker::None, @@ -273,19 +273,20 @@ mod tests { fn test_variants() { assert_text2digits!("un millon", "1000000"); assert_text2digits!("un millón", "1000000"); - assert_text2digits!("décimo primero", "11.º"); - assert_text2digits!("decimoprimero", "11.º"); - assert_text2digits!("undécimo", "11.º"); - assert_text2digits!("décimo segundo", "12.º"); - assert_text2digits!("decimosegundo", "12.º"); - assert_text2digits!("duodécimo", "12.º"); + assert_text2digits!("décimo primero", "11º"); + assert_text2digits!("decimoprimero", "11º"); + assert_text2digits!("undécimo", "11º"); + assert_text2digits!("décimo segundo", "12º"); + assert_text2digits!("decimosegundo", "12º"); + assert_text2digits!("duodécimo", "12º"); } #[test] fn test_ordinals() { - assert_text2digits!("vigésimo cuarto", "24.º"); - assert_text2digits!("vigésimo primero", "21.º"); - assert_text2digits!("decimosexta", "16.ª"); + assert_text2digits!("vigésimo cuarto", "24º"); + assert_text2digits!("vigésimo primero", "21º"); + assert_text2digits!("ciento primero", "101º"); + assert_text2digits!("decimosexta", "16ª"); assert_text2digits!("decimosextas", "16.ᵃˢ"); assert_text2digits!("decimosextos", "16.ᵒˢ"); } @@ -306,6 +307,7 @@ mod tests { assert_invalid!("cincuenta cero tres"); assert_invalid!("cincuenta y tres cero"); assert_invalid!("diez cero"); + assert_invalid!("cero uno"); } #[test] @@ -365,22 +367,22 @@ mod tests { fn test_replace_numbers_ordinals() { assert_replace_numbers!( "Cuarto quinto segundo tercero vigésimo primero centésimo milésimo ducentésimo trigésimo.", - "4.º 5.º segundo 3.º 21.º 100230.º." + "4º 5º segundo 3º 21º 100230º." ); - assert_replace_numbers!("centésimo trigésimo segundo", "132.º"); - assert_replace_numbers!("centésimo, trigésimo, segundo", "100.º, 30.º, segundo"); + assert_replace_numbers!("centésimo trigésimo segundo", "132º"); + assert_replace_numbers!("centésimo, trigésimo, segundo", "100º, 30º, segundo"); assert_replace_numbers!( "Un segundo por favor! Vigésimo segundo es diferente que veinte segundos.", - "Un segundo por favor! 22.º es diferente que 20 segundos." + "Un segundo por favor! 22º es diferente que 20 segundos." ); assert_replace_numbers!( "Un segundo por favor! Vigésimos segundos es diferente que veinte segundos.", - "Un segundo por favor! 22.ᵒˢ es diferente que 20 segundos." + "Un segundo por favor! 22ᵒˢ es diferente que 20 segundos." ); - assert_replace_all_numbers!("Él ha quedado tercero", "Él ha quedado 3.º"); - assert_replace_all_numbers!("Ella ha quedado tercera", "Ella ha quedado 3.ª"); - assert_replace_all_numbers!("Ellos han quedado terceros", "Ellos han quedado 3.ᵒˢ"); - assert_replace_all_numbers!("Ellas han quedado terceras", "Ellas han quedado 3.ᵃˢ"); + assert_replace_all_numbers!("Él ha quedado tercero", "Él ha quedado 3º"); + assert_replace_all_numbers!("Ella ha quedado tercera", "Ella ha quedado 3ª"); + assert_replace_all_numbers!("Ellos han quedado terceros", "Ellos han quedado 3ᵒˢ"); + assert_replace_all_numbers!("Ellas han quedado terceras", "Ellas han quedado 3ᵃˢ"); } #[test] From f38def07b8f8deb6fa6dddca6437b6ce80ada179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Fri, 28 Jun 2024 11:32:17 +0200 Subject: [PATCH 8/8] [Spanish] More tests. --- src/lang/es/mod.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/lang/es/mod.rs b/src/lang/es/mod.rs index 54e5929..157cc88 100644 --- a/src/lang/es/mod.rs +++ b/src/lang/es/mod.rs @@ -285,10 +285,10 @@ mod tests { fn test_ordinals() { assert_text2digits!("vigésimo cuarto", "24º"); assert_text2digits!("vigésimo primero", "21º"); - assert_text2digits!("ciento primero", "101º"); + assert_text2digits!("centésimo primero", "101º"); assert_text2digits!("decimosexta", "16ª"); - assert_text2digits!("decimosextas", "16.ᵃˢ"); - assert_text2digits!("decimosextos", "16.ᵒˢ"); + assert_text2digits!("decimosextas", "16ᵃˢ"); + assert_text2digits!("decimosextos", "16ᵒˢ"); } #[test] @@ -301,13 +301,13 @@ mod tests { #[test] fn test_zeroes() { assert_text2digits!("cero", "0"); + assert_text2digits!("cero uno", "01"); assert_text2digits!("cero ocho", "08"); assert_text2digits!("cero cero ciento veinticinco", "00125"); assert_invalid!("cinco cero"); assert_invalid!("cincuenta cero tres"); assert_invalid!("cincuenta y tres cero"); assert_invalid!("diez cero"); - assert_invalid!("cero uno"); } #[test] @@ -360,6 +360,7 @@ mod tests { assert_replace_numbers!("trece mil cero noventa", "13000 090"); assert_replace_numbers!("cero", "cero"); assert_replace_numbers!("cero cinco", "05"); + assert_replace_numbers!("cero uno ochenta y cinco", "01 85"); assert_replace_numbers!("cero, cinco", "0, 5"); }