Skip to content

Commit

Permalink
Merge pull request #35 from allo-media/master
Browse files Browse the repository at this point in the history
Version 2.3.0: Italian.
  • Loading branch information
rtxm authored Jun 28, 2024
2 parents fdee28d + 4ec9fdc commit c41fddf
Show file tree
Hide file tree
Showing 11 changed files with 651 additions and 34 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[package]
name = "text2num"
version = "2.2.0"
version = "2.3.0"
authors = ["Allo-Media <[email protected]>"]
edition = "2021"
license = "MIT"
description = "Parse and convert numbers written in English, Spanish, German or French into their digit representation."
description = "Parse and convert numbers written in English, Spanish, German, Italian or French into their digit representation."
keywords = ["NLP", "words-to-numbers"]
categories = ["text-processing"]
repository = "https://github.com/allo-media/text2num-rs"
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2021 Groupe Allo-Media
Copyright (c) 2021-2024 Groupe Allo-Media

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
1 change: 1 addition & 0 deletions src/digit_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ impl DigitString {
Ok(())
};
}
// maybe subpart of a bigger number
let mut padding_zeroes = self.buffer[(l - positions)..]
.iter()
.take_while(|&c| *c == b'0')
Expand Down
2 changes: 1 addition & 1 deletion src/lang/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ impl LangInterpretor for German {
}
}
"tausend" | "tausendste" if b.is_range_free(3, 5) => b.shift(3),
"million" | "millionen" | "millionste" => b.shift(6),
"million" | "millionen" | "millionste" if b.is_range_free(6, 8) => b.shift(6),
"milliarde" | "milliarden" | "milliardste" => b.shift(9),
"billion" | "billionste" => b.shift(12),
"und" => Err(Error::Incomplete),
Expand Down
2 changes: 1 addition & 1 deletion src/lang/de/vocabulary.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use phf::{phf_set, Set};

pub static INSIGNIFICANT: Set<&'static str> = phf_set! {
"und", "so", "ach", "doch", "ja"
"aber", "ah", "äh", "ähm", "also", "gut", "auch", "denn", "doch", "dort", "eben", "eh", "halt", "ja", "mal", "sehen", "naja", "nun", "ok", "schon", "so", "genau", "und", "noch"
};
3 changes: 2 additions & 1 deletion src/lang/en/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ impl LangInterpretor for English {
}
}
"thousand" | "thousandth" if b.is_range_free(3, 5) => b.shift(3),
"million" | "millionth" => b.shift(6),
"million" | "millionth" if b.is_range_free(6, 8) => b.shift(6),
"billion" | "billionth" => b.shift(9),
"and" if b.len() >= 2 => Err(Error::Incomplete),

Expand Down Expand Up @@ -346,6 +346,7 @@ mod tests {
one hundred twenty point o five, one point two hundred thirty-six, one point two three six.",
"12.99, 120.05, 120.05, 1.2 136, 1.236."
);
assert_replace_numbers!("I say point three", "I say point three");
}

#[test]
Expand Down
55 changes: 30 additions & 25 deletions src/lang/es/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,9 @@ impl LangInterpretor for Spanish {
"ochociento" | "ochocienta" | "octingentésimo" | "octingentésima" => b.put(b"800"),
"noveciento" | "novecienta" | "noningentésimo" | "noningentésima" => b.put(b"900"),
"mil" | "milésimo" | "milésima" if b.is_range_free(3, 5) => b.shift(3),
"millon" | "millón" | "millonésimo" | "millonésima" => b.shift(6),
"millon" | "millón" | "millonésimo" | "millonésima" if b.is_range_free(6, 8) => {
b.shift(6)
}
"y" if b.len() >= 2 => Err(Error::Incomplete),

_ => Err(Error::NaN),
Expand Down Expand Up @@ -145,17 +147,17 @@ impl LangInterpretor for Spanish {
"primer" => MorphologicalMarker::Ordinal(".ᵉʳ"),
"primero" | "segundo" | "tercero" | "cuarto" | "quinto" | "sexto" | "séptimo"
| "octavo" | "ctavo" | "noveno" => {
MorphologicalMarker::Ordinal(if is_plur { ".ᵒˢ" } else { ".º" })
MorphologicalMarker::Ordinal(if is_plur { "ᵒˢ" } else { "º" })
}
"primera" | "segunda" | "tercera" | "cuarta" | "quinta" | "sexta" | "séptima"
| "octava" | "ctava" | "novena" => {
MorphologicalMarker::Ordinal(if is_plur { ".ᵃˢ" } else { ".ª" })
MorphologicalMarker::Ordinal(if is_plur { "ᵃˢ" } else { "ª" })
}
ord if ord.ends_with("imo") => {
MorphologicalMarker::Ordinal(if is_plur { ".ᵒˢ" } else { ".º" })
MorphologicalMarker::Ordinal(if is_plur { "ᵒˢ" } else { "º" })
}
ord if ord.ends_with("ima") => {
MorphologicalMarker::Ordinal(if is_plur { ".ᵃˢ" } else { ".ª" })
MorphologicalMarker::Ordinal(if is_plur { "ᵃˢ" } else { "ª" })
}
ord if ord.ends_with("avo") => MorphologicalMarker::Fraction("avo"),
_ => MorphologicalMarker::None,
Expand Down Expand Up @@ -271,21 +273,22 @@ mod tests {
fn test_variants() {
assert_text2digits!("un millon", "1000000");
assert_text2digits!("un millón", "1000000");
assert_text2digits!("décimo primero", "11.º");
assert_text2digits!("decimoprimero", "11.º");
assert_text2digits!("undécimo", "11.º");
assert_text2digits!("décimo segundo", "12.º");
assert_text2digits!("decimosegundo", "12.º");
assert_text2digits!("duodécimo", "12.º");
assert_text2digits!("décimo primero", "11º");
assert_text2digits!("decimoprimero", "11º");
assert_text2digits!("undécimo", "11º");
assert_text2digits!("décimo segundo", "12º");
assert_text2digits!("decimosegundo", "12º");
assert_text2digits!("duodécimo", "12º");
}

#[test]
fn test_ordinals() {
assert_text2digits!("vigésimo cuarto", "24.º");
assert_text2digits!("vigésimo primero", "21.º");
assert_text2digits!("decimosexta", "16.ª");
assert_text2digits!("decimosextas", "16.ᵃˢ");
assert_text2digits!("decimosextos", "16.ᵒˢ");
assert_text2digits!("vigésimo cuarto", "24º");
assert_text2digits!("vigésimo primero", "21º");
assert_text2digits!("centésimo primero", "101º");
assert_text2digits!("decimosexta", "16ª");
assert_text2digits!("decimosextas", "16ᵃˢ");
assert_text2digits!("decimosextos", "16ᵒˢ");
}

#[test]
Expand All @@ -298,6 +301,7 @@ mod tests {
#[test]
fn test_zeroes() {
assert_text2digits!("cero", "0");
assert_text2digits!("cero uno", "01");
assert_text2digits!("cero ocho", "08");
assert_text2digits!("cero cero ciento veinticinco", "00125");
assert_invalid!("cinco cero");
Expand Down Expand Up @@ -356,29 +360,30 @@ mod tests {
assert_replace_numbers!("trece mil cero noventa", "13000 090");
assert_replace_numbers!("cero", "cero");
assert_replace_numbers!("cero cinco", "05");
assert_replace_numbers!("cero uno ochenta y cinco", "01 85");
assert_replace_numbers!("cero, cinco", "0, 5");
}

#[test]
fn test_replace_numbers_ordinals() {
assert_replace_numbers!(
"Cuarto quinto segundo tercero vigésimo primero centésimo milésimo ducentésimo trigésimo.",
"4.º 5.º segundo 3.º 21.º 100230.º."
"4º 5º segundo 3º 21º 100230º."
);
assert_replace_numbers!("centésimo trigésimo segundo", "132.º");
assert_replace_numbers!("centésimo, trigésimo, segundo", "100.º, 30.º, segundo");
assert_replace_numbers!("centésimo trigésimo segundo", "132º");
assert_replace_numbers!("centésimo, trigésimo, segundo", "100º, 30º, segundo");
assert_replace_numbers!(
"Un segundo por favor! Vigésimo segundo es diferente que veinte segundos.",
"Un segundo por favor! 22.º es diferente que 20 segundos."
"Un segundo por favor! 22º es diferente que 20 segundos."
);
assert_replace_numbers!(
"Un segundo por favor! Vigésimos segundos es diferente que veinte segundos.",
"Un segundo por favor! 22.ᵒˢ es diferente que 20 segundos."
"Un segundo por favor! 22ᵒˢ es diferente que 20 segundos."
);
assert_replace_all_numbers!("Él ha quedado tercero", "Él ha quedado 3.º");
assert_replace_all_numbers!("Ella ha quedado tercera", "Ella ha quedado 3.ª");
assert_replace_all_numbers!("Ellos han quedado terceros", "Ellos han quedado 3.ᵒˢ");
assert_replace_all_numbers!("Ellas han quedado terceras", "Ellas han quedado 3.ᵃˢ");
assert_replace_all_numbers!("Él ha quedado tercero", "Él ha quedado 3º");
assert_replace_all_numbers!("Ella ha quedado tercera", "Ella ha quedado 3ª");
assert_replace_all_numbers!("Ellos han quedado terceros", "Ellos han quedado 3ᵒˢ");
assert_replace_all_numbers!("Ellas han quedado terceras", "Ellas han quedado 3ᵃˢ");
}

#[test]
Expand Down
20 changes: 18 additions & 2 deletions src/lang/fr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ impl LangInterpretor for French {
}
"cent" | "centième" => {
let peek = b.peek(2);
if (peek.len() == 1 || peek < b"20") && peek != b"1" {
if (peek.len() == 1 || peek < b"20") && peek != b"1" && peek != b"01" {
b.shift(2)
} else {
Err(Error::Overlap)
Expand All @@ -171,7 +171,7 @@ impl LangInterpretor for French {
b.shift(3)
}
}
"million" | "millionième" => b.shift(6),
"million" | "millionième" if b.is_range_free(6, 8) => b.shift(6),
"milliard" | "milliardième" => b.shift(9),
"et" if b.len() >= 2 => Err(Error::Incomplete),

Expand Down Expand Up @@ -290,16 +290,26 @@ mod tests {

#[test]
fn test_apply() {
assert_text2digits!(
"cinquante trois mille millions deux cent quarante-trois mille sept cent vingt-quatre",
"53000243724"
);

assert_text2digits!(
"cinquante trois mille millions deux cent quarante trois mille sept cent vingt quatre",
"53000243724"
);

assert_text2digits!(
"cinquante et un million cinq cent soixante-dix-huit mille trois cent deux",
"51578302"
);
assert_text2digits!(
"cinquante et un million cinq cent soixante dix huit mille trois cent deux",
"51578302"
);

assert_text2digits!("quatre-vingt-cinq", "85");
assert_text2digits!("quatre vingt cinq", "85");

assert_text2digits!("quatre vingt un", "81");
Expand Down Expand Up @@ -328,17 +338,20 @@ mod tests {

#[test]
fn test_centuries() {
assert_text2digits!("dix neuf cent soixante-treize", "1973");
assert_text2digits!("dix neuf cent soixante treize", "1973");
}

#[test]
fn test_ordinals() {
assert_text2digits!("vingt-cinquième", "25ème");
assert_text2digits!("vingt cinquième", "25ème");
assert_text2digits!("vingt et unième", "21ème");
}

#[test]
fn test_fractions() {
assert_text2digits!("vingt-cinquièmes", "25èmes");
assert_text2digits!("vingt cinquièmes", "25èmes");
assert_text2digits!("vingt et unièmes", "21èmes");
}
Expand All @@ -347,6 +360,7 @@ mod tests {
fn test_zeroes() {
assert_text2digits!("zéro", "0");
assert_text2digits!("zéro huit", "08");
assert_text2digits!("zéro zéro cent vingt-cinq", "00125");
assert_text2digits!("zéro zéro cent vingt cinq", "00125");
assert_invalid!("cinq zéro");
assert_invalid!("cinquante zéro trois");
Expand All @@ -366,6 +380,7 @@ mod tests {
assert_invalid!("vingt un");
assert_invalid!("zéro zéro trente quatre vingt");
assert_invalid!("quatre-vingt dix-huit");
assert_invalid!("mille un cent");
}

#[test]
Expand Down Expand Up @@ -451,6 +466,7 @@ mod tests {
"la densité moyenne est de zéro virgule cinq.",
"la densité moyenne est de 0,5."
);
assert_replace_numbers!("Je dis virgule cinq", "Je dis virgule cinq");
}

#[test]
Expand Down
Loading

0 comments on commit c41fddf

Please sign in to comment.