diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c1ec629..80a6536 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,6 +31,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 @@ -69,6 +70,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 @@ -110,6 +112,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 @@ -144,6 +147,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 @@ -190,6 +194,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 @@ -234,6 +239,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 diff --git a/Cargo.toml b/Cargo.toml index f8542d9..798af40 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,18 +1,18 @@ [package] name = "py_rust_stemmers" -version = "0.1.5" +version = "0.1.6" edition = "2021" [lib] crate-type = ["cdylib"] [dependencies] -pyo3 = { version = "0.18", features = ["extension-module"] } +pyo3 = { version = "0.25", features = ["extension-module"] } rust-stemmers = "1.2.0" rayon = "1.6" [dev-dependencies] -pyo3 = { version = "0.18", features = ["extension-module"] } +pyo3 = { version = "0.25", features = ["extension-module"] } [profile.release] opt-level = 3 diff --git a/pyproject.toml b/pyproject.toml index 4e7f0d7..ffe1a6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "py_rust_stemmers" -version = "0.1.5" +version = "0.1.6" description = "Fast and parallel snowball stemmer" # Include a long description diff --git a/src/lib.rs b/src/lib.rs index fc84d57..1c576c0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,7 @@ pub struct SnowballStemmer { 
#[pymethods] impl SnowballStemmer { #[new] - fn new(lang: &str) -> Self { + fn new(lang: &str) -> PyResult<Self> { let algorithm = match lang.to_lowercase().as_str() { "arabic" => Algorithm::Arabic, "danish" => Algorithm::Danish, @@ -33,10 +33,11 @@ impl SnowballStemmer { "swedish" => Algorithm::Swedish, "tamil" => Algorithm::Tamil, "turkish" => Algorithm::Turkish, - _ => panic!("Unsupported language: {}", lang), + // throw exception instead of crashing, preserve prior test behavior + _ => return Err(pyo3::exceptions::PyValueError::new_err(format!("Unsupported language: {}", lang))), }; let stemmer = Stemmer::create(algorithm); - SnowballStemmer { stemmer } + Ok(SnowballStemmer { stemmer }) } #[inline(always)] @@ -45,18 +46,23 @@ impl SnowballStemmer { } #[inline(always)] - pub fn stem_words_parallel(&self, inputs: Vec<&str>) -> Vec<String> { - inputs - .into_par_iter() - .map(|word| self.stemmer.stem(word).into_owned()) - .collect() + pub fn stem_words_parallel(&self, py: Python<'_>, inputs: Vec<String>) -> PyResult<Vec<String>> { + // release GIL + py.allow_threads(|| { + let result = inputs + .par_iter() + .map(|word| self.stemmer.stem(word.as_str()).into_owned()) + .collect(); + Ok(result) + }) } + // refactor to Vec<String> based on the discussion(s) here: https://github.com/PyO3/pyo3/discussions/4830 #[inline(always)] - pub fn stem_words(&self, inputs: Vec<&str>) -> Vec<String> { + pub fn stem_words(&self, inputs: Vec<String>) -> Vec<String> { inputs - .into_iter() - .map(|word| self.stemmer.stem(word)) + .iter() + .map(|word| self.stemmer.stem(word.as_str())) .map(|stemmed| stemmed.into_owned()) .collect() } @@ -64,7 +70,7 @@ impl SnowballStemmer { /// This module is required for the Python interpreter to access the Rust functions. 
#[pymodule] -fn py_rust_stemmers(_py: Python, m: &PyModule) -> PyResult<()> { +fn py_rust_stemmers(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::<SnowballStemmer>()?; Ok(()) -} +} \ No newline at end of file diff --git a/tests/speedtest.py b/tests/speedtest.py index 3867d5d..1c1eeb9 100644 --- a/tests/speedtest.py +++ b/tests/speedtest.py @@ -31,8 +31,9 @@ s = stemmer('english') b = time.perf_counter() -for _ in range(loops): +# only time a fraction of these, 500k takes 10 minutes +for _ in range(loops // 100): for word in words: - stemmed = s.stemWord(word.encode('utf-8')) -print("Time taken snowballstemmer with PyStemmer installed:", time.perf_counter() - b) + stemmed = s.stemWord(word) +print("Time taken snowballstemmer with PyStemmer installed:", (time.perf_counter() - b) * 100) diff --git a/tests/test_py_rust_stemers.py b/tests/test_py_rust_stemmers.py similarity index 65% rename from tests/test_py_rust_stemers.py rename to tests/test_py_rust_stemmers.py index 2d397ab..9a62587 100644 --- a/tests/test_py_rust_stemers.py +++ b/tests/test_py_rust_stemmers.py @@ -1,30 +1,32 @@ import unittest -import py_rust_stemers +from py_rust_stemmers import SnowballStemmer class TestRustStemmer(unittest.TestCase): - + def test_english_stemming(self): + s = SnowballStemmer('english') words = ["fruitlessly", "happiness", "computations"] expected = ["fruitless", "happi", "comput"] - result = py_rust_stemers.rust_stem(words, "english") + result = [s.stem_word(w) for w in words] self.assertEqual(result, expected) def test_spanish_stemming(self): + s = SnowballStemmer('spanish') words = ["frutalmente", "felicidad", "computaciones"] expected = ["frutal", "felic", "comput"] - result = py_rust_stemers.rust_stem(words, "spanish") + result = [s.stem_word(w) for w in words] self.assertEqual(result, expected) def test_empty_input(self): - words = [] - expected = [] - result = py_rust_stemers.rust_stem(words, "english") + s = SnowballStemmer('english') + expected = [''] + result = 
[s.stem_word("")] self.assertEqual(result, expected) def test_invalid_language(self): words = ["fruitlessly", "happiness", "computations"] with self.assertRaises(ValueError): - py_rust_stemers.rust_stem(words, "invalid_lang") + s = SnowballStemmer('invalid_lang') if __name__ == '__main__': unittest.main()