From 767d43552e43264467c90c7a5e903d09e9d2e449 Mon Sep 17 00:00:00 2001 From: 0xDEADFED5 Date: Sat, 17 May 2025 19:10:26 -0700 Subject: [PATCH 1/7] bump pyo3 version for 3.13t compatibility, update rust code to newer pyo3 style, update tests --- .gitignore | 3 ++ Cargo.toml | 4 +- PKG-INFO | 91 ++++++++++++++++++++++++++++++++++ src/lib.rs | 32 +++++++----- tests/speedtest.py | 7 +-- tests/test_py_rust_stemmers.py | 32 ++++++++++++ 6 files changed, 151 insertions(+), 18 deletions(-) create mode 100644 .gitignore create mode 100644 PKG-INFO create mode 100644 tests/test_py_rust_stemmers.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..22abb57 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +__* +/target +Cargo.lock \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index f8542d9..d178f9f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,12 +7,12 @@ edition = "2021" crate-type = ["cdylib"] [dependencies] -pyo3 = { version = "0.18", features = ["extension-module"] } +pyo3 = { version = "0.25", features = ["extension-module"] } rust-stemmers = "1.2.0" rayon = "1.6" [dev-dependencies] -pyo3 = { version = "0.18", features = ["extension-module"] } +pyo3 = { version = "0.25", features = ["extension-module"] } [profile.release] opt-level = 3 diff --git a/PKG-INFO b/PKG-INFO new file mode 100644 index 0000000..174c0ae --- /dev/null +++ b/PKG-INFO @@ -0,0 +1,91 @@ +Metadata-Version: 2.3 +Name: py_rust_stemmers +Version: 0.1.5 +License-File: LICENSE +Summary: Fast and parallel snowball stemmer +Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM + +# py-rust-stemmers +py-rust-stemmers is a high-performance Python wrapper around the rust-stemmers library, utilizing the Snowball stemming algorithm. This library allows for efficient stemming of words with support for parallel processing, making it a powerful tool for text processing tasks. The library is built using maturin to compile the Rust code into a Python package. 
+ +## Features +* Snowball Stemmer: Uses the well-known Snowball stemming algorithms for efficient word stemming in multiple languages. +* Parallelism Support: Offers parallel processing for batch stemming, providing significant speedup for larger text sequences. +* Rust Performance: Leverages the performance of Rust for fast, reliable text processing. + +## Installation +You can install py-rust-stemmers via pip: + +```pip install py-rust-stemmers``` + +## Usage +Here's a simple example showing how to use py-rust-stemmers to stem words using the Snowball algorithm: + +``` +from py_rust_stemmers import SnowballStemmer + +# Initialize the stemmer for the English language +s = SnowballStemmer('english') + +# Input text +text = """This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so awe and awful don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer.""" +words = text.split() + +# Example usage of the methods +stemmed = s.stem_word(words[0]) +print(f"Stemmed word: {stemmed}") + +# Stem a list of words +stemmed_words = s.stem_words(words) +print(f"Stemmed words: {stemmed_words}") + +# Stem words in parallel +stemmed_words_parallel = s.stem_words_parallel(words) +print(f"Stemmed words (parallel): {stemmed_words_parallel}") +``` +___ +## Methods +```stem_word(word: str) -> str``` + +This method stems a single word. It is best used for small or isolated stemming tasks. 
+ +Example: +``` +s.stem_word("running") # Output: "run" +``` +___ +``` +stem_words(words: List[str]) -> List[str] +``` + +This method stems a list of words sequentially. It is ideal for processing short to moderately sized text sequences. + +Example: + +``` +s.stem_words(["running", "jumps", "easily"]) # Output: ["run", "jump", "easili"] +``` +___ +``` +stem_words_parallel(words: List[str]) -> List[str] +``` + +This method stems a list of words in parallel. It provides significant speedup for longer text sequences (e.g., sequences longer than 512 tokens) by utilizing parallel processing. It is ideal for batch processing of large datasets. + +Example: + +``` +s.stem_words_parallel(["running", "jumps", "easily"]) # Output: ["run", "jump", "easili"] +``` + +## Build from source +* Install maturin +* Go to project dir + +``` +maturin build --release +pip install target/wheels/py_rust_stemmers-.whl +``` + +## License +This project is licensed under the MIT License. See the LICENSE file for more details. 
diff --git a/src/lib.rs b/src/lib.rs index fc84d57..1c576c0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,7 @@ pub struct SnowballStemmer { #[pymethods] impl SnowballStemmer { #[new] - fn new(lang: &str) -> Self { + fn new(lang: &str) -> PyResult<Self> { let algorithm = match lang.to_lowercase().as_str() { "arabic" => Algorithm::Arabic, "danish" => Algorithm::Danish, @@ -33,10 +33,11 @@ impl SnowballStemmer { "swedish" => Algorithm::Swedish, "tamil" => Algorithm::Tamil, "turkish" => Algorithm::Turkish, - _ => panic!("Unsupported language: {}", lang), + // throw exception instead of crashing, preserve prior test behavior + _ => return Err(pyo3::exceptions::PyValueError::new_err(format!("Unsupported language: {}", lang))), }; let stemmer = Stemmer::create(algorithm); - SnowballStemmer { stemmer } + Ok(SnowballStemmer { stemmer }) } #[inline(always)] @@ -45,18 +46,23 @@ impl SnowballStemmer { } #[inline(always)] - pub fn stem_words_parallel(&self, inputs: Vec<&str>) -> Vec<String> { - inputs - .into_par_iter() - .map(|word| self.stemmer.stem(word).into_owned()) - .collect() + pub fn stem_words_parallel(&self, py: Python<'_>, inputs: Vec<String>) -> PyResult<Vec<String>> { + // release GIL + py.allow_threads(|| { + let result = inputs + .par_iter() + .map(|word| self.stemmer.stem(word.as_str()).into_owned()) + .collect(); + Ok(result) + }) } + // refactor to Vec<String> based on the discussion(s) here: https://github.com/PyO3/pyo3/discussions/4830 #[inline(always)] - pub fn stem_words(&self, inputs: Vec<&str>) -> Vec<String> { + pub fn stem_words(&self, inputs: Vec<String>) -> Vec<String> { inputs - .into_iter() - .map(|word| self.stemmer.stem(word)) + .iter() + .map(|word| self.stemmer.stem(word.as_str())) .map(|stemmed| stemmed.into_owned()) .collect() } @@ -64,7 +70,7 @@ impl SnowballStemmer { /// This module is required for the Python interpreter to access the Rust functions. 
#[pymodule] -fn py_rust_stemmers(_py: Python, m: &PyModule) -> PyResult<()> { +fn py_rust_stemmers(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::<SnowballStemmer>()?; Ok(()) -} +} \ No newline at end of file diff --git a/tests/speedtest.py b/tests/speedtest.py index 3867d5d..1c1eeb9 100644 --- a/tests/speedtest.py +++ b/tests/speedtest.py @@ -31,8 +31,9 @@ s = stemmer('english') b = time.perf_counter() -for _ in range(loops): +# only time a fraction of these, 500k takes 10 minutes +for _ in range(loops // 100): for word in words: - stemmed = s.stemWord(word.encode('utf-8')) -print("Time taken snowballstemmer with PyStemmer installed:", time.perf_counter() - b) + stemmed = s.stemWord(word) +print("Time taken snowballstemmer with PyStemmer installed:", (time.perf_counter() - b) * 100) diff --git a/tests/test_py_rust_stemmers.py b/tests/test_py_rust_stemmers.py new file mode 100644 index 0000000..9a62587 --- /dev/null +++ b/tests/test_py_rust_stemmers.py @@ -0,0 +1,32 @@ +import unittest +from py_rust_stemmers import SnowballStemmer + +class TestRustStemmer(unittest.TestCase): + + def test_english_stemming(self): + s = SnowballStemmer('english') + words = ["fruitlessly", "happiness", "computations"] + expected = ["fruitless", "happi", "comput"] + result = [s.stem_word(w) for w in words] + self.assertEqual(result, expected) + + def test_spanish_stemming(self): + s = SnowballStemmer('spanish') + words = ["frutalmente", "felicidad", "computaciones"] + expected = ["frutal", "felic", "comput"] + result = [s.stem_word(w) for w in words] + self.assertEqual(result, expected) + + def test_empty_input(self): + s = SnowballStemmer('english') + expected = [''] + result = [s.stem_word("")] + self.assertEqual(result, expected) + + def test_invalid_language(self): + words = ["fruitlessly", "happiness", "computations"] + with self.assertRaises(ValueError): + s = SnowballStemmer('invalid_lang') + +if __name__ == '__main__': + unittest.main() From 
5c835f02a1f07de65ec9751001dc81c746131830 Mon Sep 17 00:00:00 2001 From: 0xDEADFED5 Date: Sat, 17 May 2025 19:13:27 -0700 Subject: [PATCH 2/7] update workflow for 3.13t --- .github/workflows/release.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c1ec629..80a6536 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,6 +31,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 @@ -69,6 +70,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 @@ -110,6 +112,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 @@ -144,6 +147,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 @@ -190,6 +194,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 @@ -234,6 +239,7 @@ jobs: - '3.11' - '3.12' - '3.13' + - '3.13t' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v5 From 46ed72a039fce5138904c15a1348c1e57e21f23c Mon Sep 17 00:00:00 2001 From: 0xDEADFED5 Date: Sat, 17 May 2025 19:17:15 -0700 Subject: [PATCH 3/7] bump version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d178f9f..798af40 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py_rust_stemmers" -version = "0.1.5" +version = "0.1.6" edition = "2021" [lib] From 368d693049cb7f28ee6f6967e36d5f21a5f75b4a Mon Sep 17 00:00:00 2001 From: 0xDEADFED5 Date: Sat, 17 May 2025 19:18:27 -0700 Subject: [PATCH 4/7] remove PKG-INFO --- .gitignore | 3 +- PKG-INFO | 91 ------------------------------------------------------ 2 files changed, 2 insertions(+), 92 deletions(-) delete mode 100644 PKG-INFO diff --git 
a/.gitignore b/.gitignore index 22abb57..9bd9521 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __* /target -Cargo.lock \ No newline at end of file +Cargo.lock +PKG-INFO \ No newline at end of file diff --git a/PKG-INFO b/PKG-INFO deleted file mode 100644 index 174c0ae..0000000 --- a/PKG-INFO +++ /dev/null @@ -1,91 +0,0 @@ -Metadata-Version: 2.3 -Name: py_rust_stemmers -Version: 0.1.5 -License-File: LICENSE -Summary: Fast and parallel snowball stemmer -Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM - -# py-rust-stemmers -py-rust-stemmers is a high-performance Python wrapper around the rust-stemmers library, utilizing the Snowball stemming algorithm. This library allows for efficient stemming of words with support for parallel processing, making it a powerful tool for text processing tasks. The library is built using maturin to compile the Rust code into a Python package. - -## Features -* Snowball Stemmer: Uses the well-known Snowball stemming algorithms for efficient word stemming in multiple languages. -* Parallelism Support: Offers parallel processing for batch stemming, providing significant speedup for larger text sequences. -* Rust Performance: Leverages the performance of Rust for fast, reliable text processing. - -## Installation -You can install py-rust-stemmers via pip: - -```pip install py-rust-stemmers``` - -## Usage -Here's a simple example showing how to use py-rust-stemmers to stem words using the Snowball algorithm: - -``` -from py_rust_stemmers import SnowballStemmer - -# Initialize the stemmer for the English language -s = SnowballStemmer('english') - -# Input text -text = """This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. 
We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so awe and awful don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer.""" -words = text.split() - -# Example usage of the methods -stemmed = s.stem_word(words[0]) -print(f"Stemmed word: {stemmed}") - -# Stem a list of words -stemmed_words = s.stem_words(words) -print(f"Stemmed words: {stemmed_words}") - -# Stem words in parallel -stemmed_words_parallel = s.stem_words_parallel(words) -print(f"Stemmed words (parallel): {stemmed_words_parallel}") -``` -___ -## Methods -```stem_word(word: str) -> str``` - -This method stems a single word. It is best used for small or isolated stemming tasks. - -Example: -``` -s.stem_word("running") # Output: "run" -``` -___ -``` -stem_words(words: List[str]) -> List[str] -``` - -This method stems a list of words sequentially. It is ideal for processing short to moderately sized text sequences. - -Example: - -``` -s.stem_words(["running", "jumps", "easily"]) # Output: ["run", "jump", "easili"] -``` -___ -``` -stem_words_parallel(words: List[str]) -> List[str] -``` - -This method stems a list of words in parallel. It provides significant speedup for longer text sequences (e.g., sequences longer than 512 tokens) by utilizing parallel processing. It is ideal for batch processing of large datasets. - -Example: - -``` -s.stem_words_parallel(["running", "jumps", "easily"]) # Output: ["run", "jump", "easili"] -``` - -## Build from source -* Install maturin -* Go to project dir - -``` -maturin build --release -pip install target/wheels/py_rust_stemmers-.whl -``` - -## License -This project is licensed under the MIT License. See the LICENSE file for more details. 
From c9644eaeea1ebdb6b71626f0788aa3b164f7394c Mon Sep 17 00:00:00 2001 From: 0xDEADFED5 Date: Sat, 17 May 2025 19:24:40 -0700 Subject: [PATCH 5/7] finish version bump =) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4e7f0d7..ffe1a6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "py_rust_stemmers" -version = "0.1.5" +version = "0.1.6" description = "Fast and parallel snowball stemmer" # Include a long description From e2ef6bbaa188af91381c6a0e257b0e75d108094e Mon Sep 17 00:00:00 2001 From: 0xDEADFED5 Date: Sat, 17 May 2025 19:40:01 -0700 Subject: [PATCH 6/7] remove old test --- tests/test_py_rust_stemers.py | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 tests/test_py_rust_stemers.py diff --git a/tests/test_py_rust_stemers.py b/tests/test_py_rust_stemers.py deleted file mode 100644 index 2d397ab..0000000 --- a/tests/test_py_rust_stemers.py +++ /dev/null @@ -1,30 +0,0 @@ -import unittest -import py_rust_stemers - -class TestRustStemmer(unittest.TestCase): - - def test_english_stemming(self): - words = ["fruitlessly", "happiness", "computations"] - expected = ["fruitless", "happi", "comput"] - result = py_rust_stemers.rust_stem(words, "english") - self.assertEqual(result, expected) - - def test_spanish_stemming(self): - words = ["frutalmente", "felicidad", "computaciones"] - expected = ["frutal", "felic", "comput"] - result = py_rust_stemers.rust_stem(words, "spanish") - self.assertEqual(result, expected) - - def test_empty_input(self): - words = [] - expected = [] - result = py_rust_stemers.rust_stem(words, "english") - self.assertEqual(result, expected) - - def test_invalid_language(self): - words = ["fruitlessly", "happiness", "computations"] - with self.assertRaises(ValueError): - py_rust_stemers.rust_stem(words, "invalid_lang") - -if __name__ == '__main__': - unittest.main() From 
c4561dc70769a16a0c3bb3ad01d2b187e4d45b3f Mon Sep 17 00:00:00 2001 From: 0xDEADFED5 Date: Sun, 18 May 2025 06:38:10 -0700 Subject: [PATCH 7/7] remove my .gitignore --- .gitignore | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 9bd9521..0000000 --- a/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -__* -/target -Cargo.lock -PKG-INFO \ No newline at end of file