diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5bcfa83..d99006a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,9 +25,14 @@ jobs: platform: - runner: ubuntu-22.04 target: x86_64 + - runner: ubuntu-22.04 + target: aarch64 python-version: + - "3.9" + - "3.10" - "3.11" - "3.12" + - "3.13" steps: - uses: actions/checkout@v4 - name: Build wheels @@ -53,8 +58,11 @@ jobs: - runner: macos-14 target: aarch64 python-version: + - "3.9" + - "3.10" - "3.11" - "3.12" + - "3.13" steps: - uses: actions/checkout@v4 - name: Build wheels diff --git a/Cargo.toml b/Cargo.toml index ecd71f0..7185e7e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,10 @@ [package] name = "needletail" version = "0.6.3" -authors = ["Roderick Bovee ", "Vincent Prouillet "] +authors = [ + "Roderick Bovee ", + "Vincent Prouillet ", +] description = "FASTX parsing and k-mer methods" keywords = ["FASTA", "FASTQ", "kmer", "bioinformatics"] categories = ["science", "parsing"] @@ -28,7 +31,7 @@ bytecount = { version = "0.6", features = ["runtime-dispatch-simd"] } bzip2 = { version = "0.4", optional = true } flate2 = { version = "1.0.30", optional = true } memchr = "2.7.2" -pyo3 = { version = "0.21.2", optional = true } +pyo3 = { version = "0.24.1", optional = true } liblzma = { version = "0.3.1", optional = true } zstd = { version = "0.13.2", optional = true } diff --git a/needletail.pyi b/needletail.pyi new file mode 100644 index 0000000..71c35a3 --- /dev/null +++ b/needletail.pyi @@ -0,0 +1,235 @@ +from pathlib import Path +from typing import Iterator, Union + +class FastxReader(Iterator[Record]): + """An iterator that yields sequence records. + + Yields + ------ + Record + A `Record` object representing a sequence record. + + See also + -------- + parse_fastx_file: + A function to parse sequence records from a FASTA/FASTQ file. + parse_fastx_string: + A function to parse sequence records from a FASTA/FASTQ string. 
+ Record: + A class representing a FASTA/FASTQ sequence record. + """ + +class Record: + """ + A record representing a biological sequence. + + Parameters + ---------- + id : str + The identifier of the sequence record. + seq : str + A string representing the sequence. + + Attributes + ---------- + id : str + The identifier of the sequence record. In a FASTA file, this is the + string containing all characters (including whitespaces) after the + leading '>' character. In a FASTQ file, this is the string containing + all characters (including whitespaces) after the leading '@' character. + seq : str + A string representing the sequence. + qual : str, optional + A string representing the quality scores of the sequence. If the object + represents a FASTA record, this attribute will be `None`. + name : str + The name of the sequence record. This is the string before the first + whitespace character in the `id` attribute. + description : str, optional + The description of the sequence record. This is the string after the + first whitespace character in the `id` attribute. If the `id` attribute + contains no whitespace characters, this attribute will be `None`. + + Methods + ------- + is_fasta + Check if the object represents a FASTA record. + is_fastq + Check if the object represents a FASTQ record. + normalize(iupac) + Normalize the sequence stored in the `seq` attribute of the object. + """ + def is_fasta(self) -> bool: + """ + Check if the object represents a FASTA record. + + Returns + ------- + bool + `True` if the record lacks quality information, otherwise `False`. + """ + pass + + def is_fastq(self) -> bool: + """ + Check if the object represents a FASTQ record. + + Returns + ------- + bool + `True` if the record has quality information, otherwise `False`. + """ + pass + + def normalize(self, iupac: bool) -> None: + """ + Normalize the sequence stored in the `seq` attribute of the object. 
+ + See also + -------- + normalize_seq: A function to normalize nucleotide sequence strings. + + Notes + ----- + The `normalize` method is designed for nucleotide sequences only. If + used with protein sequences, it will incorrectly process amino acid + characters as if they were nucleotides. + """ + pass + +def parse_fastx_file(path: Union[str, Path]) -> FastxReader: + """ + Returns an iterator that parses a FASTA/FASTQ file and yields sequence + records. + + Parameters + ---------- + path : str or pathlib.Path + The path to a FASTA/FASTQ file. + + Returns + ------- + FastxReader + A `FastxReader` iterator that yields `Record` objects representing + sequences from the input file. + + Raises + ------ + NeedletailError + If an error occurs while reading and parsing the input file. + + See also + -------- + parse_fastx_string: + A function to parse sequence records from a FASTA/FASTQ string. + FastxReader: + A class with instances that are iterators that yield `Record` objects. + """ + pass + +def parse_fastx_string(fastx_string: str) -> FastxReader: + """ + Returns an iterator that parses a FASTA/FASTQ string and yields sequence + records. + + Parameters + ---------- + fastx_string : str + A string containing FASTA/FASTQ-formatted sequence records. + + Returns + ------- + FastxReader + A `FastxReader` iterator that yields `Record` objects representing + sequences from the input string. + + Raises + ------ + NeedletailError + If an error occurs while parsing the input string. + + See also + -------- + parse_fastx_file: + A function to parse sequence records from a FASTA/FASTQ file. + FastxReader: + A class with instances that are iterators that yield `Record` objects. + """ + pass + +def normalize_seq(seq: str, iupac: bool) -> str: + """ + Normalize the sequence string of nucleotide records by: + + - Converting lowercase characters to uppercase. + - Removing whitespace and newline characters. + - Replacing 'U' with 'T'. + - Replacing '.' and '~' with '-'. 
+ - Replacing characters not in 'ACGTN-' with 'N', unless `iupac` is `True`, + in which case characters representing nucleotide ambiguity are not + replaced. + + Parameters + ---------- + seq : str + A string representing a nucleotide sequence. + iupac : bool, default: False + If `True`, characters representing nucleotide ambiguity ('B', 'D', + 'H', 'V', 'R', 'Y', 'S', 'W', 'K', and 'M', and their lowercase + forms) will not be converted to 'N'. Lowercase characters will still + be converted to uppercase. + + Returns + ------- + str + The normalized sequence string. + + Notes + ----- + The `normalize_seq` function is designed for nucleotide sequences only. If + used with protein sequences, it will incorrectly process amino acid + characters as if they were nucleotides. + """ + pass + +def reverse_complement(seq: str) -> str: + """ + Compute the reverse complement of a nucleotide sequence. + + Parameters + ---------- + seq : str + A string representing a nucleotide sequence. + + Returns + ------- + str + The reverse complement of the input nucleotide sequence. + + Notes + ----- + The `reverse_complement` function is designed for nucleotide sequences + only. If used with protein sequences, it will incorrectly process + amino acid characters as if they were nucleotides. + """ + pass + +def decode_phred(qual: str, base_64: bool) -> tuple[int]: + """ + Decode Phred quality strings to quality scores. + + Parameters + ---------- + qual : str + A string representing Phred-encoded quality strings. + base_64 : bool, default=False + If `True`, return the quality using the Phred+64 encoding, otherwise + the Phred+33 encoding will be used. + + Returns + ------- + tuple of int + A tuple of integers representing quality scores derived from the + probability of a base-calling error using a logarithmic transformation. 
+ """ + pass diff --git a/pyproject.toml b/pyproject.toml index 3394db4..e61223d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,14 @@ [build-system] -requires = ["maturin>=1.7,<2.0"] +requires = ["maturin>=1.8,<2.0"] build-backend = "maturin" [project] name = "needletail" +requires-python = ">=3.8" dynamic = ["version"] classifier = [ "Intended Audience :: Science/Research", - "Programming Language :: Python :: 3", + "Programming Language :: Rust", "License :: OSI Approved :: MIT License", "Topic :: Scientific/Engineering :: Bio-Informatics", ] diff --git a/src/python.rs b/src/python.rs index cbf1914..0d1642b 100644 --- a/src/python.rs +++ b/src/python.rs @@ -1,6 +1,11 @@ //! Python bindings for needletail // TODO: +// - The `normalize` method of the `Record` class should return a new `Record` +// object with the normalized sequence. +// - Add a `reverse_complement` method to the `Record` class that returns a new +// `Record` object with the reverse complement of the sequence. +// - Turn `is_fasta` and `is_fastq` into properties. // - Make the return values of `__repr__` and `__str__` show up as raw strings. // - Make `normalize_seq`, `reverse_complement`, and `decode_phred` functions // able to handle `Record` objects as input. @@ -17,6 +22,7 @@ use pyo3::{create_exception, wrap_pyfunction}; use std::hash::{DefaultHasher, Hash, Hasher}; use std::io::Cursor; use std::path::PathBuf; +use std::sync::Mutex; create_exception!(needletail, NeedletailError, pyo3::exceptions::PyException); @@ -53,8 +59,9 @@ fn get_seq_snippet(seq: &str, max_len: usize) -> String { /// Record: /// A class representing a FASTA/FASTQ sequence record. 
#[pyclass] +#[pyo3(name = "FastxReader")] pub struct PyFastxReader { - reader: Box, + reader: Mutex>, } #[pymethods] @@ -67,8 +74,8 @@ impl PyFastxReader { slf } - fn __next__(mut slf: PyRefMut) -> PyResult> { - if let Some(rec) = slf.reader.next() { + fn __next__(slf: PyRefMut) -> PyResult> { + if let Some(rec) = slf.reader.lock().unwrap().next() { let record = py_try!(rec); Ok(Some(Record::from_sequence_record(&record))) } else { @@ -179,6 +186,12 @@ impl Record { /// See also /// -------- /// normalize_seq: A function to normalize nucleotide sequence strings. + /// + /// Notes + /// ----- + /// The `normalize` method is designed for nucleotide sequences only. If + /// used with protein sequences, it will incorrectly process amino acid + /// characters as if they were nucleotides. #[pyo3(signature = (iupac=false))] pub fn normalize(&mut self, iupac: bool) -> PyResult<()> { if let Some(s) = normalize(self.seq.as_bytes(), iupac) { @@ -251,7 +264,8 @@ impl Record { } } -/// An iterator that reads sequence records from a FASTA/FASTQ file. +/// Returns an iterator that parses a FASTA/FASTQ file and yields sequence +/// records. /// /// Parameters /// ---------- @@ -260,8 +274,8 @@ impl Record { /// /// Returns /// ------- -/// PyFastxReader -/// A `PyFastxReader` iterator that yields `Record` objects representing +/// FastxReader +/// A `FastxReader` iterator that yields `Record` objects representing /// sequences from the input file. /// /// Raises @@ -273,16 +287,19 @@ impl Record { /// -------- /// parse_fastx_string: /// A function to parse sequence records from a FASTA/FASTQ string. -/// PyFastxReader: +/// FastxReader: /// A class with instances that are iterators that yield `Record` objects. 
#[pyfunction] #[pyo3(name = "parse_fastx_file")] fn py_parse_fastx_file(path: PathBuf) -> PyResult { let reader = py_try!(parse_fastx_file(path)); - Ok(PyFastxReader { reader }) + Ok(PyFastxReader { + reader: reader.into(), + }) } -/// Parse sequence records from a FASTA/FASTQ string. +/// Returns an iterator that parses a FASTA/FASTQ string and yields sequence +/// records. /// /// Parameters /// ---------- @@ -291,8 +308,8 @@ fn py_parse_fastx_file(path: PathBuf) -> PyResult { /// /// Returns /// ------- -/// PyFastxReader -/// A `PyFastxReader` iterator that yields `Record` objects representing +/// FastxReader +/// A `FastxReader` iterator that yields `Record` objects representing /// sequences from the input string. /// /// Raises @@ -304,12 +321,14 @@ fn py_parse_fastx_file(path: PathBuf) -> PyResult { /// -------- /// parse_fastx_file: /// A function to parse sequence records from a FASTA/FASTQ file. -/// PyFastxReader: +/// FastxReader: /// A class with instances that are iterators that yield `Record` objects. #[pyfunction] -fn parse_fastx_string(content: &str) -> PyResult { - let reader = py_try!(parse_fastx_reader(Cursor::new(content.to_owned()))); - Ok(PyFastxReader { reader }) +fn parse_fastx_string(fastx_string: &str) -> PyResult { + let reader = py_try!(parse_fastx_reader(Cursor::new(fastx_string.to_owned()))); + Ok(PyFastxReader { + reader: reader.into(), + }) } /// Normalize the sequence string of nucleotide records by: @@ -339,7 +358,7 @@ fn parse_fastx_string(content: &str) -> PyResult { /// /// Notes /// ----- -/// The `normalize` method is designed for nucleotide sequences only. If +/// The `normalize_seq` function is designed for nucleotide sequences only. If /// used with protein sequences, it will incorrectly process amino acid /// characters as if they were nucleotides. 
#[pyfunction] @@ -363,6 +382,12 @@ pub fn normalize_seq(seq: &str, iupac: bool) -> PyResult { /// -------- /// str /// The reverse complement of the input nucleotide sequence. +/// +/// Notes +/// ----- +/// The `reverse_complement` function is designed for nucleotide sequences +/// only. If used with protein sequences, it will incorrectly process +/// amino acid characters as if they were nucleotides. #[pyfunction] pub fn reverse_complement(seq: &str) -> PyResult { let comp: Vec = seq @@ -374,12 +399,12 @@ pub fn reverse_complement(seq: &str) -> PyResult { Ok(String::from_utf8_lossy(&comp).to_string()) } -/// Decode Phred quality data to quality scores. +/// Decode Phred quality strings to quality scores. /// /// Parameters: /// ----------- /// phred : str -/// A string representing Phred-encoded quality data. +/// A string representing Phred-encoded quality strings. /// base_64 : bool, default=False /// If `True`, return the quality using the Phred+64 encoding, otherwise /// the Phred+33 encoding will be used. @@ -397,9 +422,9 @@ pub fn py_decode_phred(qual: &str, base_64: bool, py: Python<'_>) -> PyResult) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(normalize_seq))?; m.add_wrapped(wrap_pyfunction!(reverse_complement))?; m.add_wrapped(wrap_pyfunction!(py_decode_phred))?; - m.add("NeedletailError", py.get_type_bound::())?; + m.add("NeedletailError", py.get_type::())?; Ok(()) }