# Patch summary (text before the first "diff --git" header is skipped by patch tools).
#
# ms2pip/_utils/dlib.py
#   - Imports Connection from sqlalchemy.engine and uses it as the return
#     annotation of open_sqlite().
#   - Drops the `metadata.bind = engine` assignment in open_sqlite().
#   - Re-quotes string literals and re-wraps the Index(...) call; no schema change.
# ms2pip/spectrum_output.py
#   - DLIB.write() now passes `connection.engine` to `dlib.metadata.create_all()`.
#   - `select([col])` becomes `select(col)`; `Table.select()` becomes `select(Table)`.
#   - Filename annotations widen from Union[str, Path] to Union[str, PathLike].
# pyproject.toml
#   - sqlalchemy bound changes from ">=1.3,<2" to ">=1.4,<3".
#   - NOTE(review): the same hunk also deletes "tomlkit>=0.5,<1" -- confirm that
#     nothing still imports tomlkit before merging.
# tests/test_spectrum_output.py
#   - Adds four DLIB tests that write a file and read it back through open_sqlite().
#   - NOTE(review): the tests import `select`, `func` and `dlib` inside test
#     bodies; consider hoisting them to the module import block.
#   - NOTE(review): open_sqlite() returns engine.connect() and the tests only
#     close the connection; the engine is never disposed while the temporary
#     directory is being removed -- verify this is harmless on all platforms.
# NOTE(review): `from os import PathLike` is inserted after
#   `from pathlib import Path`, i.e. out of alphabetical order.
#
# NOTE(review): this patch was recovered from a copy whose line breaks and
#   indentation had been collapsed. Added blank lines and all hunk line counts
#   were re-derived and match the @@ headers, but leading whitespace on context
#   and removed lines is inferred from Python structure. Run
#   `git apply --check` (or `git apply --ignore-whitespace`) before relying on it.
diff --git a/ms2pip/_utils/dlib.py b/ms2pip/_utils/dlib.py
index f33ee35..50714da 100644
--- a/ms2pip/_utils/dlib.py
+++ b/ms2pip/_utils/dlib.py
@@ -18,16 +18,18 @@
     TypeDecorator,
 )
 from sqlalchemy.dialects.sqlite import BLOB
+from sqlalchemy.engine import Connection
 
 DLIB_VERSION = "0.1.14"
 
 
 class CompressedArray(TypeDecorator):
-    """ Sqlite-like does not support arrays.
-        Let's use a custom type decorator.
+    """Sqlite-like does not support arrays.
+    Let's use a custom type decorator.
 
-        See http://docs.sqlalchemy.org/en/latest/core/types.html#sqlalchemy.types.TypeDecorator
+    See http://docs.sqlalchemy.org/en/latest/core/types.html#sqlalchemy.types.TypeDecorator
     """
+
     impl = BLOB
 
     def __init__(self, dtype, *args, **kwargs):
@@ -49,51 +51,55 @@ def copy(self):
 
 metadata = MetaData()
 
-big_float = numpy.dtype('>f4')
-big_double = numpy.dtype('>f8')
+big_float = numpy.dtype(">f4")
+big_double = numpy.dtype(">f8")
 
 Entry = Table(
-    'entries',
+    "entries",
     metadata,
-    Column('PrecursorMz', Float, nullable=False, index=True),
-    Column('PrecursorCharge', Integer, nullable=False),
-    Column('PeptideModSeq', String, nullable=False),
-    Column('PeptideSeq', String, nullable=False, index=True),
-    Column('Copies', Integer, nullable=False),
-    Column('RTInSeconds', Float, nullable=False),
-    Column('Score', Float, nullable=False),
-    Column('MassEncodedLength', Integer, nullable=False),
-    Column('MassArray', CompressedArray(big_double), nullable=False),
-    Column('IntensityEncodedLength', Integer, nullable=False),
-    Column('IntensityArray', CompressedArray(big_float), nullable=False),
-    Column('CorrelationEncodedLength', Integer, nullable=True),
-    Column('CorrelationArray', CompressedArray(big_float), nullable=True),
-    Column('RTInSecondsStart', Float, nullable=True),
-    Column('RTInSecondsStop', Float, nullable=True),
-    Column('MedianChromatogramEncodedLength', Integer, nullable=True),
-    Column('MedianChromatogramArray', CompressedArray(big_float), nullable=True),
-    Column('SourceFile', String, nullable=False),
+    Column("PrecursorMz", Float, nullable=False, index=True),
+    Column("PrecursorCharge", Integer, nullable=False),
+    Column("PeptideModSeq", String, nullable=False),
+    Column("PeptideSeq", String, nullable=False, index=True),
+    Column("Copies", Integer, nullable=False),
+    Column("RTInSeconds", Float, nullable=False),
+    Column("Score", Float, nullable=False),
+    Column("MassEncodedLength", Integer, nullable=False),
+    Column("MassArray", CompressedArray(big_double), nullable=False),
+    Column("IntensityEncodedLength", Integer, nullable=False),
+    Column("IntensityArray", CompressedArray(big_float), nullable=False),
+    Column("CorrelationEncodedLength", Integer, nullable=True),
+    Column("CorrelationArray", CompressedArray(big_float), nullable=True),
+    Column("RTInSecondsStart", Float, nullable=True),
+    Column("RTInSecondsStop", Float, nullable=True),
+    Column("MedianChromatogramEncodedLength", Integer, nullable=True),
+    Column("MedianChromatogramArray", CompressedArray(big_float), nullable=True),
+    Column("SourceFile", String, nullable=False),
 )
 
-Index('ix_entries_PeptideModSeq_PrecursorCharge_SourceFile', Entry.c.PeptideModSeq, Entry.c.PrecursorCharge, Entry.c.SourceFile)
+Index(
+    "ix_entries_PeptideModSeq_PrecursorCharge_SourceFile",
+    Entry.c.PeptideModSeq,
+    Entry.c.PrecursorCharge,
+    Entry.c.SourceFile,
+)
 
 PeptideToProtein = Table(
-    'peptidetoprotein',
+    "peptidetoprotein",
     metadata,
-    Column('PeptideSeq', String, nullable=False, index=True),
-    Column('isDecoy', Boolean, nullable=True),
-    Column('ProteinAccession', String, nullable=False, index=True),
+    Column("PeptideSeq", String, nullable=False, index=True),
+    Column("isDecoy", Boolean, nullable=True),
+    Column("ProteinAccession", String, nullable=False, index=True),
 )
 
 Metadata = Table(
-    'metadata',
+    "metadata",
     metadata,
-    Column('Key', String, nullable=False, index=True),
-    Column('Value', String, nullable=False),
+    Column("Key", String, nullable=False, index=True),
+    Column("Value", String, nullable=False),
 )
 
 
-def open_sqlite(filename: Union[str, Path]) -> sqlalchemy.engine.Connection:
+def open_sqlite(filename: Union[str, Path]) -> Connection:
     engine = sqlalchemy.create_engine(f"sqlite:///{filename}")
-    metadata.bind = engine
     return engine.connect()
diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py
index fc17461..c769024 100644
--- a/ms2pip/spectrum_output.py
+++ b/ms2pip/spectrum_output.py
@@ -47,13 +47,15 @@
 from collections import defaultdict
 from io import StringIO
 from pathlib import Path
+from os import PathLike
 from time import localtime, strftime
 from typing import Any, Dict, Generator, List, Optional, Union
 
 import numpy as np
 from psm_utils import PSM, Peptidoform
 from pyteomics import proforma
-from sqlalchemy import engine, select
+from sqlalchemy import select
+from sqlalchemy.engine import Connection
 
 from ms2pip._utils import dlib
 from ms2pip.result import ProcessingResult
@@ -62,7 +64,7 @@
 
 
 def write_spectra(
-    filename: Union[str, Path],
+    filename: Union[str, PathLike],
     processing_results: List[ProcessingResult],
     file_format: str = "tsv",
     write_mode: str = "w",
@@ -92,7 +94,7 @@ class _Writer(ABC):
 
     suffix = ""
 
-    def __init__(self, filename: Union[str, Path], write_mode: str = "w"):
+    def __init__(self, filename: Union[str, PathLike], write_mode: str = "w"):
         self.filename = Path(filename).with_suffix(self.suffix)
         self.write_mode = write_mode
 
@@ -466,7 +468,7 @@ class Bibliospec(_Writer):
         "ion-mobility",
     ]
 
-    def __init__(self, filename: Union[str, Path], write_mode: str = "w"):
+    def __init__(self, filename: Union[str, PathLike], write_mode: str = "w"):
         super().__init__(filename, write_mode)
         self.ssl_file = self.filename.with_suffix(self.ssl_suffix)
         self.ms2_file = self.filename.with_suffix(self.ms2_suffix)
@@ -618,7 +620,7 @@ def _format_modified_sequence(peptidoform: Peptidoform) -> str:
         )
 
     @staticmethod
-    def _get_last_ssl_scan_number(ssl_file: Union[str, Path, StringIO]):
+    def _get_last_ssl_scan_number(ssl_file: Union[str, PathLike, StringIO]):
         """Read scan number of last line in a Bibliospec SSL file."""
         if isinstance(ssl_file, StringIO):
             ssl_file.seek(0)
@@ -653,7 +655,7 @@ def open(self):
     def write(self, processing_results: List[ProcessingResult]):
         """Write MS2PIP predictions to a DLIB SQLite file."""
         connection = self._file_object
-        dlib.metadata.create_all()
+        dlib.metadata.create_all(connection.engine)
         self._write_metadata(connection)
         self._write_entries(processing_results, connection, self.filename)
         self._write_peptide_to_protein(processing_results, connection)
@@ -682,11 +684,11 @@ def _format_modified_sequence(peptidoform: Peptidoform) -> str:
         )
 
     @staticmethod
-    def _write_metadata(connection: engine.Connection):
+    def _write_metadata(connection: Connection):
         """Write metadata to DLIB SQLite file."""
         with connection.begin():
             version = connection.execute(
-                select([dlib.Metadata.c.Value]).where(dlib.Metadata.c.Key == "version")
+                select(dlib.Metadata.c.Value).where(dlib.Metadata.c.Key == "version")
             ).scalar()
             if version is None:
                 connection.execute(
@@ -699,8 +701,8 @@ def _write_metadata(connection: engine.Connection):
     @staticmethod
     def _write_entries(
         processing_results: List[ProcessingResult],
-        connection: engine.Connection,
-        output_filename: str,
+        connection: Connection,
+        output_filename: Union[str, PathLike],
     ):
         """Write spectra to DLIB SQLite file."""
         with connection.begin():
@@ -730,7 +732,7 @@ def _write_entries(
                 )
 
     @staticmethod
-    def _write_peptide_to_protein(results: List[ProcessingResult], connection: engine.Connection):
+    def _write_peptide_to_protein(results: List[ProcessingResult], connection: Connection):
         """Write peptide-to-protein mappings to DLIB SQLite file."""
         peptide_to_proteins = {
             (result.psm.peptidoform.sequence, protein)
@@ -743,7 +745,7 @@ def _write_peptide_to_protein(results: List[ProcessingResult], connection: engin
             sql_peptide_to_proteins = set()
             proteins = {protein for _, protein in peptide_to_proteins}
             for peptide_to_protein in connection.execute(
-                dlib.PeptideToProtein.select().where(
+                select(dlib.PeptideToProtein).where(
                     dlib.PeptideToProtein.c.ProteinAccession.in_(proteins)
                 )
             ):
diff --git a/pyproject.toml b/pyproject.toml
index 87e772b..83639c2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,8 +37,7 @@ dependencies = [
     "pandas>=1,<3",
     "pyarrow",
     "pyteomics>=3.5,<5",
-    "tomlkit>=0.5,<1",
-    "sqlalchemy>=1.3,<2",
+    "sqlalchemy>=1.4,<3",
     "click>=7,<9",
     "xgboost>=1.3,<3",
     "lxml>=4",
diff --git a/tests/test_spectrum_output.py b/tests/test_spectrum_output.py
index 9ddf8d6..7d89fc5 100644
--- a/tests/test_spectrum_output.py
+++ b/tests/test_spectrum_output.py
@@ -1,6 +1,12 @@
-from psm_utils import Peptidoform
+import tempfile
+from pathlib import Path
 
-from ms2pip.spectrum_output import MSP, Bibliospec, DLIB
+import numpy as np
+import pytest
+from psm_utils import PSM, Peptidoform
+
+from ms2pip.result import ProcessingResult
+from ms2pip.spectrum_output import DLIB, MSP, Bibliospec
 
 
 class TestMSP:
@@ -45,3 +51,226 @@ def test__format_modified_sequence(self):
 
         for test_in, expected_out in test_cases:
             assert DLIB._format_modified_sequence(Peptidoform(test_in)) == expected_out
+
+    def test_dlib_database_creation(self):
+        """Test that DLIB file creation works with SQLAlchemy (integration test)."""
+        # Create test data
+        pep = Peptidoform("ACDE/2")
+        psm = PSM(
+            peptidoform=pep,
+            spectrum_id=1,
+            retention_time=100.0,
+            protein_list=["PROT1", "PROT2"],
+        )
+        result = ProcessingResult(
+            psm_index=0,
+            psm=psm,
+            theoretical_mz={
+                "b": np.array([72.04435, 175.05354, 290.08047], dtype=np.float32),
+                "y": np.array([148.0604, 263.0873, 366.0965], dtype=np.float32),
+            },
+            predicted_intensity={
+                "b": np.array([0.1, 0.5, 0.3], dtype=np.float32),
+                "y": np.array([0.8, 0.6, 0.2], dtype=np.float32),
+            },
+            observed_intensity=None,
+            correlation=None,
+            feature_vectors=None,
+        )
+
+        # Write DLIB file
+        with tempfile.TemporaryDirectory() as tmpdir:
+            dlib_file = Path(tmpdir) / "test.dlib"
+            with DLIB(dlib_file) as writer:
+                writer.write([result])
+
+            # Verify file was created
+            assert dlib_file.exists()
+
+            # Verify database structure and content using SQLAlchemy
+            from ms2pip._utils import dlib as dlib_module
+
+            connection = dlib_module.open_sqlite(dlib_file)
+            try:
+                # Test that metadata table exists and has version
+                from sqlalchemy import select
+
+                version = connection.execute(
+                    select(dlib_module.Metadata.c.Value).where(
+                        dlib_module.Metadata.c.Key == "version"
+                    )
+                ).scalar()
+                assert version == dlib_module.DLIB_VERSION
+
+                # Test that Entry table has data (select specific columns to avoid nullable CompressedArray)
+                from sqlalchemy import func
+
+                entry_count = connection.execute(
+                    select(func.count()).select_from(dlib_module.Entry)
+                ).scalar()
+                assert entry_count == 1
+
+                # Select specific non-nullable columns
+                entry = connection.execute(
+                    select(
+                        dlib_module.Entry.c.PeptideSeq,
+                        dlib_module.Entry.c.PrecursorCharge,
+                        dlib_module.Entry.c.RTInSeconds,
+                        dlib_module.Entry.c.MassArray,
+                        dlib_module.Entry.c.IntensityArray,
+                    )
+                ).fetchone()
+                assert entry.PeptideSeq == "ACDE"
+                assert entry.PrecursorCharge == 2
+                assert entry.RTInSeconds == 100.0
+                assert len(entry.MassArray) == 6  # 3 b-ions + 3 y-ions
+                assert len(entry.IntensityArray) == 6
+
+                # Test that PeptideToProtein table has data
+                peptide_to_proteins = connection.execute(
+                    select(dlib_module.PeptideToProtein)
+                ).fetchall()
+                assert len(peptide_to_proteins) == 2
+                proteins = {p.ProteinAccession for p in peptide_to_proteins}
+                assert proteins == {"PROT1", "PROT2"}
+                assert all(p.PeptideSeq == "ACDE" for p in peptide_to_proteins)
+            finally:
+                connection.close()
+
+    def test_dlib_multiple_results(self):
+        """Test writing multiple ProcessingResults to DLIB file."""
+        # Create multiple test results
+        results = []
+        for i, seq in enumerate(["ACDE/2", "PEPTIDE/2", "TESTK/2"]):
+            pep = Peptidoform(seq)
+            psm = PSM(
+                peptidoform=pep,
+                spectrum_id=i,
+                retention_time=100.0 + i * 10,
+                protein_list=[f"PROT{i}"],
+            )
+            result = ProcessingResult(
+                psm_index=i,
+                psm=psm,
+                theoretical_mz={
+                    "b": np.array([72.04435, 175.05354], dtype=np.float32),
+                    "y": np.array([148.0604, 263.0873], dtype=np.float32),
+                },
+                predicted_intensity={
+                    "b": np.array([0.1, 0.5], dtype=np.float32),
+                    "y": np.array([0.8, 0.6], dtype=np.float32),
+                },
+                observed_intensity=None,
+                correlation=None,
+                feature_vectors=None,
+            )
+            results.append(result)
+
+        # Write DLIB file
+        with tempfile.TemporaryDirectory() as tmpdir:
+            dlib_file = Path(tmpdir) / "test_multiple.dlib"
+            with DLIB(dlib_file) as writer:
+                writer.write(results)
+
+            # Verify all entries were written
+            from sqlalchemy import select
+
+            from ms2pip._utils import dlib as dlib_module
+
+            connection = dlib_module.open_sqlite(dlib_file)
+            try:
+                # Select specific columns to avoid nullable CompressedArray
+                entries = connection.execute(
+                    select(
+                        dlib_module.Entry.c.PeptideSeq,
+                        dlib_module.Entry.c.RTInSeconds,
+                    )
+                ).fetchall()
+                assert len(entries) == 3
+
+                peptides = {e.PeptideSeq for e in entries}
+                assert peptides == {"ACDE", "PEPTIDE", "TESTK"}
+
+                # Verify retention times
+                rt_values = {e.RTInSeconds for e in entries}
+                assert rt_values == {100.0, 110.0, 120.0}
+            finally:
+                connection.close()
+
+    def test_dlib_sqlalchemy_select_syntax(self):
+        """Test that SQLAlchemy v2 select() syntax works correctly."""
+        # This test specifically verifies the SQLAlchemy v2 compatibility changes
+        pep = Peptidoform("ACDE/2")
+        psm = PSM(
+            peptidoform=pep,
+            spectrum_id=1,
+            retention_time=100.0,
+            protein_list=["PROT1"],
+        )
+        result = ProcessingResult(
+            psm_index=0,
+            psm=psm,
+            theoretical_mz={
+                "b": np.array([72.04435], dtype=np.float32),
+                "y": np.array([148.0604], dtype=np.float32),
+            },
+            predicted_intensity={
+                "b": np.array([0.5], dtype=np.float32),
+                "y": np.array([0.8], dtype=np.float32),
+            },
+            observed_intensity=None,
+            correlation=None,
+            feature_vectors=None,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            dlib_file = Path(tmpdir) / "test_sqlalchemy.dlib"
+            with DLIB(dlib_file) as writer:
+                writer.write([result])
+
+            # Test the specific SQLAlchemy operations that were modified
+            from sqlalchemy import select
+
+            from ms2pip._utils import dlib as dlib_module
+
+            connection = dlib_module.open_sqlite(dlib_file)
+            try:
+                # Test select(Table) syntax (changed from Table.select())
+                peptide_to_protein_results = connection.execute(
+                    select(dlib_module.PeptideToProtein).where(
+                        dlib_module.PeptideToProtein.c.ProteinAccession == "PROT1"
+                    )
+                ).fetchall()
+                assert len(peptide_to_protein_results) == 1
+                assert peptide_to_protein_results[0].PeptideSeq == "ACDE"
+
+                # Test select(column) syntax (changed from select([column]))
+                version = connection.execute(
+                    select(dlib_module.Metadata.c.Value).where(
+                        dlib_module.Metadata.c.Key == "version"
+                    )
+                ).scalar()
+                assert version is not None
+                assert version == dlib_module.DLIB_VERSION
+            finally:
+                connection.close()
+
+    def test_dlib_missing_retention_time(self):
+        """Test that DLIB writing raises error when retention time is missing."""
+        pep = Peptidoform("ACDE/2")
+        psm = PSM(peptidoform=pep, spectrum_id=1)  # No retention_time
+        result = ProcessingResult(
+            psm_index=0,
+            psm=psm,
+            theoretical_mz={"b": np.array([72.04435], dtype=np.float32)},
+            predicted_intensity={"b": np.array([0.5], dtype=np.float32)},
+            observed_intensity=None,
+            correlation=None,
+            feature_vectors=None,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            dlib_file = Path(tmpdir) / "test_no_rt.dlib"
+            with pytest.raises(ValueError, match="Retention time required"):
+                with DLIB(dlib_file) as writer:
+                    writer.write([result])