Source code for pyXLMS.parser._parser_xldbse_xlinkx

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import sqlite3
import warnings
import pandas as pd
from tqdm import tqdm
from os.path import splitext

from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._crosslink import create_crosslink
from ..data._csm import create_csm
from ..data._parser_result import create_parser_result
from ..constants import MODIFICATIONS
from ._util import format_sequence
from ._util import get_bool_from_value
from ._util import __serialize_pandas_series
from ._util import __parse_int, __parse_float

from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Tuple
from typing import List

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


def __read_xlinkx_pdresult(filename: str, drop: bool = False) -> List[pd.DataFrame]:
    r"""Read an XlinkX pdResult file and convert it to standard XlinkX result tables.

    Reads an XlinkX pdResult file and converts it to standard XlinkX CSM and crosslink
    tables as pandas DataFrame objects.

    Parameters
    ----------
    filename : str
        Filename/path of the pdResult file.
    drop : bool, default = False
        Whether or not the not-needed columns should be dropped.
        Defaults to not dropping not-needed columns.

    Returns
    -------
    list of pd.DataFrame
        The converted CSM and crosslink tables in XlinkX format.

    Notes
    -----
    This function should not be called directly, it is called from ``read_xlinkx()``.
    """
    conn = sqlite3.connect(filename)
    csms = pd.read_sql_query("SELECT * FROM CSMs", conn)
    dcsms = pd.read_sql_query("SELECT * FROM DecoyCSMs", conn)
    xls = pd.read_sql_query("SELECT * FROM Crosslinks", conn)
    conn.close()
    column_mapping_csms = {
        "SequenceA": "Sequence A",
        "SequenceB": "Sequence B",
        "ModificationsA": "Modifications A",
        "ModificationsB": "Modifications B",
        "CrosslinkerPositionA": "Crosslinker Position A",
        "CrosslinkerPositionB": "Crosslinker Position B",
        "ProteinAccessionA": "Protein Accession A",
        "ProteinAccessionB": "Protein Accession B",
        "LeadingProteinPositionA": "Leading Protein Position A",
        "LeadingProteinPositionB": "Leading Protein Position B",
        "IsDecoy": "Is Decoy",
        "XlinkXScore": "XlinkX Score",
        "SpectrumFileName": "Spectrum File",
        "FirstScan": "First Scan",
        "ChargeState": "Charge",
        "RetentionTime": "RT [min]",
    }
    column_mapping_xls = {
        "SequenceA": "Sequence A",
        "SequenceB": "Sequence B",
        "ModificationsA": "Modifications A",
        "ModificationsB": "Modifications B",
        "Crosslinker": "Crosslinker",
        "PositionA": "Position A",
        "PositionB": "Position B",
        "AccessionA": "Accession A",
        "AccessionB": "Accession B",
        "IsDecoy": "Is Decoy",
        "MaxXlinkXScore": "Max. XlinkX Score",
    }
    csms.rename(columns=column_mapping_csms, inplace=True)
    if "Is Decoy" not in csms.columns:
        csms["Is Decoy"] = [False for i in range(csms.shape[0])]
    dcsms.rename(columns=column_mapping_csms, inplace=True)
    if "Is Decoy" not in dcsms.columns:
        dcsms["Is Decoy"] = [True for i in range(dcsms.shape[0])]
    xls.rename(columns=column_mapping_xls, inplace=True)
    if drop:
        csms.drop(
            columns=list(
                set(csms.columns.values.tolist())
                - set(list(column_mapping_csms.values()))
            ),
            inplace=True,
        )
        dcsms.drop(
            columns=list(
                set(dcsms.columns.values.tolist())
                - set(list(column_mapping_csms.values()))
            ),
            inplace=True,
        )
        xls.drop(
            columns=list(
                set(xls.columns.values.tolist())
                - set(list(column_mapping_xls.values()))
            ),
            inplace=True,
        )
    target_xls = xls[xls["Is Decoy"] == 0]
    if not isinstance(target_xls, pd.DataFrame):
        raise RuntimeError(
            "Selection of target crosslinks did not return a valid pandas DataFrame!"
        )
    return [csms, target_xls]



[docs]
def read_xlinkx(
    files: str | List[str] | BinaryIO,
    decoy: Optional[bool] = None,
    parse_modifications: bool = True,
    modifications: Dict[str, float] = MODIFICATIONS,
    format: Literal["auto", "csv", "txt", "tsv", "xlsx", "pdresult"] = "auto",
    sep: str = "\t",
    decimal: str = ".",
    ignore_errors: bool = False,
    verbose: Literal[0, 1, 2] = 1,
    **kwargs,
) -> ParserResult:
    r"""Read an XlinkX result file.

    Reads an XlinkX crosslink-spectrum-matches result file or crosslink result file in ``.csv`` or ``.xlsx`` format,
    or both from a ``.pdResult`` file from Proteome Discover, and returns a ``parser_result``.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the XlinkX result file(s) or a file-like object/stream.
    decoy : bool, or None
        Default decoy value to use if no decoy value is found. Only used if the "Is Decoy" column is not found
        in the supplied data.
    parse_modifications : bool, default = True
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications: dict of str, float, default = ``constants.MODIFICATIONS``
        Mapping of modification names to modification masses.
    format : "auto", "csv", "tsv", "txt", "xlsx", or "pdresult", default = "auto"
        The format of the result file. ``"auto"`` is only available if the name/path to the XlinkX result file is given.
    sep : str, default = "\t"
        Seperator used in the ``.csv`` or ``.tsv`` file. Parameter is ignored if the file is in ``.xlsx`` or ``.pdResult`` format.
    decimal : str, default = "."
        Character to recognize as decimal point. Parameter is ignored if the file is in ``.xlsx`` or ``.pdResult`` format.
    ignore_errors : bool, default = False
        If missing crosslink positions should raise an error or not. Setting this to True will suppress the ``RuntimeError``
        for the crosslink position not being able to be parsed for at least one of the crosslinks. For these cases the crosslink
        position will be set to 100 000.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are printed to stdout.
        - 2: Warnings are treated as errors.
    **kwargs
        Any additional parameters will be passed to ``pandas.read*``.

    Returns
    -------
    ParserResult
        The ``parser_result`` object containing all parsed information.

    Raises
    ------
    ValueError
        If the input format is not supported or cannot be inferred.
    TypeError
        If parameter verbose was not set correctly.
    TypeError
        If the pdResult file is provided in the wrong format.
    RuntimeError
        If the crosslink position could not be parsed for at least one of the crosslinks.
    RuntimeError
        If the file(s) could not be read or if the file(s) contain no crosslinks or crosslink-spectrum-matches.
    KeyError
        If one of the found post-translational-modifications could not be found/mapped.

    Notes
    -----
    Uses ``XlinkX Score`` as the score of the crosslink-spectrum-match for crosslink-spectrum-matches,
    alpha and beta peptide scores are ``None`` for crosslink-spectrum-matches. Uses ``Max. XlinkX Score``
    as the score of the crosslink for crosslinks.

    Warnings
    --------
    XlinkX does not report if the individual peptides in a crosslink are from the target or decoy database.
    The parser assumes that both peptides from a target crosslink are from the target database, and vice versa,
    that both peptides are from the decoy database if it is a decoy crosslink. This leads to only TT and DD matches,
    which needs to be considered for FDR estimation. This applies to both crosslinks **and** crosslink-spectrum-matches.

    Examples
    --------
    >>> from pyXLMS.parser import read_xlinkx
    >>> csms_from_xlsx = read_xlinkx(
    ...     "data/xlinkx/XLpeplib_Beveridge_Lumos_DSSO_MS3_CSMs.xlsx"
    ... )

    >>> from pyXLMS.parser import read_xlinkx
    >>> crosslinks_from_xlsx = read_xlinkx(
    ...     "data/xlinkx/XLpeplib_Beveridge_Lumos_DSSO_MS3_Crosslinks.xlsx"
    ... )

    >>> from pyXLMS.parser import read_xlinkx
    >>> csms_from_tsv = read_xlinkx(
    ...     "data/xlinkx/XLpeplib_Beveridge_Lumos_DSSO_MS3_CSMs.txt"
    ... )

    >>> from pyXLMS.parser import read_xlinkx
    >>> crosslinks_from_tsv = read_xlinkx(
    ...     "data/xlinkx/XLpeplib_Beveridge_Lumos_DSSO_MS3_Crosslinks.txt"
    ... )

    >>> from pyXLMS.parser import read_xlinkx
    >>> csms_and_crosslinks_from_pdresult = read_xlinkx(
    ...     "data/xlinkx/XLpeplib_Beveridge_Lumos_DSSO_MS3.pdResult"
    ... )
    """
    ## check input
    _ok = check_input(decoy, "decoy", bool) if decoy is not None else True
    _ok = check_input(parse_modifications, "parse_modifications", bool)
    _ok = check_input(modifications, "modifications", dict, float)
    _ok = check_input(format, "format", str)
    _ok = check_input(sep, "sep", str)
    _ok = check_input(decimal, "decimal", str)
    _ok = check_input(ignore_errors, "ignore_errors", bool)
    _ok = check_input(verbose, "verbose", int)
    if verbose not in [0, 1, 2]:
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")

    ## helper functions
    def parse_modification_str(
        sequence: str,
        modification_str: str,
        modifications: Dict[str, float] = modifications,
    ) -> Dict[int, Tuple[str, float]]:
        mods = [mod.strip() for mod in modification_str.split(";")]
        parsed_mods = dict()
        for mod in mods:
            mod_type = ")".join("(".join(mod.split("(")[1:]).split(")")[:-1]).strip()
            mod_pos = mod.split("(")[0].strip()
            if mod_type not in modifications:
                raise KeyError(
                    f"Unable to find modification {mod_type} in the set of provided modifications. "
                    + "Please pass the full set of expected modifications to the parser."
                )
            if "Nterm" in mod_pos or "N-Term" in mod_pos:
                parsed_mods[0] = (mod_type, modifications[mod_type])
            elif "Cterm" in mod_pos or "C-Term" in mod_pos:
                parsed_mods[len(sequence)] = (mod_type, modifications[mod_type])
            else:
                parsed_mods[__parse_int(mod_pos[1:])] = (
                    mod_type,
                    modifications[mod_type],
                )
        return parsed_mods

    def get_crosslink_position_from_peptide_seq(
        sequence: str,
        crosslinker: str,
        modifications: str,
        ignore_errors: bool = False,
        verbose: Literal[0, 1, 2] = 1,
    ) -> int:
        seq = str(sequence).strip()
        xl = str(crosslinker).strip()
        mods = [mod.strip() for mod in str(modifications).split(";")]
        for i, aa in enumerate(seq):
            if aa == "[":
                return i + 1
        for mod in mods:
            if xl in mod:
                mod_pos: str = mod.split("[")[-1].split("]")[0].strip()
                if "Nterm" in mod_pos or "N-Term" in mod_pos:
                    return 0
                if "Cterm" in mod_pos or "C-Term" in mod_pos:
                    return len(format_sequence(sequence))
                try:
                    return __parse_int(mod_pos[1:])
                except Exception as _e:
                    pass
                try:
                    # legacy fallback, this should already be caught by the code above
                    return __parse_int(mod.split("[")[1].split("]")[0][1:])
                except Exception as _e:
                    pass
        if verbose == 2 or not ignore_errors:
            raise RuntimeError(
                f"Could not parse crosslink position from sequence: {seq}, or modifications {modifications}!"
            )
        if verbose == 1:
            warnings.warn(
                RuntimeWarning(
                    f"Could not parse crosslink position from sequence: {seq}, or modifications {modifications}!"
                )
            )
        return 100000

    def adjust_crosslink_position(
        position: int,
        sequence: str,
    ) -> int:
        if position == 0:
            return 1
        if position > len(sequence.strip()):
            return len(sequence.strip())
        return position

    def adjust_protein_position(position: int) -> int:
        if position == 0:
            return 1
        return position

    ## data structures
    crosslinks = list()
    csms = list()

    ## handle input
    if not isinstance(files, list):
        inputs = [files]
    else:
        inputs = files

    for input in inputs:
        ## reading data
        data_objects = None
        if format == "auto" and not isinstance(input, str):
            raise ValueError(
                "Can't detect format for file-like objects. Please specify format manually!"
            )
        # and isinstance specified for type checking
        if format == "auto" and isinstance(input, str):
            file_extension = splitext(input)[1].lower()
            if (
                file_extension == ".txt"
                or file_extension == ".tsv"
                or file_extension == ".csv"
            ):
                data_objects = [
                    pd.read_csv(
                        input, sep=sep, decimal=decimal, low_memory=False, **kwargs
                    )
                ]
            elif file_extension == ".xlsx":
                data_objects = [pd.read_excel(input, engine="openpyxl", **kwargs)]
            elif file_extension == ".pdresult":
                data_objects = __read_xlinkx_pdresult(input)
            else:
                raise ValueError(
                    f"Detected file extension {file_extension} is not supported! Input file has to be a valid file with extension '.csv', '.tsv', '.xlsx', or '.pdResult'!"
                )
        elif format in ["csv", "tsv", "txt", "xlsx"]:
            if format == "xlsx":
                data_objects = [pd.read_excel(input, engine="openpyxl", **kwargs)]
            else:
                data_objects = [
                    pd.read_csv(  # ty: ignore[no-matching-overload]
                        input, sep=sep, decimal=decimal, low_memory=False, **kwargs
                    )
                ]
        elif format == "pdresult":
            if not isinstance(input, str):
                raise TypeError(
                    "Can't read pdResult files from a file-like object/stream. Please provide the filename/path instead!"
                )
            data_objects = __read_xlinkx_pdresult(input)
        else:
            raise ValueError(
                f"Provided input format {format} is not supported! Input format has to be of type 'csv', 'tsv', 'xlsx', or 'pdresult'!"
            )
        if data_objects is None:
            raise RuntimeError(
                "Something went wrong while reading the file! Please file a bug report!"
            )
        # this should be impossible, but check here for pyright
        if not isinstance(data_objects, list):
            raise RuntimeError(
                "Something went wrong while reading the file! Please file a bug report!"
            )
        for data in data_objects:
            # this should be impossible, but check here for pyright
            if not isinstance(data, pd.DataFrame):
                raise RuntimeError(
                    "Something went wrong while reading the file! Please file a bug report!"
                )
            ## detect input file type
            col_names = data.columns.values.tolist()
            is_crosslink_dataframe = "Max. XlinkX Score" in col_names
            ## process data
            if is_crosslink_dataframe:
                for i, row in tqdm(
                    data.iterrows(),
                    total=data.shape[0],
                    desc="Reading XlinkX crosslinks...",
                ):
                    # create crosslink
                    crosslink = create_crosslink(
                        peptide_a=format_sequence(str(row["Sequence A"])),
                        xl_position_peptide_a=get_crosslink_position_from_peptide_seq(
                            str(row["Sequence A"]),
                            str(row["Crosslinker"]),
                            str(row["Modifications A"]),
                            ignore_errors,
                            verbose,
                        ),
                        proteins_a=[
                            protein.strip()
                            for protein in str(row["Accession A"]).split(";")
                        ],
                        xl_position_proteins_a=[
                            adjust_protein_position(__parse_int(position))
                            for position in str(row["Position A"]).split(";")
                        ],
                        decoy_a=get_bool_from_value(row["Is Decoy"]),
                        peptide_b=format_sequence(str(row["Sequence B"])),
                        xl_position_peptide_b=get_crosslink_position_from_peptide_seq(
                            str(row["Sequence B"]),
                            str(row["Crosslinker"]),
                            str(row["Modifications B"]),
                            ignore_errors,
                            verbose,
                        ),
                        proteins_b=[
                            protein.strip()
                            for protein in str(row["Accession B"]).split(";")
                        ],
                        xl_position_proteins_b=[
                            adjust_protein_position(__parse_int(position))
                            for position in str(row["Position B"]).split(";")
                        ],
                        decoy_b=get_bool_from_value(row["Is Decoy"]),
                        score=__parse_float(row["Max. XlinkX Score"]),
                        additional_information={
                            "source": __serialize_pandas_series(row)
                        },
                    )
                    crosslinks.append(crosslink)
            else:
                for i, row in tqdm(
                    data.iterrows(),
                    total=data.shape[0],
                    desc="Reading XlinkX CSMs...",
                ):
                    # create csm
                    csm = create_csm(
                        peptide_a=format_sequence(str(row["Sequence A"])),
                        modifications_a=parse_modification_str(
                            format_sequence(str(row["Sequence A"]).strip()),
                            str(row["Modifications A"]).strip(),
                        )
                        if parse_modifications
                        else None,
                        xl_position_peptide_a=adjust_crosslink_position(
                            __parse_int(row["Crosslinker Position A"]),
                            format_sequence(str(row["Sequence A"]).strip()),
                        ),
                        proteins_a=[
                            protein.strip()
                            for protein in str(row["Protein Accession A"]).split(";")
                        ],
                        xl_position_proteins_a=[
                            adjust_protein_position(__parse_int(position))
                            for position in str(
                                row["Leading Protein Position A"]
                            ).split(";")
                        ],
                        pep_position_proteins_a=[
                            adjust_protein_position(__parse_int(position))
                            - adjust_crosslink_position(
                                __parse_int(row["Crosslinker Position A"]),
                                format_sequence(str(row["Sequence A"]).strip()),
                            )
                            + 1
                            for position in str(
                                row["Leading Protein Position A"]
                            ).split(";")
                        ],
                        score_a=None,
                        decoy_a=get_bool_from_value(row["Is Decoy"])
                        if "Is Decoy" in col_names
                        else decoy,
                        peptide_b=format_sequence(str(row["Sequence B"])),
                        modifications_b=parse_modification_str(
                            format_sequence(str(row["Sequence B"]).strip()),
                            str(row["Modifications B"]).strip(),
                        )
                        if parse_modifications
                        else None,
                        xl_position_peptide_b=adjust_crosslink_position(
                            __parse_int(row["Crosslinker Position B"]),
                            format_sequence(str(row["Sequence B"])),
                        ),
                        proteins_b=[
                            protein.strip()
                            for protein in str(row["Protein Accession B"]).split(";")
                        ],
                        xl_position_proteins_b=[
                            adjust_protein_position(__parse_int(position))
                            for position in str(
                                row["Leading Protein Position B"]
                            ).split(";")
                        ],
                        pep_position_proteins_b=[
                            adjust_protein_position(__parse_int(position))
                            - adjust_crosslink_position(
                                __parse_int(row["Crosslinker Position B"]),
                                format_sequence(str(row["Sequence B"])),
                            )
                            + 1
                            for position in str(
                                row["Leading Protein Position B"]
                            ).split(";")
                        ],
                        score_b=None,
                        decoy_b=get_bool_from_value(row["Is Decoy"])
                        if "Is Decoy" in col_names
                        else decoy,
                        score=__parse_float(row["XlinkX Score"]),
                        spectrum_file=str(row["Spectrum File"]).strip(),
                        scan_nr=__parse_int(row["First Scan"]),
                        charge=__parse_int(row["Charge"]),
                        rt=__parse_float(row["RT [min]"]) * 60.0,
                        im_cv=None,
                        additional_information={
                            "source": __serialize_pandas_series(row)
                        },
                    )
                    csms.append(csm)
    ## check results
    if len(crosslinks) + len(csms) == 0:
        raise RuntimeError(
            "No crosslink-spectrum-matches or crosslinks were parsed! If this is unexpected, please file a bug report!"
        )
    ## return parser result
    return create_parser_result(
        search_engine="XlinkX",
        csms=csms if len(csms) > 0 else None,
        crosslinks=crosslinks if len(crosslinks) > 0 else None,
    )