# Source code for pyXLMS.parser._parser_xldbse_xinet_xiview

#!/usr/bin/env python3

# 2026 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import warnings
import pandas as pd
from tqdm import tqdm

from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._csm import create_csm
from ..data._crosslink import create_crosslink
from ..data._parser_result import create_parser_result
from ._util import format_sequence
from ._util import get_bool_from_value
from ._util import __serialize_pandas_series
from ._util import __parse_int, __parse_float

from typing import BinaryIO
from typing import List

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal



# [docs]
def read_xinet(
    files: str | List[str] | BinaryIO,
    sep: str = ",",
    decimal: str = ".",
    verbose: Literal[0, 1, 2] = 1,
    **kwargs,
) -> ParserResult:
    r"""Read a xiNET exported result file.

    Reads a result file that was exported from xiNET in ``.csv`` (comma delimited) format
    and returns a ``parser_result``. A file is parsed as crosslink-spectrum-matches if it
    contains the column ``ScanNumber`` and at least one of the columns ``run``,
    ``RawFileName`` or ``PeakListFileName``, otherwise it is parsed as crosslinks. Rows
    without a value in ``PepSeq1`` or ``PepSeq2`` are skipped.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the xiNET exported result file(s) or a file-like object/stream.
    sep : str, default = ","
        Separator used in the ``.csv`` file.
    decimal : str, default = "."
        Character to recognize as decimal point.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are issued via ``warnings.warn``.
        - 2: Warnings are treated as errors.
    **kwargs
        Any additional parameters will be passed to ``pandas.read_csv``.

    Returns
    -------
    ParserResult
        The ``parser_result`` object containing all parsed information.

    Raises
    ------
    RuntimeError
        If the file(s) contain no crosslink-spectrum-matches or crosslinks.
    RuntimeError
        If the number of proteins does not match the number of protein crosslink positions. Only
        raised if verbose is set to ``2`` otherwise ``None`` will be used!
    KeyError
        If one of the required columns is not found, or - only if verbose is set to ``2`` - if
        no spectrum file name or scan number could be determined for a crosslink-spectrum-match.
    TypeError
        If parameter verbose was not set correctly.

    Warnings
    --------
    Because modifications could be encoded in very different forms depending on the xiNET/xiVIEW input source,
    the parsing of modifications is not supported with this parser! For that purpose we would recommend using
    the original result file from the corresponding crosslink search engine directly!

    Examples
    --------
    >>> from pyXLMS.parser import read_xinet
    >>> csms = read_xinet("data/xiview/DDX39B_LCSDA_shared_links_open_clamped.csv")
    """
    ## check input
    check_input(sep, "sep", str)
    check_input(decimal, "decimal", str)
    if verbose not in [0, 1, 2]:
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")

    ## helper functions
    def __has_value(row: pd.Series, column: str) -> bool:
        # a column is only usable if it exists and the row holds a non-missing value
        return column in row and not pd.isna(row[column])  # pyright: ignore[reportGeneralTypeIssues]

    def __first_int(row: pd.Series, column: str) -> int:
        # cells may hold several ";"-delimited values, only the first one is used here
        return __parse_int(str(row[column]).split(";")[0])

    def __get_xl_position_peptide(row: pd.Series, alpha: bool) -> int:
        # columns of the alpha peptide end in "1", those of the beta peptide in "2"
        suffix = "1" if alpha else "2"
        link_column = f"LinkPos{suffix}"
        if __has_value(row, link_column):
            return __parse_int(row[link_column])
        pep_column = f"PepPos{suffix}"
        if __has_value(row, pep_column):
            pep_pos: int = __first_int(row, pep_column)
            # fall back to: position in protein - peptide position in protein + 1
            for protein_pos_column in (f"SeqPos{suffix}", f"AbsPos{suffix}"):
                if __has_value(row, protein_pos_column):
                    return __first_int(row, protein_pos_column) - pep_pos + 1
        raise KeyError(
            "Could not get a suitable column for the peptide crosslink position!"
        )

    def __get_proteins(row: pd.Series, alpha: bool) -> List[str] | None:
        column = "Protein1" if alpha else "Protein2"
        if __has_value(row, column):
            return [p.strip() for p in str(row[column]).split(";")]
        return None

    def __get_xl_position_proteins(row: pd.Series, alpha: bool) -> List[int] | None:
        suffix = "1" if alpha else "2"
        # prefer explicitly given protein positions
        for column in (f"SeqPos{suffix}", f"AbsPos{suffix}"):
            if __has_value(row, column):
                return [__parse_int(x) for x in str(row[column]).split(";")]
        # otherwise derive them: peptide position in protein + position in peptide - 1
        pep_column = f"PepPos{suffix}"
        link_column = f"LinkPos{suffix}"
        if __has_value(row, pep_column) and __has_value(row, link_column):
            link_pos: int = __parse_int(row[link_column])
            return [
                __parse_int(x) + link_pos - 1 for x in str(row[pep_column]).split(";")
            ]
        return None

    def __get_pep_position_proteins(
        xl_position_peptide: int, xl_position_proteins: List[int] | None
    ) -> List[int] | None:
        if xl_position_proteins is None:
            return None
        return [
            xl_position_protein - xl_position_peptide + 1
            for xl_position_protein in xl_position_proteins
        ]

    def __get_spectrum_file(row: pd.Series, verbose: int) -> str:
        for column in ("PeakListFileName", "RawFileName", "run"):
            if __has_value(row, column):
                return str(row[column]).strip()
        if verbose == 2:
            raise KeyError(
                "Could not get a suitable column or value for the spectrum file name!"
            )
        return ""

    def __get_scan_number(row: pd.Series, fallback: int, verbose: int) -> int:
        if __has_value(row, "ScanNumber"):
            return __parse_int(row["ScanNumber"])
        if __has_value(row, "Id"):
            try:
                return __parse_int(row["Id"])
            except Exception:
                # deliberate best effort: an "Id" that cannot be parsed as an
                # integer is ignored and the fallback below is used instead
                pass
        if verbose == 2:
            raise KeyError(
                "Could not get a suitable column or value for the scan number!"
            )
        return fallback

    def __parse_peptide(
        row: pd.Series, index, alpha: bool
    ) -> tuple[str, int, List[str] | None, List[int] | None, bool | None]:
        # parses peptide sequence, crosslink position, proteins, protein
        # crosslink positions and decoy flag of one of the two peptides
        suffix = "1" if alpha else "2"
        peptide: str = format_sequence(str(row[f"PepSeq{suffix}"]))
        xl_position_peptide: int = __get_xl_position_peptide(row, alpha=alpha)
        proteins: List[str] | None = __get_proteins(row, alpha=alpha)
        xl_position_proteins: List[int] | None = (
            __get_xl_position_proteins(row, alpha=alpha)
            if proteins is not None
            else None
        )
        if (
            proteins is not None
            and xl_position_proteins is not None
            and len(proteins) != len(xl_position_proteins)
        ):
            message = (
                f"Could not extract all proteins and protein crosslink positions for row with index {index}\n"
                f"Extracted proteins: {proteins}\nExtracted protein crosslink positions: {xl_position_proteins}!"
            )
            if verbose == 1:
                warnings.warn(RuntimeWarning(message))
            if verbose == 2:
                raise RuntimeError(message)
            # proteins and positions can't be matched up, so both are dropped
            proteins = None
            xl_position_proteins = None
        decoy_column = f"Decoy{suffix}"
        decoy: bool | None = (
            get_bool_from_value(row[decoy_column]) if decoy_column in row else None
        )
        return peptide, xl_position_peptide, proteins, xl_position_proteins, decoy

    ## data structures
    csms = []
    crosslinks = []

    ## handle input
    sources = files if isinstance(files, list) else [files]

    ## process data
    for source in sources:
        data = pd.read_csv(source, sep=sep, decimal=decimal, low_memory=False, **kwargs)  # ty: ignore[no-matching-overload]
        has_csms = "ScanNumber" in data and (
            "run" in data or "RawFileName" in data or "PeakListFileName" in data
        )
        if "PepSeq1" not in data or "PepSeq2" not in data:
            raise KeyError("Could not get a suitable column for the peptide sequence!")
        data = data.dropna(axis=0, subset=["PepSeq1", "PepSeq2"])
        # row_nr is the 1-based count of parsed rows in the current file, it is
        # used as the scan number if none can be read from the row
        for row_nr, (i, row) in enumerate(
            tqdm(
                data.iterrows(),
                total=data.shape[0],
                desc=f"Reading xiNET/xiVIEW {'CSMs' if has_csms else 'Crosslinks'}...",
            ),
            start=1,
        ):
            (
                peptide_a,
                xl_position_peptide_a,
                proteins_a,
                xl_position_proteins_a,
                decoy_a,
            ) = __parse_peptide(row, i, alpha=True)
            (
                peptide_b,
                xl_position_peptide_b,
                proteins_b,
                xl_position_proteins_b,
                decoy_b,
            ) = __parse_peptide(row, i, alpha=False)
            score: float | None = (
                __parse_float(row["Score"]) if "Score" in row else None
            )
            if not has_csms:
                # create crosslink
                crosslink = create_crosslink(
                    peptide_a=peptide_a,
                    xl_position_peptide_a=xl_position_peptide_a,
                    proteins_a=proteins_a,
                    xl_position_proteins_a=xl_position_proteins_a,
                    decoy_a=decoy_a,
                    peptide_b=peptide_b,
                    xl_position_peptide_b=xl_position_peptide_b,
                    proteins_b=proteins_b,
                    xl_position_proteins_b=xl_position_proteins_b,
                    decoy_b=decoy_b,
                    score=score,
                    additional_information={
                        "source": __serialize_pandas_series(row),
                    },
                )
                crosslinks.append(crosslink)
            else:
                # create csm
                csm = create_csm(
                    peptide_a=peptide_a,
                    modifications_a=None,
                    xl_position_peptide_a=xl_position_peptide_a,
                    proteins_a=proteins_a,
                    xl_position_proteins_a=xl_position_proteins_a,
                    pep_position_proteins_a=__get_pep_position_proteins(
                        xl_position_peptide_a, xl_position_proteins_a
                    ),
                    score_a=None,
                    decoy_a=decoy_a,
                    peptide_b=peptide_b,
                    modifications_b=None,
                    xl_position_peptide_b=xl_position_peptide_b,
                    proteins_b=proteins_b,
                    xl_position_proteins_b=xl_position_proteins_b,
                    pep_position_proteins_b=__get_pep_position_proteins(
                        xl_position_peptide_b, xl_position_proteins_b
                    ),
                    score_b=None,
                    decoy_b=decoy_b,
                    score=score,
                    spectrum_file=__get_spectrum_file(row, verbose),
                    scan_nr=__get_scan_number(row, row_nr, verbose),
                    charge=__parse_int(row["Charge"]) if "Charge" in row else None,
                    rt=None,
                    im_cv=None,
                    additional_information={
                        "source": __serialize_pandas_series(row),
                    },
                )
                csms.append(csm)
    ## check results
    if not csms and not crosslinks:
        raise RuntimeError(
            "No crosslink-spectrum-matches or crosslinks were parsed! If this is unexpected, please file a bug report!"
        )
    ## return parser result
    return create_parser_result(
        search_engine="xiNET/xiVIEW",
        csms=csms or None,
        crosslinks=crosslinks or None,
    )




# [docs]
def read_xiview(
    files: str | List[str] | BinaryIO,
    sep: str = ",",
    decimal: str = ".",
    verbose: Literal[0, 1, 2] = 1,
    **kwargs,
) -> ParserResult:
    r"""Read a xiVIEW exported result file.

    Parses a ``.csv`` (comma delimited) result file exported from xiVIEW into a
    ``parser_result``.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the xiVIEW exported result file(s) or a file-like object/stream.
    sep : str, default = ","
        Separator used in the ``.csv`` file.
    decimal : str, default = "."
        Character to recognize as decimal point.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are issued.
        - 2: Warnings are treated as errors.
    **kwargs
        Any additional parameters are forwarded unchanged to ``read_xinet()``.

    Returns
    -------
    ParserResult
        The ``parser_result`` object returned by ``read_xinet()``.

    Raises
    ------
    RuntimeError
        If the file(s) contain no crosslink-spectrum-matches or crosslinks.
    RuntimeError
        If the number of proteins does not match the number of protein crosslink positions. Only
        raised if verbose is set to ``2`` otherwise ``None`` will be used!
    KeyError
        If one of the required columns is not found.
    TypeError
        If parameter verbose was not set correctly.

    Notes
    -----
    This function is a pure alias: all arguments are handed to ``parser.read_xinet()`` and its
    result is returned as is, because both export formats share columns and that parser tries
    to exhaustively match all columns it can.

    Warnings
    --------
    Because modifications could be encoded in very different forms depending on the xiNET/xiVIEW input source,
    the parsing of modifications is not supported with this parser! For that purpose we would recommend using
    the original result file from the corresponding crosslink search engine directly!

    Examples
    --------
    >>> from pyXLMS.parser import read_xiview
    >>> csms = read_xiview("data/xiview/DDX39B_LCSDA_shared_links_open_clamped.csv")
    """
    # delegate everything to the xiNET parser, forwarding by keyword
    parser_result = read_xinet(
        files=files,
        sep=sep,
        decimal=decimal,
        verbose=verbose,
        **kwargs,
    )
    return parser_result