#!/usr/bin/env python3
# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com
from __future__ import annotations
from .parser import read
from .transform_summary import summary as transform_summary
from .transform_aggregate import unique as transform_unique
from .transform_validate import validate as transform_validate
from .transform_targets_only import targets_only as transform_targets_only
from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Any
from typing import List
# legacy
try:
from typing import Literal
except ImportError:
from typing_extensions import Literal
[docs]
def pipeline(
files: str | List[str] | BinaryIO,
engine: Literal[
"Custom",
"MaxQuant",
"MaxLynx",
"MS Annika",
"mzIdentML",
"pLink",
"Scout",
"xiSearch/xiFDR",
"XlinkX",
],
crosslinker: str,
unique: Optional[bool | Dict[str, Any]] = True,
validate: Optional[bool | Dict[str, Any]] = True,
targets_only: Optional[bool] = True,
**kwargs,
) -> Dict[str, Any]:
r"""Runs a standard down-stream analysis pipeline for crosslinks and crosslink-spectrum-matches.
Runs a standard down-stream analysis pipeline for crosslinks and crosslink-spectrum-matches. The pipeline first reads
a result file and subsequently optionally filters the the read data for unique crosslinks and crosslink-spectrum-matches,
optionally the data is validated by false discovery rate estimation and - also optionally - only target-target matches
are returned. Internally the pipeline calls ``parser.read()``, ``transform.unique()``, ``transform.validate()``, and
``transform.targets_only()``.
Parameters
----------
files : str, list of str, or file stream
The name/path of the result file(s) or a file-like object/stream.
engine : "Custom", "MaxQuant", "MaxLynx", "MS Annika", "mzIdentML", "pLink", "Scout", "xiSearch/xiFDR", or "XlinkX"
Crosslink search engine or format of the result file.
crosslinker : str
Name of the used cross-linking reagent, for example "DSSO".
unique : dict of str, any, or bool, or None, default = True
If ``transform.unique()`` should be run in the pipeline. If None or False this step is omitted.
If True this step is run with default parameters. If a dictionary is given it should contain parameters for
running ``transform.unique()``. Omitting a parameter in the dictionary will fall back to its default value.
validate : dict of str, any, or bool, or None, default = True
If ``transform.validate()`` should be run in the pipeline. If None or False this step is omitted.
If True this step is run with default parameters. If a dictionary is given it should contain parameters for
running ``transform.validate()``. Omitting a parameter in the dictionary will fall back to its default value.
targets_only : bool, or None, default = True
If ``transform.targets_only()`` should be run in the pipeline. If None or False this step is omitted.
**kwargs
Any additional parameters will be passed to the specific result file parsers.
Returns
-------
dict of str, any
The transformed parser_result after all pipeline steps are completed.
Raises
------
TypeError
If any of the parameters do not have the correct type.
Notes
-----
Various helpful pipeline information is also printed to ``stdout``.
Examples
--------
>>> from pyXLMS.pipelines import pipeline
>>> pr = pipeline("data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
... engine="MS Annika",
... crosslinker="DSS",
... unique=True,
... validate={"fdr": 0.05, "formula":"(TD-DD)/TT"},
... targets_only=True)
Reading MS Annika CSMs...: 100%|██████████████████████████████████████████████████| 826/826 [00:00<00:00, 10337.98it/s]
---- Summary statistics before pipeline ----
Number of CSMs: 826.0
Number of unique CSMs: 826.0
Number of intra CSMs: 803.0
Number of inter CSMs: 23.0
Number of target-target CSMs: 786.0
Number of target-decoy CSMs: 39.0
Number of decoy-decoy CSMs: 1.0
Minimum CSM score: 1.11
Maximum CSM score: 452.99
Iterating over scores for FDR calculation...: 0%| | 0/826 [00:00<?, ?it/s]
---- Summary statistics after pipeline ----
Number of CSMs: 786.0
Number of unique CSMs: 786.0
Number of intra CSMs: 774.0
Number of inter CSMs: 12.0
Number of target-target CSMs: 786.0
Number of target-decoy CSMs: 0.0
Number of decoy-decoy CSMs: 0.0
Minimum CSM score: 1.28
Maximum CSM score: 452.99
---- Performed pipeline steps ----
:: parser.read() ::
:: parser.read() :: params :: <params omitted>
:: transform.unique() ::
:: transform.unique() :: params :: by=peptide
:: transform.unique() :: params :: score=higher_better
:: transform.validate() ::
:: transform.validate() :: params :: fdr=0.05
:: transform.validate() :: params :: formula=(TD-DD)/TT
:: transform.validate() :: params :: score=higher_better
:: transform.validate() :: params :: separate_intra_inter=False
:: transform.validate() :: params :: ignore_missing_labels=False
:: transform.targets_only() ::
:: transform.targets_only() :: params :: no params
"""
# steps: reading
pr = read(files, engine=engine, crosslinker=crosslinker, **kwargs)
# steps: summary (before)
print("---- Summary statistics before pipeline ----")
_ = transform_summary(pr)
# steps: unique
unique_params = {"by": "peptide", "score": "higher_better"}
if unique is not None:
if isinstance(unique, dict):
unique_params.update(unique)
pr = transform_unique(
pr,
by=str(unique_params["by"]), # pyright: ignore[reportArgumentType]
score=str(unique_params["score"]), # pyright: ignore[reportArgumentType]
)
elif isinstance(unique, bool):
if unique:
pr = transform_unique(
pr,
by=str(unique_params["by"]), # pyright: ignore[reportArgumentType]
score=str(unique_params["score"]), # pyright: ignore[reportArgumentType]
)
else:
raise TypeError(
"Parameter unique has to be a dictionary of parameters for transform.unique(), a boolean or None!"
)
# steps: validate
validate_params = {
"fdr": 0.01,
"formula": "D/T",
"score": "higher_better",
"separate_intra_inter": False,
"ignore_missing_labels": False,
}
if validate is not None:
if isinstance(validate, dict):
validate_params.update(validate)
pr = transform_validate(
pr,
fdr=float(validate_params["fdr"]),
formula=str(validate_params["formula"]), # pyright: ignore[reportArgumentType]
score=str(validate_params["score"]), # pyright: ignore[reportArgumentType]
separate_intra_inter=bool(validate_params["separate_intra_inter"]),
ignore_missing_labels=bool(validate_params["ignore_missing_labels"]),
)
elif isinstance(validate, bool):
if validate:
pr = transform_validate(
pr,
fdr=float(validate_params["fdr"]),
formula=str(validate_params["formula"]), # pyright: ignore[reportArgumentType]
score=str(validate_params["score"]), # pyright: ignore[reportArgumentType]
separate_intra_inter=bool(validate_params["separate_intra_inter"]),
ignore_missing_labels=bool(
validate_params["ignore_missing_labels"]
),
)
else:
raise TypeError(
"Parameter validate has to be a dictionary of parameters for transform.validate(), a boolean or None!"
)
# steps: targets only
if targets_only is not None:
if isinstance(targets_only, bool):
if targets_only:
pr = transform_targets_only(pr)
else:
raise TypeError("Parameter targets_only has to be a boolean or None!")
# steps: summary (after)
print("---- Summary statistics after pipeline ----")
_ = transform_summary(pr)
# steps: pipeline summary
print("---- Performed pipeline steps ----")
print(":: parser.read() ::")
print(":: parser.read() :: params :: <params omitted>")
if unique is not None:
if isinstance(unique, dict) or (isinstance(unique, bool) and unique):
print(":: transform.unique() ::")
for k, v in unique_params.items():
print(f":: transform.unique() :: params :: {k}={v}")
if validate is not None:
if isinstance(validate, dict) or (isinstance(validate, bool) and validate):
print(":: transform.validate() ::")
for k, v in validate_params.items():
print(f":: transform.validate() :: params :: {k}={v}")
if targets_only is not None and targets_only:
print(":: transform.targets_only() ::")
print(":: transform.targets_only() :: params :: no params")
# steps: finalize
if not isinstance(pr, dict):
raise RuntimeError(
"Something went wrong while running the pipeline.\n"
f"Expected data type: dict. Got: {type(pr)}."
)
return pr