#!/usr/bin/env python3
# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com
from __future__ import annotations
from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._util import check_input_multi
from ..data._csm import create_crosslink_from_csm
from ..data._parser_result import create_parser_result
from ._util import get_available_keys
from ._util import check_available_keys
from ._util import assert_csms
from ._util import assert_xls
from ._util import assert_csms_or_xls
from typing import List
# legacy
try:
from typing import Literal
except ImportError:
from typing_extensions import Literal
def __score_better(
    score: float, reference: float, function: Literal["higher_better", "lower_better"]
) -> bool:
    r"""Decide whether a score beats a reference score.

    Compares ``score`` against ``reference`` under the given scoring scheme. The comparison
    is strict, so a score that is equal to the reference is never considered better.

    Parameters
    ----------
    score : float
        The candidate score.
    reference : float
        The score the candidate is compared against.
    function : str, one of "higher_better" or "lower_better"
        The scoring scheme. Any value other than "higher_better" is handled like
        "lower_better"; callers are expected to validate the value beforehand.

    Returns
    -------
    bool
        ``True`` if ``score`` is strictly better than ``reference``, ``False`` otherwise.
    """
    higher_wins = function == "higher_better"
    return score > reference if higher_wins else score < reference
def __get_csm_key(csm: CrosslinkSpectrumMatch) -> str:
    r"""Build the grouping key of a crosslink-spectrum-match.

    The key is composed of the ``spectrum_file`` and ``scan_nr`` fields of the
    crosslink-spectrum-match, joined by an underscore.

    Parameters
    ----------
    csm : CrosslinkSpectrumMatch
        A pyXLMS crosslink-spectrum-match object.

    Returns
    -------
    str
        The key in the form ``<spectrum_file>_<scan_nr>``.
    """
    spectrum_file = csm["spectrum_file"]
    scan_nr = csm["scan_nr"]
    return f"{spectrum_file}_{scan_nr}"
def __get_xl_key(
    xl: Crosslink | CrosslinkSpectrumMatch, by: Literal["peptide", "protein"]
) -> str:
    r"""Build the grouping key of a crosslink.

    For ``by = "peptide"`` the key consists of peptide sequence, peptide crosslink position
    and decoy flag of the alpha peptide followed by the same fields of the beta peptide.
    For any other value of ``by`` the key is built from the protein identifiers and protein
    crosslink positions: for each side the ``<protein>_<position>`` pairs are sorted and joined,
    the decoy flag is appended, and finally the two sides are sorted, which makes the protein
    key independent of which side is alpha and which is beta.

    Parameters
    ----------
    xl : Crosslink, or CrosslinkSpectrumMatch
        A pyXLMS crosslink object.
        Technically, a crosslink-spectrum-match is also allowed for type support in some functions.
    by : str, one of "peptide" or "protein"
        If peptide or protein crosslink position should be used for building the key.

    Returns
    -------
    str
        The grouping key of the crosslink.

    Notes
    -----
    This function should not be called directly, it is called from ``__unique_xls()``.

    NOTE(review): the peptide key is not sorted and therefore depends on the alpha/beta
    order; presumably that order is normalized when crosslinks are created - confirm.
    """
    if by == "peptide":
        alpha_part = f"{xl['alpha_peptide']}_{xl['alpha_peptide_crosslink_position']}_{xl['alpha_decoy']}"
        beta_part = f"{xl['beta_peptide']}_{xl['beta_peptide_crosslink_position']}_{xl['beta_decoy']}"
        return f"{alpha_part}-{beta_part}"
    # protein level: the protein fields have to be available for this crosslink
    # (check_available_keys presumably raises if they are not - confirm against _util)
    check_available_keys(
        [
            "alpha_proteins",
            "alpha_proteins_crosslink_positions",
            "beta_proteins",
            "beta_proteins_crosslink_positions",
        ],
        assert_csms_or_xls([xl]),
    )
    # index based access is kept on purpose: it fails loudly if a side has
    # fewer positions than proteins
    alpha_residues = sorted(
        f"{xl['alpha_proteins'][idx]}_{xl['alpha_proteins_crosslink_positions'][idx]}"
        for idx in range(len(xl["alpha_proteins"]))
    )
    alpha_key = "-".join(alpha_residues) + f"_{xl['alpha_decoy']}"
    beta_residues = sorted(
        f"{xl['beta_proteins'][idx]}_{xl['beta_proteins_crosslink_positions'][idx]}"
        for idx in range(len(xl["beta_proteins"]))
    )
    beta_key = "-".join(beta_residues) + f"_{xl['beta_decoy']}"
    # sorting both sides makes alpha/beta interchangeable
    side_keys = [alpha_key, beta_key]
    side_keys.sort()
    return ":".join(side_keys)
def __unique_csms(
    csms: List[CrosslinkSpectrumMatch],
    has_scores: bool,
    score: Literal["higher_better", "lower_better"],
) -> List[CrosslinkSpectrumMatch]:
    r"""Reduce a list of crosslink-spectrum-matches to one per spectrum.

    Crosslink-spectrum-matches are grouped by spectrum file and scan number. Within every
    group only the crosslink-spectrum-match with the best score is retained. If scores are
    not available, or if scores are tied, the crosslink-spectrum-match that occurs first in
    the list is retained.

    Parameters
    ----------
    csms : list of CrosslinkSpectrumMatch
        A list of pyXLMS crosslink-spectrum-match objects.
    has_scores : bool
        If the crosslink-spectrum-match objects contain scores.
    score : str, one of "higher_better" or "lower_better"
        If a higher score is considered better, or a lower score is considered better.

    Returns
    -------
    list of CrosslinkSpectrumMatch
        The retained crosslink-spectrum-matches, ordered by the first occurrence of their
        spectrum file and scan number in ``csms``.

    Notes
    -----
    This function should not be called directly, it is called from ``unique()``.
    """
    best_by_spectrum = dict()
    for candidate in csms:
        spectrum_key = __get_csm_key(candidate)
        if spectrum_key in best_by_spectrum:
            # without scores the first occurrence always wins
            if not has_scores:
                continue
            # a candidate only replaces the current best if it is strictly better
            if not __score_better(
                candidate["score"], best_by_spectrum[spectrum_key]["score"], score
            ):
                continue
        best_by_spectrum[spectrum_key] = candidate
    return list(best_by_spectrum.values())
def __unique_xls(
    xls: List[Crosslink],
    by: Literal["peptide", "protein"],
    has_scores: bool,
    score: Literal["higher_better", "lower_better"],
) -> List[Crosslink]:
    r"""Reduce a list of crosslinks to one per peptide pair or residue pair.

    Crosslinks are grouped by peptide sequence and peptide crosslink position if
    ``by = "peptide"``, otherwise by protein crosslink position (residue pair). Within every
    group only the crosslink with the best score is retained. If scores are not available,
    or if scores are tied, the crosslink that occurs first in the list is retained.

    Parameters
    ----------
    xls : list of Crosslink
        A list of pyXLMS crosslink objects.
    by : str, one of "peptide" or "protein"
        If peptide or protein crosslink position should be used for determining if a crosslink is unique.
    has_scores : bool
        If the crosslink objects contain scores.
    score : str, one of "higher_better" or "lower_better"
        If a higher score is considered better, or a lower score is considered better.

    Returns
    -------
    list of Crosslink
        The retained crosslinks, ordered by the first occurrence of their group in ``xls``.

    Notes
    -----
    This function should not be called directly, it is called from ``unique()``.
    """
    best_by_group = dict()
    for candidate in xls:
        group_key = __get_xl_key(candidate, by)
        if group_key in best_by_group:
            # without scores the first occurrence always wins
            if not has_scores:
                continue
            # a candidate only replaces the current best if it is strictly better
            if not __score_better(
                candidate["score"], best_by_group[group_key]["score"], score
            ):
                continue
        best_by_group[group_key] = candidate
    return list(best_by_group.values())
[docs]
def unique(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink] | ParserResult,
    by: Literal["peptide", "protein"] = "peptide",
    score: Literal["higher_better", "lower_better"] = "higher_better",
) -> List[CrosslinkSpectrumMatch] | List[Crosslink] | ParserResult:
    r"""Filter for unique crosslinks or crosslink-spectrum-matches.

    For crosslinks: crosslinks are grouped by peptide sequence and peptide crosslink position
    if ``by = "peptide"``, or by protein crosslink position (residue pair) if ``by = "protein"``.
    Of every group only the crosslink with the better/best score is kept. If crosslinks
    without scores are provided, the first crosslink of every group is kept instead.

    For crosslink-spectrum-matches: crosslink-spectrum-matches are grouped by spectrum file
    and scan number. Of every group only the crosslink-spectrum-match with the better/best
    score is kept. If crosslink-spectrum-matches without scores are provided, the first
    crosslink-spectrum-match of every group is kept instead.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, list of Crosslink, or ParserResult
        A list of crosslink-spectrum-matches or crosslinks to filter, or a parser_result.
    by : str, one of "peptide" or "protein", default = "peptide"
        If peptide or protein crosslink position should be used for determining if a crosslink is unique.
        Only affects filtering for unique crosslinks and not crosslink-spectrum-matches. If protein crosslink
        position is not available for all crosslinks a ``ValueError`` will be raised. Make sure that all
        crosslinks have the ``_proteins`` and ``_proteins_crosslink_positions`` fields set. If this is not
        already done by the parser, this can be achieved with ``transform.reannotate_positions()``.
    score : str, one of "higher_better" or "lower_better", default = "higher_better"
        If a higher score is considered better, or a lower score is considered better.

    Returns
    -------
    list of CrosslinkSpectrumMatch, list of Crosslink, or ParserResult
        If a list of crosslink-spectrum-matches or crosslinks was provided, a list of unique
        crosslink-spectrum-matches or crosslinks is returned. An empty list is returned as is.
        If a parser_result was provided, a new parser_result with unique crosslink-spectrum-matches
        and/or unique crosslinks will be returned.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If parameter by is not one of 'peptide' or 'protein'.
    TypeError
        If parameter score is not one of 'higher_better' or 'lower_better'.
    ValueError
        If parameter by is set to 'protein' but protein crosslink positions are not available.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import unique
    >>> pr = read(
    ...     ["data/_test/aggregate/csms.txt", "data/_test/aggregate/xls.txt"],
    ...     engine="custom",
    ...     crosslinker="DSS",
    ... )
    >>> len(pr["crosslink-spectrum-matches"])
    10
    >>> len(pr["crosslinks"])
    10
    >>> unique_peptide = unique(pr, by="peptide")
    >>> len(unique_peptide["crosslink-spectrum-matches"])
    5
    >>> len(unique_peptide["crosslinks"])
    3

    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import unique
    >>> pr = read(
    ...     ["data/_test/aggregate/csms.txt", "data/_test/aggregate/xls.txt"],
    ...     engine="custom",
    ...     crosslinker="DSS",
    ... )
    >>> len(pr["crosslink-spectrum-matches"])
    10
    >>> len(pr["crosslinks"])
    10
    >>> unique_protein = unique(pr, by="protein")
    >>> len(unique_protein["crosslink-spectrum-matches"])
    5
    >>> len(unique_protein["crosslinks"])
    2
    """
    # input validation
    check_input_multi(data, "data", [ParserResult, list])
    check_input(by, "by", str)
    check_input(score, "score", str)
    if by not in ["peptide", "protein"]:
        raise TypeError(
            "Parameter 'by' has to be one of 'peptide' or 'protein'! Option 'peptide' will group by peptide sequence and "
            "peptide crosslink position while option 'protein' will group by protein identifier and protein crosslink position."
        )
    if score not in ["higher_better", "lower_better"]:
        raise TypeError(
            "Parameter 'score' has to be one of 'higher_better' or 'lower_better'! If two identical crosslinks or crosslink-spectrum"
            "-matches are found, the one with the higher score is kept if 'higher_better' is selected, and vice versa."
        )

    # parser_result: filter both contained lists separately via recursion
    if not isinstance(data, list):
        unique_csms = None
        if data["crosslink-spectrum-matches"] is not None:
            unique_csms = assert_csms(
                unique(data["crosslink-spectrum-matches"], by, score)
            )
        unique_crosslinks = None
        if data["crosslinks"] is not None:
            unique_crosslinks = assert_xls(unique(data["crosslinks"], by, score))
        return create_parser_result(
            search_engine=data["search_engine"],
            csms=unique_csms,
            crosslinks=unique_crosslinks,
        )

    # nothing to filter
    if len(data) == 0:
        return data

    records = assert_csms_or_xls(data)
    available_keys = get_available_keys(records)
    # the type of the first element decides how the list is treated
    if isinstance(records[0], Crosslink):
        if by == "protein":
            # grouping by residue pair requires protein annotation on the crosslinks
            check_available_keys(
                [
                    "alpha_proteins",
                    "alpha_proteins_crosslink_positions",
                    "beta_proteins",
                    "beta_proteins_crosslink_positions",
                ],
                records,
            )
        return __unique_xls(assert_xls(records), by, available_keys["score"], score)
    # crosslink-spectrum-matches, parameter by is not used here
    return __unique_csms(assert_csms(records), available_keys["score"], score)
[docs]
def aggregate(
    csms: List[CrosslinkSpectrumMatch],
    by: Literal["peptide", "protein"] = "peptide",
    score: Literal["higher_better", "lower_better"] = "higher_better",
) -> List[Crosslink]:
    r"""Aggregate crosslink-spectrum-matches to crosslinks.

    Every crosslink-spectrum-match is converted to a crosslink and the resulting crosslinks
    are filtered with ``unique()``. Crosslinks are grouped by peptide sequence and peptide
    crosslink position if ``by = "peptide"``, or by protein crosslink position (residue pair)
    if ``by = "protein"``. Of every group only the crosslink with the better/best score is
    kept. If crosslink-spectrum-matches without scores are provided, the crosslink of the
    first corresponding crosslink-spectrum-match in the list is kept instead.

    Parameters
    ----------
    csms : list of CrosslinkSpectrumMatch
        A list of crosslink-spectrum-matches.
    by : str, one of "peptide" or "protein", default = "peptide"
        If peptide or protein crosslink position should be used for determining if a crosslink is unique.
        If protein crosslink position is not available for all crosslink-spectrum-matches a ``ValueError``
        will be raised. Make sure that all crosslink-spectrum-matches have the ``_proteins`` and
        ``_proteins_crosslink_positions`` fields set. If this is not already done by the parser, this can
        be achieved with ``transform.reannotate_positions()``.
    score : str, one of "higher_better" or "lower_better", default = "higher_better"
        If a higher score is considered better, or a lower score is considered better.

    Returns
    -------
    list of Crosslink
        A list of aggregated, unique crosslinks. An empty list if no crosslink-spectrum-matches
        were provided.

    Warnings
    --------
    Aggregation will not conserve false discovery rate (FDR)! Aggregating crosslink-spectrum-matches that are
    validated for 1% FDR will not result in crosslinks validated for 1% FDR! Aggregated crosslinks should be
    validated with either external tools or with the built-in ``transform.validate()``!

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If parameter by is not one of 'peptide' or 'protein'.
    TypeError
        If parameter score is not one of 'higher_better' or 'lower_better'.
    ValueError
        If parameter by is set to 'protein' but protein crosslink positions are not available.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import aggregate
    >>> pr = read("data/_test/aggregate/csms.txt", engine="custom", crosslinker="DSS")
    >>> len(pr["crosslink-spectrum-matches"])
    10
    >>> aggregate_peptide = aggregate(pr["crosslink-spectrum-matches"], by="peptide")
    >>> len(aggregate_peptide)
    3

    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import aggregate
    >>> pr = read("data/_test/aggregate/csms.txt", engine="custom", crosslinker="DSS")
    >>> len(pr["crosslink-spectrum-matches"])
    10
    >>> aggregate_protein = aggregate(pr["crosslink-spectrum-matches"], by="protein")
    >>> len(aggregate_protein)
    2
    """
    # input validation
    check_input(csms, "csms", list, CrosslinkSpectrumMatch)
    check_input(by, "by", str)
    check_input(score, "score", str)
    if by not in ["peptide", "protein"]:
        raise TypeError(
            "Parameter 'by' has to be one of 'peptide' or 'protein'! Option 'peptide' will group by peptide sequence and "
            "peptide crosslink position while option 'protein' will group by protein identifier and protein crosslink position."
        )
    if score not in ["higher_better", "lower_better"]:
        raise TypeError(
            "Parameter 'score' has to be one of 'higher_better' or 'lower_better'! If two identical crosslinks or crosslink-spectrum"
            "-matches are found, the one with the higher score is kept if 'higher_better' is selected, and vice versa."
        )

    # nothing to aggregate
    if len(csms) == 0:
        return []

    if by == "protein":
        # grouping by residue pair requires protein annotation on the crosslink-spectrum-matches
        check_available_keys(
            [
                "alpha_proteins",
                "alpha_proteins_crosslink_positions",
                "beta_proteins",
                "beta_proteins_crosslink_positions",
            ],
            csms,
        )

    # convert every crosslink-spectrum-match to a crosslink, then keep the best per group
    crosslinks = list(map(create_crosslink_from_csm, csms))
    return assert_xls(unique(crosslinks, by, score))