#!/usr/bin/env python3
# 2024 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com
from __future__ import annotations
from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._util import check_input_multi
from typing import Optional
from typing import Any
from typing import Dict
from typing import Tuple
from typing import List
# legacy
try:
from typing import Literal
except ImportError:
from typing_extensions import Literal
[docs]
def modifications_to_str(
    modifications: Optional[Dict[int, Tuple[str, float]]],
) -> str | None:
    r"""Returns the string representation of a modifications dictionary.

    Every modification is rendered as ``(position:[name|delta mass])``. The
    entries are ordered by ascending position and separated by ``;``.

    Parameters
    ----------
    modifications : dict of [int, tuple], or None
        The modifications of a peptide given as a dictionary that maps peptide position (1-based) to modification given as a tuple of modification name and modification delta mass.
        ``N-terminal`` modifications should be denoted with position ``0``. ``C-terminal`` modifications should be denoted with position ``len(peptide) + 1``.

    Returns
    -------
    str, or None
        The string representation of the modifications (or ``None`` if no modification was provided).
        An empty dictionary yields an empty string.

    Examples
    --------
    >>> from pyXLMS.transform import modifications_to_str
    >>> modifications_to_str(
    ...     {1: ("Oxidation", 15.994915), 5: ("Carbamidomethyl", 57.021464)}
    ... )
    '(1:[Oxidation|15.994915]);(5:[Carbamidomethyl|57.021464])'
    """
    # nothing to validate or format if no modifications were given
    if modifications is None:
        return None
    ## check input
    check_input(modifications, "modifications", dict, tuple)
    return ";".join(
        f"({position}:[{modifications[position][0]}|{modifications[position][1]}])"
        for position in sorted(modifications)
    )
[docs]
def assert_csms(maybe_csms: Any) -> List[CrosslinkSpectrumMatch]:
    r"""Checks that the provided input is a list of type CrosslinkSpectrumMatch.

    Parameters
    ----------
    maybe_csms : any
        The input data to be checked.

    Returns
    -------
    list of CrosslinkSpectrumMatch
        Returns a list of type CrosslinkSpectrumMatch if the provided data was one.
        The returned list is a new list object containing the same items.

    Raises
    ------
    TypeError
        If the provided data was not a list of CrosslinkSpectrumMatch.
    """
    error_message = "Provided input is not a valid list of type CrosslinkSpectrumMatch!"
    if not isinstance(maybe_csms, list):
        raise TypeError(error_message)
    if not all(isinstance(item, CrosslinkSpectrumMatch) for item in maybe_csms):
        raise TypeError(error_message)
    # shallow copy so that the caller's list object is never handed back
    return list(maybe_csms)
[docs]
def assert_xls(maybe_xls: Any) -> List[Crosslink]:
    r"""Checks that the provided input is a list of type Crosslink.

    Parameters
    ----------
    maybe_xls : any
        The input data to be checked.

    Returns
    -------
    list of Crosslink
        Returns a list of type Crosslink if the provided data was one.
        The returned list is a new list object containing the same items.

    Raises
    ------
    TypeError
        If the provided data was not a list of Crosslink.
    """
    error_message = "Provided input is not a valid list of type Crosslink!"
    if not isinstance(maybe_xls, list):
        raise TypeError(error_message)
    if not all(isinstance(item, Crosslink) for item in maybe_xls):
        raise TypeError(error_message)
    # shallow copy so that the caller's list object is never handed back
    return list(maybe_xls)
[docs]
def assert_csms_or_xls(
    maybe_csms_or_xls: Any,
) -> List[CrosslinkSpectrumMatch] | List[Crosslink]:
    r"""Checks that the provided input is a list of type CrosslinkSpectrumMatch or Crosslink.

    Parameters
    ----------
    maybe_csms_or_xls : any
        The input data to be checked.

    Returns
    -------
    list of CrosslinkSpectrumMatch, or list of Crosslink
        Returns a list of type CrosslinkSpectrumMatch, or a list of type Crosslink
        if the provided data was either. An empty input list is returned as a new
        empty list.

    Raises
    ------
    TypeError
        If the provided data was neither a list of CrosslinkSpectrumMatch nor a list
        of Crosslink. This includes non-list input and lists that mix both types.
    """
    if isinstance(maybe_csms_or_xls, list):
        # an empty list is valid for both types
        if not maybe_csms_or_xls:
            return []
        if all(isinstance(item, CrosslinkSpectrumMatch) for item in maybe_csms_or_xls):
            return assert_csms(maybe_csms_or_xls)
        if all(isinstance(item, Crosslink) for item in maybe_csms_or_xls):
            return assert_xls(maybe_csms_or_xls)
    raise TypeError(
        "Provided input is not a valid list of type CrosslinkSpectrumMatch or Crosslink!"
    )
[docs]
def assert_data_type_same(
    data_list: List[CrosslinkSpectrumMatch] | List[Crosslink] | List[ParserResult],
) -> bool:
    r"""Checks that all data is of the same data type.

    Verifies that all elements in the provided list are of the same data type.
    The type of the first element is used as the reference type that all other
    elements are compared against.

    Parameters
    ----------
    data_list : list of CrosslinkSpectrumMatch, or list of Crosslink, or list of ParserResult
        The list of elements to be checked.

    Returns
    -------
    bool
        ``True`` if all elements are of the same data type (or the list is empty),
        ``False`` if at least one element is not an instance of the type of the
        first element.

    Raises
    ------
    TypeError
        If all elements share the same type but that type is not
        CrosslinkSpectrumMatch, Crosslink, or ParserResult.

    Examples
    --------
    >>> from pyXLMS.transform import assert_data_type_same
    >>> from pyXLMS import data
    >>> data_list = [
    ...     data.create_crosslink_min("PEPK", 4, "PKEP", 2),
    ...     data.create_crosslink_min("KPEP", 1, "PEKP", 3),
    ... ]
    >>> assert_data_type_same(data_list)
    True

    >>> from pyXLMS.transform import assert_data_type_same
    >>> from pyXLMS import data
    >>> data_list = [
    ...     data.create_crosslink_min("PEPK", 4, "PKEP", 2),
    ...     data.create_csm_min("KPEP", 1, "PEKP", 3, "RUN_1", 1),
    ... ]
    >>> assert_data_type_same(data_list)
    False
    """
    # NOTE(review): check_input presumably raises for non-list input - confirm
    # against ..data._util
    check_input(data_list, "data_list", list)
    if not data_list:
        return True
    first = data_list[0]
    reference_type = type(first)
    # a type mismatch is reported as False before the supported-type check runs
    if any(not isinstance(item, reference_type) for item in data_list[1:]):
        return False
    if isinstance(first, (CrosslinkSpectrumMatch, Crosslink, ParserResult)):
        return True
    raise TypeError(
        "Input list contains elements that are not of type CrosslinkSpectrumMatch, Crosslink, or ParserResult!"
    )
[docs]
# Maps every key of the dictionary returned for crosslinks to whether
# get_available_keys() has to check the field for None (True) or reports it as
# always available (False). The insertion order defines both the order of the
# returned keys and the order in which the fields are checked.
_CROSSLINK_KEY_CHECKS: Dict[str, bool] = {
    "data_type": False,
    "completeness": False,
    "alpha_peptide": False,
    "alpha_peptide_crosslink_position": False,
    "alpha_proteins": True,
    "alpha_proteins_crosslink_positions": True,
    "alpha_decoy": True,
    "beta_peptide": False,
    "beta_peptide_crosslink_position": False,
    "beta_proteins": True,
    "beta_proteins_crosslink_positions": True,
    "beta_decoy": True,
    "crosslink_type": False,
    "score": True,
    "additional_information": True,
}

# Same as above but for crosslink-spectrum-matches.
_CSM_KEY_CHECKS: Dict[str, bool] = {
    "data_type": False,
    "completeness": False,
    "alpha_peptide": False,
    "alpha_modifications": True,
    "alpha_peptide_crosslink_position": False,
    "alpha_proteins": True,
    "alpha_proteins_crosslink_positions": True,
    "alpha_proteins_peptide_positions": True,
    "alpha_score": True,
    "alpha_decoy": True,
    "beta_peptide": False,
    "beta_modifications": True,
    "beta_peptide_crosslink_position": False,
    "beta_proteins": True,
    "beta_proteins_crosslink_positions": True,
    "beta_proteins_peptide_positions": True,
    "beta_score": True,
    "beta_decoy": True,
    "crosslink_type": False,
    "score": True,
    "spectrum_file": False,
    "scan_nr": False,
    "charge": True,
    "retention_time": True,
    "ion_mobility": True,
    "additional_information": True,
}


def get_available_keys(
    data_list: List[CrosslinkSpectrumMatch] | List[Crosslink],
    always_revalidate: bool = True,
) -> Dict[str, bool]:
    r"""Checks which data is available from a list of crosslinks or crosslink-spectrum-matches.

    Verifies which data fields have been set for all crosslinks or crosslink-spectrum-matches in the
    given list. Will return a dictionary structured the same as a crosslink or crosslink-spectrum-match,
    but instead of the data it will return either True or False, depending if the field was set or not.

    Parameters
    ----------
    data_list : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of crosslinks or crosslink-spectrum-matches.
    always_revalidate : bool, default = True
        If ``True`` (default) the assigned ``completeness`` will be ignored and all data fields
        are re-checked. This is safer especially when data has been modified post reading.
        If ``False`` elements with ``completeness`` equal to ``"full"`` are skipped.

    Returns
    -------
    dict of str, bool
        - If a list of crosslinks was provided, a dictionary with the following keys will be returned, where the value
          of each key denotes if the data field is available for all crosslinks in ``data_list``.
          Keys: ``data_type``, ``completeness``, ``alpha_peptide``, ``alpha_peptide_crosslink_position``,
          ``alpha_proteins``, ``alpha_proteins_crosslink_positions``, ``alpha_decoy``, ``beta_peptide``, ``beta_peptide_crosslink_position``,
          ``beta_proteins``, ``beta_proteins_crosslink_positions``, ``beta_decoy``, ``crosslink_type``, ``score``, and ``additional_information``.
        - If a list of crosslink-spectrum-matches was provided, a dictionary with the following keys will be returned, where the value
          of each key denotes if the data field is available for all crosslink-spectrum-matches in ``data_list``.
          Keys: ``data_type``, ``completeness``, ``alpha_peptide``, ``alpha_modifications``,
          ``alpha_peptide_crosslink_position``, ``alpha_proteins``, ``alpha_proteins_crosslink_positions``, ``alpha_proteins_peptide_positions``,
          ``alpha_score``, ``alpha_decoy``, ``beta_peptide``, ``beta_modifications``, ``beta_peptide_crosslink_position``, ``beta_proteins``,
          ``beta_proteins_crosslink_positions``, ``beta_proteins_peptide_positions``, ``beta_score``, ``beta_decoy``, ``crosslink_type``, ``score``,
          ``spectrum_file``, ``scan_nr``, ``charge``, ``retention_time``, ``ion_mobility``, and ``additional_information``.

    Raises
    ------
    TypeError
        If not all elements in ``data_list`` are of the same data type.
    TypeError
        If one or more elements in the list are of an unsupported data type.
    ValueError
        If ``data_list`` does not contain any elements.

    Examples
    --------
    >>> from pyXLMS.transform import get_available_keys
    >>> from pyXLMS import data
    >>> data_list = [
    ...     data.create_crosslink_min("PEPK", 4, "PKEP", 2),
    ...     data.create_crosslink_min("KPEP", 1, "PEKP", 3),
    ... ]
    >>> available_keys = get_available_keys(data_list)
    >>> available_keys["alpha_peptide"]
    True
    >>> available_keys["score"]
    False
    """
    if not assert_data_type_same(data_list):
        raise TypeError("Not all elements of the list have the same data type!")
    if len(data_list) == 0:
        raise ValueError("Provided data does not contain any elements!")
    # select the key table matching the data type of the (homogeneous) list
    first = data_list[0]
    if isinstance(first, Crosslink):
        key_checks = _CROSSLINK_KEY_CHECKS
    elif isinstance(first, CrosslinkSpectrumMatch):
        key_checks = _CSM_KEY_CHECKS
    else:
        raise TypeError(
            f"Unknown data type {type(data_list[0])}. Data type must be Crosslink or CrosslinkSpectrumMatch!"
        )
    checked_keys = [key for key, needs_check in key_checks.items() if needs_check]
    # every key starts out as available and is switched off by the first None
    available_keys = dict.fromkeys(key_checks, True)
    for data in data_list:
        if data["completeness"] != "full" or always_revalidate:
            for key in checked_keys:
                if data[key] is None:
                    available_keys[key] = False
    return available_keys
[docs]
def check_available_keys(
    required_keys: List[
        Literal[
            "data_type",
            "completeness",
            "alpha_peptide",
            "alpha_modifications",
            "alpha_peptide_crosslink_position",
            "alpha_proteins",
            "alpha_proteins_crosslink_positions",
            "alpha_proteins_peptide_positions",
            "alpha_score",
            "alpha_decoy",
            "beta_peptide",
            "beta_modifications",
            "beta_peptide_crosslink_position",
            "beta_proteins",
            "beta_proteins_crosslink_positions",
            "beta_proteins_peptide_positions",
            "beta_score",
            "beta_decoy",
            "crosslink_type",
            "score",
            "spectrum_file",
            "scan_nr",
            "charge",
            "retention_time",
            "ion_mobility",
            "additional_information",
        ]
    ],
    data_list: List[CrosslinkSpectrumMatch] | List[Crosslink],
    always_revalidate: bool = True,
) -> bool:
    r"""Checks if all required keys are available in a list of crosslinks or crosslink-spectrum-matches.

    The availability of the keys is determined with ``get_available_keys()``,
    every required key is then looked up in the returned dictionary.

    Parameters
    ----------
    required_keys : list of keys
        A list of valid Crosslink or CrosslinkSpectrumMatch keys/attributes to be checked.
    data_list : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of crosslinks or crosslink-spectrum-matches.
    always_revalidate : bool, default = True
        If ``True`` (default) the assigned ``completeness`` will be ignored and all data fields
        are re-checked. This is safer especially when data has been modified post reading.

    Returns
    -------
    bool
        True if all items in the data list have the required keys and the keys are not None.

    Raises
    ------
    ValueError
        If one of the keys is not available or None in any of the items in the data list.

    Examples
    --------
    >>> from pyXLMS.transform import check_available_keys
    >>> from pyXLMS import data
    >>> data_list = [
    ...     data.create_crosslink_min("PEPK", 4, "PKEP", 2),
    ...     data.create_crosslink_min("KPEP", 1, "PEKP", 3),
    ... ]
    >>> check_available_keys(["alpha_peptide"], data_list)
    True
    >>> check_available_keys(["score"], data_list)
    ValueError: Attribute 'score' is missing in at least one element but is required!
    """
    availability = get_available_keys(data_list, always_revalidate)
    for required in required_keys:
        # unknown keys are treated the same as keys that are not available
        if not availability.get(required, False):
            raise ValueError(
                f"Attribute '{required}' is missing in at least one element but is required!"
            )
    return True
[docs]
def display(
    data: CrosslinkSpectrumMatch | Crosslink | ParserResult,
    show_additional_information: bool = False,
    return_str: bool = False,
) -> None | str:
    r"""Pretty prints a crosslink-spectrum-match or crosslink or parser_result.

    The display string is always printed, ``return_str`` only controls whether
    it is additionally returned.

    Parameters
    ----------
    data : CrosslinkSpectrumMatch, Crosslink, or ParserResult
        A crosslink-spectrum-match or crosslink or parser_result to display.
    show_additional_information : bool, default = False
        Also display data in the ``additional_information``. Has no effect if ``data``
        is a parser_result.
    return_str : bool, default = False
        If the display string should be returned.

    Returns
    -------
    None, or str
        The display string of the crosslink-spectrum-match, crosslink, or parser_result
        if ``return_str = True`` otherwise None.

    Raises
    ------
    TypeError
        If data is not a crosslink-spectrum-match, crosslink, or parser_result.

    Examples
    --------
    >>> from pyXLMS import parser
    >>> from pyXLMS import transform
    >>> pr = parser.read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1.pdResult",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> transform.display(pr)
    Data Type: parser_result
    Completeness: full
    Identifying Search Engine: MS Annika
    Number of Crosslink-Spectrum-Matches: 826
    Number of Crosslinks: 300

    >>> from pyXLMS import parser
    >>> from pyXLMS import transform
    >>> pr = parser.read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1.pdResult",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> csms = pr["crosslink-spectrum-matches"]
    >>> transform.display(csms[0])
    Data Type: crosslink-spectrum-match
    Completeness: full
    Alpha Peptide: GQKNSR
    Alpha Modifications: {3: ('DSS', 138.06808)}
    Alpha Peptide Crosslink Position: 3
    Alpha Proteins: ['Cas9']
    Alpha Proteins Crosslink Positions: [779]
    Alpha Proteins Peptide Positions: [777]
    Alpha Peptide Score: 119.82548987540834
    Alpha Decoy: False
    Beta Peptide: GQKNSR
    Beta Modifications: {3: ('DSS', 138.06808)}
    Beta Peptide Crosslink Position: 3
    Beta Proteins: ['Cas9']
    Beta Proteins Crosslink Positions: [779]
    Beta Proteins Peptide Positions: [777]
    Beta Peptide Score: 119.82547820493929
    Beta Decoy: False
    Crosslink Type: intra
    CSM Score: 119.82547820493929
    Spectrum File: XLpeplib_Beveridge_QEx-HFX_DSS_R1.raw
    Scan Number: 2257
    Precursor Charge: 3
    Retention Time: 733.1895599999999
    Ion Mobility/FAIMS CV: 0.0

    >>> from pyXLMS import parser
    >>> from pyXLMS import transform
    >>> pr = parser.read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1.pdResult",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> xls = pr["crosslinks"]
    >>> transform.display(xls[0])
    Data Type: crosslink
    Completeness: full
    Alpha Peptide: GQKNSR
    Alpha Peptide Crosslink Position: 3
    Alpha Proteins: ['Cas9']
    Alpha Proteins Crosslink Positions: [779]
    Alpha Decoy: False
    Beta Peptide: GQKNSR
    Beta Peptide Crosslink Position: 3
    Beta Proteins: ['Cas9']
    Beta Proteins Crosslink Positions: [779]
    Beta Decoy: False
    Crosslink Type: intra
    Crosslink Score: 119.82547820493929
    """
    check_input_multi(data, "data", [CrosslinkSpectrumMatch, Crosslink, ParserResult])
    check_input(show_additional_information, "show_additional_information", bool)
    check_input(return_str, "return_str", bool)
    # collect one entry per output line, the entries are joined once at the end
    lines: List[str]
    if isinstance(data, Crosslink):
        lines = [
            f"Data Type: {data['data_type']}",
            f"Completeness: {data['completeness']}",
            f"Alpha Peptide: {data['alpha_peptide']}",
            f"Alpha Peptide Crosslink Position: {data['alpha_peptide_crosslink_position']}",
            f"Alpha Proteins: {data['alpha_proteins']}",
            f"Alpha Proteins Crosslink Positions: {data['alpha_proteins_crosslink_positions']}",
            f"Alpha Decoy: {data['alpha_decoy']}",
            f"Beta Peptide: {data['beta_peptide']}",
            f"Beta Peptide Crosslink Position: {data['beta_peptide_crosslink_position']}",
            f"Beta Proteins: {data['beta_proteins']}",
            f"Beta Proteins Crosslink Positions: {data['beta_proteins_crosslink_positions']}",
            f"Beta Decoy: {data['beta_decoy']}",
            f"Crosslink Type: {data['crosslink_type']}",
            f"Crosslink Score: {data['score']}",
        ]
        if show_additional_information:
            lines.append(f"Additional Information: {data['additional_information']}")
    elif isinstance(data, CrosslinkSpectrumMatch):
        lines = [
            f"Data Type: {data['data_type']}",
            f"Completeness: {data['completeness']}",
            f"Alpha Peptide: {data['alpha_peptide']}",
            f"Alpha Modifications: {data['alpha_modifications']}",
            f"Alpha Peptide Crosslink Position: {data['alpha_peptide_crosslink_position']}",
            f"Alpha Proteins: {data['alpha_proteins']}",
            f"Alpha Proteins Crosslink Positions: {data['alpha_proteins_crosslink_positions']}",
            f"Alpha Proteins Peptide Positions: {data['alpha_proteins_peptide_positions']}",
            f"Alpha Peptide Score: {data['alpha_score']}",
            f"Alpha Decoy: {data['alpha_decoy']}",
            f"Beta Peptide: {data['beta_peptide']}",
            f"Beta Modifications: {data['beta_modifications']}",
            f"Beta Peptide Crosslink Position: {data['beta_peptide_crosslink_position']}",
            f"Beta Proteins: {data['beta_proteins']}",
            f"Beta Proteins Crosslink Positions: {data['beta_proteins_crosslink_positions']}",
            f"Beta Proteins Peptide Positions: {data['beta_proteins_peptide_positions']}",
            f"Beta Peptide Score: {data['beta_score']}",
            f"Beta Decoy: {data['beta_decoy']}",
            f"Crosslink Type: {data['crosslink_type']}",
            f"CSM Score: {data['score']}",
            f"Spectrum File: {data['spectrum_file']}",
            f"Scan Number: {data['scan_nr']}",
            f"Precursor Charge: {data['charge']}",
            f"Retention Time: {data['retention_time']}",
            f"Ion Mobility/FAIMS CV: {data['ion_mobility']}",
        ]
        if show_additional_information:
            lines.append(f"Additional Information: {data['additional_information']}")
    elif isinstance(data, ParserResult):
        csms = data["crosslink-spectrum-matches"]
        xls = data["crosslinks"]
        lines = [
            f"Data Type: {data['data_type']}",
            f"Completeness: {data['completeness']}",
            f"Identifying Search Engine: {data['search_engine']}",
            f"Number of Crosslink-Spectrum-Matches: {len(csms) if csms is not None else None}",
            f"Number of Crosslinks: {len(xls) if xls is not None else None}",
        ]
    else:
        raise TypeError(
            f"Unknown data type {type(data)}. Data type must be CrosslinkSpectrumMatch, Crosslink, or ParserResult!"
        )
    # strip surrounding whitespace of the whole text before printing
    display_str = "\n".join(lines).strip()
    print(display_str)
    return display_str if return_str else None