Coverage for instanovo/utils/msreader.py: 77%
47 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-12-08 07:26 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-12-08 07:26 +0000
1from typing import Any
3from pyteomics import mgf, mzml, mzxml
4from pyteomics.auxiliary import cvquery
7# Unused
def read_mgf(file_path: str) -> dict[str, list[Any]]:
    """Parse an MGF file into a column-oriented data dict.

    Every spectrum yielded by the reader contributes exactly one entry to each
    list of the returned dict, so all lists end up with the same length.

    Args:
        file_path: Path of the MGF file to read.

    Returns:
        The data dict. ``scan_number`` is filled with the spectrum title and
        ``sequence`` with the ``seq`` parameter (``""`` when absent). Precursor
        m/z and charge are the first element of ``pepmass`` / ``charge``
        (``0`` when absent), retention time comes from ``rtinseconds``
        (``0`` when absent), and the peak arrays are stored as returned by
        the reader (``[]`` when absent).
    """
    columns = _initialize_data_dict()

    with mgf.read(file_path, index_by_scans=True) as spectra:
        for entry in spectra:
            # All scalar metadata is read from the "params" sub-dict.
            params = entry.get("params", {})
            pepmass = params.get("pepmass", [0])
            charges = params.get("charge", [0])

            columns["scan_number"].append(params.get("title", ""))
            columns["sequence"].append(params.get("seq", ""))
            columns["precursor_mz"].append(pepmass[0])
            columns["precursor_charge"].append(charges[0])
            columns["retention_time"].append(params.get("rtinseconds", 0))
            columns["mz_array"].append(entry.get("m/z array", []))
            columns["intensity_array"].append(entry.get("intensity array", []))

    return columns
def read_mzml(
    file_path: str,
) -> dict[str, list[Any]]:
    """Read an mzML file and return a data dict of its MS2 spectra.

    Each spectrum is flattened with ``cvquery`` so that fields can be looked up
    by controlled-vocabulary accession. Spectra whose MS level is not 2 are
    skipped; every kept spectrum contributes one entry to each list.

    Args:
        file_path: Path of the mzML file to read.

    Returns:
        The data dict. Missing values default to ``""`` (scan id, sequence),
        ``0`` (precursor m/z and charge), ``None`` (retention time) and ``[]``
        (m/z and intensity arrays).
    """
    data = _initialize_data_dict()

    # Accessions used to look each field up in the flattened spectrum.
    ms_vocab = {
        "ms_level": "MS:1000511",
        "sequence": "MS:1000889",
        "precursor_mz": ["MS:1000040", "MS:1000827", "MS:1000744"],
        "precursor_charge": "MS:1000041",
        "retention_time": "MS:1000016",
        "mz_array": "MS:1000514",
        "intensity_array": "MS:1000515",
    }

    with mzml.read(file_path) as reader:
        for spectrum in reader:
            spectrum_dict = cvquery(spectrum)
            if spectrum_dict.get(ms_vocab["ms_level"]) != 2:
                continue  # Only MS2 spectra are collected.

            data["scan_number"].append(spectrum.get("id", ""))
            data["sequence"].append(spectrum_dict.get(ms_vocab["sequence"], ""))

            # Take the first precursor m/z term that is present. When none is,
            # the "" sentinel makes the lookup below return the default 0.
            pre_mz_key = next(
                (key for key in ms_vocab["precursor_mz"] if key in spectrum_dict),
                "",
            )
            data["precursor_mz"].append(spectrum_dict.get(pre_mz_key, 0))
            data["precursor_charge"].append(spectrum_dict.get(ms_vocab["precursor_charge"], 0))
            data["retention_time"].append(spectrum_dict.get(ms_vocab["retention_time"]))

            # Default to an empty tuple so that a spectrum without a peak array
            # yields [] instead of failing with ``list(None)``.
            data["mz_array"].append(list(spectrum_dict.get(ms_vocab["mz_array"], ())))
            data["intensity_array"].append(
                list(spectrum_dict.get(ms_vocab["intensity_array"], ()))
            )

    return data
def read_mzxml(file_path: str) -> dict[str, list[Any]]:
    """Read an mzXML file and return a data dict of its MS2 spectra.

    Spectra whose ``msLevel`` is not 2 are skipped; every kept spectrum
    contributes one entry to each list of the returned dict.

    Args:
        file_path: Path of the mzXML file to read.

    Returns:
        The data dict. Missing values default to ``""`` (scan number,
        sequence), ``0`` (precursor m/z and charge), ``None`` (retention time)
        and ``[]`` (m/z and intensity arrays).
    """
    data = _initialize_data_dict()

    with mzxml.read(file_path) as reader:
        for spectrum in reader:
            if spectrum.get("msLevel", 0) != 2:
                continue  # Only MS2 spectra are collected.

            data["scan_number"].append(spectrum.get("num", ""))
            data["sequence"].append(spectrum.get("peptide", ""))

            # Only the first precursor entry is used. ``or`` also covers a
            # present-but-empty precursor list, which would otherwise raise
            # IndexError.
            precursor = (spectrum.get("precursorMz") or [{}])[0]
            data["precursor_mz"].append(precursor.get("precursorMz", 0))
            data["precursor_charge"].append(precursor.get("precursorCharge", 0))
            data["retention_time"].append(spectrum.get("retentionTime"))

            # Default to an empty tuple so that a scan without peak arrays
            # yields [] instead of failing with ``list(None)``.
            data["mz_array"].append(list(spectrum.get("m/z array", ())))
            data["intensity_array"].append(list(spectrum.get("intensity array", ())))

    return data
def _initialize_data_dict() -> dict[str, list[Any]]:
    """Create a fresh data dict holding one empty list per spectrum field."""
    field_names = (
        "scan_number",
        "sequence",
        "precursor_mz",
        "precursor_charge",
        "retention_time",
        "mz_array",
        "intensity_array",
    )
    # The comprehension evaluates ``[]`` per key, so no list is shared.
    return {name: [] for name in field_names}