Coverage for instanovo/utils/msreader.py: 77%

47 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-12-08 07:26 +0000

1from typing import Any 

2 

3from pyteomics import mgf, mzml, mzxml 

4from pyteomics.auxiliary import cvquery 

5 

6 

7# Unused 

def read_mgf(file_path: str) -> dict[str, list[Any]]:
    """Parse an MGF file into a dict of parallel per-spectrum lists.

    Args:
        file_path: Location of the MGF file; handed to ``mgf.read``.

    Returns:
        The dict created by ``_initialize_data_dict``, with one value appended
        to each of its lists for every spectrum yielded by the reader.
        Missing fields fall back to ``""`` (title, seq), ``0`` (pepmass,
        charge, rtinseconds) or ``[]`` (peak arrays).
    """
    columns = _initialize_data_dict()

    with mgf.read(file_path, index_by_scans=True) as spectra:
        for entry in spectra:
            # Every header field is read from the "params" sub-dict.
            header = entry.get("params", {})

            row = {
                # The spectrum title is what ends up in the scan_number column.
                "scan_number": header.get("title", ""),
                "sequence": header.get("seq", ""),
                # Only the first element of pepmass / charge is kept.
                "precursor_mz": header.get("pepmass", [0])[0],
                "precursor_charge": header.get("charge", [0])[0],
                "retention_time": header.get("rtinseconds", 0),
                "mz_array": entry.get("m/z array", []),
                "intensity_array": entry.get("intensity array", []),
            }
            for column_name, value in row.items():
                columns[column_name].append(value)

    return columns

23 

24 

def read_mzml(
    file_path: str,
) -> dict[str, list[Any]]:
    """Read an mzml file and return a data dict.

    Each spectrum is flattened with ``cvquery`` and its values are looked up
    by controlled-vocabulary accession. Only spectra whose ms-level term
    equals 2 contribute a row.

    Args:
        file_path: Path of the mzML file; handed to ``mzml.read``.

    Returns:
        The dict created by ``_initialize_data_dict`` with one value appended
        to each list per retained spectrum. Missing terms fall back to ``""``
        (id, sequence), ``0`` (precursor m/z and charge), ``None`` (retention
        time) or ``[]`` (m/z and intensity arrays).
    """
    data = _initialize_data_dict()

    # Accessions used as lookup keys into the cvquery result.
    ms_vocab = {
        "ms_level": "MS:1000511",
        "sequence": "MS:1000889",
        # Candidate accessions for the precursor m/z, tried in this order.
        "precursor_mz": ["MS:1000040", "MS:1000827", "MS:1000744"],
        "precursor_charge": "MS:1000041",
        "retention_time": "MS:1000016",
        "mz_array": "MS:1000514",
        "intensity_array": "MS:1000515",
    }

    with mzml.read(file_path) as reader:
        for spectrum in reader:
            spectrum_dict = cvquery(spectrum)
            if spectrum_dict.get(ms_vocab["ms_level"]) != 2:
                continue  # Ensure it's an MS2 spectrum

            data["scan_number"].append(spectrum.get("id", ""))
            data["sequence"].append(spectrum_dict.get(ms_vocab["sequence"], ""))

            # Find the first matching precursor mz term; "" matches nothing,
            # so the lookup below then falls back to 0.
            pre_mz_key = next(
                (key for key in ms_vocab["precursor_mz"] if key in spectrum_dict),
                "",
            )
            data["precursor_mz"].append(spectrum_dict.get(pre_mz_key, 0))
            data["precursor_charge"].append(spectrum_dict.get(ms_vocab["precursor_charge"], 0))
            data["retention_time"].append(spectrum_dict.get(ms_vocab["retention_time"]))

            # Default to an empty sequence so a spectrum lacking an array term
            # yields [] (as read_mgf does) instead of raising on list(None).
            data["mz_array"].append(list(spectrum_dict.get(ms_vocab["mz_array"], [])))
            data["intensity_array"].append(
                list(spectrum_dict.get(ms_vocab["intensity_array"], []))
            )

    return data

61 

62 

def read_mzxml(file_path: str) -> dict[str, list[Any]]:
    """Read an mzxml file and return a data dict.

    Only spectra whose ``msLevel`` equals 2 contribute a row.

    Args:
        file_path: Path of the mzXML file; handed to ``mzxml.read``.

    Returns:
        The dict created by ``_initialize_data_dict`` with one value appended
        to each list per retained spectrum. Missing fields fall back to ``""``
        (num, peptide), ``0`` (precursor m/z and charge), ``None`` (retention
        time) or ``[]`` (m/z and intensity arrays).
    """
    data = _initialize_data_dict()

    with mzxml.read(file_path) as reader:
        for spectrum in reader:
            if spectrum.get("msLevel", 0) != 2:
                continue  # Ensure it's an MS2 spectrum

            data["scan_number"].append(spectrum.get("num", ""))
            data["sequence"].append(spectrum.get("peptide", ""))

            # Only the first precursor entry is used; an absent key is
            # treated as a single empty entry so the lookups below default.
            precursor = spectrum.get("precursorMz", [{}])[0]
            data["precursor_mz"].append(precursor.get("precursorMz", 0))
            data["precursor_charge"].append(precursor.get("precursorCharge", 0))
            data["retention_time"].append(spectrum.get("retentionTime"))

            # Default to an empty sequence so a scan lacking peak arrays
            # yields [] (as read_mgf does) instead of raising on list(None).
            data["mz_array"].append(list(spectrum.get("m/z array", [])))
            data["intensity_array"].append(list(spectrum.get("intensity array", [])))

    return data

80 

81 

82def _initialize_data_dict() -> dict[str, list[Any]]: 

83 return { 

84 "scan_number": [], 

85 "sequence": [], 

86 "precursor_mz": [], 

87 "precursor_charge": [], 

88 "retention_time": [], 

89 "mz_array": [], 

90 "intensity_array": [], 

91 }