Coverage for instanovo/constants.py: 100%

43 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-12-08 07:26 +0000

1from __future__ import annotations 

2 

3from enum import Enum 

4 

5import polars as pl 

6 

# CLI constants

DEFAULT_TRAIN_CONFIG_PATH = "../configs"
DEFAULT_INFERENCE_CONFIG_PATH = "../configs/inference"
DEFAULT_INFERENCE_CONFIG_NAME = "default"

# Logging

USE_RICH_HANDLER = True
LOGGING_SHOW_PATH = False
LOGGING_SHOW_TIME = True

# Constants
# NOTE(review): the mass values below are presumably expressed in Da / amu
# (PROTON_MASS_AMU is named that way) -- confirm units against the callers.

H2O_MASS = 18.0106
CARBON_MASS_DELTA = 1.00335
PROTON_MASS_AMU = 1.007276
MASS_SCALE = 10000
MAX_MASS = 4000.0

MAX_SEQUENCE_LENGTH = 200

# Buffer size for shuffling the training dataset
SHUFFLE_BUFFER_SIZE = 100_000

31 

32 

class PrecursorDimension(Enum):
    """Names corresponding to indices in the precursor tensor.

    Each member's value is the integer index of that quantity.
    """

    PRECURSOR_MASS = 0
    PRECURSOR_CHARGE = 1
    PRECURSOR_MZ = 2

39 

40 

class SpecialTokens(Enum):
    """Special tokens used by the ResidueSet and model.

    Each member's value is the literal token string.
    """

    PAD_TOKEN = "[PAD]"  # Padding token
    EOS_TOKEN = "[EOS]"  # End of sequence
    SOS_TOKEN = "[SOS]"  # Start of sequence

47 

48 

# NOTE(review): equals the number of PrecursorDimension members; presumably
# the size of the precursor tensor's feature axis -- confirm against callers.
PRECURSOR_DIM = 3

DIFFUSION_BASE_STEPS = 20
DIFFUSION_START_STEP = 15
DIFFUSION_EVAL_STEPS = (3, 8, 13, 18)

54 

55 

56# Data handler constants 

57 

58 

class MSColumns(Enum):
    """Column names used by SpectrumDataFrame.

    Each member's value is the column name as a string.
    """

    MZ_ARRAY = "mz_array"
    INTENSITY_ARRAY = "intensity_array"
    PRECURSOR_MZ = "precursor_mz"
    PRECURSOR_CHARGE = "precursor_charge"
    PRECURSOR_MASS = "precursor_mass"
    RETENTION_TIME = "retention_time"

68 

69 

# Name of the column holding sequence annotations.
ANNOTATED_COLUMN = "sequence"

71 

# Polars data type associated with each MSColumns member.
MS_TYPES: dict[MSColumns, pl.DataType] = {
    MSColumns.MZ_ARRAY: pl.List(pl.Float64),
    MSColumns.INTENSITY_ARRAY: pl.List(pl.Float64),
    MSColumns.PRECURSOR_MZ: pl.Float64,
    MSColumns.PRECURSOR_CHARGE: pl.Int64,
    MSColumns.PRECURSOR_MASS: pl.Float64,
    MSColumns.RETENTION_TIME: pl.Float64,
}

80 

# String column names for every MSColumns member except RETENTION_TIME.
DATASETS_COLUMNS = [
    MSColumns.MZ_ARRAY.value,
    MSColumns.INTENSITY_ARRAY.value,
    MSColumns.PRECURSOR_MZ.value,
    MSColumns.PRECURSOR_CHARGE.value,
    MSColumns.PRECURSOR_MASS.value,
]

88 

# Error message text; split across lines only to keep the source line short.
ANNOTATION_ERROR = (
    "Attempting to load annotated dataset, "
    "but some or all sequence annotations are missing."
)

90 

# Maps legacy modification notations (e.g. "M(ox)", "C(+57.02)") to their
# bracketed UNIMOD equivalents. Several legacy spellings map to the same
# UNIMOD string.
LEGACY_PTM_TO_UNIMOD: dict[str, str] = {
    "M(ox)": "M[UNIMOD:35]",
    "S(p)": "S[UNIMOD:21]",
    "T(p)": "T[UNIMOD:21]",
    "Y(p)": "Y[UNIMOD:21]",
    "Q(+0.98)": "Q[UNIMOD:7]",
    "N(+0.98)": "N[UNIMOD:7]",
    "M(+15.99)": "M[UNIMOD:35]",
    "C(+57.02)": "C[UNIMOD:4]",
    "S(+79.97)": "S[UNIMOD:21]",
    "T(+79.97)": "T[UNIMOD:21]",
    "Y(+79.97)": "Y[UNIMOD:21]",
    "Q(+.98)": "Q[UNIMOD:7]",
    "N(+.98)": "N[UNIMOD:7]",
    "(+42.01)": "[UNIMOD:1]",
    "(+43.01)": "[UNIMOD:5]",
    "(-17.03)": "[UNIMOD:385]",
}

109 

# Required output columns
PREDICTION_COLUMNS = [
    "prediction_id",
    "predictions",
    "targets",
    "prediction_log_probability",
    "prediction_token_log_probabilities",
    "group",
]

119 

# Column names used for refinement inputs.
REFINEMENT_COLUMN = "input_predictions"
REFINEMENT_PROBABILITY_COLUMN = "input_log_probabilities"