Coverage for instanovo/constants.py: 100%
43 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-12-08 07:26 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-12-08 07:26 +0000
1from __future__ import annotations
3from enum import Enum
5import polars as pl
# CLI constants

# Default configuration locations and config name.
# NOTE(review): the paths are relative; the directory they are resolved
# against is not visible in this module — confirm at the CLI entry point.
DEFAULT_TRAIN_CONFIG_PATH = "../configs"
DEFAULT_INFERENCE_CONFIG_PATH = "../configs/inference"
DEFAULT_INFERENCE_CONFIG_NAME = "default"
# Logging

# Boolean switches for logging output.
# NOTE(review): presumably consumed when the logging handler is configured
# (rich handler on/off, show source path, show timestamp) — confirm at the
# call site, which is not visible in this module.
USE_RICH_HANDLER = True
LOGGING_SHOW_PATH = False
LOGGING_SHOW_TIME = True
# Constants

# NOTE(review): the mass values below are presumably monoisotopic masses in
# Daltons (water, 13C-12C mass difference, proton) — confirm units with the
# code that consumes them.
H2O_MASS = 18.0106
CARBON_MASS_DELTA = 1.00335
PROTON_MASS_AMU = 1.007276
# NOTE(review): how MASS_SCALE is applied is not visible in this module.
MASS_SCALE = 10000
MAX_MASS = 4000.0

MAX_SEQUENCE_LENGTH = 200

# Buffer size for shuffling the training dataset
SHUFFLE_BUFFER_SIZE = 100_000
class PrecursorDimension(Enum):
    """Names corresponding to indices in the precursor tensor.

    Each member's value is the integer index of that quantity.
    """

    PRECURSOR_MASS = 0
    PRECURSOR_CHARGE = 1
    PRECURSOR_MZ = 2
class SpecialTokens(Enum):
    """Special tokens used by the ResidueSet and model.

    Each member's value is the literal token string.
    """

    PAD_TOKEN = "[PAD]"  # Padding token
    EOS_TOKEN = "[EOS]"  # End of sequence
    SOS_TOKEN = "[SOS]"  # Start of sequence
# Matches the number of members in PrecursorDimension (mass, charge, m/z).
# NOTE(review): presumably the size of the precursor tensor's feature
# dimension — keep in sync with PrecursorDimension if members are added.
PRECURSOR_DIM = 3

# NOTE(review): diffusion step settings; how they are consumed is not
# visible in this module.
DIFFUSION_BASE_STEPS = 20
DIFFUSION_START_STEP = 15
DIFFUSION_EVAL_STEPS = (3, 8, 13, 18)
# Data handler constants


class MSColumns(Enum):
    """Column names used by SpectrumDataFrame.

    Each member's value is the column name as a string.
    """

    MZ_ARRAY = "mz_array"
    INTENSITY_ARRAY = "intensity_array"
    PRECURSOR_MZ = "precursor_mz"
    PRECURSOR_CHARGE = "precursor_charge"
    PRECURSOR_MASS = "precursor_mass"
    RETENTION_TIME = "retention_time"
# Name of the column holding sequence annotations.
ANNOTATED_COLUMN = "sequence"

# Polars dtype associated with each MSColumns member.
MS_TYPES: dict[MSColumns, pl.DataType] = {
    MSColumns.MZ_ARRAY: pl.List(pl.Float64),
    MSColumns.INTENSITY_ARRAY: pl.List(pl.Float64),
    MSColumns.PRECURSOR_MZ: pl.Float64,
    MSColumns.PRECURSOR_CHARGE: pl.Int64,
    MSColumns.PRECURSOR_MASS: pl.Float64,
    MSColumns.RETENTION_TIME: pl.Float64,
}

# Column names (plain strings) for every MSColumns member except
# RETENTION_TIME, which is not listed here.
DATASETS_COLUMNS = [
    MSColumns.MZ_ARRAY.value,
    MSColumns.INTENSITY_ARRAY.value,
    MSColumns.PRECURSOR_MZ.value,
    MSColumns.PRECURSOR_CHARGE.value,
    MSColumns.PRECURSOR_MASS.value,
]
# Error message for a dataset that was expected to be annotated but is
# missing sequence annotations. Split across two literals purely for line
# length; the resulting string is unchanged.
ANNOTATION_ERROR = (
    "Attempting to load annotated dataset, "
    "but some or all sequence annotations are missing."
)

# Maps legacy modification notations (named, e.g. "M(ox)", or mass-delta,
# e.g. "C(+57.02)") to the bracketed UNIMOD notation. Several legacy
# spellings map onto the same UNIMOD entry.
# NOTE(review): the keys without a residue letter ("(+42.01)", "(+43.01)",
# "(-17.03)") presumably denote terminal modifications — confirm.
LEGACY_PTM_TO_UNIMOD: dict[str, str] = {
    "M(ox)": "M[UNIMOD:35]",
    "S(p)": "S[UNIMOD:21]",
    "T(p)": "T[UNIMOD:21]",
    "Y(p)": "Y[UNIMOD:21]",
    "Q(+0.98)": "Q[UNIMOD:7]",
    "N(+0.98)": "N[UNIMOD:7]",
    "M(+15.99)": "M[UNIMOD:35]",
    "C(+57.02)": "C[UNIMOD:4]",
    "S(+79.97)": "S[UNIMOD:21]",
    "T(+79.97)": "T[UNIMOD:21]",
    "Y(+79.97)": "Y[UNIMOD:21]",
    "Q(+.98)": "Q[UNIMOD:7]",
    "N(+.98)": "N[UNIMOD:7]",
    "(+42.01)": "[UNIMOD:1]",
    "(+43.01)": "[UNIMOD:5]",
    "(-17.03)": "[UNIMOD:385]",
}
# Required output columns
PREDICTION_COLUMNS = [
    "prediction_id",
    "predictions",
    "targets",
    "prediction_log_probability",
    "prediction_token_log_probabilities",
    "group",
]

# Column names related to refinement inputs.
# NOTE(review): presumably the columns carrying earlier predictions and
# their log-probabilities into a refinement step — confirm with the consumer.
REFINEMENT_COLUMN = "input_predictions"
REFINEMENT_PROBABILITY_COLUMN = "input_log_probabilities"