Skip to content

Dataset

SpectrumDataset(df, s2i, n_peaks=200, min_mz=50.0, max_mz=2500.0, min_intensity=0.01, remove_precursor_tol=2.0, reverse_peptide=True, eos_symbol='</s>', annotated=True, return_str=False)

Bases: Dataset

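Spectrum dataset class supporting .ipc and .csv data, supplied as a pandas or polars DataFrame (a Hugging Face `datasets.Dataset` is also accepted).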

Source code in instanovo/transformer/dataset.py
def __init__(
    self,
    df: pd.DataFrame | pl.DataFrame,
    s2i: dict[str, int],
    n_peaks: int = 200,
    min_mz: float = 50.0,
    max_mz: float = 2500.0,
    min_intensity: float = 0.01,
    remove_precursor_tol: float = 2.0,
    reverse_peptide: bool = True,
    eos_symbol: str = "</s>",
    annotated: bool = True,
    return_str: bool = False,
) -> None:
    """Store the table and settings, resolve the EOS id and detect the table backend.

    Every argument except ``eos_symbol`` is kept unchanged as an attribute of
    the same name. How those settings are applied is decided by code outside
    this method.

    Args:
        df: Source table. Besides the annotated pandas and polars frames, a
            ``datasets.Dataset`` instance is also accepted.
        s2i: String-to-integer lookup table; consulted here only to resolve
            ``eos_symbol``.
        n_peaks: Stored as ``self.n_peaks``.
        min_mz: Stored as ``self.min_mz``.
        max_mz: Stored as ``self.max_mz``.
        min_intensity: Stored as ``self.min_intensity``.
        remove_precursor_tol: Stored as ``self.remove_precursor_tol``.
        reverse_peptide: Stored as ``self.reverse_peptide``.
        eos_symbol: Key looked up in ``s2i`` to set ``self.EOS_ID``.
        annotated: Stored as ``self.annotated``.
        return_str: Stored as ``self.return_str``.

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame, a polars DataFrame or
            a ``datasets.Dataset``.
    """
    super().__init__()
    self.df = df
    self.s2i = s2i
    self.n_peaks = n_peaks
    self.min_mz = min_mz
    self.max_mz = max_mz
    self.remove_precursor_tol = remove_precursor_tol
    self.min_intensity = min_intensity
    self.reverse_peptide = reverse_peptide
    self.annotated = annotated
    self.return_str = return_str

    # -1 is the sentinel id used when the vocabulary has no EOS symbol.
    self.EOS_ID = self.s2i.get(eos_symbol, -1)

    # Tag the backend once so later code can branch on a short string
    # instead of repeating the isinstance checks.
    if isinstance(df, pd.DataFrame):
        self.data_type = "pd"
    elif isinstance(df, pl.DataFrame):
        self.data_type = "pl"
    elif isinstance(df, datasets.Dataset):
        self.data_type = "hf"
    else:
        # TypeError subclasses Exception, so existing broad handlers still work.
        raise TypeError(f"Unsupported data type {type(df)}")

collate_batch(batch)

Collate batch of samples.

Source code in instanovo/transformer/dataset.py
def collate_batch(
    batch: list[tuple[Tensor, float, int, Tensor]]
) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
    """Merge per-sample tuples into padded batch tensors.

    Each sample is ``(spectrum, precursor_mz, precursor_charge, peptide)``.

    Returns:
        A 5-tuple ``(spectra, precursors, spectra_mask, peptides, peptides_mask)``:

        * ``spectra``: the spectra zero-padded along their first dimension to
          the longest one, batch first.
        * ``precursors``: float tensor with one row per sample and the columns
          ``(mass, charge, m/z)``, where
          ``mass = (m/z - PROTON_MASS_AMU) * charge``.
        * ``spectra_mask``: boolean tensor, ``True`` at padded positions.
        * ``peptides`` / ``peptides_mask``: padded the same way as the spectra
          when peptides are tensors. When the first peptide is a ``str`` the
          peptides are returned as the untouched tuple of samples and the mask
          is ``None``.
    """
    spectra, precursor_mzs, precursor_charges, peptides = zip(*batch)

    # Spectra: record true lengths first, then pad and flag everything past them.
    spectrum_lengths = torch.tensor(
        [spectrum.shape[0] for spectrum in spectra], dtype=torch.long
    )
    spectra = nn.utils.rnn.pad_sequence(spectra, batch_first=True)
    spectrum_positions = torch.arange(spectra.shape[1], dtype=torch.long)
    spectra_mask = spectrum_positions.unsqueeze(0) >= spectrum_lengths.unsqueeze(1)

    # Peptides: string targets are passed through; tensor targets are padded.
    peptides_mask = None
    if not isinstance(peptides[0], str):
        peptide_lengths = torch.tensor(
            [peptide.shape[0] for peptide in peptides], dtype=torch.long
        )
        peptides = nn.utils.rnn.pad_sequence(peptides, batch_first=True)
        peptide_positions = torch.arange(peptides.shape[1], dtype=torch.long)
        peptides_mask = peptide_positions.unsqueeze(0) >= peptide_lengths.unsqueeze(1)

    # Precursors: derive the mass from m/z and charge, then stack as columns.
    mz = torch.tensor(precursor_mzs)
    charge = torch.tensor(precursor_charges)
    mass = (mz - PROTON_MASS_AMU) * charge
    precursors = torch.vstack([mass, charge, mz]).T.float()

    return spectra, precursors, spectra_mask, peptides, peptides_mask