Residues

`ResidueSet(residue_masses)`

A class for managing sets of residues.

Source code in instanovo/utils/residues.py

def __init__(self, residue_masses: dict[str, float]) -> None:
    """Set up the lookup tables for a residue set.

    Args:
        residue_masses (dict[str, float]):
            Mapping from residue string to its mass. Residue indices
            follow the iteration order of this mapping. The mapping
            must contain the `"$"` residue, otherwise a `KeyError`
            is raised.
    """
    self.residue_masses = residue_masses

    # Position in the list is the residue's index; the dict is the reverse map.
    self.index_to_residue = list(residue_masses)
    self.residue_to_index = {
        residue: position for position, residue in enumerate(self.index_to_residue)
    }

    # Zero-width pattern: matches before every capital letter except at the start.
    self.tokenizer_regex = r"(?<=.)(?=[A-Z])"

    # The "$" residue's index is used for both end-of-sequence and padding.
    self.eos_index = self.residue_to_index["$"]
    self.pad_index = self.eos_index

`decode(sequence)`

Map a sequence of indices to the corresponding sequence of residues.

Parameters:

Name	Type	Description	Default
`sequence`	`list[int]`	The sequence of residue indices.	required

Returns:

Type	Description
`list[str]`	The corresponding sequence of residue strings.

Source code in instanovo/utils/residues.py

def decode(self, sequence: list[int]) -> list[str]:
    """Translate residue indices back into residue strings.

    Args:
        sequence (list[int]): The residue indices to look up.

    Returns:
        list[str]: The residue string for each index, in the same order.
    """
    lookup = self.index_to_residue
    return list(map(lookup.__getitem__, sequence))

`detokenize(sequence)`

Join a list of residues into a string representing the peptide.

Parameters:

Name	Type	Description	Default
`sequence`	`list[str]`	The sequence of residues.	required

Returns:

Name	Type	Description
`str`	`str`	The string representing the peptide.

Source code in instanovo/utils/residues.py

def detokenize(self, sequence: list[str]) -> str:
    """Join a list of residues into a single peptide string.

    Args:
        sequence (list[str]):
            The residues, in order.

    Returns:
        str:
            The residues concatenated with no separator.
    """
    separator = ""
    peptide = separator.join(sequence)
    return peptide

`encode(sequence, pad_length=None)`

Map a sequence of residues to their indices and optionally pad them to a fixed length.

Parameters:

Name	Type	Description	Default
`sequence`	`list[str]`	The sequence of residues.	required
`pad_length`	`int \| None`	An optional fixed length to pad the encoded sequence to. If this is `None`, no padding is done.	`None`

Returns:

Type	Description
`LongTensor`	A tensor with the indices of the residues.

Source code in instanovo/utils/residues.py

def encode(self, sequence: list[str], pad_length: int | None = None) -> torch.LongTensor:
    """Map a sequence of residues to their indices and optionally pad them to a fixed length.

    Args:
        sequence (list[str]):
            The sequence of residues.
        pad_length (int | None, optional):
            An optional fixed length to pad the encoded sequence to.
            If this is `None`, no padding is done.

    Returns:
        torch.LongTensor:
            A tensor with the indices of the residues.
    """
    encoded_list = [self.residue_to_index[residue] for residue in sequence]
    if pad_length:
        encoded_list.extend((pad_length - len(encoded_list)) * [self.pad_index])
    return torch.tensor(encoded_list)

`get_mass(residue)`

Get the mass of a residue.

Parameters:

Name	Type	Description	Default
`residue`	`str`	The residue whose mass to fetch. This residue must be in the residue set or this will raise a `KeyError`.	required

Returns:

Name	Type	Description
`float`	`float`	The mass of the residue in Daltons.

Source code in instanovo/utils/residues.py

def get_mass(self, residue: str) -> float:
    """Look up the mass of a single residue.

    Args:
        residue (str):
            The residue to look up. It must be present in
            the residue set; an unknown residue raises
            a `KeyError`.

    Returns:
        float: The mass of the residue in Daltons.
    """
    masses = self.residue_masses
    return masses[residue]

`tokenize(sequence)`

Split a peptide represented as a string into a list of residues.

Parameters:

Name	Type	Description	Default
`sequence`	`str`	The peptide to be split.	required

Returns:

Type	Description
`list[str]`	The sequence of residues forming the peptide.

Source code in instanovo/utils/residues.py

def tokenize(self, sequence: str) -> list[str]:
    """Break a peptide string into its individual residues.

    The string is split wherever `self.tokenizer_regex` matches.

    Args:
        sequence (str): The peptide string to split.

    Returns:
        list[str]: The residues of the peptide, in order.
    """
    splitter = re.compile(self.tokenizer_regex)
    return splitter.split(sequence)