Skip to content

Residues

ResidueSet(residue_masses)

A class for managing sets of residues.

Source code in instanovo/utils/residues.py
def __init__(self, residue_masses: dict[str, float]) -> None:
    """Build the residue vocabulary and its lookup tables.

    Args:
        residue_masses (dict[str, float]):
            Mapping from each residue string to its mass. The order of
            its keys fixes the index assigned to every residue, and it
            must contain the key `"$"`, otherwise a `KeyError` is raised.
    """
    self.residue_masses = residue_masses
    # A residue's index is its position in the key order of the mapping.
    ordered_residues = list(self.residue_masses.keys())
    self.residue_to_index = dict(zip(ordered_residues, range(len(ordered_residues))))
    self.index_to_residue = ordered_residues
    # Zero-width pattern: matches every position that has a preceding
    # character and is immediately followed by an uppercase ASCII letter.
    self.tokenizer_regex = r"(?<=.)(?=[A-Z])"
    # "$" provides the end-of-sequence index, which is reused for padding.
    end_of_sequence = self.residue_to_index["$"]
    self.eos_index = end_of_sequence
    self.pad_index = end_of_sequence

decode(sequence)

Map a sequence of indices to the corresponding sequence of residues.

Parameters:

Name Type Description Default
sequence list[int]

The sequence of residue indices.

required

Returns:

Type Description
list[str]

list[str]: The corresponding sequence of residue strings.

Source code in instanovo/utils/residues.py
def decode(self, sequence: list[int]) -> list[str]:
    """Translate residue indices back into their residue strings.

    Args:
        sequence (list[int]): The residue indices to look up.

    Returns:
        list[str]: The residue string stored at each index, in input order.
    """
    lookup = self.index_to_residue.__getitem__
    return list(map(lookup, sequence))

detokenize(sequence)

Join a list of residues into a string representing the peptide.

Parameters:

Name Type Description Default
sequence list[str]

The sequence of residues.

required

Returns:

Name Type Description
str str

The string representing the peptide.

Source code in instanovo/utils/residues.py
def detokenize(self, sequence: list[str]) -> str:
    """Concatenate residues into a single peptide string.

    Args:
        sequence (list[str]):
            The residues to concatenate, in order.

    Returns:
        str:
            The residues joined together with no separator.
    """
    separator = ""
    peptide = separator.join(sequence)
    return peptide

encode(sequence, pad_length=None)

Map a sequence of residues to their indices and optionally pad them to a fixed length.

Parameters:

Name Type Description Default
sequence list[str]

The sequence of residues.

required
pad_length int | None

An optional fixed length to pad the encoded sequence to. If this is None, no padding is done.

None

Returns:

Type Description
LongTensor

torch.LongTensor: A tensor with the indices of the residues.

Source code in instanovo/utils/residues.py
def encode(self, sequence: list[str], pad_length: int | None = None) -> torch.LongTensor:
    """Map a sequence of residues to their indices and optionally pad them to a fixed length.

    Args:
        sequence (list[str]):
            The sequence of residues.
        pad_length (int | None, optional):
            An optional fixed length to pad the encoded sequence to.
            If this is `None`, no padding is done.

    Returns:
        torch.LongTensor:
            A tensor with the indices of the residues.
    """
    encoded_list = [self.residue_to_index[residue] for residue in sequence]
    if pad_length:
        encoded_list.extend((pad_length - len(encoded_list)) * [self.pad_index])
    return torch.tensor(encoded_list)

get_mass(residue)

Get the mass of a residue.

Parameters:

Name Type Description Default
residue str

The residue whose mass to fetch. This residue must be in the residue set or this will raise a KeyError.

required

Returns:

Name Type Description
float float

The mass of the residue in Daltons.

Source code in instanovo/utils/residues.py
def get_mass(self, residue: str) -> float:
    """Look up the mass stored for a residue.

    Args:
        residue (str):
            The residue to look up. It must be a key of the residue
            set's mass mapping; an unknown residue raises a `KeyError`.

    Returns:
        float: The mass of the residue in Daltons.
    """
    masses = self.residue_masses
    return masses[residue]

tokenize(sequence)

Split a peptide represented as a string into a list of residues.

Parameters:

Name Type Description Default
sequence str

The peptide to be split.

required

Returns:

Type Description
list[str]

list[str]: The sequence of residues forming the peptide.

Source code in instanovo/utils/residues.py
def tokenize(self, sequence: str) -> list[str]:
    """Break a peptide string into its individual residues.

    The string is split at every position matched by
    `self.tokenizer_regex`.

    Args:
        sequence (str): The peptide string to split.

    Returns:
        list[str]: The residues that make up the peptide, in order.
    """
    splitter = re.compile(self.tokenizer_regex)
    return splitter.split(sequence)