# Source code for geometricus.geometricus

from __future__ import annotations
from pathlib import Path
from tqdm import tqdm
from typing import List, Tuple, Dict, Set, Union, Generator, Optional
from collections import defaultdict
from dataclasses import dataclass

import numpy as np
import numba as nb
from geometricus.model_utility import ShapemerLearn
from geometricus.moment_invariants import MultipleMomentInvariants, SplitInfo, get_invariants_for_structures
from geometricus.protein_utility import ProteinKey

# Type of a single shapemer; values of this type are used as dictionary keys
# throughout this module.
# NOTE(review): the alias admits bytes or tuple, while the description below
# speaks of integers — confirm which wording matches the current shapemer producers.
Shapemer = Union[bytes, tuple]
"""
An integer (in the case of model) or a list of integers for each moment (the old way)
"""
# A sequence of shapemers, e.g. one entry per residue of a protein.
Shapemers = List[Shapemer]
"""
A list of Shapemer types
"""


@dataclass
class Geometricus:
    """
    Class for storing embedding information

    Holds, for a set of proteins, the shapemer assigned to each residue together
    with lookup tables in both directions (protein -> shapemers and
    shapemer -> (protein, residue index)).
    Instances are normally created with `from_protein_files` or `from_invariants`.
    """
    protein_keys: List[ProteinKey]
    """
    List of protein names = rows of the output embedding
    """
    shapemer_to_protein_indices: Dict[Shapemer, List[Tuple[ProteinKey, int]]]
    """
    Maps each shapemer to the proteins which have it and to the corresponding residue indices within these proteins
    """
    proteins_to_shapemers: Dict[ProteinKey, Shapemers]
    """
    Maps each protein to a list of shapemers in order of its residues
    """
    shapemer_keys: Shapemers
    """
    List of shapemers found
    """
    proteins_to_shapemer_residue_indices: Dict[ProteinKey, Shapemers]
    """
    Maps each protein to a set of residue indices covered by the current residue's shapemer in order of its residues
    """
    resolution: Optional[Union[float, np.ndarray]] = None
    """
    Multiplier that determines how coarse/fine-grained each shape is.
    This can be a single number, multiplied to all four moment invariants
    or a numpy array of four numbers, one for each invariant
    (This is for the old way of binning shapemers)
    """

    @classmethod
    def from_protein_files(cls,
                           input_files: Union[Path, str, List[str]],
                           model: ShapemerLearn = None,
                           split_infos: List[SplitInfo] = None,
                           moment_types: List[str] = None,
                           resolution: Union[float, np.ndarray] = None,
                           n_threads: int = 1,
                           verbose: bool = True):
        """
        Creates a Geometricus object from protein structure files

        Parameters
        ----------
        input_files
            Can be \n
            A list of structure files (.pdb, .pdb.gz, .cif, .cif.gz),
            A list of (structure_file, chain)
            A list of PDBIDs or PDBID_chain or (PDB ID, chain)
            A folder with input structure files,
            A file which lists structure filenames or "structure_filename, chain" on each line,
            A file which lists PDBIDs or PDBID_chain or PDBID, chain on each line
        model
            trained ShapemerLearn model
            if this is not None, shapemers are generated using the trained model
            and resolution is not used to bin them
        split_infos
            List of SplitInfo objects, forwarded to the invariant computation
        moment_types
            List of moment types to use, forwarded to the invariant computation
        resolution
            Multiplier that determines how coarse/fine-grained each shape is.
            This can be a single number, multiplied to all four moment invariants
            or a numpy array of four numbers, one for each invariant
            (This is for the old way of binning shapemers)
        n_threads
            Number of threads to use
        verbose
            Whether to print progress

        Returns
        -------
        Geometricus object

        Raises
        ------
        AssertionError
            if neither model nor resolution is given
        """
        # Fail fast: the same check lives in from_invariants, but running it here
        # avoids computing invariants for every structure before rejecting the call.
        assert model is not None or resolution is not None, "Must provide either a model or resolution"
        # The second returned value (per-structure errors) is not used here.
        invariants, _errors = get_invariants_for_structures(input_files,
                                                            split_infos=split_infos,
                                                            moment_types=moment_types,
                                                            n_threads=n_threads,
                                                            verbose=verbose)
        return cls.from_invariants(
            invariants,
            model=model, resolution=resolution, verbose=verbose)

    @classmethod
    def from_invariants(
            cls,
            invariants: Union[Generator[MultipleMomentInvariants], List[MultipleMomentInvariants]],
            protein_keys: Optional[List[ProteinKey]] = None,
            model: Optional[ShapemerLearn] = None,
            resolution: Optional[Union[float, np.ndarray]] = None,
            verbose: bool = True,
    ):
        """
        Make a Geometricus object from MultipleMomentInvariants objects

        Parameters
        ----------
        invariants
            List (or generator) of MultipleMomentInvariants objects
        protein_keys
            list of protein names = rows of the output embedding.
            if None, takes all keys in `invariants`
        model
            if given, uses this model to make the shapemers
        resolution
            multiplier that determines how coarse/fine-grained each shape is
            this can be a single number, multiplied to all four moment invariants
            or a numpy array of four numbers, one for each invariant
            (This is for the old way of binning shapemers)
        verbose
            Whether to show a progress bar while shapemers are generated

        Returns
        -------
        Geometricus object

        Raises
        ------
        AssertionError
            if neither model nor resolution is given, if an array-valued resolution
            does not have one entry per moment, or if a requested protein key has
            no matching entry in `invariants`
        """
        assert model is not None or resolution is not None, "Must provide either a model or resolution"
        # Materialise once: a generator can neither be indexed for the shape check
        # below nor iterated a second time afterwards.
        invariants = list(invariants)
        if isinstance(resolution, np.ndarray) and invariants:
            assert resolution.shape[0] == invariants[0].invariants[0].moments.shape[1], \
                "resolution array must have one entry per moment"
        invariants_by_name: Dict[ProteinKey, MultipleMomentInvariants] = {
            x.name: x for x in invariants
        }
        if protein_keys is None:
            protein_keys = list(invariants_by_name)
        assert all(k in invariants_by_name for k in protein_keys), \
            "every protein key must have a matching entry in invariants"

        # Only wrap in a progress bar when asked to report progress.
        keys_with_progress = tqdm(protein_keys, total=len(protein_keys)) if verbose else protein_keys
        if model is None:
            proteins_to_shapemers = {k: invariants_by_name[k].get_shapemers_binned(resolution)
                                     for k in keys_with_progress}
        else:
            proteins_to_shapemers = {k: invariants_by_name[k].get_shapemers_model(model)
                                     for k in keys_with_progress}

        proteins_to_shapemer_residue_indices = {k: invariants_by_name[k].get_neighbors() for k in protein_keys}
        geometricus_class = cls(
            proteins_to_shapemers=proteins_to_shapemers,
            protein_keys=protein_keys,
            resolution=resolution,
            proteins_to_shapemer_residue_indices=proteins_to_shapemer_residue_indices,
            shapemer_keys=[],
            shapemer_to_protein_indices={},
        )
        # The reverse lookup and the sorted key list are derived from the
        # per-protein shapemers, so they are filled in after construction.
        geometricus_class.shapemer_to_protein_indices = geometricus_class.map_shapemers_to_indices()
        geometricus_class.shapemer_keys = sorted(geometricus_class.shapemer_to_protein_indices)
        return geometricus_class

    def map_shapemers_to_indices(self, protein_keys=None):
        """
        Maps each shapemer to the proteins which have it and to the corresponding residue indices within these proteins
        Maps shapemer to (protein_key, residue_index)

        Parameters
        ----------
        protein_keys
            proteins to include; if None, uses `self.protein_keys`

        Returns
        -------
        defaultdict mapping shapemer to a list of (protein_key, residue_index),
        in the order the proteins and their residues are visited
        """
        if protein_keys is None:
            protein_keys = self.protein_keys
        shapemer_to_protein_indices: Dict[
            Shapemer, List[Tuple[ProteinKey, int]]
        ] = defaultdict(list)
        for key in protein_keys:
            for j, shapemer in enumerate(self.proteins_to_shapemers[key]):
                shapemer_to_protein_indices[shapemer].append((key, j))
        return shapemer_to_protein_indices

    def map_protein_to_shapemer_indices(self, protein_keys=None, shapemer_keys=None):
        """
        Maps each protein to a list of shapemer indices where the index corresponds to the shapemer in shapemer_keys
        in order of its residues

        Parameters
        ----------
        protein_keys
            proteins to include; if None, uses `self.protein_keys`
        shapemer_keys
            shapemers defining the index space. If None, uses `self.shapemer_keys`
            when `protein_keys` is also None, otherwise the sorted shapemers found
            in the given proteins. Shapemers of a protein which are not in
            `shapemer_keys` are skipped.

        Returns
        -------
        (dict mapping protein key to an integer numpy array of shapemer indices,
        the shapemer_keys that were used)
        """
        if protein_keys is None:
            protein_keys = self.protein_keys
            if shapemer_keys is None:
                shapemer_keys = self.shapemer_keys
        elif shapemer_keys is None:
            # Only the distinct shapemers are needed here, not their positions.
            shapemer_keys = sorted({shapemer
                                    for key in protein_keys
                                    for shapemer in self.proteins_to_shapemers[key]})
        shapemer_index = {k: i for i, k in enumerate(shapemer_keys)}
        protein_to_indices = {
            k: np.array([shapemer_index[x] for x in self.proteins_to_shapemers[k] if x in shapemer_index],
                        dtype=int)
            for k in protein_keys
        }
        return protein_to_indices, shapemer_keys

    def map_shapemer_to_residues(
            self, shapemer: Shapemer
    ) -> Dict[ProteinKey, Set[int]]:
        """
        Gets residue indices within a particular shapemer across all proteins.

        Parameters
        ----------
        shapemer
            the shapemer to look up

        Returns
        -------
        defaultdict mapping each protein containing the shapemer to the set of
        residue indices covered by it; empty if the shapemer is not present
        """
        protein_to_shapemer_residues: Dict[ProteinKey, Set[int]] = defaultdict(set)
        # .get instead of [] so that a query for an unknown shapemer does not add
        # an empty entry to the (defaultdict) lookup table.
        for protein_key, residue_index in self.shapemer_to_protein_indices.get(shapemer, ()):
            shapemer_residues = self.proteins_to_shapemer_residue_indices[protein_key][residue_index]
            protein_to_shapemer_residues[protein_key].update(shapemer_residues)

        return protein_to_shapemer_residues

    def get_count_matrix(self, protein_keys=None, shapemer_keys=None):
        """
        Counts how often each shapemer occurs in each protein.

        Parameters
        ----------
        protein_keys
            proteins to include = rows of the matrix; if None, uses `self.protein_keys`
        shapemer_keys
            shapemers to count = columns of the matrix; resolved as in
            `map_protein_to_shapemer_indices` when None

        Returns
        -------
        numpy array of shape (number of proteins, number of shapemers)
        """
        if protein_keys is None:
            protein_keys = self.protein_keys
        proteins_to_shapemer_indices, shapemer_keys = self.map_protein_to_shapemer_indices(protein_keys, shapemer_keys)
        if len(protein_keys) == 0:
            # An empty Python list gives the jitted helper no element type to
            # work with, so build the empty result directly.
            return np.zeros((0, len(shapemer_keys)))
        return make_count_matrix([proteins_to_shapemer_indices[k] for k in protein_keys],
                                 len(shapemer_keys))


@nb.njit(parallel=True)
def make_count_matrix(residues_list, alphabet_size: int):
    """
    Builds a count matrix from per-row lists of column indices.

    Parameters
    ----------
    residues_list
        one integer index sequence per row; every index must be smaller than
        `alphabet_size`
    alphabet_size
        number of columns of the output

    Returns
    -------
    numpy array of shape (len(residues_list), alphabet_size) where entry (i, k)
    is the number of times k occurs in residues_list[i]
    """
    out = np.zeros((len(residues_list), alphabet_size))
    # Each iteration of the parallel loop writes only to its own row of `out`.
    for i in nb.prange(len(residues_list)):
        for j in range(len(residues_list[i])):
            out[i, residues_list[i][j]] += 1
    return out
# Navigation

# Related Topics

# Source code for geometricus.geometricus