from __future__ import annotations
from pathlib import Path
from tqdm import tqdm
from typing import List, Tuple, Dict, Set, Union, Generator, Optional
from collections import defaultdict
from dataclasses import dataclass
import numpy as np
import numba as nb
from geometricus.model_utility import ShapemerLearn
from geometricus.moment_invariants import MultipleMomentInvariants, SplitInfo, get_invariants_for_structures
from geometricus.protein_utility import ProteinKey
# Type alias for a single shapemer: either a ``bytes`` value or a ``tuple``.
# NOTE(review): the description below speaks of "an integer" / "a list of integers",
# which does not match the ``Union[bytes, tuple]`` alias - confirm which is current.
Shapemer = Union[bytes, tuple]
"""
An integer (in the case of model) or a list of integers for each moment (the old way)
"""
# Type alias for an ordered collection of shapemers (used below for one entry per residue).
Shapemers = List[Shapemer]
"""
A list of Shapemer types
"""
@dataclass
class Geometricus:
    """
    Class for storing embedding information

    Holds, for a set of proteins, the shapemer assigned to each residue and the
    lookup tables derived from that assignment. Instances are normally built with
    :meth:`from_protein_files` or :meth:`from_invariants`.
    """

    protein_keys: List[ProteinKey]
    """
    List of protein names = rows of the output embedding
    """
    shapemer_to_protein_indices: Dict[Shapemer, List[Tuple[ProteinKey, int]]]
    """
    Maps each shapemer to the proteins which have it and to the corresponding residue indices within these proteins
    """
    proteins_to_shapemers: Dict[ProteinKey, Shapemers]
    """
    Maps each protein to a list of shapemers in order of its residues
    """
    shapemer_keys: Shapemers
    """
    List of shapemers found
    """
    # NOTE(review): the values are indexed per residue and then iterated as residue
    # indices in `map_shapemer_to_residues`, so `Shapemers` looks like the wrong
    # annotation here - confirm against `MultipleMomentInvariants.get_neighbors`.
    proteins_to_shapemer_residue_indices: Dict[ProteinKey, Shapemers]
    """
    Maps each protein to a set of residue indices covered by the current residue's shapemer in order of its residues
    """
    resolution: Optional[Union[float, np.ndarray]] = None
    """
    Multiplier that determines how coarse/fine-grained each shape is.
    This can be a single number, multiplied to all four moment invariants
    or a numpy array of four numbers, one for each invariant
    (This is for the old way of binning shapemers)
    """

    @classmethod
    def from_protein_files(cls,
                           input_files: Union[Path, str, List[str]],
                           model: ShapemerLearn = None,
                           split_infos: List[SplitInfo] = None,
                           moment_types: List[str] = None,
                           resolution: Union[float, np.ndarray] = None,
                           n_threads: int = 1,
                           verbose: bool = True) -> Geometricus:
        """
        Creates a Geometricus object from protein structure files

        Parameters
        ----------
        input_files
            Can be

            - A list of structure files (.pdb, .pdb.gz, .cif, .cif.gz),
            - A list of (structure_file, chain)
            - A list of PDBIDs or PDBID_chain or (PDB ID, chain)
            - A folder with input structure files,
            - A file which lists structure filenames or "structure_filename, chain" on each line,
            - A file which lists PDBIDs or PDBID_chain or PDBID, chain on each line
        model
            trained ShapemerLearn model
            if this is not None, shapemers are generated using the trained model
            instead of by binning with `resolution`
        split_infos
            List of SplitInfo objects (forwarded to `get_invariants_for_structures`)
        moment_types
            List of moment types to use (forwarded to `get_invariants_for_structures`)
        resolution
            Multiplier that determines how coarse/fine-grained each shape is.
            This can be a single number, multiplied to all four moment invariants
            or a numpy array of four numbers, one for each invariant
            (This is for the old way of binning shapemers)
        n_threads
            Number of threads to use
        verbose
            Whether to print progress

        Returns
        -------
        Geometricus object
        """
        # The second return value (errors) is not surfaced to the caller here.
        invariants, _errors = get_invariants_for_structures(input_files,
                                                            split_infos=split_infos,
                                                            moment_types=moment_types,
                                                            n_threads=n_threads,
                                                            verbose=verbose)
        return cls.from_invariants(invariants,
                                   model=model,
                                   resolution=resolution,
                                   verbose=verbose)

    @classmethod
    def from_invariants(
            cls,
            invariants: Union[Generator[MultipleMomentInvariants, None, None], List[MultipleMomentInvariants]],
            protein_keys: Optional[List[ProteinKey]] = None,
            model: Optional[ShapemerLearn] = None,
            resolution: Optional[Union[float, np.ndarray]] = None,
            verbose: bool = True,
    ) -> Geometricus:
        """
        Make a Geometricus object from MultipleMomentInvariants objects

        Parameters
        ----------
        invariants
            List (or generator) of MultipleMomentInvariants objects
        protein_keys
            list of protein names = rows of the output embedding.
            if None, takes all keys in `invariants`
        model
            if given, uses this model to make the shapemers
        resolution
            multiplier that determines how coarse/fine-grained each shape is
            this can be a single number, multiplied to all four moment invariants
            or a numpy array of four numbers, one for each invariant
            (This is for the old way of binning shapemers)
        verbose
            Whether to show a progress bar while shapemers are computed

        Returns
        -------
        Geometricus object

        Raises
        ------
        AssertionError
            if neither `model` nor `resolution` is given, if an array `resolution`
            does not match the number of moments, or if a requested protein key
            is missing from `invariants`
        """
        assert model is not None or resolution is not None, "Must provide either a model or resolution"
        # Materialise once: a generator can neither be indexed for the shape check
        # below nor be iterated a second time to build the lookup.
        invariants = list(invariants)
        if isinstance(resolution, np.ndarray) and invariants:
            n_moments = invariants[0].invariants[0].moments.shape[1]
            assert resolution.shape[0] == n_moments, \
                "resolution must have one value per moment invariant"
        invariants_by_key: Dict[ProteinKey, MultipleMomentInvariants] = {
            x.name: x for x in invariants
        }
        if protein_keys is None:
            protein_keys = list(invariants_by_key)
        assert all(k in invariants_by_key for k in protein_keys), \
            "every protein key must be present in invariants"

        keys_with_progress = tqdm(protein_keys, total=len(protein_keys)) if verbose else protein_keys
        if model is None:
            proteins_to_shapemers = {k: invariants_by_key[k].get_shapemers_binned(resolution)
                                     for k in keys_with_progress}
        else:
            proteins_to_shapemers = {k: invariants_by_key[k].get_shapemers_model(model)
                                     for k in keys_with_progress}
        proteins_to_shapemer_residue_indices = {k: invariants_by_key[k].get_neighbors()
                                                for k in protein_keys}

        # The two lookup tables are derived from the instance itself, so start
        # with empty placeholders and fill them in afterwards.
        geometricus_class = cls(
            proteins_to_shapemers=proteins_to_shapemers,
            protein_keys=protein_keys,
            resolution=resolution,
            proteins_to_shapemer_residue_indices=proteins_to_shapemer_residue_indices,
            shapemer_keys=[],
            shapemer_to_protein_indices={},
        )
        geometricus_class.shapemer_to_protein_indices = geometricus_class.map_shapemers_to_indices()
        geometricus_class.shapemer_keys = sorted(geometricus_class.shapemer_to_protein_indices)
        return geometricus_class

    def map_shapemers_to_indices(self, protein_keys=None):
        """
        Maps each shapemer to the proteins which have it and to the corresponding residue indices within these proteins

        Parameters
        ----------
        protein_keys
            proteins to include; if None, uses `self.protein_keys`

        Returns
        -------
        defaultdict mapping shapemer to a list of (protein_key, residue_index)
        """
        if protein_keys is None:
            protein_keys = self.protein_keys
        shapemer_to_protein_indices: Dict[
            Shapemer, List[Tuple[ProteinKey, int]]
        ] = defaultdict(list)
        for key in protein_keys:
            for j, shapemer in enumerate(self.proteins_to_shapemers[key]):
                shapemer_to_protein_indices[shapemer].append((key, j))
        return shapemer_to_protein_indices

    def map_protein_to_shapemer_indices(self, protein_keys=None, shapemer_keys=None):
        """
        Maps each protein to a list of shapemer indices where the index corresponds to the shapemer in shapemer_keys
        in order of its residues

        Parameters
        ----------
        protein_keys
            proteins to include; if None, uses `self.protein_keys`
        shapemer_keys
            shapemers defining the index space. If None, uses `self.shapemer_keys`
            when `protein_keys` is also None, otherwise the sorted shapemers found
            in the given proteins

        Returns
        -------
        (dict of protein_key -> integer numpy array of shapemer indices, shapemer_keys used)
        Residues whose shapemer is not in `shapemer_keys` are omitted from the array.
        """
        if protein_keys is None:
            protein_keys = self.protein_keys
            if shapemer_keys is None:
                shapemer_keys = self.shapemer_keys
        elif shapemer_keys is None:
            # A protein subset gets an index space restricted to its own shapemers.
            shapemer_keys = sorted(self.map_shapemers_to_indices(protein_keys))
        shapemer_index = {k: i for i, k in enumerate(shapemer_keys)}
        protein_to_indices = {
            k: np.array([shapemer_index[x] for x in self.proteins_to_shapemers[k] if x in shapemer_index],
                        dtype=int)
            for k in protein_keys
        }
        return protein_to_indices, shapemer_keys

    def map_shapemer_to_residues(
            self, shapemer: Shapemer
    ) -> Dict[ProteinKey, Set[int]]:
        """
        Gets residue indices within a particular shapemer across all proteins.

        Returns
        -------
        defaultdict mapping each protein containing `shapemer` to the set of
        residue indices covered by its occurrences
        """
        protein_to_shapemer_residues: Dict[ProteinKey, Set[int]] = defaultdict(set)
        for protein_key, residue_index in self.shapemer_to_protein_indices[shapemer]:
            shapemer_residues = self.proteins_to_shapemer_residue_indices[protein_key][residue_index]
            protein_to_shapemer_residues[protein_key].update(shapemer_residues)
        return protein_to_shapemer_residues

    def get_count_matrix(self, protein_keys=None, shapemer_keys=None):
        """
        Builds the shapemer count matrix: one row per protein, one column per shapemer

        Parameters
        ----------
        protein_keys
            proteins to use as rows; if None, uses `self.protein_keys`
        shapemer_keys
            shapemers to use as columns; if None, the sorted shapemers found in
            the selected proteins

        Returns
        -------
        the matrix returned by `make_count_matrix`, with rows in the order of `protein_keys`
        """
        if protein_keys is None:
            protein_keys = self.protein_keys
        proteins_to_shapemer_indices, shapemer_keys = self.map_protein_to_shapemer_indices(protein_keys, shapemer_keys)
        return make_count_matrix([proteins_to_shapemer_indices[k] for k in protein_keys],
                                 len(shapemer_keys))
@nb.njit(parallel=True)
def make_count_matrix(residues_list, alphabet_size: int):
    """
    Counts how often each index occurs in each entry of `residues_list`

    Parameters
    ----------
    residues_list
        sequence of integer index sequences, one per output row
    alphabet_size
        number of columns in the output

    Returns
    -------
    array of shape (len(residues_list), alphabet_size) where entry [i, k] is the
    number of times k occurs in residues_list[i]
    """
    n_rows = len(residues_list)
    counts = np.zeros((n_rows, alphabet_size))
    # Each iteration only writes to its own row of `counts`.
    for row in nb.prange(n_rows):
        indices = residues_list[row]
        for position in range(len(indices)):
            counts[row, indices[position]] += 1
    return counts