Source code for geometricus.protein_utility

from __future__ import annotations

from pathlib import PosixPath, Path

import gzip
import io
from dataclasses import dataclass, field
from typing import Union, Tuple, List

import numpy as np
import numba as nb
import warnings
from Bio import BiopythonDeprecationWarning

warnings.filterwarnings("ignore", category=BiopythonDeprecationWarning)
import prody as pd

ProteinKey = Union[str, Tuple[str, str]]
"""
A protein key is either its PDB ID (str) or a tuple of (PDB ID, chain)
"""


@dataclass(eq=False)
class Structure:
    """
    Class to store basic protein structure information
    """

    name: ProteinKey
    """PDB ID or (PDB ID, chain)"""
    length: int
    """Number of residues"""
    coordinates: np.ndarray = field(repr=False)
    """Coordinates"""
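
# Usage sketch (illustrative only; the key and values below are made up):
# a Structure simply bundles a protein key, a residue count, and an (n, 3)
# coordinate array.
#
#     coords = np.zeros((3, 3))
#     structure = Structure(name=("1abc", "A"), length=3, coordinates=coords)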

def parse_structure_file(input_value: Union[Path, Tuple[Path, str], str]):
    """
    Parse a protein structure file (.pdb, .pdb.gz, .cif, .cif.gz) or PDB ID or PDBID_Chain
    and return a prody AtomGroup object

    Parameters
    ----------
    input_value
        filename or (filename, chain) or PDB ID or PDBID_Chain or (PDB ID, chain)

    Returns
    -------
    prody AtomGroup object
    """
    chain = None
    if type(input_value) == tuple:
        input_value, chain = input_value
    if not Path(input_value).is_file():
        if "_" in input_value:
            pdb_id, chain = input_value.split("_")
        else:
            pdb_id = input_value
        protein = pd.parsePDB(pdb_id, compressed=False, chain=chain)
        if chain is not None:
            protein.setTitle(f"{pdb_id}_{chain}")
        else:
            protein.setTitle(pdb_id)
    else:
        filename = str(input_value)
        if filename.endswith('.pdb') or filename.endswith('.pdb.gz'):
            protein = pd.parsePDB(filename)
            if protein is None:
                with open(filename) as f:
                    protein = pd.parsePDBStream(f)
        elif filename.endswith('.cif'):
            protein = pd.parseMMCIF(filename)
            if protein is None:
                with open(filename) as f:
                    protein = pd.parseMMCIFStream(f)
        elif filename.endswith(".cif.gz"):
            with gzip.open(filename, 'r') as mmcif:
                with io.TextIOWrapper(mmcif, encoding='utf-8') as decoder:
                    protein = pd.parseMMCIFStream(decoder)
        else:
            with open(filename) as f:
                protein = pd.parsePDBStream(f)
        input_value = Path(input_value).name
        if protein is None:
            raise ValueError(f"Could not parse {input_value}")
        if chain is not None:
            protein = protein[chain].toAtomGroup()
            if protein is None:
                raise ValueError(f"Could not parse {input_value} chain {chain}")
            protein.setTitle(f"{input_value}_{chain}")
        else:
            protein.setTitle(input_value)
    return protein
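
# Usage sketch (illustrative only; the file name and PDB ID are assumptions):
#
#     protein = parse_structure_file("protein_1.pdb")         # local file
#     protein = parse_structure_file(("protein_1.pdb", "A"))  # local file, chain A only
#     protein = parse_structure_file("1abc_A")                 # fetched from the PDB, chain A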

def get_structure_files(input_value: Union[Path, str, List[str]]) -> List[Union[str, Tuple[str, str]]]:
    """
    Get a list of structure files or PDB IDs from a string representing:
        a list of structure files (.pdb, .pdb.gz, .cif, .cif.gz),
        a list of (structure_file, chain),
        a list of PDB IDs or PDBID_chain or (PDB ID, chain),
        a folder with input structure files,
        a file which lists structure filenames or "structure_filename, chain" on each line,
        a file which lists PDB IDs or PDBID_chain or "PDB ID, chain" on each line

    Parameters
    ----------
    input_value

    Returns
    -------
    list of structure files or (structure_file, chain) or PDB IDs or (PDB ID, chain)
    """
    if type(input_value) == str or type(input_value) == PosixPath:
        input_value = Path(input_value)
        if input_value.is_dir():
            protein_files = list(input_value.glob("*"))
        elif input_value.is_file():
            with open(input_value) as f:
                protein_files = f.read().strip().split("\n")
        else:
            raise ValueError(f"Could not parse {input_value}")
    else:
        assert type(input_value) == list or type(input_value) == tuple, "Input must be a path or a list"
        protein_files = input_value
    final_protein_files = []
    for protein_file in protein_files:
        if (type(protein_file) == str or type(protein_file) == PosixPath) and Path(protein_file).is_file():
            final_protein_files.append(protein_file)
        elif type(protein_file) == tuple:
            protein_file, chain = protein_file
            final_protein_files.append((protein_file, chain))
        else:
            assert type(protein_file) == str, f"Could not understand input {protein_file}"
            if ", " in protein_file:
                protein_file, chain = protein_file.split(", ")
                final_protein_files.append((protein_file, chain))
            elif "_" in protein_file:
                pdb_id, chain = protein_file.split("_")
                final_protein_files.append((pdb_id, chain))
            else:
                final_protein_files.append(protein_file)
    return list(set(final_protein_files))
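
# Usage sketch (illustrative only; the folder, file, and IDs are assumptions):
#
#     get_structure_files("structures/")   # folder of structure files
#     get_structure_files("inputs.txt")    # file listing one filename or PDB ID per line
#     get_structure_files(["protein_1.pdb", "1abc_A", ("protein_2.pdb", "B")])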

def group_indices(input_list: List[int]) -> List[List[int]]:
    """
    e.g. [1, 1, 1, 2, 2, 3, 3, 3, 4] -> [[0, 1, 2], [3, 4], [5, 6, 7], [8]]
    """
    output_list = []
    current_list = []
    current_index = None
    for i in range(len(input_list)):
        if current_index is None:
            current_index = input_list[i]
        if input_list[i] == current_index:
            current_list.append(i)
        else:
            output_list.append(current_list)
            current_list = [i]
            current_index = input_list[i]
    output_list.append(current_list)
    return output_list

def get_alpha_indices(protein: pd.AtomGroup) -> List[int]:
    """
    Get indices of alpha carbons of pd AtomGroup object
    """
    return [i for i, a in enumerate(protein.iterAtoms()) if a.getName() == "CA"]

def get_beta_indices(protein: pd.AtomGroup) -> List[int]:
    """
    Get indices of beta carbons of pd AtomGroup object
    (If beta carbon doesn't exist, alpha carbon index is returned)
    """
    residue_splits = group_indices(protein.getResindices())
    i = 0
    indices = []
    for split in residue_splits:
        ca = None
        cb = None
        for _ in split:
            if protein[i].getName() == "CB":
                cb = protein[i].getIndex()
            if protein[i].getName() == "CA":
                ca = protein[i].getIndex()
            i += 1
        if cb is not None:
            indices.append(cb)
        else:
            assert ca is not None
            indices.append(ca)
    return indices
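
# Usage sketch (illustrative only; the PDB ID is an assumption): extract
# alpha- and beta-carbon coordinates from a parsed structure.
#
#     protein = parse_structure_file("1abc")
#     ca_coords = protein.getCoords()[get_alpha_indices(protein)]
#     cb_coords = protein.getCoords()[get_beta_indices(protein)]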

def get_sequences_from_fasta_yield(fasta_file: Union[str, Path], comments=("#",)) -> tuple:
    """
    Returns (accession, sequence) iterator

    Parameters
    ----------
    fasta_file
    comments
        ignore lines containing any of these strings

    Returns
    -------
    (accession, sequence)
    """
    with open(fasta_file) as f:
        current_sequence = ""
        current_key = None
        for line in f:
            if not len(line.strip()) or any(comment in line for comment in comments):
                continue
            if ">" in line:
                if current_key is None:
                    current_key = line.split(">")[1].strip()
                else:
                    if current_sequence[-1] == "*":
                        current_sequence = current_sequence[:-1]
                    yield current_key, current_sequence
                    current_sequence = ""
                    current_key = line.split(">")[1].strip()
            else:
                current_sequence += line.strip()
        if current_sequence[-1] == "*":
            current_sequence = current_sequence[:-1]
        yield current_key, current_sequence

def get_sequences_from_fasta(fasta_file: Union[str, Path], comments=("#",)) -> dict:
    """
    Returns dict of accession to sequence from fasta file

    Parameters
    ----------
    fasta_file
    comments
        ignore lines containing any of these strings

    Returns
    -------
    {accession: sequence}
    """
    return {
        key: sequence
        for (key, sequence) in get_sequences_from_fasta_yield(fasta_file, comments=comments)
    }
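
# Usage sketch (illustrative only; "sequences.fasta" is an assumed file):
#
#     sequences = get_sequences_from_fasta("sequences.fasta")
#     for accession, sequence in sequences.items():
#         print(accession, len(sequence))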

@nb.njit
def get_rmsd(coords_1: np.ndarray, coords_2: np.ndarray) -> float:
    """
    RMSD of paired coordinates = normalized square-root of sum of squares of euclidean distances
    """
    return np.sqrt(np.sum((coords_1 - coords_2) ** 2) / coords_1.shape[0])

@nb.njit
def get_rotation_matrix(coords_1: np.ndarray, coords_2: np.ndarray):
    """
    Superpose paired coordinates on each other using Kabsch superposition (SVD)
    Assumes centered coordinates

    Parameters
    ----------
    coords_1
        numpy array of coordinate data for the first protein; shape = (n, 3)
    coords_2
        numpy array of corresponding coordinate data for the second protein; shape = (n, 3)

    Returns
    -------
    rotation matrix for optimal superposition
    """
    correlation_matrix = np.dot(coords_2.T, coords_1)
    u, s, v = np.linalg.svd(correlation_matrix)
    reflect = np.linalg.det(u) * np.linalg.det(v) < 0
    if reflect:
        s[-1] = -s[-1]
        u[:, -1] = -u[:, -1]
    rotation_matrix = np.dot(u, v)
    return rotation_matrix.astype(np.float64)
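
# Usage sketch (illustrative only; coords_1 and coords_2 are assumed to be paired
# (n, 3) float arrays): both sets are centered first, and with the convention
# used above the returned matrix should map coords_1 onto coords_2.
#
#     coords_1 = coords_1 - coords_1.mean(axis=0)
#     coords_2 = coords_2 - coords_2.mean(axis=0)
#     rotation = get_rotation_matrix(coords_1, coords_2)
#     rmsd = get_rmsd(np.dot(coords_1, rotation.T), coords_2)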

def alignment_to_numpy(alignment):
    aln_np = {}
    for n in alignment:
        aln_seq = []
        index = 0
        for a in alignment[n]:
            if a == "-":
                aln_seq.append(-1)
            else:
                aln_seq.append(index)
                index += 1
        aln_np[n] = np.array(aln_seq)
    return aln_np
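
# Usage sketch (illustrative only; the toy alignment is made up): each aligned
# column is mapped to its residue index in the ungapped sequence, with -1 marking gaps.
#
#     alignment_to_numpy({"seq1": "AC-GT", "seq2": "ACAG-"})
#     # {'seq1': array([ 0,  1, -1,  2,  3]), 'seq2': array([ 0,  1,  2,  3, -1])}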