Module `deeporigin.src.structures.protein`

Classes

class Protein (pdb_id: str = '', file_path: str = '', struct_ind: int = 0, block_type: str = '', block_content: str = '')

Expand source code

class Protein:
    def __init__(
        self, pdb_id: str = "", file_path: str = "", struct_ind: int = 0, block_type: str = "", block_content: str = ""
    ):
        """
        Initialize a Protein from exactly one of: a PDB ID, a local file, or a raw content block.

        Depending on the source, the structure file is downloaded, read from disk, or parsed
        directly from the supplied text. The parsed structure is then narrowed to the model
        selected by ``struct_ind``.

        Args:
            pdb_id (str, optional): PDB identifier to download and load. Defaults to "".
            file_path (str, optional): Path to a local protein structure file. Defaults to "".
            struct_ind (int, optional): Index of the structure to select if multiple structures exist.
                Defaults to 0.
            block_type (str, optional): File format type ("pdb" or "pdbqt"). Required when using
                block_content; otherwise derived from the file extension. Defaults to "".
            block_content (str, optional): Direct string content of a protein structure file. Defaults to "".

        Raises:
            ValueError: If not exactly one source (pdb_id, file_path, or block_content) is provided.
            ValueError: If block_type is not provided when using block_content.
            ValueError: If the file format is not supported (only pdb/pdbqt are supported).
            ValueError: If the structure cannot be loaded or struct_ind is out of range.
            FileNotFoundError: If the specified file_path does not exist.

        Attributes:
            pdb_id (str | None): PDB identifier; set only when the protein was created from a PDB ID.
            file_path (Path): Path to the protein structure file.
            struct_ind (int): Index of the selected structure.
            name (str): Name of the protein structure (file stem, or a generated "Unknown_Structure_N").
            structure: Selected structure object.
            atom_types: The ``atom_name`` annotation of the selected structure.
            info (dict | None): Result of ``get_protein_info_dict`` when created from a PDB ID.
            block_type (str): Lower-cased structure file format.
            block_content (str): Text content of the structure file.
        """
        self.pdb_id = None
        self.file_path = None
        self.struct_ind = struct_ind
        self.name = None
        self.structure = None
        self.atom_types = None
        self.info = None

        # Without an explicit format, fall back to the file extension (leading dot removed).
        if not block_type and file_path:
            block_type = Path(file_path).suffix.lstrip(".")
        self.block_type = block_type.lower()
        self.block_content = block_content

        sources_provided = sum(bool(x) for x in [pdb_id, file_path, block_content])
        if sources_provided != 1:
            raise ValueError("Please provide exactly one of pdb_id, file_path, or block_content.")

        from_block = False
        try:
            if pdb_id:
                # Remember the identifier; prepare() falls back to it when modelling loops.
                self.pdb_id = pdb_id
                self.file_path = Path(self.download_protein_by_pdb_id(pdb_id)).absolute()
                if not self.block_type:
                    self.block_type = self.file_path.suffix.lstrip(".").lower()

                self.info = get_protein_info_dict(pdb_id)
                self.block_content = self.file_path.read_text()

            elif file_path:
                self.file_path = Path(file_path).absolute()
                if not self.file_path.exists():
                    raise FileNotFoundError(f"The file {self.file_path} does not exist.")

                if not self.block_type:
                    self.block_type = self.file_path.suffix.lstrip(".").lower()

                self.block_content = self.file_path.read_text()

                # Copy the file into the directory returned by get_directory() unless it already
                # lives there, and work with the copy from now on.
                try:
                    protein_file_dir = self.get_directory()
                    if protein_file_dir != str(self.file_path.parent):
                        destination = Path(protein_file_dir) / self.file_path.name
                        shutil.copy2(self.file_path, destination)

                        self.file_path = destination

                except Exception as e:
                    DEFAULT_LOGGER.log_error(f"Failed to copy file to destination: {str(e)}")
                    raise
            else:
                # Exactly one source is set, so this branch is the block_content case.
                if not self.block_type:
                    raise ValueError("block_type must be provided when initializing with block_content.")
                from_block = True

            if self.block_content:
                if self.block_type not in ["pdb", "pdbqt"]:
                    raise ValueError(f"Only pdb/pdbqt file formats are supported (given {self.block_type})")
                self.structure = self.load_structure_from_block(self.block_content, self.block_type)

            if self.structure is None:
                raise ValueError("Structure could not be loaded.")

            DEFAULT_LOGGER.log_info(
                f"Loaded structure from {self.file_path}. Selected structure index: {self.struct_ind}"
            )
            self.structure = self.select_structure(self.struct_ind)

            if self.name is None:
                if self.file_path:
                    self.name = self.file_path.stem
                else:
                    # No file to name the protein after: number it after the files that
                    # already start with the placeholder name in the protein directory.
                    self.name = "Unknown_Structure"
                    protein_file_dir = self.get_directory()
                    directory = Path(protein_file_dir)
                    num = len(list(directory.glob(f"{self.name}*")))
                    self.name = f"{self.name}_{num + 1}"

            self.atom_types = self.structure.atom_name

            if from_block:
                # Persist block-based proteins so that file_path is always populated.
                protein_file_dir = self.get_directory()
                directory = Path(protein_file_dir)

                self.file_path = directory / f"{self.name}.{self.block_type}"
                self.write_to_file(self.file_path)

        except Exception as e:
            DEFAULT_LOGGER.log_error(f"Failed to initialize Protein: {str(e)}")
            raise

    @property
    def coordinates(self):
        """
        Atomic coordinates of the currently selected structure.

        Returns:
            The ``coord`` attribute of ``self.structure``. Presumably a numpy array of shape
            (n_atoms, 3) holding x, y, z per atom; confirm against the structure library in use.
        """
        selected_structure = self.structure
        return selected_structure.coord

    def prepare(self, model_loops: bool = False, pdb_id: str = "") -> "Protein":
        """
        Run the protein preparation pipeline and return the prepared structure as a new Protein.

        Args:
            model_loops (bool, optional): Whether to model missing loops in the structure.
                Requires a PDB ID (argument or ``self.pdb_id``) if True. Defaults to False.
            pdb_id (str, optional): PDB ID of the protein structure. Falls back to ``self.pdb_id``
                when empty. Defaults to empty string.

        Returns:
            Protein: A new Protein instance loaded from the prepared '<name>_prep.pdb' file. Its
            ``pdb_id`` is the identifier used for this preparation.

        Raises:
            ValueError: If model_loops is True but no PDB ID is available.
            Exception: If the preparation response carries no prepared protein content.

        Notes:
            This method:
            - Collects metal and cofactor residue names from the structure; only metals listed
              in METALS are forwarded as metals
            - Sends the structure file through the module-level ``prepare`` pipeline
            - Writes the prepared structure next to the source file (or to the system temp
              directory when this protein has no file path) with a '_prep.pdb' suffix
            - Creates and returns a new Protein instance from that file
        """
        pdb_id = pdb_id if pdb_id else self.pdb_id
        if model_loops and not pdb_id:
            raise ValueError("PDB ID must be provided to model loops.")

        metal_resnames, cofactor_resnames = self.extract_metals_and_cofactors()
        metals_to_keep = [resname for resname in metal_resnames if resname.upper() in METALS]

        # Inside a method body the bare name resolves to the module-level prepare function.
        response = prepare(
            protein_path=self.file_path,
            protein_pdb_id=pdb_id,
            protein_extension=self.block_type,
            metal_resnames=metals_to_keep,
            cofactor_resnames=cofactor_resnames,
            model_loops=model_loops,
        )
        if not response["prepared_protein_content"]:
            raise Exception("Failed to prepare protein.")

        # Decide the output location before touching Path(self.file_path), so a protein
        # without a file path uses the fallback name instead of failing on Path(None).
        if self.file_path:
            source_path = Path(self.file_path)
            protein_dir, base_name = source_path.parent, source_path.stem
        else:
            protein_dir, base_name = Path(tempfile.gettempdir()), "modified_structure"
        new_file_name = protein_dir / f"{base_name}_prep.pdb"

        intermediate_protein = Protein(block_content=response["prepared_protein_content"], block_type="pdb")
        intermediate_protein.write_to_file(str(new_file_name))

        protein = Protein(file_path=new_file_name)
        # Carry over the identifier actually used (explicit argument or self.pdb_id).
        protein.pdb_id = pdb_id

        return protein

    def load_structure_from_block(self, block_content: str, block_type: str):
        """
        Parse a structure from in-memory PDB/PDBQT text.

        Both supported formats are read with ``PDBFile``. The check on ``block_type`` is
        case-sensitive. No instance state is read.

        Args:
            block_content (str): Text of the structure file.
            block_type (str): Format of ``block_content``; must be 'pdb' or 'pdbqt'.

        Returns:
            The object returned by ``PDBFile.get_structure()`` for the parsed text.

        Raises:
            ValueError: If ``block_type`` is neither 'pdb' nor 'pdbqt'.
        """
        supported_types = ("pdb", "pdbqt")
        if block_type not in supported_types:
            raise ValueError(f"Unsupported block type: {block_type}")

        text_stream = io.StringIO(block_content)
        parsed_file = PDBFile.read(text_stream)
        return parsed_file.get_structure()

    @staticmethod
    def download_protein_by_pdb_id(pdb_id: str, save_dir: str = "") -> str:
        """
        Fetch the PDB file for ``pdb_id`` unless it is already present on disk.

        The ID is lower-cased and the target file is ``<save_dir>/<pdb_id>.pdb``. The target
        directory is created if needed. When the file already exists no download is attempted.

        Args:
            pdb_id (str): PDB ID of the protein structure.
            save_dir (str, optional): Directory where the PDB file is stored. An empty string
                selects ``Protein.get_directory()``. Defaults to "".

        Returns:
            str: Path of the PDB file inside the target directory.

        Raises:
            Exception: Whatever ``fetch`` raises; the error is logged and re-raised.

        Example:
            >>> file_path = Protein.download_protein_by_pdb_id("1abc", "/path/to/save/")
            >>> print(file_path)
            /path/to/save/1abc.pdb
        """
        target_dir_name = Protein.get_directory() if save_dir == "" else save_dir
        normalized_id = pdb_id.lower()

        target_dir = Path(target_dir_name)
        target_dir.mkdir(parents=True, exist_ok=True)
        target_file = target_dir / f"{normalized_id}.pdb"

        if target_file.exists():
            DEFAULT_LOGGER.log_info(f"PDB file {target_file} already exists. Skipping download.")
            return str(target_file)

        try:
            fetch(normalized_id, "pdb", target_dir)
            DEFAULT_LOGGER.log_info(f"Downloaded PDB {normalized_id} to {target_file}.")
        except Exception as error:
            DEFAULT_LOGGER.log_error(f"Failed to download PDB {normalized_id}: {str(error)}")
            raise

        return str(target_file)

    @staticmethod
    def load_structure(structure_file_path: str):
        """
        Read a PDB file from disk and return its structure.

        Args:
            structure_file_path (str): Path to the PDB structure file, passed to ``PDBFile.read``.

        Returns:
            The object returned by ``PDBFile.get_structure()``.

        Raises:
            Any exception raised by ``PDBFile.read`` or ``get_structure`` propagates unchanged
            (presumably including a missing-file error; confirm against the PDBFile docs).
        """
        return PDBFile.read(structure_file_path).get_structure()

    def select_structure(self, index: int):
        """
        Return the entry of ``self.structure`` at ``index``.

        Negative indices are rejected rather than counted from the end.

        Args:
            index (int): Zero-based position of the structure to select.

        Returns:
            The element ``self.structure[index]``.

        Raises:
            ValueError: If ``index`` is negative or not smaller than ``len(self.structure)``.
        """
        total = len(self.structure)
        if 0 <= index < total:
            return self.structure[index]

        raise ValueError(f"Invalid structure index {index}. Total structures: {total}")

    def _filter_hetatm_records(self, exclude_water: bool = True, keep_resnames: Optional[List[str]] = None):
        """
        Filter HETATM records from the structure based on specified criteria.

        This method filters heterogeneous atom (HETATM) records from the structure,
        with options to exclude water molecules and keep only specific residue names.

        Args:
            exclude_water (bool, optional): Whether to exclude water molecules from the results.
                Removes 'HOH' and 'WAT' residues if True. Defaults to True.
            keep_resnames (List[str], optional): List of residue names to keep in the results.
                If provided, only residues with these names will be kept. Case-insensitive.
                Defaults to None.

        Returns:
            AtomArray: Filtered HETATM records as an AtomArray containing only the
                specified residues.
        """
        hetatm_records = self.structure[self.structure.hetero]
        res_names_upper = np.char.upper(hetatm_records.res_name)

        if exclude_water:
            water_residue_names = ["HOH", "WAT"]
            water_residue_names_upper = [name.upper() for name in water_residue_names]
            hetatm_records = hetatm_records[~np.isin(res_names_upper, water_residue_names_upper)]
            res_names_upper = np.char.upper(hetatm_records.res_name)

        if keep_resnames:
            keep_resnames_upper = [name.upper() for name in keep_resnames]
            hetatm_records = hetatm_records[np.isin(res_names_upper, keep_resnames_upper)]

        return hetatm_records

    def _filter_chain_records(self, chain_ids: Optional[List[str]] = None):
        """
        Filter structure records based on specified chain IDs.

        Args:
            chain_ids (Optional[List[str]]): List of chain IDs to filter by. If None or contains "ALL",
            returns all chains.

        Returns:
            Structure: Filtered structure records containing only specified chains.
        """

        if chain_ids is None or "ALL" in chain_ids:
            return self.structure
        else:
            return self.structure[np.isin(self.structure.chain_id, chain_ids)]

    def list_chain_names(self) -> List[str]:
        """
        List the distinct chain IDs present in the structure.

        Returns:
            List[str]: Unique chain IDs in the sorted order produced by ``np.unique``.
        """
        all_chain_ids = self._filter_chain_records().chain_id
        unique_chain_ids = np.unique(all_chain_ids)
        return list(unique_chain_ids)

    def list_hetero_names(self, exclude_water=True) -> List[str]:
        """
        List the distinct residue names found among the hetero atoms.

        Args:
            exclude_water (bool): Forwarded to ``_filter_hetatm_records``; when True, water
                residues are left out. Defaults to True.

        Returns:
            List[str]: Unique hetero residue names in the sorted order produced by ``np.unique``.
        """
        hetero_atoms = self._filter_hetatm_records(exclude_water=exclude_water)
        return list(np.unique(hetero_atoms.res_name))

    def select_chain(self, chain_id: str) -> Optional["Protein"]:
        """
        Build a new Protein that contains only one chain of this structure.

        Args:
            chain_id (str): Identifier of the chain to keep.

        Returns:
            Optional[Protein]: The result of ``_create_new_protein_with_structure`` for the
            selected chain; the backing file gets the suffix ``_chain_<chain_id>``.

        Raises:
            ValueError: If no atom of the structure belongs to ``chain_id``.

        Example:
            >>> protein = Protein(file_path="1abc.pdb")
            >>> chain_a = protein.select_chain("A")
        """
        chain_atoms = self._filter_chain_records(chain_ids=[chain_id])
        if len(chain_atoms) == 0:
            raise ValueError(f"Chain {chain_id} not found.")

        return self._create_new_protein_with_structure(chain_atoms, suffix=f"_chain_{chain_id}")

    def select_chains(self, chain_ids: List[str]) -> "Protein":
        """
        Build a new Protein that contains only the requested chains.

        Args:
            chain_ids (List[str]): Chain identifiers to keep.

        Returns:
            Protein: The result of ``_create_new_protein_with_structure``; the backing file
            gets the suffix ``_chains_`` followed by the IDs joined with underscores.

        Raises:
            ValueError: If none of the atoms belong to any of the requested chains.

        Example:
            >>> protein.select_chains(['A', 'B'])
            # Returns a new Protein object with only chains A and B
        """
        selected_atoms = self._filter_chain_records(chain_ids=chain_ids)
        if not len(selected_atoms):
            raise ValueError(f"No chains found for the provided chain IDs: {chain_ids}")

        joined_ids = "_".join(chain_ids)
        return self._create_new_protein_with_structure(selected_atoms, suffix=f"_chains_{joined_ids}")

    def select_ligand(self, res_name: str) -> List["Ligand"]:
        """
        Build one Ligand per occurrence of the hetero residue ``res_name`` in the structure.

        Occurrences are distinguished by (chain ID, residue ID, insertion code) and are
        returned in the order in which they first appear in the structure.

        Args:
            res_name (str): The residue name of the ligand to select (case-sensitive).

        Returns:
            List[Ligand]: One Ligand per residue occurrence. Occurrences that cannot be
            converted to SDF when no SMILES is available are logged and skipped, so the list
            may be empty.

        Raises:
            ValueError: If the specified residue name is not among the structure's hetero
                residues, or if no atoms are found for the specified ligand.

        Notes:
            - SMILES is first requested via ``Ligand.fetch_smiles_from_pdb_api``
            - If that fails, each residue's PDB block is converted to SDF with ``convert_block``
            - When SMILES is available it is used to assign bond orders on the ligand

        Example:
            >>> protein = Protein(file_path="1abc.pdb")
            >>> ligands = protein.select_ligand("ATP")
            >>> print(len(ligands))  # Number of ATP molecules in structure
        """
        hetero_names = self.list_hetero_names()
        if res_name not in hetero_names:
            raise ValueError(f"Residue {res_name} not found. Available ligands are: {hetero_names}")

        ligand_atoms = self.structure[(self.structure.res_name == res_name) & self.structure.hetero]
        if len(ligand_atoms) == 0:
            raise ValueError(f"No atoms found for ligand {res_name}.")

        try:
            smiles = Ligand.fetch_smiles_from_pdb_api(res_name)
            DEFAULT_LOGGER.log_warning(f"SMILES for {res_name}: {smiles}")
        except Exception:
            DEFAULT_LOGGER.log_warning(f"Failed to fetch SMILES for {res_name}.")
            smiles = None

        # dict.fromkeys removes duplicates but keeps first-appearance order, so the result
        # order is stable across runs (a plain set would depend on string hashing).
        residue_keys = dict.fromkeys(zip(ligand_atoms.chain_id, ligand_atoms.res_id, ligand_atoms.ins_code))

        ligands = []
        for chain_id, res_id, ins_code in residue_keys:
            mask = (
                (ligand_atoms.chain_id == chain_id)
                & (ligand_atoms.res_id == res_id)
                & (ligand_atoms.ins_code == ins_code)
            )
            ligand_group = ligand_atoms[mask]

            # Serialize this single residue to PDB text for the Ligand constructor.
            pdb_file = PDBFile()
            pdb_file.set_structure(ligand_group)
            pdb_block = io.StringIO()
            pdb_file.write(pdb_block)
            block_content, block_type = pdb_block.getvalue(), "pdb"

            if not smiles:
                DEFAULT_LOGGER.log_warning(f"PROCEEDING WITH OPEN BABEL TO EXTRACT SMILES FOR {res_name}")
                try:
                    block_content, block_type = convert_block("pdb", block_content, "sdf"), "sdf"
                except Exception:
                    DEFAULT_LOGGER.log_error("Failed to convert block to SDF. Please provide smiles manually.")
                    # Skip only this occurrence; returning None here would break the
                    # declared list return type and drop ligands already built.
                    continue

            ligand = Ligand(
                block_content=block_content,
                block_type=block_type,
                name=res_name,
                xref_protein=self,
                xref_ins_code=ins_code,
                xref_residue_id=res_id,
                xref_protein_chain_id=chain_id,
            )

            if smiles:
                ligand.mol.assign_bond_order_from_smiles(smiles)

            ligands.append(ligand)

        return ligands

    def select_ligands(self, res_names: Optional[List[str]] = None) -> List["Ligand"]:
        """
        Collect the ligands for several residue names into one flat list.

        Args:
            res_names (Optional[List[str]]): Residue names to select ligands for.
                If None, all hetero residue names from ``list_hetero_names`` are used.
                Defaults to None.

        Returns:
            List["Ligand"]: Ligands for every residue name that could be resolved, in the
                order of ``res_names``. Empty if nothing matched.

        Note:
            A residue name for which ``select_ligand`` raises ValueError is logged as a warning
            and skipped. A falsy result from ``select_ligand`` (None or an empty list) is
            skipped as well instead of failing in ``list.extend``.
        """
        if res_names is None:
            res_names = self.list_hetero_names()

        ligands = []
        for res_name in res_names:
            try:
                found = self.select_ligand(res_name)
            except ValueError as e:
                DEFAULT_LOGGER.log_warning(str(e))
                continue
            # select_ligand may hand back None when it could not build the ligand.
            if found:
                ligands.extend(found)
        return ligands

    def remove_hetatm(self, keep_resnames: Optional[List[str]] = None, remove_metals: Optional[List[str]] = None):
        """
        Remove HETATM records from the protein structure while keeping selected residues and metals.

        Hetero atoms are kept only when their own (upper-cased) residue name is in the keep set,
        i.e. ``keep_resnames`` plus every entry of METALS not listed in ``remove_metals``.
        Matching is done per atom by residue name, so an unrelated hetero residue that merely
        shares a residue number with a kept one is not retained.

        Args:
            keep_resnames (Optional[List[str]]): Residue names to keep despite being HETATM records.
                Names are case-insensitive. Water is kept only if named here. Defaults to None.
            remove_metals (Optional[List[str]]): Metal names to remove from the structure.
                By default, all metals in METALS are kept. Names are case-insensitive. Defaults to None.

        Returns:
            Protein: The result of ``_create_new_protein_with_structure`` for the filtered
            structure, written with the suffix "_no_hetatm".

        Examples:
            >>> protein.remove_hetatm(keep_resnames=['NAG', 'BMA'], remove_metals=['ZN', 'MG'])
            >>> protein.remove_hetatm()  # Removes all HETATM except metals
            >>> protein.remove_hetatm(keep_resnames=['HOH'])  # Keeps water molecules
        """
        removed_metals = {metal.upper() for metal in remove_metals} if remove_metals else set()
        keep_names = {name.upper() for name in keep_resnames} if keep_resnames else set()
        keep_names |= {metal.upper() for metal in METALS} - removed_metals

        hetero_mask = self.structure.hetero
        if keep_names:
            res_names_upper = np.char.upper(self.structure.res_name)
            # np.isin needs a sequence; a set would be treated as a single object.
            kept_hetero = hetero_mask & np.isin(res_names_upper, list(keep_names))
            filtered_structure = self.structure[~hetero_mask | kept_hetero]
        else:
            filtered_structure = self.structure[~hetero_mask]

        return self._create_new_protein_with_structure(filtered_structure, suffix="_no_hetatm")

    def remove_resnames(self, exclude_resnames: Optional[List[str]] = None) -> "Protein":
        """
        Build a new Protein without the residues whose names are listed.

        Name matching is exact (case-sensitive) against ``self.structure.res_name``.

        Args:
            exclude_resnames (List[str], optional): Residue names to drop. When None, a copy of
                the whole structure is used.

        Returns:
            Protein: The result of ``_create_new_protein_with_structure`` for the remaining
            atoms, written with the suffix '_resnames_removed'.

        Examples:
            >>> protein.remove_resnames(['ALA', 'GLY'])  # removes all alanine and glycine residues
            >>> protein.remove_resnames()  # returns a copy of the protein
        """
        if exclude_resnames is None:
            remaining_atoms = self.structure.copy()
        else:
            excluded_mask = np.isin(self.structure.res_name, exclude_resnames)
            remaining_atoms = self.structure[~excluded_mask]

        return self._create_new_protein_with_structure(remaining_atoms, suffix="_resnames_removed")

    def remove_water(self) -> "Protein":
        """
        Build a new Protein without the atoms flagged by ``filter_solvent``.

        Returns:
            Protein: The result of ``_create_new_protein_with_structure`` for the non-solvent
            atoms, written with the suffix '_no_water'.
        """
        solvent_mask = filter_solvent(self.structure)
        non_solvent_atoms = self.structure[~solvent_mask]
        return self._create_new_protein_with_structure(non_solvent_atoms, suffix="_no_water")

    def extract_metals_and_cofactors(self) -> Tuple[List[str], List[str]]:
        """
        Classify the non-water hetero residues of the structure as metals or cofactors.

        Hetero atoms are grouped per residue occurrence (chain ID, residue ID, insertion code).
        A residue counts as a metal when every one of its atoms has an element symbol in the
        local metal table; any other residue counts as a cofactor.

        Returns:
            Tuple[List[str], List[str]]: ``(metal_resnames, cofactor_resnames)``, each a list of
            unique, stripped, upper-cased residue names in no particular order.

        Notes:
            - Residues named exactly 'HOH' or 'WAT' are ignored
            - Both result lists are logged through DEFAULT_LOGGER
        """
        # Element symbols (upper case) that make an atom count as a metal.
        metal_elements = {
            "AC", "AG", "AL", "AM", "AS", "AU", "B", "BA", "BE", "BH", "BI", "BK", "CA", "CD", "CE", "CF",
            "CM", "CN", "CS", "CU", "DB", "DS", "DY", "ER", "ES", "EU", "FE", "FM", "FR", "GA", "GD", "GE",
            "HF", "HG", "HO", "HS", "K", "LA", "LI", "LR", "LU", "MD", "MG", "MN", "MO", "MT", "NA", "NB",
            "ND", "NI", "NO", "NP", "OS", "PA", "TA", "PM", "PO", "PR", "PT", "PU", "RA", "RB", "RE", "RF",
            "RG", "RH", "RU", "SB", "SC", "SG", "SI", "SM", "SN", "SR", "TB", "TC", "TE", "TH", "TI", "TL",
            "TM", "U", "V", "W", "YB", "ZN", "ZR", "CO", "CR", "IN", "IR", "PB", "PD",
        }

        hetero_atoms = self.structure[self.structure.hetero]
        is_water = np.isin(hetero_atoms.res_name, ["HOH", "WAT"])
        hetero_atoms = hetero_atoms[~is_water]

        # Collect the atoms of each residue occurrence.
        atoms_by_residue = {}
        for atom in hetero_atoms:
            residue_key = (atom.chain_id, atom.res_id, atom.ins_code)
            atoms_by_residue.setdefault(residue_key, []).append(atom)

        metal_names = set()
        cofactor_names = set()
        for residue_atoms in atoms_by_residue.values():
            residue_name = residue_atoms[0].res_name.strip().upper()
            only_metal_atoms = all(atom.element.strip().upper() in metal_elements for atom in residue_atoms)
            target_names = metal_names if only_metal_atoms else cofactor_names
            target_names.add(residue_name)

        metal_resnames = list(metal_names)
        cofactor_resnames = list(cofactor_names)

        DEFAULT_LOGGER.log_info(f"Identified metal residues: {metal_resnames}")
        DEFAULT_LOGGER.log_info(f"Identified cofactor residues: {cofactor_resnames}")

        return metal_resnames, cofactor_resnames

    def _create_new_protein_with_structure(self, new_structure, suffix: str = "_modified") -> "Protein":
        """
        Write ``new_structure`` to a new PDB file and load it as a new Protein.

        Args:
            new_structure: The modified protein structure to be saved.
            suffix (str, optional): Suffix appended to the base filename. Defaults to "_modified".

        Returns:
            Protein: A new Protein instance loaded from the written file.

        Raises:
            Exception: Any error raised while writing the file or constructing the new Protein.
                The error is logged and then re-raised so callers never receive None.

        Notes:
            - If the target file already exists, it is removed and rewritten
            - If this protein has no file path, the file is created in the system temp directory
              under the base name "modified_structure"
            - Otherwise the new file sits next to the original and reuses its stem plus the suffix
        """
        base_name = self.file_path.stem if self.file_path else "modified_structure"
        new_file_name = f"{base_name}{suffix}.pdb"
        parent_dir = self.file_path.parent if self.file_path else Path(tempfile.gettempdir())
        new_file_path = parent_dir / new_file_name

        if new_file_path.exists():
            DEFAULT_LOGGER.log_warning(f"File {new_file_path} already exists. Overwriting.")
            os.remove(new_file_path)

        try:
            pdb_file = PDBFile()
            pdb_file.set_structure(new_structure)
            pdb_file.write(str(new_file_path))

            DEFAULT_LOGGER.log_info(f"Created new file with modified structure at {new_file_path}")

            return Protein(file_path=str(new_file_path))
        except Exception as e:
            DEFAULT_LOGGER.log_error(f"Failed to create new Protein with modified structure: {str(e)}")
            # Propagate instead of silently returning None, which would only surface later
            # as an AttributeError in the caller.
            raise

    def write_to_file(self, file_path: str):
        """
        Write the current structure to ``file_path`` using ``PDBFile``.

        This is a best-effort operation: any exception raised while writing is logged through
        DEFAULT_LOGGER and not re-raised, so the method returns None either way.

        Args:
            file_path (str): Destination of the PDB file.

        Example:
            >>> protein.write_to_file("/path/to/output.pdb")
        """
        try:
            writer = PDBFile()
            writer.set_structure(self.structure)
            writer.write(file_path)
            DEFAULT_LOGGER.log_info(f"Current structure written to {file_path}.")
        except Exception as error:
            failure_reason = str(error)
            DEFAULT_LOGGER.log_error(f"Failed to write structure to file {file_path}: {failure_reason}")

    @jupyter_visualization
    def visualize(self) -> str:
        """
        Render the protein with ProteinViewer and return the resulting HTML.

        The current structure is first written to a uniquely named PDB file in the
        system temp directory; that file is then handed to ``ProteinViewer``.
        This method does not delete the temporary file afterwards.

        Returns:
            str: HTML produced by ``ProteinViewer.render_protein``. If anything in
                 the process raises, an HTML paragraph describing the error is
                 returned instead.

        Notes:
            No exception escapes this method: failures are logged through
            ``DEFAULT_LOGGER`` and converted into the error HTML.

        Example:
            >>> protein = Protein("1abc")
            >>> html = protein.visualize()
            >>> # html contains visualization that can be displayed in browser
        """
        try:
            # A random token keeps concurrent or repeated visualizations from clobbering each other.
            token = uuid.uuid4().hex
            pdb_path = str(Path(tempfile.gettempdir()) / f"{self.name}_visualize_{token}.pdb")
            self.write_to_file(pdb_path)

            viewer = ProteinViewer(pdb_path)
            return viewer.render_protein(protein_config=viewer.get_protein_visualization_config())
        except Exception as e:
            DEFAULT_LOGGER.log_error(f"Visualization failed: {str(e)}")
            return f"<p>Visualization failed: {str(e)}</p>"

    def _repr_html_(self) -> str:
        """
        Build the rich HTML shown when the protein is displayed in a Jupyter notebook.

        Returns:
            str: ``generate_html_output(self.info)`` when ``self.info`` is truthy,
                 otherwise the result of ``self.visualize()``. If either path
                 raises, a warning is logged and the plain ``__str__`` text is
                 returned instead.
        """
        try:
            return generate_html_output(self.info) if self.info else self.visualize()
        except Exception as e:
            DEFAULT_LOGGER.log_warning(f"Failed to generate HTML representation: {str(e)}")
            return self.__str__()

    def __str__(self):
        info_str = f"Name: {self.name}\nFile Path: {self.file_path}\n"
        if self.info:
            info_str += f"Info: {self.info}\n"
        return f"Protein:\n  {info_str}"

    @staticmethod
    def get_directory() -> str:
        """
        Return the base directory used for protein files, creating it if needed.

        The directory is the ``proteins`` sub-folder of ``WORKING_DIR``; missing
        parent directories are created as well.

        Returns:
            str: Path of the proteins directory (absolute only if ``WORKING_DIR`` is).
        """
        target = Path(WORKING_DIR, "proteins")
        target.mkdir(parents=True, exist_ok=True)
        return str(target)

    def update_coordinates(self, coords: np.ndarray):
        """
        Replace the coordinates of the loaded structure in place.

        Args:
            coords (np.ndarray): New coordinates assigned to ``self.structure.coord``.
                                No shape check is performed here; the array is
                                expected to match the existing coordinates.

        Returns:
            None

        Notes:
            The structure object is mutated directly and an info message is logged.
        """
        structure = self.structure
        structure.coord = coords
        DEFAULT_LOGGER.log_info("Protein coordinates has been inplaced updated.")

    def get_center_by_residues(self, residues: List[int]) -> Tuple[list, str, object]:
        """
        Compute the centroid of the given residues and render them highlighted.

        Atoms are selected by residue number (``res_id``) only, so residues that
        share a number across several chains are all included in the centroid.

        Args:
            residues (List[int]): A list of 1-3 integer residue IDs to analyze.

        Returns:
            tuple: A tuple containing:
                - list: The coordinates of the center point [x, y, z]
                - str: Warning message if any residues were not found (empty string if all found)
                - The object returned by ``JupyterViewer.visualize`` for the highlighted
                  structure (an empty visualization if the rendered HTML has no ATOM data)

        Raises:
            ValueError: If the number of residues is not between 1 and 3
            ValueError: If any residue ID is not an integer
            ValueError: If no atoms are found for the specified residue IDs

        Examples:
            >>> center, warning, viewer = protein.get_center_by_residues([1, 2, 3])
            >>> print(center)  # [x, y, z]
            >>> print(warning)  # Empty string or warning about missing residues
        """
        if not (1 <= len(residues) <= 3):
            # The guidance lives in the exception instead of being printed to stdout.
            raise ValueError("Invalid number of residue IDs: please provide 1-3 residue IDs")

        for res_id in residues:
            if not isinstance(res_id, int):
                raise ValueError(f"Residue IDs must be integers. Got: {res_id}")

        mask = np.isin(self.structure.res_id, residues)
        pocket_atoms = self.structure[mask]
        if len(pocket_atoms) == 0:
            raise ValueError(f"No atoms found for the specified residue IDs: {residues}")

        # Partial matches are allowed: report the IDs that selected no atoms.
        warning = ""
        missing_residue_ids = set(residues) - set(pocket_atoms.res_id)
        if missing_residue_ids:
            warning = f"Residue IDs {missing_residue_ids} not found in the structure"

        center = centroid(pocket_atoms)

        # The viewer reads the structure from disk, so write it to a throwaway directory.
        with tempfile.TemporaryDirectory() as temp_dir:
            protein_format = "pdb"
            protein_path = os.path.join(temp_dir, "protein.pdb")
            self.write_to_file(protein_path)

            docking_viewer = DockingViewer()
            html = docking_viewer.render_highligh_residues(
                protein_data=protein_path, protein_format=protein_format, residue_ids=residues
            )

            # Without any ATOM record in the output there is nothing to display.
            if "ATOM" not in html:
                html = ""

        return list(center), warning, JupyterViewer.visualize(html)

Initialize a Protein object from various input sources.

This constructor can create a Protein object from a PDB ID, file path, or direct content block. It handles structure loading, file management, and basic protein information setup.

Args

pdb_id : str, optional: PDB identifier to download and load protein structure. Defaults to "".
file_path : str, optional: Path to a local protein structure file. Defaults to "".
struct_ind : int, optional: Index of the structure to select if multiple structures exist. Defaults to 0.
block_type : str, optional: File format type (e.g., "pdb", "pdbqt"). Required if using block_content. Defaults to "".
block_content : str, optional: Direct string content of a protein structure file. Defaults to "".

Raises

ValueError: If not exactly one source (pdb_id, file_path, or block_content) is provided.
ValueError: If block_type is not provided when using block_content.
ValueError: If file format is not supported (only pdb/pdbqt are supported).
ValueError: If structure cannot be loaded.
FileNotFoundError: If the specified file_path does not exist.

Attributes

pdb_id : str: PDB identifier of the protein.
file_path : Path: Absolute path to the protein structure file.
struct_ind : int: Index of the selected structure.
name : str: Name of the protein structure.
structure: Loaded protein structure object.
atom_types: Types of atoms in the structure.
info : dict: Additional protein information.
block_type : str: Type of structure file format.
block_content : str: Content of the structure file.

Static methods

def download_protein_by_pdb_id(pdb_id: str, save_dir: str = '') ‑> str

Expand source code

@staticmethod
def download_protein_by_pdb_id(pdb_id: str, save_dir: str = "") -> str:
    """
    Fetch a structure in PDB format by its PDB ID unless it is already on disk.

    The ID is lower-cased and the target file is ``<save_dir>/<pdb_id>.pdb``.
    When that file already exists the download is skipped.

    Args:
        pdb_id (str): The 4-character PDB ID of the protein structure.
        save_dir (str, optional): Directory path where the PDB file will be saved.
            When empty, ``Protein.get_directory()`` is used. The directory is
            created if it does not exist. Defaults to "".

    Returns:
        str: The full path to the PDB file.

    Raises:
        Exception: Whatever ``fetch`` raises (e.g., invalid PDB ID, network issues);
            the error is logged and then re-raised unchanged.

    Example:
        >>> file_path = Protein.download_protein_by_pdb_id("1abc", "/path/to/save/")
        >>> print(file_path)
        /path/to/save/1abc.pdb
    """
    if save_dir == "":
        save_dir = Protein.get_directory()

    pdb_id = pdb_id.lower()
    target_dir = Path(save_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = target_dir / f"{pdb_id}.pdb"

    # Reuse a previously downloaded copy when there is one.
    if destination.exists():
        DEFAULT_LOGGER.log_info(f"PDB file {destination} already exists. Skipping download.")
        return str(destination)

    try:
        fetch(pdb_id, "pdb", target_dir)
        DEFAULT_LOGGER.log_info(f"Downloaded PDB {pdb_id} to {destination}.")
    except Exception as e:
        DEFAULT_LOGGER.log_error(f"Failed to download PDB {pdb_id}: {str(e)}")
        raise

    return str(destination)

Downloads a protein structure file from the PDB database.

This function retrieves a protein structure file in PDB format from the Protein Data Bank using the provided PDB ID. If the file already exists in the specified directory, it skips the download.

Args

pdb_id : str: The 4-character PDB ID of the protein structure.
save_dir : str, optional: Directory path where the PDB file will be saved. If not provided, uses the default protein directory. Defaults to "".

Returns

str: The full path to the downloaded PDB file.

Raises

Exception: If the download fails for any reason (e.g., invalid PDB ID, network issues, etc.).

Example

>>> file_path = Protein.download_protein_by_pdb_id("1abc", "/path/to/save/")
>>> print(file_path)
/path/to/save/1abc.pdb

def get_directory() ‑> str

Expand source code

@staticmethod
def get_directory() -> str:
    """
    Return the base directory used for protein files, creating it if needed.

    The directory is the ``proteins`` sub-folder of ``WORKING_DIR``; missing
    parent directories are created as well.

    Returns:
        str: Path of the proteins directory (absolute only if ``WORKING_DIR`` is).
    """
    target = Path(WORKING_DIR, "proteins")
    target.mkdir(parents=True, exist_ok=True)
    return str(target)

Returns the path to the base proteins directory.

Creates the directory if it doesn't exist, using the WORKING_DIR constant as the root.

Returns

str: The absolute path to the proteins directory as a string

def load_structure(structure_file_path: str)

Expand source code

@staticmethod
def load_structure(structure_file_path: str):
    """
    Read a PDB file and return the structure it contains.

    Args:
        structure_file_path (str): Path to the PDB structure file.

    Returns:
        The object returned by ``PDBFile.get_structure()`` for the file.

    Raises:
        Any exception raised by ``PDBFile.read`` or ``get_structure`` (for example
        when the file is missing or cannot be parsed) propagates unchanged.
    """
    return PDBFile.read(structure_file_path).get_structure()

Load a protein structure from a PDB file.

Args

structure_file_path : str: Path to the PDB structure file.

Returns

Structure: A Structure object representing the protein structure.

Raises

FileNotFoundError: If the specified PDB file does not exist.
PDBParseError: If the PDB file cannot be properly parsed.

Instance variables

prop coordinates

Expand source code

@property
def coordinates(self):
    """
    Atomic coordinates of the loaded protein structure.

    Returns:
        The ``coord`` attribute of ``self.structure`` (presumably a numpy array of
        shape (n_atoms, 3) holding x, y, z per atom; confirm against the structure type).
    """
    structure = self.structure
    return structure.coord

Gets the atomic coordinates of the protein structure.

Returns

numpy.ndarray: A numpy array containing the 3D coordinates of all atoms in the structure.

The array has shape (n_atoms, 3) where each row represents the x, y, z coordinates of an atom.

Methods

def extract_metals_and_cofactors(self) ‑> Tuple[List[str], List[str]]

Expand source code

def extract_metals_and_cofactors(self) -> Tuple[List[str], List[str]]:
    """
    Split the non-water HETATM residues of the structure into metal ions and cofactors.

    Hetero atoms are grouped per residue (chain ID, residue ID, insertion code). A
    residue whose atoms are all metal elements is reported as a metal ion; every
    other hetero residue is reported as a cofactor.

    Returns:
        Tuple[List[str], List[str]]: A tuple containing two lists:
            - First list contains the residue names of identified metal ions
            - Second list contains the residue names of identified cofactors
          Both lists hold unique, upper-cased names in alphabetical order, so the
          result is reproducible between runs.

    Notes:
        - Water molecules (HOH, WAT) are excluded from the analysis
        - Metal ions are identified by checking if all atoms in a residue are metal elements
        - Any non-metal heterogeneous molecule is classified as a cofactor
        - A residue name can appear in both lists if different residues with that
          name are classified differently
        - The results are logged using DEFAULT_LOGGER
    """
    hetatm_records = self.structure[self.structure.hetero]
    water_residue_names = ["HOH", "WAT"]
    hetatm_records = hetatm_records[~np.isin(hetatm_records.res_name, water_residue_names)]

    # Element symbols treated as metals (the list also contains some metalloids).
    # NOTE(review): yttrium ("Y") is absent from this set; confirm whether that is intended.
    metal_elements = {
        "AC", "AG", "AL", "AM", "AS", "AU", "B", "BA", "BE", "BH", "BI", "BK", "CA", "CD", "CE", "CF",
        "CM", "CN", "CS", "CU", "DB", "DS", "DY", "ER", "ES", "EU", "FE", "FM", "FR", "GA", "GD", "GE",
        "HF", "HG", "HO", "HS", "K", "LA", "LI", "LR", "LU", "MD", "MG", "MN", "MO", "MT", "NA", "NB",
        "ND", "NI", "NO", "NP", "OS", "PA", "TA", "PM", "PO", "PR", "PT", "PU", "RA", "RB", "RE", "RF",
        "RG", "RH", "RU", "SB", "SC", "SG", "SI", "SM", "SN", "SR", "TB", "TC", "TE", "TH", "TI", "TL",
        "TM", "U", "V", "W", "YB", "ZN", "ZR", "CO", "CR", "IN", "IR", "PB", "PD",
    }

    # Group atoms per residue so each residue is classified as a whole.
    residue_groups = defaultdict(list)
    for atom in hetatm_records:
        residue_groups[(atom.chain_id, atom.res_id, atom.ins_code)].append(atom)

    metal_names = set()
    cofactor_names = set()
    for atoms in residue_groups.values():
        res_name = atoms[0].res_name.strip().upper()
        if all(atom.element.strip().upper() in metal_elements for atom in atoms):
            metal_names.add(res_name)
        else:
            cofactor_names.add(res_name)

    # Sets of strings iterate in a hash-randomized order; sort for stable output.
    metal_resnames = sorted(metal_names)
    cofactor_resnames = sorted(cofactor_names)

    DEFAULT_LOGGER.log_info(f"Identified metal residues: {metal_resnames}")
    DEFAULT_LOGGER.log_info(f"Identified cofactor residues: {cofactor_resnames}")

    return metal_resnames, cofactor_resnames

Extracts metal ions and cofactor molecules from the protein structure by analyzing HETATM records.

This method processes the structure's heterogeneous atoms (HETATM records), excluding water molecules, and categorizes them into metal ions and cofactors based on their elemental composition.

Returns

Tuple[List[str], List[str]]: A tuple containing two lists: the first holds the residue names of the identified metal ions, and the second holds the residue names of the identified cofactors.

Notes

Water molecules (HOH, WAT) are excluded from the analysis
Metal ions are identified by checking if all atoms in a residue are metal elements
Any non-metal heterogeneous molecule is classified as a cofactor
The results are logged using DEFAULT_LOGGER

def get_center_by_residues(self, residues: List[str]) ‑> numpy.ndarray

Expand source code

def get_center_by_residues(self, residues: List[int]) -> Tuple[list, str, object]:
    """
    Compute the centroid of the given residues and render them highlighted.

    Atoms are selected by residue number (``res_id``) only, so residues that
    share a number across several chains are all included in the centroid.

    Args:
        residues (List[int]): A list of 1-3 integer residue IDs to analyze.

    Returns:
        tuple: A tuple containing:
            - list: The coordinates of the center point [x, y, z]
            - str: Warning message if any residues were not found (empty string if all found)
            - The object returned by ``JupyterViewer.visualize`` for the highlighted
              structure (an empty visualization if the rendered HTML has no ATOM data)

    Raises:
        ValueError: If the number of residues is not between 1 and 3
        ValueError: If any residue ID is not an integer
        ValueError: If no atoms are found for the specified residue IDs

    Examples:
        >>> center, warning, viewer = protein.get_center_by_residues([1, 2, 3])
        >>> print(center)  # [x, y, z]
        >>> print(warning)  # Empty string or warning about missing residues
    """
    if not (1 <= len(residues) <= 3):
        # The guidance lives in the exception instead of being printed to stdout.
        raise ValueError("Invalid number of residue IDs: please provide 1-3 residue IDs")

    for res_id in residues:
        if not isinstance(res_id, int):
            raise ValueError(f"Residue IDs must be integers. Got: {res_id}")

    mask = np.isin(self.structure.res_id, residues)
    pocket_atoms = self.structure[mask]
    if len(pocket_atoms) == 0:
        raise ValueError(f"No atoms found for the specified residue IDs: {residues}")

    # Partial matches are allowed: report the IDs that selected no atoms.
    warning = ""
    missing_residue_ids = set(residues) - set(pocket_atoms.res_id)
    if missing_residue_ids:
        warning = f"Residue IDs {missing_residue_ids} not found in the structure"

    center = centroid(pocket_atoms)

    # The viewer reads the structure from disk, so write it to a throwaway directory.
    with tempfile.TemporaryDirectory() as temp_dir:
        protein_format = "pdb"
        protein_path = os.path.join(temp_dir, "protein.pdb")
        self.write_to_file(protein_path)

        docking_viewer = DockingViewer()
        html = docking_viewer.render_highligh_residues(
            protein_data=protein_path, protein_format=protein_format, residue_ids=residues
        )

        # Without any ATOM record in the output there is nothing to display.
        if "ATOM" not in html:
            html = ""

    return list(center), warning, JupyterViewer.visualize(html)

Calculate the centroid (the unweighted geometric center) of the specified residues and visualize them. This method computes the centroid of the atoms belonging to the specified residues and provides a visualization of these residues in the protein structure.

Args

residues : List[int]: A list of 1-3 integer residue IDs to analyze (non-integer IDs raise ValueError).

Returns

tuple: A tuple containing three items: (1) list: the coordinates of the center point [x, y, z]; (2) str: a warning message if any residues were not found (an empty string if all were found); (3) IPython.display.HTML: an interactive 3D visualization of the protein with the residues highlighted.

Raises

ValueError: If the number of residues is not between 1 and 3
ValueError: If any residue ID is not an integer
ValueError: If no atoms are found for the specified residue IDs

Examples

>>> center, warning, viewer = protein.get_center_by_residues([1, 2, 3])
>>> print(center)  # [x, y, z]
>>> print(warning)  # Empty string or warning about missing residues

def list_chain_names(self) ‑> List[str]

Expand source code

def list_chain_names(self) -> List[str]:
    """
    List the distinct chain identifiers found in the protein's chain records.

    Returns:
        List[str]: The unique chain IDs, in the sorted order produced by ``np.unique``.
    """
    records = self._filter_chain_records()
    return list(np.unique(records.chain_id))

Retrieves a list of unique chain identifiers from the protein structure.

Returns

List[str]: A list of unique chain IDs present in the protein structure. Each chain ID is typically a single character that identifies a specific polypeptide chain.

def list_hetero_names(self, exclude_water=True) ‑> List[str]

Expand source code

def list_hetero_names(self, exclude_water=True) -> List[str]:
    """
    List the distinct residue names found among the structure's HETATM records.

    Args:
        exclude_water (bool): Forwarded to ``_filter_hetatm_records``; when True,
            water molecules are left out of the result. Defaults to True.

    Returns:
        List[str]: The unique hetero residue names, in the sorted order produced
        by ``np.unique``.
    """
    records = self._filter_hetatm_records(exclude_water=exclude_water)
    return list(np.unique(records.res_name))

Returns a list of unique hetero atom residue names from the structure.

Args

exclude_water : bool: If True, excludes water molecules (HOH) from the returned list. Defaults to True.

Returns

List[str]: A list of unique hetero residue names found in the structure.

Common examples include small molecules, ions, and modified amino acids.

def load_structure_from_block(self, block_content: str, block_type: str)

Expand source code

def load_structure_from_block(self, block_content: str, block_type: str):
    """
    Parse a structure from in-memory PDB or PDBQT text.

    Both accepted formats are read with ``PDBFile``; no instance state is used.

    Args:
        block_content (str): String containing the structure data in PDB/PDBQT format
        block_type (str): Format type of the block content ('pdb' or 'pdbqt')

    Returns:
        The object returned by ``PDBFile.get_structure()`` for the parsed text

    Raises:
        ValueError: If the block_type is not supported (must be 'pdb' or 'pdbqt')

    Examples:
        >>> protein = Protein(pdb_id="1abc")
        >>> pdb_content = "ATOM      1  N   ASN A   1      27.961  28.064  39.573  1.00 23.02           N"
        >>> structure = protein.load_structure_from_block(pdb_content, "pdb")
    """
    # Reject unknown formats before touching the content.
    if block_type not in ("pdb", "pdbqt"):
        raise ValueError(f"Unsupported block type: {block_type}")

    return PDBFile.read(io.StringIO(block_content)).get_structure()

Load a molecular structure from a text block.

This method reads a structure from a string content block in either PDB or PDBQT format and returns a Structure object.

Args

block_content : str: String containing the structure data in PDB/PDBQT format
block_type : str: Format type of the block content ('pdb' or 'pdbqt')

Returns

Structure: A Structure object representing the molecular structure

Raises

ValueError: If the block_type is not supported (must be 'pdb' or 'pdbqt')

Examples

>>> protein = Protein(pdb_id="1abc")
>>> pdb_content = "ATOM      1  N   ASN A   1      27.961  28.064  39.573  1.00 23.02           N"
>>> structure = protein.load_structure_from_block(pdb_content, "pdb")

def prepare(self, model_loops: bool = False, pdb_id: str = '') ‑> Protein

Expand source code

def prepare(self, model_loops: bool = False, pdb_id: str = "") -> "Protein":
    """
    Prepares the protein structure by processing metals, cofactors and optionally modeling missing loops.

    Args:
        model_loops (bool, optional): Whether to model missing loops in the structure.
            Requires a valid PDB ID if True. Defaults to False.
        pdb_id (str, optional): PDB ID of the protein structure. Required if model_loops=True
            and the instance has no ``pdb_id`` of its own. Defaults to empty string.

    Returns:
        Protein: A new Protein instance with the prepared structure. Its ``pdb_id`` is
        copied from this instance.

    Raises:
        ValueError: If model_loops is True but no PDB ID is available.
        Exception: If the preparation call returns no prepared protein content.

    Notes:
        This method:
        - Extracts metal ions and cofactors, keeping only metals listed in ``METALS``
        - Sends the structure through the module-level ``prepare`` pipeline
        - Writes the prepared structure next to the original file with a '_prep.pdb'
          suffix, or to the system temp directory as 'modified_structure_prep.pdb'
          when the instance has no file path
        - Creates and returns a new Protein instance from that file
    """
    pdb_id = pdb_id if pdb_id else self.pdb_id
    if model_loops and not pdb_id:
        raise ValueError("PDB ID must be provided to model loops.")

    metal_resnames, cofactor_resnames = self.extract_metals_and_cofactors()
    metals_to_keep = [resname for resname in metal_resnames if resname.upper() in METALS]

    # Inside a method this name resolves to the module-level ``prepare`` function.
    response = prepare(
        protein_path=self.file_path,
        protein_pdb_id=pdb_id,
        protein_extension=self.block_type,
        metal_resnames=metals_to_keep,
        cofactor_resnames=cofactor_resnames,
        model_loops=model_loops,
    )
    if not response["prepared_protein_content"]:
        raise Exception("Failed to prepare protein.")

    # Guard both the directory and the base name against a missing file path;
    # Path(None) would raise before the base-name fallback could ever apply.
    if self.file_path:
        source_path = Path(self.file_path)
        protein_dir = source_path.parent
        base_name = source_path.stem
    else:
        protein_dir = Path(tempfile.gettempdir())
        base_name = "modified_structure"
    new_file_path = protein_dir / f"{base_name}_prep.pdb"

    # Round-trip through a Protein built from the returned text to write it to disk.
    intermediate_protein = Protein(block_content=response["prepared_protein_content"], block_type="pdb")
    intermediate_protein.write_to_file(str(new_file_path))

    # The constructor documents file_path as a str, so do not hand it a Path.
    protein = Protein(file_path=str(new_file_path))
    protein.pdb_id = self.pdb_id

    return protein

Prepares the protein structure by processing metals, cofactors and optionally modeling missing loops.

Args

model_loops : bool, optional: Whether to model missing loops in the structure. Requires a valid PDB ID if True. Defaults to False.
pdb_id : str, optional: PDB ID of the protein structure. Required if model_loops=True. Defaults to empty string.

Returns

Protein: A new Protein instance with the prepared structure.

Raises

ValueError: If model_loops is True but no PDB ID is provided.
Exception: If protein preparation fails.

Notes

This method: (1) extracts and filters metal ions and cofactors from the structure; (2) processes the structure through the preparation pipeline; (3) writes the prepared structure to a new file with the '_prep.pdb' suffix; (4) creates and returns a new Protein instance with the prepared structure.

def remove_hetatm(self, keep_resnames: List[str] | None = None, remove_metals: List[str] | None = None)

Expand source code

def remove_hetatm(
    self, keep_resnames: Optional[List[str]] = None, remove_metals: Optional[List[str]] = None
) -> "Protein":
    """
    Remove HETATM records from the protein structure while allowing specific residues and metals to be kept.

    Args:
        keep_resnames (Optional[List[str]]): List of residue names to keep in the structure despite
            being HETATM records. Names are case-insensitive. Defaults to None.
        remove_metals (Optional[List[str]]): List of metal names to remove from the structure.
            By default, all metals listed in ``METALS`` are kept. Names are case-insensitive.
            Defaults to None.

    Returns:
        Protein: A new Protein object containing the filtered structure with the suffix "_no_hetatm"

    Notes:
        Kept hetero residues are matched on chain ID, residue ID, insertion code and
        residue name. Matching on the residue ID alone would also retain unrelated
        hetero residues that happen to share a number, e.g. in another chain.

    Examples:
        >>> protein.remove_hetatm(keep_resnames=['NAG', 'BMA'], remove_metals=['ZN', 'MG'])
        >>> protein.remove_hetatm()  # Removes all HETATM except metals
        >>> protein.remove_hetatm(keep_resnames=['HOH'])  # Keeps water molecules
    """
    metals = METALS
    if remove_metals:
        excluded_metals = {metal.upper() for metal in remove_metals}
        metals = list(set(METALS) - excluded_metals)

    structure = self.structure
    if not metals and not keep_resnames:
        filtered_structure = structure[~structure.hetero]
    else:
        keep_names = {res.upper() for res in keep_resnames} if keep_resnames else set()
        keep_names.update(metals)

        hetatm_to_keep = self._filter_hetatm_records(keep_resnames=list(keep_names))

        # Identify each kept residue by its full identity, not just its number.
        kept_residues = set(
            zip(
                hetatm_to_keep.chain_id.tolist(),
                hetatm_to_keep.res_id.tolist(),
                hetatm_to_keep.ins_code.tolist(),
                hetatm_to_keep.res_name.tolist(),
            )
        )
        atom_keys = zip(
            structure.chain_id.tolist(),
            structure.res_id.tolist(),
            structure.ins_code.tolist(),
            structure.res_name.tolist(),
        )
        hetatm_indices_to_keep = np.fromiter(
            (key in kept_residues for key in atom_keys), dtype=bool, count=len(structure.res_id)
        )
        filtered_structure = structure[~structure.hetero | hetatm_indices_to_keep]

    return self._create_new_protein_with_structure(filtered_structure, suffix="_no_hetatm")

Remove HETATM records from the protein structure while allowing specific residues and metals to be kept.

Args

keep_resnames : Optional[List[str]]: List of residue names to keep in the structure despite being HETATM records. Names are case-insensitive. Defaults to None.
remove_metals : Optional[List[str]]: List of metal names to remove from the structure. By default, all metals are kept. Names are case-insensitive. Defaults to None.

Returns

Protein: A new Protein object containing the filtered structure with the suffix "_no_hetatm"

Examples

>>> protein.remove_hetatm(keep_resnames=['NAG', 'BMA'], remove_metals=['ZN', 'MG'])
>>> protein.remove_hetatm()  # Removes all HETATM except metals
>>> protein.remove_hetatm(keep_resnames=['HOH'])  # Keeps water molecules

def remove_resnames(self, exclude_resnames: List[str] | None = None) ‑> Protein

Expand source code

def remove_resnames(self, exclude_resnames: Optional[List[str]] = None) -> "Protein":
    """
    Build a new protein without the residues whose names are listed.

    Names are compared exactly as given against the structure's ``res_name`` values.

    Args:
        exclude_resnames (List[str], optional): Residue names to drop from the structure.
            If None, the structure is copied unchanged.

    Returns:
        Protein: The result of ``_create_new_protein_with_structure`` for the filtered
        structure, created with the '_resnames_removed' suffix.

    Examples:
        >>> protein.remove_resnames(['ALA', 'GLY'])  # removes all alanine and glycine residues
        >>> protein.remove_resnames()  # returns a copy of the protein
    """
    if exclude_resnames is None:
        remaining = self.structure.copy()
    else:
        drop_mask = np.isin(self.structure.res_name, exclude_resnames)
        remaining = self.structure[~drop_mask]
    return self._create_new_protein_with_structure(remaining, suffix="_resnames_removed")

Remove residues from the protein structure based on their residue names.

Args

exclude_resnames : List[str], optional: List of residue names to exclude from the structure.

If None, returns a copy of the original structure.

Returns

Protein: A new Protein instance with specified residues removed.

The new instance has '_resnames_removed' suffix added to its name.

Examples

>>> protein.remove_resnames(['ALA', 'GLY'])  # removes all alanine and glycine residues
>>> protein.remove_resnames()  # returns a copy of the protein

def remove_water(self) ‑> Protein

Expand source code

def remove_water(self) -> "Protein":
    """
    Build a new protein without solvent atoms.

    Every atom flagged by ``filter_solvent`` (water and any other solvent it
    recognizes) is dropped from the structure.

    Returns:
        Protein: The result of ``_create_new_protein_with_structure`` for the filtered
                structure, created with the '_no_water' suffix.
    """
    solvent_mask = filter_solvent(self.structure)
    without_solvent = self.structure[~solvent_mask]
    return self._create_new_protein_with_structure(without_solvent, suffix="_no_water")

Removes water molecules from the protein structure.

This method filters out solvent molecules (including water) from the protein structure using a predefined solvent filter.

Returns

Protein: A new Protein instance with water molecules removed. The new instance will have the suffix '_no_water' appended to its name.

def select_chain(self, chain_id: str) ‑> Protein | None

Expand source code

def select_chain(self, chain_id: str) -> Optional["Protein"]:
    """
    Select a single chain from the protein structure and return it as a new Protein.

    Args:
        chain_id (str): The identifier of the chain to be selected.

    Returns:
        Protein: A new Protein object containing only the selected chain, created with
            the suffix ``'_chain_<chain_id>'``. The return annotation is kept as
            ``Optional`` for backward compatibility, but ``None`` is never returned:
            a missing chain raises ``ValueError`` instead.

    Raises:
        ValueError: If no records are found for ``chain_id`` in the protein structure.

    Example:
        >>> protein = Protein(file_path="1abc.pdb")
        >>> chain_a = protein.select_chain("A")
    """
    chain_records = self._filter_chain_records(chain_ids=[chain_id])
    # Guard clause: an empty selection means the chain does not exist.
    if len(chain_records) == 0:
        raise ValueError(f"Chain {chain_id} not found.")
    return self._create_new_protein_with_structure(chain_records, suffix=f"_chain_{chain_id}")

Selects a specific chain from the protein structure and returns a new Protein object.

Args

chain_id : str: The identifier of the chain to be selected.

Returns

Optional[Protein]: A new Protein object containing only the selected chain. In practice None is never returned: a missing chain raises ValueError instead (see Raises).

Raises

ValueError: If the specified chain_id is not found in the protein structure.

Example

>>> protein = Protein("1abc.pdb")
>>> chain_a = protein.select_chain("A")

def select_chains(self, chain_ids: List[str]) ‑> Protein

Expand source code

def select_chains(self, chain_ids: List[str]) -> "Protein":
    """
    Build a new Protein restricted to the requested chains.

    Args:
        chain_ids (List[str]): Identifiers of the chains to keep.

    Returns:
        Protein: A new Protein object holding only the records of the requested chains.
            The suffix passed for the new instance is ``'_chains_'`` followed by the
            chain IDs joined with underscores.

    Raises:
        ValueError: If the selection for the provided chain IDs is empty.

    Example:
        >>> protein.select_chains(['A', 'B'])
        # Returns a new Protein object with only chains A and B
    """
    selected = self._filter_chain_records(chain_ids=chain_ids)
    if len(selected) == 0:
        raise ValueError(f"No chains found for the provided chain IDs: {chain_ids}")

    # The suffix is built only after the emptiness check, as in the original flow.
    joined_ids = "_".join(chain_ids)
    return self._create_new_protein_with_structure(selected, suffix=f"_chains_{joined_ids}")

Select specific chains from the protein structure and create a new Protein object.

Args

chain_ids : List[str]: A list of chain identifiers to select from the protein structure.

Returns

Protein: A new Protein object containing only the selected chains.

Raises

ValueError: If no chains are found for the provided chain IDs.

Example

>>> protein.select_chains(['A', 'B'])
# Returns a new Protein object with only chains A and B

def select_ligand(self, res_name: str) ‑> List[Ligand]

Expand source code

def select_ligand(self, res_name: str) -> List["Ligand"]:
    """
    Select every occurrence of a ligand in the protein structure by residue name.

    The hetero atoms whose residue name equals ``res_name`` are grouped by
    ``(chain_id, res_id, ins_code)``, and one Ligand object is built per group.

    Args:
        res_name (str): The residue name of the ligand to select.

    Returns:
        List[Ligand]: One Ligand per unique residue occurrence, in the order the
            occurrences first appear in the structure. Occurrences whose PDB block
            could not be converted to SDF are skipped (an error is logged), so the
            list may be empty; ``None`` is never returned.

    Raises:
        ValueError: If ``res_name`` is not among the names returned by
            ``list_hetero_names()``, or if no atoms are found for the ligand.

    Notes:
        - A SMILES string is first requested via ``Ligand.fetch_smiles_from_pdb_api``.
        - If that fails (or yields nothing), each occurrence's PDB block is converted
          to SDF with ``convert_block``; the accompanying log message indicates this
          step relies on Open Babel.
        - When a SMILES string is available, bond orders are assigned from it.

    Example:
        >>> protein = Protein(file_path="1abc.pdb")
        >>> ligands = protein.select_ligand("ATP")
        >>> print(len(ligands))  # Number of ATP molecules in structure
    """
    hetero_names = self.list_hetero_names()
    if res_name not in hetero_names:
        raise ValueError(f"Residue {res_name} not found. Available ligands are: {hetero_names}")

    ligand_atoms = self.structure[(self.structure.res_name == res_name) & self.structure.hetero]
    if len(ligand_atoms) == 0:
        raise ValueError(f"No atoms found for ligand {res_name}.")

    # Best effort: a failed lookup is not fatal, the SDF conversion path is used instead.
    try:
        smiles = Ligand.fetch_smiles_from_pdb_api(res_name)
    except Exception:
        DEFAULT_LOGGER.log_warning(f"Failed to fetch SMILES for {res_name}.")
        smiles = None
    else:
        # A successful lookup is informational, not a warning.
        DEFAULT_LOGGER.log_info(f"SMILES for {res_name}: {smiles}")

    # dict.fromkeys de-duplicates while keeping first-seen order, so the returned
    # ligands come out in a deterministic order (a plain set would not guarantee that).
    residue_keys = dict.fromkeys(zip(ligand_atoms.chain_id, ligand_atoms.res_id, ligand_atoms.ins_code))

    ligands = []
    for chain_id, res_id, ins_code in residue_keys:
        mask = (
            (ligand_atoms.chain_id == chain_id)
            & (ligand_atoms.res_id == res_id)
            & (ligand_atoms.ins_code == ins_code)
        )
        ligand_group = ligand_atoms[mask]

        # Serialize this single occurrence to an in-memory PDB block.
        pdb_file = PDBFile()
        pdb_file.set_structure(ligand_group)
        pdb_block = io.StringIO()
        pdb_file.write(pdb_block)
        block_content, block_type = pdb_block.getvalue(), "pdb"

        if not smiles:
            DEFAULT_LOGGER.log_warning(f"PROCEEDING WITH OPEN BABEL TO EXTRACT SMILES FOR {res_name}")
            try:
                block_content = convert_block("pdb", block_content, "sdf")
            except Exception:
                DEFAULT_LOGGER.log_error("Failed to convert block to SDF. Please provide smiles manually.")
                # Skip only this occurrence; returning None here would discard the
                # ligands already built and break callers that expect a list.
                continue
            block_type = "sdf"

        ligand = Ligand(
            block_content=block_content,
            block_type=block_type,
            name=res_name,
            xref_protein=self,
            xref_ins_code=ins_code,
            xref_residue_id=res_id,
            xref_protein_chain_id=chain_id,
        )

        if smiles:
            ligand.mol.assign_bond_order_from_smiles(smiles)

        ligands.append(ligand)

    return ligands

Selects and processes ligands from the protein structure based on the residue name.

This method identifies ligand atoms in the structure, attempts to fetch their SMILES representation, and creates Ligand objects for each unique residue occurrence.

Args

res_name : str: The residue name of the ligand to select.

Returns

List[Ligand]: A list of Ligand objects, each representing a unique instance of the specified ligand in the structure.

Raises

ValueError: If the specified residue name is not found in the structure's hetero residues, or if no atoms are found for the specified ligand.

Notes

The method first attempts to fetch SMILES from PDB API
If SMILES fetch fails, it attempts to use OpenBabel for SMILES extraction
Each Ligand object contains the atomic coordinates and chemical structure information
Bond orders are assigned using SMILES when available

Example

>>> protein = Protein("1abc.pdb")
>>> ligands = protein.select_ligand("ATP")
>>> print(len(ligands))  # Number of ATP molecules in structure

def select_ligands(self, res_names: List[str]) ‑> List[Ligand]

Expand source code

def select_ligands(self, res_names: Optional[List[str]] = None) -> List["Ligand"]:
    """
    Select ligands for several residue names and return them as one flat list.

    Args:
        res_names (Optional[List[str]]): Residue names to select ligands for.
            If None (the default), the names returned by ``list_hetero_names()`` are used.

    Returns:
        List["Ligand"]: The Ligand objects collected from ``select_ligand`` for each
            residue name, in the order the names were given. Returns an empty list
            if no matching ligands are found.

    Note:
        If ``select_ligand`` raises ``ValueError`` for a residue name, the message is
        logged as a warning and the selection continues with the remaining names.
        A residue name for which ``select_ligand`` yields nothing contributes no ligands.
    """
    if res_names is None:
        res_names = self.list_hetero_names()

    ligands = []
    for res_name in res_names:
        try:
            found = self.select_ligand(res_name)
        except ValueError as e:
            DEFAULT_LOGGER.log_warning(str(e))
            continue
        # select_ligand may come back empty-handed (None or an empty list);
        # extending with None would raise an uncaught TypeError.
        if found:
            ligands.extend(found)
    return ligands

Selects and returns a list of ligands based on their residue names.

Args

res_names : List[str]: A list of residue names to select ligands for. If None, all heterogeneous residue names will be used.

Returns

List["Ligand"]: A list of Ligand objects matching the specified residue names. Returns an empty list if no matching ligands are found.

Note

If a residue name is not found, a warning will be logged and the selection will continue with the remaining residue names.

def select_structure(self, index: int)

Expand source code

def select_structure(self, index: int):
    """
    Return the entry of ``self.structure`` located at ``index``.

    Args:
        index (int): Zero-based position of the structure to pick.

    Returns:
        The element obtained by indexing ``self.structure`` with ``index``.

    Raises:
        ValueError: If ``index`` is negative or not smaller than ``len(self.structure)``.
    """
    available = self.structure
    total = len(available)

    # Reject anything outside [0, total) before indexing.
    if index >= total or index < 0:
        raise ValueError(f"Invalid structure index {index}. Total structures: {total}")

    return available[index]

Selects a specific structure from the list of available structures.

Args

index : int: The index of the structure to select.

Returns

The selected structure at the specified index.

Raises

ValueError: If the index is out of bounds (negative or >= length of structures).

def update_coordinates(self, coords: numpy.ndarray)

Expand source code

def update_coordinates(self, coords: np.ndarray):
    """
    Overwrite the coordinates of the current structure in place.

    Args:
        coords (np.ndarray): Coordinates to store on the structure. They are expected
            to have the same shape as the coordinates being replaced.

    Returns:
        None

    Notes:
        The structure held by this object is mutated directly (no copy is made), and
        an info-level message is logged once the assignment has been performed.
    """
    target = self.structure
    # Assign first, log afterwards: the message is only emitted if the assignment succeeded.
    target.coord = coords
    DEFAULT_LOGGER.log_info("Protein coordinates has been inplaced updated.")

Updates the coordinates of the protein structure in-place.

Args

coords : np.ndarray: New coordinates to be assigned to the protein structure. Should match the shape of the existing coordinates.

Returns

None

Notes

This method modifies the protein structure coordinates directly and logs the update.

def visualize(*args, **kwargs)

Expand source code

def wrapper(*args, **kwargs):
    # Forward all arguments to the wrapped function, then hand whatever it
    # produced to JupyterViewer.visualize and return that call's result.
    return JupyterViewer.visualize(func(*args, **kwargs))

def write_to_file(self, file_path: str)

Expand source code

def write_to_file(self, file_path: str):
    """
    Write the protein structure to a PDB file.

    The current structure is loaded into a ``PDBFile`` object, which is then written
    to ``file_path``.

    Args:
        file_path (str): The path where the PDB file should be written.

    Returns:
        None

    Notes:
        This method is best-effort and does not raise on failure: any exception
        raised while building or writing the file is caught and reported through
        the logger at error level. On success an info-level message is logged.

    Example:
        >>> protein.write_to_file("/path/to/output.pdb")
    """
    try:
        pdb_file = PDBFile()
        pdb_file.set_structure(self.structure)
        pdb_file.write(file_path)
    except Exception as e:
        # Deliberately swallowed: callers rely on this method not raising.
        DEFAULT_LOGGER.log_error(f"Failed to write structure to file {file_path}: {e}")
    else:
        DEFAULT_LOGGER.log_info(f"Current structure written to {file_path}.")

Write the protein structure to a PDB file.

This method writes the current protein structure to a specified file path in PDB format.

Args

file_path : str: The path where the PDB file should be written.

Raises

No exception is propagated to the caller: if there is an error writing the structure to the file, it is caught and logged as an error.

Example

>>> protein.write_to_file("/path/to/output.pdb")