Source code for htpolynet.io.pdb

"""PDB file reader for ATOM/HETATM coordinates and CONECT-record bond orders.

Interprets repeated CONECT entries for a given atom pair as higher-order bonds:
  a partner listed twice  → double bond (order '2')
  a partner listed three times → triple bond (order '3')

Author: Cameron F. Abrams <cfa22@drexel.edu>
"""
import logging
from collections import defaultdict

import numpy as np
import pandas as pd

from ..geometry.bondlist import Bondlist
from ..io.mol2 import MOL2_ATOM_ATTRIBUTES, MOL2_BOND_ATTRIBUTES, MOL2_BOND_TYPES

logger = logging.getLogger(__name__)

# Fallback SYBYL types by element; antechamber will reassign GAFF types.
_ELEMENT_TO_SYBYL = {
    'C':  'C.3',
    'N':  'N.3',
    'O':  'O.3',
    'S':  'S.3',
    'P':  'P.3',
    'H':  'H',
    'F':  'F',
    'CL': 'Cl',
    'BR': 'Br',
    'I':  'I',
}


def _element_from_pdb_line(line, atom_name):
    """Return the element symbol from columns 77-78 or by stripping digits from atom_name."""
    if len(line) >= 78:
        elem = line[76:78].strip()
        if elem:
            return elem.capitalize()
    # fallback: first alphabetic character(s) in atom_name
    stripped = ''.join(c for c in atom_name if c.isalpha())
    return stripped[:2].capitalize() if len(stripped) >= 2 else stripped[:1].upper()


[docs] def read(filename): """Parse a PDB file into coordinate and bond data compatible with mol2 conventions. Bond orders are inferred from CONECT records: if the same bonded-atom serial appears N times in one CONECT record, that bond has order N. Positions are converted from Ångström (PDB) to nm (htpolynet internal). Args: filename (str): path to the PDB file Returns: dict with keys: 'name' (str) 'N' (int): atom count 'metadat' (dict): mol2-compatible metadata 'A' (pd.DataFrame): atom data with MOL2_ATOM_ATTRIBUTES columns 'mol2_bonds' (pd.DataFrame): bond table with MOL2_BOND_ATTRIBUTES columns 'mol2_bondlist' (Bondlist) """ atoms = [] # bond_order[(ai, aj)] = max repetitions seen from either atom's CONECT record, ai < aj bond_order = defaultdict(int) with open(filename) as f: for line in f: record = line[:6].strip() if record in ('ATOM', 'HETATM'): serial = int(line[6:11]) name = line[12:16].strip() resname = line[17:20].strip() resseq = int(line[22:26]) x = float(line[30:38]) y = float(line[38:46]) z = float(line[46:54]) elem = _element_from_pdb_line(line, name) sybyl = _ELEMENT_TO_SYBYL.get(elem.upper(), elem) atoms.append({ 'globalIdx': serial, 'atomName': name, 'posX': x * 0.1, 'posY': y * 0.1, 'posZ': z * 0.1, 'type': sybyl, 'resNum': resseq, 'resName': resname, 'charge': 0.0, }) elif record == 'CONECT': src = int(line[6:11]) partners = [] for col in [11, 16, 21, 26]: if len(line) > col + 4: fld = line[col:col + 5].strip() if fld: try: partners.append(int(fld)) except ValueError: pass # Count how many times each partner appears in this line per_partner = defaultdict(int) for p in partners: per_partner[p] += 1 for p, cnt in per_partner.items(): key = (min(src, p), max(src, p)) bond_order[key] = max(bond_order[key], cnt) if not atoms: raise ValueError(f'No ATOM/HETATM records found in {filename}') A = pd.DataFrame(atoms) bonds = [] for idx, ((ai, aj), order) in enumerate(sorted(bond_order.items()), 1): bonds.append({ 'bondIdx': idx, 'ai': ai, 'aj': aj, 'order': str(order), }) mol2_bonds = pd.DataFrame(bonds).astype(MOL2_BOND_TYPES) if bonds else pd.DataFrame( columns=MOL2_BOND_ATTRIBUTES ) if bonds: logger.debug(f'pdb.read {filename}: {len(atoms)} atoms, {len(bonds)} bonds ' f'({sum(1 for b in bonds if b["order"] != "1")} non-single)') mol2_bondlist = Bondlist.fromDataFrame(mol2_bonds) name = atoms[0]['resName'] if atoms else '' metadat = { 'N': len(atoms), 'nBonds': len(bonds), 'nSubs': 1, 'nFeatures': 0, 'nSets': 0, 'mol2type': 'SMALL', 'mol2chargetype': 'GASTEIGER', } return { 'name': name, 'N': len(atoms), 'metadat': metadat, 'A': A, 'mol2_bonds': mol2_bonds, 'mol2_bondlist': mol2_bondlist, }