Source code for htpolynet.io.mol2

"""Sybyl MOL2 file reader and writer.

Author: Cameron F. Abrams <cfa22@drexel.edu>
"""
import logging

from io import StringIO

import numpy as np
import pandas as pd

from ..geometry.bondlist import Bondlist

logger = logging.getLogger(__name__)

MOL2_ATOM_ATTRIBUTES = [
    'globalIdx', 'atomName', 'posX', 'posY', 'posZ',
    'type', 'resNum', 'resName', 'charge',
]
MOL2_BOND_ATTRIBUTES = ['bondIdx', 'ai', 'aj', 'order']
MOL2_BOND_TYPES = {k: v for k, v in zip(MOL2_BOND_ATTRIBUTES, [int, int, int, str])}



[docs]
def read(filename):
    """Parse a Sybyl MOL2 file.

    Only MOLECULE, ATOM, and BOND sections are read.
    All lengths are converted from Ångström to nm.

    Args:
        filename (str): path to the .mol2 file

    Returns:
        dict: parsed data with keys:
            - 'name'         (str)
            - 'N'            (int): atom count
            - 'metadat'      (dict): nBonds, nSubs, nFeatures, nSets, mol2type, mol2chargetype
            - 'A'            (pd.DataFrame): atom data frame (positions in nm)
            - 'mol2_bonds'   (pd.DataFrame)
            - 'mol2_bondlist' (Bondlist)
    """
    with open(filename, 'r') as f:
        rawsections = f.read().split('@<TRIPOS>')[1:]

    sections = {}
    for rs in rawsections:
        s = rs.split('\n')
        key = s[0].strip().lower()
        val = [a.strip() for a in s[1:] if len(a) > 0]
        if key == 'atom' or key == 'bond':
            val = StringIO('\n'.join(val))
        sections[key] = val

    name = sections['molecule'][0]
    imetadat = list(map(int, sections['molecule'][1].strip().split()))
    metadat = {
        'N':            imetadat[0],
        'nBonds':       imetadat[1],
        'nSubs':        imetadat[2],
        'nFeatures':    imetadat[3],
        'nSets':        imetadat[4],
        'mol2type':     sections['molecule'][2],
        'mol2chargetype': sections['molecule'][3],
    }
    N = metadat['N']

    A = pd.read_csv(sections['atom'], sep=r'\s+', names=MOL2_ATOM_ATTRIBUTES)
    A[['posX', 'posY', 'posZ']] *= [0.1, 0.1, 0.1]   # Å → nm
    N = A.shape[0]

    mol2_bonds = pd.read_csv(
        sections['bond'], sep=r'\s+',
        names=MOL2_BOND_ATTRIBUTES, dtype=MOL2_BOND_TYPES,
    )
    # Ensure ai < aj in every bond record
    for i, r in mol2_bonds.iterrows():
        ai, aj = r['ai'], r['aj']
        if aj < ai:
            logger.debug(f'mol2 bonds swapping {ai} and {aj}')
            mol2_bonds.iloc[i, mol2_bonds.columns == 'ai'] = aj
            mol2_bonds.iloc[i, mol2_bonds.columns == 'aj'] = ai

    mol2_bondlist = Bondlist.fromDataFrame(mol2_bonds)

    return {
        'name': name,
        'N': N,
        'metadat': metadat,
        'A': A,
        'mol2_bonds': mol2_bonds,
        'mol2_bondlist': mol2_bondlist,
    }




[docs]
def write(filename, A, mol2_bonds, metadat, name,
          bondsDF=None, molname='', other_attributes=None):
    """Write a Sybyl MOL2 file.

    All coordinates are written in Ångström (positions in A are assumed to be in nm
    and are multiplied by 10 after centering).

    Args:
        filename (str): path to write
        A (pd.DataFrame): atom data frame
        mol2_bonds (pd.DataFrame): internal mol2 bonds (may be empty)
        metadat (dict): molecule metadata (nFeatures, nSets, mol2type, mol2chargetype …)
        name (str): molecule name used when molname is empty
        bondsDF (pd.DataFrame or None): caller-supplied bonds, overrides mol2_bonds when present
        molname (str): explicit molecule name for the MOLECULE section, defaults to ''
        other_attributes (pd.DataFrame or None): additional per-atom columns to merge into A
    """
    if bondsDF is None:
        bondsDF = pd.DataFrame()
    if other_attributes is None:
        other_attributes = pd.DataFrame()

    acopy = A.copy()

    # Determine which bond table to use
    if bondsDF.empty and (mol2_bonds is None or mol2_bonds.empty):
        logger.warning(f'Cannot write any bonds to MOL2 file {filename}')
        bdf = pd.DataFrame()
    elif (mol2_bonds is not None and not mol2_bonds.empty) and bondsDF.empty:
        bdf = mol2_bonds
    elif (not bondsDF.empty) and (mol2_bonds is None or mol2_bonds.empty):
        bdf = bondsDF
    else:
        logger.info('write_mol2 provided with both a bondsDF parameter and a mol2_bonds attribute')
        logger.info('Using the parameter')
        bdf = bondsDF

    for col in other_attributes.columns:
        acopy[col] = other_attributes[col]

    # Geometric centre for centering before writing
    com = np.array([acopy.posX.mean(), acopy.posY.mean(), acopy.posZ.mean()])

    atomformatters = [
        lambda x: f'{x:>7d}',
        lambda x: f'{x:<8s}',
        lambda x: f'{x:>9.4f}',
        lambda x: f'{x:>9.4f}',
        lambda x: f'{x:>9.4f}',
        lambda x: f'{x:<5s}',
        lambda x: f'{x:>3d}',
        lambda x: f' {x:<7s}',
        lambda x: f'{x:>9.4f}',
    ]
    bondformatters = [
        lambda x: f'{x:>6d}',
        lambda x: f'{x:>5d}',
        lambda x: f'{x:>5d}',
        lambda x: f'{str(x):>4s}',
    ]
    substructureformatters = [
        lambda x: f'{x:>6d}',
        lambda x: f'{x:<7s}',
        lambda x: f'{x:>6d}',
        lambda x: f'{x:<7s}',
    ]

    rdf = acopy[['resNum', 'resName']].copy().drop_duplicates()
    rdf['rootatom'] = [1] * len(rdf)
    rdf['residue'] = ['RESIDUE'] * len(rdf)

    N = acopy.shape[0]
    nBonds = bdf.shape[0]
    nSubs = len(rdf)

    with open(filename, 'w') as f:
        f.write('@<TRIPOS>MOLECULE\n')
        f.write(f'{molname if molname else name}\n')
        f.write('{:>6d}{:>6d}{:>3d}{:>3d}{:>3d}\n'.format(
            N, nBonds, nSubs,
            metadat.get('nFeatures', 0),
            metadat.get('nSets', 0),
        ))
        f.write(f"{metadat.get('mol2type', 'SMALL')}\n")
        f.write(f"{metadat.get('mol2chargetype', 'GASTEIGER')}\n")
        f.write('\n')
        f.write('@<TRIPOS>ATOM\n')
        # Convert nm → Å and centre
        pos = (acopy.loc[:, ['posX', 'posY', 'posZ']] - com) * 10.0
        acopy.loc[:, ['posX', 'posY', 'posZ']] = pos
        f.write(acopy.to_string(
            columns=MOL2_ATOM_ATTRIBUTES, header=False, index=False,
            formatters=atomformatters,
        ))
        f.write('\n')
        f.write('@<TRIPOS>BOND\n')
        if not bondsDF.empty:
            logger.debug('Mol2 bonds from outside')
            out_bdf = bondsDF[['bondIdx', 'ai', 'aj', 'order']].copy()
            out_bdf['bondIdx'] = out_bdf['bondIdx'].astype(int)
            out_bdf['ai'] = out_bdf['ai'].astype(int)
            out_bdf['aj'] = out_bdf['aj'].astype(int)
            f.write(out_bdf.to_string(
                columns=MOL2_BOND_ATTRIBUTES, header=False, index=False,
                formatters=bondformatters,
            ))
        elif mol2_bonds is not None and not mol2_bonds.empty:
            logger.debug(f'write_mol2 ({filename}): Mol2 bonds from mol2_bonds attribute')
            f.write(mol2_bonds.to_string(
                columns=MOL2_BOND_ATTRIBUTES, header=False, index=False,
                formatters=bondformatters,
            ))
        f.write('\n')
        f.write('@<TRIPOS>SUBSTRUCTURE\n')
        f.write(rdf.to_string(header=False, index=False, formatters=substructureformatters))
        f.write('\n')