"""Sybyl MOL2 file reader and writer.
Author: Cameron F. Abrams <cfa22@drexel.edu>
"""
import logging
from io import StringIO
import numpy as np
import pandas as pd
from ..geometry.bondlist import Bondlist
logger = logging.getLogger(__name__)
MOL2_ATOM_ATTRIBUTES = [
'globalIdx', 'atomName', 'posX', 'posY', 'posZ',
'type', 'resNum', 'resName', 'charge',
]
MOL2_BOND_ATTRIBUTES = ['bondIdx', 'ai', 'aj', 'order']
MOL2_BOND_TYPES = {k: v for k, v in zip(MOL2_BOND_ATTRIBUTES, [int, int, int, str])}
[docs]
def read(filename):
"""Parse a Sybyl MOL2 file.
Only MOLECULE, ATOM, and BOND sections are read.
All lengths are converted from Ångström to nm.
Args:
filename (str): path to the .mol2 file
Returns:
dict: parsed data with keys:
- 'name' (str)
- 'N' (int): atom count
- 'metadat' (dict): nBonds, nSubs, nFeatures, nSets, mol2type, mol2chargetype
- 'A' (pd.DataFrame): atom data frame (positions in nm)
- 'mol2_bonds' (pd.DataFrame)
- 'mol2_bondlist' (Bondlist)
"""
with open(filename, 'r') as f:
rawsections = f.read().split('@<TRIPOS>')[1:]
sections = {}
for rs in rawsections:
s = rs.split('\n')
key = s[0].strip().lower()
val = [a.strip() for a in s[1:] if len(a) > 0]
if key == 'atom' or key == 'bond':
val = StringIO('\n'.join(val))
sections[key] = val
name = sections['molecule'][0]
imetadat = list(map(int, sections['molecule'][1].strip().split()))
metadat = {
'N': imetadat[0],
'nBonds': imetadat[1],
'nSubs': imetadat[2],
'nFeatures': imetadat[3],
'nSets': imetadat[4],
'mol2type': sections['molecule'][2],
'mol2chargetype': sections['molecule'][3],
}
N = metadat['N']
A = pd.read_csv(sections['atom'], sep=r'\s+', names=MOL2_ATOM_ATTRIBUTES)
A[['posX', 'posY', 'posZ']] *= [0.1, 0.1, 0.1] # Å → nm
N = A.shape[0]
mol2_bonds = pd.read_csv(
sections['bond'], sep=r'\s+',
names=MOL2_BOND_ATTRIBUTES, dtype=MOL2_BOND_TYPES,
)
# Ensure ai < aj in every bond record
for i, r in mol2_bonds.iterrows():
ai, aj = r['ai'], r['aj']
if aj < ai:
logger.debug(f'mol2 bonds swapping {ai} and {aj}')
mol2_bonds.iloc[i, mol2_bonds.columns == 'ai'] = aj
mol2_bonds.iloc[i, mol2_bonds.columns == 'aj'] = ai
mol2_bondlist = Bondlist.fromDataFrame(mol2_bonds)
return {
'name': name,
'N': N,
'metadat': metadat,
'A': A,
'mol2_bonds': mol2_bonds,
'mol2_bondlist': mol2_bondlist,
}
[docs]
def write(filename, A, mol2_bonds, metadat, name,
bondsDF=None, molname='', other_attributes=None):
"""Write a Sybyl MOL2 file.
All coordinates are written in Ångström (positions in A are assumed to be in nm
and are multiplied by 10 after centering).
Args:
filename (str): path to write
A (pd.DataFrame): atom data frame
mol2_bonds (pd.DataFrame): internal mol2 bonds (may be empty)
metadat (dict): molecule metadata (nFeatures, nSets, mol2type, mol2chargetype …)
name (str): molecule name used when molname is empty
bondsDF (pd.DataFrame or None): caller-supplied bonds, overrides mol2_bonds when present
molname (str): explicit molecule name for the MOLECULE section, defaults to ''
other_attributes (pd.DataFrame or None): additional per-atom columns to merge into A
"""
if bondsDF is None:
bondsDF = pd.DataFrame()
if other_attributes is None:
other_attributes = pd.DataFrame()
acopy = A.copy()
# Determine which bond table to use
if bondsDF.empty and (mol2_bonds is None or mol2_bonds.empty):
logger.warning(f'Cannot write any bonds to MOL2 file {filename}')
bdf = pd.DataFrame()
elif (mol2_bonds is not None and not mol2_bonds.empty) and bondsDF.empty:
bdf = mol2_bonds
elif (not bondsDF.empty) and (mol2_bonds is None or mol2_bonds.empty):
bdf = bondsDF
else:
logger.info('write_mol2 provided with both a bondsDF parameter and a mol2_bonds attribute')
logger.info('Using the parameter')
bdf = bondsDF
for col in other_attributes.columns:
acopy[col] = other_attributes[col]
# Geometric centre for centering before writing
com = np.array([acopy.posX.mean(), acopy.posY.mean(), acopy.posZ.mean()])
atomformatters = [
lambda x: f'{x:>7d}',
lambda x: f'{x:<8s}',
lambda x: f'{x:>9.4f}',
lambda x: f'{x:>9.4f}',
lambda x: f'{x:>9.4f}',
lambda x: f'{x:<5s}',
lambda x: f'{x:>3d}',
lambda x: f' {x:<7s}',
lambda x: f'{x:>9.4f}',
]
bondformatters = [
lambda x: f'{x:>6d}',
lambda x: f'{x:>5d}',
lambda x: f'{x:>5d}',
lambda x: f'{str(x):>4s}',
]
substructureformatters = [
lambda x: f'{x:>6d}',
lambda x: f'{x:<7s}',
lambda x: f'{x:>6d}',
lambda x: f'{x:<7s}',
]
rdf = acopy[['resNum', 'resName']].copy().drop_duplicates()
rdf['rootatom'] = [1] * len(rdf)
rdf['residue'] = ['RESIDUE'] * len(rdf)
N = acopy.shape[0]
nBonds = bdf.shape[0]
nSubs = len(rdf)
with open(filename, 'w') as f:
f.write('@<TRIPOS>MOLECULE\n')
f.write(f'{molname if molname else name}\n')
f.write('{:>6d}{:>6d}{:>3d}{:>3d}{:>3d}\n'.format(
N, nBonds, nSubs,
metadat.get('nFeatures', 0),
metadat.get('nSets', 0),
))
f.write(f"{metadat.get('mol2type', 'SMALL')}\n")
f.write(f"{metadat.get('mol2chargetype', 'GASTEIGER')}\n")
f.write('\n')
f.write('@<TRIPOS>ATOM\n')
# Convert nm → Å and centre
pos = (acopy.loc[:, ['posX', 'posY', 'posZ']] - com) * 10.0
acopy.loc[:, ['posX', 'posY', 'posZ']] = pos
f.write(acopy.to_string(
columns=MOL2_ATOM_ATTRIBUTES, header=False, index=False,
formatters=atomformatters,
))
f.write('\n')
f.write('@<TRIPOS>BOND\n')
if not bondsDF.empty:
logger.debug('Mol2 bonds from outside')
out_bdf = bondsDF[['bondIdx', 'ai', 'aj', 'order']].copy()
out_bdf['bondIdx'] = out_bdf['bondIdx'].astype(int)
out_bdf['ai'] = out_bdf['ai'].astype(int)
out_bdf['aj'] = out_bdf['aj'].astype(int)
f.write(out_bdf.to_string(
columns=MOL2_BOND_ATTRIBUTES, header=False, index=False,
formatters=bondformatters,
))
elif mol2_bonds is not None and not mol2_bonds.empty:
logger.debug(f'write_mol2 ({filename}): Mol2 bonds from mol2_bonds attribute')
f.write(mol2_bonds.to_string(
columns=MOL2_BOND_ATTRIBUTES, header=False, index=False,
formatters=bondformatters,
))
f.write('\n')
f.write('@<TRIPOS>SUBSTRUCTURE\n')
f.write(rdf.to_string(header=False, index=False, formatters=substructureformatters))
f.write('\n')