Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Implementing gemmi-based mmcif reader (with easy extension to PDB/PDBx and mmJSON) #4712

Draft
wants to merge 14 commits into
base: develop
Choose a base branch
from
Draft
38 changes: 38 additions & 0 deletions package/MDAnalysis/coordinates/MMCIF.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import numpy as np
import gemmi
import logging
from . import base

logger = logging.getLogger("MDAnalysis.coordinates.MMCIF")


class MMCIFReader(base.SingleFrameReaderBase):
"""Reads from an MMCIF file"""

format = "MMCIF"
units = {"time": None, "length": "Angstrom"}

def _read_first_frame(self):
structure = gemmi.read_structure(self.filename)
coords = np.array(
[
[*at.pos.tolist()]
for model in structure
for chain in model
for res in chain
for at in res
]
)
self.n_atoms = len(coords)
self.ts = self._Timestep.from_coordinates(coords, **self._ts_kwargs)
self.ts.frame = 0

def Writer(self, filename, n_atoms=None, **kwargs):
raise NotImplementedError

def close(self):
pass


class MMCIFWriter(base.WriterBase):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't include this at this stage, Writer is optional

pass
1 change: 1 addition & 0 deletions package/MDAnalysis/coordinates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,3 +791,4 @@ class can choose an appropriate reader automatically.
from . import NAMDBIN
from . import FHIAIMS
from . import TNG
from . import MMCIF
178 changes: 178 additions & 0 deletions package/MDAnalysis/topology/MMCIFParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
"""
MMCIF Topology Parser #
===================
"""

import gemmi
import numpy as np
import warnings
import itertools

from ..core.topology import Topology
from ..core.topologyattrs import (
AltLocs,
Atomids,
Atomnames,
Atomtypes,
ChainIDs,
Elements,
FormalCharges,
ICodes,
Masses,
Occupancies,
RecordTypes,
Resids,
Resnames,
Resnums,
Segids,
Tempfactors,
)
from .base import TopologyReaderBase


def _into_idx(arr: list[int]) -> list[int]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Document what this does, ideally with an example

return [idx for idx, (_, group) in enumerate(itertools.groupby(arr)) for _ in group]


class MMCIFParser(TopologyReaderBase):
format = "MMCIF"

def parse(self, **kwargs):
"""Read the file and return the structure.

Returns
-------
MDAnalysis Topology object
"""
structure = gemmi.read_structure(self.filename)

if len(structure) > 1:
warnings.warn(
"MMCIF model {self.filename} contains {len(model)=} different models, "
"but only the first one will be used to assign the topology"
)
model = structure[0]

# atom properties
(
altlocs, # at.altloc
serials, # at.serial
names, # at.name
atomtypes, # at.name
# ------------------
chainids, # chain.name
elements, # at.element.name
formalcharges, # at.charge
weights, # at.element.weight
# ------------------
occupancies, # at.occ
record_types, # res.het_flag
tempfactors, # at.b_iso
residx, # _into_idx(res.seqid.num)
) = map( # this is probably not pretty, but it's efficient -- one loop over the mmcif
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are all the fields here guaranteed in a valid pdbx? One benefit to working column by column is that you can do optional columns

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have an example of a PDBx in mind, or like a test set for them? I've never actually worked with the format, since in RCSB afaik we have only pdb or mmcif

np.array,
list(
zip(
*[
(
at.altloc, # altlocs
at.serial, # serials
at.name, # names
at.name, # atomtypes
# ------------------
chain.name, # chainids
at.element.name, # elements
at.charge, # formalcharges
at.element.weight, # weights
# ------------------
at.occ, # occupancies
res.het_flag, # record_types
at.b_iso, # tempfactors
res.seqid.num, # residx, later translated into continious repr
)
for chain in model
for res in chain
for at in res
]
)
),
)

(
icodes, # res.seqid.icode
resids, # res.seqid.num
resnames, # res.name
segidx, # chain.name
resnums,
) = map(
np.array,
list(
zip(
*[
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm struggling to follow the logic here, a comment breaking down what this double nested loop iteration into a zip is doing would be nice

(
res.seqid.icode,
res.seqid.num,
res.name,
chain.name,
res.seqid.num,
)
for chain in model
for res in chain
]
)
),
)

segids = [chain.name for chain in model]

# transform *idx into continious numpy arrays
residx = np.array(_into_idx(residx))
segidx = np.array(_into_idx(segidx))

# fill in altlocs
altlocs = ["A" if not elem else elem for elem in altlocs]
record_types = [
"ATOM" if record == "A" else "HETATM" if record == "H" else None
for record in record_types
]
if any((elem is None for elem in record_types)):
raise ValueError("Found an atom that is neither ATOM or HETATM")

attrs = [
# AtomAttr subclasses
AltLocs(altlocs), # at.altloc
Atomids(serials), # at.serial
Atomnames(names), # at.name
Atomtypes(atomtypes), # at.name
# ---------------------------------------
ChainIDs(chainids), # chain.name
Elements(elements), # at.element.name
FormalCharges(formalcharges), # at.charge
Masses(weights), # at.element.weight
# ---------------------------------------
Occupancies(occupancies), # at.occ
RecordTypes(record_types), # res.het_flat
Resnums(resnums), # res.seqid.num
Tempfactors(tempfactors), # at.b_iso
#
# ResidueAttr subclasses
ICodes(icodes), # res.seqid.icode
Resids(resids), # res.seqid.num
Resnames(resnames), # res.name
#
# SegmentAttr subclasses
Segids(segids), # chain.name
]

n_atoms = len(names)
n_residues = len(resids)
n_segments = len(segids)

return Topology(
n_atoms,
n_residues,
n_segments,
attrs=attrs,
atom_resindex=residx,
residue_segindex=segidx,
)
1 change: 1 addition & 0 deletions package/MDAnalysis/topology/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,4 @@
from . import MinimalParser
from . import ITPParser
from . import FHIAIMSParser
from . import MMCIFParser
1 change: 1 addition & 0 deletions package/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ extra_formats = [
"pytng>=0.2.3",
"gsd>3.0.0",
"rdkit>=2020.03.1",
"gemmi", # for mmcif format
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will probably be optional, so other imports will have to respect that too

]
analysis = [
"biopython>=1.80",
Expand Down
Loading