MDAnalysis · marinegor · May 22, 2024 · May 22, 2024 · May 22, 2024 · May 22, 2024
diff --git a/package/MDAnalysis/coordinates/MMCIF.py b/package/MDAnalysis/coordinates/MMCIF.py
@@ -0,0 +1,38 @@
+import numpy as np
+import gemmi
+import logging
+from . import base
+
+logger = logging.getLogger("MDAnalysis.coordinates.MMCIF")
+
+
+class MMCIFReader(base.SingleFrameReaderBase):
+    """Reads from an MMCIF file"""
+
+    format = "MMCIF"
+    units = {"time": None, "length": "Angstrom"}
+
+    def _read_first_frame(self):
+        structure = gemmi.read_structure(self.filename)
+        coords = np.array(
+            [
+                [*at.pos.tolist()]
+                for model in structure
+                for chain in model
+                for res in chain
+                for at in res
+            ]
+        )
+        self.n_atoms = len(coords)
+        self.ts = self._Timestep.from_coordinates(coords, **self._ts_kwargs)
+        self.ts.frame = 0
+
+    def Writer(self, filename, n_atoms=None, **kwargs):
+        raise NotImplementedError
+
+    def close(self):
+        pass
+
+
+class MMCIFWriter(base.WriterBase):
+    pass
diff --git a/package/MDAnalysis/coordinates/__init__.py b/package/MDAnalysis/coordinates/__init__.py
@@ -791,3 +791,4 @@ class can choose an appropriate reader automatically.
 from . import NAMDBIN
 from . import FHIAIMS
 from . import TNG
+from . import MMCIF
diff --git a/package/MDAnalysis/topology/MMCIFParser.py b/package/MDAnalysis/topology/MMCIFParser.py
@@ -0,0 +1,178 @@
+"""
+MMCIF Topology Parser # 
+===================
+"""
+
+import gemmi
+import numpy as np
+import warnings
+import itertools
+
+from ..core.topology import Topology
+from ..core.topologyattrs import (
+    AltLocs,
+    Atomids,
+    Atomnames,
+    Atomtypes,
+    ChainIDs,
+    Elements,
+    FormalCharges,
+    ICodes,
+    Masses,
+    Occupancies,
+    RecordTypes,
+    Resids,
+    Resnames,
+    Resnums,
+    Segids,
+    Tempfactors,
+)
+from .base import TopologyReaderBase
+
+
+def _into_idx(arr: list[int]) -> list[int]:
+    return [idx for idx, (_, group) in enumerate(itertools.groupby(arr)) for _ in group]
+
+
+class MMCIFParser(TopologyReaderBase):
+    format = "MMCIF"
+
+    def parse(self, **kwargs):
+        """Read the file and return the structure.
+
+        Returns
+        -------
+        MDAnalysis Topology object
+        """
+        structure = gemmi.read_structure(self.filename)
+
+        if len(structure) > 1:
+            warnings.warn(
+                "MMCIF model {self.filename} contains {len(model)=} different models, "
+                "but only the first one will be used to assign the topology"
+            )
+        model = structure[0]
+
+        # atom properties
+        (
+            altlocs,  # at.altloc
+            serials,  # at.serial
+            names,  # at.name
+            atomtypes,  # at.name
+            # ------------------
+            chainids,  # chain.name
+            elements,  # at.element.name
+            formalcharges,  # at.charge
+            weights,  # at.element.weight
+            # ------------------
+            occupancies,  # at.occ
+            record_types,  # res.het_flag
+            tempfactors,  # at.b_iso
+            residx,  # _into_idx(res.seqid.num)
+        ) = map(  # this is probably not pretty, but it's efficient -- one loop over the mmcif
+            np.array,
+            list(
+                zip(
+                    *[
+                        (
+                            at.altloc,  # altlocs
+                            at.serial,  # serials
+                            at.name,  # names
+                            at.name,  # atomtypes
+                            # ------------------
+                            chain.name,  # chainids
+                            at.element.name,  # elements
+                            at.charge,  # formalcharges
+                            at.element.weight,  # weights
+                            # ------------------
+                            at.occ,  # occupancies
+                            res.het_flag,  # record_types
+                            at.b_iso,  # tempfactors
+                            res.seqid.num,  # residx, later translated into continious repr
+                        )
+                        for chain in model
+                        for res in chain
+                        for at in res
+                    ]
+                )
+            ),
+        )
+
+        (
+            icodes,  # res.seqid.icode
+            resids,  # res.seqid.num
+            resnames,  # res.name
+            segidx,  # chain.name
+            resnums,
+        ) = map(
+            np.array,
+            list(
+                zip(
+                    *[
+                        (
+                            res.seqid.icode,
+                            res.seqid.num,
+                            res.name,
+                            chain.name,
+                            res.seqid.num,
+                        )
+                        for chain in model
+                        for res in chain
+                    ]
+                )
+            ),
+        )
+
+        segids = [chain.name for chain in model]
+
+        # transform *idx into continious numpy arrays
+        residx = np.array(_into_idx(residx))
+        segidx = np.array(_into_idx(segidx))
+
+        # fill in altlocs
+        altlocs = ["A" if not elem else elem for elem in altlocs]
+        record_types = [
+            "ATOM" if record == "A" else "HETATM" if record == "H" else None
+            for record in record_types
+        ]
+        if any((elem is None for elem in record_types)):
+            raise ValueError("Found an atom that is neither ATOM or HETATM")
+
+        attrs = [
+            # AtomAttr subclasses
+            AltLocs(altlocs),  # at.altloc
+            Atomids(serials),  # at.serial
+            Atomnames(names),  # at.name
+            Atomtypes(atomtypes),  # at.name
+            # ---------------------------------------
+            ChainIDs(chainids),  # chain.name
+            Elements(elements),  # at.element.name
+            FormalCharges(formalcharges),  # at.charge
+            Masses(weights),  # at.element.weight
+            # ---------------------------------------
+            Occupancies(occupancies),  # at.occ
+            RecordTypes(record_types),  # res.het_flat
+            Resnums(resnums),  # res.seqid.num
+            Tempfactors(tempfactors),  # at.b_iso
+            #
+            # ResidueAttr subclasses
+            ICodes(icodes),  # res.seqid.icode
+            Resids(resids),  # res.seqid.num
+            Resnames(resnames),  # res.name
+            #
+            # SegmentAttr subclasses
+            Segids(segids),  # chain.name
+        ]
+
+        n_atoms = len(names)
+        n_residues = len(resids)
+        n_segments = len(segids)
+
+        return Topology(
+            n_atoms,
+            n_residues,
+            n_segments,
+            attrs=attrs,
+            atom_resindex=residx,
+            residue_segindex=segidx,
+        )
diff --git a/package/MDAnalysis/topology/__init__.py b/package/MDAnalysis/topology/__init__.py
@@ -333,3 +333,4 @@
 from . import MinimalParser
 from . import ITPParser
 from . import FHIAIMSParser
+from . import MMCIFParser
diff --git a/package/pyproject.toml b/package/pyproject.toml
@@ -78,6 +78,7 @@ extra_formats = [
     "pytng>=0.2.3",
     "gsd>3.0.0",
     "rdkit>=2020.03.1",
+    "gemmi", # for mmcif format
 ]
 analysis = [
     "biopython>=1.80",