diff --git a/package/MDAnalysis/coordinates/MMCIF.py b/package/MDAnalysis/coordinates/MMCIF.py new file mode 100644 index 0000000000..b6b633f951 --- /dev/null +++ b/package/MDAnalysis/coordinates/MMCIF.py @@ -0,0 +1,38 @@ +import numpy as np +import gemmi +import logging +from . import base + +logger = logging.getLogger("MDAnalysis.coordinates.MMCIF") + + +class MMCIFReader(base.SingleFrameReaderBase): + """Reads from an MMCIF file""" + + format = "MMCIF" + units = {"time": None, "length": "Angstrom"} + + def _read_first_frame(self): + structure = gemmi.read_structure(self.filename) + coords = np.array( + [ + [*at.pos.tolist()] + for model in structure + for chain in model + for res in chain + for at in res + ] + ) + self.n_atoms = len(coords) + self.ts = self._Timestep.from_coordinates(coords, **self._ts_kwargs) + self.ts.frame = 0 + + def Writer(self, filename, n_atoms=None, **kwargs): + raise NotImplementedError + + def close(self): + pass + + +class MMCIFWriter(base.WriterBase): + pass diff --git a/package/MDAnalysis/coordinates/__init__.py b/package/MDAnalysis/coordinates/__init__.py index 9b6a7121bc..1d0488d38f 100644 --- a/package/MDAnalysis/coordinates/__init__.py +++ b/package/MDAnalysis/coordinates/__init__.py @@ -791,3 +791,4 @@ class can choose an appropriate reader automatically. from . import NAMDBIN from . import FHIAIMS from . import TNG +from . import MMCIF diff --git a/package/MDAnalysis/topology/MMCIFParser.py b/package/MDAnalysis/topology/MMCIFParser.py new file mode 100644 index 0000000000..b9d5fcf694 --- /dev/null +++ b/package/MDAnalysis/topology/MMCIFParser.py @@ -0,0 +1,178 @@ +""" +MMCIF Topology Parser # +=================== +""" + +import gemmi +import numpy as np +import warnings +import itertools + +from ..core.topology import Topology +from ..core.topologyattrs import ( + AltLocs, + Atomids, + Atomnames, + Atomtypes, + ChainIDs, + Elements, + FormalCharges, + ICodes, + Masses, + Occupancies, + RecordTypes, + Resids, + Resnames, + Resnums, + Segids, + Tempfactors, +) +from .base import TopologyReaderBase + + +def _into_idx(arr: list[int]) -> list[int]: + return [idx for idx, (_, group) in enumerate(itertools.groupby(arr)) for _ in group] + + +class MMCIFParser(TopologyReaderBase): + format = "MMCIF" + + def parse(self, **kwargs): + """Read the file and return the structure. + + Returns + ------- + MDAnalysis Topology object + """ + structure = gemmi.read_structure(self.filename) + + if len(structure) > 1: + warnings.warn( + "MMCIF model {self.filename} contains {len(model)=} different models, " + "but only the first one will be used to assign the topology" + ) + model = structure[0] + + # atom properties + ( + altlocs, # at.altloc + serials, # at.serial + names, # at.name + atomtypes, # at.name + # ------------------ + chainids, # chain.name + elements, # at.element.name + formalcharges, # at.charge + weights, # at.element.weight + # ------------------ + occupancies, # at.occ + record_types, # res.het_flag + tempfactors, # at.b_iso + residx, # _into_idx(res.seqid.num) + ) = map( # this is probably not pretty, but it's efficient -- one loop over the mmcif + np.array, + list( + zip( + *[ + ( + at.altloc, # altlocs + at.serial, # serials + at.name, # names + at.name, # atomtypes + # ------------------ + chain.name, # chainids + at.element.name, # elements + at.charge, # formalcharges + at.element.weight, # weights + # ------------------ + at.occ, # occupancies + res.het_flag, # record_types + at.b_iso, # tempfactors + res.seqid.num, # residx, later translated into continious repr + ) + for chain in model + for res in chain + for at in res + ] + ) + ), + ) + + ( + icodes, # res.seqid.icode + resids, # res.seqid.num + resnames, # res.name + segidx, # chain.name + resnums, + ) = map( + np.array, + list( + zip( + *[ + ( + res.seqid.icode, + res.seqid.num, + res.name, + chain.name, + res.seqid.num, + ) + for chain in model + for res in chain + ] + ) + ), + ) + + segids = [chain.name for chain in model] + + # transform *idx into continious numpy arrays + residx = np.array(_into_idx(residx)) + segidx = np.array(_into_idx(segidx)) + + # fill in altlocs + altlocs = ["A" if not elem else elem for elem in altlocs] + record_types = [ + "ATOM" if record == "A" else "HETATM" if record == "H" else None + for record in record_types + ] + if any((elem is None for elem in record_types)): + raise ValueError("Found an atom that is neither ATOM or HETATM") + + attrs = [ + # AtomAttr subclasses + AltLocs(altlocs), # at.altloc + Atomids(serials), # at.serial + Atomnames(names), # at.name + Atomtypes(atomtypes), # at.name + # --------------------------------------- + ChainIDs(chainids), # chain.name + Elements(elements), # at.element.name + FormalCharges(formalcharges), # at.charge + Masses(weights), # at.element.weight + # --------------------------------------- + Occupancies(occupancies), # at.occ + RecordTypes(record_types), # res.het_flat + Resnums(resnums), # res.seqid.num + Tempfactors(tempfactors), # at.b_iso + # + # ResidueAttr subclasses + ICodes(icodes), # res.seqid.icode + Resids(resids), # res.seqid.num + Resnames(resnames), # res.name + # + # SegmentAttr subclasses + Segids(segids), # chain.name + ] + + n_atoms = len(names) + n_residues = len(resids) + n_segments = len(segids) + + return Topology( + n_atoms, + n_residues, + n_segments, + attrs=attrs, + atom_resindex=residx, + residue_segindex=segidx, + ) diff --git a/package/MDAnalysis/topology/__init__.py b/package/MDAnalysis/topology/__init__.py index 32df510f47..058d142737 100644 --- a/package/MDAnalysis/topology/__init__.py +++ b/package/MDAnalysis/topology/__init__.py @@ -333,3 +333,4 @@ from . import MinimalParser from . import ITPParser from . import FHIAIMSParser +from . import MMCIFParser \ No newline at end of file diff --git a/package/pyproject.toml b/package/pyproject.toml index 1880d88fd0..2f7ba8b90d 100644 --- a/package/pyproject.toml +++ b/package/pyproject.toml @@ -78,6 +78,7 @@ extra_formats = [ "pytng>=0.2.3", "gsd>3.0.0", "rdkit>=2020.03.1", + "gemmi", # for mmcif format ] analysis = [ "biopython>=1.80",