-
Notifications
You must be signed in to change notification settings - Fork 647
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP] Implementing gemmi
-based mmcif reader (with easy extension to PDB/PDBx and mmJSON)
#4712
base: develop
Are you sure you want to change the base?
Changes from all commits
aa2a88f
218cf43
7f78e02
6682d6e
817f3a0
3cc8c80
f1bf325
d21c220
2a1be15
77645e6
91e6942
9a0c086
9c731df
8b40ec7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import numpy as np | ||
import gemmi | ||
import logging | ||
from . import base | ||
|
||
logger = logging.getLogger("MDAnalysis.coordinates.MMCIF") | ||
|
||
|
||
class MMCIFReader(base.SingleFrameReaderBase): | ||
"""Reads from an MMCIF file""" | ||
|
||
format = "MMCIF" | ||
units = {"time": None, "length": "Angstrom"} | ||
|
||
def _read_first_frame(self): | ||
structure = gemmi.read_structure(self.filename) | ||
coords = np.array( | ||
[ | ||
[*at.pos.tolist()] | ||
for model in structure | ||
for chain in model | ||
for res in chain | ||
for at in res | ||
] | ||
) | ||
self.n_atoms = len(coords) | ||
self.ts = self._Timestep.from_coordinates(coords, **self._ts_kwargs) | ||
self.ts.frame = 0 | ||
|
||
def Writer(self, filename, n_atoms=None, **kwargs): | ||
raise NotImplementedError | ||
|
||
def close(self): | ||
pass | ||
|
||
|
||
class MMCIFWriter(base.WriterBase): | ||
pass |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
""" | ||
MMCIF Topology Parser # | ||
=================== | ||
""" | ||
|
||
import gemmi | ||
import numpy as np | ||
import warnings | ||
import itertools | ||
|
||
from ..core.topology import Topology | ||
from ..core.topologyattrs import ( | ||
AltLocs, | ||
Atomids, | ||
Atomnames, | ||
Atomtypes, | ||
ChainIDs, | ||
Elements, | ||
FormalCharges, | ||
ICodes, | ||
Masses, | ||
Occupancies, | ||
RecordTypes, | ||
Resids, | ||
Resnames, | ||
Resnums, | ||
Segids, | ||
Tempfactors, | ||
) | ||
from .base import TopologyReaderBase | ||
|
||
|
||
def _into_idx(arr: list[int]) -> list[int]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Document what this does, ideally with an example |
||
return [idx for idx, (_, group) in enumerate(itertools.groupby(arr)) for _ in group] | ||
|
||
|
||
class MMCIFParser(TopologyReaderBase): | ||
format = "MMCIF" | ||
|
||
def parse(self, **kwargs): | ||
"""Read the file and return the structure. | ||
|
||
Returns | ||
------- | ||
MDAnalysis Topology object | ||
""" | ||
structure = gemmi.read_structure(self.filename) | ||
|
||
if len(structure) > 1: | ||
warnings.warn( | ||
"MMCIF model {self.filename} contains {len(model)=} different models, " | ||
"but only the first one will be used to assign the topology" | ||
) | ||
model = structure[0] | ||
|
||
# atom properties | ||
( | ||
altlocs, # at.altloc | ||
serials, # at.serial | ||
names, # at.name | ||
atomtypes, # at.name | ||
# ------------------ | ||
chainids, # chain.name | ||
elements, # at.element.name | ||
formalcharges, # at.charge | ||
weights, # at.element.weight | ||
# ------------------ | ||
occupancies, # at.occ | ||
record_types, # res.het_flag | ||
tempfactors, # at.b_iso | ||
residx, # _into_idx(res.seqid.num) | ||
) = map( # this is probably not pretty, but it's efficient -- one loop over the mmcif | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are all the fields here guaranteed in a valid pdbx? One benefit to working column by column is that you can do optional columns There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you have an example of a PDBx in mind, or like a test set for them? I've never actually worked with the format, since in RCSB afaik we have only pdb or mmcif |
||
np.array, | ||
list( | ||
zip( | ||
*[ | ||
( | ||
at.altloc, # altlocs | ||
at.serial, # serials | ||
at.name, # names | ||
at.name, # atomtypes | ||
# ------------------ | ||
chain.name, # chainids | ||
at.element.name, # elements | ||
at.charge, # formalcharges | ||
at.element.weight, # weights | ||
# ------------------ | ||
at.occ, # occupancies | ||
res.het_flag, # record_types | ||
at.b_iso, # tempfactors | ||
res.seqid.num, # residx, later translated into continious repr | ||
) | ||
for chain in model | ||
for res in chain | ||
for at in res | ||
] | ||
) | ||
), | ||
) | ||
|
||
( | ||
icodes, # res.seqid.icode | ||
resids, # res.seqid.num | ||
resnames, # res.name | ||
segidx, # chain.name | ||
resnums, | ||
) = map( | ||
np.array, | ||
list( | ||
zip( | ||
*[ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm struggling to follow the logic here, a comment breaking down what this double nested loop iteration into a zip is doing would be nice |
||
( | ||
res.seqid.icode, | ||
res.seqid.num, | ||
res.name, | ||
chain.name, | ||
res.seqid.num, | ||
) | ||
for chain in model | ||
for res in chain | ||
] | ||
) | ||
), | ||
) | ||
|
||
segids = [chain.name for chain in model] | ||
|
||
# transform *idx into continious numpy arrays | ||
residx = np.array(_into_idx(residx)) | ||
segidx = np.array(_into_idx(segidx)) | ||
|
||
# fill in altlocs | ||
altlocs = ["A" if not elem else elem for elem in altlocs] | ||
record_types = [ | ||
"ATOM" if record == "A" else "HETATM" if record == "H" else None | ||
for record in record_types | ||
] | ||
if any((elem is None for elem in record_types)): | ||
raise ValueError("Found an atom that is neither ATOM or HETATM") | ||
|
||
attrs = [ | ||
# AtomAttr subclasses | ||
AltLocs(altlocs), # at.altloc | ||
Atomids(serials), # at.serial | ||
Atomnames(names), # at.name | ||
Atomtypes(atomtypes), # at.name | ||
# --------------------------------------- | ||
ChainIDs(chainids), # chain.name | ||
Elements(elements), # at.element.name | ||
FormalCharges(formalcharges), # at.charge | ||
Masses(weights), # at.element.weight | ||
# --------------------------------------- | ||
Occupancies(occupancies), # at.occ | ||
RecordTypes(record_types), # res.het_flat | ||
Resnums(resnums), # res.seqid.num | ||
Tempfactors(tempfactors), # at.b_iso | ||
# | ||
# ResidueAttr subclasses | ||
ICodes(icodes), # res.seqid.icode | ||
Resids(resids), # res.seqid.num | ||
Resnames(resnames), # res.name | ||
# | ||
# SegmentAttr subclasses | ||
Segids(segids), # chain.name | ||
] | ||
|
||
n_atoms = len(names) | ||
n_residues = len(resids) | ||
n_segments = len(segids) | ||
|
||
return Topology( | ||
n_atoms, | ||
n_residues, | ||
n_segments, | ||
attrs=attrs, | ||
atom_resindex=residx, | ||
residue_segindex=segidx, | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -333,3 +333,4 @@ | |
from . import MinimalParser | ||
from . import ITPParser | ||
from . import FHIAIMSParser | ||
from . import MMCIFParser |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -78,6 +78,7 @@ extra_formats = [ | |
"pytng>=0.2.3", | ||
"gsd>3.0.0", | ||
"rdkit>=2020.03.1", | ||
"gemmi", # for mmcif format | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will probably be optional, so other imports will have to respect that too |
||
] | ||
analysis = [ | ||
"biopython>=1.80", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wouldn't include this at this stage, Writer is optional