Merge pull request #659 from padix-key/cif

Fix invalid CIF/BCIF files created when file is edited
biotite-dev · Sep 9, 2024 · d0107fa · d0107fa
2 parents dc691a5 + 2dddcba
commit d0107fa
Show file tree

Hide file tree

Showing 4 changed files with 98 additions and 17 deletions.
diff --git a/src/biotite/structure/io/pdbx/bcif.py b/src/biotite/structure/io/pdbx/bcif.py
@@ -457,7 +457,12 @@ class BinaryCIFBlock(_HierarchicalContainer):
     """
 
     def __init__(self, categories=None):
-        super().__init__(categories)
+        if categories is None:
+            categories = {}
+        super().__init__(
+            # Actual bcif files use leading '_' as category names
+            {"_" + name: category for name, category in categories.items()}
+        )
 
     @staticmethod
     def subcomponent_class():
@@ -470,21 +475,36 @@ def supercomponent_class():
     @staticmethod
     def deserialize(content):
         return BinaryCIFBlock(
-            BinaryCIFBlock._deserialize_elements(content["categories"], "name")
+            {
+                # The superclass uses leading '_' in category names,
+                # but on the level of this class, the leading '_' is omitted
+                name.lstrip("_"): category
+                for name, category in BinaryCIFBlock._deserialize_elements(
+                    content["categories"], "name"
+                ).items()
+            }
         )
 
     def serialize(self):
         return {"categories": self._serialize_elements("name")}
 
     def __getitem__(self, key):
-        # Actual bcif files use leading '_' as categories
-        return super().__getitem__("_" + key)
+        try:
+            return super().__getitem__("_" + key)
+        except KeyError:
+            raise KeyError(key)
 
     def __setitem__(self, key, element):
-        return super().__setitem__("_" + key, element)
+        try:
+            return super().__setitem__("_" + key, element)
+        except KeyError:
+            raise KeyError(key)
 
     def __delitem__(self, key):
-        return super().__setitem__("_" + key)
+        try:
+            return super().__delitem__("_" + key)
+        except KeyError:
+            raise KeyError(key)
 
     def __iter__(self):
         return (key.lstrip("_") for key in super().__iter__())

diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py
@@ -569,6 +569,17 @@ class CIFBlock(_Component, MutableMapping):
         The keys are the category names and the values are the
         :class:`CIFCategory` objects.
         By default, an empty block is created.
+    name : str, optional
+        The name of the block.
+        This is only used for serialization and is automatically set
+        when the :class:`CIFBlock` is added to a :class:`CIFFile`.
+        It only needs to be set manually when the block is serialized
+        directly.
+
+    Attributes
+    ----------
+    name : str
+        The name of the block.
 
     Notes
     -----
@@ -580,13 +591,15 @@ class CIFBlock(_Component, MutableMapping):
     --------
 
     >>> # Add category on creation
-    >>> block = CIFBlock({"foo": CIFCategory({"some_column": 1})})
+    >>> block = CIFBlock({"foo": CIFCategory({"some_column": 1})}, name="baz")
     >>> # Add category later on
     >>> block["bar"] = CIFCategory({"another_column": [2, 3]})
     >>> # Access a column
     >>> print(block["bar"]["another_column"].as_array())
     ['2' '3']
     >>> print(block.serialize())
+    data_baz
+    #
     _foo.some_column   1
     #
     loop_
@@ -596,11 +609,20 @@ class CIFBlock(_Component, MutableMapping):
     #
     """
 
-    def __init__(self, categories=None):
+    def __init__(self, categories=None, name=None):
+        self._name = name
         if categories is None:
             categories = {}
         self._categories = categories
 
+    @property
+    def name(self):
+        return self._name
+
+    @name.setter
+    def name(self, name):
+        self._name = name
+
     @staticmethod
     def subcomponent_class():
         return CIFCategory
@@ -634,7 +656,10 @@ def deserialize(text):
         return CIFBlock(_create_element_dict(lines, category_names, category_starts))
 
     def serialize(self):
-        text_blocks = []
+        if self._name is None:
+            raise SerializationError("Block name is required")
+        # The block starts with the block name line followed by a comment line
+        text_blocks = ["data_" + self._name + "\n#\n"]
         for category_name, category in self._categories.items():
             if isinstance(category, str):
                 # Category is already stored as lines
@@ -806,14 +831,12 @@ def deserialize(text):
     def serialize(self):
         text_blocks = []
         for block_name, block in self._blocks.items():
-            text_blocks.append("data_" + block_name + "\n")
-            # A comment line is set after the block indicator
-            text_blocks.append("#\n")
             if isinstance(block, str):
                 # Block is already stored as text
                 text_blocks.append(block)
             else:
                 try:
+                    block.name = block_name
                     text_blocks.append(block.serialize())
                 except Exception:
                     raise SerializationError(
@@ -884,6 +907,7 @@ def __getitem__(self, key):
     def __setitem__(self, key, block):
         if not isinstance(block, CIFBlock):
             raise TypeError(f"Expected 'CIFBlock', but got '{type(block).__name__}'")
+        block.name = key
         self._blocks[key] = block
 
     def __delitem__(self, key):
@@ -921,7 +945,7 @@ def _create_element_dict(lines, element_names, element_starts):
     # Lazy deserialization
     # -> keep as text for now and deserialize later if needed
     return {
-        element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]])
+        element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]]) + "\n"
         for i, element_name in enumerate(element_names)
     }
 

diff --git a/src/biotite/structure/io/pdbx/component.py b/src/biotite/structure/io/pdbx/component.py
@@ -171,10 +171,10 @@ def _serialize_elements(self, store_key_in=None):
         Parameters
         ----------
         store_key_in: str, optional
-        If given, the key of each element is stored as value in the
-        serialized element.
-        This is basically the reverse operation of `take_key_from` in
-        :meth:`_deserialize_elements()`.
+            If given, the key of each element is stored as value in the
+            serialized element.
+            This is basically the reverse operation of `take_key_from` in
+            :meth:`_deserialize_elements()`.
         """
         serialized_elements = []
         for key, element in self._elements.items():

diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py
@@ -704,6 +704,43 @@ def test_serialization_consistency(format, create_new_encoding):
             raise Exception(f"Comparison failed for '{category_name}.{key}'")
 
 
+@pytest.mark.parametrize(
+    "format, level", itertools.product(["cif", "bcif"], ["block", "category", "column"])
+)
+def test_editing(tmpdir, format, level):
+    """
+    Check if editing an existing PDBx file works, by checking if replacing some
+    category/block/column with a copy of itself does not affect the content.
+    """
+    File = pdbx.CIFFile if format == "cif" else pdbx.BinaryCIFFile
+    Block = File.subcomponent_class()
+    Category = Block.subcomponent_class()
+    Column = Category.subcomponent_class()
+
+    column = Column(["a", "b", "c"])
+    category = Category({"foo_col": column, "bar_col": column, "baz_col": column})
+    block = Block({"foo_cat": category, "bar_cat": category, "baz_cat": category})
+    ref_pdbx_file = File({"foo_block": block, "bar_block": block, "baz_block": block})
+    ref_pdbx_file.write(join(tmpdir, f"original.{format}"))
+
+    pdbx_file = File.read(join(tmpdir, f"original.{format}"))
+    if level == "block":
+        # Replace block in the middle,
+        # to check if the block before and after remain the same
+        pdbx_file["bar_block"] = pdbx_file["bar_block"]
+    elif level == "category":
+        pdbx_file["bar_block"]["bar_cat"] = pdbx_file["bar_block"]["bar_cat"]
+    elif level == "column":
+        pdbx_file["bar_block"]["bar_cat"]["bar_col"] = pdbx_file["bar_block"][
+            "bar_cat"
+        ]["bar_col"]
+    pdbx_file.write(join(tmpdir, f"edited.{format}"))
+
+    test_pdbx_file = File.read(join(tmpdir, f"edited.{format}"))
+    # As the content should not have changed, the serialized files should be identical
+    assert test_pdbx_file.serialize() == ref_pdbx_file.serialize()
+
+
 def _clear_encoding(category):
     columns = {}
     for key, col in category.items():