This repository was archived by the owner on Aug 29, 2020. It is now read-only.

Aliasing and subgraphs #110

Open · wants to merge 8 commits into base: develop
8 changes: 5 additions & 3 deletions README.md
@@ -1,14 +1,16 @@
# Relation Engine Spec

- This repo holds the [stored queries](stored_queries), [schemas](schemas), and [migrations](migrations) for the relation engine graph database service.
+ This repo holds the [stored queries](stored_queries), collection [schemas](schemas), and [subgraphs](subgraphs) for the relation engine graph database service.

- These specifications are used by the [Relation Engine API]()
+ These specifications are used by the [Relation Engine API](https://github.com/kbase/relation_engine_api)

* **Stored queries** are stored [AQL queries](https://docs.arangodb.com/3.3/AQL/index.html) that can be used
by KBase apps to fetch data from the database.
* **Schemas** are [JSON schemas](https://json-schema.org/) that define what form of data can be stored in
the database's collections.
- * **Migrations** are python modules that connect to the database and are responsible for transitioning the data in a collection from an old schema to a newer one.
+ * **Subgraphs** are groupings of vertex and edge collections into a semantically related sub-graph of data.

+ The additional configuration found in **/aliases.yaml** defines a set of custom names assigned to one or more collection names. This lets us abstract the database collection names, allowing us to rename underlying collections without breaking APIs.
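As an illustrative sketch (not part of this PR — `resolve_alias` and the `ws_all_edges` alias are hypothetical), the lookup an API could perform against a loaded aliases.yaml might look like:

```python
# Hypothetical sketch of alias resolution: an alias maps to one or more
# underlying collection names, so callers never hard-code collection names.
def resolve_alias(name, aliases):
    """Return the collection names an alias expands to; unaliased names pass through."""
    return aliases.get(name, [name])

# A tiny stand-in for aliases.yaml after yaml.safe_load:
aliases = {
    'ws_object': ['ws_object'],
    'ws_all_edges': ['ws_refers_to', 'ws_copied_from'],  # invented example: one alias, many collections
}
```

Renaming a collection then only requires updating the alias map, not every stored query that uses the alias.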

## Development

62 changes: 62 additions & 0 deletions aliases.yaml
@@ -0,0 +1,62 @@
# Aliases from custom name to an array of collection names

[Review comment — Collaborator]
Why an array?

[Reply — Contributor, Author]
I imagine it could be useful for an alias to expand out into multiple collections in a query.

ws_prov_descendant_of: [ws_prov_descendant_of]
ws_obj_instance_of_type: [ws_obj_instance_of_type]
ws_workspace: [ws_workspace]
ws_user: [ws_user]
ws_method_version: [ws_method_version]
ws_genome_has_feature: [ws_genome_has_feature]
ws_module_contains_method: [ws_module_contains_method]
ws_genome_features: [ws_genome_features]
ws_object_hash: [ws_object_hash]
ws_owner_of: [ws_owner_of]
ws_workspace_contains_obj: [ws_workspace_contains_obj]
ws_type_module: [ws_type_module]
ws_type_consumed_by_method: [ws_type_consumed_by_method]
ws_feature_has_GO_annotation: [ws_feature_has_GO_annotation]
ws_obj_created_with_method: [ws_obj_created_with_method]
ws_refers_to: [ws_refers_to]
ws_type: [ws_type]
ws_latest_version_of: [ws_latest_version_of]
ws_object: [ws_object]
ws_method: [ws_method]
ws_obj_version_has_taxon: [ws_obj_version_has_taxon]
ws_object_version: [ws_object_version]
ws_copied_from: [ws_copied_from]
ws_module: [ws_module]
ws_version_of: [ws_version_of]
ws_has_perm: [ws_has_perm]
ws_obj_created_with_module: [ws_obj_created_with_module]
ws_type_version: [ws_type_version]
ws_module_version: [ws_module_version]
wsprov_links: [wsprov_links]
wsprov_action: [wsprov_action]
wsprov_produced: [wsprov_produced]
wsprov_object: [wsprov_object]
wsprov_input_in: [wsprov_input_in]
wsprov_copied_into: [wsprov_copied_into]
gtdb_child_of_taxon: [gtdb_child_of_taxon]
gtdb_taxon: [gtdb_taxon]
gtdb_organism: [gtdb_organism]
rxn_similar_to_reaction: [rxn_similar_to_reaction]
rxn_gene_complex: [rxn_gene_complex]
rxn_compound: [rxn_compound]
rxn_reaction_linked_to_reaction: [rxn_reaction_linked_to_reaction]
rxn_compound_linked_to_compound: [rxn_compound_linked_to_compound]
rxn_reaction: [rxn_reaction]
rxn_compound_within_reaction: [rxn_compound_within_reaction]
rxn_gene_within_complex: [rxn_gene_within_complex]
rxn_reaction_within_complex: [rxn_reaction_within_complex]
test_edge: [test_edge]
test_vertex: [test_vertex]
mash_genome_similar_to: [mash_genome_similar_to]
GO_merges: [GO_merges]
GO_edges: [GO_edges]
GO_terms: [GO_terms]
ncbi_taxon: [ncbi_taxon]
ncbi_genome: [ncbi_genome]
ncbi_child_of_taxon: [ncbi_child_of_taxon]
ncbi_gene_within_genome: [ncbi_gene_within_genome]
ncbi_gene: [ncbi_gene]
ENVO_merges: [ENVO_merges]
ENVO_edges: [ENVO_edges]
ENVO_terms: [ENVO_terms]
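A hypothetical companion helper (not part of this PR) that inverts the alias map — useful for answering "which aliases break if I rename this collection?":

```python
# Hypothetical helper: invert the alias map to see which aliases reference
# a given underlying collection, e.g. before renaming that collection.
def collections_to_aliases(aliases):
    inverted = {}
    for alias, collections in aliases.items():
        for coll in collections:
            inverted.setdefault(coll, []).append(alias)
    return inverted

# 'combined' is an invented example of an alias spanning two collections.
aliases = {'ws_object': ['ws_object'], 'combined': ['ws_object', 'ws_type']}
```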
9 changes: 0 additions & 9 deletions migrations/README.md

This file was deleted.

1 change: 0 additions & 1 deletion migrations/__init__.py

This file was deleted.

4 changes: 0 additions & 4 deletions migrations/example.py

This file was deleted.

29 changes: 29 additions & 0 deletions subgraphs/_schema.yaml
@@ -0,0 +1,29 @@
# JSON schema describing the namespace configuration files found in this directory.
type: object
additionalProperties: false
required: [id, name, vertices, edges]
properties:
  id:
    type: string
  name:
    type: string
  url:
    type: string
  category:
    type: string
    enum: [taxonomy, ontology]
  vertices:
    type: array
    minItems: 1
    items: {type: string}
  edges:
    type: object
    patternProperties:
      # Any key
      ".":
        type: array
        description: "array of pairs of vertex names"
        items:
          type: array
          description: "pair of [vertex1, vertex2] denoting the two vertices that this edge connects"
          items: [{type: string}, {type: string}]
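The endpoint rule this schema encodes — each `[vertex1, vertex2]` pair must name vertices declared in `vertices` — can be sketched as a plain-Python check (a simplified stand-in for the validation added in test/validate.py):

```python
# Simplified sketch of the edge-endpoint check: every source/destination
# vertex named in `edges` must appear in the subgraph's `vertices` list.
def bad_edge_endpoints(config):
    verts = set(config['vertices'])
    bad = []
    for edge_name, pairs in config['edges'].items():
        for src, dest in pairs:
            if src not in verts or dest not in verts:
                bad.append((edge_name, src, dest))
    return bad

# A stand-in for subgraphs/envo.yaml after yaml.safe_load:
envo = {
    'id': 'ENVO',
    'vertices': ['ENVO_terms'],
    'edges': {'ENVO_edges': [['ENVO_terms', 'ENVO_terms']]},
}
```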
8 changes: 8 additions & 0 deletions subgraphs/envo.yaml
@@ -0,0 +1,8 @@
id: ENVO
name: Environmental Ontology
category: ontology
url: http://www.obofoundry.org/ontology/envo.html
vertices: [ENVO_terms]
edges:
  ENVO_edges: [[ENVO_terms, ENVO_terms]]
  ENVO_merges: [[ENVO_terms, ENVO_terms]]
8 changes: 8 additions & 0 deletions subgraphs/go.yaml
@@ -0,0 +1,8 @@
id: GO
name: Gene Ontology
category: ontology
url: http://geneontology.org
vertices: [GO_terms]
edges:
  GO_edges: [[GO_terms, GO_terms]]
  GO_merges: [[GO_terms, GO_terms]]
7 changes: 7 additions & 0 deletions subgraphs/gtdb.yaml
@@ -0,0 +1,7 @@
id: GTDB
name: Genome Taxonomy Database
category: taxonomy
url: https://gtdb.ecogenomic.org/
vertices: [gtdb_taxon]
edges:
  gtdb_child_of_taxon: [[gtdb_taxon, gtdb_taxon]]
8 changes: 8 additions & 0 deletions subgraphs/ncbi_taxonomy.yaml
@@ -0,0 +1,8 @@
id: ncbi_taxonomy
name: NCBI Taxonomy
category: taxonomy
url: https://www.ncbi.nlm.nih.gov/taxonomy
vertices: [ncbi_taxon]
edges:
  ncbi_child_of_taxon: [[ncbi_taxon, ncbi_taxon]]
  ncbi_taxon_merges: [[ncbi_taxon, ncbi_taxon]]
57 changes: 48 additions & 9 deletions test/validate.py
@@ -35,11 +35,12 @@

 def validate_json_schemas():
     """Validate the syntax of all the JSON schemas."""
-    print('Validating JSON schemas..')
+    print('Validating JSON schemas')
     names = set()  # type: set
+    schemas = []
     for path in glob.iglob('schemas/**/*.yaml', recursive=True):
         name = os.path.basename(path)
-        print(f' validating {path}..')
+        print(f' validating {path}')
         with open(path) as fd:
             data = yaml.safe_load(fd)
             jsonschema.validate(data, schema_schema)
@@ -75,8 +76,8 @@ def validate_json_schemas():
             _fatal('Time-travel vertex schemas must require the "id" attribute in ' + path)
         elif data['type'] == 'vertex' and not data.get('delta') and '_key' not in required:
             _fatal('Vertex schemas must require the "_key" attribute in ' + path)
-        print(f'✓ {name} is valid.')
-    print('..all valid.')
+        schemas.append(data)
+    return schemas


stored_query_schema = {
@@ -93,10 +94,10 @@

 def validate_stored_queries():
     """Validate the structure and syntax of all the queries."""
-    print('Validating AQL queries..')
+    print('Validating AQL queries')
     names = set()  # type: set
     for path in glob.iglob('stored_queries/**/*.yaml', recursive=True):
-        print(f' validating {path}..')
+        print(f' validating {path}')
         with open(path) as fd:
             data = yaml.safe_load(fd)
             jsonschema.validate(data, stored_query_schema)
@@ -133,8 +134,45 @@ def validate_stored_queries():
             _fatal((f"Bind vars are invalid.\n"
                     f"  Extra vars in query: {query_bind_vars - params}.\n"
                     f"  Extra params in schema: {params - query_bind_vars}"))
-        print(f'✓ {path} is valid.')
-    print('..all valid.')
+
+
+def validate_subgraphs(schemas):
+    """
+    Validate the namespace configuration files found in /subgraphs.
+
+    Args:
+        schemas - a list of dicts loaded directly from schema yaml files.
+    """
+    print('Validating subgraphs')
+    ids = set()  # type: set
+    with open('subgraphs/_schema.yaml') as fd:
+        schema = yaml.safe_load(fd)
+    for path in glob.iglob('subgraphs/*.yaml', recursive=True):
+        if os.path.basename(path) == '_schema.yaml':
+            continue
+        print(f' validating {path}')
+        with open(path) as fd:
+            data = yaml.safe_load(fd)
+        jsonschema.validate(data, schema)
+        _id = data['id']
+        if _id in ids:
+            _fatal(f'Duplicate namespace id {_id}')
+        ids.add(_id)
+        schema_names = set(c['name'] for c in schemas)
+        verts = set(data['vertices'])
+        # Check that every listed vertex has a schema file
+        for v in verts:
+            if v not in schema_names:
+                _fatal(f'Missing schema file for vertex "{v}"')
+        # Validate that each source and dest vertex in edges exists
+        for (name, edge_verts) in data['edges'].items():
+            # Check that the edge name has a schema file
+            if name not in schema_names:
+                _fatal(f'Missing schema file for edge "{name}"')
+            for (src_vert, dest_vert) in edge_verts:
+                if src_vert not in verts:
+                    _fatal(f'Source vertex {src_vert} for edge {name} does not exist.')
+                if dest_vert not in verts:
+                    _fatal(f'Destination vertex {dest_vert} for edge {name} does not exist.')


def _fatal(msg):
@@ -145,5 +183,6 @@ def _fatal(msg):

 if __name__ == '__main__':
     wait_for_arangodb()
-    validate_json_schemas()
+    schemas = validate_json_schemas()
+    validate_subgraphs(schemas)
     validate_stored_queries()