From 41fa18d6f5a5b5b7493250b479bc3564f7c762ad Mon Sep 17 00:00:00 2001 From: Jay R Bolton Date: Thu, 3 Oct 2019 16:02:46 -0700 Subject: [PATCH 1/7] Add an aliases file that can be used by the RE API to abstract the collection names --- aliases.yaml | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 aliases.yaml diff --git a/aliases.yaml b/aliases.yaml new file mode 100644 index 0000000..1748f11 --- /dev/null +++ b/aliases.yaml @@ -0,0 +1,62 @@ +# Aliases from custom name to an array of collection names +ws_prov_descendant_of: [ws_prov_descendant_of] +ws_obj_instance_of_type: [ws_obj_instance_of_type] +ws_workspace: [ws_workspace] +ws_user: [ws_user] +ws_method_version: [ws_method_version] +ws_genome_has_feature: [ws_genome_has_feature] +ws_module_contains_method: [ws_module_contains_method] +ws_genome_features: [ws_genome_features] +ws_object_hash: [ws_object_hash] +ws_owner_of: [ws_owner_of] +ws_workspace_contains_obj: [ws_workspace_contains_obj] +ws_type_module: [ws_type_module] +ws_type_consumed_by_method: [ws_type_consumed_by_method] +ws_feature_has_GO_annotation: [ws_feature_has_GO_annotation] +ws_obj_created_with_method: [ws_obj_created_with_method] +ws_refers_to: [ws_refers_to] +ws_type: [ws_type] +ws_latest_version_of: [ws_latest_version_of] +ws_object: [ws_object] +ws_method: [ws_method] +ws_obj_version_has_taxon: [ws_obj_version_has_taxon] +ws_object_version: [ws_object_version] +ws_copied_from: [ws_copied_from] +ws_module: [ws_module] +ws_version_of: [ws_version_of] +ws_has_perm: [ws_has_perm] +ws_obj_created_with_module: [ws_obj_created_with_module] +ws_type_version: [ws_type_version] +ws_module_version: [ws_module_version] +wsprov_links: [wsprov_links] +wsprov_action: [wsprov_action] +wsprov_produced: [wsprov_produced] +wsprov_object: [wsprov_object] +wsprov_input_in: [wsprov_input_in] +wsprov_copied_into: [wsprov_copied_into] +gtdb_child_of_taxon: [gtdb_child_of_taxon] +gtdb_taxon: 
[gtdb_taxon] +gtdb_organism: [gtdb_organism] +rxn_similar_to_reaction: [rxn_similar_to_reaction] +rxn_gene_complex: [rxn_gene_complex] +rxn_compound: [rxn_compound] +rxn_reaction_linked_to_reaction: [rxn_reaction_linked_to_reaction] +rxn_compound_linked_to_compound: [rxn_compound_linked_to_compound] +rxn_reaction: [rxn_reaction] +rxn_compound_within_reaction: [rxn_compound_within_reaction] +rxn_gene_within_complex: [rxn_gene_within_complex] +rxn_reaction_within_complex: [rxn_reaction_within_complex] +test_edge: [test_edge] +test_vertex: [test_vertex] +mash_genome_similar_to: [mash_genome_similar_to] +GO_merges: [GO_merges] +GO_edges: [GO_edges] +GO_terms: [GO_terms] +ncbi_taxon: [ncbi_taxon] +ncbi_genome: [ncbi_genome] +ncbi_child_of_taxon: [ncbi_child_of_taxon] +ncbi_gene_within_genome: [ncbi_gene_within_genome] +ncbi_gene: [ncbi_gene] +ENVO_merges: [ENVO_merges] +ENVO_edges: [ENVO_edges] +ENVO_terms: [ENVO_terms] From 465c047e204663b85f2ddd3692bdf6af333eb543 Mon Sep 17 00:00:00 2001 From: Jay R Bolton Date: Thu, 3 Oct 2019 16:04:04 -0700 Subject: [PATCH 2/7] Remove the migrations directory --- migrations/README.md | 9 --------- migrations/__init__.py | 1 - migrations/example.py | 4 ---- 3 files changed, 14 deletions(-) delete mode 100644 migrations/README.md delete mode 100644 migrations/__init__.py delete mode 100644 migrations/example.py diff --git a/migrations/README.md b/migrations/README.md deleted file mode 100644 index 994aaca..0000000 --- a/migrations/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# Relation Engine Migrations - -Migrations are python scripts (using pyArango) that migrate (or roll back) the database to a new -schema version. - -## Guidelines - -- Every migration script has two functions -- `forward` and `backward -- for migrating the database forwards or backwards. 
-- Every migration should specify a collection name, the version we're migrating *from*, and version we're migrating *to* diff --git a/migrations/__init__.py b/migrations/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/migrations/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/migrations/example.py b/migrations/example.py deleted file mode 100644 index 214701e..0000000 --- a/migrations/example.py +++ /dev/null @@ -1,4 +0,0 @@ -# TODO - -x = 1 - From 6ec5cb8430fcd9d7a7b9a161d9dd13554d5827e7 Mon Sep 17 00:00:00 2001 From: Jay R Bolton Date: Thu, 3 Oct 2019 16:04:19 -0700 Subject: [PATCH 3/7] Add MrCreosote to codeowners --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 788e5f5..b4799ee 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @jayrbolton +* @jayrbolton @MrCreosote From c7e45175f1d28d3216c6e269d8eb76a449984621 Mon Sep 17 00:00:00 2001 From: Jay R Bolton Date: Thu, 3 Oct 2019 16:27:27 -0700 Subject: [PATCH 4/7] Add the subgraphs config files, validation code, and docs --- README.md | 8 ++++-- subgraphs/_schema.yaml | 27 ++++++++++++++++++ subgraphs/envo.yaml | 8 ++++++ subgraphs/go.yaml | 8 ++++++ subgraphs/gtdb.yaml | 7 +++++ subgraphs/ncbi_taxonomy.yaml | 7 +++++ test/validate.py | 53 ++++++++++++++++++++++++++++++------ 7 files changed, 106 insertions(+), 12 deletions(-) create mode 100644 subgraphs/_schema.yaml create mode 100644 subgraphs/envo.yaml create mode 100644 subgraphs/go.yaml create mode 100644 subgraphs/gtdb.yaml create mode 100644 subgraphs/ncbi_taxonomy.yaml diff --git a/README.md b/README.md index 4f000cf..32ca514 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,16 @@ # Relation Engine Spec -This repo holds the [stored queries](stored_queries), [schemas](schemas), and [migrations](migrations) for the relation engine graph database service. 
+This repo holds the [stored queries](stored_queries), [schemas](schemas), and [namespaces](namespaces) for the relation engine graph database service. -These specifications are used by the [Relation Engine API]() +These specifications are used by the [Relation Engine API](https://github.com/kbase/relation_engine_api) * **Stored queries** are stored [AQL queries](https://docs.arangodb.com/3.3/AQL/index.html) that can be used by KBase apps to fetch data from the database. * **Schemas** are [JSON schemas](https://json-schema.org/) that define what form of data can be stored in the database's collections. -* **Migrations** are python modules that connect to the database and are responsible for transitioning the data in a collection from an old schema to a newer one. +* **Subgraphs** are groupings of vertex and edge collections into a semantically related sub-graph of data. + +The additional configuration found in **/aliases.yaml** defines a set of custom names assigned to one or more collection names. This lets us abstract the database collection names, allowing us to rename underlying collections without breaking APIs. ## Development diff --git a/subgraphs/_schema.yaml b/subgraphs/_schema.yaml new file mode 100644 index 0000000..3cb6973 --- /dev/null +++ b/subgraphs/_schema.yaml @@ -0,0 +1,27 @@ +# JSON schema describing the namespace configuration files found in this directory. 
+type: object +additionalProperties: false +required: [id, name, vertices, edges] +properties: + id: + type: string + name: + type: string + url: + type: string + category: + type: string + enum: [taxonomy, ontology] + vertices: + type: array + minItems: 1 + items: {type: string} + edges: + type: object + patternProperties: + # Any key + ".": + type: array + items: + type: array + items: [{type: string}, {type: string}] diff --git a/subgraphs/envo.yaml b/subgraphs/envo.yaml new file mode 100644 index 0000000..aa4e3e1 --- /dev/null +++ b/subgraphs/envo.yaml @@ -0,0 +1,8 @@ +id: ENVO +name: Environmental Ontology +category: ontology +url: http://www.obofoundry.org/ontology/envo.html +vertices: [ENVO_terms] +edges: + ENVO_edges: [[ENVO_terms, ENVO_terms]] + ENVO_merges: [[ENVO_terms, ENVO_terms]] diff --git a/subgraphs/go.yaml b/subgraphs/go.yaml new file mode 100644 index 0000000..5ddbdb8 --- /dev/null +++ b/subgraphs/go.yaml @@ -0,0 +1,8 @@ +id: GO +name: Gene Ontology +category: ontology +url: http://geneontology.org +vertices: [GO_terms] +edges: + GO_edges: [[GO_terms, GO_terms]] + GO_merges: [[GO_terms, GO_terms]] diff --git a/subgraphs/gtdb.yaml b/subgraphs/gtdb.yaml new file mode 100644 index 0000000..7e197c7 --- /dev/null +++ b/subgraphs/gtdb.yaml @@ -0,0 +1,7 @@ +id: GTDB +name: Genome Taxonomy Database +category: taxonomy +url: https://gtdb.ecogenomic.org/ +vertices: [gtdb_organism, gtdb_taxon] +edges: + gtdb_child_of_taxon: [[gtdb_taxon, gtdb_taxon], [gtdb_organism, gtdb_taxon]] diff --git a/subgraphs/ncbi_taxonomy.yaml b/subgraphs/ncbi_taxonomy.yaml new file mode 100644 index 0000000..c66b352 --- /dev/null +++ b/subgraphs/ncbi_taxonomy.yaml @@ -0,0 +1,7 @@ +id: ncbi_taxonomy +name: NCBI Taxonomy +category: taxonomy +url: https://www.ncbi.nlm.nih.gov/taxonomy +vertices: [ncbi_taxon] +edges: + ncbi_child_of_taxon: [[ncbi_taxon, ncbi_taxon]] diff --git a/test/validate.py b/test/validate.py index 1fb9de4..3fe2ed9 100644 --- a/test/validate.py +++ 
b/test/validate.py @@ -35,11 +35,12 @@ def validate_json_schemas(): """Validate the syntax of all the JSON schemas.""" - print('Validating JSON schemas..') + print('Validating JSON schemas') names = set() # type: set + schemas = [] for path in glob.iglob('schemas/**/*.yaml', recursive=True): name = os.path.basename(path) - print(f' validating {path}..') + print(f' validating {path}') with open(path) as fd: data = yaml.safe_load(fd) jsonschema.validate(data, schema_schema) @@ -75,8 +76,8 @@ def validate_json_schemas(): _fatal('Time-travel vertex schemas must require the "id" attribute in ' + path) elif data['type'] == 'vertex' and not data.get('delta') and '_key' not in required: _fatal('Vertex schemas must require the "_key" attribute in ' + path) - print(f'✓ {name} is valid.') - print('..all valid.') + schemas.append(data) + return schemas stored_query_schema = { @@ -93,10 +94,10 @@ def validate_json_schemas(): def validate_stored_queries(): """Validate the structure and syntax of all the queries.""" - print('Validating AQL queries..') + print('Validating AQL queries') names = set() # type: set for path in glob.iglob('stored_queries/**/*.yaml', recursive=True): - print(f' validating {path}..') + print(f' validating {path}') with open(path) as fd: data = yaml.safe_load(fd) jsonschema.validate(data, stored_query_schema) @@ -133,8 +134,41 @@ def validate_stored_queries(): _fatal((f"Bind vars are invalid.\n" f" Extra vars in query: {query_bind_vars - params}.\n" f" Extra params in schema: {params - query_bind_vars}")) - print(f'✓ {path} is valid.') - print('..all valid.') + + +def validate_subgraphs(schemas): + """Validate the namespace configuration files found in /subgraphs.""" + print('Validating subgraphs') + ids = set() # type: set + with open('subgraphs/_schema.yaml') as fd: + schema = yaml.safe_load(fd) + for path in glob.iglob('subgraphs/**/*.yaml', recursive=True): + if path.endswith('_schema.yaml'): + continue + print(f' validating {path}') + with open(path) 
as fd: + data = yaml.safe_load(fd) + jsonschema.validate(data, schema) + _id = data['id'] + if _id in ids: + _fatal(f'Duplicate namespace id {_id}') + ids.add(_id) + schema_names = set(c['name'] for c in schemas) + verts = set(data['vertices']) + # Check that every listed vert has a schema file + for v in verts: + if v not in schema_names: + _fatal(f'Missing schema file for vertex "{v}"') + # Validate that each source and dest vertex in edges exists + for (name, edge_verts) in data['edges'].items(): + # Check that the edge name has a schema file + if name not in schema_names: + _fatal(f'Missing schema file for edge "{name}"') + for (src_vert, dest_vert) in edge_verts: + if src_vert not in verts: + _fatal(f'Source vertex {src_vert} for edge {name} does not exist.') + if dest_vert not in verts: + _fatal(f'Destination vertex {dest_vert} for edge {name} does not exist.') def _fatal(msg): @@ -145,5 +179,6 @@ def _fatal(msg): if __name__ == '__main__': wait_for_arangodb() - validate_json_schemas() + schemas = validate_json_schemas() + validate_subgraphs(schemas) validate_stored_queries() From 15861a9704f76b524947657a17c36eb5500735ca Mon Sep 17 00:00:00 2001 From: Jay R Bolton Date: Thu, 31 Oct 2019 15:53:15 -0700 Subject: [PATCH 5/7] Address gavin's PR comments --- CODEOWNERS | 2 +- subgraphs/_schema.yaml | 2 ++ subgraphs/gtdb.yaml | 4 ++-- test/validate.py | 8 ++++++-- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index b4799ee..788e5f5 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @jayrbolton @MrCreosote +* @jayrbolton diff --git a/subgraphs/_schema.yaml b/subgraphs/_schema.yaml index 3cb6973..bc9200e 100644 --- a/subgraphs/_schema.yaml +++ b/subgraphs/_schema.yaml @@ -22,6 +22,8 @@ properties: # Any key ".": type: array + description: "array of pairs of vertex names" items: type: array + description: "pair of [vertex1, vertex2] denoting two vertices that this edge connects" items: [{type: string}, {type: string}] diff 
--git a/subgraphs/gtdb.yaml b/subgraphs/gtdb.yaml index 7e197c7..87096c0 100644 --- a/subgraphs/gtdb.yaml +++ b/subgraphs/gtdb.yaml @@ -2,6 +2,6 @@ id: GTDB name: Genome Taxonomy Database category: taxonomy url: https://gtdb.ecogenomic.org/ -vertices: [gtdb_organism, gtdb_taxon] +vertices: [gtdb_taxon] edges: - gtdb_child_of_taxon: [[gtdb_taxon, gtdb_taxon], [gtdb_organism, gtdb_taxon]] + gtdb_child_of_taxon: [[gtdb_taxon, gtdb_taxon]] diff --git a/test/validate.py b/test/validate.py index 3fe2ed9..4d2d894 100644 --- a/test/validate.py +++ b/test/validate.py @@ -137,13 +137,17 @@ def validate_stored_queries(): def validate_subgraphs(schemas): - """Validate the namespace configuration files found in /subgraphs.""" + """ + Validate the namespace configuration files found in /subgraphs. + Args: + schemas - a list of dicts loaded directly from schema yaml files. + """ print('Validating subgraphs') ids = set() # type: set with open('subgraphs/_schema.yaml') as fd: schema = yaml.safe_load(fd) for path in glob.iglob('subgraphs/**/*.yaml', recursive=True): - if path.endswith('_schema.yaml'): + if os.path.basename(path) == '_schema.yaml': continue print(f' validating {path}') with open(path) as fd: From 940462d30178d5dbc2dd5cb5a1b4b31b1a53df95 Mon Sep 17 00:00:00 2001 From: Jay R Bolton Date: Thu, 31 Oct 2019 16:04:18 -0700 Subject: [PATCH 6/7] Add ncbi taxon merges edge to ncbi subgraph; fix readme --- README.md | 2 +- subgraphs/ncbi_taxonomy.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 32ca514..d03fa3b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Relation Engine Spec -This repo holds the [stored queries](stored_queries), [schemas](schemas), and [namespaces](namespaces) for the relation engine graph database service. +This repo holds the [stored queries](stored_queries), collection [schemas](schemas), and [subgraphs](subgraphs) for the relation engine graph database service. 
These specifications are used by the [Relation Engine API](https://github.com/kbase/relation_engine_api) diff --git a/subgraphs/ncbi_taxonomy.yaml b/subgraphs/ncbi_taxonomy.yaml index c66b352..d63a905 100644 --- a/subgraphs/ncbi_taxonomy.yaml +++ b/subgraphs/ncbi_taxonomy.yaml @@ -5,3 +5,4 @@ url: https://www.ncbi.nlm.nih.gov/taxonomy vertices: [ncbi_taxon] edges: ncbi_child_of_taxon: [[ncbi_taxon, ncbi_taxon]] + ncbi_taxon_merges: [[ncbi_taxon, ncbi_taxon]] From 8d890fd5e765d676612e7f33f9e055853870851f Mon Sep 17 00:00:00 2001 From: Jay R Bolton Date: Thu, 31 Oct 2019 16:19:09 -0700 Subject: [PATCH 7/7] Simplify glob --- test/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/validate.py b/test/validate.py index 4d2d894..2753614 100644 --- a/test/validate.py +++ b/test/validate.py @@ -146,7 +146,7 @@ def validate_subgraphs(schemas): ids = set() # type: set with open('subgraphs/_schema.yaml') as fd: schema = yaml.safe_load(fd) - for path in glob.iglob('subgraphs/**/*.yaml', recursive=True): + for path in glob.iglob('subgraphs/*.yaml', recursive=True): if os.path.basename(path) == '_schema.yaml': continue print(f' validating {path}')