Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor converter generation code #974

Merged
merged 5 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions exports/contexts/bioregistry.epm.json
Original file line number Diff line number Diff line change
Expand Up @@ -26168,14 +26168,12 @@
"uri_prefix": "http://arabidopsis.org/servlets/TairObject?accession=AASequence:",
"uri_prefix_synonyms": [
"TAIR.PROTEIN:",
"http://arabidopsis.org/servlets/TairObject?accession=",
"http://bio2rdf.org/tair.protein:",
"http://bioregistry.io/tair.protein:",
"http://identifiers.org/tair.protein/",
"http://identifiers.org/tair.protein/AASequence:",
"http://identifiers.org/tair.protein:",
"http://n2t.net/tair.protein:",
"https://arabidopsis.org/servlets/TairObject?accession=",
"https://arabidopsis.org/servlets/TairObject?accession=AASequence:",
"https://bio2rdf.org/tair.protein:",
"https://bioregistry.io/tair.protein:",
Expand Down
2 changes: 0 additions & 2 deletions exports/contexts/bioregistry.rpm.json
Original file line number Diff line number Diff line change
Expand Up @@ -3167,7 +3167,6 @@
"http://aps.unmc.edu/AP/database/query_output.php?ID=": "apd",
"http://ar5iv.org/abs/": "arxiv",
"http://arabidopsis.info/StockInfo?NASC_id=": "nasc",
"http://arabidopsis.org/servlets/TairObject?accession=": "tair.protein",
"http://arabidopsis.org/servlets/TairObject?accession=AASequence:": "tair.protein",
"http://arabidopsis.org/servlets/TairObject?accession=Gene:": "tair.gene",
"http://arabidopsis.org/servlets/TairObject?type=locus&name=": "tair.locus",
Expand Down Expand Up @@ -10733,7 +10732,6 @@
"https://aps.unmc.edu/AP/database/query_output.php?ID=": "apd",
"https://ar5iv.org/abs/": "arxiv",
"https://arabidopsis.info/StockInfo?NASC_id=": "nasc",
"https://arabidopsis.org/servlets/TairObject?accession=": "tair.protein",
"https://arabidopsis.org/servlets/TairObject?accession=AASequence:": "tair.protein",
"https://arabidopsis.org/servlets/TairObject?accession=Gene:": "tair.gene",
"https://arabidopsis.org/servlets/TairObject?type=locus&name=": "tair.locus",
Expand Down
2 changes: 0 additions & 2 deletions exports/contexts/obo.epm.json
Original file line number Diff line number Diff line change
Expand Up @@ -22771,14 +22771,12 @@
"prefix": "tair.protein",
"uri_prefix": "http://arabidopsis.org/servlets/TairObject?accession=AASequence:",
"uri_prefix_synonyms": [
"http://arabidopsis.org/servlets/TairObject?accession=",
"http://bio2rdf.org/tair.protein:",
"http://bioregistry.io/tair.protein:",
"http://identifiers.org/tair.protein/",
"http://identifiers.org/tair.protein/AASequence:",
"http://identifiers.org/tair.protein:",
"http://n2t.net/tair.protein:",
"https://arabidopsis.org/servlets/TairObject?accession=",
"https://arabidopsis.org/servlets/TairObject?accession=AASequence:",
"https://bio2rdf.org/tair.protein:",
"https://bioregistry.io/tair.protein:",
Expand Down
1 change: 0 additions & 1 deletion src/bioregistry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@
write_registry,
)
from .uri_format import ( # noqa:F401
get_extended_prefix_map,
get_pattern_map,
get_prefix_map,
get_uri_format,
Expand Down
18 changes: 6 additions & 12 deletions src/bioregistry/data/bioregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -95986,8 +95986,7 @@
"namespaceEmbeddedInLui": false,
"pattern": "^Gene:\\d{7}$",
"prefix": "tair.gene",
"sampleId": "Gene:2200934",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"sampleId": "Gene:2200934"
},
"n2t": {
"description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. This is the reference gene model for a given locus.",
Expand All @@ -95996,8 +95995,7 @@
"name": "The Arabidopsis Information Resource (TAIR) Gene",
"namespaceEmbeddedInLui": false,
"pattern": "^Gene:\\d{7}$",
"prefix": "tair.gene",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"prefix": "tair.gene"
},
"part_of": "tair",
"pattern": "^\\d{7}$",
Expand All @@ -96012,8 +96010,7 @@
"prefix": "tair.gene",
"pubmed_ids": [
"12444417"
],
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
]
},
"publications": [
{
Expand Down Expand Up @@ -96133,8 +96130,7 @@
"namespaceEmbeddedInLui": false,
"pattern": "^AASequence:\\d{10}$",
"prefix": "tair.protein",
"sampleId": "AASequence:1009107926",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"sampleId": "AASequence:1009107926"
},
"n2t": {
"description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. This provides protein information for a given gene model and provides links to other sources such as UniProtKB and GenPept",
Expand All @@ -96143,8 +96139,7 @@
"name": "The Arabidopsis Information Resource (TAIR) Protein",
"namespaceEmbeddedInLui": false,
"pattern": "^AASequence:\\d{10}$",
"prefix": "tair.protein",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"prefix": "tair.protein"
},
"pattern": "^\\d{10}$",
"prefixcommons": {
Expand All @@ -96158,8 +96153,7 @@
"prefix": "tair.protein",
"pubmed_ids": [
"12444417"
],
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
]
},
"publications": [
{
Expand Down
6 changes: 2 additions & 4 deletions src/bioregistry/data/external/miriam/processed.json
Original file line number Diff line number Diff line change
Expand Up @@ -9440,8 +9440,7 @@
"namespaceEmbeddedInLui": false,
"pattern": "^Gene:\\d{7}$",
"prefix": "tair.gene",
"sampleId": "Gene:2200934",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"sampleId": "Gene:2200934"
},
"tair.locus": {
"deprecated": false,
Expand Down Expand Up @@ -9476,8 +9475,7 @@
"namespaceEmbeddedInLui": false,
"pattern": "^AASequence:\\d{10}$",
"prefix": "tair.protein",
"sampleId": "AASequence:1009107926",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"sampleId": "AASequence:1009107926"
},
"tarbase": {
"deprecated": false,
Expand Down
6 changes: 2 additions & 4 deletions src/bioregistry/data/external/n2t/processed.json
Original file line number Diff line number Diff line change
Expand Up @@ -5460,8 +5460,7 @@
"homepage": "http://arabidopsis.org/index.jsp",
"name": "The Arabidopsis Information Resource (TAIR) Gene",
"namespaceEmbeddedInLui": false,
"pattern": "^Gene:\\d{7}$",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"pattern": "^Gene:\\d{7}$"
},
"tair.locus": {
"description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. The name of a Locus is unique and used by TAIR, TIGR, and MIPS.",
Expand All @@ -5478,8 +5477,7 @@
"homepage": "http://arabidopsis.org/index.jsp",
"name": "The Arabidopsis Information Resource (TAIR) Protein",
"namespaceEmbeddedInLui": false,
"pattern": "^AASequence:\\d{10}$",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"pattern": "^AASequence:\\d{10}$"
},
"tarbase": {
"description": "TarBase stores microRNA (miRNA) information for miRNA–gene interactions, as well as miRNA- and gene-related facts to information specific to the interaction and the experimental validation methodologies used.",
Expand Down
6 changes: 2 additions & 4 deletions src/bioregistry/data/external/prefixcommons/processed.json
Original file line number Diff line number Diff line change
Expand Up @@ -10795,8 +10795,7 @@
"prefix": "tair.gene",
"pubmed_ids": [
"12444417"
],
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
]
},
"tair.locus": {
"description": "The locus name",
Expand All @@ -10823,8 +10822,7 @@
"prefix": "tair.protein",
"pubmed_ids": [
"12444417"
],
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
]
},
"tao": {
"bioportal": "1110",
Expand Down
13 changes: 10 additions & 3 deletions src/bioregistry/external/miriam.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
"4503",
"6vts",
}
#: URI format strings that are left out of processed MIRIAM records:
#: ``_preprocess_resource`` only stores a resource's URI format when it
#: is not a member of this set. The TAIR pattern below was recorded
#: verbatim for both ``tair.gene`` and ``tair.protein``.
#: NOTE(review): presumably skipped because it cannot distinguish those
#: two prefixes - confirm.
SKIP_URI_FORMATS = {
"http://arabidopsis.org/servlets/TairObject?accession=$1",
}


def get_miriam(force_download: bool = False, force_process: bool = False):
Expand Down Expand Up @@ -91,7 +94,8 @@
else:
primary, *rest = resources
rv["homepage"] = primary["homepage"]
rv[URI_FORMAT_KEY] = primary[URI_FORMAT_KEY]
if URI_FORMAT_KEY in primary:
rv[URI_FORMAT_KEY] = primary[URI_FORMAT_KEY]

Check warning on line 98 in src/bioregistry/external/miriam.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/external/miriam.py#L98

Added line #L98 was not covered by tests

extras = []
for provider in rest:
Expand All @@ -113,14 +117,17 @@


def _preprocess_resource(resource):
    """Convert one raw MIRIAM resource entry into the processed layout.

    :param resource: A mapping with the keys ``official``,
        ``resourceHomeUrl``, ``providerCode``, ``urlPattern``, ``name``,
        and ``description``.
    :returns: A dictionary with the keys ``official``, ``homepage``,
        ``code``, ``name``, and ``description``. The URI format is stored
        under ``URI_FORMAT_KEY`` as well, unless it is listed in
        ``SKIP_URI_FORMATS``.
    :raises KeyError: If any of the expected keys is missing.
    """
    rv = {
        "official": resource["official"],
        "homepage": resource["resourceHomeUrl"],
        "code": resource["providerCode"],
        "name": resource["name"],
        "description": resource["description"],
    }
    # MIRIAM marks the identifier slot with ``{$id}``; the processed data
    # uses ``$1`` instead.
    uri_format = resource["urlPattern"].replace("{$id}", "$1")
    # Skipped formats are omitted entirely rather than stored as None, so
    # consumers have to check for the key before reading it.
    if uri_format not in SKIP_URI_FORMATS:
        rv[URI_FORMAT_KEY] = uri_format
    return rv

Check warning on line 130 in src/bioregistry/external/miriam.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/external/miriam.py#L129-L130

Added lines #L129 - L130 were not covered by tests


@click.command()
Expand Down
15 changes: 14 additions & 1 deletion src/bioregistry/external/n2t.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
"merops": "issue with miriam having duplicate prefixes for this resource", # FIXME
"hgnc.family": "issue with miriam having duplicate prefixes for this resource", # FIXME
}
#: URI format strings that are left out of processed N2T records:
#: ``_get_uri_format`` returns None for any format in this set. The TAIR
#: pattern below was recorded verbatim for both ``tair.gene`` and
#: ``tair.protein``.
#: NOTE(review): presumably skipped because it cannot distinguish those
#: two prefixes - confirm.
SKIP_URI_FORMATS = {
"http://arabidopsis.org/servlets/TairObject?accession=$1",
}


def get_n2t(force_download: bool = False):
Expand Down Expand Up @@ -53,7 +56,7 @@
def _process(record):
rv = {
"name": record.get("name"),
URI_FORMAT_KEY: record["redirect"].replace("$id", "$1") if "redirect" in record else None,
URI_FORMAT_KEY: _get_uri_format(record),
"description": record.get("description"),
"homepage": record.get("more"),
"pattern": record.get("pattern"),
Expand All @@ -63,6 +66,16 @@
return {k: v for k, v in rv.items() if v is not None}


def _get_uri_format(record):
    """Derive the URI format string for a raw N2T record.

    :param record: A mapping that may contain a ``redirect`` entry.
    :returns: The redirect with ``$id`` rewritten to ``$1``, or None when
        the record has no redirect or the rewritten value is listed in
        ``SKIP_URI_FORMATS``.
    """
    redirect = record.get("redirect")
    if redirect is not None:
        candidate = redirect.replace("$id", "$1")
        if candidate not in SKIP_URI_FORMATS:
            return candidate
    return None

Check warning on line 76 in src/bioregistry/external/n2t.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/external/n2t.py#L75-L76

Added lines #L75 - L76 were not covered by tests


@click.command()
def main():
"""Reload the N2T data."""
Expand Down
17 changes: 11 additions & 6 deletions src/bioregistry/external/prefixcommons.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@
}
#: These contain synonyms with mismatches
DISCARD_SYNONYMS = {"biogrid", "cath", "zfa"}
#: URI format strings that are discarded while processing Prefix Commons
#: records; a normalized format is only kept when it is not in this set.
SKIP_URI_FORMATS = {
"http://purl.obolibrary.org/obo/$1",
"http://www.ebi.ac.uk/ontology-lookup/?termId=$1",
"http://arabidopsis.org/servlets/TairObject?accession=$1",
}


def get_prefixcommons(force_download: bool = False, force_process: bool = False):
Expand Down Expand Up @@ -147,10 +152,7 @@
uri_format = rv.pop("uri_format", None)
if uri_format:
uri_format = uri_format.replace("$id", "$1").replace("[?id]", "$1").replace("$d", "$1")
if uri_format not in {
"http://purl.obolibrary.org/obo/$1",
"http://www.ebi.ac.uk/ontology-lookup/?termId=$1",
}:
if uri_format not in SKIP_URI_FORMATS:
rv["uri_format"] = uri_format

uri_rdf_formats = _get_uri_formats(rv, "rdf_uri_prefix")
Expand Down Expand Up @@ -191,9 +193,12 @@
continue
if "$1" in uri_format or "[?id]" in uri_format: # FIXME check if these come at the end
continue
rv.append(f"{uri_format}$1")
uri_format = f"{uri_format}$1"

Check warning on line 196 in src/bioregistry/external/prefixcommons.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/external/prefixcommons.py#L196

Added line #L196 was not covered by tests
if uri_format in SKIP_URI_FORMATS:
continue
rv.append(uri_format)

Check warning on line 199 in src/bioregistry/external/prefixcommons.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/external/prefixcommons.py#L198-L199

Added lines #L198 - L199 were not covered by tests
return rv


if __name__ == "__main__":
print(len(get_prefixcommons(force_process=True))) # noqa:T201
print(len(get_prefixcommons(force_process=True, force_download=True))) # noqa:T201

Check warning on line 204 in src/bioregistry/external/prefixcommons.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/external/prefixcommons.py#L204

Added line #L204 was not covered by tests
28 changes: 28 additions & 0 deletions src/bioregistry/record_accumulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,14 @@
)

import curies
from curies import Converter

from bioregistry import Resource

__all__ = [
"get_converter",
]

logger = logging.getLogger(__name__)
prefix_blacklist = {"bgee.gene"}
uri_prefix_blacklist = {
Expand All @@ -33,6 +38,7 @@
"http://www.ncbi.nlm.nih.gov/nuccore/",
"https://www.ebi.ac.uk/ena/data/view/",
"http://www.ebi.ac.uk/ena/data/view/",
"http://arabidopsis.org/servlets/TairObject?accession=",
}
prefix_resource_blacklist = {
("orphanet", "http://www.orpha.net/ORDO/Orphanet_"), # biocontext is wrong
Expand Down Expand Up @@ -103,6 +109,28 @@ def _iterate_prefix_prefix(resource: Resource, *extras: str):
# (e.g., uniprot.isoform and uniprot)


def get_converter(
    resources: List[Resource],
    prefix_priority: Optional[Sequence[str]] = None,
    uri_prefix_priority: Optional[Sequence[str]] = None,
    include_prefixes: bool = False,
    strict: bool = False,
    blacklist: Optional[Collection[str]] = None,
    remapping: Optional[Mapping[str, str]] = None,
) -> Converter:
    """Build a converter from the records generated for the resources.

    This is a thin wrapper: every argument is handed to
    :func:`get_records` unchanged, and the records it returns are wrapped
    in a :class:`curies.Converter`.

    :param resources: The resources to generate records from.
    :param prefix_priority: Forwarded to :func:`get_records`.
    :param uri_prefix_priority: Forwarded to :func:`get_records`.
    :param include_prefixes: Forwarded to :func:`get_records`.
    :param strict: Forwarded to :func:`get_records`.
    :param blacklist: Forwarded to :func:`get_records`.
    :param remapping: Forwarded to :func:`get_records`.
    :returns: A converter constructed from the generated records.
    """
    record_options = dict(
        prefix_priority=prefix_priority,
        uri_prefix_priority=uri_prefix_priority,
        include_prefixes=include_prefixes,
        strict=strict,
        blacklist=blacklist,
        remapping=remapping,
    )
    return curies.Converter(get_records(resources, **record_options))


def get_records( # noqa: C901
resources: List[Resource],
prefix_priority: Optional[Sequence[str]] = None,
Expand Down
Loading
Loading