From 087de8dedf5dc0143ecb92ddb774f1ecf6097c28 Mon Sep 17 00:00:00 2001 From: Andy Jackson Date: Fri, 6 Nov 2020 19:42:22 +0000 Subject: [PATCH 1/5] Adding metrics for CDX indexing These two metrics should help us check that the CDX indexing is happening. --- stat-pusher/prod.stats | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/stat-pusher/prod.stats b/stat-pusher/prod.stats index 485642a..f8594a2 100644 --- a/stat-pusher/prod.stats +++ b/stat-pusher/prod.stats @@ -8,6 +8,14 @@ "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=*:*&wt=json", "match": "['response','numFound']" }, + "numFound": { + "host": "solr8", + "label": "cdx", + "desc": "Number of records in trackdb collection that are marked as cdx-indexed", + "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=cdx_index_ss%3A%5B*%20TO%20*%5D&sort=timestamp_dt%20desc&wt=json", + "kind": "json", + "match": "['response','numFound']" + }, "refresh_timestamp": { "host": "solr8", "label": "", @@ -15,6 +23,14 @@ "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=*:*&sort=refresh_date_dt%20desc&wt=json", "kind": "json", "match": "['response','docs','refresh_date_dt']" + }, + "last_timestamp": { + "host": "solr8", + "label": "cdx", + "desc": "Most recent trackdb timestamp that is marked as having been cdx-indexed", + "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=cdx_index_ss%3A%5B*%20TO%20*%5D&sort=timestamp_dt%20desc&wt=json", + "kind": "json", + "match": "['response','docs','timestamp_dt']" } } } From be7c2a6a8ab9a14c1bf48023890b4160366994a7 Mon Sep 17 00:00:00 2001 From: Andy Jackson Date: Fri, 6 Nov 2020 19:47:07 +0000 Subject: [PATCH 2/5] Add counter of missing files If records are no longer being updated, the files may have gone missing. 
--- stat-pusher/prod.stats | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/stat-pusher/prod.stats b/stat-pusher/prod.stats index f8594a2..d427841 100644 --- a/stat-pusher/prod.stats +++ b/stat-pusher/prod.stats @@ -8,14 +8,6 @@ "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=*:*&wt=json", "match": "['response','numFound']" }, - "numFound": { - "host": "solr8", - "label": "cdx", - "desc": "Number of records in trackdb collection that are marked as cdx-indexed", - "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=cdx_index_ss%3A%5B*%20TO%20*%5D&sort=timestamp_dt%20desc&wt=json", - "kind": "json", - "match": "['response','numFound']" - }, "refresh_timestamp": { "host": "solr8", "label": "", @@ -24,6 +16,22 @@ "kind": "json", "match": "['response','docs','refresh_date_dt']" }, + "numFound": { + "host": "solr8", + "label": "missing", + "desc": "Number of records in trackdb collection that appear to be missing as their records no longer refreshed", + "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=refresh_date_dt%3A%5B*%20TO%20NOW-1DAY%5D&sort=refresh_date_dt%20desc&wt=json", + "kind": "json", + "match": "['response','numFound']" + }, + "numFound": { + "host": "solr8", + "label": "cdx", + "desc": "Number of records in trackdb collection that are marked as cdx-indexed", + "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=cdx_index_ss%3A%5B*%20TO%20*%5D&sort=timestamp_dt%20desc&wt=json", + "kind": "json", + "match": "['response','numFound']" + }, "last_timestamp": { "host": "solr8", "label": "cdx", From a8aeaf90a9f8bcae4974f90410b9424e7d6e0cf0 Mon Sep 17 00:00:00 2001 From: Andy Jackson Date: Fri, 6 Nov 2020 21:33:05 +0000 Subject: [PATCH 3/5] Add CDX up-to-date check Add a check that queries the public CDX for the most recent timestamp of a page that should be updated every day. 
--- stat-pusher/prod.stats | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/stat-pusher/prod.stats b/stat-pusher/prod.stats index d427841..b358c22 100644 --- a/stat-pusher/prod.stats +++ b/stat-pusher/prod.stats @@ -40,5 +40,15 @@ "kind": "json", "match": "['response','docs','timestamp_dt']" } + }, + "cdx_oa_wayback" { + "last_timestamp": { + "host": "www.webarchive.org.uk", + "label": "cdx", + "desc": "Most recent CDX timestamp of a page that should be crawled every day (bl.uk/robots.txt)", + "uri": "https://www.webarchive.org.uk/wayback/archive/cdx?url=https%3A%2F%2Fwww.bl.uk%2Frobots.txt&output=json&allowFuzzy=false&sort=reverse&limit=1", + "kind": "json", + "match": "['timestamp']" + } } } From 2886d186ac5fb6b4f2c964f4803253376d400a63 Mon Sep 17 00:00:00 2001 From: Andy Jackson Date: Fri, 6 Nov 2020 21:57:49 +0000 Subject: [PATCH 4/5] Add WARC stats Records WARCs count and most recent WARC timestamp. --- stat-pusher/prod.stats | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/stat-pusher/prod.stats b/stat-pusher/prod.stats index b358c22..fcbbca2 100644 --- a/stat-pusher/prod.stats +++ b/stat-pusher/prod.stats @@ -24,6 +24,22 @@ "kind": "json", "match": "['response','numFound']" }, + "numFound": { + "host": "solr8", + "label": "warcs", + "desc": "Number of records in trackdb collection that are marked as WARCs", + "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=kind_s%3Awarcs&sort=timestamp_dt%20desc&wt=json", + "kind": "json", + "match": "['response','numFound']" + }, + "last_timestamp": { + "host": "solr8", + "label": "warcs", + "desc": "Most recent trackdb timestamp of the WARCs", + "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=kind_s%3Awarcs&sort=timestamp_dt%20desc&wt=json", + "kind": "json", + "match": "['response','docs','timestamp_dt']" + } "numFound": { "host": "solr8", "label": "cdx", From 4781b3c402d558ec5d84d35c1b9728521e6a381c Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Wed, 11 Nov 2020 
13:56:40 +0000 Subject: [PATCH 5/5] Testing and working on dev now. --- stat-pusher/README.md | 18 +++++++++++++ stat-pusher/dev.stats | 50 ++++++++++++++++++++++++++++++++++++ stat-pusher/prod.stats | 50 ------------------------------------ stat-pusher/requirements.txt | 1 - 4 files changed, 68 insertions(+), 51 deletions(-) create mode 100755 stat-pusher/README.md diff --git a/stat-pusher/README.md b/stat-pusher/README.md new file mode 100755 index 0000000..ddd2655 --- /dev/null +++ b/stat-pusher/README.md @@ -0,0 +1,18 @@ +Stats Pusher +============ + +Add new stats metrics to `dev.stats`, then use a pull-request to add them in. They should then be deployed across `beta` and then `prod`. + +To test them on the `dev` server, we need a `virtualenv`: + +``` +$ virtualenv -p python3 venv +``` + +Then run: + +``` +$ run_stat_pusher.sh dev +``` + +Which should grab the stats and push them to the DEV monitor. It relies on `gitlab/ukwa-monitor` to pick up environment variables. \ No newline at end of file diff --git a/stat-pusher/dev.stats b/stat-pusher/dev.stats index 485642a..1620430 100644 --- a/stat-pusher/dev.stats +++ b/stat-pusher/dev.stats @@ -15,6 +15,56 @@ "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=*:*&sort=refresh_date_dt%20desc&wt=json", "kind": "json", "match": "['response','docs','refresh_date_dt']" + }, + "numFound_missing": { + "host": "solr8", + "label": "missing", + "desc": "Number of records in trackdb collection that appear to be missing as their records no longer refreshed", + "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=refresh_date_dt%3A%5B*%20TO%20NOW-1DAY%5D&sort=refresh_date_dt%20desc&wt=json", + "kind": "json", + "match": "['response','numFound']" + }, + "numFound_warcs": { + "host": "solr8", + "label": "warcs", + "desc": "Number of records in trackdb collection that are marked as WARCs", + "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=kind_s%3Awarcs&sort=timestamp_dt%20desc&wt=json", + "kind": "json", + 
"match": "['response','numFound']" + }, + "last_timestamp_warcs": { + "host": "solr8", + "label": "warcs", + "desc": "Most recent trackdb timestamp of the WARCs", + "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=kind_s%3Awarcs&sort=timestamp_dt%20desc&wt=json", + "kind": "json", + "match": "['response','docs','timestamp_dt']" + }, + "numFound_cdx": { + "host": "solr8", + "label": "cdx", + "desc": "Number of records in trackdb collection that are marked as cdx-indexed", + "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=cdx_index_ss%3A%5B*%20TO%20*%5D&sort=timestamp_dt%20desc&wt=json", + "kind": "json", + "match": "['response','numFound']" + }, + "last_timestamp_cdx": { + "host": "solr8", + "label": "cdx", + "desc": "Most recent trackdb timestamp that is marked as having been cdx-indexed", + "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=cdx_index_ss%3A%5B*%20TO%20*%5D&sort=timestamp_dt%20desc&wt=json", + "kind": "json", + "match": "['response','docs','timestamp_dt']" + } + }, + "cdx_oa_wayback": { + "last_timestamp": { + "host": "www.webarchive.org.uk", + "label": "cdx", + "desc": "Most recent CDX timestamp of a page that should be crawled every day (bl.uk/robots.txt)", + "uri": "https://www.webarchive.org.uk/wayback/archive/cdx?url=https%3A%2F%2Fwww.bl.uk%2Frobots.txt&output=json&allowFuzzy=false&sort=reverse&limit=1", + "kind": "json", + "match": "['timestamp']" } } } diff --git a/stat-pusher/prod.stats b/stat-pusher/prod.stats index fcbbca2..485642a 100644 --- a/stat-pusher/prod.stats +++ b/stat-pusher/prod.stats @@ -15,56 +15,6 @@ "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=*:*&sort=refresh_date_dt%20desc&wt=json", "kind": "json", "match": "['response','docs','refresh_date_dt']" - }, - "numFound": { - "host": "solr8", - "label": "missing", - "desc": "Number of records in trackdb collection that appear to be missing as their records no longer refreshed", - "uri": 
"http://solr8.api.wa.bl.uk/solr/tracking/select?q=refresh_date_dt%3A%5B*%20TO%20NOW-1DAY%5D&sort=refresh_date_dt%20desc&wt=json", - "kind": "json", - "match": "['response','numFound']" - }, - "numFound": { - "host": "solr8", - "label": "warcs", - "desc": "Number of records in trackdb collection that are marked as WARCs", - "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=kind_s%3Awarcs&sort=timestamp_dt%20desc&wt=json", - "kind": "json", - "match": "['response','numFound']" - }, - "last_timestamp": { - "host": "solr8", - "label": "warcs", - "desc": "Most recent trackdb timestamp of the WARCs", - "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=kind_s%3Awarcs&sort=timestamp_dt%20desc&wt=json", - "kind": "json", - "match": "['response','docs','timestamp_dt']" - } - "numFound": { - "host": "solr8", - "label": "cdx", - "desc": "Number of records in trackdb collection that are marked as cdx-indexed", - "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=cdx_index_ss%3A%5B*%20TO%20*%5D&sort=timestamp_dt%20desc&wt=json", - "kind": "json", - "match": "['response','numFound']" - }, - "last_timestamp": { - "host": "solr8", - "label": "cdx", - "desc": "Most recent trackdb timestamp that is marked as having been cdx-indexed", - "uri": "http://solr8.api.wa.bl.uk/solr/tracking/select?q=cdx_index_ss%3A%5B*%20TO%20*%5D&sort=timestamp_dt%20desc&wt=json", - "kind": "json", - "match": "['response','docs','timestamp_dt']" - } - }, - "cdx_oa_wayback" { - "last_timestamp": { - "host": "www.webarchive.org.uk", - "label": "cdx", - "desc": "Most recent CDX timestamp of a page that should be crawled every day (bl.uk/robots.txt)", - "uri": "https://www.webarchive.org.uk/wayback/archive/cdx?url=https%3A%2F%2Fwww.bl.uk%2Frobots.txt&output=json&allowFuzzy=false&sort=reverse&limit=1", - "kind": "json", - "match": "['timestamp']" } } } diff --git a/stat-pusher/requirements.txt b/stat-pusher/requirements.txt index 9885d89..21b1c2c 100644 --- a/stat-pusher/requirements.txt +++ 
b/stat-pusher/requirements.txt @@ -1,5 +1,4 @@ prometheus_client requests -urllib python_dateutil datetime