From 39029ed2e2515ffdec066a529c4d325ec0333e37 Mon Sep 17 00:00:00 2001 From: Rebecca Pearce <17481621+beccapearce@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:28:39 +0100 Subject: [PATCH 1/6] Add local_env to the gitignore Local env is used only for locally testing, this is to ensure secrets are not accidentally pushed. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 2a2300ea4..ed95b434b 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,6 @@ # vim swap files and tags *.sw[a-z] /tags + +# Ignore local config +config/local_env.yml From 4cc49010a43b43529f2f52d1e72d4eb9727c9a77 Mon Sep 17 00:00:00 2001 From: Rebecca Pearce <17481621+beccapearce@users.noreply.github.com> Date: Fri, 6 Sep 2024 08:51:05 +0100 Subject: [PATCH 2/6] Initial BigQuery setup - Add the BigQuery gem - Create a BigQuery service so the app can talk to BigQuery NB in order to use this you must add the credentials to config/local_env.yml. There are instructions on how to do this in the [dev docs](https://docs.publishing.service.gov.uk/repos/content-data-api/google_analytics_setup.html) --- Gemfile | 1 + Gemfile.lock | 49 ++++++++++++++++++++++++++++++ app/services/bigquery.rb | 25 +++++++++++++++ config/environments/development.rb | 9 ++++++ 4 files changed, 84 insertions(+) create mode 100644 app/services/bigquery.rb diff --git a/Gemfile b/Gemfile index 5eb74850e..84f64036f 100644 --- a/Gemfile +++ b/Gemfile @@ -9,6 +9,7 @@ gem "dalli" gem "dartsass-rails" gem "faraday" gem "gds-api-adapters" +gem "google-cloud-bigquery" gem "govspeak" gem "govuk_ab_testing" gem "govuk_app_config" diff --git a/Gemfile.lock b/Gemfile.lock index 7ee0ab038..32da42a15 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -152,6 +152,7 @@ GEM sass-embedded (~> 1.63) date (3.3.4) debug_inspector (1.2.0) + declarative (0.0.20) diff-lcs (1.5.1) dig_rb (1.0.1) docile (1.4.0) @@ -186,11 +187,41 @@ GEM rest-client (~> 2.0) globalid (1.2.1) activesupport (>= 6.1) + google-apis-bigquery_v2 (0.70.0) + google-apis-core (>= 0.15.0, < 2.a) + google-apis-core (0.15.0) + addressable (~> 2.5, >= 2.5.1) + googleauth (~> 1.9) + httpclient (>= 2.8.1, < 3.a) + mini_mime (~> 1.0) + representable (~> 3.0) + retriable (>= 2.0, < 4.a) + rexml + google-cloud-bigquery (1.49.0) + concurrent-ruby (~> 1.0) + google-apis-bigquery_v2 (~> 0.62) + google-apis-core (~> 0.13) + google-cloud-core (~> 1.6) + googleauth (~> 1.9) + mini_mime (~> 1.0) + google-cloud-core (1.7.0) + google-cloud-env (>= 1.0, < 3.a) + google-cloud-errors (~> 1.0) + google-cloud-env (2.1.1) + faraday (>= 1.0, < 3.a) + google-cloud-errors (1.4.0) google-protobuf (4.27.3) bigdecimal rake (>= 13) googleapis-common-protos-types (1.15.0) google-protobuf (>= 3.18, < 5.a) + googleauth (1.11.0) + faraday (>= 1.0, < 3.a) + google-cloud-env (~> 2.1) + jwt (>= 1.4, < 3.0) + multi_json (~> 1.11) + os (>= 0.9, < 2.0) + signet (>= 0.16, < 2.a) govspeak (8.3.4) actionview (>= 6) addressable (>= 2.3.8, < 3) @@ -245,6 +276,7 @@ GEM csv mini_mime (>= 1.0.0) multi_xml (>= 0.5.2) + httpclient (2.8.3) i18n (1.14.5) concurrent-ruby (~> 1.0) i18n-coverage (0.2.0) @@ -265,6 +297,8 @@ GEM json (2.7.2) json-schema (4.3.0) addressable (>= 2.8) + jwt (2.8.1) + base64 kramdown (2.4.0) rexml language_server-protocol (3.17.0.3) @@ -293,6 +327,7 @@ GEM mocha (2.4.5) ruby2_keywords (>= 0.0.5) msgpack (1.7.2) + multi_json (1.15.0) multi_test (1.1.0) multi_xml (0.7.1) bigdecimal (~> 3.1) @@ -514,6 +549,7 @@ GEM opentelemetry-semantic_conventions opentelemetry-semantic_conventions (1.10.1) opentelemetry-api (~> 1.0) + os (1.1.4) pact (1.65.1) pact-mock_service (~> 3.0, >= 3.3.1) pact-support (~> 1.16, >= 1.16.9) @@ -622,6 +658,10 @@ GEM regexp_parser (2.9.2) reline (0.5.9) io-console (~> 0.5) + representable (3.2.0) + declarative (< 0.1.0) + trailblazer-option (>= 0.1.1, < 0.2.0) + uber (< 0.2.0) request_store (1.7.0) rack (>= 1.4) rest-client (2.1.0) @@ -629,6 +669,7 @@ GEM http-cookie (>= 1.0.2, < 2.0) mime-types (>= 1.16, < 4.0) netrc (~> 0.8) + retriable (3.1.2) rexml (3.3.5) strscan rinku (2.0.6) @@ -705,6 +746,11 @@ GEM sentry-ruby (5.18.2) bigdecimal concurrent-ruby (~> 1.0, >= 1.0.2) + signet (0.19.0) + addressable (~> 2.8) + faraday (>= 0.17.5, < 3.a) + jwt (>= 1.5, < 3.0) + multi_json (~> 1.10) simplecov (0.22.0) docile (~> 1.1) simplecov-html (~> 0.11) @@ -747,8 +793,10 @@ GEM tins (1.33.0) bigdecimal sync + trailblazer-option (0.1.2) tzinfo (2.0.6) concurrent-ruby (~> 1.0) + uber (0.1.0) unicode-display_width (2.5.0) unparser (0.6.15) diff-lcs (~> 1.3) @@ -781,6 +829,7 @@ DEPENDENCIES erb_lint faraday gds-api-adapters + google-cloud-bigquery govspeak govuk_ab_testing govuk_app_config diff --git a/app/services/bigquery.rb b/app/services/bigquery.rb new file mode 100644 index 000000000..f07b16591 --- /dev/null +++ b/app/services/bigquery.rb @@ -0,0 +1,25 @@ +require "google/cloud/bigquery" +require "googleauth" + +class Bigquery + include Google::Auth + + def self.build + new.build + end + + def build + credentials = { + "client_email" => ENV["BIGQUERY_CLIENT_EMAIL"], + "private_key" => ENV["BIGQUERY_PRIVATE_KEY"], + } + + Google::Cloud::Bigquery.new( + project_id: ENV["BIGQUERY_PROJECT"], + credentials: Google::Auth::ServiceAccountCredentials.make_creds( + json_key_io: StringIO.new(credentials.to_json), + scope: ["https://www.googleapis.com/auth/bigquery"], + ), + ) + end +end diff --git a/config/environments/development.rb b/config/environments/development.rb index 17eab5130..0e2497bab 100644 --- a/config/environments/development.rb +++ b/config/environments/development.rb @@ -70,4 +70,13 @@ # Uncomment if you wish to allow Action Cable access from any origin. # config.action_cable.disable_request_forgery_protection = true + + config.before_configuration do + env_file = Rails.root.join("config/local_env.yml") + if File.exist?(env_file) + YAML.safe_load(File.open(env_file)).each do |key, value| + ENV[key.to_s] = value + end + end + end end From 046d6186287bd24b5366b515fedb0c0f131aa54e Mon Sep 17 00:00:00 2001 From: Rebecca Pearce <17481621+beccapearce@users.noreply.github.com> Date: Fri, 6 Sep 2024 09:07:14 +0100 Subject: [PATCH 3/6] Add a BigQuery query to get some initial data - Added a basic SQL query to retrieve some initial test data NB this query only currently works for a select time period and only on the benefits and business pages (shouldn't be too hard to change) --- app/services/popular_tasks.rb | 57 +++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 app/services/popular_tasks.rb diff --git a/app/services/popular_tasks.rb b/app/services/popular_tasks.rb new file mode 100644 index 000000000..e71f945f6 --- /dev/null +++ b/app/services/popular_tasks.rb @@ -0,0 +1,57 @@ +class PopularTasks + def initialize; end + + def client + @client ||= Bigquery.build + end + + def fetch_data(date: Date.yesterday) + @fetch_data = client + @date = date.strftime("%Y-%m-%d") + + query = <<~SQL + WITH cte1 as (SELECT + event_date, + event_name, + search_term, + cleaned_page_location, + cleaned_page_referrer, + link_url, + count(event_name) as click, + + FROM `ga4-analytics-352613.flattened_dataset.flattened_daily_ga_data_*` + WHERE _TABLE_SUFFIX = FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)) + -- WHERE _table_suffix IN ('20240708', '20240709','20240710','20240711','20240712','20240713','20240714') + group by 1,2,3,4,5,6), + + CTE2 as (SELECT + event_date, + sum(click) as clicks, + cleaned_page_referrer as BrowsePage, + search_term, + ROW_NUMBER() OVER(PARTITION BY cleaned_page_referrer ORDER BY click DESC) Rank, + link_url as SearchDestPage + FROM cte1 + WHERE event_name = 'select_item' + AND cleaned_page_referrer IN ('/browse/benefits','/browse/business') + AND cleaned_page_location = '/search/all' + group by click,event_date,cleaned_page_referrer,search_term,link_url + order by cleaned_page_referrer,Rank asc) + + SELECT + * + FROM CTE2 + WHERE Rank <6 + SQL + + data = @fetch_data.query(query).all + @results = data.map do |row| + { + url: row[:SearchDestPage], # Using SearchDestPage as the link URL + browse_page: row[:BrowsePage], # Using BrowsePage as the L1 browse + rank: row[:Rank], # Rank to order the links + } + end + @results.sort_by { |link| link[:rank] } # Order the links by their rank + end +end From 794a1c2124ab20d7d64764eff68cfedc3931d5b7 Mon Sep 17 00:00:00 2001 From: Rebecca Pearce <17481621+beccapearce@users.noreply.github.com> Date: Fri, 6 Sep 2024 09:42:56 +0100 Subject: [PATCH 4/6] Render popular tasks in view - Updated the view to render the results fetched from BigQuery - Simple unordered list displaying the search term as a link - Just making sure that the data can be passed from BigQuery to the view NB Still need to add the title into the link rendering --- app/helpers/browse_helper.rb | 37 +++++++++++++++------------------- app/views/browse/show.html.erb | 26 +++++++++++++++++++----- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/app/helpers/browse_helper.rb b/app/helpers/browse_helper.rb index 02328a263..fc7edaaa3 100644 --- a/app/helpers/browse_helper.rb +++ b/app/helpers/browse_helper.rb @@ -1,27 +1,22 @@ module BrowseHelper - def display_popular_links_for_slug?(slug) - I18n.exists?(slug.to_s, scope: "browse.popular_links") + def slug(path = base_path) + path.sub(%r{.*(?=/browse/)}, "") + end + + def display_popular_tasks_for_slug?(slug) + %w[benefits business].include?(slug) + end + + def select_browse_page(browse_page = "/browse/benefits") + browse_page = slug(browse_page) + popular_task_data = PopularTasks.new.fetch_data + popular_task_data.select { |link| link[:browse_page] == "/browse/#{browse_page}" } end def popular_links_for_slug(slug) - links = I18n.t(slug.to_s, scope: "browse.popular_links") - count = links.length - links.map.with_index(1) do |link, index| - { - text: link[:title], - href: link[:url], - data_attributes: { - module: "ga4-link-tracker", - ga4_track_links_only: "", - ga4_link: { - event_name: "navigation", - type: "action", - index_link: index, - index_total: count, - text: link[:title], - }, - }, - } - end + links = select_browse_page(slug) + return [] unless links + + links end end diff --git a/app/views/browse/show.html.erb b/app/views/browse/show.html.erb index 787441e74..dc6c6f3e2 100644 --- a/app/views/browse/show.html.erb +++ b/app/views/browse/show.html.erb @@ -34,7 +34,7 @@ } %> <% end %> -<% if display_popular_links_for_slug?(page.slug) %> +<% if display_popular_tasks_for_slug?(page.slug) %>
@@ -45,9 +45,25 @@ font_size: "m" } %>
    - <% popular_links_for_slug(page.slug).each do |link| %> + <% popular_links_for_slug(page.slug).each_with_index do |task, index| %>
  • - <%= render partial: "shared/browse_action_link", locals: {link:} %> + <%= render "govuk_publishing_components/components/action_link", { + text: task[:url], + href: task[:url], + dark_large_icon: true, + margin_bottom: 3, + data_attributes: { + module: "ga4-link-tracker", + ga4_track_links_only: "", + ga4_link: { + event_name: "navigation", + type: "action", + index_link: index + 1 , + index_total: popular_links_for_slug(page.slug).length, + text: task[:url] + } + } + } %>
  • <% end %>
@@ -60,7 +76,7 @@ <% total_links = page.second_level_browse_pages.count.to_s %> <%= render "shared/browse_cards_container" do %> <%= render "govuk_publishing_components/components/cards", { - heading: display_popular_links_for_slug?(page.slug) ? t("browse.topics") : nil, + heading: display_popular_tasks_for_slug?(page.slug) ? t("browse.topics") : nil, items: page.second_level_browse_pages.map.with_index do |second_level_browse_page, index| { link: { @@ -78,6 +94,6 @@ description: second_level_browse_page.description, } end, - sub_heading_level: display_popular_links_for_slug?(page.slug) ? 3 : 2, + sub_heading_level: display_popular_tasks_for_slug?(page.slug) ? 3 : 2, } %> <% end %> From bcebb6430cd13dda25fe584d2628c6a3bedef4dc Mon Sep 17 00:00:00 2001 From: Rebecca Pearce <17481621+beccapearce@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:14:42 +0100 Subject: [PATCH 5/6] Implement basic caching for the popular tasks - Cache the expensive process of retrieving popular tasks from BigQuery - Save this cache with a cache key that has the date and the browse page name. Sneaky change added in here that I'll move to a different commit: - Change the BigQuery data retrieval to only collect data for one browse at a time. --- app/helpers/browse_helper.rb | 21 ++++---- app/services/popular_tasks.rb | 96 +++++++++++++++++++---------------- 2 files changed, 64 insertions(+), 53 deletions(-) diff --git a/app/helpers/browse_helper.rb b/app/helpers/browse_helper.rb index fc7edaaa3..cf395e4eb 100644 --- a/app/helpers/browse_helper.rb +++ b/app/helpers/browse_helper.rb @@ -7,16 +7,19 @@ def display_popular_tasks_for_slug?(slug) %w[benefits business].include?(slug) end - def select_browse_page(browse_page = "/browse/benefits") - browse_page = slug(browse_page) - popular_task_data = PopularTasks.new.fetch_data - popular_task_data.select { |link| link[:browse_page] == "/browse/#{browse_page}" } - end - def popular_links_for_slug(slug) - links = select_browse_page(slug) - return [] unless links + browse_page = slug(slug) + + # Try to fetch the cache first + popular_task_data = Rails.cache.read("popular_tasks_#{browse_page}_#{Date.yesterday.strftime("%Y-%m-%d")}") + + # If cache is empty fetch fresh data and cache it + if popular_task_data.nil? + popular_task_data = PopularTasks.new.fetch_data("/browse/#{browse_page}") + end + + return [] unless popular_task_data - links + popular_task_data end end diff --git a/app/services/popular_tasks.rb b/app/services/popular_tasks.rb index e71f945f6..e96fa82d2 100644 --- a/app/services/popular_tasks.rb +++ b/app/services/popular_tasks.rb @@ -1,57 +1,65 @@ class PopularTasks + CACHE_EXPIRATION = 24.hours # Set the cache expiration time + def initialize; end def client @client ||= Bigquery.build end - def fetch_data(date: Date.yesterday) + def fetch_data(browse_page, date: Date.yesterday) @fetch_data = client @date = date.strftime("%Y-%m-%d") - query = <<~SQL - WITH cte1 as (SELECT - event_date, - event_name, - search_term, - cleaned_page_location, - cleaned_page_referrer, - link_url, - count(event_name) as click, - - FROM `ga4-analytics-352613.flattened_dataset.flattened_daily_ga_data_*` - WHERE _TABLE_SUFFIX = FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)) - -- WHERE _table_suffix IN ('20240708', '20240709','20240710','20240711','20240712','20240713','20240714') - group by 1,2,3,4,5,6), - - CTE2 as (SELECT - event_date, - sum(click) as clicks, - cleaned_page_referrer as BrowsePage, - search_term, - ROW_NUMBER() OVER(PARTITION BY cleaned_page_referrer ORDER BY click DESC) Rank, - link_url as SearchDestPage - FROM cte1 - WHERE event_name = 'select_item' - AND cleaned_page_referrer IN ('/browse/benefits','/browse/business') - AND cleaned_page_location = '/search/all' - group by click,event_date,cleaned_page_referrer,search_term,link_url - order by cleaned_page_referrer,Rank asc) - - SELECT - * - FROM CTE2 - WHERE Rank <6 - SQL - - data = @fetch_data.query(query).all - @results = data.map do |row| - { - url: row[:SearchDestPage], # Using SearchDestPage as the link URL - browse_page: row[:BrowsePage], # Using BrowsePage as the L1 browse - rank: row[:Rank], # Rank to order the links - } + # Define cache keys for the specific browse page + cache_key = "popular_tasks_#{browse_page}_#{@date}" + + Rails.cache.fetch(cache_key, expires_in: CACHE_EXPIRATION) do + # If cache is empty, this block is executed + query = <<~SQL + WITH cte1 as (SELECT + event_date, + event_name, + search_term, + cleaned_page_location, + cleaned_page_referrer, + link_url, + count(event_name) as click, + + FROM `ga4-analytics-352613.flattened_dataset.flattened_daily_ga_data_*` + WHERE _TABLE_SUFFIX = FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)) + -- WHERE _table_suffix IN ('20240708', '20240709','20240710','20240711','20240712','20240713','20240714') + group by 1,2,3,4,5,6), + + CTE2 as (SELECT + event_date, + sum(click) as clicks, + cleaned_page_referrer as BrowsePage, + search_term, + ROW_NUMBER() OVER(PARTITION BY cleaned_page_referrer ORDER BY click DESC) Rank, + link_url as SearchDestPage + FROM cte1 + WHERE event_name = 'select_item' + AND cleaned_page_referrer = '#{browse_page}' + AND cleaned_page_location = '/search/all' + group by click,event_date,cleaned_page_referrer,search_term,link_url + order by cleaned_page_referrer,Rank asc) + + SELECT + * + FROM CTE2 + WHERE Rank <6 + SQL + + data = @fetch_data.query(query).all + @results = data.map do |row| + { + url: row[:SearchDestPage], # Using SearchDestPage as the link URL + browse_page: row[:BrowsePage], # Using BrowsePage as the L1 browse + rank: row[:Rank], # Rank to order the links + } + end + @results.sort_by { |link| link[:rank] } # Order the links by their rank end - @results.sort_by { |link| link[:rank] } # Order the links by their rank end end From b6d0eff30cf2aabb90c0d8adbeca29121eca8788 Mon Sep 17 00:00:00 2001 From: Rebecca Pearce <17481621+beccapearce@users.noreply.github.com> Date: Fri, 6 Sep 2024 17:09:46 +0100 Subject: [PATCH 6/6] Implement backup cache for popular tasks Improve data availability bu having a backup cache if the latest is not availabile - Added a backup cache mechanism to ensure data is available even if the latest cache is expired or unavailable. - Popular tasks data is now stored in both a latest cache (24 hours expiration) and a backup cache (7 days expiration). - Fallback to backup cache when the latest cache is missing, ensuring users always see data even if fresh data retrieval fails. - Updated methods to handle cache fallback logic, improving robustness and reducing the likelihood of empty data responses. --- app/helpers/browse_helper.rb | 18 +++++++++++++----- app/services/popular_tasks.rb | 12 +++++++++--- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/app/helpers/browse_helper.rb b/app/helpers/browse_helper.rb index cf395e4eb..5f4d06bb5 100644 --- a/app/helpers/browse_helper.rb +++ b/app/helpers/browse_helper.rb @@ -10,15 +10,23 @@ def display_popular_tasks_for_slug?(slug) def popular_links_for_slug(slug) browse_page = slug(slug) - # Try to fetch the cache first - popular_task_data = Rails.cache.read("popular_tasks_#{browse_page}_#{Date.yesterday.strftime("%Y-%m-%d")}") + # Cache keys for the specific browse page + cache_key_latest = "popular_tasks_#{browse_page}_#{Date.yesterday.strftime("%Y-%m-%d")}" + cache_key_backup = "popular_tasks_backup_#{browse_page}" - # If cache is empty fetch fresh data and cache it + # Try to fetch the latest cache first + popular_task_data = Rails.cache.read(cache_key_latest) + + # If the latest cache doesn't exist, fall back to the backup cache if popular_task_data.nil? - popular_task_data = PopularTasks.new.fetch_data("/browse/#{browse_page}") + # Falling back to backup cache + popular_task_data = Rails.cache.read(cache_key_backup) end - return [] unless popular_task_data + # If both caches are empty, fetch fresh data and cache it + if popular_task_data.nil? + popular_task_data = PopularTasks.new.fetch_data("/browse/#{browse_page}") + end popular_task_data end diff --git a/app/services/popular_tasks.rb b/app/services/popular_tasks.rb index e96fa82d2..5cfee4d0f 100644 --- a/app/services/popular_tasks.rb +++ b/app/services/popular_tasks.rb @@ -1,5 +1,6 @@ class PopularTasks CACHE_EXPIRATION = 24.hours # Set the cache expiration time + BACKUP_CACHE_EXPIRATION = 7.days # Backup cache can have a longer expiration def initialize; end @@ -11,10 +12,10 @@ def fetch_data(browse_page, date: Date.yesterday) @fetch_data = client @date = date.strftime("%Y-%m-%d") - # Define cache keys for the specific browse page - cache_key = "popular_tasks_#{browse_page}_#{@date}" + cache_key_latest = "popular_tasks_#{browse_page}_#{@date}" + cache_key_backup = "popular_tasks_backup_#{browse_page}" - Rails.cache.fetch(cache_key, expires_in: CACHE_EXPIRATION) do + Rails.cache.fetch(cache_key_latest, expires_in: CACHE_EXPIRATION) do # If cache is empty, this block is executed query = <<~SQL WITH cte1 as (SELECT @@ -60,6 +61,11 @@ def fetch_data(browse_page, date: Date.yesterday) } end @results.sort_by { |link| link[:rank] } # Order the links by their rank + + # Cache the results in the backup cache as well + Rails.cache.write(cache_key_backup, @results, expires_in: BACKUP_CACHE_EXPIRATION) + + @results end end end