From 786ad40687b584ce2cc7f4edc53f67a5e1bd0f19 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 10 Jul 2025 12:29:09 -0400 Subject: [PATCH 1/9] adding course folder into key for canvas course learning resources --- learning_resources/etl/canvas.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 911ba93efa..a99e76650e 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -5,6 +5,7 @@ from tempfile import TemporaryDirectory from defusedxml import ElementTree +from django.conf import settings from learning_resources.constants import LearningResourceType from learning_resources.etl.constants import ETLSource @@ -20,10 +21,14 @@ def sync_canvas_archive(bucket, key: str, overwrite): """ from learning_resources.etl.loaders import load_content_files + course_folder = key.lstrip(settings.CANVAS_COURSE_BUCKET_PREFIX).split("/")[0] + with TemporaryDirectory() as export_tempdir: course_archive_path = Path(export_tempdir, key.split("/")[-1]) bucket.download_file(key, course_archive_path) - run = run_for_canvas_archive(course_archive_path, overwrite=overwrite) + run = run_for_canvas_archive( + course_archive_path, course_folder=course_folder, overwrite=overwrite + ) checksum = calc_checksum(course_archive_path) if run: load_content_files( @@ -36,7 +41,7 @@ def sync_canvas_archive(bucket, key: str, overwrite): run.save() -def run_for_canvas_archive(course_archive_path, overwrite): +def run_for_canvas_archive(course_archive_path, course_folder, overwrite): """ Generate and return a LearningResourceRun for a Canvas course """ @@ -46,7 +51,7 @@ def run_for_canvas_archive(course_archive_path, overwrite): readable_id = course_info.get("course_code") # create placeholder learning resource resource, _ = LearningResource.objects.get_or_create( - readable_id=readable_id, + readable_id=f"{course_folder}+{readable_id}", defaults={ "title": course_title, "published": False, @@ -57,7 +62,9 @@ def run_for_canvas_archive(course_archive_path, overwrite): ) if resource.runs.count() == 0: LearningResourceRun.objects.create( - run_id=f"{readable_id}+canvas", learning_resource=resource, published=True + run_id=f"{course_folder}+{readable_id}+canvas", + learning_resource=resource, + published=True, ) run = resource.runs.first() if run.checksum == checksum and not overwrite: From 97d3634815d8d1954013c7b19d10bdf2043def10 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 10 Jul 2025 12:48:41 -0400 Subject: [PATCH 2/9] fixing test --- learning_resources/etl/utils_test.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index 31c38d4ad9..19932058e0 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -632,7 +632,9 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker): # No resource exists yet course_archive_path = tmp_path / "archive.zip" course_archive_path.write_text("dummy") - run = run_for_canvas_archive(course_archive_path, overwrite=True) + run = run_for_canvas_archive( + course_archive_path, course_folder="test", overwrite=True + ) resource = LearningResource.objects.get(readable_id="TEST101") assert resource.title == "Test Course" assert resource.etl_source == ETLSource.canvas.name @@ -666,7 +668,9 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): assert resource.runs.count() == 0 course_archive_path = tmp_path / "archive4.zip" course_archive_path.write_text("dummy") - run = run_for_canvas_archive(course_archive_path, overwrite=True) + run = run_for_canvas_archive( + course_archive_path, course_folder="test", overwrite=True + ) assert run is not None assert run.learning_resource == resource assert run.checksum == "checksum104" From 5b197568381d28ed871cecde2f7bad3ed426b1d7 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 10 Jul 2025 12:51:33 -0400 Subject: [PATCH 3/9] minor refactor --- learning_resources/etl/canvas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index a99e76650e..447c0e81fe 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -48,10 +48,10 @@ def run_for_canvas_archive(course_archive_path, course_folder, overwrite): checksum = calc_checksum(course_archive_path) course_info = parse_canvas_settings(course_archive_path) course_title = course_info.get("title") - readable_id = course_info.get("course_code") + readable_id = f"{course_folder}-{course_info.get('course_code')}" # create placeholder learning resource resource, _ = LearningResource.objects.get_or_create( - readable_id=f"{course_folder}+{readable_id}", + readable_id=readable_id, defaults={ "title": course_title, "published": False, @@ -62,7 +62,7 @@ def run_for_canvas_archive(course_archive_path, course_folder, overwrite): ) if resource.runs.count() == 0: LearningResourceRun.objects.create( - run_id=f"{course_folder}+{readable_id}+canvas", + run_id=f"{readable_id}+canvas", learning_resource=resource, published=True, ) From 6a99a43b99f5a8577bea45a958e7998e3a33794c Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 10 Jul 2025 13:14:25 -0400 Subject: [PATCH 4/9] fixing test --- learning_resources/etl/utils_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index 19932058e0..679553c79e 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -624,6 +624,7 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker): Test that run_for_canvas_archive creates a LearningResource and Run when given a valid canvas archive. """ + course_folder = "test" mocker.patch( "learning_resources.etl.canvas.parse_canvas_settings", return_value={"title": "Test Course", "course_code": "TEST101"}, @@ -635,7 +636,7 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker): run = run_for_canvas_archive( course_archive_path, course_folder="test", overwrite=True ) - resource = LearningResource.objects.get(readable_id="TEST101") + resource = LearningResource.objects.get(readable_id=f"{course_folder}-TEST101") assert resource.title == "Test Course" assert resource.etl_source == ETLSource.canvas.name assert resource.resource_type == LearningResourceType.course.name @@ -649,6 +650,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): """ Test that run_for_canvas_archive creates a Run if no runs exist for the resource. """ + course_folder = "test" mocker.patch( "learning_resources.etl.canvas.parse_canvas_settings", return_value={"title": "Test Course", "course_code": "TEST104"}, @@ -658,7 +660,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): ) # Create resource with no runs resource = LearningResourceFactory.create( - readable_id="TEST104", + readable_id=f"{course_folder}-TEST104", title="Test Course", etl_source=ETLSource.canvas.name, resource_type=LearningResourceType.course.name, @@ -669,7 +671,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): course_archive_path = tmp_path / "archive4.zip" course_archive_path.write_text("dummy") run = run_for_canvas_archive( - course_archive_path, course_folder="test", overwrite=True + course_archive_path, course_folder=course_folder, overwrite=True ) assert run is not None assert run.learning_resource == resource From 038aaecd15ba9cf73e2cf05830dad5f0f1758b36 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 11 Jul 2025 11:59:00 -0400 Subject: [PATCH 5/9] removing stale canvas courses --- learning_resources/etl/canvas.py | 8 +++++--- learning_resources/tasks.py | 21 ++++++++++++++++++--- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 447c0e81fe..9c651314b9 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -26,7 +26,7 @@ def sync_canvas_archive(bucket, key: str, overwrite): with TemporaryDirectory() as export_tempdir: course_archive_path = Path(export_tempdir, key.split("/")[-1]) bucket.download_file(key, course_archive_path) - run = run_for_canvas_archive( + resource_readable_id, run = run_for_canvas_archive( course_archive_path, course_folder=course_folder, overwrite=overwrite ) checksum = calc_checksum(course_archive_path) @@ -39,6 +39,7 @@ def sync_canvas_archive(bucket, key: str, overwrite): ) run.checksum = checksum run.save() + return resource_readable_id, run def run_for_canvas_archive(course_archive_path, course_folder, overwrite): @@ -67,12 +68,13 @@ def run_for_canvas_archive(course_archive_path, course_folder, overwrite): published=True, ) run = resource.runs.first() + resource_readable_id = run.learning_resource.readable_id if run.checksum == checksum and not overwrite: log.info("Checksums match for %s, skipping load", readable_id) - return None + return resource_readable_id, None run.checksum = checksum run.save() - return run + return resource_readable_id, run def parse_canvas_settings(course_archive_path): diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py index b8b854e5c2..78bd1a008e 100644 --- a/learning_resources/tasks.py +++ b/learning_resources/tasks.py @@ -12,6 +12,7 @@ from django.db.models import Q from django.utils import timezone +from learning_resources.constants import LearningResourceType from learning_resources.content_summarizer import ContentSummarizer from learning_resources.etl import pipelines, youtube from learning_resources.etl.canvas import ( @@ -30,7 +31,11 @@ ) from learning_resources.models import ContentFile, LearningResource from learning_resources.site_scrapers.utils import scraper_for_site -from learning_resources.utils import html_to_markdown, load_course_blocklist +from learning_resources.utils import ( + bulk_resources_unpublished_actions, + html_to_markdown, + load_course_blocklist, +) from learning_resources_search.exceptions import RetryError from main.celery import app from main.constants import ISOFORMAT @@ -473,7 +478,7 @@ def summarize_unprocessed_content( @app.task(acks_late=True) def ingest_canvas_course(archive_path, overwrite): bucket = get_learning_course_bucket(ETLSource.canvas.name) - sync_canvas_archive(bucket, archive_path, overwrite=overwrite) + return sync_canvas_archive(bucket, archive_path, overwrite=overwrite) @app.task(acks_late=True) @@ -499,14 +504,24 @@ def sync_canvas_courses(overwrite): == archive.last_modified ): latest_archives[course_folder] = archive + canvas_readable_ids = [] for archive in latest_archives.values(): key = archive.key log.info("Ingesting canvas course %s", key) - ingest_canvas_course( + resource_readable_id, canvas_run = ingest_canvas_course( key, overwrite=overwrite, ) + canvas_readable_ids.append(resource_readable_id) + stale_courses = LearningResource.objects.filter( + etl_source=ETLSource.canvas.name + ).exclude(readable_id__in=canvas_readable_ids) + stale_courses.update(test_mode=False, published=False) + bulk_resources_unpublished_actions( + stale_courses.values_list("id", flat=True), + LearningResourceType.course.name, + ) @app.task(bind=True) From 7dc1c8d7cfc0ba389ea627b67da79380f6906823 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 11 Jul 2025 12:52:40 -0400 Subject: [PATCH 6/9] adding test --- learning_resources/tasks.py | 1 + learning_resources/tasks_test.py | 62 ++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py index 78bd1a008e..353fb50355 100644 --- a/learning_resources/tasks.py +++ b/learning_resources/tasks.py @@ -522,6 +522,7 @@ def sync_canvas_courses(overwrite): stale_courses.values_list("id", flat=True), LearningResourceType.course.name, ) + stale_courses.delete() @app.task(bind=True) diff --git a/learning_resources/tasks_test.py b/learning_resources/tasks_test.py index c5aaa651cf..06c4354447 100644 --- a/learning_resources/tasks_test.py +++ b/learning_resources/tasks_test.py @@ -17,14 +17,17 @@ from learning_resources.factories import ( LearningResourceFactory, ) +from learning_resources.models import LearningResource from learning_resources.tasks import ( get_ocw_data, get_youtube_data, get_youtube_transcripts, marketing_page_for_resources, scrape_marketing_pages, + sync_canvas_courses, update_next_start_date_and_prices, ) +from main.utils import now_in_utc pytestmark = pytest.mark.django_db # pylint:disable=redefined-outer-name,unused-argument,too-many-arguments @@ -606,3 +609,62 @@ def test_scrape_marketing_pages(mocker, settings, mocked_celery): eid in mock_marketing_page_task.mock_calls[0].args[0] for eid in expected_ids ) mock_group.assert_called_once() + + +def test_sync_canvas_courses_removes_stale_resources( + settings, mocker, django_assert_num_queries +): + """ + sync_canvas_courses should unpublish and delete stale canvas LearningResources + """ + settings.CANVAS_COURSE_BUCKET_PREFIX = "canvas/" + mocker.patch("learning_resources.tasks.bulk_resources_unpublished_actions") + mocker.patch("learning_resources.tasks.get_learning_course_bucket") + mock_bucket = mocker.Mock() + mock_archive1 = mocker.Mock() + mock_archive1.key = "canvas/course1/archive1.zip" + mock_archive1.last_modified = now_in_utc() + mock_archive2 = mocker.Mock() + mock_archive2.key = "canvas/course2/archive2.zip" + mock_archive2.last_modified = now_in_utc() - timedelta(days=1) + mock_bucket.objects.filter.return_value = [mock_archive1, mock_archive2] + mocker.patch( + "learning_resources.tasks.get_learning_course_bucket", return_value=mock_bucket + ) + + # Create two canvas LearningResources - one stale + + lr1 = LearningResourceFactory.create( + readable_id="course1", + etl_source=ETLSource.canvas.name, + published=True, + test_mode=True, + resource_type="course", + ) + lr2 = LearningResourceFactory.create( + readable_id="course2", + etl_source=ETLSource.canvas.name, + published=True, + test_mode=True, + resource_type="course", + ) + lr_stale = LearningResourceFactory.create( + readable_id="course3", + etl_source=ETLSource.canvas.name, + published=True, + test_mode=True, + resource_type="course", + ) + + # Patch ingest_canvas_course to return the readable_ids for the two non-stale courses + mocker.patch( + "learning_resources.tasks.ingest_canvas_course", + side_effect=[("course1", lr1.runs.first()), ("course2", lr2.runs.first())], + ) + sync_canvas_courses(overwrite=False) + + # The stale course should be unpublished and deleted + assert not LearningResource.objects.filter(id=lr_stale.id).exists() + # The non-stale courses should still exist + assert LearningResource.objects.filter(id=lr1.id).exists() + assert LearningResource.objects.filter(id=lr2.id).exists() From b11a7902500a0d9d6fa421f829ce701918b37c74 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 11 Jul 2025 13:16:30 -0400 Subject: [PATCH 7/9] fixing test --- learning_resources/etl/utils_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index 679553c79e..f03ec80bd6 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -633,7 +633,7 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker): # No resource exists yet course_archive_path = tmp_path / "archive.zip" course_archive_path.write_text("dummy") - run = run_for_canvas_archive( + _, run = run_for_canvas_archive( course_archive_path, course_folder="test", overwrite=True ) resource = LearningResource.objects.get(readable_id=f"{course_folder}-TEST101") @@ -670,7 +670,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): assert resource.runs.count() == 0 course_archive_path = tmp_path / "archive4.zip" course_archive_path.write_text("dummy") - run = run_for_canvas_archive( + _, run = run_for_canvas_archive( course_archive_path, course_folder=course_folder, overwrite=True ) assert run is not None From c51927999cbbd6744af2272be886d9b6dae6ed82 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 11 Jul 2025 14:05:15 -0400 Subject: [PATCH 8/9] switching to resource unpublished actions --- learning_resources/tasks.py | 8 ++------ learning_resources/tasks_test.py | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py index 353fb50355..51f944f014 100644 --- a/learning_resources/tasks.py +++ b/learning_resources/tasks.py @@ -12,7 +12,6 @@ from django.db.models import Q from django.utils import timezone -from learning_resources.constants import LearningResourceType from learning_resources.content_summarizer import ContentSummarizer from learning_resources.etl import pipelines, youtube from learning_resources.etl.canvas import ( @@ -32,9 +31,9 @@ from learning_resources.models import ContentFile, LearningResource from learning_resources.site_scrapers.utils import scraper_for_site from learning_resources.utils import ( - bulk_resources_unpublished_actions, html_to_markdown, load_course_blocklist, + resource_unpublished_actions, ) from learning_resources_search.exceptions import RetryError from main.celery import app @@ -518,10 +517,7 @@ def sync_canvas_courses(overwrite): etl_source=ETLSource.canvas.name ).exclude(readable_id__in=canvas_readable_ids) stale_courses.update(test_mode=False, published=False) - bulk_resources_unpublished_actions( - stale_courses.values_list("id", flat=True), - LearningResourceType.course.name, - ) + [resource_unpublished_actions(resource) for resource in stale_courses] stale_courses.delete() diff --git a/learning_resources/tasks_test.py b/learning_resources/tasks_test.py index 06c4354447..5d5c374345 100644 --- a/learning_resources/tasks_test.py +++ b/learning_resources/tasks_test.py @@ -618,7 +618,7 @@ def test_sync_canvas_courses_removes_stale_resources( sync_canvas_courses should unpublish and delete stale canvas LearningResources """ settings.CANVAS_COURSE_BUCKET_PREFIX = "canvas/" - mocker.patch("learning_resources.tasks.bulk_resources_unpublished_actions") + mocker.patch("learning_resources.tasks.resource_unpublished_actions") mocker.patch("learning_resources.tasks.get_learning_course_bucket") mock_bucket = mocker.Mock() mock_archive1 = mocker.Mock() From be61261a86ac7c317fd9b56d96cc1577a1605734 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 11 Jul 2025 14:22:26 -0400 Subject: [PATCH 9/9] fixing tests --- learning_resources/etl/canvas_test.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py index 8664c4c4bf..521d9bc7a0 100644 --- a/learning_resources/etl/canvas_test.py +++ b/learning_resources/etl/canvas_test.py @@ -83,6 +83,7 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker): Test that run_for_canvas_archive creates a LearningResource and Run when given a valid canvas archive. """ + course_folder = "test" mocker.patch( "learning_resources.etl.canvas.parse_canvas_settings", return_value={"title": "Test Course", "course_code": "TEST101"}, @@ -91,8 +92,10 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker): # No resource exists yet course_archive_path = tmp_path / "archive.zip" course_archive_path.write_text("dummy") - run = run_for_canvas_archive(course_archive_path, overwrite=True) - resource = LearningResource.objects.get(readable_id="TEST101") + _, run = run_for_canvas_archive( + course_archive_path, course_folder=course_folder, overwrite=True + ) + resource = LearningResource.objects.get(readable_id=f"{course_folder}-TEST101") assert resource.title == "Test Course" assert resource.etl_source == ETLSource.canvas.name assert resource.resource_type == LearningResourceType.course.name @@ -107,6 +110,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): """ Test that run_for_canvas_archive creates a Run if no runs exist for the resource. """ + course_folder = "test" mocker.patch( "learning_resources.etl.canvas.parse_canvas_settings", return_value={"title": "Test Course", "course_code": "TEST104"}, @@ -116,7 +120,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): ) # Create resource with no runs resource = LearningResourceFactory.create( - readable_id="TEST104", + readable_id=f"{course_folder}-TEST104", title="Test Course", etl_source=ETLSource.canvas.name, resource_type=LearningResourceType.course.name, @@ -126,7 +130,9 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): assert resource.runs.count() == 0 course_archive_path = tmp_path / "archive4.zip" course_archive_path.write_text("dummy") - run = run_for_canvas_archive(course_archive_path, overwrite=True) + _, run = run_for_canvas_archive( + course_archive_path, course_folder=course_folder, overwrite=True + ) assert run is not None assert run.learning_resource == resource assert run.checksum == "checksum104"