diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index b731889320..dbc340f8be 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -5,6 +5,7 @@ from tempfile import TemporaryDirectory from defusedxml import ElementTree +from django.conf import settings from learning_resources.constants import LearningResourceType, PlatformType from learning_resources.etl.constants import ETLSource @@ -24,10 +25,14 @@ def sync_canvas_archive(bucket, key: str, overwrite): """ from learning_resources.etl.loaders import load_content_files + course_folder = key.lstrip(settings.CANVAS_COURSE_BUCKET_PREFIX).split("/")[0] + with TemporaryDirectory() as export_tempdir: course_archive_path = Path(export_tempdir, key.split("/")[-1]) bucket.download_file(key, course_archive_path) - run = run_for_canvas_archive(course_archive_path, overwrite=overwrite) + resource_readable_id, run = run_for_canvas_archive( + course_archive_path, course_folder=course_folder, overwrite=overwrite + ) checksum = calc_checksum(course_archive_path) if run: load_content_files( @@ -38,16 +43,17 @@ def sync_canvas_archive(bucket, key: str, overwrite): ) run.checksum = checksum run.save() + return resource_readable_id, run -def run_for_canvas_archive(course_archive_path, overwrite): +def run_for_canvas_archive(course_archive_path, course_folder, overwrite): """ Generate and return a LearningResourceRun for a Canvas course """ checksum = calc_checksum(course_archive_path) course_info = parse_canvas_settings(course_archive_path) course_title = course_info.get("title") - readable_id = course_info.get("course_code") + readable_id = f"{course_folder}-{course_info.get('course_code')}" # create placeholder learning resource resource, _ = LearningResource.objects.update_or_create( readable_id=readable_id, @@ -64,15 +70,18 @@ def run_for_canvas_archive(course_archive_path, overwrite): ) if resource.runs.count() == 0: LearningResourceRun.objects.create( - run_id=f"{readable_id}+canvas", learning_resource=resource, published=True + run_id=f"{readable_id}+canvas", + learning_resource=resource, + published=True, ) run = resource.runs.first() + resource_readable_id = run.learning_resource.readable_id if run.checksum == checksum and not overwrite: log.info("Checksums match for %s, skipping load", readable_id) - return None + return resource_readable_id, None run.checksum = checksum run.save() - return run + return resource_readable_id, run def parse_canvas_settings(course_archive_path): diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py index 8664c4c4bf..521d9bc7a0 100644 --- a/learning_resources/etl/canvas_test.py +++ b/learning_resources/etl/canvas_test.py @@ -83,6 +83,7 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker): Test that run_for_canvas_archive creates a LearningResource and Run when given a valid canvas archive. """ + course_folder = "test" mocker.patch( "learning_resources.etl.canvas.parse_canvas_settings", return_value={"title": "Test Course", "course_code": "TEST101"}, @@ -91,8 +92,10 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker): # No resource exists yet course_archive_path = tmp_path / "archive.zip" course_archive_path.write_text("dummy") - run = run_for_canvas_archive(course_archive_path, overwrite=True) - resource = LearningResource.objects.get(readable_id="TEST101") + _, run = run_for_canvas_archive( + course_archive_path, course_folder=course_folder, overwrite=True + ) + resource = LearningResource.objects.get(readable_id=f"{course_folder}-TEST101") assert resource.title == "Test Course" assert resource.etl_source == ETLSource.canvas.name assert resource.resource_type == LearningResourceType.course.name @@ -107,6 +110,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): """ Test that run_for_canvas_archive creates a Run if no runs exist for the resource. """ + course_folder = "test" mocker.patch( "learning_resources.etl.canvas.parse_canvas_settings", return_value={"title": "Test Course", "course_code": "TEST104"}, @@ -116,7 +120,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): ) # Create resource with no runs resource = LearningResourceFactory.create( - readable_id="TEST104", + readable_id=f"{course_folder}-TEST104", title="Test Course", etl_source=ETLSource.canvas.name, resource_type=LearningResourceType.course.name, @@ -126,7 +130,9 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): assert resource.runs.count() == 0 course_archive_path = tmp_path / "archive4.zip" course_archive_path.write_text("dummy") - run = run_for_canvas_archive(course_archive_path, overwrite=True) + _, run = run_for_canvas_archive( + course_archive_path, course_folder=course_folder, overwrite=True + ) assert run is not None assert run.learning_resource == resource assert run.checksum == "checksum104" diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py index b8b854e5c2..51f944f014 100644 --- a/learning_resources/tasks.py +++ b/learning_resources/tasks.py @@ -30,7 +30,11 @@ ) from learning_resources.models import ContentFile, LearningResource from learning_resources.site_scrapers.utils import scraper_for_site -from learning_resources.utils import html_to_markdown, load_course_blocklist +from learning_resources.utils import ( + html_to_markdown, + load_course_blocklist, + resource_unpublished_actions, +) from learning_resources_search.exceptions import RetryError from main.celery import app from main.constants import ISOFORMAT @@ -473,7 +477,7 @@ def summarize_unprocessed_content( @app.task(acks_late=True) def ingest_canvas_course(archive_path, overwrite): bucket = get_learning_course_bucket(ETLSource.canvas.name) - sync_canvas_archive(bucket, archive_path, overwrite=overwrite) + return sync_canvas_archive(bucket, archive_path, overwrite=overwrite) @app.task(acks_late=True) @@ -499,14 +503,22 @@ def sync_canvas_courses(overwrite): == archive.last_modified ): latest_archives[course_folder] = archive + canvas_readable_ids = [] for archive in latest_archives.values(): key = archive.key log.info("Ingesting canvas course %s", key) - ingest_canvas_course( + resource_readable_id, canvas_run = ingest_canvas_course( key, overwrite=overwrite, ) + canvas_readable_ids.append(resource_readable_id) + stale_courses = LearningResource.objects.filter( + etl_source=ETLSource.canvas.name + ).exclude(readable_id__in=canvas_readable_ids) + stale_courses.update(test_mode=False, published=False) + [resource_unpublished_actions(resource) for resource in stale_courses] + stale_courses.delete() @app.task(bind=True) diff --git a/learning_resources/tasks_test.py b/learning_resources/tasks_test.py index c5aaa651cf..5d5c374345 100644 --- a/learning_resources/tasks_test.py +++ b/learning_resources/tasks_test.py @@ -17,14 +17,17 @@ from learning_resources.factories import ( LearningResourceFactory, ) +from learning_resources.models import LearningResource from learning_resources.tasks import ( get_ocw_data, get_youtube_data, get_youtube_transcripts, marketing_page_for_resources, scrape_marketing_pages, + sync_canvas_courses, update_next_start_date_and_prices, ) +from main.utils import now_in_utc pytestmark = pytest.mark.django_db # pylint:disable=redefined-outer-name,unused-argument,too-many-arguments @@ -606,3 +609,62 @@ def test_scrape_marketing_pages(mocker, settings, mocked_celery): eid in mock_marketing_page_task.mock_calls[0].args[0] for eid in expected_ids ) mock_group.assert_called_once() + + +def test_sync_canvas_courses_removes_stale_resources( + settings, mocker, django_assert_num_queries +): + """ + sync_canvas_courses should unpublish and delete stale canvas LearningResources + """ + settings.CANVAS_COURSE_BUCKET_PREFIX = "canvas/" + mocker.patch("learning_resources.tasks.resource_unpublished_actions") + mocker.patch("learning_resources.tasks.get_learning_course_bucket") + mock_bucket = mocker.Mock() + mock_archive1 = mocker.Mock() + mock_archive1.key = "canvas/course1/archive1.zip" + mock_archive1.last_modified = now_in_utc() + mock_archive2 = mocker.Mock() + mock_archive2.key = "canvas/course2/archive2.zip" + mock_archive2.last_modified = now_in_utc() - timedelta(days=1) + mock_bucket.objects.filter.return_value = [mock_archive1, mock_archive2] + mocker.patch( + "learning_resources.tasks.get_learning_course_bucket", return_value=mock_bucket + ) + + # Create two canvas LearningResources - one stale + + lr1 = LearningResourceFactory.create( + readable_id="course1", + etl_source=ETLSource.canvas.name, + published=True, + test_mode=True, + resource_type="course", + ) + lr2 = LearningResourceFactory.create( + readable_id="course2", + etl_source=ETLSource.canvas.name, + published=True, + test_mode=True, + resource_type="course", + ) + lr_stale = LearningResourceFactory.create( + readable_id="course3", + etl_source=ETLSource.canvas.name, + published=True, + test_mode=True, + resource_type="course", + ) + + # Patch ingest_canvas_course to return the readable_ids for the two non-stale courses + mocker.patch( + "learning_resources.tasks.ingest_canvas_course", + side_effect=[("course1", lr1.runs.first()), ("course2", lr2.runs.first())], + ) + sync_canvas_courses(overwrite=False) + + # The stale course should be unpublished and deleted + assert not LearningResource.objects.filter(id=lr_stale.id).exists() + # The non-stale courses should still exist + assert LearningResource.objects.filter(id=lr1.id).exists() + assert LearningResource.objects.filter(id=lr2.id).exists()