
canvas - separate courses per session #2347


Open · wants to merge 10 commits into base: main
21 changes: 15 additions & 6 deletions learning_resources/etl/canvas.py
@@ -5,6 +5,7 @@
from tempfile import TemporaryDirectory

from defusedxml import ElementTree
from django.conf import settings

from learning_resources.constants import LearningResourceType, PlatformType
from learning_resources.etl.constants import ETLSource
@@ -24,10 +25,14 @@ def sync_canvas_archive(bucket, key: str, overwrite):
"""
from learning_resources.etl.loaders import load_content_files

course_folder = key.lstrip(settings.CANVAS_COURSE_BUCKET_PREFIX).split("/")[0]

with TemporaryDirectory() as export_tempdir:
course_archive_path = Path(export_tempdir, key.split("/")[-1])
bucket.download_file(key, course_archive_path)
run = run_for_canvas_archive(course_archive_path, overwrite=overwrite)
resource_readable_id, run = run_for_canvas_archive(
course_archive_path, course_folder=course_folder, overwrite=overwrite
)
checksum = calc_checksum(course_archive_path)
if run:
load_content_files(
@@ -38,16 +43,17 @@ def sync_canvas_archive(bucket, key: str, overwrite):
)
run.checksum = checksum
run.save()
return resource_readable_id, run


def run_for_canvas_archive(course_archive_path, overwrite):
def run_for_canvas_archive(course_archive_path, course_folder, overwrite):
"""
Generate and return a LearningResourceRun for a Canvas course
"""
checksum = calc_checksum(course_archive_path)
course_info = parse_canvas_settings(course_archive_path)
course_title = course_info.get("title")
readable_id = course_info.get("course_code")
readable_id = f"{course_folder}-{course_info.get('course_code')}"
# create placeholder learning resource
resource, _ = LearningResource.objects.update_or_create(
readable_id=readable_id,
@@ -64,15 +70,18 @@ def run_for_canvas_archive(course_archive_path, overwrite):
)
if resource.runs.count() == 0:
LearningResourceRun.objects.create(
run_id=f"{readable_id}+canvas", learning_resource=resource, published=True
run_id=f"{readable_id}+canvas",
learning_resource=resource,
published=True,
Member:
New session-specific courses were created, but the two older courses from an earlier command run are still present with test_mode=True. I think any courses not found in the archives during a command run should probably be deleted and/or have test_mode set to False (in addition to published=False), with all of their content files deleted or unpublished from regular and vector search.

Contributor (author):
I added deletion of stale canvas courses. Retrieving the current course ids requires going into the archive, so I funnel the relevant readable ids up and do a deletion + deindex at the end of the command. This only applies when running the management command, since the webhook just tells us to add a new entry.
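
For context, a condensed sketch of that flow (the names mirror the sync_canvas_courses diff below; this is an illustration of the approach, not the exact code):

    # Collect the readable_id reported for every archive ingested in this run.
    canvas_readable_ids = []
    for archive in latest_archives.values():
        readable_id, _run = ingest_canvas_course(archive.key, overwrite=overwrite)
        canvas_readable_ids.append(readable_id)

    # Any canvas resource not seen in this pass is stale: take it out of test
    # mode, unpublish it, run the unpublish side effects (search/vector
    # deindex via resource_unpublished_actions), then delete it.
    stale = LearningResource.objects.filter(
        etl_source=ETLSource.canvas.name
    ).exclude(readable_id__in=canvas_readable_ids)
    stale.update(test_mode=False, published=False)
    for resource in stale:
        resource_unpublished_actions(resource)
    stale.delete()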

)
run = resource.runs.first()
resource_readable_id = run.learning_resource.readable_id
if run.checksum == checksum and not overwrite:
log.info("Checksums match for %s, skipping load", readable_id)
return None
return resource_readable_id, None
run.checksum = checksum
run.save()
return run
return resource_readable_id, run


def parse_canvas_settings(course_archive_path):
14 changes: 10 additions & 4 deletions learning_resources/etl/canvas_test.py
@@ -83,6 +83,7 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker):
Test that run_for_canvas_archive creates a LearningResource and Run
when given a valid canvas archive.
"""
course_folder = "test"
mocker.patch(
"learning_resources.etl.canvas.parse_canvas_settings",
return_value={"title": "Test Course", "course_code": "TEST101"},
@@ -91,8 +92,10 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker):
# No resource exists yet
course_archive_path = tmp_path / "archive.zip"
course_archive_path.write_text("dummy")
run = run_for_canvas_archive(course_archive_path, overwrite=True)
resource = LearningResource.objects.get(readable_id="TEST101")
_, run = run_for_canvas_archive(
course_archive_path, course_folder=course_folder, overwrite=True
)
resource = LearningResource.objects.get(readable_id=f"{course_folder}-TEST101")
assert resource.title == "Test Course"
assert resource.etl_source == ETLSource.canvas.name
assert resource.resource_type == LearningResourceType.course.name
@@ -107,6 +110,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker):
"""
Test that run_for_canvas_archive creates a Run if no runs exist for the resource.
"""
course_folder = "test"
mocker.patch(
"learning_resources.etl.canvas.parse_canvas_settings",
return_value={"title": "Test Course", "course_code": "TEST104"},
@@ -116,7 +120,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker):
)
# Create resource with no runs
resource = LearningResourceFactory.create(
readable_id="TEST104",
readable_id=f"{course_folder}-TEST104",
title="Test Course",
etl_source=ETLSource.canvas.name,
resource_type=LearningResourceType.course.name,
@@ -126,7 +130,9 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker):
assert resource.runs.count() == 0
course_archive_path = tmp_path / "archive4.zip"
course_archive_path.write_text("dummy")
run = run_for_canvas_archive(course_archive_path, overwrite=True)
_, run = run_for_canvas_archive(
course_archive_path, course_folder=course_folder, overwrite=True
)
assert run is not None
assert run.learning_resource == resource
assert run.checksum == "checksum104"
18 changes: 15 additions & 3 deletions learning_resources/tasks.py
@@ -30,7 +30,11 @@
)
from learning_resources.models import ContentFile, LearningResource
from learning_resources.site_scrapers.utils import scraper_for_site
from learning_resources.utils import html_to_markdown, load_course_blocklist
from learning_resources.utils import (
html_to_markdown,
load_course_blocklist,
resource_unpublished_actions,
)
from learning_resources_search.exceptions import RetryError
from main.celery import app
from main.constants import ISOFORMAT
@@ -473,7 +477,7 @@ def summarize_unprocessed_content(
@app.task(acks_late=True)
def ingest_canvas_course(archive_path, overwrite):
bucket = get_learning_course_bucket(ETLSource.canvas.name)
sync_canvas_archive(bucket, archive_path, overwrite=overwrite)
return sync_canvas_archive(bucket, archive_path, overwrite=overwrite)


@app.task(acks_late=True)
@@ -499,14 +503,22 @@ def sync_canvas_courses(overwrite):
== archive.last_modified
):
latest_archives[course_folder] = archive
canvas_readable_ids = []

for archive in latest_archives.values():
key = archive.key
log.info("Ingesting canvas course %s", key)
ingest_canvas_course(
resource_readable_id, canvas_run = ingest_canvas_course(
key,
overwrite=overwrite,
)
canvas_readable_ids.append(resource_readable_id)
stale_courses = LearningResource.objects.filter(
etl_source=ETLSource.canvas.name
).exclude(readable_id__in=canvas_readable_ids)
stale_courses.update(test_mode=False, published=False)
[resource_unpublished_actions(resource) for resource in stale_courses]
stale_courses.delete()


@app.task(bind=True)
62 changes: 62 additions & 0 deletions learning_resources/tasks_test.py
@@ -17,14 +17,17 @@
from learning_resources.factories import (
LearningResourceFactory,
)
from learning_resources.models import LearningResource
from learning_resources.tasks import (
get_ocw_data,
get_youtube_data,
get_youtube_transcripts,
marketing_page_for_resources,
scrape_marketing_pages,
sync_canvas_courses,
update_next_start_date_and_prices,
)
from main.utils import now_in_utc

pytestmark = pytest.mark.django_db
# pylint:disable=redefined-outer-name,unused-argument,too-many-arguments
@@ -606,3 +609,62 @@ def test_scrape_marketing_pages(mocker, settings, mocked_celery):
eid in mock_marketing_page_task.mock_calls[0].args[0] for eid in expected_ids
)
mock_group.assert_called_once()


def test_sync_canvas_courses_removes_stale_resources(
settings, mocker, django_assert_num_queries
):
"""
sync_canvas_courses should unpublish and delete stale canvas LearningResources
"""
settings.CANVAS_COURSE_BUCKET_PREFIX = "canvas/"
mocker.patch("learning_resources.tasks.resource_unpublished_actions")
mocker.patch("learning_resources.tasks.get_learning_course_bucket")
mock_bucket = mocker.Mock()
mock_archive1 = mocker.Mock()
mock_archive1.key = "canvas/course1/archive1.zip"
mock_archive1.last_modified = now_in_utc()
mock_archive2 = mocker.Mock()
mock_archive2.key = "canvas/course2/archive2.zip"
mock_archive2.last_modified = now_in_utc() - timedelta(days=1)
mock_bucket.objects.filter.return_value = [mock_archive1, mock_archive2]
mocker.patch(
"learning_resources.tasks.get_learning_course_bucket", return_value=mock_bucket
)

# Create two canvas LearningResources - one stale

lr1 = LearningResourceFactory.create(
readable_id="course1",
etl_source=ETLSource.canvas.name,
published=True,
test_mode=True,
resource_type="course",
)
lr2 = LearningResourceFactory.create(
readable_id="course2",
etl_source=ETLSource.canvas.name,
published=True,
test_mode=True,
resource_type="course",
)
lr_stale = LearningResourceFactory.create(
readable_id="course3",
etl_source=ETLSource.canvas.name,
published=True,
test_mode=True,
resource_type="course",
)

# Patch ingest_canvas_course to return the readable_ids for the two non-stale courses
mocker.patch(
"learning_resources.tasks.ingest_canvas_course",
side_effect=[("course1", lr1.runs.first()), ("course2", lr2.runs.first())],
)
sync_canvas_courses(overwrite=False)

# The stale course should be unpublished and deleted
assert not LearningResource.objects.filter(id=lr_stale.id).exists()
# The non-stale courses should still exist
assert LearningResource.objects.filter(id=lr1.id).exists()
assert LearningResource.objects.filter(id=lr2.id).exists()