Shanbady/summary flashcard sync #2339

Merged: 6 commits, merged Jul 11, 2025

14 changes: 10 additions & 4 deletions vector_search/utils.py

@@ -498,12 +498,18 @@ def embed_learning_resources(ids, resource_type, overwrite):
         points = _process_resource_embeddings(serialized_resources)
         _embed_course_metadata_as_contentfile(serialized_resources)
     else:
         # Process content files summaries/flashcards if applicable before serialization
-        # TODO: Pass actual Ids when we want scheduled content file summarization # noqa: FIX002, TD002, TD003 E501
-        ContentSummarizer().summarize_content_files_by_ids([], overwrite)
-
-        serialized_resources = list(serialize_bulk_content_files(ids))
+        serialized_resources = list(serialize_bulk_content_files(ids))
+        # Currently we only want to summarize content that already has a summary
+        existing_summary_content_ids = [
+            resource["id"]
+            for resource in serialized_resources
+            if resource.get("summary")
+        ]
+        ContentSummarizer().summarize_content_files_by_ids(
+            existing_summary_content_ids, overwrite
+        )

         collection_name = CONTENT_FILES_COLLECTION_NAME
         points = [
             (
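
For reference, the heart of the change is the selection rule: only serialized content files whose payload already carries a non-empty summary are handed to ContentSummarizer. A minimal standalone sketch of that rule, where the sample dicts are hypothetical stand-ins for serialize_bulk_content_files() output, not real serializer data:

# Minimal sketch of the filter introduced above; the sample dicts are
# illustrative stand-ins for serialize_bulk_content_files() output.
serialized_resources = [
    {"id": 1, "summary": "an existing summary"},  # kept
    {"id": 2, "summary": ""},                     # empty summary -> skipped
    {"id": 3},                                    # no summary key -> skipped
]

existing_summary_content_ids = [
    resource["id"]
    for resource in serialized_resources
    if resource.get("summary")
]

assert existing_summary_content_ids == [1]
# Only these ids would then be passed on, i.e.:
# ContentSummarizer().summarize_content_files_by_ids(existing_summary_content_ids, overwrite)
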
52 changes: 52 additions & 0 deletions vector_search/utils_test.py

@@ -583,3 +583,55 @@ def test_update_payload_no_points(mocker):
     update_content_file_payload(serialized_files[0])
     # Verify set_payload not called
     mock_qdrant.set_payload.assert_not_called()
+
+
+@pytest.mark.django_db
+def test_embed_learning_resources_summarizes_only_contentfiles_with_summary(mocker):
+    """
+    Test that summarize_content_files_by_ids is only called with contentfiles that have an existing summary
+    """
+    mock_qdrant = mocker.patch("qdrant_client.QdrantClient")
+    mocker.patch("vector_search.utils.qdrant_client", return_value=mock_qdrant)
+    mocker.patch("vector_search.utils.create_qdrant_collections")
+    mocker.patch("vector_search.utils._process_content_embeddings", return_value=None)
+    mocker.patch(
+        "vector_search.utils.filter_existing_qdrant_points_by_ids", return_value=[]
+    )
+    mocker.patch("vector_search.utils.remove_qdrant_records")
+
+    # Create ContentFiles, some with summary, some without
+    contentfiles_with_summary = ContentFileFactory.create_batch(
+        2, content="abc", summary="summary text"
+    )
+    contentfiles_without_summary = ContentFileFactory.create_batch(
+        3, content="def", summary=""
+    )
+    all_contentfiles = contentfiles_with_summary + contentfiles_without_summary
+
+    # Patch serialize_bulk_content_files to return dicts with/without summary
+    serialized = []
+    for cf in all_contentfiles:
+        d = {
+            "id": cf.id,
+            "resource_readable_id": getattr(cf, "resource_readable_id", "resid"),
+            "run_readable_id": getattr(cf, "run_readable_id", "runid"),
+            "key": getattr(cf, "key", "key"),
+            "summary": cf.summary,
+            "content": cf.content,
+            "checksum": "checksum",
+        }
+        serialized.append(d)
+    mocker.patch(
+        "vector_search.utils.serialize_bulk_content_files", return_value=serialized
+    )
+
+    summarize_mock = mocker.patch(
+        "vector_search.utils.ContentSummarizer.summarize_content_files_by_ids"
+    )
+    embed_learning_resources(
+        [cf.id for cf in all_contentfiles], "content_file", overwrite=True
+    )
+
+    # Only contentfiles with summary should be passed
+    expected_ids = [cf.id for cf in contentfiles_with_summary]
+    summarize_mock.assert_called_once_with(expected_ids, True)  # noqa: FBT003
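
As a side note, the final assertion relies on unittest.mock call recording (pytest-mock's mocker.patch returns a MagicMock by default). A tiny self-contained illustration of that check, outside Django and pytest-mock; "summarize" here is just a placeholder mock, not the real ContentSummarizer method:

from unittest.mock import MagicMock

# Placeholder mock standing in for the patched summarize_content_files_by_ids
summarize = MagicMock()
summarize([1, 2], True)  # the code under test calls the mock exactly once

# Passes only if the mock was called exactly once with exactly these arguments
summarize.assert_called_once_with([1, 2], True)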