diff --git a/vector_search/utils.py b/vector_search/utils.py
index 92a2243657..c100a95bb5 100644
--- a/vector_search/utils.py
+++ b/vector_search/utils.py
@@ -498,12 +498,18 @@ def embed_learning_resources(ids, resource_type, overwrite):
         points = _process_resource_embeddings(serialized_resources)
         _embed_course_metadata_as_contentfile(serialized_resources)
     else:
-        # Process content files summaries/flashcards if applicable before serialization
+        serialized_resources = list(serialize_bulk_content_files(ids))
         # TODO: Pass actual Ids when we want scheduled content file summarization  # noqa: FIX002, TD002, TD003 E501
+        # Currently we only want to summarize content that already has a summary
+        existing_summary_content_ids = [
+            resource["id"]
+            for resource in serialized_resources
+            if resource.get("summary")
+        ]
+        ContentSummarizer().summarize_content_files_by_ids(
+            existing_summary_content_ids, overwrite
+        )
-        ContentSummarizer().summarize_content_files_by_ids([], overwrite)
-
-        serialized_resources = list(serialize_bulk_content_files(ids))
         collection_name = CONTENT_FILES_COLLECTION_NAME
         points = [
             (
diff --git a/vector_search/utils_test.py b/vector_search/utils_test.py
index 1ee053b9da..10a9955819 100644
--- a/vector_search/utils_test.py
+++ b/vector_search/utils_test.py
@@ -583,3 +583,55 @@ def test_update_payload_no_points(mocker):
     update_content_file_payload(serialized_files[0])
     # Verify set_payload not called
     mock_qdrant.set_payload.assert_not_called()
+
+
+@pytest.mark.django_db
+def test_embed_learning_resources_summarizes_only_contentfiles_with_summary(mocker):
+    """
+    Test that embed_learning_resources only summarizes contentfiles whose serialized summary is non-empty
+    """
+    mock_qdrant = mocker.patch("qdrant_client.QdrantClient")
+    mocker.patch("vector_search.utils.qdrant_client", return_value=mock_qdrant)
+    mocker.patch("vector_search.utils.create_qdrant_collections")
+    mocker.patch("vector_search.utils._process_content_embeddings", return_value=None)
+    mocker.patch(
+        "vector_search.utils.filter_existing_qdrant_points_by_ids", return_value=[]
+    )
+    mocker.patch("vector_search.utils.remove_qdrant_records")
+
+    # Create 2 ContentFiles with a summary and 3 with an empty summary
+    contentfiles_with_summary = ContentFileFactory.create_batch(
+        2, content="abc", summary="summary text"
+    )
+    contentfiles_without_summary = ContentFileFactory.create_batch(
+        3, content="def", summary=""
+    )
+    all_contentfiles = contentfiles_with_summary + contentfiles_without_summary
+
+    # Patch serialize_bulk_content_files to return one dict per ContentFile, carrying its summary
+    serialized = []
+    for cf in all_contentfiles:
+        d = {
+            "id": cf.id,
+            "resource_readable_id": getattr(cf, "resource_readable_id", "resid"),
+            "run_readable_id": getattr(cf, "run_readable_id", "runid"),
+            "key": getattr(cf, "key", "key"),
+            "summary": cf.summary,
+            "content": cf.content,
+            "checksum": "checksum",
+        }
+        serialized.append(d)
+    mocker.patch(
+        "vector_search.utils.serialize_bulk_content_files", return_value=serialized
+    )
+
+    summarize_mock = mocker.patch(
+        "vector_search.utils.ContentSummarizer.summarize_content_files_by_ids"
+    )
+    embed_learning_resources(
+        [cf.id for cf in all_contentfiles], "content_file", overwrite=True
+    )
+
+    # Only ids of contentfiles with a summary should be passed, with overwrite=True
+    expected_ids = [cf.id for cf in contentfiles_with_summary]
+    summarize_mock.assert_called_once_with(expected_ids, True)  # noqa: FBT003