From a47b43beb0b302830a8f388e58d4a7f05b36cd82 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Wed, 2 Jul 2025 20:11:48 -0400 Subject: [PATCH 1/5] run summarizer for items with existing summaries only --- vector_search/utils.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/vector_search/utils.py b/vector_search/utils.py index 92a2243657..d52fa58852 100644 --- a/vector_search/utils.py +++ b/vector_search/utils.py @@ -106,7 +106,9 @@ def create_qdrant_collections(force_recreate): ), ), }, - sparse_vectors_config=client.get_fastembed_sparse_vector_params(), + sparse_vectors_config={ + "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF) + }, optimizers_config=models.OptimizersConfigDiff(default_segment_number=2), quantization_config=models.ScalarQuantization( scalar=models.ScalarQuantizationConfig( @@ -129,7 +131,9 @@ def create_qdrant_collections(force_recreate): size=encoder.dim(), distance=models.Distance.COSINE ), }, - sparse_vectors_config=client.get_fastembed_sparse_vector_params(), + sparse_vectors_config={ + "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF) + }, optimizers_config=models.OptimizersConfigDiff(default_segment_number=2), quantization_config=models.ScalarQuantization( scalar=models.ScalarQuantizationConfig( @@ -498,12 +502,17 @@ def embed_learning_resources(ids, resource_type, overwrite): points = _process_resource_embeddings(serialized_resources) _embed_course_metadata_as_contentfile(serialized_resources) else: - # Process content files summaries/flashcards if applicable before serialization + serialized_resources = list(serialize_bulk_content_files(ids)) # TODO: Pass actual Ids when we want scheduled content file summarization # noqa: FIX002, TD002, TD003 E501 + existing_summary_content_ids = [ + resource["id"] + for resource in serialized_resources + if resource.get("summary") + ] + ContentSummarizer().summarize_content_files_by_ids( + existing_summary_content_ids, overwrite + ) - ContentSummarizer().summarize_content_files_by_ids([], overwrite) - - serialized_resources = list(serialize_bulk_content_files(ids)) collection_name = CONTENT_FILES_COLLECTION_NAME points = [ ( From d5984ee196931c508249e1762ca526b3564147ae Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Wed, 2 Jul 2025 20:37:28 -0400 Subject: [PATCH 2/5] adding test --- vector_search/utils.py | 1 + vector_search/utils_test.py | 52 +++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/vector_search/utils.py b/vector_search/utils.py index d52fa58852..2e5c4216c3 100644 --- a/vector_search/utils.py +++ b/vector_search/utils.py @@ -504,6 +504,7 @@ def embed_learning_resources(ids, resource_type, overwrite): else: serialized_resources = list(serialize_bulk_content_files(ids)) # TODO: Pass actual Ids when we want scheduled content file summarization # noqa: FIX002, TD002, TD003 E501 + # Currently we only want to summarize content that already has a summary existing_summary_content_ids = [ resource["id"] for resource in serialized_resources diff --git a/vector_search/utils_test.py b/vector_search/utils_test.py index 1ee053b9da..369a63194d 100644 --- a/vector_search/utils_test.py +++ b/vector_search/utils_test.py @@ -583,3 +583,55 @@ def test_update_payload_no_points(mocker): update_content_file_payload(serialized_files[0]) # Verify set_payload not called mock_qdrant.set_payload.assert_not_called() + + +@pytest.mark.django_db +def test_embed_learning_resources_summarizes_only_contentfiles_with_summary(mocker): + """ + Test that summarize_content_files_by_ids is only called with contentfiles that have an existing summary + """ + mock_qdrant = mocker.patch("qdrant_client.QdrantClient") + mocker.patch("vector_search.utils.qdrant_client", return_value=mock_qdrant) + mocker.patch("vector_search.utils.create_qdrant_collections") + mocker.patch("vector_search.utils._process_content_embeddings", return_value=None) + mocker.patch( + "vector_search.utils.filter_existing_qdrant_points_by_ids", return_value=[] + ) + mocker.patch("vector_search.utils.remove_qdrant_records") + + # Create ContentFiles, some with summary, some without + contentfiles_with_summary = ContentFileFactory.create_batch( + 2, content="abc", summary="summary text" + ) + contentfiles_without_summary = ContentFileFactory.create_batch( + 3, content="def", summary=None + ) + all_contentfiles = contentfiles_with_summary + contentfiles_without_summary + + # Patch serialize_bulk_content_files to return dicts with/without summary + serialized = [] + for cf in all_contentfiles: + d = { + "id": cf.id, + "resource_readable_id": getattr(cf, "resource_readable_id", "resid"), + "run_readable_id": getattr(cf, "run_readable_id", "runid"), + "key": getattr(cf, "key", "key"), + "summary": cf.summary, + "content": cf.content, + "checksum": "checksum", + } + serialized.append(d) + mocker.patch( + "vector_search.utils.serialize_bulk_content_files", return_value=serialized + ) + + summarize_mock = mocker.patch( + "vector_search.utils.ContentSummarizer.summarize_content_files_by_ids" + ) + embed_learning_resources( + [cf.id for cf in all_contentfiles], "content_file", overwrite=True + ) + + # Only contentfiles with summary should be passed + expected_ids = [cf.id for cf in contentfiles_with_summary] + summarize_mock.assert_called_once_with(expected_ids, overwrite=True) From cd071841119be17e1934b6e5152da50bd4d89fbc Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 3 Jul 2025 10:15:28 -0400 Subject: [PATCH 3/5] setting summary to blank --- vector_search/utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vector_search/utils_test.py b/vector_search/utils_test.py index 369a63194d..34f3eb8be3 100644 --- a/vector_search/utils_test.py +++ b/vector_search/utils_test.py @@ -604,7 +604,7 @@ def test_embed_learning_resources_summarizes_only_contentfiles_with_summary(mock 2, content="abc", summary="summary text" ) contentfiles_without_summary = ContentFileFactory.create_batch( - 3, content="def", summary=None + 3, content="def", summary="" ) all_contentfiles = contentfiles_with_summary + contentfiles_without_summary From 4def30ec87a342266d8e1ce89a56e935b95aef33 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 3 Jul 2025 10:17:22 -0400 Subject: [PATCH 4/5] remove unrelated change --- vector_search/utils.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/vector_search/utils.py b/vector_search/utils.py index 2e5c4216c3..c100a95bb5 100644 --- a/vector_search/utils.py +++ b/vector_search/utils.py @@ -106,9 +106,7 @@ def create_qdrant_collections(force_recreate): ), ), }, - sparse_vectors_config={ - "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF) - }, + sparse_vectors_config=client.get_fastembed_sparse_vector_params(), optimizers_config=models.OptimizersConfigDiff(default_segment_number=2), quantization_config=models.ScalarQuantization( scalar=models.ScalarQuantizationConfig( @@ -131,9 +129,7 @@ def create_qdrant_collections(force_recreate): size=encoder.dim(), distance=models.Distance.COSINE ), }, - sparse_vectors_config={ - "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF) - }, + sparse_vectors_config=client.get_fastembed_sparse_vector_params(), optimizers_config=models.OptimizersConfigDiff(default_segment_number=2), quantization_config=models.ScalarQuantization( scalar=models.ScalarQuantizationConfig( From df3fc65d02c5a775cbe616ce2f569706566eb17b Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 3 Jul 2025 10:32:34 -0400 Subject: [PATCH 5/5] fixing test --- vector_search/utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vector_search/utils_test.py b/vector_search/utils_test.py index 34f3eb8be3..10a9955819 100644 --- a/vector_search/utils_test.py +++ b/vector_search/utils_test.py @@ -634,4 +634,4 @@ def test_embed_learning_resources_summarizes_only_contentfiles_with_summary(mock # Only contentfiles with summary should be passed expected_ids = [cf.id for cf in contentfiles_with_summary] - summarize_mock.assert_called_once_with(expected_ids, overwrite=True) + summarize_mock.assert_called_once_with(expected_ids, True) # noqa: FBT003