From cf4852b04fced70df3c922d7ce9740bf5ae75034 Mon Sep 17 00:00:00 2001
From: Steven Solomon
Date: Fri, 10 May 2024 14:16:56 +0000
Subject: [PATCH 1/9] Import logger

---
 bertopic/representation/_openai.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index e84a75f1..0d026784 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -5,8 +5,9 @@
 from scipy.sparse import csr_matrix
 from typing import Mapping, List, Tuple, Any, Union, Callable
 from bertopic.representation._base import BaseRepresentation
-from bertopic.representation._utils import retry_with_exponential_backoff, truncate_document
+from bertopic.representation._utils import retry_with_exponential_backoff, truncate_document, MyLogger
+
+logger = MyLogger("WARNING")
 
 DEFAULT_PROMPT = """
 This is a list of texts where each collection of texts describe a topic. After each collection of texts, the name of the topic they represent is mentioned as a short-highly-descriptive title
@@ -37,7 +38,7 @@
 Topic name:"""
 
 DEFAULT_CHAT_PROMPT = """
-I have a topic that contains the following documents: 
+I have a topic that contains the following documents:
 [DOCUMENTS]
 The topic is described by the following keywords: [KEYWORDS]

From 69179b3002f6e5dd1510e4639447680c9753760a Mon Sep 17 00:00:00 2001
From: Steven Solomon
Date: Fri, 10 May 2024 14:24:30 +0000
Subject: [PATCH 2/9] Handle finish reasons

---
 bertopic/representation/_openai.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index 0d026784..055f5d92 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -218,10 +218,13 @@ def extract_topics(self,
                 else:
                     response = self.client.chat.completions.create(**kwargs)
 
-                # Check whether content was actually generated
-                # Addresses #1570 for potential issues with OpenAI's content filter
-                if hasattr(response.choices[0].message, "content"):
-                    label = response.choices[0].message.content.strip().replace("topic: ", "")
+                choice = response.choices[0]
+                has_content = hasattr(response.choices[0].message, "content")
+
+                if choice.finish_reason == "stop" and has_content:
+                    label = choice.message.content.strip().replace("topic: ", "")
+                elif choice.finish_reason == "length" and has_content:
+                    label = choice.message.content.strip().replace("topic: ", "")
                 else:
                     label = "No label returned"
             else:
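For context on the branch added in patch 2: with the v1 OpenAI Python client, each chat completion choice carries a finish_reason that is "stop" when generation completed normally, "length" when the token limit cut it off, or "content_filter" when OpenAI's moderation layer withheld the output. A minimal, self-contained sketch of inspecting it (the model name and prompt are illustrative, not taken from the patches):

    from openai import OpenAI

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    response = client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model name
        messages=[{"role": "user", "content": "Give a short label for: dog, cat, pet"}],
        max_tokens=20,
    )

    choice = response.choices[0]
    # "stop" = finished normally, "length" = truncated by max_tokens,
    # "content_filter" = withheld by OpenAI's moderation layer.
    print(choice.finish_reason, choice.message.content)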
From acba986bbc9c5bc714e0fcb438b8c7e186f2621a Mon Sep 17 00:00:00 2001
From: Steven Solomon
Date: Fri, 10 May 2024 14:47:52 +0000
Subject: [PATCH 3/9] Add logging for each finish_reason

---
 bertopic/representation/_openai.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index 055f5d92..f4eba317 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -194,7 +194,7 @@ def extract_topics(self,
             updated_topics: Updated topic representations
         """
         # Extract the top n representative documents per topic
-        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity)
+        repr_docs_mappings, _, _, repr_doc_ids = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity)
 
         # Generate using OpenAI's Language Model
         updated_topics = {}
@@ -219,13 +219,20 @@ def extract_topics(self,
                     response = self.client.chat.completions.create(**kwargs)
 
                 choice = response.choices[0]
-                has_content = hasattr(response.choices[0].message, "content")
-
-                if choice.finish_reason == "stop" and has_content:
-                    label = choice.message.content.strip().replace("topic: ", "")
-                elif choice.finish_reason == "length" and has_content:
-                    label = choice.message.content.strip().replace("topic: ", "")
+
+                if choice.finish_reason == "stop":
+                    label = choice.message.content.strip().replace("topic: ", "")
+                elif choice.finish_reason == "length":
+                    logger.warning(f"Extracting Topics - Length limit reached for doc_ids ({repr_doc_ids})")
+                    if hasattr(response.choices[0].message, "content"):
+                        label = choice.message.content.strip().replace("topic: ", "")
+                    else:
+                        label = "Incomple output due to token limit being reached"
+                elif choice.finish_reason == "content_filter":
+                    logger.warning(f"Extracting Topics - Content filtered for doc_ids ({repr_doc_ids})")
+                    label = "Output content filtered by OpenAI"
                 else:
+                    logger.warning(f"Extracting Topics - No label due to finish_reason {choice.finish_reason} for doc_ids ({repr_doc_ids})")
                     label = "No label returned"
             else:
                 if self.exponential_backoff:

From 335338ec9c36e812371d4e6a8ad3a56317246221 Mon Sep 17 00:00:00 2001
From: Steven Solomon
Date: Wed, 22 May 2024 15:21:30 +0000
Subject: [PATCH 4/9] Rename choice to output

- Replace all uses of choices[0] with output
---
 bertopic/representation/_openai.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index f4eba317..1f35e2c6 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -218,28 +218,28 @@ def extract_topics(self,
                 else:
                     response = self.client.chat.completions.create(**kwargs)
 
-                choice = response.choices[0]
+                output = response.choices[0]
 
-                if choice.finish_reason == "stop":
-                    label = choice.message.content.strip().replace("topic: ", "")
-                elif choice.finish_reason == "length":
+                if output.finish_reason == "stop":
+                    label = output.message.content.strip().replace("topic: ", "")
+                elif output.finish_reason == "length":
                     logger.warning(f"Extracting Topics - Length limit reached for doc_ids ({repr_doc_ids})")
-                    if hasattr(response.choices[0].message, "content"):
-                        label = choice.message.content.strip().replace("topic: ", "")
+                    if hasattr(output.message, "content"):
+                        label = output.message.content.strip().replace("topic: ", "")
                     else:
                         label = "Incomple output due to token limit being reached"
-                elif choice.finish_reason == "content_filter":
+                elif output.finish_reason == "content_filter":
                     logger.warning(f"Extracting Topics - Content filtered for doc_ids ({repr_doc_ids})")
                     label = "Output content filtered by OpenAI"
                 else:
-                    logger.warning(f"Extracting Topics - No label due to finish_reason {choice.finish_reason} for doc_ids ({repr_doc_ids})")
+                    logger.warning(f"Extracting Topics - No label due to finish_reason {output.finish_reason} for doc_ids ({repr_doc_ids})")
                     label = "No label returned"
             else:
                 if self.exponential_backoff:
                     response = completions_with_backoff(self.client, model=self.model, prompt=prompt, **self.generator_kwargs)
                 else:
                     response = self.client.completions.create(model=self.model, prompt=prompt, **self.generator_kwargs)
-                label = response.choices[0].text.strip()
+                label = output.text.strip()
 
             updated_topics[topic] = [(label, 1)]
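Note that the rename in patch 4 reaches one line too far: output is only ever bound in the chat branch, so the non-chat branch's label = output.text.strip() would raise a NameError at runtime. Patch 9 at the end of the series points that line back at response. A sketch of the corrected non-chat flow (model and prompt are placeholders):

    from openai import OpenAI

    client = OpenAI()

    # Legacy completions endpoint, used when self.chat is False.
    response = client.completions.create(
        model="gpt-3.5-turbo-instruct",  # placeholder legacy model
        prompt="Give a short topic label for: dog, cat, pet",
    )
    # No `output` variable exists on this path; read from `response` directly.
    label = response.choices[0].text.strip()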
From c064bda894838d1bc99305ab5977fce995b9de38 Mon Sep 17 00:00:00 2001
From: Steven Solomon
Date: Wed, 22 May 2024 15:49:10 +0000
Subject: [PATCH 5/9] Fix typo

---
 bertopic/representation/_openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index 1f35e2c6..f70acb58 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -227,7 +227,7 @@ def extract_topics(self,
                     if hasattr(output.message, "content"):
                         label = output.message.content.strip().replace("topic: ", "")
                     else:
-                        label = "Incomple output due to token limit being reached"
+                        label = "Incomplete output due to token limit being reached"
                 elif output.finish_reason == "content_filter":
                     logger.warning(f"Extracting Topics - Content filtered for doc_ids ({repr_doc_ids})")
                     label = "Output content filtered by OpenAI"

From 8a97198cf4db443bfa110e2c9f6192e272f1707e Mon Sep 17 00:00:00 2001
From: Steven Solomon
Date: Wed, 22 May 2024 15:49:45 +0000
Subject: [PATCH 6/9] Use requested logging prefix

---
 bertopic/representation/_openai.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index f70acb58..e49a4d09 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -223,16 +223,16 @@ def extract_topics(self,
                 if output.finish_reason == "stop":
                     label = output.message.content.strip().replace("topic: ", "")
                 elif output.finish_reason == "length":
-                    logger.warning(f"Extracting Topics - Length limit reached for doc_ids ({repr_doc_ids})")
+                    logger.warning(f"OpenAI Topic Representation - Length limit reached for doc_ids ({repr_doc_ids})")
                     if hasattr(output.message, "content"):
                         label = output.message.content.strip().replace("topic: ", "")
                     else:
-                        label = "Incomplete output due to token limit being reached"
+                        label = "OpenAI Topic Representation - Incomplete output due to token limit being reached"
                 elif output.finish_reason == "content_filter":
-                    logger.warning(f"Extracting Topics - Content filtered for doc_ids ({repr_doc_ids})")
+                    logger.warning(f"OpenAI Topic Representation - Content filtered for doc_ids ({repr_doc_ids})")
                     label = "Output content filtered by OpenAI"
                 else:
-                    logger.warning(f"Extracting Topics - No label due to finish_reason {output.finish_reason} for doc_ids ({repr_doc_ids})")
+                    logger.warning(f"OpenAI Topic Representation - No label due to finish_reason {output.finish_reason} for doc_ids ({repr_doc_ids})")
                     label = "No label returned"
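Where these warnings end up is governed by standard Python logging. A generic sketch of surfacing them, assuming BERTopic registers its logger under the "BERTopic" name (as the MyLogger wrapper elsewhere in the library suggests):

    import logging

    # Route WARNING-level records, including the ones added above, to stderr.
    logging.basicConfig(format="%(asctime)s - %(name)s - %(message)s")
    logging.getLogger("BERTopic").setLevel(logging.WARNING)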
logger.warn(f"OpenAI Topic Representation - Content filtered for doc_ids ({repr_doc_ids})") + logger.warn(f"OpenAI Topic Representation - The content filter of OpenAI was trigger for the following documents IDs: ({repr_doc_ids})") label = "Output content filtered by OpenAI" else: - logger.warn(f"OpenAI Topic Representation - No label due to finish_reason {output.finish_reason} for doc_ids ({repr_doc_ids})") + logger.warn(f"OpenAI Topic Representation - Couldn't create a label due to {output.finish_reason} for the following document IDs: ({repr_doc_ids})") label = "No label returned" else: if self.exponential_backoff: From cf020ec6250f0476111daec5d931b56c7d9f768f Mon Sep 17 00:00:00 2001 From: Steven Solomon Date: Fri, 14 Jun 2024 14:31:08 +0000 Subject: [PATCH 8/9] Use logger from topic_model --- bertopic/representation/_openai.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py index 6dc4da61..c04b2684 100644 --- a/bertopic/representation/_openai.py +++ b/bertopic/representation/_openai.py @@ -5,9 +5,7 @@ from scipy.sparse import csr_matrix from typing import Mapping, List, Tuple, Any, Union, Callable from bertopic.representation._base import BaseRepresentation -from bertopic.representation._utils import retry_with_exponential_backoff, truncate_document, MyLogger - -logger = MyLogger("WARNING") +from bertopic.representation._utils import retry_with_exponential_backoff, truncate_document DEFAULT_PROMPT = """ This is a list of texts where each collection of texts describe a topic. After each collection of texts, the name of the topic they represent is mentioned as a short-highly-descriptive title @@ -223,17 +221,17 @@ def extract_topics(self, if output.finish_reason == "stop": label = output.message.content.strip().replace("topic: ", "") elif output.finish_reason == "length": - logger.warn(f"OpenAI Topic Representation - Length limit reached for documents IDs: ({repr_doc_ids})") + topic_model.logger.warn(f"OpenAI Topic Representation - Length limit reached for documents IDs: ({repr_doc_ids})") if hasattr(output.message, "content"): label = output.message.content.strip().replace("topic: ", "") else: label = "OpenAI Topic Representation - Incomplete output due to token limit being reached" # Addresses #1570 for potential issues with OpenAI's content filter elif output.finish_reason == "content_filter": - logger.warn(f"OpenAI Topic Representation - The content filter of OpenAI was trigger for the following documents IDs: ({repr_doc_ids})") + topic_model.logger.warn(f"OpenAI Topic Representation - The content filter of OpenAI was trigger for the following documents IDs: ({repr_doc_ids})") label = "Output content filtered by OpenAI" else: - logger.warn(f"OpenAI Topic Representation - Couldn't create a label due to {output.finish_reason} for the following document IDs: ({repr_doc_ids})") + topic_model.logger.warn(f"OpenAI Topic Representation - Couldn't create a label due to {output.finish_reason} for the following document IDs: ({repr_doc_ids})") label = "No label returned" else: if self.exponential_backoff: From d47a928b8590128f6eca928d1ce4c3f7cf151bf9 Mon Sep 17 00:00:00 2001 From: Steven Solomon Date: Fri, 14 Jun 2024 14:36:19 +0000 Subject: [PATCH 9/9] Fix unused response for non-chat flow --- bertopic/representation/_openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py index c04b2684..7619e834 
From d47a928b8590128f6eca928d1ce4c3f7cf151bf9 Mon Sep 17 00:00:00 2001
From: Steven Solomon
Date: Fri, 14 Jun 2024 14:36:19 +0000
Subject: [PATCH 9/9] Fix unused response for non-chat flow

---
 bertopic/representation/_openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index c04b2684..7619e834 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -238,7 +238,7 @@ def extract_topics(self,
                     response = completions_with_backoff(self.client, model=self.model, prompt=prompt, **self.generator_kwargs)
                 else:
                     response = self.client.completions.create(model=self.model, prompt=prompt, **self.generator_kwargs)
-                label = output.text.strip()
+                label = response.choices[0].text.strip()
 
             updated_topics[topic] = [(label, 1)]
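Taken together, the nine patches leave the chat branch of extract_topics reading approximately as follows (reconstructed by applying the diffs above in order; surrounding code elided):

    output = response.choices[0]

    if output.finish_reason == "stop":
        label = output.message.content.strip().replace("topic: ", "")
    elif output.finish_reason == "length":
        topic_model.logger.warning(f"OpenAI Topic Representation - Length limit reached for document IDs: ({repr_doc_ids})")
        if hasattr(output.message, "content"):
            label = output.message.content.strip().replace("topic: ", "")
        else:
            label = "OpenAI Topic Representation - Incomplete output due to token limit being reached"
    # Addresses #1570 for potential issues with OpenAI's content filter
    elif output.finish_reason == "content_filter":
        topic_model.logger.warning(f"OpenAI Topic Representation - The content filter of OpenAI was triggered for the following document IDs: ({repr_doc_ids})")
        label = "Output content filtered by OpenAI"
    else:
        topic_model.logger.warning(f"OpenAI Topic Representation - Couldn't create a label due to {output.finish_reason} for the following document IDs: ({repr_doc_ids})")
        label = "No label returned"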