Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend OpenAI finish_reason handling #1985

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
26 changes: 18 additions & 8 deletions bertopic/representation/_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from bertopic.representation._base import BaseRepresentation
from bertopic.representation._utils import retry_with_exponential_backoff, truncate_document


DEFAULT_PROMPT = """
This is a list of texts where each collection of texts describe a topic. After each collection of texts, the name of the topic they represent is mentioned as a short-highly-descriptive title
---
Expand Down Expand Up @@ -37,7 +36,7 @@
Topic name:"""

DEFAULT_CHAT_PROMPT = """
I have a topic that contains the following documents:
I have a topic that contains the following documents:
steven-solomon marked this conversation as resolved.
Show resolved Hide resolved
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Expand Down Expand Up @@ -193,7 +192,7 @@ def extract_topics(self,
updated_topics: Updated topic representations
"""
# Extract the top n representative documents per topic
repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity)
repr_docs_mappings, _, _, repr_doc_ids = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity)

# Generate using OpenAI's Language Model
updated_topics = {}
Expand All @@ -216,12 +215,23 @@ def extract_topics(self,
response = chat_completions_with_backoff(self.client, **kwargs)
else:
response = self.client.chat.completions.create(**kwargs)

# Check whether content was actually generated
# Addresses #1570 for potential issues with OpenAI's content filter
if hasattr(response.choices[0].message, "content"):
label = response.choices[0].message.content.strip().replace("topic: ", "")

output = response.choices[0]

if output.finish_reason == "stop":
label = output.message.content.strip().replace("topic: ", "")
elif output.finish_reason == "length":
topic_model.logger.warn(f"OpenAI Topic Representation - Length limit reached for documents IDs: ({repr_doc_ids})")
if hasattr(output.message, "content"):
label = output.message.content.strip().replace("topic: ", "")
else:
label = "OpenAI Topic Representation - Incomplete output due to token limit being reached"
# Addresses #1570 for potential issues with OpenAI's content filter
elif output.finish_reason == "content_filter":
topic_model.logger.warn(f"OpenAI Topic Representation - The content filter of OpenAI was trigger for the following documents IDs: ({repr_doc_ids})")
label = "Output content filtered by OpenAI"
else:
topic_model.logger.warn(f"OpenAI Topic Representation - Couldn't create a label due to {output.finish_reason} for the following document IDs: ({repr_doc_ids})")
label = "No label returned"
else:
if self.exponential_backoff:
Expand Down