From a350cb0b46c71b6bddf145a8a3759cf4918d64d8 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Wed, 2 Apr 2025 17:20:12 +0530
Subject: [PATCH 01/18] detect script as well
---
core_backend/app/llm_call/llm_prompts.py | 230 ++++++++++++++------
core_backend/app/llm_call/llm_rag.py | 19 +-
core_backend/app/llm_call/process_input.py | 24 +-
core_backend/app/llm_call/process_output.py | 5 +
core_backend/app/question_answer/schemas.py | 3 +-
5 files changed, 200 insertions(+), 81 deletions(-)
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 2ede20f4c..0d1076834 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -103,7 +103,7 @@
{context}
IMPORTANT NOTES ON THE "answer" FIELD:
-- Answer in the language of the question ({original_language}).
+- Answer in the language {original_language} in the script {original_script}.
- Answer should be concise, to the point, and no longer than 80 words.
- Do not include any information that is not present in the REFERENCE TEXT.
"""
@@ -182,6 +182,61 @@ class AlignmentScore(BaseModel):
model_config = ConfigDict(strict=True)
+CHAT_RESPONSE_PROMPT = """\
+You are an AI assistant designed to help users with their
+questions/concerns. You interact with users via a chat interface. You will
+be provided with ADDITIONAL RELEVANT INFORMATION that can address the
+user's questions/concerns.
+
+BEFORE answering the user's LATEST MESSAGE, follow these steps:
+
+1. Review the conversation history to ensure that you understand the
+context in which the user's LATEST MESSAGE is being asked.
+2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you
+understand the most useful information related to the user's LATEST
+MESSAGE.
+
+When you have completed the above steps, you will then write a JSON, whose
+TypeScript Interface is given below:
+
+interface Response {{
+ extracted_info: string[];
+ answer: string;
+}}
+
+For "extracted_info", extract from the provided ADDITIONAL RELEVANT
+INFORMATION the most useful information related to the LATEST MESSAGE asked
+by the user, and list them one by one. If no useful information is found,
+return an empty list.
+
+For "answer", understand the conversation history, ADDITIONAL RELEVANT
+INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to
+the user's LATEST MESSAGE. If no useful information was found in the
+either the conversation history or the ADDITIONAL RELEVANT INFORMATION,
+respond with {failure_message}.
+
+EXAMPLE RESPONSES:
+{{"extracted_info": [
+ "Pineapples are a blend of pinecones and apples.",
+ "Pineapples have the shape of a pinecone."
+ ],
+ "answer": "The 'pine-' from pineapples likely come from the fact that
+ pineapples are a hybrid of pinecones and apples and its pinecone-like
+ shape."
+}}
+{{"extracted_info": [], "answer": "{failure_message}"}}
+
+IMPORTANT NOTES ON THE "answer" FIELD:
+- Keep in mind that the user is asking a {message_type} question.
+- Answer in the language {original_language} in the script {original_script}.
+- Answer should be concise and to the point.
+- Do not include any information that is not present in the ADDITIONAL
+RELEVANT INFORMATION.
+
+Only output the JSON response, without any additional text.
+"""
+
+
class ChatHistory:
"""Contains the prompts and models for the chat history task."""
@@ -227,62 +282,7 @@ class ChatHistory:
),
prompt_kws={"valid_message_types": _valid_message_types},
)
- system_message_generate_response = format_prompt(
- prompt=textwrap.dedent(
- """You are an AI assistant designed to help users with their
- questions/concerns. You interact with users via a chat interface. You will
- be provided with ADDITIONAL RELEVANT INFORMATION that can address the
- user's questions/concerns.
-
- BEFORE answering the user's LATEST MESSAGE, follow these steps:
-
- 1. Review the conversation history to ensure that you understand the
- context in which the user's LATEST MESSAGE is being asked.
- 2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you
- understand the most useful information related to the user's LATEST
- MESSAGE.
-
- When you have completed the above steps, you will then write a JSON, whose
- TypeScript Interface is given below:
-
- interface Response {{
- extracted_info: string[];
- answer: string;
- }}
-
- For "extracted_info", extract from the provided ADDITIONAL RELEVANT
- INFORMATION the most useful information related to the LATEST MESSAGE asked
- by the user, and list them one by one. If no useful information is found,
- return an empty list.
-
- For "answer", understand the conversation history, ADDITIONAL RELEVANT
- INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to
- the user's LATEST MESSAGE. If no useful information was found in the
- either the conversation history or the ADDITIONAL RELEVANT INFORMATION,
- respond with {failure_message}.
-
- EXAMPLE RESPONSES:
- {{"extracted_info": [
- "Pineapples are a blend of pinecones and apples.",
- "Pineapples have the shape of a pinecone."
- ],
- "answer": "The 'pine-' from pineapples likely come from the fact that
- pineapples are a hybrid of pinecones and apples and its pinecone-like
- shape."
- }}
- {{"extracted_info": [], "answer": "{failure_message}"}}
-
- IMPORTANT NOTES ON THE "answer" FIELD:
- - Keep in mind that the user is asking a {message_type} question.
- - Answer in the language of the question ({original_language}).
- - Answer should be concise and to the point.
- - Do not include any information that is not present in the ADDITIONAL
- RELEVANT INFORMATION.
-
- Only output the JSON response, without any additional text.
- """
- )
- )
+ system_message_generate_response = CHAT_RESPONSE_PROMPT
class ChatHistoryConstructSearchQuery(BaseModel):
"""Pydantic model for the output of the construct search query chat history."""
@@ -330,6 +330,106 @@ def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str]
raise ValueError(f"Error validating the output: {e}") from e
+class IdentifiedScript(str, Enum):
+ """Script used in the user's input."""
+
+ LATIN = "Latin"
+ DEVANAGARI = "Devanagari"
+ ARABIC = "Arabic"
+ CYRILLIC = "Cyrillic"
+ CHINESE = "Chinese"
+ JAPANESE = "Japanese"
+ KOREAN = "Korean"
+ THAI = "Thai"
+ BENGALI = "Bengali"
+ TAMIL = "Tamil"
+ TELUGU = "Telugu"
+ KANNADA = "Kannada"
+ MALAYALAM = "Malayalam"
+ GUJARATI = "Gujarati"
+ GURMUKHI = "Gurmukhi"
+ ORIYA = "Oriya"
+ SINHALA = "Sinhala"
+ MYANMAR = "Myanmar"
+ ETHIOPIC = "Ethiopic"
+ GEORGIAN = "Georgian"
+ ARMENIAN = "Armenian"
+ HEBREW = "Hebrew"
+ GREEK = "Greek"
+ TIBETAN = "Tibetan"
+ MONGOLIAN = "Mongolian"
+ KHMER = "Khmer"
+ LAO = "Lao"
+ VIETNAMESE = "Vietnamese"
+ THAI_LAO = "Thai-Lao"
+ UNKNOWN = "Unknown"
+
+ @classmethod
+ def _missing_(cls, value: str) -> IdentifiedScript: # type: ignore[override]
+ """If script identified is not one of the supported scripts, it is
+ classified as UNKNOWN.
+
+ Parameters
+ ----------
+ value
+ The script identified.
+
+ Returns
+ -------
+ IdentifiedScript
+ The identified script (i.e., UNKNOWN).
+ """
+ return cls.UNKNOWN
+
+ @classmethod
+ def get_supported_scripts(cls) -> list[str]:
+ """Return a list of supported scripts.
+
+ Returns
+ -------
+ list[str]
+ A list of supported scripts.
+ """
+ return [script.value for script in cls if script != cls.UNKNOWN]
+
+
+class LanguageIdentificationResponse(BaseModel):
+ """Pydantic model for the language identification response."""
+
+ language: IdentifiedLanguage
+ script: IdentifiedScript
+
+ model_config = ConfigDict(strict=True)
+
+
+LANGUAGE_ID_PROMPT = f"""\
+You are a high-performing language identification bot that classifies the \
+language and script of the user input.
+
+For each input, identify:
+1. The language (must be one of {{member_names}})
+2. The script (must be one of {", ".join(IdentifiedScript.get_supported_scripts())})
+
+If the user input is:
+1. in one of the supported languages, respond with that language and its script
+2. written in a mix of languages, respond with the dominant language and its script
+3. in a real language but not a supported language, respond with UNSUPPORTED and \
+its script
+4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin
+
+Examples:
+"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}}
+"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}}
+"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}}
+"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}}
+"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}}
+"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}}
+"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}}
+
+Respond with a JSON object containing "language" and "script" keys.
+"""
+
+
class IdentifiedLanguage(str, Enum):
"""Identified language of the user's input."""
@@ -387,21 +487,7 @@ def get_prompt(cls) -> str:
The prompt for the language identification bot.
"""
- return textwrap.dedent(
- f"""
- You are a high-performing language identification bot that classifies the
- language of the user input into one of {", ".join(cls._member_names_)}.
-
- If the user input is
- 1. in one of the supported languages, then respond with that language.
- 2. written in a mix of languages, then respond with the dominant language.
- 3. in a real language but not a supported language, then respond with
- UNSUPPORTED.
- 4. unintelligible or gibberish, then respond with UNINTELLIGIBLE.
-
- Answer should be a single word and strictly one of
- [{", ".join(cls._member_names_)}]"""
- ).strip()
+ return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip()
class RAG(BaseModel):
diff --git a/core_backend/app/llm_call/llm_rag.py b/core_backend/app/llm_call/llm_rag.py
index ab4431ade..49a229364 100644
--- a/core_backend/app/llm_call/llm_rag.py
+++ b/core_backend/app/llm_call/llm_rag.py
@@ -8,7 +8,13 @@
from ..config import LITELLM_MODEL_GENERATION
from ..utils import setup_logger
-from .llm_prompts import RAG, RAG_FAILURE_MESSAGE, ChatHistory, IdentifiedLanguage
+from .llm_prompts import (
+ RAG,
+ RAG_FAILURE_MESSAGE,
+ ChatHistory,
+ IdentifiedLanguage,
+ IdentifiedScript,
+)
from .utils import (
_ask_llm_async,
append_messages_to_chat_history,
@@ -24,6 +30,7 @@ async def get_llm_rag_answer(
context: str,
metadata: dict | None = None,
original_language: IdentifiedLanguage,
+ original_script: IdentifiedScript,
question: str,
) -> RAG:
"""Get an answer from the LLM model using RAG.
@@ -36,6 +43,8 @@ async def get_llm_rag_answer(
Additional metadata to provide to the LLM model.
original_language
The original language of the question.
+ original_script
+ The script in which the original question was written.
question
The question to ask the LLM model.
@@ -46,7 +55,11 @@ async def get_llm_rag_answer(
"""
metadata = metadata or {}
- prompt = RAG.prompt.format(context=context, original_language=original_language)
+ prompt = RAG.prompt.format(
+ context=context,
+ original_language=original_language,
+ original_script=original_script,
+ )
result = await _ask_llm_async(
json_=True,
@@ -75,6 +88,7 @@ async def get_llm_rag_answer_with_chat_history(
message_type: str,
metadata: dict | None = None,
original_language: IdentifiedLanguage,
+ original_script: IdentifiedScript,
question: str,
session_id: str,
) -> tuple[RAG, list[dict[str, str | None]]]:
@@ -112,6 +126,7 @@ async def get_llm_rag_answer_with_chat_history(
failure_message=RAG_FAILURE_MESSAGE,
message_type=message_type,
original_language=original_language,
+ original_script=original_script,
)
)
content = (
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index 9a30ffdeb..d714527b9 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -3,6 +3,8 @@
from functools import wraps
from typing import Any, Callable, Optional
+from pydantic import ValidationError
+
from ..config import (
LITELLM_MODEL_LANGUAGE_DETECT,
LITELLM_MODEL_PARAPHRASE,
@@ -22,9 +24,11 @@
TRANSLATE_FAILED_MESSAGE,
TRANSLATE_PROMPT,
IdentifiedLanguage,
+ IdentifiedScript,
+ LanguageIdentificationResponse,
SafetyClassification,
)
-from .utils import _ask_llm_async
+from .utils import _ask_llm_async, remove_json_markdown
logger = setup_logger(name="INPUT RAILS")
@@ -84,7 +88,7 @@ async def _identify_language(
query_refined: QueryRefined,
response: QueryResponse | QueryResponseError,
) -> tuple[QueryRefined, QueryResponse | QueryResponseError]:
- """Identify the language of the question.
+ """Identify the language and script of the question.
Parameters
----------
@@ -104,19 +108,27 @@ async def _identify_language(
if isinstance(response, QueryResponseError):
return query_refined, response
- llm_identified_lang = await _ask_llm_async(
+ json_str = await _ask_llm_async(
+ json_=True,
litellm_model=LITELLM_MODEL_LANGUAGE_DETECT,
metadata=metadata,
system_message=IdentifiedLanguage.get_prompt(),
user_message=query_refined.query_text,
)
- identified_lang = getattr(
- IdentifiedLanguage, llm_identified_lang, IdentifiedLanguage.UNSUPPORTED
- )
+ try:
+ cleaned_json_str = remove_json_markdown(text=json_str)
+ lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
+ identified_lang = lang_info["language"]
+ identified_script = lang_info["script"]
+ except ValidationError:
+ identified_lang = IdentifiedLanguage.UNSUPPORTED
+ identified_script = IdentifiedScript.LATIN
+
query_refined.original_language = identified_lang
response.debug_info["original_query"] = query_refined.query_text_original
response.debug_info["original_language"] = identified_lang
+ response.debug_info["original_script"] = identified_script
processed_response = _process_identified_language_response(
identified_language=identified_lang, response=response
diff --git a/core_backend/app/llm_call/process_output.py b/core_backend/app/llm_call/process_output.py
index a4671030b..2a569f0e1 100644
--- a/core_backend/app/llm_call/process_output.py
+++ b/core_backend/app/llm_call/process_output.py
@@ -84,6 +84,9 @@ async def generate_llm_query_response(
if query_refined.original_language is None:
logger.warning("No original_language found in the query.")
return response, chat_history
+ if query_refined.original_script is None:
+ logger.warning("No original_script found in the query.")
+ return response, chat_history
context = get_context_string_from_search_results(
search_results=response.search_results
@@ -98,6 +101,7 @@ async def generate_llm_query_response(
message_type=message_type,
metadata=metadata,
original_language=query_refined.original_language,
+ original_script=query_refined.original_script,
question=query_refined.query_text_original,
session_id=chat_query_params["session_id"],
)
@@ -106,6 +110,7 @@ async def generate_llm_query_response(
context=context,
metadata=metadata,
original_language=query_refined.original_language,
+ original_script=query_refined.original_script,
question=query_refined.query_text_original, # Use the original query text
)
diff --git a/core_backend/app/question_answer/schemas.py b/core_backend/app/question_answer/schemas.py
index 8904e2c36..bda58ce7a 100644
--- a/core_backend/app/question_answer/schemas.py
+++ b/core_backend/app/question_answer/schemas.py
@@ -6,7 +6,7 @@
from pydantic import BaseModel, ConfigDict, Field
from pydantic.json_schema import SkipJsonSchema
-from ..llm_call.llm_prompts import IdentifiedLanguage
+from ..llm_call.llm_prompts import IdentifiedLanguage, IdentifiedScript
from ..schemas import FeedbackSentiment, QuerySearchResult
@@ -49,6 +49,7 @@ class QueryRefined(QueryBase):
generate_tts: bool = Field(False)
original_language: IdentifiedLanguage | None = None
+ original_script: IdentifiedScript | None = None
query_text_original: str
workspace_id: int
From 5b1759350554ae2edd2f4a95d5f4fcbc5f99995c Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Wed, 2 Apr 2025 17:32:09 +0530
Subject: [PATCH 02/18] fix prompt
---
core_backend/app/llm_call/llm_prompts.py | 100 +++++++++++----------
core_backend/app/llm_call/process_input.py | 3 +-
2 files changed, 53 insertions(+), 50 deletions(-)
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 0d1076834..18c0d5864 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -335,33 +335,33 @@ class IdentifiedScript(str, Enum):
LATIN = "Latin"
DEVANAGARI = "Devanagari"
- ARABIC = "Arabic"
- CYRILLIC = "Cyrillic"
- CHINESE = "Chinese"
- JAPANESE = "Japanese"
- KOREAN = "Korean"
- THAI = "Thai"
+ # ARABIC = "Arabic"
+ # CYRILLIC = "Cyrillic"
+ # CHINESE = "Chinese"
+ # JAPANESE = "Japanese"
+ # KOREAN = "Korean"
+ # THAI = "Thai"
BENGALI = "Bengali"
TAMIL = "Tamil"
TELUGU = "Telugu"
KANNADA = "Kannada"
MALAYALAM = "Malayalam"
GUJARATI = "Gujarati"
- GURMUKHI = "Gurmukhi"
- ORIYA = "Oriya"
- SINHALA = "Sinhala"
- MYANMAR = "Myanmar"
- ETHIOPIC = "Ethiopic"
- GEORGIAN = "Georgian"
- ARMENIAN = "Armenian"
- HEBREW = "Hebrew"
- GREEK = "Greek"
- TIBETAN = "Tibetan"
- MONGOLIAN = "Mongolian"
- KHMER = "Khmer"
- LAO = "Lao"
- VIETNAMESE = "Vietnamese"
- THAI_LAO = "Thai-Lao"
+ # GURMUKHI = "Gurmukhi"
+ # ORIYA = "Oriya"
+ # SINHALA = "Sinhala"
+ # MYANMAR = "Myanmar"
+ # ETHIOPIC = "Ethiopic"
+ # GEORGIAN = "Georgian"
+ # ARMENIAN = "Armenian"
+ # HEBREW = "Hebrew"
+ # GREEK = "Greek"
+ # TIBETAN = "Tibetan"
+ # MONGOLIAN = "Mongolian"
+ # KHMER = "Khmer"
+ # LAO = "Lao"
+ # VIETNAMESE = "Vietnamese"
+ # THAI_LAO = "Thai-Lao"
UNKNOWN = "Unknown"
@classmethod
@@ -402,34 +402,6 @@ class LanguageIdentificationResponse(BaseModel):
model_config = ConfigDict(strict=True)
-LANGUAGE_ID_PROMPT = f"""\
-You are a high-performing language identification bot that classifies the \
-language and script of the user input.
-
-For each input, identify:
-1. The language (must be one of {{member_names}})
-2. The script (must be one of {", ".join(IdentifiedScript.get_supported_scripts())})
-
-If the user input is:
-1. in one of the supported languages, respond with that language and its script
-2. written in a mix of languages, respond with the dominant language and its script
-3. in a real language but not a supported language, respond with UNSUPPORTED and \
-its script
-4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin
-
-Examples:
-"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}}
-"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}}
-"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}}
-"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}}
-"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}}
-"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}}
-"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}}
-
-Respond with a JSON object containing "language" and "script" keys.
-"""
-
-
class IdentifiedLanguage(str, Enum):
"""Identified language of the user's input."""
@@ -490,6 +462,36 @@ def get_prompt(cls) -> str:
return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip()
+LANGUAGE_ID_PROMPT = (
+ f"""\
+You are a high-performing language identification bot that classifies the \
+language and script of the user input.
+
+For each input, identify:
+1. The language (must be one of {", ".join(IdentifiedLanguage._member_names_)})
+2. The script (must be one of {", ".join(IdentifiedScript._member_names_)})
+
+If the user input is:
+1. in one of the supported languages, respond with that language and its script
+2. written in a mix of languages, respond with the dominant language and its script
+3. in a real language but not a supported language, respond with UNSUPPORTED and \
+its script
+4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin"""
+ + """
+Examples:
+"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}}
+"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}}
+"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}}
+"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}}
+"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}}
+"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}}
+"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}}
+
+Respond with a JSON object containing "language" and "script" keys.
+"""
+)
+
+
class RAG(BaseModel):
"""Generated response based on question and retrieved context."""
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index d714527b9..dfc3341a7 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -19,6 +19,7 @@
)
from ..utils import setup_logger
from .llm_prompts import (
+ LANGUAGE_ID_PROMPT,
PARAPHRASE_FAILED_MESSAGE,
PARAPHRASE_PROMPT,
TRANSLATE_FAILED_MESSAGE,
@@ -112,7 +113,7 @@ async def _identify_language(
json_=True,
litellm_model=LITELLM_MODEL_LANGUAGE_DETECT,
metadata=metadata,
- system_message=IdentifiedLanguage.get_prompt(),
+ system_message=LANGUAGE_ID_PROMPT,
user_message=query_refined.query_text,
)
From f7fee04f2f59d37370950c4ce4d87e449c4f2e5b Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Wed, 9 Apr 2025 14:03:06 +0530
Subject: [PATCH 03/18] fix tests
---
.secrets.baseline | 8 +-
core_backend/app/llm_call/llm_prompts.py | 133 +++++++++---------
core_backend/app/llm_call/process_input.py | 4 +-
.../tests/api/test_question_answer.py | 21 ++-
.../rails/data/language_identification.yaml | 108 +++++++-------
.../rails/test_language_identification.py | 46 ++++--
6 files changed, 185 insertions(+), 135 deletions(-)
diff --git a/.secrets.baseline b/.secrets.baseline
index 5cab9e8c1..2ba6baa9b 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -448,14 +448,14 @@
"filename": "core_backend/tests/api/test_question_answer.py",
"hashed_secret": "1d2be5ef28a76e2207456e7eceabe1219305e43d",
"is_verified": false,
- "line_number": 294
+ "line_number": 419
},
{
"type": "Secret Keyword",
"filename": "core_backend/tests/api/test_question_answer.py",
"hashed_secret": "6367c48dd193d56ea7b0baad25b19455e529f5ee",
"is_verified": false,
- "line_number": 653
+ "line_number": 1019
}
],
"core_backend/tests/api/test_user_tools.py": [
@@ -473,7 +473,7 @@
"filename": "core_backend/tests/rails/test_language_identification.py",
"hashed_secret": "051b2c1d98174fabc4749641c4f4f4660556441e",
"is_verified": false,
- "line_number": 48
+ "line_number": 69
}
],
"core_backend/tests/rails/test_paraphrasing.py": [
@@ -581,5 +581,5 @@
}
]
},
- "generated_at": "2025-01-24T13:35:08Z"
+ "generated_at": "2025-04-09T08:32:56Z"
}
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 18c0d5864..751a1079a 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -330,6 +330,67 @@ def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str]
raise ValueError(f"Error validating the output: {e}") from e
+class IdentifiedLanguage(str, Enum):
+ """Identified language of the user's input."""
+
+ # AFRIKAANS = "AFRIKAANS"
+ ENGLISH = "ENGLISH"
+ FRENCH = "FRENCH"
+ HINDI = "HINDI"
+ MARATHI = "MARATHI"
+ SWAHILI = "SWAHILI"
+ UNINTELLIGIBLE = "UNINTELLIGIBLE"
+ UNSUPPORTED = "UNSUPPORTED"
+ # XHOSA = "XHOSA"
+ # ZULU = "ZULU"
+
+ @classmethod
+ def get_supported_languages(cls) -> list[str]:
+ """Return a list of supported languages.
+
+ Returns
+ -------
+ list[str]
+ A list of supported languages.
+ """
+
+ return [
+ lang
+ for lang in cls._member_names_
+ if lang not in ("UNINTELLIGIBLE", "UNSUPPORTED")
+ ]
+
+ @classmethod
+ def _missing_(cls, value: str) -> IdentifiedLanguage: # type: ignore[override]
+ """If language identified is not one of the supported language, it is
+ classified as UNSUPPORTED.
+
+ Parameters
+ ----------
+ value
+ The language identified.
+
+ Returns
+ -------
+ IdentifiedLanguage
+ The identified language (i.e., UNSUPPORTED).
+ """
+
+ return cls.UNSUPPORTED
+
+ @classmethod
+ def get_prompt(cls) -> str:
+ """Return the prompt for the language identification bot.
+
+ Returns
+ -------
+ str
+ The prompt for the language identification bot.
+ """
+
+ return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip()
+
+
class IdentifiedScript(str, Enum):
"""Script used in the user's input."""
@@ -341,12 +402,12 @@ class IdentifiedScript(str, Enum):
# JAPANESE = "Japanese"
# KOREAN = "Korean"
# THAI = "Thai"
- BENGALI = "Bengali"
- TAMIL = "Tamil"
- TELUGU = "Telugu"
- KANNADA = "Kannada"
- MALAYALAM = "Malayalam"
- GUJARATI = "Gujarati"
+ # BENGALI = "Bengali"
+ # TAMIL = "Tamil"
+ # TELUGU = "Telugu"
+ # KANNADA = "Kannada"
+ # MALAYALAM = "Malayalam"
+ # GUJARATI = "Gujarati"
# GURMUKHI = "Gurmukhi"
# ORIYA = "Oriya"
# SINHALA = "Sinhala"
@@ -402,66 +463,6 @@ class LanguageIdentificationResponse(BaseModel):
model_config = ConfigDict(strict=True)
-class IdentifiedLanguage(str, Enum):
- """Identified language of the user's input."""
-
- # AFRIKAANS = "AFRIKAANS"
- ENGLISH = "ENGLISH"
- FRENCH = "FRENCH"
- HINDI = "HINDI"
- SWAHILI = "SWAHILI"
- UNINTELLIGIBLE = "UNINTELLIGIBLE"
- UNSUPPORTED = "UNSUPPORTED"
- # XHOSA = "XHOSA"
- # ZULU = "ZULU"
-
- @classmethod
- def get_supported_languages(cls) -> list[str]:
- """Return a list of supported languages.
-
- Returns
- -------
- list[str]
- A list of supported languages.
- """
-
- return [
- lang
- for lang in cls._member_names_
- if lang not in ("UNINTELLIGIBLE", "UNSUPPORTED")
- ]
-
- @classmethod
- def _missing_(cls, value: str) -> IdentifiedLanguage: # type: ignore[override]
- """If language identified is not one of the supported language, it is
- classified as UNSUPPORTED.
-
- Parameters
- ----------
- value
- The language identified.
-
- Returns
- -------
- IdentifiedLanguage
- The identified language (i.e., UNSUPPORTED).
- """
-
- return cls.UNSUPPORTED
-
- @classmethod
- def get_prompt(cls) -> str:
- """Return the prompt for the language identification bot.
-
- Returns
- -------
- str
- The prompt for the language identification bot.
- """
-
- return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip()
-
-
LANGUAGE_ID_PROMPT = (
f"""\
You are a high-performing language identification bot that classifies the \
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index dfc3341a7..ba8d1025a 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -120,8 +120,8 @@ async def _identify_language(
try:
cleaned_json_str = remove_json_markdown(text=json_str)
lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
- identified_lang = lang_info["language"]
- identified_script = lang_info["script"]
+ identified_lang = lang_info.language
+ identified_script = lang_info.script
except ValidationError:
identified_lang = IdentifiedLanguage.UNSUPPORTED
identified_script = IdentifiedScript.LATIN
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index 163e77574..50299a7b4 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -10,7 +10,11 @@
from fastapi import status
from fastapi.testclient import TestClient
-from core_backend.app.llm_call.llm_prompts import AlignmentScore, IdentifiedLanguage
+from core_backend.app.llm_call.llm_prompts import (
+ AlignmentScore,
+ IdentifiedLanguage,
+ LanguageIdentificationResponse,
+)
from core_backend.app.llm_call.process_input import (
_classify_safety,
_identify_language,
@@ -1045,10 +1049,10 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
)
@pytest.mark.parametrize(
- "identified_lang_str,should_error,expected_error_type",
+ "identified_lang_str,identified_script_str,should_error,expected_error_type",
[
- ("ENGLISH", False, None),
- ("HINDI", False, None),
+ ("ENGLISH", "Latin", False, None),
+ ("HINDI", "Devanagari", False, None),
("UNINTELLIGIBLE", True, ErrorType.UNINTELLIGIBLE_INPUT),
("GIBBERISH", True, ErrorType.UNSUPPORTED_LANGUAGE),
("UNSUPPORTED", True, ErrorType.UNSUPPORTED_LANGUAGE),
@@ -1059,6 +1063,7 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
async def test_language_identify_error(
self,
identified_lang_str: str,
+ identified_script_str: str,
should_error: bool,
expected_error_type: ErrorType,
monkeypatch: pytest.MonkeyPatch,
@@ -1084,6 +1089,7 @@ async def test_language_identify_error(
generate_llm_response=False,
generate_tts=False,
original_language=None,
+ original_script=None,
query_text="This is a basic query",
query_text_original="This is a query original",
workspace_id=124,
@@ -1104,10 +1110,12 @@ async def mock_ask_llm( # pylint: disable=W0613
Returns
-------
str
- The identified language string.
+ The identified language and script model json string.
"""
- return identified_lang_str
+ return LanguageIdentificationResponse(
+ language=identified_lang_str, script=identified_script_str
+ ).model_dump_json()
monkeypatch.setattr(
"core_backend.app.llm_call.process_input._ask_llm_async", mock_ask_llm
@@ -1233,6 +1241,7 @@ async def mock_ask_llm( # pylint: disable=W0613
generate_llm_response=False,
generate_tts=False,
original_language=None,
+ original_script=None,
query_text="This is a basic query",
query_text_original="This is a query original",
workspace_id=124,
diff --git a/core_backend/tests/rails/data/language_identification.yaml b/core_backend/tests/rails/data/language_identification.yaml
index a4d3ddb34..7daa61476 100644
--- a/core_backend/tests/rails/data/language_identification.yaml
+++ b/core_backend/tests/rails/data/language_identification.yaml
@@ -2,59 +2,73 @@
# improve this with a native speaker. These might be too "pure".
HAUSA:
- - Ina da yara biyu masu hanci
- - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi
- - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa?
- - Menene wannan?
- - Sannun ku da zuwa #h/t: Fola from here on
- - Ni yarinya ne
- - Zo ka chi abunchi
- - Ina kwana Maman mu
- - Wannan shago na ne
+ Latin:
+ - Ina da yara biyu masu hanci
+ - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi
+ - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa?
+ - Menene wannan?
+ - Sannun ku da zuwa #h/t: Fola from here on
+ - Ni yarinya ne
+ - Zo ka chi abunchi
+ - Ina kwana Maman mu
+ - Wannan shago na ne
ENGLISH:
- - I have two children. You see I girl, what is the probability the other is also a girl?
- - No idea
- - Why you say that?
+ Latin:
+ - I have two children. You see I girl, what is the probability the other is also a girl?
+ - No idea
+ - Why you say that?
XHOSA:
- - Umama ngugqirha
- - Utata ngumongikazi
- - Ukuba intamo yam yayifuna ukwenza oko?
- - Iintsana zikhala kakhulu, huh?
+ Latin:
+ - Umama ngugqirha
+ - Utata ngumongikazi
+ - Ukuba intamo yam yayifuna ukwenza oko?
+ - Iintsana zikhala kakhulu, huh?
YORUBA: #h/t: Fola
- - Ni bo ló ti ri owo yen?
- - Eyin melo ni e wa ni be?
- - Ki ni itumo oruko ẹ?
- - Ki ni o jẹ lánà?
- - Omo Ibadan ni mi
+ Latin:
+ - Ni bo ló ti ri owo yen?
+ - Eyin melo ni e wa ni be?
+ - Ki ni itumo oruko ẹ?
+ - Ki ni o jẹ lánà?
+ - Omo Ibadan ni mi
IGBO: #h/t: Fola
- - agụụ na-agụ m
- - agam aga ahia echi
- - ị hụla ngozi? ana m achọ ya.
- - m na-aga ọrụ
+ Latin:
+ - agụụ na-agụ m
+ - agam aga ahia echi
+ - ị hụla ngozi? ana m achọ ya.
+ - m na-aga ọrụ
KOREAN:
- - 애가 둘이예요
- - ㅋㅋㅋㅋㅋㅋ
- - 아이들이 많이 울어요ㅠ
- - 이 프로젝트 애칭은 ask-a-question이야.
+ Korean:
+ - 애가 둘이예요
+ - ㅋㅋㅋㅋㅋㅋ
+ - 아이들이 많이 울어요ㅠ
+ - 이 프로젝트 애칭은 ask-a-question이야.
ZULU:
- - Ngingumama
- - Ingabe uyi-bot noma ungumuntu?
- - Ngifuna ukwenza lokhu?
- - Izingane zikhala kakhulu, hhe
+ Latin:
+ - Ngingumama
+ - Ingabe uyi-bot noma ungumuntu?
+ - Ngifuna ukwenza lokhu?
+ - Izingane zikhala kakhulu, hhe
AFRIKAANS:
- - Ek het hierdie goddelose dal gemaak
- - Is covid nog 'n ding?
- - My hond het my huiswerk geëet
- - Het jy al gebraaide roomys probeer?
+ Latin:
+ - Ek het hierdie goddelose dal gemaak
+ - Is covid nog 'n ding?
+ - My hond het my huiswerk geëet
+ - Het jy al gebraaide roomys probeer?
HINDI: #h/t: Sid
- - is ka matlab kya hai?
- - kabhi kabhi mere dil mein
- - अंत में सभी लोग नाश्ता करने जाएं
- - गब्बर सिंह कह के गया जो डर गया वो मर गया
+ Latin:
+ - is ka matlab kya hai?
+ - kabhi kabhi mere dil mein
+ Devanagari:
+ - अंत में सभी लोग नाश्ता करने जाएं
+ - गब्बर सिंह कह के गया जो डर गया वो मर गया
+MARATHI:
+ Latin:
+ - Portal chi link aahe
UNINTELLIGIBLE:
- - sdfsdf sss dyhnel jjj
- - hs dsfsg xd ewwo ddfs
- - Heghlu'meH QaQ jajvam
- - yIHuchQo', 'ej jIHvaD yIqemchu'mo'
- - \%^*# levels; 91011 AQGs!!!
- - 1234 AQI WHO? 5678
+ Unknown:
+ - sdfsdf sss dyhnel jjj
+ - hs dsfsg xd ewwo ddfs
+ - Heghlu'meH QaQ jajvam
+ - yIHuchQo', 'ej jIHvaD yIqemchu'mo'
+ - \%^*# levels; 91011 AQGs!!!
+ - 1234 AQI WHO? 5678
diff --git a/core_backend/tests/rails/test_language_identification.py b/core_backend/tests/rails/test_language_identification.py
index 9b30b2e9a..b7c30f46f 100644
--- a/core_backend/tests/rails/test_language_identification.py
+++ b/core_backend/tests/rails/test_language_identification.py
@@ -5,7 +5,7 @@
import pytest
import yaml
-from core_backend.app.llm_call.llm_prompts import IdentifiedLanguage
+from core_backend.app.llm_call.llm_prompts import IdentifiedLanguage, IdentifiedScript
from core_backend.app.llm_call.process_input import _identify_language
from core_backend.app.question_answer.schemas import QueryRefined, QueryResponse
@@ -22,19 +22,38 @@ def available_languages() -> list[str]:
return list(IdentifiedLanguage)
-def read_test_data(file: str) -> list[tuple[str, str]]:
+@pytest.fixture(scope="module")
+def available_scripts() -> list[str]:
+    """Returns a list of available scripts."""
+
+ return list(IdentifiedScript)
+
+
+def read_test_data(file: str) -> list[tuple[str, str, str]]:
"""Reads test data from file and returns a list of strings."""
file_path = Path(__file__).parent / file
with open(file_path, "r", encoding="utf-8") as f:
content = yaml.safe_load(f)
- return [(key, value) for key, values in content.items() for value in values]
-
-
-@pytest.mark.parametrize("expected_label, content", read_test_data(LANGUAGE_FILE))
+ data = [
+ (language, script, text)
+ for language, script_dict in content.items()
+ for script, texts in script_dict.items()
+ for text in texts
+ ]
+ return data
+
+
+@pytest.mark.parametrize(
+ "expected_language,expected_script,content", read_test_data(LANGUAGE_FILE)
+)
async def test_language_identification(
- available_languages: list[str], expected_label: str, content: str
+ available_languages: list[str],
+ available_scripts: list[str],
+ expected_language: str,
+ expected_script: str,
+ content: str,
) -> None:
"""Test language identification."""
@@ -53,8 +72,15 @@ async def test_language_identification(
search_results=None,
session_id=None,
)
- if expected_label not in available_languages:
- expected_label = "UNSUPPORTED"
+
+ if expected_language not in available_languages:
+ expected_language = "UNSUPPORTED"
+
+ if expected_script not in available_scripts:
+ expected_script = "Unknown"
+
_, response = await _identify_language(query_refined=question, response=response)
- assert response.debug_info["original_language"] == expected_label
+ assert response.debug_info["original_language"] == expected_language
+ if expected_language not in ("UNINTELLIGIBLE", "UNSUPPORTED"):
+ assert response.debug_info["original_script"] == expected_script
From ab2fd75511f3333e7bccde9ddd5388c1ee22ae8e Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Wed, 9 Apr 2025 18:38:28 +0530
Subject: [PATCH 04/18] changes
---
core_backend/app/llm_call/llm_prompts.py | 27 +++++++-------
core_backend/app/llm_call/llm_rag.py | 14 ++------
core_backend/app/llm_call/process_input.py | 40 +++++++++++++++------
core_backend/app/question_answer/routers.py | 4 ---
core_backend/app/question_answer/schemas.py | 1 +
5 files changed, 46 insertions(+), 40 deletions(-)
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 751a1079a..d82e3aed7 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -234,6 +234,11 @@ class AlignmentScore(BaseModel):
RELEVANT INFORMATION.
Only output the JSON response, without any additional text.
+
+
+
+{additional_info}
+
"""
@@ -396,20 +401,14 @@ class IdentifiedScript(str, Enum):
LATIN = "Latin"
DEVANAGARI = "Devanagari"
- # ARABIC = "Arabic"
- # CYRILLIC = "Cyrillic"
- # CHINESE = "Chinese"
- # JAPANESE = "Japanese"
- # KOREAN = "Korean"
- # THAI = "Thai"
- # BENGALI = "Bengali"
- # TAMIL = "Tamil"
- # TELUGU = "Telugu"
- # KANNADA = "Kannada"
- # MALAYALAM = "Malayalam"
- # GUJARATI = "Gujarati"
- # GURMUKHI = "Gurmukhi"
- # ORIYA = "Oriya"
+ BENGALI = "Bengali"
+ TAMIL = "Tamil"
+ TELUGU = "Telugu"
+ KANNADA = "Kannada"
+ MALAYALAM = "Malayalam"
+ GUJARATI = "Gujarati"
+ GURMUKHI = "Gurmukhi"
+ ORIYA = "Oriya"
# SINHALA = "Sinhala"
# MYANMAR = "Myanmar"
# ETHIOPIC = "Ethiopic"
diff --git a/core_backend/app/llm_call/llm_rag.py b/core_backend/app/llm_call/llm_rag.py
index 49a229364..d2fb045d8 100644
--- a/core_backend/app/llm_call/llm_rag.py
+++ b/core_backend/app/llm_call/llm_rag.py
@@ -127,24 +127,14 @@ async def get_llm_rag_answer_with_chat_history(
message_type=message_type,
original_language=original_language,
original_script=original_script,
+ additional_info=context,
)
)
- content = (
- question
- + f""""\n\n
- ADDITIONAL RELEVANT INFORMATION BELOW
- =====================================
- {context}
-
- ADDITIONAL RELEVANT INFORMATION ABOVE
- =====================================
- """
- )
content = await get_chat_response(
chat_history=chat_history,
chat_params=chat_params,
- message_params=content,
+ message_params=question,
session_id=session_id,
json_=True,
metadata=metadata or {},
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index ba8d1025a..80b462f36 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -127,19 +127,26 @@ async def _identify_language(
identified_script = IdentifiedScript.LATIN
query_refined.original_language = identified_lang
+ query_refined.original_script = identified_script
+
response.debug_info["original_query"] = query_refined.query_text_original
response.debug_info["original_language"] = identified_lang
response.debug_info["original_script"] = identified_script
processed_response = _process_identified_language_response(
- identified_language=identified_lang, response=response
+ identified_language=identified_lang,
+ identified_script=identified_script,
+ response=response,
)
return query_refined, processed_response
def _process_identified_language_response(
- *, identified_language: IdentifiedLanguage, response: QueryResponse
+ *,
+ identified_language: IdentifiedLanguage,
+ identified_script: IdentifiedScript,
+ response: QueryResponse,
) -> QueryResponse | QueryResponseError:
"""Process the identified language and return the response.
@@ -147,6 +154,8 @@ def _process_identified_language_response(
----------
identified_language
The identified language.
+ identified_script
+ The identified script.
response
The response object.
@@ -157,20 +166,31 @@ def _process_identified_language_response(
"""
supported_languages_list = IdentifiedLanguage.get_supported_languages()
+ supported_scripts_list = IdentifiedScript.get_supported_scripts()
- if identified_language in supported_languages_list:
+ if (
+ identified_language in supported_languages_list
+ and identified_script in supported_scripts_list
+ ):
return response
supported_languages = ", ".join(supported_languages_list)
+ supported_scripts = ", ".join(supported_scripts_list)
- match identified_language:
- case IdentifiedLanguage.UNINTELLIGIBLE:
+ if identified_language == IdentifiedLanguage.UNINTELLIGIBLE:
+ error_message = (
+ "Unintelligible input. "
+ + f"The following languages are supported: {supported_languages}."
+ )
+ error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT
+ else:
+ if identified_script == IdentifiedScript.UNKNOWN:
error_message = (
- "Unintelligible input. "
- + f"The following languages are supported: {supported_languages}."
+ "Unsupported script. "
+ + f"Only the following scripts are supported: {supported_scripts}"
)
- error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT
- case _:
+ error_type = ErrorType.UNSUPPORTED_SCRIPT
+ else:
error_message = (
"Unsupported language. Only the following languages "
+ f"are supported: {supported_languages}."
@@ -190,7 +210,7 @@ def _process_identified_language_response(
error_response.debug_info.update(response.debug_info)
logger.info(
- f"LANGUAGE IDENTIFICATION FAILED due to {identified_language.value} "
+ f"LANGUAGE IDENTIFICATION FAILED due to {error_message} "
f"language on query id: {str(response.query_id)}"
)
diff --git a/core_backend/app/question_answer/routers.py b/core_backend/app/question_answer/routers.py
index 9d301cdc4..28f5e16aa 100644
--- a/core_backend/app/question_answer/routers.py
+++ b/core_backend/app/question_answer/routers.py
@@ -843,10 +843,6 @@ async def get_user_query_and_response(
query_text_original=user_query.query_text,
workspace_id=workspace_id,
)
- if user_query_refined.chat_query_params:
- user_query_refined.query_text = user_query_refined.chat_query_params.pop(
- "search_query"
- )
# Prepare the placeholder response object.
response_template = QueryResponse(
diff --git a/core_backend/app/question_answer/schemas.py b/core_backend/app/question_answer/schemas.py
index bda58ce7a..c434b28ee 100644
--- a/core_backend/app/question_answer/schemas.py
+++ b/core_backend/app/question_answer/schemas.py
@@ -23,6 +23,7 @@ class ErrorType(str, Enum):
UNABLE_TO_TRANSLATE = "unable_to_translate"
UNINTELLIGIBLE_INPUT = "unintelligible_input"
UNSUPPORTED_LANGUAGE = "unsupported_language"
+ UNSUPPORTED_SCRIPT = "unsupported_script"
class QueryBase(BaseModel):
From 2cba258051be3a32f6c55f0e9ebab6336daf3fbd Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 11:01:00 +0530
Subject: [PATCH 05/18] remove search query during init chat history
---
core_backend/app/llm_call/llm_prompts.py | 4 +---
core_backend/app/question_answer/routers.py | 1 -
2 files changed, 1 insertion(+), 4 deletions(-)
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index d82e3aed7..7977a923d 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -276,9 +276,7 @@ class ChatHistory:
{{
"message_type": "The type of the user's LATEST MESSAGE. List of valid
- options are: {valid_message_types},
- "query": "The vector database query that you have constructed based on
- the user's LATEST MESSAGE and the conversation history."
+ options are: {valid_message_types}"
}}
Do NOT attempt to answer the user's question/concern. Only output the JSON
diff --git a/core_backend/app/question_answer/routers.py b/core_backend/app/question_answer/routers.py
index 28f5e16aa..5a4b057b8 100644
--- a/core_backend/app/question_answer/routers.py
+++ b/core_backend/app/question_answer/routers.py
@@ -1073,7 +1073,6 @@ async def init_user_query_and_chat_histories(
"chat_params": chat_params,
"message_type": search_query_json_response["message_type"],
"redis_client": redis_client,
- "search_query": search_query_json_response["query"],
"session_id": session_id,
}
user_query.generate_llm_response = True
From dbdeec459ff5cfc5557c12be9e1de0a684433a91 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 11:36:12 +0530
Subject: [PATCH 06/18] fix tests and type
---
core_backend/app/llm_call/llm_prompts.py | 1 -
core_backend/tests/api/test_question_answer.py | 12 +++++++-----
2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 7977a923d..2766fabe6 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -291,7 +291,6 @@ class ChatHistoryConstructSearchQuery(BaseModel):
"""Pydantic model for the output of the construct search query chat history."""
message_type: Literal["FOLLOW-UP", "NEW"]
- query: str
@staticmethod
def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str]:
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index 50299a7b4..936bf41b5 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -1053,11 +1053,13 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
[
("ENGLISH", "Latin", False, None),
("HINDI", "Devanagari", False, None),
- ("UNINTELLIGIBLE", True, ErrorType.UNINTELLIGIBLE_INPUT),
- ("GIBBERISH", True, ErrorType.UNSUPPORTED_LANGUAGE),
- ("UNSUPPORTED", True, ErrorType.UNSUPPORTED_LANGUAGE),
- ("SOME_UNSUPPORTED_LANG", True, ErrorType.UNSUPPORTED_LANGUAGE),
- ("don't kow", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("UNINTELLIGIBLE", "Latin", True, ErrorType.UNINTELLIGIBLE_INPUT),
+ ("UNINTELLIGIBLE", "Unknown", True, ErrorType.UNSUPPORTED_SCRIPT),
+ ("GIBBERISH", "Unknwon", True, ErrorType.UNSUPPORTED_SCRIPT),
+ ("GIBBERISH", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("UNSUPPORTED", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("SOME_UNSUPPORTED_LANG", "Unknown", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("don't kow", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
],
)
async def test_language_identify_error(
From 09fdd6e8b876ad01f637548c64e77210944bb30b Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 15:08:01 +0530
Subject: [PATCH 07/18] change schema and add validator
---
core_backend/app/llm_call/llm_prompts.py | 20 +++++++++++++++++---
1 file changed, 17 insertions(+), 3 deletions(-)
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 2766fabe6..61cef0490 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -7,7 +7,7 @@
from enum import Enum
from typing import ClassVar, Literal
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator
from .utils import format_prompt, remove_json_markdown
@@ -453,8 +453,22 @@ def get_supported_scripts(cls) -> list[str]:
class LanguageIdentificationResponse(BaseModel):
"""Pydantic model for the language identification response."""
- language: IdentifiedLanguage
- script: IdentifiedScript
+ language: str
+ script: str
+
+ @field_validator("language")
+ def validate_language(cls, value: str) -> str:
+ """Make sure language input is a valid IdentifiedLanguage"""
+ if value not in IdentifiedLanguage._member_names_:
+ raise ValueError(f"Invalid language: {value}")
+ return value
+
+ @field_validator("script")
+ def validate_script(cls, value: str) -> str:
+ """Make sure script input is a valid IdentifiedScript"""
+ if value not in IdentifiedScript._member_names_:
+ raise ValueError(f"Invalid script: {value}")
+ return value
model_config = ConfigDict(strict=True)
From ba013a57aed365f4a2d58e36855c7f0f4c5e02ad Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 15:35:58 +0530
Subject: [PATCH 08/18] fix test return mock value
---
.secrets.baseline | 6 +++---
core_backend/app/llm_call/process_input.py | 2 +-
core_backend/tests/api/test_chat.py | 1 -
core_backend/tests/api/test_question_answer.py | 7 +++----
4 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/.secrets.baseline b/.secrets.baseline
index 2ba6baa9b..30aef52a0 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -448,14 +448,14 @@
"filename": "core_backend/tests/api/test_question_answer.py",
"hashed_secret": "1d2be5ef28a76e2207456e7eceabe1219305e43d",
"is_verified": false,
- "line_number": 419
+ "line_number": 418
},
{
"type": "Secret Keyword",
"filename": "core_backend/tests/api/test_question_answer.py",
"hashed_secret": "6367c48dd193d56ea7b0baad25b19455e529f5ee",
"is_verified": false,
- "line_number": 1019
+ "line_number": 1018
}
],
"core_backend/tests/api/test_user_tools.py": [
@@ -581,5 +581,5 @@
}
]
},
- "generated_at": "2025-04-09T08:32:56Z"
+ "generated_at": "2025-04-10T10:05:42Z"
}
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index 80b462f36..d603980a4 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -117,8 +117,8 @@ async def _identify_language(
user_message=query_refined.query_text,
)
+ cleaned_json_str = remove_json_markdown(text=json_str)
try:
- cleaned_json_str = remove_json_markdown(text=json_str)
lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
identified_lang = lang_info.language
identified_script = lang_info.script
diff --git a/core_backend/tests/api/test_chat.py b/core_backend/tests/api/test_chat.py
index ed2f35f5e..d32cb3436 100644
--- a/core_backend/tests/api/test_chat.py
+++ b/core_backend/tests/api/test_chat.py
@@ -85,7 +85,6 @@ async def test_init_user_query_and_chat_histories(redis_client: aioredis.Redis)
chat_query_params["chat_cache_key"] == f"chatCache:{user_query.session_id}"
)
assert chat_query_params["message_type"] == "NEW"
- assert chat_query_params["search_query"] == "stomachache and possible remedies"
async def test__ask_llm_async() -> None:
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index 936bf41b5..38fa74ddc 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -13,7 +13,6 @@
from core_backend.app.llm_call.llm_prompts import (
AlignmentScore,
IdentifiedLanguage,
- LanguageIdentificationResponse,
)
from core_backend.app.llm_call.process_input import (
_classify_safety,
@@ -1115,9 +1114,9 @@ async def mock_ask_llm( # pylint: disable=W0613
The identified language and script model json string.
"""
- return LanguageIdentificationResponse(
- language=identified_lang_str, script=identified_script_str
- ).model_dump_json()
+ return f"""
+ {{"language": "{identified_lang_str}", "script": "{identified_script_str}"}}
+ """.strip()
monkeypatch.setattr(
"core_backend.app.llm_call.process_input._ask_llm_async", mock_ask_llm
From 7ed12705b693cb6ae00bf7a7a05e2db346c7cadd Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 15:43:29 +0530
Subject: [PATCH 09/18] use enum not string
---
core_backend/app/llm_call/process_input.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index d603980a4..06d0847a3 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -120,8 +120,8 @@ async def _identify_language(
cleaned_json_str = remove_json_markdown(text=json_str)
try:
lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
- identified_lang = lang_info.language
- identified_script = lang_info.script
+ identified_lang = getattr(IdentifiedLanguage, lang_info.language)
+ identified_script = getattr(IdentifiedScript, lang_info.script)
except ValidationError:
identified_lang = IdentifiedLanguage.UNSUPPORTED
identified_script = IdentifiedScript.LATIN
From e8cbf8a9f70fe66dfc4d1af85b85aa8efbd5336b Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 18:07:36 +0530
Subject: [PATCH 10/18] fix type issues
---
core_backend/app/llm_call/llm_prompts.py | 151 +++++++++------------
core_backend/app/llm_call/llm_rag.py | 10 +-
core_backend/app/llm_call/process_input.py | 4 +-
3 files changed, 73 insertions(+), 92 deletions(-)
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 61cef0490..92b63a125 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -183,20 +183,20 @@ class AlignmentScore(BaseModel):
CHAT_RESPONSE_PROMPT = """\
-You are an AI assistant designed to help users with their
-questions/concerns. You interact with users via a chat interface. You will
-be provided with ADDITIONAL RELEVANT INFORMATION that can address the
+You are an AI assistant designed to help users with their \
+questions/concerns. You interact with users via a chat interface. You will \
+be provided with ADDITIONAL RELEVANT INFORMATION that can address the \
user's questions/concerns.
BEFORE answering the user's LATEST MESSAGE, follow these steps:
-1. Review the conversation history to ensure that you understand the
+1. Review the conversation history to ensure that you understand the \
context in which the user's LATEST MESSAGE is being asked.
-2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you
-understand the most useful information related to the user's LATEST
+2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you \
+understand the most useful information related to the user's LATEST \
MESSAGE.
-When you have completed the above steps, you will then write a JSON, whose
+When you have completed the above steps, you will then write a JSON, whose \
TypeScript Interface is given below:
interface Response {{
@@ -204,41 +204,33 @@ class AlignmentScore(BaseModel):
answer: string;
}}
-For "extracted_info", extract from the provided ADDITIONAL RELEVANT
-INFORMATION the most useful information related to the LATEST MESSAGE asked
-by the user, and list them one by one. If no useful information is found,
+For "extracted_info", extract from the provided ADDITIONAL RELEVANT \
+INFORMATION the most useful information related to the LATEST MESSAGE asked \
+by the user, and list them one by one. If no useful information is found, \
return an empty list.
-For "answer", understand the conversation history, ADDITIONAL RELEVANT
-INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to
-the user's LATEST MESSAGE. If no useful information was found in the
-either the conversation history or the ADDITIONAL RELEVANT INFORMATION,
+For "answer", understand the conversation history, ADDITIONAL RELEVANT \
+INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to \
+the user's LATEST MESSAGE. If no useful information was found in the \
+either the conversation history or the ADDITIONAL RELEVANT INFORMATION, \
respond with {failure_message}.
EXAMPLE RESPONSES:
-{{"extracted_info": [
- "Pineapples are a blend of pinecones and apples.",
- "Pineapples have the shape of a pinecone."
- ],
- "answer": "The 'pine-' from pineapples likely come from the fact that
- pineapples are a hybrid of pinecones and apples and its pinecone-like
- shape."
-}}
+{{"extracted_info": ["Pineapples are a blend of pinecones and apples.", \
+"Pineapples have the shape of a pinecone."], \
+"answer": "The 'pine-' from pineapples likely come from the fact that \
+pineapples are a hybrid of pinecones and apples and its pinecone-like \
+shape."}}
{{"extracted_info": [], "answer": "{failure_message}"}}
IMPORTANT NOTES ON THE "answer" FIELD:
- Keep in mind that the user is asking a {message_type} question.
- Answer in the language {original_language} in the script {original_script}.
- Answer should be concise and to the point.
-- Do not include any information that is not present in the ADDITIONAL
+- Do not include any information that is not present in the ADDITIONAL \
RELEVANT INFORMATION.
-Only output the JSON response, without any additional text.
-
-
-
-{additional_info}
-
+Only output the JSON response, without any additional text.\
"""
@@ -343,19 +335,17 @@ class IdentifiedLanguage(str, Enum):
SWAHILI = "SWAHILI"
UNINTELLIGIBLE = "UNINTELLIGIBLE"
UNSUPPORTED = "UNSUPPORTED"
+
# XHOSA = "XHOSA"
# ZULU = "ZULU"
-
@classmethod
def get_supported_languages(cls) -> list[str]:
"""Return a list of supported languages.
-
Returns
-------
list[str]
A list of supported languages.
"""
-
return [
lang
for lang in cls._member_names_
@@ -380,57 +370,53 @@ def _missing_(cls, value: str) -> IdentifiedLanguage: # type: ignore[override]
return cls.UNSUPPORTED
- @classmethod
- def get_prompt(cls) -> str:
- """Return the prompt for the language identification bot.
-
- Returns
- -------
- str
- The prompt for the language identification bot.
- """
-
- return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip()
-
class IdentifiedScript(str, Enum):
"""Script used in the user's input."""
- LATIN = "Latin"
- DEVANAGARI = "Devanagari"
- BENGALI = "Bengali"
- TAMIL = "Tamil"
- TELUGU = "Telugu"
- KANNADA = "Kannada"
- MALAYALAM = "Malayalam"
- GUJARATI = "Gujarati"
- GURMUKHI = "Gurmukhi"
- ORIYA = "Oriya"
- # SINHALA = "Sinhala"
- # MYANMAR = "Myanmar"
- # ETHIOPIC = "Ethiopic"
- # GEORGIAN = "Georgian"
- # ARMENIAN = "Armenian"
- # HEBREW = "Hebrew"
- # GREEK = "Greek"
- # TIBETAN = "Tibetan"
- # MONGOLIAN = "Mongolian"
- # KHMER = "Khmer"
- # LAO = "Lao"
- # VIETNAMESE = "Vietnamese"
- # THAI_LAO = "Thai-Lao"
- UNKNOWN = "Unknown"
+ LATIN = "LATIN"
+ DEVANAGARI = "DEVANAGARI"
+ BENGALI = "BENGALI"
+ TAMIL = "TAMIL"
+ TELUGU = "TELUGU"
+ KANNADA = "KANNADA"
+ MALAYALAM = "MALAYALAM"
+ GUJARATI = "GUJARATI"
+ # GURMUKHI = "GURMUKHI"
+ # ORIYA = "ORIYA"
+ # SINHALA = "SINHALA"
+ # MYANMAR = "MYANMAR"
+ # ETHIOPIC = "ETHIOPIC"
+ # GEORGIAN = "GEORGIAN"
+ # ARMENIAN = "ARMENIAN"
+ # HEBREW = "HEBREW"
+ # GREEK = "GREEK"
+ # TIBETAN = "TIBETAN"
+ # MONGOLIAN = "MONGOLIAN"
+ # KHMER = "KHMER"
+ # LAO = "LAO"
+ # VIETNAMESE = "VIETNAMESE"
+ # THAI_LAO = "THAI_LAO"
+ UNKNOWN = "UNKNOWN"
+
+ @classmethod
+ def get_supported_scripts(cls) -> list[str]:
+ """Return a list of supported scripts.
+ Returns
+ -------
+ list[str]
+ A list of supported scripts.
+ """
+ return [script for script in cls._member_names_ if script != "UNKNOWN"]
@classmethod
def _missing_(cls, value: str) -> IdentifiedScript: # type: ignore[override]
"""If script identified is not one of the supported scripts, it is
classified as UNKNOWN.
-
Parameters
----------
value
The script identified.
-
Returns
-------
Script
@@ -438,17 +424,6 @@ def _missing_(cls, value: str) -> IdentifiedScript: # type: ignore[override]
"""
return cls.UNKNOWN
- @classmethod
- def get_supported_scripts(cls) -> list[str]:
- """Return a list of supported scripts.
-
- Returns
- -------
- list[str]
- A list of supported scripts.
- """
- return [script.value for script in cls if script != cls.UNKNOWN]
-
class LanguageIdentificationResponse(BaseModel):
"""Pydantic model for the language identification response."""
@@ -490,13 +465,13 @@ def validate_script(cls, value: str) -> str:
4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin"""
+ """
Examples:
-"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}}
-"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}}
-"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}}
-"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}}
-"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}}
-"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}}
-"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}}
+"How many beds are there?" -> {{"language": "ENGLISH", "script": "LATIN"}}
+"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "LATIN"}}
+"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "DEVANAGARI"}}
+"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "LATIN"}}
+"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "LATIN"}}
+"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "LATIN"}}
+"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "ARABIC"}}
Respond with a JSON object containing "language" and "script" keys.
"""
diff --git a/core_backend/app/llm_call/llm_rag.py b/core_backend/app/llm_call/llm_rag.py
index d2fb045d8..d7a1dea12 100644
--- a/core_backend/app/llm_call/llm_rag.py
+++ b/core_backend/app/llm_call/llm_rag.py
@@ -18,6 +18,7 @@
from .utils import (
_ask_llm_async,
append_messages_to_chat_history,
+ format_prompt,
get_chat_response,
remove_json_markdown,
)
@@ -127,14 +128,19 @@ async def get_llm_rag_answer_with_chat_history(
message_type=message_type,
original_language=original_language,
original_script=original_script,
- additional_info=context,
)
)
+ user_message_with_context = format_prompt(
+ prompt=f"""{question}\n\n
+
+ {context}
+ """
+ )
content = await get_chat_response(
chat_history=chat_history,
chat_params=chat_params,
- message_params=question,
+ message_params=user_message_with_context,
session_id=session_id,
json_=True,
metadata=metadata or {},
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index 06d0847a3..e73cc24d6 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -120,8 +120,8 @@ async def _identify_language(
cleaned_json_str = remove_json_markdown(text=json_str)
try:
lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
- identified_lang = getattr(IdentifiedLanguage, lang_info.language)
- identified_script = getattr(IdentifiedScript, lang_info.script)
+ identified_lang = getattr(IdentifiedLanguage, lang_info.language.upper())
+ identified_script = getattr(IdentifiedScript, lang_info.script.upper())
except ValidationError:
identified_lang = IdentifiedLanguage.UNSUPPORTED
identified_script = IdentifiedScript.LATIN
From 730a8ccbcbae28f11f6b7317247f8c43bb4ef7aa Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 18:10:15 +0530
Subject: [PATCH 11/18] fix tests and always run paraphrase guardrail
---
core_backend/app/llm_call/process_input.py | 7 +++----
core_backend/tests/api/test_question_answer.py | 18 +++++++++---------
2 files changed, 12 insertions(+), 13 deletions(-)
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index e73cc24d6..c71371908 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -493,10 +493,9 @@ async def wrapper(
The appropriate response object.
"""
- if not query_refined.chat_query_params:
- query_refined, response = await _paraphrase_question(
- query_refined=query_refined, response=response
- )
+ query_refined, response = await _paraphrase_question(
+ query_refined=query_refined, response=response
+ )
response = await func(query_refined, response, *args, **kwargs)
return response
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index 38fa74ddc..aa8648c5e 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -1050,15 +1050,15 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
@pytest.mark.parametrize(
"identified_lang_str,identified_script_str,should_error,expected_error_type",
[
- ("ENGLISH", "Latin", False, None),
- ("HINDI", "Devanagari", False, None),
- ("UNINTELLIGIBLE", "Latin", True, ErrorType.UNINTELLIGIBLE_INPUT),
- ("UNINTELLIGIBLE", "Unknown", True, ErrorType.UNSUPPORTED_SCRIPT),
- ("GIBBERISH", "Unknwon", True, ErrorType.UNSUPPORTED_SCRIPT),
- ("GIBBERISH", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
- ("UNSUPPORTED", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
- ("SOME_UNSUPPORTED_LANG", "Unknown", True, ErrorType.UNSUPPORTED_LANGUAGE),
- ("don't kow", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("ENGLISH", "LATIN", False, None),
+ ("HINDI", "DEVANAGARI", False, None),
+ ("UNINTELLIGIBLE", "LATIN", True, ErrorType.UNINTELLIGIBLE_INPUT),
+ ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
+ ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
+ ("GIBBERISH", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("UNSUPPORTED", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("SOME_UNSUPPORTED_LANG", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("don't kow", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
],
)
async def test_language_identify_error(
From 8b7e5a4cbfabbd94ddcd991e79f6f455f2d39a7a Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 18:14:47 +0530
Subject: [PATCH 12/18] use uppercase
---
.../rails/data/language_identification.yaml | 24 +++++++++----------
.../rails/test_language_identification.py | 2 +-
2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/core_backend/tests/rails/data/language_identification.yaml b/core_backend/tests/rails/data/language_identification.yaml
index 7daa61476..4b28c20e2 100644
--- a/core_backend/tests/rails/data/language_identification.yaml
+++ b/core_backend/tests/rails/data/language_identification.yaml
@@ -2,7 +2,7 @@
# improve this with a native speaker. These might be too "pure".
HAUSA:
- Latin:
+ LATIN:
- Ina da yara biyu masu hanci
- Jiya ina jin barci akan kujera yau kuma bayana yayi zafi
- Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa?
@@ -13,59 +13,59 @@ HAUSA:
- Ina kwana Maman mu
- Wannan shago na ne
ENGLISH:
- Latin:
+ LATIN:
- I have two children. You see I girl, what is the probability the other is also a girl?
- No idea
- Why you say that?
XHOSA:
- Latin:
+ LATIN:
- Umama ngugqirha
- Utata ngumongikazi
- Ukuba intamo yam yayifuna ukwenza oko?
- Iintsana zikhala kakhulu, huh?
YORUBA: #h/t: Fola
- Latin:
+ LATIN:
- Ni bo ló ti ri owo yen?
- Eyin melo ni e wa ni be?
- Ki ni itumo oruko ẹ?
- Ki ni o jẹ lánà?
- Omo Ibadan ni mi
IGBO: #h/t: Fola
- Latin:
+ LATIN:
- agụụ na-agụ m
- agam aga ahia echi
- ị hụla ngozi? ana m achọ ya.
- m na-aga ọrụ
KOREAN:
- Korean:
+ KOREAN:
- 애가 둘이예요
- ㅋㅋㅋㅋㅋㅋ
- 아이들이 많이 울어요ㅠ
- 이 프로젝트 애칭은 ask-a-question이야.
ZULU:
- Latin:
+ LATIN:
- Ngingumama
- Ingabe uyi-bot noma ungumuntu?
- Ngifuna ukwenza lokhu?
- Izingane zikhala kakhulu, hhe
AFRIKAANS:
- Latin:
+ LATIN:
- Ek het hierdie goddelose dal gemaak
- Is covid nog 'n ding?
- My hond het my huiswerk geëet
- Het jy al gebraaide roomys probeer?
HINDI: #h/t: Sid
- Latin:
+ LATIN:
- is ka matlab kya hai?
- kabhi kabhi mere dil mein
- Devanagari:
+ DEVANAGARI:
- अंत में सभी लोग नाश्ता करने जाएं
- गब्बर सिंह कह के गया जो डर गया वो मर गया
MARATHI:
- Latin:
+ LATIN:
- Portal chi link aahe
UNINTELLIGIBLE:
- Unknown:
+ UNKNOWN:
- sdfsdf sss dyhnel jjj
- hs dsfsg xd ewwo ddfs
- Heghlu'meH QaQ jajvam
diff --git a/core_backend/tests/rails/test_language_identification.py b/core_backend/tests/rails/test_language_identification.py
index b7c30f46f..6744d8216 100644
--- a/core_backend/tests/rails/test_language_identification.py
+++ b/core_backend/tests/rails/test_language_identification.py
@@ -77,7 +77,7 @@ async def test_language_identification(
expected_language = "UNSUPPORTED"
if expected_script not in available_scripts:
- expected_script = "Unknown"
+ expected_script = "UNKNOWN"
_, response = await _identify_language(query_refined=question, response=response)
From 15e083fafd0a680ce6094d03c03aaedcee1e652f Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 18:35:30 +0530
Subject: [PATCH 13/18] fix tests and how we get enum
---
core_backend/app/llm_call/process_input.py | 4 ++--
core_backend/tests/api/test_question_answer.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index c71371908..4ae2eb368 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -120,8 +120,8 @@ async def _identify_language(
cleaned_json_str = remove_json_markdown(text=json_str)
try:
lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
- identified_lang = getattr(IdentifiedLanguage, lang_info.language.upper())
- identified_script = getattr(IdentifiedScript, lang_info.script.upper())
+ identified_lang = IdentifiedLanguage(lang_info.language.upper())
+ identified_script = IdentifiedScript(lang_info.script.upper())
except ValidationError:
identified_lang = IdentifiedLanguage.UNSUPPORTED
identified_script = IdentifiedScript.LATIN
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index aa8648c5e..0894ccbde 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -1053,7 +1053,7 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
("ENGLISH", "LATIN", False, None),
("HINDI", "DEVANAGARI", False, None),
("UNINTELLIGIBLE", "LATIN", True, ErrorType.UNINTELLIGIBLE_INPUT),
- ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
+ ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNINTELLIGIBLE_INPUT),
("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
("GIBBERISH", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
("UNSUPPORTED", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
From a4136be621f058ada3b1e15e71f5a0827dad82d0 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 18:48:49 +0530
Subject: [PATCH 14/18] add test cases
---
core_backend/app/llm_call/process_input.py | 1 +
core_backend/tests/api/test_question_answer.py | 4 +++-
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index 4ae2eb368..a7a3d7a21 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -184,6 +184,7 @@ def _process_identified_language_response(
)
error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT
else:
+ # TODO: create types for language x script combos
if identified_script == IdentifiedScript.UNKNOWN:
error_message = (
"Unsupported script. "
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index 0894ccbde..43e9b7ef5 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -1054,7 +1054,9 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
("HINDI", "DEVANAGARI", False, None),
("UNINTELLIGIBLE", "LATIN", True, ErrorType.UNINTELLIGIBLE_INPUT),
("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNINTELLIGIBLE_INPUT),
- ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
+ ("ENGLISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
+ ("ENGLISH", "Some unsupported script", True, ErrorType.UNSUPPORTED_SCRIPT),
+ ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE),
("GIBBERISH", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
("UNSUPPORTED", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
("SOME_UNSUPPORTED_LANG", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE),
From e951d0acdc17bb8fb2a356636db956541ef4f0e4 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 19:06:54 +0530
Subject: [PATCH 15/18] clean up error logic
---
core_backend/app/llm_call/process_input.py | 35 ++++++++++------------
1 file changed, 16 insertions(+), 19 deletions(-)
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index a7a3d7a21..a84d1e798 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -167,34 +167,31 @@ def _process_identified_language_response(
supported_languages_list = IdentifiedLanguage.get_supported_languages()
supported_scripts_list = IdentifiedScript.get_supported_scripts()
+ supported_languages_str = ", ".join(supported_languages_list)
+ suported_scripts_str = ", ".join(supported_scripts_list)
- if (
- identified_language in supported_languages_list
- and identified_script in supported_scripts_list
- ):
- return response
+ language_ok = identified_language in supported_languages_list
+ script_ok = identified_script in supported_scripts_list
- supported_languages = ", ".join(supported_languages_list)
- supported_scripts = ", ".join(supported_scripts_list)
-
- if identified_language == IdentifiedLanguage.UNINTELLIGIBLE:
+ if language_ok and script_ok:
+ return response
+ elif language_ok and not script_ok:
error_message = (
- "Unintelligible input. "
- + f"The following languages are supported: {supported_languages}."
+ "Unsupported script. "
+ + f"Only the following scripts are supported: {suported_scripts_str}"
)
- error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT
- else:
- # TODO: create types for language x script combos
- if identified_script == IdentifiedScript.UNKNOWN:
+ error_type: ErrorType = ErrorType.UNSUPPORTED_SCRIPT
+ else: # regardless of script, language is not "ok"
+ if identified_language == IdentifiedLanguage.UNINTELLIGIBLE:
error_message = (
- "Unsupported script. "
- + f"Only the following scripts are supported: {supported_scripts}"
+ "Unintelligible input. "
+ + f"The following languages are supported: {supported_languages_str}."
)
- error_type = ErrorType.UNSUPPORTED_SCRIPT
+ error_type = ErrorType.UNINTELLIGIBLE_INPUT
else:
error_message = (
"Unsupported language. Only the following languages "
- + f"are supported: {supported_languages}."
+ + f"are supported: {supported_languages_str}."
)
error_type = ErrorType.UNSUPPORTED_LANGUAGE
From c8117de28461fa5c586c9d08328b6cacead2f559 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 19:14:57 +0530
Subject: [PATCH 16/18] fix conftest
---
.secrets.baseline | 53 +-----------------------------
core_backend/tests/api/conftest.py | 3 ++
2 files changed, 4 insertions(+), 52 deletions(-)
diff --git a/.secrets.baseline b/.secrets.baseline
index 30aef52a0..f961ef821 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -348,57 +348,6 @@
"line_number": 15
}
],
- "core_backend/tests/api/conftest.py": [
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "407c6798fe20fd5d75de4a233c156cc0fce510e3",
- "is_verified": false,
- "line_number": 46
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "42553e798bc193bcf25368b5e53ec7cd771483a7",
- "is_verified": false,
- "line_number": 47
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "9fb7fe1217aed442b04c0f5e43b5d5a7d3287097",
- "is_verified": false,
- "line_number": 50
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "767ef7376d44bb6e52b390ddcd12c1cb1b3902a4",
- "is_verified": false,
- "line_number": 51
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "70240b5d0947cc97447de496284791c12b2e678a",
- "is_verified": false,
- "line_number": 56
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "80fea3e25cb7e28550d13af9dfda7a9bd08c1a78",
- "is_verified": false,
- "line_number": 57
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "3465834d516797458465ae4ed2c62e7020032c4e",
- "is_verified": false,
- "line_number": 317
- }
- ],
"core_backend/tests/api/test.env": [
{
"type": "Secret Keyword",
@@ -581,5 +530,5 @@
}
]
},
- "generated_at": "2025-04-10T10:05:42Z"
+ "generated_at": "2025-04-10T13:44:48Z"
}
diff --git a/core_backend/tests/api/conftest.py b/core_backend/tests/api/conftest.py
index ca0a53a0b..d0e8b15bc 100644
--- a/core_backend/tests/api/conftest.py
+++ b/core_backend/tests/api/conftest.py
@@ -35,6 +35,7 @@
RAG,
AlignmentScore,
IdentifiedLanguage,
+ IdentifiedScript,
)
from core_backend.app.question_answer.models import (
ContentFeedbackDB,
@@ -1703,7 +1704,9 @@ async def mock_identify_language(
"""
query_refined.original_language = IdentifiedLanguage.ENGLISH
+ query_refined.original_script = IdentifiedScript.LATIN
response.debug_info["original_language"] = "ENGLISH"
+ response.debug_info["original_script"] = "LATIN"
return query_refined, response
From a81a3816ed477693fb7da49f05d7f55c4cb3537a Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 19:37:18 +0530
Subject: [PATCH 17/18] fix logic
---
core_backend/app/llm_call/llm_prompts.py | 20 +++-----------------
core_backend/app/llm_call/process_input.py | 7 ++++---
2 files changed, 7 insertions(+), 20 deletions(-)
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 92b63a125..f3603deb2 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -7,7 +7,7 @@
from enum import Enum
from typing import ClassVar, Literal
-from pydantic import BaseModel, ConfigDict, Field, field_validator
+from pydantic import BaseModel, ConfigDict, Field
from .utils import format_prompt, remove_json_markdown
@@ -428,22 +428,8 @@ def _missing_(cls, value: str) -> IdentifiedScript: # type: ignore[override]
class LanguageIdentificationResponse(BaseModel):
"""Pydantic model for the language identification response."""
- language: str
- script: str
-
- @field_validator("language")
- def validate_language(cls, value: str) -> str:
- """Make sure language input is a valid IdentifiedLanguage"""
- if value not in IdentifiedLanguage._member_names_:
- raise ValueError(f"Invalid language: {value}")
- return value
-
- @field_validator("script")
- def validate_script(cls, value: str) -> str:
- """Make sure script input is a valid IdentifiedScript"""
- if value not in IdentifiedScript._member_names_:
- raise ValueError(f"Invalid script: {value}")
- return value
+ language: IdentifiedLanguage
+ script: IdentifiedScript
model_config = ConfigDict(strict=True)
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index a84d1e798..b854d8ff0 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -167,12 +167,13 @@ def _process_identified_language_response(
supported_languages_list = IdentifiedLanguage.get_supported_languages()
supported_scripts_list = IdentifiedScript.get_supported_scripts()
- supported_languages_str = ", ".join(supported_languages_list)
- suported_scripts_str = ", ".join(supported_scripts_list)
language_ok = identified_language in supported_languages_list
script_ok = identified_script in supported_scripts_list
+ supported_languages_str = ", ".join(supported_languages_list)
+ suported_scripts_str = ", ".join(supported_scripts_list)
+
if language_ok and script_ok:
return response
elif language_ok and not script_ok:
@@ -209,7 +210,7 @@ def _process_identified_language_response(
logger.info(
f"LANGUAGE IDENTIFICATION FAILED due to {error_message} "
- f"language on query id: {str(response.query_id)}"
+ f"on query id: {str(response.query_id)}"
)
return error_response
From 40187b86fdd66d0579bba7b8f63407217cf30885 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Fri, 11 Apr 2025 17:15:42 +0530
Subject: [PATCH 18/18] Add query optimization back in, but don't run
translation for chat queries
---
core_backend/app/llm_call/llm_prompts.py | 5 ++++-
core_backend/app/llm_call/process_input.py | 18 +++++++++++-------
core_backend/app/question_answer/routers.py | 8 ++++++++
core_backend/app/question_answer/utils.py | 4 +++-
core_backend/tests/api/test_chat.py | 1 +
5 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index f3603deb2..db7a154bd 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -268,7 +268,9 @@ class ChatHistory:
{{
"message_type": "The type of the user's LATEST MESSAGE. List of valid
- options are: {valid_message_types}"
+ options are: {valid_message_types}",
+ "query": "The vector database query that you have constructed based on
+ the user's LATEST MESSAGE and the conversation history."
}}
Do NOT attempt to answer the user's question/concern. Only output the JSON
@@ -283,6 +285,7 @@ class ChatHistoryConstructSearchQuery(BaseModel):
"""Pydantic model for the output of the construct search query chat history."""
message_type: Literal["FOLLOW-UP", "NEW"]
+ query: str
@staticmethod
def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str]:
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index b854d8ff0..c6da6a5b4 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -114,7 +114,8 @@ async def _identify_language(
litellm_model=LITELLM_MODEL_LANGUAGE_DETECT,
metadata=metadata,
system_message=LANGUAGE_ID_PROMPT,
- user_message=query_refined.query_text,
+ # Always use the original query text for language and script detection
+ user_message=query_refined.query_text_original,
)
cleaned_json_str = remove_json_markdown(text=json_str)
@@ -256,9 +257,10 @@ async def wrapper(
The appropriate response object.
"""
- query_refined, response = await _translate_question(
- query_refined=query_refined, response=response
- )
+ if not query_refined.chat_query_params:
+ query_refined, response = await _translate_question(
+ query_refined=query_refined, response=response
+ )
response = await func(query_refined, response, *args, **kwargs)
return response
@@ -492,9 +494,11 @@ async def wrapper(
The appropriate response object.
"""
- query_refined, response = await _paraphrase_question(
- query_refined=query_refined, response=response
- )
+ if not query_refined.chat_query_params:
+ query_refined, response = await _paraphrase_question(
+ query_refined=query_refined, response=response
+ )
+
response = await func(query_refined, response, *args, **kwargs)
return response
diff --git a/core_backend/app/question_answer/routers.py b/core_backend/app/question_answer/routers.py
index 5a4b057b8..e6091edb1 100644
--- a/core_backend/app/question_answer/routers.py
+++ b/core_backend/app/question_answer/routers.py
@@ -844,6 +844,13 @@ async def get_user_query_and_response(
workspace_id=workspace_id,
)
+ # In case of a chat query, use the optimized query as the base query_text.
+ # Note that for language identification, we use query_text_original.
+ if user_query_refined.chat_query_params:
+ user_query_refined.query_text = user_query_refined.chat_query_params.pop(
+ "search_query"
+ )
+
# Prepare the placeholder response object.
response_template = QueryResponse(
debug_info={},
@@ -1072,6 +1079,7 @@ async def init_user_query_and_chat_histories(
"chat_history": user_assistant_chat_history,
"chat_params": chat_params,
"message_type": search_query_json_response["message_type"],
+ "search_query": search_query_json_response["query"],
"redis_client": redis_client,
"session_id": session_id,
}
diff --git a/core_backend/app/question_answer/utils.py b/core_backend/app/question_answer/utils.py
index 029d7194c..f972e46dc 100644
--- a/core_backend/app/question_answer/utils.py
+++ b/core_backend/app/question_answer/utils.py
@@ -23,6 +23,8 @@ def get_context_string_from_search_results(
for key, result in search_results.items():
if not isinstance(result, QuerySearchResult):
result = QuerySearchResult(**result)
- context_list.append(f"{key}. {result.title}\n{result.text}")
+ context_list.append(
+ f" \n**{result.title}**\n\n{result.text}\n"
+ )
context_string = "\n\n".join(context_list)
return context_string
diff --git a/core_backend/tests/api/test_chat.py b/core_backend/tests/api/test_chat.py
index d32cb3436..ed2f35f5e 100644
--- a/core_backend/tests/api/test_chat.py
+++ b/core_backend/tests/api/test_chat.py
@@ -85,6 +85,7 @@ async def test_init_user_query_and_chat_histories(redis_client: aioredis.Redis)
chat_query_params["chat_cache_key"] == f"chatCache:{user_query.session_id}"
)
assert chat_query_params["message_type"] == "NEW"
+ assert chat_query_params["search_query"] == "stomachache and possible remedies"
async def test__ask_llm_async() -> None: