From a350cb0b46c71b6bddf145a8a3759cf4918d64d8 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Wed, 2 Apr 2025 17:20:12 +0530
Subject: [PATCH 01/18] detect script as well

---
 core_backend/app/llm_call/llm_prompts.py    | 230 ++++++++++++++------
 core_backend/app/llm_call/llm_rag.py        |  19 +-
 core_backend/app/llm_call/process_input.py  |  24 +-
 core_backend/app/llm_call/process_output.py |   5 +
 core_backend/app/question_answer/schemas.py |   3 +-
 5 files changed, 200 insertions(+), 81 deletions(-)

diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 2ede20f4c..0d1076834 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -103,7 +103,7 @@
 {context}
 
 IMPORTANT NOTES ON THE "answer" FIELD:
-- Answer in the language of the question ({original_language}).
+- Answer in the language {original_language} in the script {original_script}.
 - Answer should be concise, to the point, and no longer than 80 words.
 - Do not include any information that is not present in the REFERENCE TEXT.
 """
@@ -182,6 +182,61 @@ class AlignmentScore(BaseModel):
     model_config = ConfigDict(strict=True)
 
 
+CHAT_RESPONSE_PROMPT = """\
+You are an AI assistant designed to help users with their
+questions/concerns. You interact with users via a chat interface. You will
+be provided with ADDITIONAL RELEVANT INFORMATION that can address the
+user's questions/concerns.
+
+BEFORE answering the user's LATEST MESSAGE, follow these steps:
+
+1. Review the conversation history to ensure that you understand the
+context in which the user's LATEST MESSAGE is being asked.
+2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you
+understand the most useful information related to the user's LATEST
+MESSAGE.
+
+When you have completed the above steps, you will then write a JSON, whose
+TypeScript Interface is given below:
+
+interface Response {{
+    extracted_info: string[];
+    answer: string;
+}}
+
+For "extracted_info", extract from the provided ADDITIONAL RELEVANT
+INFORMATION the most useful information related to the LATEST MESSAGE asked
+by the user, and list them one by one. If no useful information is found,
+return an empty list.
+
+For "answer", understand the conversation history, ADDITIONAL RELEVANT
+INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to
+the user's LATEST MESSAGE. If no useful information was found in the
+either the conversation history or the ADDITIONAL RELEVANT INFORMATION,
+respond with {failure_message}.
+
+EXAMPLE RESPONSES:
+{{"extracted_info": [
+    "Pineapples are a blend of pinecones and apples.",
+    "Pineapples have the shape of a pinecone."
+    ],
+    "answer": "The 'pine-' from pineapples likely come from the fact that
+    pineapples are a hybrid of pinecones and apples and its pinecone-like
+    shape."
+}}
+{{"extracted_info": [], "answer": "{failure_message}"}}
+
+IMPORTANT NOTES ON THE "answer" FIELD:
+- Keep in mind that the user is asking a {message_type} question.
+- Answer in the language {original_language} in the script {original_script}.
+- Answer should be concise and to the point.
+- Do not include any information that is not present in the ADDITIONAL
+RELEVANT INFORMATION.
+
+Only output the JSON response, without any additional text.
+"""
+
+
 class ChatHistory:
     """Contains the prompts and models for the chat history task."""
 
@@ -227,62 +282,7 @@ class ChatHistory:
         ),
         prompt_kws={"valid_message_types": _valid_message_types},
     )
-    system_message_generate_response = format_prompt(
-        prompt=textwrap.dedent(
-            """You are an AI assistant designed to help users with their
-            questions/concerns. You interact with users via a chat interface. You will
-            be provided with ADDITIONAL RELEVANT INFORMATION that can address the
-            user's questions/concerns.
-
-            BEFORE answering the user's LATEST MESSAGE, follow these steps:
-
-            1. Review the conversation history to ensure that you understand the
-            context in which the user's LATEST MESSAGE is being asked.
-            2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you
-            understand the most useful information related to the user's LATEST
-            MESSAGE.
-
-            When you have completed the above steps, you will then write a JSON, whose
-            TypeScript Interface is given below:
-
-            interface Response {{
-                extracted_info: string[];
-                answer: string;
-            }}
-
-            For "extracted_info", extract from the provided ADDITIONAL RELEVANT
-            INFORMATION the most useful information related to the LATEST MESSAGE asked
-            by the user, and list them one by one. If no useful information is found,
-            return an empty list.
-
-            For "answer", understand the conversation history, ADDITIONAL RELEVANT
-            INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to
-            the user's LATEST MESSAGE. If no useful information was found in the
-            either the conversation history or the ADDITIONAL RELEVANT INFORMATION,
-            respond with {failure_message}.
-
-            EXAMPLE RESPONSES:
-            {{"extracted_info": [
-                "Pineapples are a blend of pinecones and apples.",
-                "Pineapples have the shape of a pinecone."
-                ],
-              "answer": "The 'pine-' from pineapples likely come from the fact that
-               pineapples are a hybrid of pinecones and apples and its pinecone-like
-               shape."
-            }}
-            {{"extracted_info": [], "answer": "{failure_message}"}}
-
-            IMPORTANT NOTES ON THE "answer" FIELD:
-            - Keep in mind that the user is asking a {message_type} question.
-            - Answer in the language of the question ({original_language}).
-            - Answer should be concise and to the point.
-            - Do not include any information that is not present in the ADDITIONAL
-            RELEVANT INFORMATION.
-
-            Only output the JSON response, without any additional text.
-            """
-        )
-    )
+    system_message_generate_response = CHAT_RESPONSE_PROMPT
 
     class ChatHistoryConstructSearchQuery(BaseModel):
         """Pydantic model for the output of the construct search query chat history."""
@@ -330,6 +330,106 @@ def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str]
             raise ValueError(f"Error validating the output: {e}") from e
 
 
+class IdentifiedScript(str, Enum):
+    """Script used in the user's input."""
+
+    LATIN = "Latin"
+    DEVANAGARI = "Devanagari"
+    ARABIC = "Arabic"
+    CYRILLIC = "Cyrillic"
+    CHINESE = "Chinese"
+    JAPANESE = "Japanese"
+    KOREAN = "Korean"
+    THAI = "Thai"
+    BENGALI = "Bengali"
+    TAMIL = "Tamil"
+    TELUGU = "Telugu"
+    KANNADA = "Kannada"
+    MALAYALAM = "Malayalam"
+    GUJARATI = "Gujarati"
+    GURMUKHI = "Gurmukhi"
+    ORIYA = "Oriya"
+    SINHALA = "Sinhala"
+    MYANMAR = "Myanmar"
+    ETHIOPIC = "Ethiopic"
+    GEORGIAN = "Georgian"
+    ARMENIAN = "Armenian"
+    HEBREW = "Hebrew"
+    GREEK = "Greek"
+    TIBETAN = "Tibetan"
+    MONGOLIAN = "Mongolian"
+    KHMER = "Khmer"
+    LAO = "Lao"
+    VIETNAMESE = "Vietnamese"
+    THAI_LAO = "Thai-Lao"
+    UNKNOWN = "Unknown"
+
+    @classmethod
+    def _missing_(cls, value: str) -> IdentifiedScript:  # type: ignore[override]
+        """If script identified is not one of the supported scripts, it is
+        classified as UNKNOWN.
+
+        Parameters
+        ----------
+        value
+            The script identified.
+
+        Returns
+        -------
+        IdentifiedScript
+            The identified script (i.e., UNKNOWN).
+        """
+        return cls.UNKNOWN
+
+    @classmethod
+    def get_supported_scripts(cls) -> list[str]:
+        """Return a list of supported scripts.
+
+        Returns
+        -------
+        list[str]
+            A list of supported scripts.
+        """
+        return [script.value for script in cls if script != cls.UNKNOWN]
+
+
+class LanguageIdentificationResponse(BaseModel):
+    """Pydantic model for the language identification response."""
+
+    language: IdentifiedLanguage
+    script: IdentifiedScript
+
+    model_config = ConfigDict(strict=True)
+
+
+LANGUAGE_ID_PROMPT = f"""\
+You are a high-performing language identification bot that classifies the \
+language and script of the user input.
+
+For each input, identify:
+1. The language (must be one of {{member_names}})
+2. The script (must be one of {", ".join(IdentifiedScript.get_supported_scripts())})
+
+If the user input is:
+1. in one of the supported languages, respond with that language and its script
+2. written in a mix of languages, respond with the dominant language and its script
+3. in a real language but not a supported language, respond with UNSUPPORTED and \
+its script
+4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin
+
+Examples:
+"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}}
+"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}}
+"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}}
+"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}}
+"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}}
+"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}}
+"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}}
+
+Respond with a JSON object containing "language" and "script" keys.
+"""
+
+
 class IdentifiedLanguage(str, Enum):
     """Identified language of the user's input."""
 
@@ -387,21 +487,7 @@ def get_prompt(cls) -> str:
             The prompt for the language identification bot.
         """
 
-        return textwrap.dedent(
-            f"""
-            You are a high-performing language identification bot that classifies the
-            language of the user input into one of {", ".join(cls._member_names_)}.
-
-            If the user input is
-            1. in one of the supported languages, then respond with that language.
-            2. written in a mix of languages, then respond with the dominant language.
-            3. in a real language but not a supported language, then respond with
-            UNSUPPORTED.
-            4. unintelligible or gibberish, then respond with UNINTELLIGIBLE.
-
-            Answer should be a single word and strictly one of
-            [{", ".join(cls._member_names_)}]"""
-        ).strip()
+        return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip()
 
 
 class RAG(BaseModel):
diff --git a/core_backend/app/llm_call/llm_rag.py b/core_backend/app/llm_call/llm_rag.py
index ab4431ade..49a229364 100644
--- a/core_backend/app/llm_call/llm_rag.py
+++ b/core_backend/app/llm_call/llm_rag.py
@@ -8,7 +8,13 @@
 
 from ..config import LITELLM_MODEL_GENERATION
 from ..utils import setup_logger
-from .llm_prompts import RAG, RAG_FAILURE_MESSAGE, ChatHistory, IdentifiedLanguage
+from .llm_prompts import (
+    RAG,
+    RAG_FAILURE_MESSAGE,
+    ChatHistory,
+    IdentifiedLanguage,
+    IdentifiedScript,
+)
 from .utils import (
     _ask_llm_async,
     append_messages_to_chat_history,
@@ -24,6 +30,7 @@ async def get_llm_rag_answer(
     context: str,
     metadata: dict | None = None,
     original_language: IdentifiedLanguage,
+    original_script: IdentifiedScript,
     question: str,
 ) -> RAG:
     """Get an answer from the LLM model using RAG.
@@ -36,6 +43,8 @@ async def get_llm_rag_answer(
         Additional metadata to provide to the LLM model.
     original_language
         The original language of the question.
+    original_script
+        The script in which the original question was written.
     question
         The question to ask the LLM model.
 
@@ -46,7 +55,11 @@ async def get_llm_rag_answer(
     """
 
     metadata = metadata or {}
-    prompt = RAG.prompt.format(context=context, original_language=original_language)
+    prompt = RAG.prompt.format(
+        context=context,
+        original_language=original_language,
+        original_script=original_script,
+    )
 
     result = await _ask_llm_async(
         json_=True,
@@ -75,6 +88,7 @@ async def get_llm_rag_answer_with_chat_history(
     message_type: str,
     metadata: dict | None = None,
     original_language: IdentifiedLanguage,
+    original_script: IdentifiedScript,
     question: str,
     session_id: str,
 ) -> tuple[RAG, list[dict[str, str | None]]]:
@@ -112,6 +126,7 @@ async def get_llm_rag_answer_with_chat_history(
                 failure_message=RAG_FAILURE_MESSAGE,
                 message_type=message_type,
                 original_language=original_language,
+                original_script=original_script,
             )
         )
     content = (
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index 9a30ffdeb..d714527b9 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -3,6 +3,8 @@
 from functools import wraps
 from typing import Any, Callable, Optional
 
+from pydantic import ValidationError
+
 from ..config import (
     LITELLM_MODEL_LANGUAGE_DETECT,
     LITELLM_MODEL_PARAPHRASE,
@@ -22,9 +24,11 @@
     TRANSLATE_FAILED_MESSAGE,
     TRANSLATE_PROMPT,
     IdentifiedLanguage,
+    IdentifiedScript,
+    LanguageIdentificationResponse,
     SafetyClassification,
 )
-from .utils import _ask_llm_async
+from .utils import _ask_llm_async, remove_json_markdown
 
 logger = setup_logger(name="INPUT RAILS")
 
@@ -84,7 +88,7 @@ async def _identify_language(
     query_refined: QueryRefined,
     response: QueryResponse | QueryResponseError,
 ) -> tuple[QueryRefined, QueryResponse | QueryResponseError]:
-    """Identify the language of the question.
+    """Identify the language and script of the question.
 
     Parameters
     ----------
@@ -104,19 +108,27 @@ async def _identify_language(
     if isinstance(response, QueryResponseError):
         return query_refined, response
 
-    llm_identified_lang = await _ask_llm_async(
+    json_str = await _ask_llm_async(
+        json_=True,
         litellm_model=LITELLM_MODEL_LANGUAGE_DETECT,
         metadata=metadata,
         system_message=IdentifiedLanguage.get_prompt(),
         user_message=query_refined.query_text,
     )
 
-    identified_lang = getattr(
-        IdentifiedLanguage, llm_identified_lang, IdentifiedLanguage.UNSUPPORTED
-    )
+    try:
+        cleaned_json_str = remove_json_markdown(text=json_str)
+        lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
+        identified_lang = lang_info["language"]
+        identified_script = lang_info["script"]
+    except ValidationError:
+        identified_lang = IdentifiedLanguage.UNSUPPORTED
+        identified_script = IdentifiedScript.LATIN
+
     query_refined.original_language = identified_lang
     response.debug_info["original_query"] = query_refined.query_text_original
     response.debug_info["original_language"] = identified_lang
+    response.debug_info["original_script"] = identified_script
 
     processed_response = _process_identified_language_response(
         identified_language=identified_lang, response=response
diff --git a/core_backend/app/llm_call/process_output.py b/core_backend/app/llm_call/process_output.py
index a4671030b..2a569f0e1 100644
--- a/core_backend/app/llm_call/process_output.py
+++ b/core_backend/app/llm_call/process_output.py
@@ -84,6 +84,9 @@ async def generate_llm_query_response(
     if query_refined.original_language is None:
         logger.warning("No original_language found in the query.")
         return response, chat_history
+    if query_refined.original_script is None:
+        logger.warning("No original_script found in the query.")
+        return response, chat_history
 
     context = get_context_string_from_search_results(
         search_results=response.search_results
@@ -98,6 +101,7 @@ async def generate_llm_query_response(
             message_type=message_type,
             metadata=metadata,
             original_language=query_refined.original_language,
+            original_script=query_refined.original_script,
             question=query_refined.query_text_original,
             session_id=chat_query_params["session_id"],
         )
@@ -106,6 +110,7 @@ async def generate_llm_query_response(
             context=context,
             metadata=metadata,
             original_language=query_refined.original_language,
+            original_script=query_refined.original_script,
             question=query_refined.query_text_original,  # Use the original query text
         )
 
diff --git a/core_backend/app/question_answer/schemas.py b/core_backend/app/question_answer/schemas.py
index 8904e2c36..bda58ce7a 100644
--- a/core_backend/app/question_answer/schemas.py
+++ b/core_backend/app/question_answer/schemas.py
@@ -6,7 +6,7 @@
 from pydantic import BaseModel, ConfigDict, Field
 from pydantic.json_schema import SkipJsonSchema
 
-from ..llm_call.llm_prompts import IdentifiedLanguage
+from ..llm_call.llm_prompts import IdentifiedLanguage, IdentifiedScript
 from ..schemas import FeedbackSentiment, QuerySearchResult
 
 
@@ -49,6 +49,7 @@ class QueryRefined(QueryBase):
 
     generate_tts: bool = Field(False)
     original_language: IdentifiedLanguage | None = None
+    original_script: IdentifiedScript | None = None
     query_text_original: str
     workspace_id: int
 

From 5b1759350554ae2edd2f4a95d5f4fcbc5f99995c Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Wed, 2 Apr 2025 17:32:09 +0530
Subject: [PATCH 02/18] fix prompt

---
 core_backend/app/llm_call/llm_prompts.py   | 100 +++++++++++----------
 core_backend/app/llm_call/process_input.py |   3 +-
 2 files changed, 53 insertions(+), 50 deletions(-)

diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 0d1076834..18c0d5864 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -335,33 +335,33 @@ class IdentifiedScript(str, Enum):
 
     LATIN = "Latin"
     DEVANAGARI = "Devanagari"
-    ARABIC = "Arabic"
-    CYRILLIC = "Cyrillic"
-    CHINESE = "Chinese"
-    JAPANESE = "Japanese"
-    KOREAN = "Korean"
-    THAI = "Thai"
+    # ARABIC = "Arabic"
+    # CYRILLIC = "Cyrillic"
+    # CHINESE = "Chinese"
+    # JAPANESE = "Japanese"
+    # KOREAN = "Korean"
+    # THAI = "Thai"
     BENGALI = "Bengali"
     TAMIL = "Tamil"
     TELUGU = "Telugu"
     KANNADA = "Kannada"
     MALAYALAM = "Malayalam"
     GUJARATI = "Gujarati"
-    GURMUKHI = "Gurmukhi"
-    ORIYA = "Oriya"
-    SINHALA = "Sinhala"
-    MYANMAR = "Myanmar"
-    ETHIOPIC = "Ethiopic"
-    GEORGIAN = "Georgian"
-    ARMENIAN = "Armenian"
-    HEBREW = "Hebrew"
-    GREEK = "Greek"
-    TIBETAN = "Tibetan"
-    MONGOLIAN = "Mongolian"
-    KHMER = "Khmer"
-    LAO = "Lao"
-    VIETNAMESE = "Vietnamese"
-    THAI_LAO = "Thai-Lao"
+    # GURMUKHI = "Gurmukhi"
+    # ORIYA = "Oriya"
+    # SINHALA = "Sinhala"
+    # MYANMAR = "Myanmar"
+    # ETHIOPIC = "Ethiopic"
+    # GEORGIAN = "Georgian"
+    # ARMENIAN = "Armenian"
+    # HEBREW = "Hebrew"
+    # GREEK = "Greek"
+    # TIBETAN = "Tibetan"
+    # MONGOLIAN = "Mongolian"
+    # KHMER = "Khmer"
+    # LAO = "Lao"
+    # VIETNAMESE = "Vietnamese"
+    # THAI_LAO = "Thai-Lao"
     UNKNOWN = "Unknown"
 
     @classmethod
@@ -402,34 +402,6 @@ class LanguageIdentificationResponse(BaseModel):
     model_config = ConfigDict(strict=True)
 
 
-LANGUAGE_ID_PROMPT = f"""\
-You are a high-performing language identification bot that classifies the \
-language and script of the user input.
-
-For each input, identify:
-1. The language (must be one of {{member_names}})
-2. The script (must be one of {", ".join(IdentifiedScript.get_supported_scripts())})
-
-If the user input is:
-1. in one of the supported languages, respond with that language and its script
-2. written in a mix of languages, respond with the dominant language and its script
-3. in a real language but not a supported language, respond with UNSUPPORTED and \
-its script
-4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin
-
-Examples:
-"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}}
-"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}}
-"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}}
-"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}}
-"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}}
-"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}}
-"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}}
-
-Respond with a JSON object containing "language" and "script" keys.
-"""
-
-
 class IdentifiedLanguage(str, Enum):
     """Identified language of the user's input."""
 
@@ -490,6 +462,36 @@ def get_prompt(cls) -> str:
         return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip()
 
 
+LANGUAGE_ID_PROMPT = (
+    f"""\
+You are a high-performing language identification bot that classifies the \
+language and script of the user input.
+
+For each input, identify:
+1. The language (must be one of {", ".join(IdentifiedLanguage._member_names_)})
+2. The script (must be one of {", ".join(IdentifiedScript._member_names_)})
+
+If the user input is:
+1. in one of the supported languages, respond with that language and its script
+2. written in a mix of languages, respond with the dominant language and its script
+3. in a real language but not a supported language, respond with UNSUPPORTED and \
+its script
+4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin"""
+    + """
+Examples:
+"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}}
+"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}}
+"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}}
+"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}}
+"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}}
+"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}}
+"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}}
+
+Respond with a JSON object containing "language" and "script" keys.
+"""
+)
+
+
 class RAG(BaseModel):
     """Generated response based on question and retrieved context."""
 
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index d714527b9..dfc3341a7 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -19,6 +19,7 @@
 )
 from ..utils import setup_logger
 from .llm_prompts import (
+    LANGUAGE_ID_PROMPT,
     PARAPHRASE_FAILED_MESSAGE,
     PARAPHRASE_PROMPT,
     TRANSLATE_FAILED_MESSAGE,
@@ -112,7 +113,7 @@ async def _identify_language(
         json_=True,
         litellm_model=LITELLM_MODEL_LANGUAGE_DETECT,
         metadata=metadata,
-        system_message=IdentifiedLanguage.get_prompt(),
+        system_message=LANGUAGE_ID_PROMPT,
         user_message=query_refined.query_text,
     )
 

From f7fee04f2f59d37370950c4ce4d87e449c4f2e5b Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Wed, 9 Apr 2025 14:03:06 +0530
Subject: [PATCH 03/18] fix tests

---
 .secrets.baseline                             |   8 +-
 core_backend/app/llm_call/llm_prompts.py      | 133 +++++++++---------
 core_backend/app/llm_call/process_input.py    |   4 +-
 .../tests/api/test_question_answer.py         |  21 ++-
 .../rails/data/language_identification.yaml   | 108 +++++++-------
 .../rails/test_language_identification.py     |  46 ++++--
 6 files changed, 185 insertions(+), 135 deletions(-)

diff --git a/.secrets.baseline b/.secrets.baseline
index 5cab9e8c1..2ba6baa9b 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -448,14 +448,14 @@
         "filename": "core_backend/tests/api/test_question_answer.py",
         "hashed_secret": "1d2be5ef28a76e2207456e7eceabe1219305e43d",
         "is_verified": false,
-        "line_number": 294
+        "line_number": 419
       },
       {
         "type": "Secret Keyword",
         "filename": "core_backend/tests/api/test_question_answer.py",
         "hashed_secret": "6367c48dd193d56ea7b0baad25b19455e529f5ee",
         "is_verified": false,
-        "line_number": 653
+        "line_number": 1019
       }
     ],
     "core_backend/tests/api/test_user_tools.py": [
@@ -473,7 +473,7 @@
         "filename": "core_backend/tests/rails/test_language_identification.py",
         "hashed_secret": "051b2c1d98174fabc4749641c4f4f4660556441e",
         "is_verified": false,
-        "line_number": 48
+        "line_number": 69
       }
     ],
     "core_backend/tests/rails/test_paraphrasing.py": [
@@ -581,5 +581,5 @@
       }
     ]
   },
-  "generated_at": "2025-01-24T13:35:08Z"
+  "generated_at": "2025-04-09T08:32:56Z"
 }
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 18c0d5864..751a1079a 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -330,6 +330,67 @@ def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str]
             raise ValueError(f"Error validating the output: {e}") from e
 
 
+class IdentifiedLanguage(str, Enum):
+    """Identified language of the user's input."""
+
+    # AFRIKAANS = "AFRIKAANS"
+    ENGLISH = "ENGLISH"
+    FRENCH = "FRENCH"
+    HINDI = "HINDI"
+    MARATHI = "MARATHI"
+    SWAHILI = "SWAHILI"
+    UNINTELLIGIBLE = "UNINTELLIGIBLE"
+    UNSUPPORTED = "UNSUPPORTED"
+    # XHOSA = "XHOSA"
+    # ZULU = "ZULU"
+
+    @classmethod
+    def get_supported_languages(cls) -> list[str]:
+        """Return a list of supported languages.
+
+        Returns
+        -------
+        list[str]
+            A list of supported languages.
+        """
+
+        return [
+            lang
+            for lang in cls._member_names_
+            if lang not in ("UNINTELLIGIBLE", "UNSUPPORTED")
+        ]
+
+    @classmethod
+    def _missing_(cls, value: str) -> IdentifiedLanguage:  # type: ignore[override]
+        """If language identified is not one of the supported language, it is
+        classified as UNSUPPORTED.
+
+        Parameters
+        ----------
+        value
+            The language identified.
+
+        Returns
+        -------
+        IdentifiedLanguage
+            The identified language (i.e., UNSUPPORTED).
+        """
+
+        return cls.UNSUPPORTED
+
+    @classmethod
+    def get_prompt(cls) -> str:
+        """Return the prompt for the language identification bot.
+
+        Returns
+        -------
+        str
+            The prompt for the language identification bot.
+        """
+
+        return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip()
+
+
 class IdentifiedScript(str, Enum):
     """Script used in the user's input."""
 
@@ -341,12 +402,12 @@ class IdentifiedScript(str, Enum):
     # JAPANESE = "Japanese"
     # KOREAN = "Korean"
     # THAI = "Thai"
-    BENGALI = "Bengali"
-    TAMIL = "Tamil"
-    TELUGU = "Telugu"
-    KANNADA = "Kannada"
-    MALAYALAM = "Malayalam"
-    GUJARATI = "Gujarati"
+    # BENGALI = "Bengali"
+    # TAMIL = "Tamil"
+    # TELUGU = "Telugu"
+    # KANNADA = "Kannada"
+    # MALAYALAM = "Malayalam"
+    # GUJARATI = "Gujarati"
     # GURMUKHI = "Gurmukhi"
     # ORIYA = "Oriya"
     # SINHALA = "Sinhala"
@@ -402,66 +463,6 @@ class LanguageIdentificationResponse(BaseModel):
     model_config = ConfigDict(strict=True)
 
 
-class IdentifiedLanguage(str, Enum):
-    """Identified language of the user's input."""
-
-    # AFRIKAANS = "AFRIKAANS"
-    ENGLISH = "ENGLISH"
-    FRENCH = "FRENCH"
-    HINDI = "HINDI"
-    SWAHILI = "SWAHILI"
-    UNINTELLIGIBLE = "UNINTELLIGIBLE"
-    UNSUPPORTED = "UNSUPPORTED"
-    # XHOSA = "XHOSA"
-    # ZULU = "ZULU"
-
-    @classmethod
-    def get_supported_languages(cls) -> list[str]:
-        """Return a list of supported languages.
-
-        Returns
-        -------
-        list[str]
-            A list of supported languages.
-        """
-
-        return [
-            lang
-            for lang in cls._member_names_
-            if lang not in ("UNINTELLIGIBLE", "UNSUPPORTED")
-        ]
-
-    @classmethod
-    def _missing_(cls, value: str) -> IdentifiedLanguage:  # type: ignore[override]
-        """If language identified is not one of the supported language, it is
-        classified as UNSUPPORTED.
-
-        Parameters
-        ----------
-        value
-            The language identified.
-
-        Returns
-        -------
-        IdentifiedLanguage
-            The identified language (i.e., UNSUPPORTED).
-        """
-
-        return cls.UNSUPPORTED
-
-    @classmethod
-    def get_prompt(cls) -> str:
-        """Return the prompt for the language identification bot.
-
-        Returns
-        -------
-        str
-            The prompt for the language identification bot.
-        """
-
-        return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip()
-
-
 LANGUAGE_ID_PROMPT = (
     f"""\
 You are a high-performing language identification bot that classifies the \
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index dfc3341a7..ba8d1025a 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -120,8 +120,8 @@ async def _identify_language(
     try:
         cleaned_json_str = remove_json_markdown(text=json_str)
         lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
-        identified_lang = lang_info["language"]
-        identified_script = lang_info["script"]
+        identified_lang = lang_info.language
+        identified_script = lang_info.script
     except ValidationError:
         identified_lang = IdentifiedLanguage.UNSUPPORTED
         identified_script = IdentifiedScript.LATIN
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index 163e77574..50299a7b4 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -10,7 +10,11 @@
 from fastapi import status
 from fastapi.testclient import TestClient
 
-from core_backend.app.llm_call.llm_prompts import AlignmentScore, IdentifiedLanguage
+from core_backend.app.llm_call.llm_prompts import (
+    AlignmentScore,
+    IdentifiedLanguage,
+    LanguageIdentificationResponse,
+)
 from core_backend.app.llm_call.process_input import (
     _classify_safety,
     _identify_language,
@@ -1045,10 +1049,10 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
         )
 
     @pytest.mark.parametrize(
-        "identified_lang_str,should_error,expected_error_type",
+        "identified_lang_str,identified_script_str,should_error,expected_error_type",
         [
-            ("ENGLISH", False, None),
-            ("HINDI", False, None),
+            ("ENGLISH", "Latin", False, None),
+            ("HINDI", "Devanagari", False, None),
             ("UNINTELLIGIBLE", True, ErrorType.UNINTELLIGIBLE_INPUT),
             ("GIBBERISH", True, ErrorType.UNSUPPORTED_LANGUAGE),
             ("UNSUPPORTED", True, ErrorType.UNSUPPORTED_LANGUAGE),
@@ -1059,6 +1063,7 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
     async def test_language_identify_error(
         self,
         identified_lang_str: str,
+        identified_script_str: str,
         should_error: bool,
         expected_error_type: ErrorType,
         monkeypatch: pytest.MonkeyPatch,
@@ -1084,6 +1089,7 @@ async def test_language_identify_error(
             generate_llm_response=False,
             generate_tts=False,
             original_language=None,
+            original_script=None,
             query_text="This is a basic query",
             query_text_original="This is a query original",
             workspace_id=124,
@@ -1104,10 +1110,12 @@ async def mock_ask_llm(  # pylint: disable=W0613
             Returns
             -------
             str
-                The identified language string.
+                The identified language and script model json string.
             """
 
-            return identified_lang_str
+            return LanguageIdentificationResponse(
+                language=identified_lang_str, script=identified_script_str
+            ).model_dump_json()
 
         monkeypatch.setattr(
             "core_backend.app.llm_call.process_input._ask_llm_async", mock_ask_llm
@@ -1233,6 +1241,7 @@ async def mock_ask_llm(  # pylint: disable=W0613
             generate_llm_response=False,
             generate_tts=False,
             original_language=None,
+            original_script=None,
             query_text="This is a basic query",
             query_text_original="This is a query original",
             workspace_id=124,
diff --git a/core_backend/tests/rails/data/language_identification.yaml b/core_backend/tests/rails/data/language_identification.yaml
index a4d3ddb34..7daa61476 100644
--- a/core_backend/tests/rails/data/language_identification.yaml
+++ b/core_backend/tests/rails/data/language_identification.yaml
@@ -2,59 +2,73 @@
 # improve this with a native speaker. These might be too "pure".
 
 HAUSA:
-  - Ina da yara biyu masu hanci
-  - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi
-  - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa?
-  - Menene wannan?
-  - Sannun ku da zuwa #h/t: Fola from here on
-  - Ni yarinya ne
-  - Zo ka chi abunchi
-  - Ina kwana Maman mu
-  - Wannan shago na ne
+  Latin:
+    - Ina da yara biyu masu hanci
+    - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi
+    - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa?
+    - Menene wannan?
+    - Sannun ku da zuwa #h/t: Fola from here on
+    - Ni yarinya ne
+    - Zo ka chi abunchi
+    - Ina kwana Maman mu
+    - Wannan shago na ne
 ENGLISH:
-  - I have two children. You see I girl, what is the probability the other is also a girl?
-  - No idea
-  - Why you say that?
+  Latin:
+    - I have two children. You see I girl, what is the probability the other is also a girl?
+    - No idea
+    - Why you say that?
 XHOSA:
-  - Umama ngugqirha
-  - Utata ngumongikazi
-  - Ukuba intamo yam yayifuna ukwenza oko?
-  - Iintsana zikhala kakhulu, huh?
+  Latin:
+    - Umama ngugqirha
+    - Utata ngumongikazi
+    - Ukuba intamo yam yayifuna ukwenza oko?
+    - Iintsana zikhala kakhulu, huh?
 YORUBA: #h/t: Fola
-  - Ni bo ló ti ri owo yen?
-  - Eyin melo ni e wa ni be?
-  - Ki ni itumo oruko ẹ?
-  - Ki ni o jẹ lánà?
-  - Omo Ibadan ni mi
+  Latin:
+    - Ni bo ló ti ri owo yen?
+    - Eyin melo ni e wa ni be?
+    - Ki ni itumo oruko ẹ?
+    - Ki ni o jẹ lánà?
+    - Omo Ibadan ni mi
 IGBO: #h/t: Fola
-  - agụụ na-agụ m
-  - agam aga ahia echi
-  - ị hụla ngozi? ana m achọ ya.
-  - m na-aga ọrụ
+  Latin:
+    - agụụ na-agụ m
+    - agam aga ahia echi
+    - ị hụla ngozi? ana m achọ ya.
+    - m na-aga ọrụ
 KOREAN:
-  - 애가 둘이예요
-  - ㅋㅋㅋㅋㅋㅋ
-  - 아이들이 많이 울어요ㅠ
-  - 이 프로젝트 애칭은 ask-a-question이야.
+  Korean:
+    - 애가 둘이예요
+    - ㅋㅋㅋㅋㅋㅋ
+    - 아이들이 많이 울어요ㅠ
+    - 이 프로젝트 애칭은 ask-a-question이야.
 ZULU:
-  - Ngingumama
-  - Ingabe uyi-bot noma ungumuntu?
-  - Ngifuna ukwenza lokhu?
-  - Izingane zikhala kakhulu, hhe
+  Latin:
+    - Ngingumama
+    - Ingabe uyi-bot noma ungumuntu?
+    - Ngifuna ukwenza lokhu?
+    - Izingane zikhala kakhulu, hhe
 AFRIKAANS:
-  - Ek het hierdie goddelose dal gemaak
-  - Is covid nog 'n ding?
-  - My hond het my huiswerk geëet
-  - Het jy al gebraaide roomys probeer?
+  Latin:
+    - Ek het hierdie goddelose dal gemaak
+    - Is covid nog 'n ding?
+    - My hond het my huiswerk geëet
+    - Het jy al gebraaide roomys probeer?
 HINDI: #h/t: Sid
-  - is ka matlab kya hai?
-  - kabhi kabhi mere dil mein
-  - अंत में सभी लोग नाश्ता करने जाएं
-  - गब्बर सिंह कह के गया जो डर गया वो मर गया
+  Latin:
+    - is ka matlab kya hai?
+    - kabhi kabhi mere dil mein
+  Devanagari:
+    - अंत में सभी लोग नाश्ता करने जाएं
+    - गब्बर सिंह कह के गया जो डर गया वो मर गया
+MARATHI:
+  Latin:
+    - Portal chi link aahe
 UNINTELLIGIBLE:
-  - sdfsdf sss dyhnel jjj
-  - hs dsfsg xd ewwo ddfs
-  - Heghlu'meH QaQ jajvam
-  - yIHuchQo', 'ej jIHvaD yIqemchu'mo'
-  - \%^*# levels; 91011 AQGs!!!
-  - 1234 AQI WHO? 5678
+  Unknown:
+    - sdfsdf sss dyhnel jjj
+    - hs dsfsg xd ewwo ddfs
+    - Heghlu'meH QaQ jajvam
+    - yIHuchQo', 'ej jIHvaD yIqemchu'mo'
+    - \%^*# levels; 91011 AQGs!!!
+    - 1234 AQI WHO? 5678
diff --git a/core_backend/tests/rails/test_language_identification.py b/core_backend/tests/rails/test_language_identification.py
index 9b30b2e9a..b7c30f46f 100644
--- a/core_backend/tests/rails/test_language_identification.py
+++ b/core_backend/tests/rails/test_language_identification.py
@@ -5,7 +5,7 @@
 import pytest
 import yaml
 
-from core_backend.app.llm_call.llm_prompts import IdentifiedLanguage
+from core_backend.app.llm_call.llm_prompts import IdentifiedLanguage, IdentifiedScript
 from core_backend.app.llm_call.process_input import _identify_language
 from core_backend.app.question_answer.schemas import QueryRefined, QueryResponse
 
@@ -22,19 +22,38 @@ def available_languages() -> list[str]:
     return list(IdentifiedLanguage)
 
 
-def read_test_data(file: str) -> list[tuple[str, str]]:
+@pytest.fixture(scope="module")
+def available_scripts() -> list[str]:
+    """Returns a list of available scripts."""
+
+    return list(IdentifiedScript)
+
+
+def read_test_data(file: str) -> list[tuple[str, str, str]]:
     """Reads test data from file and returns a list of strings."""
 
     file_path = Path(__file__).parent / file
 
     with open(file_path, "r", encoding="utf-8") as f:
         content = yaml.safe_load(f)
-        return [(key, value) for key, values in content.items() for value in values]
-
-
-@pytest.mark.parametrize("expected_label, content", read_test_data(LANGUAGE_FILE))
+        data = [
+            (language, script, text)
+            for language, script_dict in content.items()
+            for script, texts in script_dict.items()
+            for text in texts
+        ]
+        return data
+
+
+@pytest.mark.parametrize(
+    "expected_language,expected_script,content", read_test_data(LANGUAGE_FILE)
+)
 async def test_language_identification(
-    available_languages: list[str], expected_label: str, content: str
+    available_languages: list[str],
+    available_scripts: list[str],
+    expected_language: str,
+    expected_script: str,
+    content: str,
 ) -> None:
     """Test language identification."""
 
@@ -53,8 +72,15 @@ async def test_language_identification(
         search_results=None,
         session_id=None,
     )
-    if expected_label not in available_languages:
-        expected_label = "UNSUPPORTED"
+
+    if expected_language not in available_languages:
+        expected_language = "UNSUPPORTED"
+
+    if expected_script not in available_scripts:
+        expected_script = "Unknown"
+
     _, response = await _identify_language(query_refined=question, response=response)
 
-    assert response.debug_info["original_language"] == expected_label
+    assert response.debug_info["original_language"] == expected_language
+    if expected_language not in ("UNINTELLIGIBLE", "UNSUPPORTED"):
+        assert response.debug_info["original_script"] == expected_script

From ab2fd75511f3333e7bccde9ddd5388c1ee22ae8e Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Wed, 9 Apr 2025 18:38:28 +0530
Subject: [PATCH 04/18] changes

---
 core_backend/app/llm_call/llm_prompts.py    | 27 +++++++-------
 core_backend/app/llm_call/llm_rag.py        | 14 ++------
 core_backend/app/llm_call/process_input.py  | 40 +++++++++++++++------
 core_backend/app/question_answer/routers.py |  4 ---
 core_backend/app/question_answer/schemas.py |  1 +
 5 files changed, 46 insertions(+), 40 deletions(-)

diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 751a1079a..d82e3aed7 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -234,6 +234,11 @@ class AlignmentScore(BaseModel):
 RELEVANT INFORMATION.
 
 Only output the JSON response, without any additional text.
+
+
+<ADDITIONAL RELEVANT INFORMATION>
+{additional_info}
+</ADDITIONAL RELEVANT INFORMATION>
 """
 
 
@@ -396,20 +401,14 @@ class IdentifiedScript(str, Enum):
 
     LATIN = "Latin"
     DEVANAGARI = "Devanagari"
-    # ARABIC = "Arabic"
-    # CYRILLIC = "Cyrillic"
-    # CHINESE = "Chinese"
-    # JAPANESE = "Japanese"
-    # KOREAN = "Korean"
-    # THAI = "Thai"
-    # BENGALI = "Bengali"
-    # TAMIL = "Tamil"
-    # TELUGU = "Telugu"
-    # KANNADA = "Kannada"
-    # MALAYALAM = "Malayalam"
-    # GUJARATI = "Gujarati"
-    # GURMUKHI = "Gurmukhi"
-    # ORIYA = "Oriya"
+    BENGALI = "Bengali"
+    TAMIL = "Tamil"
+    TELUGU = "Telugu"
+    KANNADA = "Kannada"
+    MALAYALAM = "Malayalam"
+    GUJARATI = "Gujarati"
+    GURMUKHI = "Gurmukhi"
+    ORIYA = "Oriya"
     # SINHALA = "Sinhala"
     # MYANMAR = "Myanmar"
     # ETHIOPIC = "Ethiopic"
diff --git a/core_backend/app/llm_call/llm_rag.py b/core_backend/app/llm_call/llm_rag.py
index 49a229364..d2fb045d8 100644
--- a/core_backend/app/llm_call/llm_rag.py
+++ b/core_backend/app/llm_call/llm_rag.py
@@ -127,24 +127,14 @@ async def get_llm_rag_answer_with_chat_history(
                 message_type=message_type,
                 original_language=original_language,
                 original_script=original_script,
+                additional_info=context,
             )
         )
-    content = (
-        question
-        + f""""\n\n
-    ADDITIONAL RELEVANT INFORMATION BELOW
-    =====================================
 
-    {context}
-
-    ADDITIONAL RELEVANT INFORMATION ABOVE
-    =====================================
-    """
-    )
     content = await get_chat_response(
         chat_history=chat_history,
         chat_params=chat_params,
-        message_params=content,
+        message_params=question,
         session_id=session_id,
         json_=True,
         metadata=metadata or {},
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index ba8d1025a..80b462f36 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -127,19 +127,26 @@ async def _identify_language(
         identified_script = IdentifiedScript.LATIN
 
     query_refined.original_language = identified_lang
+    query_refined.original_script = identified_script
+
     response.debug_info["original_query"] = query_refined.query_text_original
     response.debug_info["original_language"] = identified_lang
     response.debug_info["original_script"] = identified_script
 
     processed_response = _process_identified_language_response(
-        identified_language=identified_lang, response=response
+        identified_language=identified_lang,
+        identified_script=identified_script,
+        response=response,
     )
 
     return query_refined, processed_response
 
 
 def _process_identified_language_response(
-    *, identified_language: IdentifiedLanguage, response: QueryResponse
+    *,
+    identified_language: IdentifiedLanguage,
+    identified_script: IdentifiedScript,
+    response: QueryResponse,
 ) -> QueryResponse | QueryResponseError:
     """Process the identified language and return the response.
 
@@ -147,6 +154,8 @@ def _process_identified_language_response(
     ----------
     identified_language
         The identified language.
+    identified_script
+        The identified script.
     response
         The response object.
 
@@ -157,20 +166,31 @@ def _process_identified_language_response(
     """
 
     supported_languages_list = IdentifiedLanguage.get_supported_languages()
+    supported_scripts_list = IdentifiedScript.get_supported_scripts()
 
-    if identified_language in supported_languages_list:
+    if (
+        identified_language in supported_languages_list
+        and identified_script in supported_scripts_list
+    ):
         return response
 
     supported_languages = ", ".join(supported_languages_list)
+    supported_scripts = ", ".join(supported_scripts_list)
 
-    match identified_language:
-        case IdentifiedLanguage.UNINTELLIGIBLE:
+    if identified_language == IdentifiedLanguage.UNINTELLIGIBLE:
+        error_message = (
+            "Unintelligible input. "
+            + f"The following languages are supported: {supported_languages}."
+        )
+        error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT
+    else:
+        if identified_script == IdentifiedScript.UNKNOWN:
             error_message = (
-                "Unintelligible input. "
-                + f"The following languages are supported: {supported_languages}."
+                "Unsupported script. "
+                + f"Only the following scripts are supported: {supported_scripts}"
             )
-            error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT
-        case _:
+            error_type = ErrorType.UNSUPPORTED_SCRIPT
+        else:
             error_message = (
                 "Unsupported language. Only the following languages "
                 + f"are supported: {supported_languages}."
@@ -190,7 +210,7 @@ def _process_identified_language_response(
     error_response.debug_info.update(response.debug_info)
 
     logger.info(
-        f"LANGUAGE IDENTIFICATION FAILED due to {identified_language.value} "
+        f"LANGUAGE IDENTIFICATION FAILED due to {error_message} "
         f"language on query id: {str(response.query_id)}"
     )
 
diff --git a/core_backend/app/question_answer/routers.py b/core_backend/app/question_answer/routers.py
index 9d301cdc4..28f5e16aa 100644
--- a/core_backend/app/question_answer/routers.py
+++ b/core_backend/app/question_answer/routers.py
@@ -843,10 +843,6 @@ async def get_user_query_and_response(
         query_text_original=user_query.query_text,
         workspace_id=workspace_id,
     )
-    if user_query_refined.chat_query_params:
-        user_query_refined.query_text = user_query_refined.chat_query_params.pop(
-            "search_query"
-        )
 
     # Prepare the placeholder response object.
     response_template = QueryResponse(
diff --git a/core_backend/app/question_answer/schemas.py b/core_backend/app/question_answer/schemas.py
index bda58ce7a..c434b28ee 100644
--- a/core_backend/app/question_answer/schemas.py
+++ b/core_backend/app/question_answer/schemas.py
@@ -23,6 +23,7 @@ class ErrorType(str, Enum):
     UNABLE_TO_TRANSLATE = "unable_to_translate"
     UNINTELLIGIBLE_INPUT = "unintelligible_input"
     UNSUPPORTED_LANGUAGE = "unsupported_language"
+    UNSUPPORTED_SCRIPT = "unsupported_script"
 
 
 class QueryBase(BaseModel):

From 2cba258051be3a32f6c55f0e9ebab6336daf3fbd Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 11:01:00 +0530
Subject: [PATCH 05/18] remove search query during init chat history

---
 core_backend/app/llm_call/llm_prompts.py    | 4 +---
 core_backend/app/question_answer/routers.py | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index d82e3aed7..7977a923d 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -276,9 +276,7 @@ class ChatHistory:
 
             {{
                 "message_type": "The type of the user's LATEST MESSAGE. List of valid
-                options are: {valid_message_types},
-                "query": "The vector database query that you have constructed based on
-                the user's LATEST MESSAGE and the conversation history."
+                options are: {valid_message_types}"
             }}
 
             Do NOT attempt to answer the user's question/concern. Only output the JSON
diff --git a/core_backend/app/question_answer/routers.py b/core_backend/app/question_answer/routers.py
index 28f5e16aa..5a4b057b8 100644
--- a/core_backend/app/question_answer/routers.py
+++ b/core_backend/app/question_answer/routers.py
@@ -1073,7 +1073,6 @@ async def init_user_query_and_chat_histories(
         "chat_params": chat_params,
         "message_type": search_query_json_response["message_type"],
         "redis_client": redis_client,
-        "search_query": search_query_json_response["query"],
         "session_id": session_id,
     }
     user_query.generate_llm_response = True

From dbdeec459ff5cfc5557c12be9e1de0a684433a91 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 11:36:12 +0530
Subject: [PATCH 06/18] fix tests and type

---
 core_backend/app/llm_call/llm_prompts.py       |  1 -
 core_backend/tests/api/test_question_answer.py | 12 +++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 7977a923d..2766fabe6 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -291,7 +291,6 @@ class ChatHistoryConstructSearchQuery(BaseModel):
         """Pydantic model for the output of the construct search query chat history."""
 
         message_type: Literal["FOLLOW-UP", "NEW"]
-        query: str
 
     @staticmethod
     def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str]:
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index 50299a7b4..936bf41b5 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -1053,11 +1053,13 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
         [
             ("ENGLISH", "Latin", False, None),
             ("HINDI", "Devanagari", False, None),
-            ("UNINTELLIGIBLE", True, ErrorType.UNINTELLIGIBLE_INPUT),
-            ("GIBBERISH", True, ErrorType.UNSUPPORTED_LANGUAGE),
-            ("UNSUPPORTED", True, ErrorType.UNSUPPORTED_LANGUAGE),
-            ("SOME_UNSUPPORTED_LANG", True, ErrorType.UNSUPPORTED_LANGUAGE),
-            ("don't kow", True, ErrorType.UNSUPPORTED_LANGUAGE),
+            ("UNINTELLIGIBLE", "Latin", True, ErrorType.UNINTELLIGIBLE_INPUT),
+            ("UNINTELLIGIBLE", "Unknown", True, ErrorType.UNSUPPORTED_SCRIPT),
+            ("GIBBERISH", "Unknwon", True, ErrorType.UNSUPPORTED_SCRIPT),
+            ("GIBBERISH", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
+            ("UNSUPPORTED", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
+            ("SOME_UNSUPPORTED_LANG", "Unknown", True, ErrorType.UNSUPPORTED_LANGUAGE),
+            ("don't kow", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
         ],
     )
     async def test_language_identify_error(

From 09fdd6e8b876ad01f637548c64e77210944bb30b Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 15:08:01 +0530
Subject: [PATCH 07/18] change schema and add validator

---
 core_backend/app/llm_call/llm_prompts.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 2766fabe6..61cef0490 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -7,7 +7,7 @@
 from enum import Enum
 from typing import ClassVar, Literal
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator
 
 from .utils import format_prompt, remove_json_markdown
 
@@ -453,8 +453,22 @@ def get_supported_scripts(cls) -> list[str]:
 class LanguageIdentificationResponse(BaseModel):
     """Pydantic model for the language identification response."""
 
-    language: IdentifiedLanguage
-    script: IdentifiedScript
+    language: str
+    script: str
+
+    @field_validator("language")
+    def validate_language(cls, value: str) -> str:
+        """Make sure language input is a valid IdentifiedLanguage"""
+        if value not in IdentifiedLanguage._member_names_:
+            raise ValueError(f"Invalid language: {value}")
+        return value
+
+    @field_validator("script")
+    def validate_script(cls, value: str) -> str:
+        """Make sure script input is a valid IdentifiedScript"""
+        if value not in IdentifiedScript._member_names_:
+            raise ValueError(f"Invalid script: {value}")
+        return value
 
     model_config = ConfigDict(strict=True)
 

From ba013a57aed365f4a2d58e36855c7f0f4c5e02ad Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 15:35:58 +0530
Subject: [PATCH 08/18] fix test return mock value

---
 .secrets.baseline                              | 6 +++---
 core_backend/app/llm_call/process_input.py     | 2 +-
 core_backend/tests/api/test_chat.py            | 1 -
 core_backend/tests/api/test_question_answer.py | 7 +++----
 4 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/.secrets.baseline b/.secrets.baseline
index 2ba6baa9b..30aef52a0 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -448,14 +448,14 @@
         "filename": "core_backend/tests/api/test_question_answer.py",
         "hashed_secret": "1d2be5ef28a76e2207456e7eceabe1219305e43d",
         "is_verified": false,
-        "line_number": 419
+        "line_number": 418
       },
       {
         "type": "Secret Keyword",
         "filename": "core_backend/tests/api/test_question_answer.py",
         "hashed_secret": "6367c48dd193d56ea7b0baad25b19455e529f5ee",
         "is_verified": false,
-        "line_number": 1019
+        "line_number": 1018
       }
     ],
     "core_backend/tests/api/test_user_tools.py": [
@@ -581,5 +581,5 @@
       }
     ]
   },
-  "generated_at": "2025-04-09T08:32:56Z"
+  "generated_at": "2025-04-10T10:05:42Z"
 }
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index 80b462f36..d603980a4 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -117,8 +117,8 @@ async def _identify_language(
         user_message=query_refined.query_text,
     )
 
+    cleaned_json_str = remove_json_markdown(text=json_str)
     try:
-        cleaned_json_str = remove_json_markdown(text=json_str)
         lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
         identified_lang = lang_info.language
         identified_script = lang_info.script
diff --git a/core_backend/tests/api/test_chat.py b/core_backend/tests/api/test_chat.py
index ed2f35f5e..d32cb3436 100644
--- a/core_backend/tests/api/test_chat.py
+++ b/core_backend/tests/api/test_chat.py
@@ -85,7 +85,6 @@ async def test_init_user_query_and_chat_histories(redis_client: aioredis.Redis)
             chat_query_params["chat_cache_key"] == f"chatCache:{user_query.session_id}"
         )
         assert chat_query_params["message_type"] == "NEW"
-        assert chat_query_params["search_query"] == "stomachache and possible remedies"
 
 
 async def test__ask_llm_async() -> None:
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index 936bf41b5..38fa74ddc 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -13,7 +13,6 @@
 from core_backend.app.llm_call.llm_prompts import (
     AlignmentScore,
     IdentifiedLanguage,
-    LanguageIdentificationResponse,
 )
 from core_backend.app.llm_call.process_input import (
     _classify_safety,
@@ -1115,9 +1114,9 @@ async def mock_ask_llm(  # pylint: disable=W0613
                 The identified language and script model json string.
             """
 
-            return LanguageIdentificationResponse(
-                language=identified_lang_str, script=identified_script_str
-            ).model_dump_json()
+            return f"""
+            {{"language": "{identified_lang_str}", "script": "{identified_script_str}"}}
+            """.strip()
 
         monkeypatch.setattr(
             "core_backend.app.llm_call.process_input._ask_llm_async", mock_ask_llm

From 7ed12705b693cb6ae00bf7a7a05e2db346c7cadd Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 15:43:29 +0530
Subject: [PATCH 09/18] use enum not string

---
 core_backend/app/llm_call/process_input.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index d603980a4..06d0847a3 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -120,8 +120,8 @@ async def _identify_language(
     cleaned_json_str = remove_json_markdown(text=json_str)
     try:
         lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
-        identified_lang = lang_info.language
-        identified_script = lang_info.script
+        identified_lang = getattr(IdentifiedLanguage, lang_info.language)
+        identified_script = getattr(IdentifiedScript, lang_info.script)
     except ValidationError:
         identified_lang = IdentifiedLanguage.UNSUPPORTED
         identified_script = IdentifiedScript.LATIN

From e8cbf8a9f70fe66dfc4d1af85b85aa8efbd5336b Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 18:07:36 +0530
Subject: [PATCH 10/18] fix type issues

---
 core_backend/app/llm_call/llm_prompts.py   | 151 +++++++++------------
 core_backend/app/llm_call/llm_rag.py       |  10 +-
 core_backend/app/llm_call/process_input.py |   4 +-
 3 files changed, 73 insertions(+), 92 deletions(-)

diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 61cef0490..92b63a125 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -183,20 +183,20 @@ class AlignmentScore(BaseModel):
 
 
 CHAT_RESPONSE_PROMPT = """\
-You are an AI assistant designed to help users with their
-questions/concerns. You interact with users via a chat interface. You will
-be provided with ADDITIONAL RELEVANT INFORMATION that can address the
+You are an AI assistant designed to help users with their \
+questions/concerns. You interact with users via a chat interface. You will \
+be provided with ADDITIONAL RELEVANT INFORMATION that can address the \
 user's questions/concerns.
 
 BEFORE answering the user's LATEST MESSAGE, follow these steps:
 
-1. Review the conversation history to ensure that you understand the
+1. Review the conversation history to ensure that you understand the \
 context in which the user's LATEST MESSAGE is being asked.
-2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you
-understand the most useful information related to the user's LATEST
+2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you \
+understand the most useful information related to the user's LATEST \
 MESSAGE.
 
-When you have completed the above steps, you will then write a JSON, whose
+When you have completed the above steps, you will then write a JSON, whose \
 TypeScript Interface is given below:
 
 interface Response {{
@@ -204,41 +204,33 @@ class AlignmentScore(BaseModel):
     answer: string;
 }}
 
-For "extracted_info", extract from the provided ADDITIONAL RELEVANT
-INFORMATION the most useful information related to the LATEST MESSAGE asked
-by the user, and list them one by one. If no useful information is found,
+For "extracted_info", extract from the provided ADDITIONAL RELEVANT \
+INFORMATION the most useful information related to the LATEST MESSAGE asked \
+by the user, and list them one by one. If no useful information is found, \
 return an empty list.
 
-For "answer", understand the conversation history, ADDITIONAL RELEVANT
-INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to
-the user's LATEST MESSAGE. If no useful information was found in the
-either the conversation history or the ADDITIONAL RELEVANT INFORMATION,
+For "answer", understand the conversation history, ADDITIONAL RELEVANT \
+INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to \
+the user's LATEST MESSAGE. If no useful information was found in the \
+either the conversation history or the ADDITIONAL RELEVANT INFORMATION, \
 respond with {failure_message}.
 
 EXAMPLE RESPONSES:
-{{"extracted_info": [
-    "Pineapples are a blend of pinecones and apples.",
-    "Pineapples have the shape of a pinecone."
-    ],
-    "answer": "The 'pine-' from pineapples likely come from the fact that
-    pineapples are a hybrid of pinecones and apples and its pinecone-like
-    shape."
-}}
+{{"extracted_info": ["Pineapples are a blend of pinecones and apples.", \
+"Pineapples have the shape of a pinecone."], \
+"answer": "The 'pine-' from pineapples likely come from the fact that \
+pineapples are a hybrid of pinecones and apples and its pinecone-like \
+shape."}}
 {{"extracted_info": [], "answer": "{failure_message}"}}
 
 IMPORTANT NOTES ON THE "answer" FIELD:
 - Keep in mind that the user is asking a {message_type} question.
 - Answer in the language {original_language} in the script {original_script}.
 - Answer should be concise and to the point.
-- Do not include any information that is not present in the ADDITIONAL
+- Do not include any information that is not present in the ADDITIONAL \
 RELEVANT INFORMATION.
 
-Only output the JSON response, without any additional text.
-
-
-<ADDITIONAL RELEVANT INFORMATION>
-{additional_info}
-</ADDITIONAL RELEVANT INFORMATION>
+Only output the JSON response, without any additional text.\
 """
 
 
@@ -343,19 +335,17 @@ class IdentifiedLanguage(str, Enum):
     SWAHILI = "SWAHILI"
     UNINTELLIGIBLE = "UNINTELLIGIBLE"
     UNSUPPORTED = "UNSUPPORTED"
+
     # XHOSA = "XHOSA"
     # ZULU = "ZULU"
-
     @classmethod
     def get_supported_languages(cls) -> list[str]:
         """Return a list of supported languages.
-
         Returns
         -------
         list[str]
             A list of supported languages.
         """
-
         return [
             lang
             for lang in cls._member_names_
@@ -380,57 +370,53 @@ def _missing_(cls, value: str) -> IdentifiedLanguage:  # type: ignore[override]
 
         return cls.UNSUPPORTED
 
-    @classmethod
-    def get_prompt(cls) -> str:
-        """Return the prompt for the language identification bot.
-
-        Returns
-        -------
-        str
-            The prompt for the language identification bot.
-        """
-
-        return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip()
-
 
 class IdentifiedScript(str, Enum):
     """Script used in the user's input."""
 
-    LATIN = "Latin"
-    DEVANAGARI = "Devanagari"
-    BENGALI = "Bengali"
-    TAMIL = "Tamil"
-    TELUGU = "Telugu"
-    KANNADA = "Kannada"
-    MALAYALAM = "Malayalam"
-    GUJARATI = "Gujarati"
-    GURMUKHI = "Gurmukhi"
-    ORIYA = "Oriya"
-    # SINHALA = "Sinhala"
-    # MYANMAR = "Myanmar"
-    # ETHIOPIC = "Ethiopic"
-    # GEORGIAN = "Georgian"
-    # ARMENIAN = "Armenian"
-    # HEBREW = "Hebrew"
-    # GREEK = "Greek"
-    # TIBETAN = "Tibetan"
-    # MONGOLIAN = "Mongolian"
-    # KHMER = "Khmer"
-    # LAO = "Lao"
-    # VIETNAMESE = "Vietnamese"
-    # THAI_LAO = "Thai-Lao"
-    UNKNOWN = "Unknown"
+    LATIN = "LATIN"
+    DEVANAGARI = "DEVANAGARI"
+    BENGALI = "BENGALI"
+    TAMIL = "TAMIL"
+    TELUGU = "TELUGU"
+    KANNADA = "KANNADA"
+    MALAYALAM = "MALAYALAM"
+    GUJARATI = "GUJARATI"
+    # GURMUKHI = "GURMUKHI"
+    # ORIYA = "ORIYA"
+    # SINHALA = "SINHALA"
+    # MYANMAR = "MYANMAR"
+    # ETHIOPIC = "ETHIOPIC"
+    # GEORGIAN = "GEORGIAN"
+    # ARMENIAN = "ARMENIAN"
+    # HEBREW = "HEBREW"
+    # GREEK = "GREEK"
+    # TIBETAN = "TIBETAN"
+    # MONGOLIAN = "MONGOLIAN"
+    # KHMER = "KHMER"
+    # LAO = "LAO"
+    # VIETNAMESE = "VIETNAMESE"
+    # THAI_LAO = "THAI_LAO"
+    UNKNOWN = "UNKNOWN"
+
+    @classmethod
+    def get_supported_scripts(cls) -> list[str]:
+        """Return a list of supported scripts.
+        Returns
+        -------
+        list[str]
+            A list of supported scripts.
+        """
+        return [script for script in cls._member_names_ if script != "UNKNOWN"]
 
     @classmethod
     def _missing_(cls, value: str) -> IdentifiedScript:  # type: ignore[override]
         """If script identified is not one of the supported scripts, it is
         classified as UNKNOWN.
-
         Parameters
         ----------
         value
             The script identified.
-
         Returns
         -------
         Script
@@ -438,17 +424,6 @@ def _missing_(cls, value: str) -> IdentifiedScript:  # type: ignore[override]
         """
         return cls.UNKNOWN
 
-    @classmethod
-    def get_supported_scripts(cls) -> list[str]:
-        """Return a list of supported scripts.
-
-        Returns
-        -------
-        list[str]
-            A list of supported scripts.
-        """
-        return [script.value for script in cls if script != cls.UNKNOWN]
-
 
 class LanguageIdentificationResponse(BaseModel):
     """Pydantic model for the language identification response."""
@@ -490,13 +465,13 @@ def validate_script(cls, value: str) -> str:
 4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin"""
     + """
 Examples:
-"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}}
-"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}}
-"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}}
-"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}}
-"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}}
-"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}}
-"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}}
+"How many beds are there?" -> {{"language": "ENGLISH", "script": "LATIN"}}
+"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "LATIN"}}
+"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "DEVANAGARI"}}
+"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "LATIN"}}
+"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "LATIN"}}
+"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "LATIN"}}
+"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "ARABIC"}}
 
 Respond with a JSON object containing "language" and "script" keys.
 """
diff --git a/core_backend/app/llm_call/llm_rag.py b/core_backend/app/llm_call/llm_rag.py
index d2fb045d8..d7a1dea12 100644
--- a/core_backend/app/llm_call/llm_rag.py
+++ b/core_backend/app/llm_call/llm_rag.py
@@ -18,6 +18,7 @@
 from .utils import (
     _ask_llm_async,
     append_messages_to_chat_history,
+    format_prompt,
     get_chat_response,
     remove_json_markdown,
 )
@@ -127,14 +128,19 @@ async def get_llm_rag_answer_with_chat_history(
                 message_type=message_type,
                 original_language=original_language,
                 original_script=original_script,
-                additional_info=context,
             )
         )
 
+    user_message_with_context = format_prompt(
+        prompt=f"""{question}\n\n
+        <ADDITIONAL RELEVANT INFORMATION>
+        {context}
+        </ADDITIONAL RELEVANT INFORMATION>"""
+    )
     content = await get_chat_response(
         chat_history=chat_history,
         chat_params=chat_params,
-        message_params=question,
+        message_params=user_message_with_context,
         session_id=session_id,
         json_=True,
         metadata=metadata or {},
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index 06d0847a3..e73cc24d6 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -120,8 +120,8 @@ async def _identify_language(
     cleaned_json_str = remove_json_markdown(text=json_str)
     try:
         lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
-        identified_lang = getattr(IdentifiedLanguage, lang_info.language)
-        identified_script = getattr(IdentifiedScript, lang_info.script)
+        identified_lang = getattr(IdentifiedLanguage, lang_info.language.upper())
+        identified_script = getattr(IdentifiedScript, lang_info.script.upper())
     except ValidationError:
         identified_lang = IdentifiedLanguage.UNSUPPORTED
         identified_script = IdentifiedScript.LATIN

From 730a8ccbcbae28f11f6b7317247f8c43bb4ef7aa Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 18:10:15 +0530
Subject: [PATCH 11/18] fix tests and always run paraphrase guardrail

---
 core_backend/app/llm_call/process_input.py     |  7 +++----
 core_backend/tests/api/test_question_answer.py | 18 +++++++++---------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index e73cc24d6..c71371908 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -493,10 +493,9 @@ async def wrapper(
             The appropriate response object.
         """
 
-        if not query_refined.chat_query_params:
-            query_refined, response = await _paraphrase_question(
-                query_refined=query_refined, response=response
-            )
+        query_refined, response = await _paraphrase_question(
+            query_refined=query_refined, response=response
+        )
         response = await func(query_refined, response, *args, **kwargs)
 
         return response
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index 38fa74ddc..aa8648c5e 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -1050,15 +1050,15 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
     @pytest.mark.parametrize(
         "identified_lang_str,identified_script_str,should_error,expected_error_type",
         [
-            ("ENGLISH", "Latin", False, None),
-            ("HINDI", "Devanagari", False, None),
-            ("UNINTELLIGIBLE", "Latin", True, ErrorType.UNINTELLIGIBLE_INPUT),
-            ("UNINTELLIGIBLE", "Unknown", True, ErrorType.UNSUPPORTED_SCRIPT),
-            ("GIBBERISH", "Unknwon", True, ErrorType.UNSUPPORTED_SCRIPT),
-            ("GIBBERISH", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
-            ("UNSUPPORTED", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
-            ("SOME_UNSUPPORTED_LANG", "Unknown", True, ErrorType.UNSUPPORTED_LANGUAGE),
-            ("don't kow", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE),
+            ("ENGLISH", "LATIN", False, None),
+            ("HINDI", "DEVANAGARI", False, None),
+            ("UNINTELLIGIBLE", "LATIN", True, ErrorType.UNINTELLIGIBLE_INPUT),
+            ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
+            ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
+            ("GIBBERISH", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
+            ("UNSUPPORTED", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
+            ("SOME_UNSUPPORTED_LANG", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE),
+            ("don't kow", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
         ],
     )
     async def test_language_identify_error(

From 8b7e5a4cbfabbd94ddcd991e79f6f455f2d39a7a Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 18:14:47 +0530
Subject: [PATCH 12/18] use uppercase

---
 .../rails/data/language_identification.yaml   | 24 +++++++++----------
 .../rails/test_language_identification.py     |  2 +-
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/core_backend/tests/rails/data/language_identification.yaml b/core_backend/tests/rails/data/language_identification.yaml
index 7daa61476..4b28c20e2 100644
--- a/core_backend/tests/rails/data/language_identification.yaml
+++ b/core_backend/tests/rails/data/language_identification.yaml
@@ -2,7 +2,7 @@
 # improve this with a native speaker. These might be too "pure".
 
 HAUSA:
-  Latin:
+  LATIN:
     - Ina da yara biyu masu hanci
     - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi
     - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa?
@@ -13,59 +13,59 @@ HAUSA:
     - Ina kwana Maman mu
     - Wannan shago na ne
 ENGLISH:
-  Latin:
+  LATIN:
     - I have two children. You see I girl, what is the probability the other is also a girl?
     - No idea
     - Why you say that?
 XHOSA:
-  Latin:
+  LATIN:
     - Umama ngugqirha
     - Utata ngumongikazi
     - Ukuba intamo yam yayifuna ukwenza oko?
     - Iintsana zikhala kakhulu, huh?
 YORUBA: #h/t: Fola
-  Latin:
+  LATIN:
     - Ni bo ló ti ri owo yen?
     - Eyin melo ni e wa ni be?
     - Ki ni itumo oruko ẹ?
     - Ki ni o jẹ lánà?
     - Omo Ibadan ni mi
 IGBO: #h/t: Fola
-  Latin:
+  LATIN:
     - agụụ na-agụ m
     - agam aga ahia echi
     - ị hụla ngozi? ana m achọ ya.
     - m na-aga ọrụ
 KOREAN:
-  Korean:
+  KOREAN:
     - 애가 둘이예요
     - ㅋㅋㅋㅋㅋㅋ
     - 아이들이 많이 울어요ㅠ
     - 이 프로젝트 애칭은 ask-a-question이야.
 ZULU:
-  Latin:
+  LATIN:
     - Ngingumama
     - Ingabe uyi-bot noma ungumuntu?
     - Ngifuna ukwenza lokhu?
     - Izingane zikhala kakhulu, hhe
 AFRIKAANS:
-  Latin:
+  LATIN:
     - Ek het hierdie goddelose dal gemaak
     - Is covid nog 'n ding?
     - My hond het my huiswerk geëet
     - Het jy al gebraaide roomys probeer?
 HINDI: #h/t: Sid
-  Latin:
+  LATIN:
     - is ka matlab kya hai?
     - kabhi kabhi mere dil mein
-  Devanagari:
+  DEVANAGARI:
     - अंत में सभी लोग नाश्ता करने जाएं
     - गब्बर सिंह कह के गया जो डर गया वो मर गया
 MARATHI:
-  Latin:
+  LATIN:
     - Portal chi link aahe
 UNINTELLIGIBLE:
-  Unknown:
+  UNKNOWN:
     - sdfsdf sss dyhnel jjj
     - hs dsfsg xd ewwo ddfs
     - Heghlu'meH QaQ jajvam
diff --git a/core_backend/tests/rails/test_language_identification.py b/core_backend/tests/rails/test_language_identification.py
index b7c30f46f..6744d8216 100644
--- a/core_backend/tests/rails/test_language_identification.py
+++ b/core_backend/tests/rails/test_language_identification.py
@@ -77,7 +77,7 @@ async def test_language_identification(
         expected_language = "UNSUPPORTED"
 
     if expected_script not in available_scripts:
-        expected_script = "Unknown"
+        expected_script = "UNKNOWN"
 
     _, response = await _identify_language(query_refined=question, response=response)
 

From 15e083fafd0a680ce6094d03c03aaedcee1e652f Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 18:35:30 +0530
Subject: [PATCH 13/18] fix tests and how we get enum

---
 core_backend/app/llm_call/process_input.py     | 4 ++--
 core_backend/tests/api/test_question_answer.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index c71371908..4ae2eb368 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -120,8 +120,8 @@ async def _identify_language(
     cleaned_json_str = remove_json_markdown(text=json_str)
     try:
         lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
-        identified_lang = getattr(IdentifiedLanguage, lang_info.language.upper())
-        identified_script = getattr(IdentifiedScript, lang_info.script.upper())
+        identified_lang = IdentifiedLanguage(lang_info.language.upper())
+        identified_script = IdentifiedScript(lang_info.script.upper())
     except ValidationError:
         identified_lang = IdentifiedLanguage.UNSUPPORTED
         identified_script = IdentifiedScript.LATIN
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index aa8648c5e..0894ccbde 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -1053,7 +1053,7 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
             ("ENGLISH", "LATIN", False, None),
             ("HINDI", "DEVANAGARI", False, None),
             ("UNINTELLIGIBLE", "LATIN", True, ErrorType.UNINTELLIGIBLE_INPUT),
-            ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
+            ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNINTELLIGIBLE_INPUT),
             ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
             ("GIBBERISH", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
             ("UNSUPPORTED", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),

From a4136be621f058ada3b1e15e71f5a0827dad82d0 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 18:48:49 +0530
Subject: [PATCH 14/18] add test cases

---
 core_backend/app/llm_call/process_input.py     | 1 +
 core_backend/tests/api/test_question_answer.py | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index 4ae2eb368..a7a3d7a21 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -184,6 +184,7 @@ def _process_identified_language_response(
         )
         error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT
     else:
+        # TODO: create types for language x script combos
         if identified_script == IdentifiedScript.UNKNOWN:
             error_message = (
                 "Unsupported script. "
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index 0894ccbde..43e9b7ef5 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -1054,7 +1054,9 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
             ("HINDI", "DEVANAGARI", False, None),
             ("UNINTELLIGIBLE", "LATIN", True, ErrorType.UNINTELLIGIBLE_INPUT),
             ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNINTELLIGIBLE_INPUT),
-            ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
+            ("ENGLISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
+            ("ENGLISH", "Some unsupported script", True, ErrorType.UNSUPPORTED_SCRIPT),
+            ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE),
             ("GIBBERISH", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
             ("UNSUPPORTED", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
             ("SOME_UNSUPPORTED_LANG", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE),

From e951d0acdc17bb8fb2a356636db956541ef4f0e4 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 19:06:54 +0530
Subject: [PATCH 15/18] clean up error logic

---
 core_backend/app/llm_call/process_input.py | 35 ++++++++++------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index a7a3d7a21..a84d1e798 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -167,34 +167,31 @@ def _process_identified_language_response(
 
     supported_languages_list = IdentifiedLanguage.get_supported_languages()
     supported_scripts_list = IdentifiedScript.get_supported_scripts()
+    supported_languages_str = ", ".join(supported_languages_list)
+    suported_scripts_str = ", ".join(supported_scripts_list)
 
-    if (
-        identified_language in supported_languages_list
-        and identified_script in supported_scripts_list
-    ):
-        return response
+    language_ok = identified_language in supported_languages_list
+    script_ok = identified_script in supported_scripts_list
 
-    supported_languages = ", ".join(supported_languages_list)
-    supported_scripts = ", ".join(supported_scripts_list)
-
-    if identified_language == IdentifiedLanguage.UNINTELLIGIBLE:
+    if language_ok and script_ok:
+        return response
+    elif language_ok and not script_ok:
         error_message = (
-            "Unintelligible input. "
-            + f"The following languages are supported: {supported_languages}."
+            "Unsupported script. "
+            + f"Only the following scripts are supported: {suported_scripts_str}"
         )
-        error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT
-    else:
-        # TODO: create types for language x script combos
-        if identified_script == IdentifiedScript.UNKNOWN:
+        error_type: ErrorType = ErrorType.UNSUPPORTED_SCRIPT
+    else:  # regardless of script, language is not "ok"
+        if identified_language == IdentifiedLanguage.UNINTELLIGIBLE:
             error_message = (
-                "Unsupported script. "
-                + f"Only the following scripts are supported: {supported_scripts}"
+                "Unintelligible input. "
+                + f"The following languages are supported: {supported_languages_str}."
             )
-            error_type = ErrorType.UNSUPPORTED_SCRIPT
+            error_type = ErrorType.UNINTELLIGIBLE_INPUT
         else:
             error_message = (
                 "Unsupported language. Only the following languages "
-                + f"are supported: {supported_languages}."
+                + f"are supported: {supported_languages_str}."
             )
             error_type = ErrorType.UNSUPPORTED_LANGUAGE
 

From c8117de28461fa5c586c9d08328b6cacead2f559 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 19:14:57 +0530
Subject: [PATCH 16/18] fix conftest

---
 .secrets.baseline                  | 53 +-----------------------------
 core_backend/tests/api/conftest.py |  3 ++
 2 files changed, 4 insertions(+), 52 deletions(-)

diff --git a/.secrets.baseline b/.secrets.baseline
index 30aef52a0..f961ef821 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -348,57 +348,6 @@
         "line_number": 15
       }
     ],
-    "core_backend/tests/api/conftest.py": [
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "407c6798fe20fd5d75de4a233c156cc0fce510e3",
-        "is_verified": false,
-        "line_number": 46
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "42553e798bc193bcf25368b5e53ec7cd771483a7",
-        "is_verified": false,
-        "line_number": 47
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "9fb7fe1217aed442b04c0f5e43b5d5a7d3287097",
-        "is_verified": false,
-        "line_number": 50
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "767ef7376d44bb6e52b390ddcd12c1cb1b3902a4",
-        "is_verified": false,
-        "line_number": 51
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "70240b5d0947cc97447de496284791c12b2e678a",
-        "is_verified": false,
-        "line_number": 56
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "80fea3e25cb7e28550d13af9dfda7a9bd08c1a78",
-        "is_verified": false,
-        "line_number": 57
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "3465834d516797458465ae4ed2c62e7020032c4e",
-        "is_verified": false,
-        "line_number": 317
-      }
-    ],
     "core_backend/tests/api/test.env": [
       {
         "type": "Secret Keyword",
@@ -581,5 +530,5 @@
       }
     ]
   },
-  "generated_at": "2025-04-10T10:05:42Z"
+  "generated_at": "2025-04-10T13:44:48Z"
 }
diff --git a/core_backend/tests/api/conftest.py b/core_backend/tests/api/conftest.py
index ca0a53a0b..d0e8b15bc 100644
--- a/core_backend/tests/api/conftest.py
+++ b/core_backend/tests/api/conftest.py
@@ -35,6 +35,7 @@
     RAG,
     AlignmentScore,
     IdentifiedLanguage,
+    IdentifiedScript,
 )
 from core_backend.app.question_answer.models import (
     ContentFeedbackDB,
@@ -1703,7 +1704,9 @@ async def mock_identify_language(
     """
 
     query_refined.original_language = IdentifiedLanguage.ENGLISH
+    query_refined.original_script = IdentifiedScript.LATIN
     response.debug_info["original_language"] = "ENGLISH"
+    response.debug_info["original_script"] = "LATIN"
 
     return query_refined, response
 

From a81a3816ed477693fb7da49f05d7f55c4cb3537a Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Thu, 10 Apr 2025 19:37:18 +0530
Subject: [PATCH 17/18] fix logic

---
 core_backend/app/llm_call/llm_prompts.py   | 20 +++-----------------
 core_backend/app/llm_call/process_input.py |  7 ++++---
 2 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 92b63a125..f3603deb2 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -7,7 +7,7 @@
 from enum import Enum
 from typing import ClassVar, Literal
 
-from pydantic import BaseModel, ConfigDict, Field, field_validator
+from pydantic import BaseModel, ConfigDict, Field
 
 from .utils import format_prompt, remove_json_markdown
 
@@ -428,22 +428,8 @@ def _missing_(cls, value: str) -> IdentifiedScript:  # type: ignore[override]
 class LanguageIdentificationResponse(BaseModel):
     """Pydantic model for the language identification response."""
 
-    language: str
-    script: str
-
-    @field_validator("language")
-    def validate_language(cls, value: str) -> str:
-        """Make sure language input is a valid IdentifiedLanguage"""
-        if value not in IdentifiedLanguage._member_names_:
-            raise ValueError(f"Invalid language: {value}")
-        return value
-
-    @field_validator("script")
-    def validate_script(cls, value: str) -> str:
-        """Make sure script input is a valid IdentifiedScript"""
-        if value not in IdentifiedScript._member_names_:
-            raise ValueError(f"Invalid script: {value}")
-        return value
+    language: IdentifiedLanguage
+    script: IdentifiedScript
 
     model_config = ConfigDict(strict=True)
 
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index a84d1e798..b854d8ff0 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -167,12 +167,13 @@ def _process_identified_language_response(
 
     supported_languages_list = IdentifiedLanguage.get_supported_languages()
     supported_scripts_list = IdentifiedScript.get_supported_scripts()
-    supported_languages_str = ", ".join(supported_languages_list)
-    suported_scripts_str = ", ".join(supported_scripts_list)
 
     language_ok = identified_language in supported_languages_list
     script_ok = identified_script in supported_scripts_list
 
+    supported_languages_str = ", ".join(supported_languages_list)
+    suported_scripts_str = ", ".join(supported_scripts_list)
+
     if language_ok and script_ok:
         return response
     elif language_ok and not script_ok:
@@ -209,7 +210,7 @@ def _process_identified_language_response(
 
     logger.info(
         f"LANGUAGE IDENTIFICATION FAILED due to {error_message} "
-        f"language on query id: {str(response.query_id)}"
+        f"on query id: {str(response.query_id)}"
     )
 
     return error_response

From 40187b86fdd66d0579bba7b8f63407217cf30885 Mon Sep 17 00:00:00 2001
From: Suzin <7042047+suzinyou@users.noreply.github.com>
Date: Fri, 11 Apr 2025 17:15:42 +0530
Subject: [PATCH 18/18] Add query optimization back in, but don't run
 translation for chat queries

---
 core_backend/app/llm_call/llm_prompts.py    |  5 ++++-
 core_backend/app/llm_call/process_input.py  | 18 +++++++++++-------
 core_backend/app/question_answer/routers.py |  8 ++++++++
 core_backend/app/question_answer/utils.py   |  4 +++-
 core_backend/tests/api/test_chat.py         |  1 +
 5 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index f3603deb2..db7a154bd 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -268,7 +268,9 @@ class ChatHistory:
 
             {{
                 "message_type": "The type of the user's LATEST MESSAGE. List of valid
-                options are: {valid_message_types}"
+                options are: {valid_message_types}",
+                "query": "The vector database query that you have constructed based on
+                the user's LATEST MESSAGE and the conversation history."
             }}
 
             Do NOT attempt to answer the user's question/concern. Only output the JSON
@@ -283,6 +285,7 @@ class ChatHistoryConstructSearchQuery(BaseModel):
         """Pydantic model for the output of the construct search query chat history."""
 
         message_type: Literal["FOLLOW-UP", "NEW"]
+        query: str
 
     @staticmethod
     def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str]:
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index b854d8ff0..c6da6a5b4 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -114,7 +114,8 @@ async def _identify_language(
         litellm_model=LITELLM_MODEL_LANGUAGE_DETECT,
         metadata=metadata,
         system_message=LANGUAGE_ID_PROMPT,
-        user_message=query_refined.query_text,
+        # Always use the original query text for language and script detection
+        user_message=query_refined.query_text_original,
     )
 
     cleaned_json_str = remove_json_markdown(text=json_str)
@@ -256,9 +257,10 @@ async def wrapper(
             The appropriate response object.
         """
 
-        query_refined, response = await _translate_question(
-            query_refined=query_refined, response=response
-        )
+        if not query_refined.chat_query_params:
+            query_refined, response = await _translate_question(
+                query_refined=query_refined, response=response
+            )
         response = await func(query_refined, response, *args, **kwargs)
 
         return response
@@ -492,9 +494,11 @@ async def wrapper(
             The appropriate response object.
         """
 
-        query_refined, response = await _paraphrase_question(
-            query_refined=query_refined, response=response
-        )
+        if not query_refined.chat_query_params:
+            query_refined, response = await _paraphrase_question(
+                query_refined=query_refined, response=response
+            )
+
         response = await func(query_refined, response, *args, **kwargs)
 
         return response
diff --git a/core_backend/app/question_answer/routers.py b/core_backend/app/question_answer/routers.py
index 5a4b057b8..e6091edb1 100644
--- a/core_backend/app/question_answer/routers.py
+++ b/core_backend/app/question_answer/routers.py
@@ -844,6 +844,13 @@ async def get_user_query_and_response(
         workspace_id=workspace_id,
     )
 
+    # In case of a chat query, use the optimized query as the base query_text.
+    # Note that for language identification, we use query_text_original.
+    if user_query_refined.chat_query_params:
+        user_query_refined.query_text = user_query_refined.chat_query_params.pop(
+            "search_query"
+        )
+
     # Prepare the placeholder response object.
     response_template = QueryResponse(
         debug_info={},
@@ -1072,6 +1079,7 @@ async def init_user_query_and_chat_histories(
         "chat_history": user_assistant_chat_history,
         "chat_params": chat_params,
         "message_type": search_query_json_response["message_type"],
+        "search_query": search_query_json_response["query"],
         "redis_client": redis_client,
         "session_id": session_id,
     }
diff --git a/core_backend/app/question_answer/utils.py b/core_backend/app/question_answer/utils.py
index 029d7194c..f972e46dc 100644
--- a/core_backend/app/question_answer/utils.py
+++ b/core_backend/app/question_answer/utils.py
@@ -23,6 +23,8 @@ def get_context_string_from_search_results(
     for key, result in search_results.items():
         if not isinstance(result, QuerySearchResult):
             result = QuerySearchResult(**result)
-        context_list.append(f"{key}. {result.title}\n{result.text}")
+        context_list.append(
+            f"<document id={key}> \n**{result.title}**\n\n{result.text}\n</document>"
+        )
     context_string = "\n\n".join(context_list)
     return context_string
diff --git a/core_backend/tests/api/test_chat.py b/core_backend/tests/api/test_chat.py
index d32cb3436..ed2f35f5e 100644
--- a/core_backend/tests/api/test_chat.py
+++ b/core_backend/tests/api/test_chat.py
@@ -85,6 +85,7 @@ async def test_init_user_query_and_chat_histories(redis_client: aioredis.Redis)
             chat_query_params["chat_cache_key"] == f"chatCache:{user_query.session_id}"
         )
         assert chat_query_params["message_type"] == "NEW"
+        assert chat_query_params["search_query"] == "stomachache and possible remedies"
 
 
 async def test__ask_llm_async() -> None: