From a350cb0b46c71b6bddf145a8a3759cf4918d64d8 Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Wed, 2 Apr 2025 17:20:12 +0530 Subject: [PATCH 01/18] detect script as well --- core_backend/app/llm_call/llm_prompts.py | 230 ++++++++++++++------ core_backend/app/llm_call/llm_rag.py | 19 +- core_backend/app/llm_call/process_input.py | 24 +- core_backend/app/llm_call/process_output.py | 5 + core_backend/app/question_answer/schemas.py | 3 +- 5 files changed, 200 insertions(+), 81 deletions(-) diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py index 2ede20f4c..0d1076834 100644 --- a/core_backend/app/llm_call/llm_prompts.py +++ b/core_backend/app/llm_call/llm_prompts.py @@ -103,7 +103,7 @@ {context} IMPORTANT NOTES ON THE "answer" FIELD: -- Answer in the language of the question ({original_language}). +- Answer in the language {original_language} in the script {original_script}. - Answer should be concise, to the point, and no longer than 80 words. - Do not include any information that is not present in the REFERENCE TEXT. """ @@ -182,6 +182,61 @@ class AlignmentScore(BaseModel): model_config = ConfigDict(strict=True) +CHAT_RESPONSE_PROMPT = """\ +You are an AI assistant designed to help users with their +questions/concerns. You interact with users via a chat interface. You will +be provided with ADDITIONAL RELEVANT INFORMATION that can address the +user's questions/concerns. + +BEFORE answering the user's LATEST MESSAGE, follow these steps: + +1. Review the conversation history to ensure that you understand the +context in which the user's LATEST MESSAGE is being asked. +2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you +understand the most useful information related to the user's LATEST +MESSAGE. + +When you have completed the above steps, you will then write a JSON, whose +TypeScript Interface is given below: + +interface Response {{ + extracted_info: string[]; + answer: string; +}} + +For "extracted_info", extract from the provided ADDITIONAL RELEVANT +INFORMATION the most useful information related to the LATEST MESSAGE asked +by the user, and list them one by one. If no useful information is found, +return an empty list. + +For "answer", understand the conversation history, ADDITIONAL RELEVANT +INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to +the user's LATEST MESSAGE. If no useful information was found in the +either the conversation history or the ADDITIONAL RELEVANT INFORMATION, +respond with {failure_message}. + +EXAMPLE RESPONSES: +{{"extracted_info": [ + "Pineapples are a blend of pinecones and apples.", + "Pineapples have the shape of a pinecone." + ], + "answer": "The 'pine-' from pineapples likely come from the fact that + pineapples are a hybrid of pinecones and apples and its pinecone-like + shape." +}} +{{"extracted_info": [], "answer": "{failure_message}"}} + +IMPORTANT NOTES ON THE "answer" FIELD: +- Keep in mind that the user is asking a {message_type} question. +- Answer in the language {original_language} in the script {original_script}. +- Answer should be concise and to the point. +- Do not include any information that is not present in the ADDITIONAL +RELEVANT INFORMATION. + +Only output the JSON response, without any additional text. +""" + + class ChatHistory: """Contains the prompts and models for the chat history task.""" @@ -227,62 +282,7 @@ class ChatHistory: ), prompt_kws={"valid_message_types": _valid_message_types}, ) - system_message_generate_response = format_prompt( - prompt=textwrap.dedent( - """You are an AI assistant designed to help users with their - questions/concerns. You interact with users via a chat interface. You will - be provided with ADDITIONAL RELEVANT INFORMATION that can address the - user's questions/concerns. - - BEFORE answering the user's LATEST MESSAGE, follow these steps: - - 1. Review the conversation history to ensure that you understand the - context in which the user's LATEST MESSAGE is being asked. - 2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you - understand the most useful information related to the user's LATEST - MESSAGE. - - When you have completed the above steps, you will then write a JSON, whose - TypeScript Interface is given below: - - interface Response {{ - extracted_info: string[]; - answer: string; - }} - - For "extracted_info", extract from the provided ADDITIONAL RELEVANT - INFORMATION the most useful information related to the LATEST MESSAGE asked - by the user, and list them one by one. If no useful information is found, - return an empty list. - - For "answer", understand the conversation history, ADDITIONAL RELEVANT - INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to - the user's LATEST MESSAGE. If no useful information was found in the - either the conversation history or the ADDITIONAL RELEVANT INFORMATION, - respond with {failure_message}. - - EXAMPLE RESPONSES: - {{"extracted_info": [ - "Pineapples are a blend of pinecones and apples.", - "Pineapples have the shape of a pinecone." - ], - "answer": "The 'pine-' from pineapples likely come from the fact that - pineapples are a hybrid of pinecones and apples and its pinecone-like - shape." - }} - {{"extracted_info": [], "answer": "{failure_message}"}} - - IMPORTANT NOTES ON THE "answer" FIELD: - - Keep in mind that the user is asking a {message_type} question. - - Answer in the language of the question ({original_language}). - - Answer should be concise and to the point. - - Do not include any information that is not present in the ADDITIONAL - RELEVANT INFORMATION. - - Only output the JSON response, without any additional text. - """ - ) - ) + system_message_generate_response = CHAT_RESPONSE_PROMPT class ChatHistoryConstructSearchQuery(BaseModel): """Pydantic model for the output of the construct search query chat history.""" @@ -330,6 +330,106 @@ def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str] raise ValueError(f"Error validating the output: {e}") from e +class IdentifiedScript(str, Enum): + """Script used in the user's input.""" + + LATIN = "Latin" + DEVANAGARI = "Devanagari" + ARABIC = "Arabic" + CYRILLIC = "Cyrillic" + CHINESE = "Chinese" + JAPANESE = "Japanese" + KOREAN = "Korean" + THAI = "Thai" + BENGALI = "Bengali" + TAMIL = "Tamil" + TELUGU = "Telugu" + KANNADA = "Kannada" + MALAYALAM = "Malayalam" + GUJARATI = "Gujarati" + GURMUKHI = "Gurmukhi" + ORIYA = "Oriya" + SINHALA = "Sinhala" + MYANMAR = "Myanmar" + ETHIOPIC = "Ethiopic" + GEORGIAN = "Georgian" + ARMENIAN = "Armenian" + HEBREW = "Hebrew" + GREEK = "Greek" + TIBETAN = "Tibetan" + MONGOLIAN = "Mongolian" + KHMER = "Khmer" + LAO = "Lao" + VIETNAMESE = "Vietnamese" + THAI_LAO = "Thai-Lao" + UNKNOWN = "Unknown" + + @classmethod + def _missing_(cls, value: str) -> IdentifiedScript: # type: ignore[override] + """If script identified is not one of the supported scripts, it is + classified as UNKNOWN. + + Parameters + ---------- + value + The script identified. + + Returns + ------- + Script + The identified script (i.e., UNKNOWN). + """ + return cls.UNKNOWN + + @classmethod + def get_supported_scripts(cls) -> list[str]: + """Return a list of supported scripts. + + Returns + ------- + list[str] + A list of supported scripts. + """ + return [script.value for script in cls if script != cls.UNKNOWN] + + +class LanguageIdentificationResponse(BaseModel): + """Pydantic model for the language identification response.""" + + language: IdentifiedLanguage + script: IdentifiedScript + + model_config = ConfigDict(strict=True) + + +LANGUAGE_ID_PROMPT = f"""\ +You are a high-performing language identification bot that classifies the \ +language and script of the user input. + +For each input, identify: +1. The language (must be one of {{member_names}}) +2. The script (must be one of {", ".join(IdentifiedScript.get_supported_scripts())}) + +If the user input is: +1. in one of the supported languages, respond with that language and its script +2. written in a mix of languages, respond with the dominant language and its script +3. in a real language but not a supported language, respond with UNSUPPORTED and \ +its script +4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin + +Examples: +"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}} +"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}} +"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}} +"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}} +"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}} +"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}} +"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}} + +Respond with a JSON object containing "language" and "script" keys. +""" + + class IdentifiedLanguage(str, Enum): """Identified language of the user's input.""" @@ -387,21 +487,7 @@ def get_prompt(cls) -> str: The prompt for the language identification bot. """ - return textwrap.dedent( - f""" - You are a high-performing language identification bot that classifies the - language of the user input into one of {", ".join(cls._member_names_)}. - - If the user input is - 1. in one of the supported languages, then respond with that language. - 2. written in a mix of languages, then respond with the dominant language. - 3. in a real language but not a supported language, then respond with - UNSUPPORTED. - 4. unintelligible or gibberish, then respond with UNINTELLIGIBLE. - - Answer should be a single word and strictly one of - [{", ".join(cls._member_names_)}]""" - ).strip() + return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip() class RAG(BaseModel): diff --git a/core_backend/app/llm_call/llm_rag.py b/core_backend/app/llm_call/llm_rag.py index ab4431ade..49a229364 100644 --- a/core_backend/app/llm_call/llm_rag.py +++ b/core_backend/app/llm_call/llm_rag.py @@ -8,7 +8,13 @@ from ..config import LITELLM_MODEL_GENERATION from ..utils import setup_logger -from .llm_prompts import RAG, RAG_FAILURE_MESSAGE, ChatHistory, IdentifiedLanguage +from .llm_prompts import ( + RAG, + RAG_FAILURE_MESSAGE, + ChatHistory, + IdentifiedLanguage, + IdentifiedScript, +) from .utils import ( _ask_llm_async, append_messages_to_chat_history, @@ -24,6 +30,7 @@ async def get_llm_rag_answer( context: str, metadata: dict | None = None, original_language: IdentifiedLanguage, + original_script: IdentifiedScript, question: str, ) -> RAG: """Get an answer from the LLM model using RAG. @@ -36,6 +43,8 @@ async def get_llm_rag_answer( Additional metadata to provide to the LLM model. original_language The original language of the question. + original_script + The scrip in which the original question was written. question The question to ask the LLM model. @@ -46,7 +55,11 @@ async def get_llm_rag_answer( """ metadata = metadata or {} - prompt = RAG.prompt.format(context=context, original_language=original_language) + prompt = RAG.prompt.format( + context=context, + original_language=original_language, + original_script=original_script, + ) result = await _ask_llm_async( json_=True, @@ -75,6 +88,7 @@ async def get_llm_rag_answer_with_chat_history( message_type: str, metadata: dict | None = None, original_language: IdentifiedLanguage, + original_script: IdentifiedScript, question: str, session_id: str, ) -> tuple[RAG, list[dict[str, str | None]]]: @@ -112,6 +126,7 @@ async def get_llm_rag_answer_with_chat_history( failure_message=RAG_FAILURE_MESSAGE, message_type=message_type, original_language=original_language, + original_script=original_script, ) ) content = ( diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index 9a30ffdeb..d714527b9 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -3,6 +3,8 @@ from functools import wraps from typing import Any, Callable, Optional +from pydantic import ValidationError + from ..config import ( LITELLM_MODEL_LANGUAGE_DETECT, LITELLM_MODEL_PARAPHRASE, @@ -22,9 +24,11 @@ TRANSLATE_FAILED_MESSAGE, TRANSLATE_PROMPT, IdentifiedLanguage, + IdentifiedScript, + LanguageIdentificationResponse, SafetyClassification, ) -from .utils import _ask_llm_async +from .utils import _ask_llm_async, remove_json_markdown logger = setup_logger(name="INPUT RAILS") @@ -84,7 +88,7 @@ async def _identify_language( query_refined: QueryRefined, response: QueryResponse | QueryResponseError, ) -> tuple[QueryRefined, QueryResponse | QueryResponseError]: - """Identify the language of the question. + """Identify the language and script of the question. Parameters ---------- @@ -104,19 +108,27 @@ async def _identify_language( if isinstance(response, QueryResponseError): return query_refined, response - llm_identified_lang = await _ask_llm_async( + json_str = await _ask_llm_async( + json_=True, litellm_model=LITELLM_MODEL_LANGUAGE_DETECT, metadata=metadata, system_message=IdentifiedLanguage.get_prompt(), user_message=query_refined.query_text, ) - identified_lang = getattr( - IdentifiedLanguage, llm_identified_lang, IdentifiedLanguage.UNSUPPORTED - ) + try: + cleaned_json_str = remove_json_markdown(text=json_str) + lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str) + identified_lang = lang_info["language"] + identified_script = lang_info["script"] + except ValidationError: + identified_lang = IdentifiedLanguage.UNSUPPORTED + identified_script = IdentifiedScript.LATIN + query_refined.original_language = identified_lang response.debug_info["original_query"] = query_refined.query_text_original response.debug_info["original_language"] = identified_lang + response.debug_info["original_script"] = identified_script processed_response = _process_identified_language_response( identified_language=identified_lang, response=response diff --git a/core_backend/app/llm_call/process_output.py b/core_backend/app/llm_call/process_output.py index a4671030b..2a569f0e1 100644 --- a/core_backend/app/llm_call/process_output.py +++ b/core_backend/app/llm_call/process_output.py @@ -84,6 +84,9 @@ async def generate_llm_query_response( if query_refined.original_language is None: logger.warning("No original_language found in the query.") return response, chat_history + if query_refined.original_script is None: + logger.warning("No original_script found in the query.") + return response, chat_history context = get_context_string_from_search_results( search_results=response.search_results @@ -98,6 +101,7 @@ async def generate_llm_query_response( message_type=message_type, metadata=metadata, original_language=query_refined.original_language, + original_script=query_refined.original_script, question=query_refined.query_text_original, session_id=chat_query_params["session_id"], ) @@ -106,6 +110,7 @@ async def generate_llm_query_response( context=context, metadata=metadata, original_language=query_refined.original_language, + original_script=query_refined.original_script, question=query_refined.query_text_original, # Use the original query text ) diff --git a/core_backend/app/question_answer/schemas.py b/core_backend/app/question_answer/schemas.py index 8904e2c36..bda58ce7a 100644 --- a/core_backend/app/question_answer/schemas.py +++ b/core_backend/app/question_answer/schemas.py @@ -6,7 +6,7 @@ from pydantic import BaseModel, ConfigDict, Field from pydantic.json_schema import SkipJsonSchema -from ..llm_call.llm_prompts import IdentifiedLanguage +from ..llm_call.llm_prompts import IdentifiedLanguage, IdentifiedScript from ..schemas import FeedbackSentiment, QuerySearchResult @@ -49,6 +49,7 @@ class QueryRefined(QueryBase): generate_tts: bool = Field(False) original_language: IdentifiedLanguage | None = None + original_script: IdentifiedScript | None = None query_text_original: str workspace_id: int From 5b1759350554ae2edd2f4a95d5f4fcbc5f99995c Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Wed, 2 Apr 2025 17:32:09 +0530 Subject: [PATCH 02/18] fix prompt --- core_backend/app/llm_call/llm_prompts.py | 100 +++++++++++---------- core_backend/app/llm_call/process_input.py | 3 +- 2 files changed, 53 insertions(+), 50 deletions(-) diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py index 0d1076834..18c0d5864 100644 --- a/core_backend/app/llm_call/llm_prompts.py +++ b/core_backend/app/llm_call/llm_prompts.py @@ -335,33 +335,33 @@ class IdentifiedScript(str, Enum): LATIN = "Latin" DEVANAGARI = "Devanagari" - ARABIC = "Arabic" - CYRILLIC = "Cyrillic" - CHINESE = "Chinese" - JAPANESE = "Japanese" - KOREAN = "Korean" - THAI = "Thai" + # ARABIC = "Arabic" + # CYRILLIC = "Cyrillic" + # CHINESE = "Chinese" + # JAPANESE = "Japanese" + # KOREAN = "Korean" + # THAI = "Thai" BENGALI = "Bengali" TAMIL = "Tamil" TELUGU = "Telugu" KANNADA = "Kannada" MALAYALAM = "Malayalam" GUJARATI = "Gujarati" - GURMUKHI = "Gurmukhi" - ORIYA = "Oriya" - SINHALA = "Sinhala" - MYANMAR = "Myanmar" - ETHIOPIC = "Ethiopic" - GEORGIAN = "Georgian" - ARMENIAN = "Armenian" - HEBREW = "Hebrew" - GREEK = "Greek" - TIBETAN = "Tibetan" - MONGOLIAN = "Mongolian" - KHMER = "Khmer" - LAO = "Lao" - VIETNAMESE = "Vietnamese" - THAI_LAO = "Thai-Lao" + # GURMUKHI = "Gurmukhi" + # ORIYA = "Oriya" + # SINHALA = "Sinhala" + # MYANMAR = "Myanmar" + # ETHIOPIC = "Ethiopic" + # GEORGIAN = "Georgian" + # ARMENIAN = "Armenian" + # HEBREW = "Hebrew" + # GREEK = "Greek" + # TIBETAN = "Tibetan" + # MONGOLIAN = "Mongolian" + # KHMER = "Khmer" + # LAO = "Lao" + # VIETNAMESE = "Vietnamese" + # THAI_LAO = "Thai-Lao" UNKNOWN = "Unknown" @classmethod @@ -402,34 +402,6 @@ class LanguageIdentificationResponse(BaseModel): model_config = ConfigDict(strict=True) -LANGUAGE_ID_PROMPT = f"""\ -You are a high-performing language identification bot that classifies the \ -language and script of the user input. - -For each input, identify: -1. The language (must be one of {{member_names}}) -2. The script (must be one of {", ".join(IdentifiedScript.get_supported_scripts())}) - -If the user input is: -1. in one of the supported languages, respond with that language and its script -2. written in a mix of languages, respond with the dominant language and its script -3. in a real language but not a supported language, respond with UNSUPPORTED and \ -its script -4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin - -Examples: -"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}} -"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}} -"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}} -"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}} -"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}} -"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}} -"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}} - -Respond with a JSON object containing "language" and "script" keys. -""" - - class IdentifiedLanguage(str, Enum): """Identified language of the user's input.""" @@ -490,6 +462,36 @@ def get_prompt(cls) -> str: return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip() +LANGUAGE_ID_PROMPT = ( + f"""\ +You are a high-performing language identification bot that classifies the \ +language and script of the user input. + +For each input, identify: +1. The language (must be one of {", ".join(IdentifiedLanguage._member_names_)}) +2. The script (must be one of {", ".join(IdentifiedScript._member_names_)}) + +If the user input is: +1. in one of the supported languages, respond with that language and its script +2. written in a mix of languages, respond with the dominant language and its script +3. in a real language but not a supported language, respond with UNSUPPORTED and \ +its script +4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin""" + + """ +Examples: +"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}} +"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}} +"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}} +"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}} +"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}} +"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}} +"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}} + +Respond with a JSON object containing "language" and "script" keys. +""" +) + + class RAG(BaseModel): """Generated response based on question and retrieved context.""" diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index d714527b9..dfc3341a7 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -19,6 +19,7 @@ ) from ..utils import setup_logger from .llm_prompts import ( + LANGUAGE_ID_PROMPT, PARAPHRASE_FAILED_MESSAGE, PARAPHRASE_PROMPT, TRANSLATE_FAILED_MESSAGE, @@ -112,7 +113,7 @@ async def _identify_language( json_=True, litellm_model=LITELLM_MODEL_LANGUAGE_DETECT, metadata=metadata, - system_message=IdentifiedLanguage.get_prompt(), + system_message=LANGUAGE_ID_PROMPT, user_message=query_refined.query_text, ) From f7fee04f2f59d37370950c4ce4d87e449c4f2e5b Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Wed, 9 Apr 2025 14:03:06 +0530 Subject: [PATCH 03/18] fix tests --- .secrets.baseline | 8 +- core_backend/app/llm_call/llm_prompts.py | 133 +++++++++--------- core_backend/app/llm_call/process_input.py | 4 +- .../tests/api/test_question_answer.py | 21 ++- .../rails/data/language_identification.yaml | 108 +++++++------- .../rails/test_language_identification.py | 46 ++++-- 6 files changed, 185 insertions(+), 135 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 5cab9e8c1..2ba6baa9b 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -448,14 +448,14 @@ "filename": "core_backend/tests/api/test_question_answer.py", "hashed_secret": "1d2be5ef28a76e2207456e7eceabe1219305e43d", "is_verified": false, - "line_number": 294 + "line_number": 419 }, { "type": "Secret Keyword", "filename": "core_backend/tests/api/test_question_answer.py", "hashed_secret": "6367c48dd193d56ea7b0baad25b19455e529f5ee", "is_verified": false, - "line_number": 653 + "line_number": 1019 } ], "core_backend/tests/api/test_user_tools.py": [ @@ -473,7 +473,7 @@ "filename": "core_backend/tests/rails/test_language_identification.py", "hashed_secret": "051b2c1d98174fabc4749641c4f4f4660556441e", "is_verified": false, - "line_number": 48 + "line_number": 69 } ], "core_backend/tests/rails/test_paraphrasing.py": [ @@ -581,5 +581,5 @@ } ] }, - "generated_at": "2025-01-24T13:35:08Z" + "generated_at": "2025-04-09T08:32:56Z" } diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py index 18c0d5864..751a1079a 100644 --- a/core_backend/app/llm_call/llm_prompts.py +++ b/core_backend/app/llm_call/llm_prompts.py @@ -330,6 +330,67 @@ def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str] raise ValueError(f"Error validating the output: {e}") from e +class IdentifiedLanguage(str, Enum): + """Identified language of the user's input.""" + + # AFRIKAANS = "AFRIKAANS" + ENGLISH = "ENGLISH" + FRENCH = "FRENCH" + HINDI = "HINDI" + MARATHI = "MARATHI" + SWAHILI = "SWAHILI" + UNINTELLIGIBLE = "UNINTELLIGIBLE" + UNSUPPORTED = "UNSUPPORTED" + # XHOSA = "XHOSA" + # ZULU = "ZULU" + + @classmethod + def get_supported_languages(cls) -> list[str]: + """Return a list of supported languages. + + Returns + ------- + list[str] + A list of supported languages. + """ + + return [ + lang + for lang in cls._member_names_ + if lang not in ("UNINTELLIGIBLE", "UNSUPPORTED") + ] + + @classmethod + def _missing_(cls, value: str) -> IdentifiedLanguage: # type: ignore[override] + """If language identified is not one of the supported language, it is + classified as UNSUPPORTED. + + Parameters + ---------- + value + The language identified. + + Returns + ------- + IdentifiedLanguage + The identified language (i.e., UNSUPPORTED). + """ + + return cls.UNSUPPORTED + + @classmethod + def get_prompt(cls) -> str: + """Return the prompt for the language identification bot. + + Returns + ------- + str + The prompt for the language identification bot. + """ + + return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip() + + class IdentifiedScript(str, Enum): """Script used in the user's input.""" @@ -341,12 +402,12 @@ class IdentifiedScript(str, Enum): # JAPANESE = "Japanese" # KOREAN = "Korean" # THAI = "Thai" - BENGALI = "Bengali" - TAMIL = "Tamil" - TELUGU = "Telugu" - KANNADA = "Kannada" - MALAYALAM = "Malayalam" - GUJARATI = "Gujarati" + # BENGALI = "Bengali" + # TAMIL = "Tamil" + # TELUGU = "Telugu" + # KANNADA = "Kannada" + # MALAYALAM = "Malayalam" + # GUJARATI = "Gujarati" # GURMUKHI = "Gurmukhi" # ORIYA = "Oriya" # SINHALA = "Sinhala" @@ -402,66 +463,6 @@ class LanguageIdentificationResponse(BaseModel): model_config = ConfigDict(strict=True) -class IdentifiedLanguage(str, Enum): - """Identified language of the user's input.""" - - # AFRIKAANS = "AFRIKAANS" - ENGLISH = "ENGLISH" - FRENCH = "FRENCH" - HINDI = "HINDI" - SWAHILI = "SWAHILI" - UNINTELLIGIBLE = "UNINTELLIGIBLE" - UNSUPPORTED = "UNSUPPORTED" - # XHOSA = "XHOSA" - # ZULU = "ZULU" - - @classmethod - def get_supported_languages(cls) -> list[str]: - """Return a list of supported languages. - - Returns - ------- - list[str] - A list of supported languages. - """ - - return [ - lang - for lang in cls._member_names_ - if lang not in ("UNINTELLIGIBLE", "UNSUPPORTED") - ] - - @classmethod - def _missing_(cls, value: str) -> IdentifiedLanguage: # type: ignore[override] - """If language identified is not one of the supported language, it is - classified as UNSUPPORTED. - - Parameters - ---------- - value - The language identified. - - Returns - ------- - IdentifiedLanguage - The identified language (i.e., UNSUPPORTED). - """ - - return cls.UNSUPPORTED - - @classmethod - def get_prompt(cls) -> str: - """Return the prompt for the language identification bot. - - Returns - ------- - str - The prompt for the language identification bot. - """ - - return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip() - - LANGUAGE_ID_PROMPT = ( f"""\ You are a high-performing language identification bot that classifies the \ diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index dfc3341a7..ba8d1025a 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -120,8 +120,8 @@ async def _identify_language( try: cleaned_json_str = remove_json_markdown(text=json_str) lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str) - identified_lang = lang_info["language"] - identified_script = lang_info["script"] + identified_lang = lang_info.language + identified_script = lang_info.script except ValidationError: identified_lang = IdentifiedLanguage.UNSUPPORTED identified_script = IdentifiedScript.LATIN diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py index 163e77574..50299a7b4 100644 --- a/core_backend/tests/api/test_question_answer.py +++ b/core_backend/tests/api/test_question_answer.py @@ -10,7 +10,11 @@ from fastapi import status from fastapi.testclient import TestClient -from core_backend.app.llm_call.llm_prompts import AlignmentScore, IdentifiedLanguage +from core_backend.app.llm_call.llm_prompts import ( + AlignmentScore, + IdentifiedLanguage, + LanguageIdentificationResponse, +) from core_backend.app.llm_call.process_input import ( _classify_safety, _identify_language, @@ -1045,10 +1049,10 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined: ) @pytest.mark.parametrize( - "identified_lang_str,should_error,expected_error_type", + "identified_lang_str,identified_script_str,should_error,expected_error_type", [ - ("ENGLISH", False, None), - ("HINDI", False, None), + ("ENGLISH", "Latin", False, None), + ("HINDI", "Devanagari", False, None), ("UNINTELLIGIBLE", True, ErrorType.UNINTELLIGIBLE_INPUT), ("GIBBERISH", True, ErrorType.UNSUPPORTED_LANGUAGE), ("UNSUPPORTED", True, ErrorType.UNSUPPORTED_LANGUAGE), @@ -1059,6 +1063,7 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined: async def test_language_identify_error( self, identified_lang_str: str, + identified_script_str: str, should_error: bool, expected_error_type: ErrorType, monkeypatch: pytest.MonkeyPatch, @@ -1084,6 +1089,7 @@ async def test_language_identify_error( generate_llm_response=False, generate_tts=False, original_language=None, + original_script=None, query_text="This is a basic query", query_text_original="This is a query original", workspace_id=124, @@ -1104,10 +1110,12 @@ async def mock_ask_llm( # pylint: disable=W0613 Returns ------- str - The identified language string. + The identified language and script model json string. """ - return identified_lang_str + return LanguageIdentificationResponse( + language=identified_lang_str, script=identified_script_str + ).model_dump_json() monkeypatch.setattr( "core_backend.app.llm_call.process_input._ask_llm_async", mock_ask_llm @@ -1233,6 +1241,7 @@ async def mock_ask_llm( # pylint: disable=W0613 generate_llm_response=False, generate_tts=False, original_language=None, + original_script=None, query_text="This is a basic query", query_text_original="This is a query original", workspace_id=124, diff --git a/core_backend/tests/rails/data/language_identification.yaml b/core_backend/tests/rails/data/language_identification.yaml index a4d3ddb34..7daa61476 100644 --- a/core_backend/tests/rails/data/language_identification.yaml +++ b/core_backend/tests/rails/data/language_identification.yaml @@ -2,59 +2,73 @@ # improve this with a native speaker. These might be too "pure". HAUSA: - - Ina da yara biyu masu hanci - - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi - - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa? - - Menene wannan? - - Sannun ku da zuwa #h/t: Fola from here on - - Ni yarinya ne - - Zo ka chi abunchi - - Ina kwana Maman mu - - Wannan shago na ne + Latin: + - Ina da yara biyu masu hanci + - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi + - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa? + - Menene wannan? + - Sannun ku da zuwa #h/t: Fola from here on + - Ni yarinya ne + - Zo ka chi abunchi + - Ina kwana Maman mu + - Wannan shago na ne ENGLISH: - - I have two children. You see I girl, what is the probability the other is also a girl? - - No idea - - Why you say that? + Latin: + - I have two children. You see I girl, what is the probability the other is also a girl? + - No idea + - Why you say that? XHOSA: - - Umama ngugqirha - - Utata ngumongikazi - - Ukuba intamo yam yayifuna ukwenza oko? - - Iintsana zikhala kakhulu, huh? + Latin: + - Umama ngugqirha + - Utata ngumongikazi + - Ukuba intamo yam yayifuna ukwenza oko? + - Iintsana zikhala kakhulu, huh? YORUBA: #h/t: Fola - - Ni bo ló ti ri owo yen? - - Eyin melo ni e wa ni be? - - Ki ni itumo oruko ẹ? - - Ki ni o jẹ lánà? - - Omo Ibadan ni mi + Latin: + - Ni bo ló ti ri owo yen? + - Eyin melo ni e wa ni be? + - Ki ni itumo oruko ẹ? + - Ki ni o jẹ lánà? + - Omo Ibadan ni mi IGBO: #h/t: Fola - - agụụ na-agụ m - - agam aga ahia echi - - ị hụla ngozi? ana m achọ ya. - - m na-aga ọrụ + Latin: + - agụụ na-agụ m + - agam aga ahia echi + - ị hụla ngozi? ana m achọ ya. + - m na-aga ọrụ KOREAN: - - 애가 둘이예요 - - ㅋㅋㅋㅋㅋㅋ - - 아이들이 많이 울어요ㅠ - - 이 프로젝트 애칭은 ask-a-question이야. + Korean: + - 애가 둘이예요 + - ㅋㅋㅋㅋㅋㅋ + - 아이들이 많이 울어요ㅠ + - 이 프로젝트 애칭은 ask-a-question이야. ZULU: - - Ngingumama - - Ingabe uyi-bot noma ungumuntu? - - Ngifuna ukwenza lokhu? - - Izingane zikhala kakhulu, hhe + Latin: + - Ngingumama + - Ingabe uyi-bot noma ungumuntu? + - Ngifuna ukwenza lokhu? + - Izingane zikhala kakhulu, hhe AFRIKAANS: - - Ek het hierdie goddelose dal gemaak - - Is covid nog 'n ding? - - My hond het my huiswerk geëet - - Het jy al gebraaide roomys probeer? + Latin: + - Ek het hierdie goddelose dal gemaak + - Is covid nog 'n ding? + - My hond het my huiswerk geëet + - Het jy al gebraaide roomys probeer? HINDI: #h/t: Sid - - is ka matlab kya hai? - - kabhi kabhi mere dil mein - - अंत में सभी लोग नाश्ता करने जाएं - - गब्बर सिंह कह के गया जो डर गया वो मर गया + Latin: + - is ka matlab kya hai? + - kabhi kabhi mere dil mein + Devanagari: + - अंत में सभी लोग नाश्ता करने जाएं + - गब्बर सिंह कह के गया जो डर गया वो मर गया +MARATHI: + Latin: + - Portal chi link aahe UNINTELLIGIBLE: - - sdfsdf sss dyhnel jjj - - hs dsfsg xd ewwo ddfs - - Heghlu'meH QaQ jajvam - - yIHuchQo', 'ej jIHvaD yIqemchu'mo' - - \%^*# levels; 91011 AQGs!!! - - 1234 AQI WHO? 5678 + Unknown: + - sdfsdf sss dyhnel jjj + - hs dsfsg xd ewwo ddfs + - Heghlu'meH QaQ jajvam + - yIHuchQo', 'ej jIHvaD yIqemchu'mo' + - \%^*# levels; 91011 AQGs!!! + - 1234 AQI WHO? 5678 diff --git a/core_backend/tests/rails/test_language_identification.py b/core_backend/tests/rails/test_language_identification.py index 9b30b2e9a..b7c30f46f 100644 --- a/core_backend/tests/rails/test_language_identification.py +++ b/core_backend/tests/rails/test_language_identification.py @@ -5,7 +5,7 @@ import pytest import yaml -from core_backend.app.llm_call.llm_prompts import IdentifiedLanguage +from core_backend.app.llm_call.llm_prompts import IdentifiedLanguage, IdentifiedScript from core_backend.app.llm_call.process_input import _identify_language from core_backend.app.question_answer.schemas import QueryRefined, QueryResponse @@ -22,19 +22,38 @@ def available_languages() -> list[str]: return list(IdentifiedLanguage) -def read_test_data(file: str) -> list[tuple[str, str]]: +@pytest.fixture(scope="module") +def available_scripts() -> list[str]: + """Returns a list of available languages.""" + + return list(IdentifiedScript) + + +def read_test_data(file: str) -> list[tuple[str, str, str]]: """Reads test data from file and returns a list of strings.""" file_path = Path(__file__).parent / file with open(file_path, "r", encoding="utf-8") as f: content = yaml.safe_load(f) - return [(key, value) for key, values in content.items() for value in values] - - -@pytest.mark.parametrize("expected_label, content", read_test_data(LANGUAGE_FILE)) + data = [ + (language, script, text) + for language, script_dict in content.items() + for script, texts in script_dict.items() + for text in texts + ] + return data + + +@pytest.mark.parametrize( + "expected_language,expected_script,content", read_test_data(LANGUAGE_FILE) +) async def test_language_identification( - available_languages: list[str], expected_label: str, content: str + available_languages: list[str], + available_scripts: list[str], + expected_language: str, + expected_script: str, + content: str, ) -> None: """Test language identification.""" @@ -53,8 +72,15 @@ async def test_language_identification( search_results=None, session_id=None, ) - if expected_label not in available_languages: - expected_label = "UNSUPPORTED" + + if expected_language not in available_languages: + expected_language = "UNSUPPORTED" + + if expected_script not in available_scripts: + expected_script = "Unknown" + _, response = await _identify_language(query_refined=question, response=response) - assert response.debug_info["original_language"] == expected_label + assert response.debug_info["original_language"] == expected_language + if expected_language not in ("UNINTELLIGIBLE", "UNSUPPORTED"): + assert response.debug_info["original_script"] == expected_script From ab2fd75511f3333e7bccde9ddd5388c1ee22ae8e Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Wed, 9 Apr 2025 18:38:28 +0530 Subject: [PATCH 04/18] changes --- core_backend/app/llm_call/llm_prompts.py | 27 +++++++------- core_backend/app/llm_call/llm_rag.py | 14 ++------ core_backend/app/llm_call/process_input.py | 40 +++++++++++++++------ core_backend/app/question_answer/routers.py | 4 --- core_backend/app/question_answer/schemas.py | 1 + 5 files changed, 46 insertions(+), 40 deletions(-) diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py index 751a1079a..d82e3aed7 100644 --- a/core_backend/app/llm_call/llm_prompts.py +++ b/core_backend/app/llm_call/llm_prompts.py @@ -234,6 +234,11 @@ class AlignmentScore(BaseModel): RELEVANT INFORMATION. Only output the JSON response, without any additional text. + + + +{additional_info} + """ @@ -396,20 +401,14 @@ class IdentifiedScript(str, Enum): LATIN = "Latin" DEVANAGARI = "Devanagari" - # ARABIC = "Arabic" - # CYRILLIC = "Cyrillic" - # CHINESE = "Chinese" - # JAPANESE = "Japanese" - # KOREAN = "Korean" - # THAI = "Thai" - # BENGALI = "Bengali" - # TAMIL = "Tamil" - # TELUGU = "Telugu" - # KANNADA = "Kannada" - # MALAYALAM = "Malayalam" - # GUJARATI = "Gujarati" - # GURMUKHI = "Gurmukhi" - # ORIYA = "Oriya" + BENGALI = "Bengali" + TAMIL = "Tamil" + TELUGU = "Telugu" + KANNADA = "Kannada" + MALAYALAM = "Malayalam" + GUJARATI = "Gujarati" + GURMUKHI = "Gurmukhi" + ORIYA = "Oriya" # SINHALA = "Sinhala" # MYANMAR = "Myanmar" # ETHIOPIC = "Ethiopic" diff --git a/core_backend/app/llm_call/llm_rag.py b/core_backend/app/llm_call/llm_rag.py index 49a229364..d2fb045d8 100644 --- a/core_backend/app/llm_call/llm_rag.py +++ b/core_backend/app/llm_call/llm_rag.py @@ -127,24 +127,14 @@ async def get_llm_rag_answer_with_chat_history( message_type=message_type, original_language=original_language, original_script=original_script, + additional_info=context, ) ) - content = ( - question - + f""""\n\n - ADDITIONAL RELEVANT INFORMATION BELOW - ===================================== - {context} - - ADDITIONAL RELEVANT INFORMATION ABOVE - ===================================== - """ - ) content = await get_chat_response( chat_history=chat_history, chat_params=chat_params, - message_params=content, + message_params=question, session_id=session_id, json_=True, metadata=metadata or {}, diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index ba8d1025a..80b462f36 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -127,19 +127,26 @@ async def _identify_language( identified_script = IdentifiedScript.LATIN query_refined.original_language = identified_lang + query_refined.original_script = identified_script + response.debug_info["original_query"] = query_refined.query_text_original response.debug_info["original_language"] = identified_lang response.debug_info["original_script"] = identified_script processed_response = _process_identified_language_response( - identified_language=identified_lang, response=response + identified_language=identified_lang, + identified_script=identified_script, + response=response, ) return query_refined, processed_response def _process_identified_language_response( - *, identified_language: IdentifiedLanguage, response: QueryResponse + *, + identified_language: IdentifiedLanguage, + identified_script: IdentifiedScript, + response: QueryResponse, ) -> QueryResponse | QueryResponseError: """Process the identified language and return the response. @@ -147,6 +154,8 @@ def _process_identified_language_response( ---------- identified_language The identified language. + identified_script + The identified script. response The response object. @@ -157,20 +166,31 @@ def _process_identified_language_response( """ supported_languages_list = IdentifiedLanguage.get_supported_languages() + supported_scripts_list = IdentifiedScript.get_supported_scripts() - if identified_language in supported_languages_list: + if ( + identified_language in supported_languages_list + and identified_script in supported_scripts_list + ): return response supported_languages = ", ".join(supported_languages_list) + supported_scripts = ", ".join(supported_scripts_list) - match identified_language: - case IdentifiedLanguage.UNINTELLIGIBLE: + if identified_language == IdentifiedLanguage.UNINTELLIGIBLE: + error_message = ( + "Unintelligible input. " + + f"The following languages are supported: {supported_languages}." + ) + error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT + else: + if identified_script == IdentifiedScript.UNKNOWN: error_message = ( - "Unintelligible input. " - + f"The following languages are supported: {supported_languages}." + "Unsupported script. " + + f"Only the following scripts are supported: {supported_scripts}" ) - error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT - case _: + error_type = ErrorType.UNSUPPORTED_SCRIPT + else: error_message = ( "Unsupported language. Only the following languages " + f"are supported: {supported_languages}." @@ -190,7 +210,7 @@ def _process_identified_language_response( error_response.debug_info.update(response.debug_info) logger.info( - f"LANGUAGE IDENTIFICATION FAILED due to {identified_language.value} " + f"LANGUAGE IDENTIFICATION FAILED due to {error_message} " f"language on query id: {str(response.query_id)}" ) diff --git a/core_backend/app/question_answer/routers.py b/core_backend/app/question_answer/routers.py index 9d301cdc4..28f5e16aa 100644 --- a/core_backend/app/question_answer/routers.py +++ b/core_backend/app/question_answer/routers.py @@ -843,10 +843,6 @@ async def get_user_query_and_response( query_text_original=user_query.query_text, workspace_id=workspace_id, ) - if user_query_refined.chat_query_params: - user_query_refined.query_text = user_query_refined.chat_query_params.pop( - "search_query" - ) # Prepare the placeholder response object. response_template = QueryResponse( diff --git a/core_backend/app/question_answer/schemas.py b/core_backend/app/question_answer/schemas.py index bda58ce7a..c434b28ee 100644 --- a/core_backend/app/question_answer/schemas.py +++ b/core_backend/app/question_answer/schemas.py @@ -23,6 +23,7 @@ class ErrorType(str, Enum): UNABLE_TO_TRANSLATE = "unable_to_translate" UNINTELLIGIBLE_INPUT = "unintelligible_input" UNSUPPORTED_LANGUAGE = "unsupported_language" + UNSUPPORTED_SCRIPT = "unsupported_script" class QueryBase(BaseModel): From 2cba258051be3a32f6c55f0e9ebab6336daf3fbd Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 11:01:00 +0530 Subject: [PATCH 05/18] remove search query during init chat history --- core_backend/app/llm_call/llm_prompts.py | 4 +--- core_backend/app/question_answer/routers.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py index d82e3aed7..7977a923d 100644 --- a/core_backend/app/llm_call/llm_prompts.py +++ b/core_backend/app/llm_call/llm_prompts.py @@ -276,9 +276,7 @@ class ChatHistory: {{ "message_type": "The type of the user's LATEST MESSAGE. List of valid - options are: {valid_message_types}, - "query": "The vector database query that you have constructed based on - the user's LATEST MESSAGE and the conversation history." + options are: {valid_message_types}" }} Do NOT attempt to answer the user's question/concern. Only output the JSON diff --git a/core_backend/app/question_answer/routers.py b/core_backend/app/question_answer/routers.py index 28f5e16aa..5a4b057b8 100644 --- a/core_backend/app/question_answer/routers.py +++ b/core_backend/app/question_answer/routers.py @@ -1073,7 +1073,6 @@ async def init_user_query_and_chat_histories( "chat_params": chat_params, "message_type": search_query_json_response["message_type"], "redis_client": redis_client, - "search_query": search_query_json_response["query"], "session_id": session_id, } user_query.generate_llm_response = True From dbdeec459ff5cfc5557c12be9e1de0a684433a91 Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 11:36:12 +0530 Subject: [PATCH 06/18] fix tests and type --- core_backend/app/llm_call/llm_prompts.py | 1 - core_backend/tests/api/test_question_answer.py | 12 +++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py index 7977a923d..2766fabe6 100644 --- a/core_backend/app/llm_call/llm_prompts.py +++ b/core_backend/app/llm_call/llm_prompts.py @@ -291,7 +291,6 @@ class ChatHistoryConstructSearchQuery(BaseModel): """Pydantic model for the output of the construct search query chat history.""" message_type: Literal["FOLLOW-UP", "NEW"] - query: str @staticmethod def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str]: diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py index 50299a7b4..936bf41b5 100644 --- a/core_backend/tests/api/test_question_answer.py +++ b/core_backend/tests/api/test_question_answer.py @@ -1053,11 +1053,13 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined: [ ("ENGLISH", "Latin", False, None), ("HINDI", "Devanagari", False, None), - ("UNINTELLIGIBLE", True, ErrorType.UNINTELLIGIBLE_INPUT), - ("GIBBERISH", True, ErrorType.UNSUPPORTED_LANGUAGE), - ("UNSUPPORTED", True, ErrorType.UNSUPPORTED_LANGUAGE), - ("SOME_UNSUPPORTED_LANG", True, ErrorType.UNSUPPORTED_LANGUAGE), - ("don't kow", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("UNINTELLIGIBLE", "Latin", True, ErrorType.UNINTELLIGIBLE_INPUT), + ("UNINTELLIGIBLE", "Unknown", True, ErrorType.UNSUPPORTED_SCRIPT), + ("GIBBERISH", "Unknwon", True, ErrorType.UNSUPPORTED_SCRIPT), + ("GIBBERISH", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("UNSUPPORTED", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("SOME_UNSUPPORTED_LANG", "Unknown", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("don't kow", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE), ], ) async def test_language_identify_error( From 09fdd6e8b876ad01f637548c64e77210944bb30b Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 15:08:01 +0530 Subject: [PATCH 07/18] change schema and add validator --- core_backend/app/llm_call/llm_prompts.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py index 2766fabe6..61cef0490 100644 --- a/core_backend/app/llm_call/llm_prompts.py +++ b/core_backend/app/llm_call/llm_prompts.py @@ -7,7 +7,7 @@ from enum import Enum from typing import ClassVar, Literal -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, field_validator from .utils import format_prompt, remove_json_markdown @@ -453,8 +453,22 @@ def get_supported_scripts(cls) -> list[str]: class LanguageIdentificationResponse(BaseModel): """Pydantic model for the language identification response.""" - language: IdentifiedLanguage - script: IdentifiedScript + language: str + script: str + + @field_validator("language") + def validate_language(cls, value: str) -> str: + """Make sure language input is a valid IdentifiedLanguage""" + if value not in IdentifiedLanguage._member_names_: + raise ValueError(f"Invalid language: {value}") + return value + + @field_validator("script") + def validate_script(cls, value: str) -> str: + """Make sure script input is a valid IdentifiedScript""" + if value not in IdentifiedScript._member_names_: + raise ValueError(f"Invalid script: {value}") + return value model_config = ConfigDict(strict=True) From ba013a57aed365f4a2d58e36855c7f0f4c5e02ad Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 15:35:58 +0530 Subject: [PATCH 08/18] fix test return mock value --- .secrets.baseline | 6 +++--- core_backend/app/llm_call/process_input.py | 2 +- core_backend/tests/api/test_chat.py | 1 - core_backend/tests/api/test_question_answer.py | 7 +++---- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 2ba6baa9b..30aef52a0 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -448,14 +448,14 @@ "filename": "core_backend/tests/api/test_question_answer.py", "hashed_secret": "1d2be5ef28a76e2207456e7eceabe1219305e43d", "is_verified": false, - "line_number": 419 + "line_number": 418 }, { "type": "Secret Keyword", "filename": "core_backend/tests/api/test_question_answer.py", "hashed_secret": "6367c48dd193d56ea7b0baad25b19455e529f5ee", "is_verified": false, - "line_number": 1019 + "line_number": 1018 } ], "core_backend/tests/api/test_user_tools.py": [ @@ -581,5 +581,5 @@ } ] }, - "generated_at": "2025-04-09T08:32:56Z" + "generated_at": "2025-04-10T10:05:42Z" } diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index 80b462f36..d603980a4 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -117,8 +117,8 @@ async def _identify_language( user_message=query_refined.query_text, ) + cleaned_json_str = remove_json_markdown(text=json_str) try: - cleaned_json_str = remove_json_markdown(text=json_str) lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str) identified_lang = lang_info.language identified_script = lang_info.script diff --git a/core_backend/tests/api/test_chat.py b/core_backend/tests/api/test_chat.py index ed2f35f5e..d32cb3436 100644 --- a/core_backend/tests/api/test_chat.py +++ b/core_backend/tests/api/test_chat.py @@ -85,7 +85,6 @@ async def test_init_user_query_and_chat_histories(redis_client: aioredis.Redis) chat_query_params["chat_cache_key"] == f"chatCache:{user_query.session_id}" ) assert chat_query_params["message_type"] == "NEW" - assert chat_query_params["search_query"] == "stomachache and possible remedies" async def test__ask_llm_async() -> None: diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py index 936bf41b5..38fa74ddc 100644 --- a/core_backend/tests/api/test_question_answer.py +++ b/core_backend/tests/api/test_question_answer.py @@ -13,7 +13,6 @@ from core_backend.app.llm_call.llm_prompts import ( AlignmentScore, IdentifiedLanguage, - LanguageIdentificationResponse, ) from core_backend.app.llm_call.process_input import ( _classify_safety, @@ -1115,9 +1114,9 @@ async def mock_ask_llm( # pylint: disable=W0613 The identified language and script model json string. """ - return LanguageIdentificationResponse( - language=identified_lang_str, script=identified_script_str - ).model_dump_json() + return f""" + {{"language": "{identified_lang_str}", "script": "{identified_script_str}"}} + """.strip() monkeypatch.setattr( "core_backend.app.llm_call.process_input._ask_llm_async", mock_ask_llm From 7ed12705b693cb6ae00bf7a7a05e2db346c7cadd Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 15:43:29 +0530 Subject: [PATCH 09/18] use enum not string --- core_backend/app/llm_call/process_input.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index d603980a4..06d0847a3 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -120,8 +120,8 @@ async def _identify_language( cleaned_json_str = remove_json_markdown(text=json_str) try: lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str) - identified_lang = lang_info.language - identified_script = lang_info.script + identified_lang = getattr(IdentifiedLanguage, lang_info.language) + identified_script = getattr(IdentifiedScript, lang_info.script) except ValidationError: identified_lang = IdentifiedLanguage.UNSUPPORTED identified_script = IdentifiedScript.LATIN From e8cbf8a9f70fe66dfc4d1af85b85aa8efbd5336b Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 18:07:36 +0530 Subject: [PATCH 10/18] fix type issues --- core_backend/app/llm_call/llm_prompts.py | 151 +++++++++------------ core_backend/app/llm_call/llm_rag.py | 10 +- core_backend/app/llm_call/process_input.py | 4 +- 3 files changed, 73 insertions(+), 92 deletions(-) diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py index 61cef0490..92b63a125 100644 --- a/core_backend/app/llm_call/llm_prompts.py +++ b/core_backend/app/llm_call/llm_prompts.py @@ -183,20 +183,20 @@ class AlignmentScore(BaseModel): CHAT_RESPONSE_PROMPT = """\ -You are an AI assistant designed to help users with their -questions/concerns. You interact with users via a chat interface. You will -be provided with ADDITIONAL RELEVANT INFORMATION that can address the +You are an AI assistant designed to help users with their \ +questions/concerns. You interact with users via a chat interface. You will \ +be provided with ADDITIONAL RELEVANT INFORMATION that can address the \ user's questions/concerns. BEFORE answering the user's LATEST MESSAGE, follow these steps: -1. Review the conversation history to ensure that you understand the +1. Review the conversation history to ensure that you understand the \ context in which the user's LATEST MESSAGE is being asked. -2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you -understand the most useful information related to the user's LATEST +2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you \ +understand the most useful information related to the user's LATEST \ MESSAGE. -When you have completed the above steps, you will then write a JSON, whose +When you have completed the above steps, you will then write a JSON, whose \ TypeScript Interface is given below: interface Response {{ @@ -204,41 +204,33 @@ class AlignmentScore(BaseModel): answer: string; }} -For "extracted_info", extract from the provided ADDITIONAL RELEVANT -INFORMATION the most useful information related to the LATEST MESSAGE asked -by the user, and list them one by one. If no useful information is found, +For "extracted_info", extract from the provided ADDITIONAL RELEVANT \ +INFORMATION the most useful information related to the LATEST MESSAGE asked \ +by the user, and list them one by one. If no useful information is found, \ return an empty list. -For "answer", understand the conversation history, ADDITIONAL RELEVANT -INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to -the user's LATEST MESSAGE. If no useful information was found in the -either the conversation history or the ADDITIONAL RELEVANT INFORMATION, +For "answer", understand the conversation history, ADDITIONAL RELEVANT \ +INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to \ +the user's LATEST MESSAGE. If no useful information was found in the \ +either the conversation history or the ADDITIONAL RELEVANT INFORMATION, \ respond with {failure_message}. EXAMPLE RESPONSES: -{{"extracted_info": [ - "Pineapples are a blend of pinecones and apples.", - "Pineapples have the shape of a pinecone." - ], - "answer": "The 'pine-' from pineapples likely come from the fact that - pineapples are a hybrid of pinecones and apples and its pinecone-like - shape." -}} +{{"extracted_info": ["Pineapples are a blend of pinecones and apples.", \ +"Pineapples have the shape of a pinecone."], \ +"answer": "The 'pine-' from pineapples likely come from the fact that \ +pineapples are a hybrid of pinecones and apples and its pinecone-like \ +shape."}} {{"extracted_info": [], "answer": "{failure_message}"}} IMPORTANT NOTES ON THE "answer" FIELD: - Keep in mind that the user is asking a {message_type} question. - Answer in the language {original_language} in the script {original_script}. - Answer should be concise and to the point. -- Do not include any information that is not present in the ADDITIONAL +- Do not include any information that is not present in the ADDITIONAL \ RELEVANT INFORMATION. -Only output the JSON response, without any additional text. - - - -{additional_info} - +Only output the JSON response, without any additional text.\ """ @@ -343,19 +335,17 @@ class IdentifiedLanguage(str, Enum): SWAHILI = "SWAHILI" UNINTELLIGIBLE = "UNINTELLIGIBLE" UNSUPPORTED = "UNSUPPORTED" + # XHOSA = "XHOSA" # ZULU = "ZULU" - @classmethod def get_supported_languages(cls) -> list[str]: """Return a list of supported languages. - Returns ------- list[str] A list of supported languages. """ - return [ lang for lang in cls._member_names_ @@ -380,57 +370,53 @@ def _missing_(cls, value: str) -> IdentifiedLanguage: # type: ignore[override] return cls.UNSUPPORTED - @classmethod - def get_prompt(cls) -> str: - """Return the prompt for the language identification bot. - - Returns - ------- - str - The prompt for the language identification bot. - """ - - return LANGUAGE_ID_PROMPT.format(member_names=cls._member_names_).strip() - class IdentifiedScript(str, Enum): """Script used in the user's input.""" - LATIN = "Latin" - DEVANAGARI = "Devanagari" - BENGALI = "Bengali" - TAMIL = "Tamil" - TELUGU = "Telugu" - KANNADA = "Kannada" - MALAYALAM = "Malayalam" - GUJARATI = "Gujarati" - GURMUKHI = "Gurmukhi" - ORIYA = "Oriya" - # SINHALA = "Sinhala" - # MYANMAR = "Myanmar" - # ETHIOPIC = "Ethiopic" - # GEORGIAN = "Georgian" - # ARMENIAN = "Armenian" - # HEBREW = "Hebrew" - # GREEK = "Greek" - # TIBETAN = "Tibetan" - # MONGOLIAN = "Mongolian" - # KHMER = "Khmer" - # LAO = "Lao" - # VIETNAMESE = "Vietnamese" - # THAI_LAO = "Thai-Lao" - UNKNOWN = "Unknown" + LATIN = "LATIN" + DEVANAGARI = "DEVANAGARI" + BENGALI = "BENGALI" + TAMIL = "TAMIL" + TELUGU = "TELUGU" + KANNADA = "KANNADA" + MALAYALAM = "MALAYALAM" + GUJARATI = "GUJARATI" + # GURMUKHI = "GURMUKHI" + # ORIYA = "ORIYA" + # SINHALA = "SINHALA" + # MYANMAR = "MYANMAR" + # ETHIOPIC = "ETHIOPIC" + # GEORGIAN = "GEORGIAN" + # ARMENIAN = "ARMENIAN" + # HEBREW = "HEBREW" + # GREEK = "GREEK" + # TIBETAN = "TIBETAN" + # MONGOLIAN = "MONGOLIAN" + # KHMER = "KHMER" + # LAO = "LAO" + # VIETNAMESE = "VIETNAMESE" + # THAI_LAO = "THAI_LAO" + UNKNOWN = "UNKNOWN" + + @classmethod + def get_supported_scripts(cls) -> list[str]: + """Return a list of supported scripts. + Returns + ------- + list[str] + A list of supported scripts. + """ + return [script for script in cls._member_names_ if script != "UNKNOWN"] @classmethod def _missing_(cls, value: str) -> IdentifiedScript: # type: ignore[override] """If script identified is not one of the supported scripts, it is classified as UNKNOWN. - Parameters ---------- value The script identified. - Returns ------- Script @@ -438,17 +424,6 @@ def _missing_(cls, value: str) -> IdentifiedScript: # type: ignore[override] """ return cls.UNKNOWN - @classmethod - def get_supported_scripts(cls) -> list[str]: - """Return a list of supported scripts. - - Returns - ------- - list[str] - A list of supported scripts. - """ - return [script.value for script in cls if script != cls.UNKNOWN] - class LanguageIdentificationResponse(BaseModel): """Pydantic model for the language identification response.""" @@ -490,13 +465,13 @@ def validate_script(cls, value: str) -> str: 4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin""" + """ Examples: -"How many beds are there?" -> {{"language": "ENGLISH", "script": "Latin"}} -"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "Latin"}} -"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "Devanagari"}} -"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "Latin"}} -"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "Latin"}} -"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "Latin"}} -"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "Arabic"}} +"How many beds are there?" -> {{"language": "ENGLISH", "script": "LATIN"}} +"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "LATIN"}} +"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "DEVANAGARI"}} +"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "LATIN"}} +"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "LATIN"}} +"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "LATIN"}} +"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "ARABIC"}} Respond with a JSON object containing "language" and "script" keys. """ diff --git a/core_backend/app/llm_call/llm_rag.py b/core_backend/app/llm_call/llm_rag.py index d2fb045d8..d7a1dea12 100644 --- a/core_backend/app/llm_call/llm_rag.py +++ b/core_backend/app/llm_call/llm_rag.py @@ -18,6 +18,7 @@ from .utils import ( _ask_llm_async, append_messages_to_chat_history, + format_prompt, get_chat_response, remove_json_markdown, ) @@ -127,14 +128,19 @@ async def get_llm_rag_answer_with_chat_history( message_type=message_type, original_language=original_language, original_script=original_script, - additional_info=context, ) ) + user_message_with_context = format_prompt( + prompt=f"""{question}\n\n + + {context} + """ + ) content = await get_chat_response( chat_history=chat_history, chat_params=chat_params, - message_params=question, + message_params=user_message_with_context, session_id=session_id, json_=True, metadata=metadata or {}, diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index 06d0847a3..e73cc24d6 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -120,8 +120,8 @@ async def _identify_language( cleaned_json_str = remove_json_markdown(text=json_str) try: lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str) - identified_lang = getattr(IdentifiedLanguage, lang_info.language) - identified_script = getattr(IdentifiedScript, lang_info.script) + identified_lang = getattr(IdentifiedLanguage, lang_info.language.upper()) + identified_script = getattr(IdentifiedScript, lang_info.script.upper()) except ValidationError: identified_lang = IdentifiedLanguage.UNSUPPORTED identified_script = IdentifiedScript.LATIN From 730a8ccbcbae28f11f6b7317247f8c43bb4ef7aa Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 18:10:15 +0530 Subject: [PATCH 11/18] fix tests and always run paraphrase guardrail --- core_backend/app/llm_call/process_input.py | 7 +++---- core_backend/tests/api/test_question_answer.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index e73cc24d6..c71371908 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -493,10 +493,9 @@ async def wrapper( The appropriate response object. """ - if not query_refined.chat_query_params: - query_refined, response = await _paraphrase_question( - query_refined=query_refined, response=response - ) + query_refined, response = await _paraphrase_question( + query_refined=query_refined, response=response + ) response = await func(query_refined, response, *args, **kwargs) return response diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py index 38fa74ddc..aa8648c5e 100644 --- a/core_backend/tests/api/test_question_answer.py +++ b/core_backend/tests/api/test_question_answer.py @@ -1050,15 +1050,15 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined: @pytest.mark.parametrize( "identified_lang_str,identified_script_str,should_error,expected_error_type", [ - ("ENGLISH", "Latin", False, None), - ("HINDI", "Devanagari", False, None), - ("UNINTELLIGIBLE", "Latin", True, ErrorType.UNINTELLIGIBLE_INPUT), - ("UNINTELLIGIBLE", "Unknown", True, ErrorType.UNSUPPORTED_SCRIPT), - ("GIBBERISH", "Unknwon", True, ErrorType.UNSUPPORTED_SCRIPT), - ("GIBBERISH", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE), - ("UNSUPPORTED", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE), - ("SOME_UNSUPPORTED_LANG", "Unknown", True, ErrorType.UNSUPPORTED_LANGUAGE), - ("don't kow", "Latin", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("ENGLISH", "LATIN", False, None), + ("HINDI", "DEVANAGARI", False, None), + ("UNINTELLIGIBLE", "LATIN", True, ErrorType.UNINTELLIGIBLE_INPUT), + ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT), + ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT), + ("GIBBERISH", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("UNSUPPORTED", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("SOME_UNSUPPORTED_LANG", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("don't kow", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE), ], ) async def test_language_identify_error( From 8b7e5a4cbfabbd94ddcd991e79f6f455f2d39a7a Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 18:14:47 +0530 Subject: [PATCH 12/18] use uppercase --- .../rails/data/language_identification.yaml | 24 +++++++++---------- .../rails/test_language_identification.py | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/core_backend/tests/rails/data/language_identification.yaml b/core_backend/tests/rails/data/language_identification.yaml index 7daa61476..4b28c20e2 100644 --- a/core_backend/tests/rails/data/language_identification.yaml +++ b/core_backend/tests/rails/data/language_identification.yaml @@ -2,7 +2,7 @@ # improve this with a native speaker. These might be too "pure". HAUSA: - Latin: + LATIN: - Ina da yara biyu masu hanci - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa? @@ -13,59 +13,59 @@ HAUSA: - Ina kwana Maman mu - Wannan shago na ne ENGLISH: - Latin: + LATIN: - I have two children. You see I girl, what is the probability the other is also a girl? - No idea - Why you say that? XHOSA: - Latin: + LATIN: - Umama ngugqirha - Utata ngumongikazi - Ukuba intamo yam yayifuna ukwenza oko? - Iintsana zikhala kakhulu, huh? YORUBA: #h/t: Fola - Latin: + LATIN: - Ni bo ló ti ri owo yen? - Eyin melo ni e wa ni be? - Ki ni itumo oruko ẹ? - Ki ni o jẹ lánà? - Omo Ibadan ni mi IGBO: #h/t: Fola - Latin: + LATIN: - agụụ na-agụ m - agam aga ahia echi - ị hụla ngozi? ana m achọ ya. - m na-aga ọrụ KOREAN: - Korean: + KOREAN: - 애가 둘이예요 - ㅋㅋㅋㅋㅋㅋ - 아이들이 많이 울어요ㅠ - 이 프로젝트 애칭은 ask-a-question이야. ZULU: - Latin: + LATIN: - Ngingumama - Ingabe uyi-bot noma ungumuntu? - Ngifuna ukwenza lokhu? - Izingane zikhala kakhulu, hhe AFRIKAANS: - Latin: + LATIN: - Ek het hierdie goddelose dal gemaak - Is covid nog 'n ding? - My hond het my huiswerk geëet - Het jy al gebraaide roomys probeer? HINDI: #h/t: Sid - Latin: + LATIN: - is ka matlab kya hai? - kabhi kabhi mere dil mein - Devanagari: + DEVANAGARI: - अंत में सभी लोग नाश्ता करने जाएं - गब्बर सिंह कह के गया जो डर गया वो मर गया MARATHI: - Latin: + LATIN: - Portal chi link aahe UNINTELLIGIBLE: - Unknown: + UNKNOWN: - sdfsdf sss dyhnel jjj - hs dsfsg xd ewwo ddfs - Heghlu'meH QaQ jajvam diff --git a/core_backend/tests/rails/test_language_identification.py b/core_backend/tests/rails/test_language_identification.py index b7c30f46f..6744d8216 100644 --- a/core_backend/tests/rails/test_language_identification.py +++ b/core_backend/tests/rails/test_language_identification.py @@ -77,7 +77,7 @@ async def test_language_identification( expected_language = "UNSUPPORTED" if expected_script not in available_scripts: - expected_script = "Unknown" + expected_script = "UNKNOWN" _, response = await _identify_language(query_refined=question, response=response) From 15e083fafd0a680ce6094d03c03aaedcee1e652f Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 18:35:30 +0530 Subject: [PATCH 13/18] fix tests and how we get enum --- core_backend/app/llm_call/process_input.py | 4 ++-- core_backend/tests/api/test_question_answer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index c71371908..4ae2eb368 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -120,8 +120,8 @@ async def _identify_language( cleaned_json_str = remove_json_markdown(text=json_str) try: lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str) - identified_lang = getattr(IdentifiedLanguage, lang_info.language.upper()) - identified_script = getattr(IdentifiedScript, lang_info.script.upper()) + identified_lang = IdentifiedLanguage(lang_info.language.upper()) + identified_script = IdentifiedScript(lang_info.script.upper()) except ValidationError: identified_lang = IdentifiedLanguage.UNSUPPORTED identified_script = IdentifiedScript.LATIN diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py index aa8648c5e..0894ccbde 100644 --- a/core_backend/tests/api/test_question_answer.py +++ b/core_backend/tests/api/test_question_answer.py @@ -1053,7 +1053,7 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined: ("ENGLISH", "LATIN", False, None), ("HINDI", "DEVANAGARI", False, None), ("UNINTELLIGIBLE", "LATIN", True, ErrorType.UNINTELLIGIBLE_INPUT), - ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT), + ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNINTELLIGIBLE_INPUT), ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT), ("GIBBERISH", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE), ("UNSUPPORTED", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE), From a4136be621f058ada3b1e15e71f5a0827dad82d0 Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 18:48:49 +0530 Subject: [PATCH 14/18] add test cases --- core_backend/app/llm_call/process_input.py | 1 + core_backend/tests/api/test_question_answer.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index 4ae2eb368..a7a3d7a21 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -184,6 +184,7 @@ def _process_identified_language_response( ) error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT else: + # TODO: create types for language x script combos if identified_script == IdentifiedScript.UNKNOWN: error_message = ( "Unsupported script. " diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py index 0894ccbde..43e9b7ef5 100644 --- a/core_backend/tests/api/test_question_answer.py +++ b/core_backend/tests/api/test_question_answer.py @@ -1054,7 +1054,9 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined: ("HINDI", "DEVANAGARI", False, None), ("UNINTELLIGIBLE", "LATIN", True, ErrorType.UNINTELLIGIBLE_INPUT), ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNINTELLIGIBLE_INPUT), - ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT), + ("ENGLISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT), + ("ENGLISH", "Some unsupported script", True, ErrorType.UNSUPPORTED_SCRIPT), + ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE), ("GIBBERISH", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE), ("UNSUPPORTED", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE), ("SOME_UNSUPPORTED_LANG", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE), From e951d0acdc17bb8fb2a356636db956541ef4f0e4 Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 19:06:54 +0530 Subject: [PATCH 15/18] clean up error logic --- core_backend/app/llm_call/process_input.py | 35 ++++++++++------------ 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index a7a3d7a21..a84d1e798 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -167,34 +167,31 @@ def _process_identified_language_response( supported_languages_list = IdentifiedLanguage.get_supported_languages() supported_scripts_list = IdentifiedScript.get_supported_scripts() + supported_languages_str = ", ".join(supported_languages_list) + suported_scripts_str = ", ".join(supported_scripts_list) - if ( - identified_language in supported_languages_list - and identified_script in supported_scripts_list - ): - return response + language_ok = identified_language in supported_languages_list + script_ok = identified_script in supported_scripts_list - supported_languages = ", ".join(supported_languages_list) - supported_scripts = ", ".join(supported_scripts_list) - - if identified_language == IdentifiedLanguage.UNINTELLIGIBLE: + if language_ok and script_ok: + return response + elif language_ok and not script_ok: error_message = ( - "Unintelligible input. " - + f"The following languages are supported: {supported_languages}." + "Unsupported script. " + + f"Only the following scripts are supported: {suported_scripts_str}" ) - error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT - else: - # TODO: create types for language x script combos - if identified_script == IdentifiedScript.UNKNOWN: + error_type: ErrorType = ErrorType.UNSUPPORTED_SCRIPT + else: # regardless of script, language is not "ok" + if identified_language == IdentifiedLanguage.UNINTELLIGIBLE: error_message = ( - "Unsupported script. " - + f"Only the following scripts are supported: {supported_scripts}" + "Unintelligible input. " + + f"The following languages are supported: {supported_languages_str}." ) - error_type = ErrorType.UNSUPPORTED_SCRIPT + error_type = ErrorType.UNINTELLIGIBLE_INPUT else: error_message = ( "Unsupported language. Only the following languages " - + f"are supported: {supported_languages}." + + f"are supported: {supported_languages_str}." ) error_type = ErrorType.UNSUPPORTED_LANGUAGE From c8117de28461fa5c586c9d08328b6cacead2f559 Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 19:14:57 +0530 Subject: [PATCH 16/18] fix conftes --- .secrets.baseline | 53 +----------------------------- core_backend/tests/api/conftest.py | 3 ++ 2 files changed, 4 insertions(+), 52 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 30aef52a0..f961ef821 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -348,57 +348,6 @@ "line_number": 15 } ], - "core_backend/tests/api/conftest.py": [ - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "407c6798fe20fd5d75de4a233c156cc0fce510e3", - "is_verified": false, - "line_number": 46 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "42553e798bc193bcf25368b5e53ec7cd771483a7", - "is_verified": false, - "line_number": 47 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "9fb7fe1217aed442b04c0f5e43b5d5a7d3287097", - "is_verified": false, - "line_number": 50 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "767ef7376d44bb6e52b390ddcd12c1cb1b3902a4", - "is_verified": false, - "line_number": 51 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "70240b5d0947cc97447de496284791c12b2e678a", - "is_verified": false, - "line_number": 56 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "80fea3e25cb7e28550d13af9dfda7a9bd08c1a78", - "is_verified": false, - "line_number": 57 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "3465834d516797458465ae4ed2c62e7020032c4e", - "is_verified": false, - "line_number": 317 - } - ], "core_backend/tests/api/test.env": [ { "type": "Secret Keyword", @@ -581,5 +530,5 @@ } ] }, - "generated_at": "2025-04-10T10:05:42Z" + "generated_at": "2025-04-10T13:44:48Z" } diff --git a/core_backend/tests/api/conftest.py b/core_backend/tests/api/conftest.py index ca0a53a0b..d0e8b15bc 100644 --- a/core_backend/tests/api/conftest.py +++ b/core_backend/tests/api/conftest.py @@ -35,6 +35,7 @@ RAG, AlignmentScore, IdentifiedLanguage, + IdentifiedScript, ) from core_backend.app.question_answer.models import ( ContentFeedbackDB, @@ -1703,7 +1704,9 @@ async def mock_identify_language( """ query_refined.original_language = IdentifiedLanguage.ENGLISH + query_refined.original_script = IdentifiedScript.LATIN response.debug_info["original_language"] = "ENGLISH" + response.debug_info["original_script"] = "LATIN" return query_refined, response From a81a3816ed477693fb7da49f05d7f55c4cb3537a Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Thu, 10 Apr 2025 19:37:18 +0530 Subject: [PATCH 17/18] fix logic --- core_backend/app/llm_call/llm_prompts.py | 20 +++----------------- core_backend/app/llm_call/process_input.py | 7 ++++--- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py index 92b63a125..f3603deb2 100644 --- a/core_backend/app/llm_call/llm_prompts.py +++ b/core_backend/app/llm_call/llm_prompts.py @@ -7,7 +7,7 @@ from enum import Enum from typing import ClassVar, Literal -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, ConfigDict, Field from .utils import format_prompt, remove_json_markdown @@ -428,22 +428,8 @@ def _missing_(cls, value: str) -> IdentifiedScript: # type: ignore[override] class LanguageIdentificationResponse(BaseModel): """Pydantic model for the language identification response.""" - language: str - script: str - - @field_validator("language") - def validate_language(cls, value: str) -> str: - """Make sure language input is a valid IdentifiedLanguage""" - if value not in IdentifiedLanguage._member_names_: - raise ValueError(f"Invalid language: {value}") - return value - - @field_validator("script") - def validate_script(cls, value: str) -> str: - """Make sure script input is a valid IdentifiedScript""" - if value not in IdentifiedScript._member_names_: - raise ValueError(f"Invalid script: {value}") - return value + language: IdentifiedLanguage + script: IdentifiedScript model_config = ConfigDict(strict=True) diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index a84d1e798..b854d8ff0 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -167,12 +167,13 @@ def _process_identified_language_response( supported_languages_list = IdentifiedLanguage.get_supported_languages() supported_scripts_list = IdentifiedScript.get_supported_scripts() - supported_languages_str = ", ".join(supported_languages_list) - suported_scripts_str = ", ".join(supported_scripts_list) language_ok = identified_language in supported_languages_list script_ok = identified_script in supported_scripts_list + supported_languages_str = ", ".join(supported_languages_list) + suported_scripts_str = ", ".join(supported_scripts_list) + if language_ok and script_ok: return response elif language_ok and not script_ok: @@ -209,7 +210,7 @@ def _process_identified_language_response( logger.info( f"LANGUAGE IDENTIFICATION FAILED due to {error_message} " - f"language on query id: {str(response.query_id)}" + f"on query id: {str(response.query_id)}" ) return error_response From 40187b86fdd66d0579bba7b8f63407217cf30885 Mon Sep 17 00:00:00 2001 From: Suzin <7042047+suzinyou@users.noreply.github.com> Date: Fri, 11 Apr 2025 17:15:42 +0530 Subject: [PATCH 18/18] Add query optimization back in, but don't run translation for chat queries --- core_backend/app/llm_call/llm_prompts.py | 5 ++++- core_backend/app/llm_call/process_input.py | 18 +++++++++++------- core_backend/app/question_answer/routers.py | 8 ++++++++ core_backend/app/question_answer/utils.py | 4 +++- core_backend/tests/api/test_chat.py | 1 + 5 files changed, 27 insertions(+), 9 deletions(-) diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py index f3603deb2..db7a154bd 100644 --- a/core_backend/app/llm_call/llm_prompts.py +++ b/core_backend/app/llm_call/llm_prompts.py @@ -268,7 +268,9 @@ class ChatHistory: {{ "message_type": "The type of the user's LATEST MESSAGE. List of valid - options are: {valid_message_types}" + options are: {valid_message_types}", + "query": "The vector database query that you have constructed based on + the user's LATEST MESSAGE and the conversation history." }} Do NOT attempt to answer the user's question/concern. Only output the JSON @@ -283,6 +285,7 @@ class ChatHistoryConstructSearchQuery(BaseModel): """Pydantic model for the output of the construct search query chat history.""" message_type: Literal["FOLLOW-UP", "NEW"] + query: str @staticmethod def parse_json(*, chat_type: Literal["search"], json_str: str) -> dict[str, str]: diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index b854d8ff0..c6da6a5b4 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -114,7 +114,8 @@ async def _identify_language( litellm_model=LITELLM_MODEL_LANGUAGE_DETECT, metadata=metadata, system_message=LANGUAGE_ID_PROMPT, - user_message=query_refined.query_text, + # Always use the original query text for language and script detection + user_message=query_refined.query_text_original, ) cleaned_json_str = remove_json_markdown(text=json_str) @@ -256,9 +257,10 @@ async def wrapper( The appropriate response object. """ - query_refined, response = await _translate_question( - query_refined=query_refined, response=response - ) + if not query_refined.chat_query_params: + query_refined, response = await _translate_question( + query_refined=query_refined, response=response + ) response = await func(query_refined, response, *args, **kwargs) return response @@ -492,9 +494,11 @@ async def wrapper( The appropriate response object. """ - query_refined, response = await _paraphrase_question( - query_refined=query_refined, response=response - ) + if not query_refined.chat_query_params: + query_refined, response = await _paraphrase_question( + query_refined=query_refined, response=response + ) + response = await func(query_refined, response, *args, **kwargs) return response diff --git a/core_backend/app/question_answer/routers.py b/core_backend/app/question_answer/routers.py index 5a4b057b8..e6091edb1 100644 --- a/core_backend/app/question_answer/routers.py +++ b/core_backend/app/question_answer/routers.py @@ -844,6 +844,13 @@ async def get_user_query_and_response( workspace_id=workspace_id, ) + # In case of a chat query, use the optimized query as the base query_text. + # Note that for language identification, we use query_text_original. + if user_query_refined.chat_query_params: + user_query_refined.query_text = user_query_refined.chat_query_params.pop( + "search_query" + ) + # Prepare the placeholder response object. response_template = QueryResponse( debug_info={}, @@ -1072,6 +1079,7 @@ async def init_user_query_and_chat_histories( "chat_history": user_assistant_chat_history, "chat_params": chat_params, "message_type": search_query_json_response["message_type"], + "search_query": search_query_json_response["query"], "redis_client": redis_client, "session_id": session_id, } diff --git a/core_backend/app/question_answer/utils.py b/core_backend/app/question_answer/utils.py index 029d7194c..f972e46dc 100644 --- a/core_backend/app/question_answer/utils.py +++ b/core_backend/app/question_answer/utils.py @@ -23,6 +23,8 @@ def get_context_string_from_search_results( for key, result in search_results.items(): if not isinstance(result, QuerySearchResult): result = QuerySearchResult(**result) - context_list.append(f"{key}. {result.title}\n{result.text}") + context_list.append( + f" \n**{result.title}**\n\n{result.text}\n" + ) context_string = "\n\n".join(context_list) return context_string diff --git a/core_backend/tests/api/test_chat.py b/core_backend/tests/api/test_chat.py index d32cb3436..ed2f35f5e 100644 --- a/core_backend/tests/api/test_chat.py +++ b/core_backend/tests/api/test_chat.py @@ -85,6 +85,7 @@ async def test_init_user_query_and_chat_histories(redis_client: aioredis.Redis) chat_query_params["chat_cache_key"] == f"chatCache:{user_query.session_id}" ) assert chat_query_params["message_type"] == "NEW" + assert chat_query_params["search_query"] == "stomachache and possible remedies" async def test__ask_llm_async() -> None: