diff --git a/.secrets.baseline b/.secrets.baseline index 5cab9e8c1..f961ef821 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -348,57 +348,6 @@ "line_number": 15 } ], - "core_backend/tests/api/conftest.py": [ - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "407c6798fe20fd5d75de4a233c156cc0fce510e3", - "is_verified": false, - "line_number": 46 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "42553e798bc193bcf25368b5e53ec7cd771483a7", - "is_verified": false, - "line_number": 47 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "9fb7fe1217aed442b04c0f5e43b5d5a7d3287097", - "is_verified": false, - "line_number": 50 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "767ef7376d44bb6e52b390ddcd12c1cb1b3902a4", - "is_verified": false, - "line_number": 51 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "70240b5d0947cc97447de496284791c12b2e678a", - "is_verified": false, - "line_number": 56 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "80fea3e25cb7e28550d13af9dfda7a9bd08c1a78", - "is_verified": false, - "line_number": 57 - }, - { - "type": "Secret Keyword", - "filename": "core_backend/tests/api/conftest.py", - "hashed_secret": "3465834d516797458465ae4ed2c62e7020032c4e", - "is_verified": false, - "line_number": 317 - } - ], "core_backend/tests/api/test.env": [ { "type": "Secret Keyword", @@ -448,14 +397,14 @@ "filename": "core_backend/tests/api/test_question_answer.py", "hashed_secret": "1d2be5ef28a76e2207456e7eceabe1219305e43d", "is_verified": false, - "line_number": 294 + "line_number": 418 }, { "type": "Secret Keyword", "filename": "core_backend/tests/api/test_question_answer.py", "hashed_secret": 
"6367c48dd193d56ea7b0baad25b19455e529f5ee", "is_verified": false, - "line_number": 653 + "line_number": 1018 } ], "core_backend/tests/api/test_user_tools.py": [ @@ -473,7 +422,7 @@ "filename": "core_backend/tests/rails/test_language_identification.py", "hashed_secret": "051b2c1d98174fabc4749641c4f4f4660556441e", "is_verified": false, - "line_number": 48 + "line_number": 69 } ], "core_backend/tests/rails/test_paraphrasing.py": [ @@ -581,5 +530,5 @@ } ] }, - "generated_at": "2025-01-24T13:35:08Z" + "generated_at": "2025-04-10T13:44:48Z" } diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py index 2ede20f4c..db7a154bd 100644 --- a/core_backend/app/llm_call/llm_prompts.py +++ b/core_backend/app/llm_call/llm_prompts.py @@ -103,7 +103,7 @@ {context} IMPORTANT NOTES ON THE "answer" FIELD: -- Answer in the language of the question ({original_language}). +- Answer in the language {original_language} in the script {original_script}. - Answer should be concise, to the point, and no longer than 80 words. - Do not include any information that is not present in the REFERENCE TEXT. """ @@ -182,6 +182,58 @@ class AlignmentScore(BaseModel): model_config = ConfigDict(strict=True) +CHAT_RESPONSE_PROMPT = """\ +You are an AI assistant designed to help users with their \ +questions/concerns. You interact with users via a chat interface. You will \ +be provided with ADDITIONAL RELEVANT INFORMATION that can address the \ +user's questions/concerns. + +BEFORE answering the user's LATEST MESSAGE, follow these steps: + +1. Review the conversation history to ensure that you understand the \ +context in which the user's LATEST MESSAGE is being asked. +2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you \ +understand the most useful information related to the user's LATEST \ +MESSAGE. 
+ +When you have completed the above steps, you will then write a JSON, whose \ +TypeScript Interface is given below: + +interface Response {{ + extracted_info: string[]; + answer: string; +}} + +For "extracted_info", extract from the provided ADDITIONAL RELEVANT \ +INFORMATION the most useful information related to the LATEST MESSAGE asked \ +by the user, and list them one by one. If no useful information is found, \ +return an empty list. + +For "answer", understand the conversation history, ADDITIONAL RELEVANT \ +INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to \ +the user's LATEST MESSAGE. If no useful information was found in the \ +either the conversation history or the ADDITIONAL RELEVANT INFORMATION, \ +respond with {failure_message}. + +EXAMPLE RESPONSES: +{{"extracted_info": ["Pineapples are a blend of pinecones and apples.", \ +"Pineapples have the shape of a pinecone."], \ +"answer": "The 'pine-' from pineapples likely come from the fact that \ +pineapples are a hybrid of pinecones and apples and its pinecone-like \ +shape."}} +{{"extracted_info": [], "answer": "{failure_message}"}} + +IMPORTANT NOTES ON THE "answer" FIELD: +- Keep in mind that the user is asking a {message_type} question. +- Answer in the language {original_language} in the script {original_script}. +- Answer should be concise and to the point. +- Do not include any information that is not present in the ADDITIONAL \ +RELEVANT INFORMATION. + +Only output the JSON response, without any additional text.\ +""" + + class ChatHistory: """Contains the prompts and models for the chat history task.""" @@ -216,7 +268,7 @@ class ChatHistory: {{ "message_type": "The type of the user's LATEST MESSAGE. List of valid - options are: {valid_message_types}, + options are: {valid_message_types}", "query": "The vector database query that you have constructed based on the user's LATEST MESSAGE and the conversation history." 
}} @@ -227,62 +279,7 @@ class ChatHistory: ), prompt_kws={"valid_message_types": _valid_message_types}, ) - system_message_generate_response = format_prompt( - prompt=textwrap.dedent( - """You are an AI assistant designed to help users with their - questions/concerns. You interact with users via a chat interface. You will - be provided with ADDITIONAL RELEVANT INFORMATION that can address the - user's questions/concerns. - - BEFORE answering the user's LATEST MESSAGE, follow these steps: - - 1. Review the conversation history to ensure that you understand the - context in which the user's LATEST MESSAGE is being asked. - 2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you - understand the most useful information related to the user's LATEST - MESSAGE. - - When you have completed the above steps, you will then write a JSON, whose - TypeScript Interface is given below: - - interface Response {{ - extracted_info: string[]; - answer: string; - }} - - For "extracted_info", extract from the provided ADDITIONAL RELEVANT - INFORMATION the most useful information related to the LATEST MESSAGE asked - by the user, and list them one by one. If no useful information is found, - return an empty list. - - For "answer", understand the conversation history, ADDITIONAL RELEVANT - INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to - the user's LATEST MESSAGE. If no useful information was found in the - either the conversation history or the ADDITIONAL RELEVANT INFORMATION, - respond with {failure_message}. - - EXAMPLE RESPONSES: - {{"extracted_info": [ - "Pineapples are a blend of pinecones and apples.", - "Pineapples have the shape of a pinecone." - ], - "answer": "The 'pine-' from pineapples likely come from the fact that - pineapples are a hybrid of pinecones and apples and its pinecone-like - shape." 
- }} - {{"extracted_info": [], "answer": "{failure_message}"}} - - IMPORTANT NOTES ON THE "answer" FIELD: - - Keep in mind that the user is asking a {message_type} question. - - Answer in the language of the question ({original_language}). - - Answer should be concise and to the point. - - Do not include any information that is not present in the ADDITIONAL - RELEVANT INFORMATION. - - Only output the JSON response, without any additional text. - """ - ) - ) + system_message_generate_response = CHAT_RESPONSE_PROMPT class ChatHistoryConstructSearchQuery(BaseModel): """Pydantic model for the output of the construct search query chat history.""" @@ -337,22 +334,21 @@ class IdentifiedLanguage(str, Enum): ENGLISH = "ENGLISH" FRENCH = "FRENCH" HINDI = "HINDI" + MARATHI = "MARATHI" SWAHILI = "SWAHILI" UNINTELLIGIBLE = "UNINTELLIGIBLE" UNSUPPORTED = "UNSUPPORTED" + # XHOSA = "XHOSA" # ZULU = "ZULU" - @classmethod def get_supported_languages(cls) -> list[str]: """Return a list of supported languages. - Returns ------- list[str] A list of supported languages. """ - return [ lang for lang in cls._member_names_ @@ -377,31 +373,98 @@ def _missing_(cls, value: str) -> IdentifiedLanguage: # type: ignore[override] return cls.UNSUPPORTED + +class IdentifiedScript(str, Enum): + """Script used in the user's input.""" + + LATIN = "LATIN" + DEVANAGARI = "DEVANAGARI" + BENGALI = "BENGALI" + TAMIL = "TAMIL" + TELUGU = "TELUGU" + KANNADA = "KANNADA" + MALAYALAM = "MALAYALAM" + GUJARATI = "GUJARATI" + # GURMUKHI = "GURMUKHI" + # ORIYA = "ORIYA" + # SINHALA = "SINHALA" + # MYANMAR = "MYANMAR" + # ETHIOPIC = "ETHIOPIC" + # GEORGIAN = "GEORGIAN" + # ARMENIAN = "ARMENIAN" + # HEBREW = "HEBREW" + # GREEK = "GREEK" + # TIBETAN = "TIBETAN" + # MONGOLIAN = "MONGOLIAN" + # KHMER = "KHMER" + # LAO = "LAO" + # VIETNAMESE = "VIETNAMESE" + # THAI_LAO = "THAI_LAO" + UNKNOWN = "UNKNOWN" + @classmethod - def get_prompt(cls) -> str: - """Return the prompt for the language identification bot. 
+ def get_supported_scripts(cls) -> list[str]: + """Return a list of supported scripts. + Returns + ------- + list[str] + A list of supported scripts. + """ + return [script for script in cls._member_names_ if script != "UNKNOWN"] + @classmethod + def _missing_(cls, value: str) -> IdentifiedScript: # type: ignore[override] + """If script identified is not one of the supported scripts, it is + classified as UNKNOWN. + Parameters + ---------- + value + The script identified. Returns ------- - str - The prompt for the language identification bot. + Script + The identified script (i.e., UNKNOWN). """ + return cls.UNKNOWN - return textwrap.dedent( - f""" - You are a high-performing language identification bot that classifies the - language of the user input into one of {", ".join(cls._member_names_)}. - - If the user input is - 1. in one of the supported languages, then respond with that language. - 2. written in a mix of languages, then respond with the dominant language. - 3. in a real language but not a supported language, then respond with - UNSUPPORTED. - 4. unintelligible or gibberish, then respond with UNINTELLIGIBLE. - - Answer should be a single word and strictly one of - [{", ".join(cls._member_names_)}]""" - ).strip() + +class LanguageIdentificationResponse(BaseModel): + """Pydantic model for the language identification response.""" + + language: IdentifiedLanguage + script: IdentifiedScript + + model_config = ConfigDict(strict=True) + + +LANGUAGE_ID_PROMPT = ( + f"""\ +You are a high-performing language identification bot that classifies the \ +language and script of the user input. + +For each input, identify: +1. The language (must be one of {", ".join(IdentifiedLanguage._member_names_)}) +2. The script (must be one of {", ".join(IdentifiedScript._member_names_)}) + +If the user input is: +1. in one of the supported languages, respond with that language and its script +2. written in a mix of languages, respond with the dominant language and its script +3. 
in a real language but not a supported language, respond with UNSUPPORTED and \ +its script +4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin""" + + """ +Examples: +"How many beds are there?" -> {{"language": "ENGLISH", "script": "LATIN"}} +"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "LATIN"}} +"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "DEVANAGARI"}} +"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "LATIN"}} +"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "LATIN"}} +"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "LATIN"}} +"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "ARABIC"}} + +Respond with a JSON object containing "language" and "script" keys. +""" +) class RAG(BaseModel): diff --git a/core_backend/app/llm_call/llm_rag.py b/core_backend/app/llm_call/llm_rag.py index ab4431ade..d7a1dea12 100644 --- a/core_backend/app/llm_call/llm_rag.py +++ b/core_backend/app/llm_call/llm_rag.py @@ -8,10 +8,17 @@ from ..config import LITELLM_MODEL_GENERATION from ..utils import setup_logger -from .llm_prompts import RAG, RAG_FAILURE_MESSAGE, ChatHistory, IdentifiedLanguage +from .llm_prompts import ( + RAG, + RAG_FAILURE_MESSAGE, + ChatHistory, + IdentifiedLanguage, + IdentifiedScript, +) from .utils import ( + _ask_llm_async, append_messages_to_chat_history, + format_prompt, get_chat_response, remove_json_markdown, ) @@ -24,6 +31,7 @@ async def get_llm_rag_answer( context: str, metadata: dict | None = None, original_language: IdentifiedLanguage, + original_script: IdentifiedScript, question: str, ) -> RAG: """Get an answer from the LLM model using RAG. @@ -36,6 +44,8 @@ Additional metadata to provide to the LLM model. original_language The original language of the question. + original_script + The script in which the original question was written. question The question to ask the LLM model.
@@ -46,7 +56,11 @@ async def get_llm_rag_answer( """ metadata = metadata or {} - prompt = RAG.prompt.format(context=context, original_language=original_language) + prompt = RAG.prompt.format( + context=context, + original_language=original_language, + original_script=original_script, + ) result = await _ask_llm_async( json_=True, @@ -75,6 +89,7 @@ async def get_llm_rag_answer_with_chat_history( message_type: str, metadata: dict | None = None, original_language: IdentifiedLanguage, + original_script: IdentifiedScript, question: str, session_id: str, ) -> tuple[RAG, list[dict[str, str | None]]]: @@ -112,24 +127,20 @@ async def get_llm_rag_answer_with_chat_history( failure_message=RAG_FAILURE_MESSAGE, message_type=message_type, original_language=original_language, + original_script=original_script, ) ) - content = ( - question - + f""""\n\n - ADDITIONAL RELEVANT INFORMATION BELOW - ===================================== - {context} - - ADDITIONAL RELEVANT INFORMATION ABOVE - ===================================== - """ + user_message_with_context = format_prompt( + prompt=f"""{question}\n\n + + {context} + """ ) content = await get_chat_response( chat_history=chat_history, chat_params=chat_params, - message_params=content, + message_params=user_message_with_context, session_id=session_id, json_=True, metadata=metadata or {}, diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py index 9a30ffdeb..c6da6a5b4 100644 --- a/core_backend/app/llm_call/process_input.py +++ b/core_backend/app/llm_call/process_input.py @@ -3,6 +3,8 @@ from functools import wraps from typing import Any, Callable, Optional +from pydantic import ValidationError + from ..config import ( LITELLM_MODEL_LANGUAGE_DETECT, LITELLM_MODEL_PARAPHRASE, @@ -17,14 +19,17 @@ ) from ..utils import setup_logger from .llm_prompts import ( + LANGUAGE_ID_PROMPT, PARAPHRASE_FAILED_MESSAGE, PARAPHRASE_PROMPT, TRANSLATE_FAILED_MESSAGE, TRANSLATE_PROMPT, IdentifiedLanguage, + 
IdentifiedScript, + LanguageIdentificationResponse, SafetyClassification, ) -from .utils import _ask_llm_async +from .utils import _ask_llm_async, remove_json_markdown logger = setup_logger(name="INPUT RAILS") @@ -84,7 +89,7 @@ async def _identify_language( query_refined: QueryRefined, response: QueryResponse | QueryResponseError, ) -> tuple[QueryRefined, QueryResponse | QueryResponseError]: - """Identify the language of the question. + """Identify the language and script of the question. Parameters ---------- @@ -104,29 +109,45 @@ async def _identify_language( if isinstance(response, QueryResponseError): return query_refined, response - llm_identified_lang = await _ask_llm_async( + json_str = await _ask_llm_async( + json_=True, litellm_model=LITELLM_MODEL_LANGUAGE_DETECT, metadata=metadata, - system_message=IdentifiedLanguage.get_prompt(), - user_message=query_refined.query_text, + system_message=LANGUAGE_ID_PROMPT, + # Always use the original query text for language and script detection + user_message=query_refined.query_text_original, ) - identified_lang = getattr( - IdentifiedLanguage, llm_identified_lang, IdentifiedLanguage.UNSUPPORTED - ) + cleaned_json_str = remove_json_markdown(text=json_str) + try: + lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str) + identified_lang = IdentifiedLanguage(lang_info.language.upper()) + identified_script = IdentifiedScript(lang_info.script.upper()) + except ValidationError: + identified_lang = IdentifiedLanguage.UNSUPPORTED + identified_script = IdentifiedScript.LATIN + query_refined.original_language = identified_lang + query_refined.original_script = identified_script + response.debug_info["original_query"] = query_refined.query_text_original response.debug_info["original_language"] = identified_lang + response.debug_info["original_script"] = identified_script processed_response = _process_identified_language_response( - identified_language=identified_lang, response=response + 
identified_language=identified_lang, + identified_script=identified_script, + response=response, ) return query_refined, processed_response def _process_identified_language_response( - *, identified_language: IdentifiedLanguage, response: QueryResponse + *, + identified_language: IdentifiedLanguage, + identified_script: IdentifiedScript, + response: QueryResponse, ) -> QueryResponse | QueryResponseError: """Process the identified language and return the response. @@ -134,6 +155,8 @@ ---------- identified_language The identified language. + identified_script + The identified script. response The response object. @@ -144,23 +167,33 @@ """ supported_languages_list = IdentifiedLanguage.get_supported_languages() + supported_scripts_list = IdentifiedScript.get_supported_scripts() - if identified_language in supported_languages_list: - return response + language_ok = identified_language in supported_languages_list + script_ok = identified_script in supported_scripts_list - supported_languages = ", ".join(supported_languages_list) + supported_languages_str = ", ".join(supported_languages_list) + supported_scripts_str = ", ".join(supported_scripts_list) - match identified_language: - case IdentifiedLanguage.UNINTELLIGIBLE: + if language_ok and script_ok: + return response + elif language_ok and not script_ok: + error_message = ( + "Unsupported script. " + + f"Only the following scripts are supported: {supported_scripts_str}" + ) + error_type: ErrorType = ErrorType.UNSUPPORTED_SCRIPT + else: # regardless of script, language is not "ok" + if identified_language == IdentifiedLanguage.UNINTELLIGIBLE: error_message = ( "Unintelligible input. " - + f"The following languages are supported: {supported_languages}." + + f"The following languages are supported: {supported_languages_str}."
) - error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT - case _: + error_type = ErrorType.UNINTELLIGIBLE_INPUT + else: error_message = ( "Unsupported language. Only the following languages " - + f"are supported: {supported_languages}." + + f"are supported: {supported_languages_str}." ) error_type = ErrorType.UNSUPPORTED_LANGUAGE @@ -177,8 +210,8 @@ def _process_identified_language_response( error_response.debug_info.update(response.debug_info) logger.info( - f"LANGUAGE IDENTIFICATION FAILED due to {identified_language.value} " - f"language on query id: {str(response.query_id)}" + f"LANGUAGE IDENTIFICATION FAILED due to {error_message} " + f"on query id: {str(response.query_id)}" ) return error_response @@ -224,9 +257,10 @@ async def wrapper( The appropriate response object. """ - query_refined, response = await _translate_question( - query_refined=query_refined, response=response - ) + if not query_refined.chat_query_params: + query_refined, response = await _translate_question( + query_refined=query_refined, response=response + ) response = await func(query_refined, response, *args, **kwargs) return response @@ -464,6 +498,7 @@ async def wrapper( query_refined, response = await _paraphrase_question( query_refined=query_refined, response=response ) + response = await func(query_refined, response, *args, **kwargs) return response diff --git a/core_backend/app/llm_call/process_output.py b/core_backend/app/llm_call/process_output.py index a4671030b..2a569f0e1 100644 --- a/core_backend/app/llm_call/process_output.py +++ b/core_backend/app/llm_call/process_output.py @@ -84,6 +84,9 @@ async def generate_llm_query_response( if query_refined.original_language is None: logger.warning("No original_language found in the query.") return response, chat_history + if query_refined.original_script is None: + logger.warning("No original_script found in the query.") + return response, chat_history context = get_context_string_from_search_results( 
search_results=response.search_results @@ -98,6 +101,7 @@ async def generate_llm_query_response( message_type=message_type, metadata=metadata, original_language=query_refined.original_language, + original_script=query_refined.original_script, question=query_refined.query_text_original, session_id=chat_query_params["session_id"], ) @@ -106,6 +110,7 @@ async def generate_llm_query_response( context=context, metadata=metadata, original_language=query_refined.original_language, + original_script=query_refined.original_script, question=query_refined.query_text_original, # Use the original query text ) diff --git a/core_backend/app/question_answer/routers.py b/core_backend/app/question_answer/routers.py index 9d301cdc4..e6091edb1 100644 --- a/core_backend/app/question_answer/routers.py +++ b/core_backend/app/question_answer/routers.py @@ -843,6 +843,9 @@ async def get_user_query_and_response( query_text_original=user_query.query_text, workspace_id=workspace_id, ) + + # In case of a chat query, use the optimized query as the base query_text. + # Note that for language identification, we use query_text_original. 
if user_query_refined.chat_query_params: user_query_refined.query_text = user_query_refined.chat_query_params.pop( "search_query" @@ -1076,8 +1079,8 @@ async def init_user_query_and_chat_histories( "chat_history": user_assistant_chat_history, "chat_params": chat_params, "message_type": search_query_json_response["message_type"], - "redis_client": redis_client, "search_query": search_query_json_response["query"], + "redis_client": redis_client, "session_id": session_id, } user_query.generate_llm_response = True diff --git a/core_backend/app/question_answer/schemas.py b/core_backend/app/question_answer/schemas.py index 8904e2c36..c434b28ee 100644 --- a/core_backend/app/question_answer/schemas.py +++ b/core_backend/app/question_answer/schemas.py @@ -6,7 +6,7 @@ from pydantic import BaseModel, ConfigDict, Field from pydantic.json_schema import SkipJsonSchema -from ..llm_call.llm_prompts import IdentifiedLanguage +from ..llm_call.llm_prompts import IdentifiedLanguage, IdentifiedScript from ..schemas import FeedbackSentiment, QuerySearchResult @@ -23,6 +23,7 @@ class ErrorType(str, Enum): UNABLE_TO_TRANSLATE = "unable_to_translate" UNINTELLIGIBLE_INPUT = "unintelligible_input" UNSUPPORTED_LANGUAGE = "unsupported_language" + UNSUPPORTED_SCRIPT = "unsupported_script" class QueryBase(BaseModel): @@ -49,6 +50,7 @@ class QueryRefined(QueryBase): generate_tts: bool = Field(False) original_language: IdentifiedLanguage | None = None + original_script: IdentifiedScript | None = None query_text_original: str workspace_id: int diff --git a/core_backend/app/question_answer/utils.py b/core_backend/app/question_answer/utils.py index 029d7194c..f972e46dc 100644 --- a/core_backend/app/question_answer/utils.py +++ b/core_backend/app/question_answer/utils.py @@ -23,6 +23,8 @@ def get_context_string_from_search_results( for key, result in search_results.items(): if not isinstance(result, QuerySearchResult): result = QuerySearchResult(**result) - context_list.append(f"{key}. 
{result.title}\n{result.text}") + context_list.append( + f" \n**{result.title}**\n\n{result.text}\n" + ) context_string = "\n\n".join(context_list) return context_string diff --git a/core_backend/tests/api/conftest.py b/core_backend/tests/api/conftest.py index ca0a53a0b..d0e8b15bc 100644 --- a/core_backend/tests/api/conftest.py +++ b/core_backend/tests/api/conftest.py @@ -35,6 +35,7 @@ RAG, AlignmentScore, IdentifiedLanguage, + IdentifiedScript, ) from core_backend.app.question_answer.models import ( ContentFeedbackDB, @@ -1703,7 +1704,9 @@ async def mock_identify_language( """ query_refined.original_language = IdentifiedLanguage.ENGLISH + query_refined.original_script = IdentifiedScript.LATIN response.debug_info["original_language"] = "ENGLISH" + response.debug_info["original_script"] = "LATIN" return query_refined, response diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py index 163e77574..43e9b7ef5 100644 --- a/core_backend/tests/api/test_question_answer.py +++ b/core_backend/tests/api/test_question_answer.py @@ -10,7 +10,10 @@ from fastapi import status from fastapi.testclient import TestClient -from core_backend.app.llm_call.llm_prompts import AlignmentScore, IdentifiedLanguage +from core_backend.app.llm_call.llm_prompts import ( + AlignmentScore, + IdentifiedLanguage, +) from core_backend.app.llm_call.process_input import ( _classify_safety, _identify_language, @@ -1045,20 +1048,25 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined: ) @pytest.mark.parametrize( - "identified_lang_str,should_error,expected_error_type", + "identified_lang_str,identified_script_str,should_error,expected_error_type", [ - ("ENGLISH", False, None), - ("HINDI", False, None), - ("UNINTELLIGIBLE", True, ErrorType.UNINTELLIGIBLE_INPUT), - ("GIBBERISH", True, ErrorType.UNSUPPORTED_LANGUAGE), - ("UNSUPPORTED", True, ErrorType.UNSUPPORTED_LANGUAGE), - ("SOME_UNSUPPORTED_LANG", True, 
ErrorType.UNSUPPORTED_LANGUAGE), - ("don't kow", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("ENGLISH", "LATIN", False, None), + ("HINDI", "DEVANAGARI", False, None), + ("UNINTELLIGIBLE", "LATIN", True, ErrorType.UNINTELLIGIBLE_INPUT), + ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNINTELLIGIBLE_INPUT), + ("ENGLISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT), + ("ENGLISH", "Some unsupported script", True, ErrorType.UNSUPPORTED_SCRIPT), + ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("GIBBERISH", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("UNSUPPORTED", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("SOME_UNSUPPORTED_LANG", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE), + ("don't kow", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE), ], ) async def test_language_identify_error( self, identified_lang_str: str, + identified_script_str: str, should_error: bool, expected_error_type: ErrorType, monkeypatch: pytest.MonkeyPatch, @@ -1084,6 +1092,7 @@ async def test_language_identify_error( generate_llm_response=False, generate_tts=False, original_language=None, + original_script=None, query_text="This is a basic query", query_text_original="This is a query original", workspace_id=124, @@ -1104,10 +1113,12 @@ async def mock_ask_llm( # pylint: disable=W0613 Returns ------- str - The identified language string. + The identified language and script model json string. 
""" - return identified_lang_str + return f""" + {{"language": "{identified_lang_str}", "script": "{identified_script_str}"}} + """.strip() monkeypatch.setattr( "core_backend.app.llm_call.process_input._ask_llm_async", mock_ask_llm @@ -1233,6 +1244,7 @@ async def mock_ask_llm( # pylint: disable=W0613 generate_llm_response=False, generate_tts=False, original_language=None, + original_script=None, query_text="This is a basic query", query_text_original="This is a query original", workspace_id=124, diff --git a/core_backend/tests/rails/data/language_identification.yaml b/core_backend/tests/rails/data/language_identification.yaml index a4d3ddb34..4b28c20e2 100644 --- a/core_backend/tests/rails/data/language_identification.yaml +++ b/core_backend/tests/rails/data/language_identification.yaml @@ -2,59 +2,73 @@ # improve this with a native speaker. These might be too "pure". HAUSA: - - Ina da yara biyu masu hanci - - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi - - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa? - - Menene wannan? - - Sannun ku da zuwa #h/t: Fola from here on - - Ni yarinya ne - - Zo ka chi abunchi - - Ina kwana Maman mu - - Wannan shago na ne + LATIN: + - Ina da yara biyu masu hanci + - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi + - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa? + - Menene wannan? + - Sannun ku da zuwa #h/t: Fola from here on + - Ni yarinya ne + - Zo ka chi abunchi + - Ina kwana Maman mu + - Wannan shago na ne ENGLISH: - - I have two children. You see I girl, what is the probability the other is also a girl? - - No idea - - Why you say that? + LATIN: + - I have two children. You see I girl, what is the probability the other is also a girl? + - No idea + - Why you say that? XHOSA: - - Umama ngugqirha - - Utata ngumongikazi - - Ukuba intamo yam yayifuna ukwenza oko? - - Iintsana zikhala kakhulu, huh? 
+ LATIN: + - Umama ngugqirha + - Utata ngumongikazi + - Ukuba intamo yam yayifuna ukwenza oko? + - Iintsana zikhala kakhulu, huh? YORUBA: #h/t: Fola - - Ni bo ló ti ri owo yen? - - Eyin melo ni e wa ni be? - - Ki ni itumo oruko ẹ? - - Ki ni o jẹ lánà? - - Omo Ibadan ni mi + LATIN: + - Ni bo ló ti ri owo yen? + - Eyin melo ni e wa ni be? + - Ki ni itumo oruko ẹ? + - Ki ni o jẹ lánà? + - Omo Ibadan ni mi IGBO: #h/t: Fola - - agụụ na-agụ m - - agam aga ahia echi - - ị hụla ngozi? ana m achọ ya. - - m na-aga ọrụ + LATIN: + - agụụ na-agụ m + - agam aga ahia echi + - ị hụla ngozi? ana m achọ ya. + - m na-aga ọrụ KOREAN: - - 애가 둘이예요 - - ㅋㅋㅋㅋㅋㅋ - - 아이들이 많이 울어요ㅠ - - 이 프로젝트 애칭은 ask-a-question이야. + KOREAN: + - 애가 둘이예요 + - ㅋㅋㅋㅋㅋㅋ + - 아이들이 많이 울어요ㅠ + - 이 프로젝트 애칭은 ask-a-question이야. ZULU: - - Ngingumama - - Ingabe uyi-bot noma ungumuntu? - - Ngifuna ukwenza lokhu? - - Izingane zikhala kakhulu, hhe + LATIN: + - Ngingumama + - Ingabe uyi-bot noma ungumuntu? + - Ngifuna ukwenza lokhu? + - Izingane zikhala kakhulu, hhe AFRIKAANS: - - Ek het hierdie goddelose dal gemaak - - Is covid nog 'n ding? - - My hond het my huiswerk geëet - - Het jy al gebraaide roomys probeer? + LATIN: + - Ek het hierdie goddelose dal gemaak + - Is covid nog 'n ding? + - My hond het my huiswerk geëet + - Het jy al gebraaide roomys probeer? HINDI: #h/t: Sid - - is ka matlab kya hai? - - kabhi kabhi mere dil mein - - अंत में सभी लोग नाश्ता करने जाएं - - गब्बर सिंह कह के गया जो डर गया वो मर गया + LATIN: + - is ka matlab kya hai? + - kabhi kabhi mere dil mein + DEVANAGARI: + - अंत में सभी लोग नाश्ता करने जाएं + - गब्बर सिंह कह के गया जो डर गया वो मर गया +MARATHI: + LATIN: + - Portal chi link aahe UNINTELLIGIBLE: - - sdfsdf sss dyhnel jjj - - hs dsfsg xd ewwo ddfs - - Heghlu'meH QaQ jajvam - - yIHuchQo', 'ej jIHvaD yIqemchu'mo' - - \%^*# levels; 91011 AQGs!!! - - 1234 AQI WHO? 
5678 + UNKNOWN: + - sdfsdf sss dyhnel jjj + - hs dsfsg xd ewwo ddfs + - Heghlu'meH QaQ jajvam + - yIHuchQo', 'ej jIHvaD yIqemchu'mo' + - \%^*# levels; 91011 AQGs!!! + - 1234 AQI WHO? 5678 diff --git a/core_backend/tests/rails/test_language_identification.py b/core_backend/tests/rails/test_language_identification.py index 9b30b2e9a..6744d8216 100644 --- a/core_backend/tests/rails/test_language_identification.py +++ b/core_backend/tests/rails/test_language_identification.py @@ -5,7 +5,7 @@ import pytest import yaml -from core_backend.app.llm_call.llm_prompts import IdentifiedLanguage +from core_backend.app.llm_call.llm_prompts import IdentifiedLanguage, IdentifiedScript from core_backend.app.llm_call.process_input import _identify_language from core_backend.app.question_answer.schemas import QueryRefined, QueryResponse @@ -22,19 +22,38 @@ def available_languages() -> list[str]: return list(IdentifiedLanguage) -def read_test_data(file: str) -> list[tuple[str, str]]: +@pytest.fixture(scope="module") +def available_scripts() -> list[str]: + """Returns a list of available scripts.""" + + return list(IdentifiedScript) + + +def read_test_data(file: str) -> list[tuple[str, str, str]]: """Reads test data from file and returns a list of strings.""" file_path = Path(__file__).parent / file with open(file_path, "r", encoding="utf-8") as f: content = yaml.safe_load(f) - return [(key, value) for key, values in content.items() for value in values] - - -@pytest.mark.parametrize("expected_label, content", read_test_data(LANGUAGE_FILE)) + data = [ + (language, script, text) + for language, script_dict in content.items() + for script, texts in script_dict.items() + for text in texts + ] + return data + + +@pytest.mark.parametrize( + "expected_language,expected_script,content", read_test_data(LANGUAGE_FILE) +) async def test_language_identification( - available_languages: list[str], expected_label: str, content: str + available_languages: list[str], + available_scripts: list[str], + 
expected_language: str, + expected_script: str, + content: str, ) -> None: """Test language identification.""" @@ -53,8 +72,15 @@ async def test_language_identification( search_results=None, session_id=None, ) - if expected_label not in available_languages: - expected_label = "UNSUPPORTED" + + if expected_language not in available_languages: + expected_language = "UNSUPPORTED" + + if expected_script not in available_scripts: + expected_script = "UNKNOWN" + _, response = await _identify_language(query_refined=question, response=response) - assert response.debug_info["original_language"] == expected_label + assert response.debug_info["original_language"] == expected_language + if expected_language not in ("UNINTELLIGIBLE", "UNSUPPORTED"): + assert response.debug_info["original_script"] == expected_script