diff --git a/.secrets.baseline b/.secrets.baseline
index 5cab9e8c1..f961ef821 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -348,57 +348,6 @@
"line_number": 15
}
],
- "core_backend/tests/api/conftest.py": [
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "407c6798fe20fd5d75de4a233c156cc0fce510e3",
- "is_verified": false,
- "line_number": 46
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "42553e798bc193bcf25368b5e53ec7cd771483a7",
- "is_verified": false,
- "line_number": 47
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "9fb7fe1217aed442b04c0f5e43b5d5a7d3287097",
- "is_verified": false,
- "line_number": 50
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "767ef7376d44bb6e52b390ddcd12c1cb1b3902a4",
- "is_verified": false,
- "line_number": 51
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "70240b5d0947cc97447de496284791c12b2e678a",
- "is_verified": false,
- "line_number": 56
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "80fea3e25cb7e28550d13af9dfda7a9bd08c1a78",
- "is_verified": false,
- "line_number": 57
- },
- {
- "type": "Secret Keyword",
- "filename": "core_backend/tests/api/conftest.py",
- "hashed_secret": "3465834d516797458465ae4ed2c62e7020032c4e",
- "is_verified": false,
- "line_number": 317
- }
- ],
"core_backend/tests/api/test.env": [
{
"type": "Secret Keyword",
@@ -448,14 +397,14 @@
"filename": "core_backend/tests/api/test_question_answer.py",
"hashed_secret": "1d2be5ef28a76e2207456e7eceabe1219305e43d",
"is_verified": false,
- "line_number": 294
+ "line_number": 418
},
{
"type": "Secret Keyword",
"filename": "core_backend/tests/api/test_question_answer.py",
"hashed_secret": "6367c48dd193d56ea7b0baad25b19455e529f5ee",
"is_verified": false,
- "line_number": 653
+ "line_number": 1018
}
],
"core_backend/tests/api/test_user_tools.py": [
@@ -473,7 +422,7 @@
"filename": "core_backend/tests/rails/test_language_identification.py",
"hashed_secret": "051b2c1d98174fabc4749641c4f4f4660556441e",
"is_verified": false,
- "line_number": 48
+ "line_number": 69
}
],
"core_backend/tests/rails/test_paraphrasing.py": [
@@ -581,5 +530,5 @@
}
]
},
- "generated_at": "2025-01-24T13:35:08Z"
+ "generated_at": "2025-04-10T13:44:48Z"
}
diff --git a/core_backend/app/llm_call/llm_prompts.py b/core_backend/app/llm_call/llm_prompts.py
index 2ede20f4c..db7a154bd 100644
--- a/core_backend/app/llm_call/llm_prompts.py
+++ b/core_backend/app/llm_call/llm_prompts.py
@@ -103,7 +103,7 @@
{context}
IMPORTANT NOTES ON THE "answer" FIELD:
-- Answer in the language of the question ({original_language}).
+- Answer in the language {original_language} in the script {original_script}.
- Answer should be concise, to the point, and no longer than 80 words.
- Do not include any information that is not present in the REFERENCE TEXT.
"""
@@ -182,6 +182,58 @@ class AlignmentScore(BaseModel):
model_config = ConfigDict(strict=True)
+CHAT_RESPONSE_PROMPT = """\
+You are an AI assistant designed to help users with their \
+questions/concerns. You interact with users via a chat interface. You will \
+be provided with ADDITIONAL RELEVANT INFORMATION that can address the \
+user's questions/concerns.
+
+BEFORE answering the user's LATEST MESSAGE, follow these steps:
+
+1. Review the conversation history to ensure that you understand the \
+context in which the user's LATEST MESSAGE is being asked.
+2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you \
+understand the most useful information related to the user's LATEST \
+MESSAGE.
+
+When you have completed the above steps, you will then write a JSON, whose \
+TypeScript Interface is given below:
+
+interface Response {{
+ extracted_info: string[];
+ answer: string;
+}}
+
+For "extracted_info", extract from the provided ADDITIONAL RELEVANT \
+INFORMATION the most useful information related to the LATEST MESSAGE asked \
+by the user, and list them one by one. If no useful information is found, \
+return an empty list.
+
+For "answer", understand the conversation history, ADDITIONAL RELEVANT \
+INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to \
+the user's LATEST MESSAGE. If no useful information was found in \
+either the conversation history or the ADDITIONAL RELEVANT INFORMATION, \
+respond with {failure_message}.
+
+EXAMPLE RESPONSES:
+{{"extracted_info": ["Pineapples are a blend of pinecones and apples.", \
+"Pineapples have the shape of a pinecone."], \
+"answer": "The 'pine-' from pineapples likely come from the fact that \
+pineapples are a hybrid of pinecones and apples and its pinecone-like \
+shape."}}
+{{"extracted_info": [], "answer": "{failure_message}"}}
+
+IMPORTANT NOTES ON THE "answer" FIELD:
+- Keep in mind that the user is asking a {message_type} question.
+- Answer in the language {original_language} in the script {original_script}.
+- Answer should be concise and to the point.
+- Do not include any information that is not present in the ADDITIONAL \
+RELEVANT INFORMATION.
+
+Only output the JSON response, without any additional text.\
+"""
+
+
class ChatHistory:
"""Contains the prompts and models for the chat history task."""
@@ -216,7 +268,7 @@ class ChatHistory:
{{
"message_type": "The type of the user's LATEST MESSAGE. List of valid
- options are: {valid_message_types},
+ options are: {valid_message_types}",
"query": "The vector database query that you have constructed based on
the user's LATEST MESSAGE and the conversation history."
}}
@@ -227,62 +279,7 @@ class ChatHistory:
),
prompt_kws={"valid_message_types": _valid_message_types},
)
- system_message_generate_response = format_prompt(
- prompt=textwrap.dedent(
- """You are an AI assistant designed to help users with their
- questions/concerns. You interact with users via a chat interface. You will
- be provided with ADDITIONAL RELEVANT INFORMATION that can address the
- user's questions/concerns.
-
- BEFORE answering the user's LATEST MESSAGE, follow these steps:
-
- 1. Review the conversation history to ensure that you understand the
- context in which the user's LATEST MESSAGE is being asked.
- 2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you
- understand the most useful information related to the user's LATEST
- MESSAGE.
-
- When you have completed the above steps, you will then write a JSON, whose
- TypeScript Interface is given below:
-
- interface Response {{
- extracted_info: string[];
- answer: string;
- }}
-
- For "extracted_info", extract from the provided ADDITIONAL RELEVANT
- INFORMATION the most useful information related to the LATEST MESSAGE asked
- by the user, and list them one by one. If no useful information is found,
- return an empty list.
-
- For "answer", understand the conversation history, ADDITIONAL RELEVANT
- INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to
- the user's LATEST MESSAGE. If no useful information was found in the
- either the conversation history or the ADDITIONAL RELEVANT INFORMATION,
- respond with {failure_message}.
-
- EXAMPLE RESPONSES:
- {{"extracted_info": [
- "Pineapples are a blend of pinecones and apples.",
- "Pineapples have the shape of a pinecone."
- ],
- "answer": "The 'pine-' from pineapples likely come from the fact that
- pineapples are a hybrid of pinecones and apples and its pinecone-like
- shape."
- }}
- {{"extracted_info": [], "answer": "{failure_message}"}}
-
- IMPORTANT NOTES ON THE "answer" FIELD:
- - Keep in mind that the user is asking a {message_type} question.
- - Answer in the language of the question ({original_language}).
- - Answer should be concise and to the point.
- - Do not include any information that is not present in the ADDITIONAL
- RELEVANT INFORMATION.
-
- Only output the JSON response, without any additional text.
- """
- )
- )
+ system_message_generate_response = CHAT_RESPONSE_PROMPT
class ChatHistoryConstructSearchQuery(BaseModel):
"""Pydantic model for the output of the construct search query chat history."""
@@ -337,22 +334,21 @@ class IdentifiedLanguage(str, Enum):
ENGLISH = "ENGLISH"
FRENCH = "FRENCH"
HINDI = "HINDI"
+ MARATHI = "MARATHI"
SWAHILI = "SWAHILI"
UNINTELLIGIBLE = "UNINTELLIGIBLE"
UNSUPPORTED = "UNSUPPORTED"
+
# XHOSA = "XHOSA"
# ZULU = "ZULU"
-
@classmethod
def get_supported_languages(cls) -> list[str]:
"""Return a list of supported languages.
-
Returns
-------
list[str]
A list of supported languages.
"""
-
return [
lang
for lang in cls._member_names_
@@ -377,31 +373,98 @@ def _missing_(cls, value: str) -> IdentifiedLanguage: # type: ignore[override]
return cls.UNSUPPORTED
+
+class IdentifiedScript(str, Enum):
+ """Script used in the user's input."""
+
+ LATIN = "LATIN"
+ DEVANAGARI = "DEVANAGARI"
+ BENGALI = "BENGALI"
+ TAMIL = "TAMIL"
+ TELUGU = "TELUGU"
+ KANNADA = "KANNADA"
+ MALAYALAM = "MALAYALAM"
+ GUJARATI = "GUJARATI"
+ # GURMUKHI = "GURMUKHI"
+ # ORIYA = "ORIYA"
+ # SINHALA = "SINHALA"
+ # MYANMAR = "MYANMAR"
+ # ETHIOPIC = "ETHIOPIC"
+ # GEORGIAN = "GEORGIAN"
+ # ARMENIAN = "ARMENIAN"
+ # HEBREW = "HEBREW"
+ # GREEK = "GREEK"
+ # TIBETAN = "TIBETAN"
+ # MONGOLIAN = "MONGOLIAN"
+ # KHMER = "KHMER"
+ # LAO = "LAO"
+ # VIETNAMESE = "VIETNAMESE"
+ # THAI_LAO = "THAI_LAO"
+ UNKNOWN = "UNKNOWN"
+
@classmethod
- def get_prompt(cls) -> str:
- """Return the prompt for the language identification bot.
+ def get_supported_scripts(cls) -> list[str]:
+ """Return a list of supported scripts.
+ Returns
+ -------
+ list[str]
+ A list of supported scripts.
+ """
+ return [script for script in cls._member_names_ if script != "UNKNOWN"]
+ @classmethod
+ def _missing_(cls, value: str) -> IdentifiedScript: # type: ignore[override]
+ """If script identified is not one of the supported scripts, it is
+ classified as UNKNOWN.
+ Parameters
+ ----------
+ value
+ The script identified.
Returns
-------
- str
- The prompt for the language identification bot.
+ Script
+ The identified script (i.e., UNKNOWN).
"""
+ return cls.UNKNOWN
- return textwrap.dedent(
- f"""
- You are a high-performing language identification bot that classifies the
- language of the user input into one of {", ".join(cls._member_names_)}.
-
- If the user input is
- 1. in one of the supported languages, then respond with that language.
- 2. written in a mix of languages, then respond with the dominant language.
- 3. in a real language but not a supported language, then respond with
- UNSUPPORTED.
- 4. unintelligible or gibberish, then respond with UNINTELLIGIBLE.
-
- Answer should be a single word and strictly one of
- [{", ".join(cls._member_names_)}]"""
- ).strip()
+
+class LanguageIdentificationResponse(BaseModel):
+ """Pydantic model for the language identification response."""
+
+ language: IdentifiedLanguage
+ script: IdentifiedScript
+
+ model_config = ConfigDict(strict=True)
+
+
+LANGUAGE_ID_PROMPT = (
+ f"""\
+You are a high-performing language identification bot that classifies the \
+language and script of the user input.
+
+For each input, identify:
+1. The language (must be one of {", ".join(IdentifiedLanguage._member_names_)})
+2. The script (must be one of {", ".join(IdentifiedScript._member_names_)})
+
+If the user input is:
+1. in one of the supported languages, respond with that language and its script
+2. written in a mix of languages, respond with the dominant language and its script
+3. in a real language but not a supported language, respond with UNSUPPORTED and \
+its script
+4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin"""
+ + """
+Examples:
+"How many beds are there?" -> {{"language": "ENGLISH", "script": "LATIN"}}
+"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "LATIN"}}
+"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "DEVANAGARI"}}
+"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "LATIN"}}
+"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "LATIN"}}
+"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "LATIN"}}
+"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "ARABIC"}}
+
+Respond with a JSON object containing "language" and "script" keys.
+"""
+)
class RAG(BaseModel):
diff --git a/core_backend/app/llm_call/llm_rag.py b/core_backend/app/llm_call/llm_rag.py
index ab4431ade..d7a1dea12 100644
--- a/core_backend/app/llm_call/llm_rag.py
+++ b/core_backend/app/llm_call/llm_rag.py
@@ -8,10 +8,17 @@
from ..config import LITELLM_MODEL_GENERATION
from ..utils import setup_logger
-from .llm_prompts import RAG, RAG_FAILURE_MESSAGE, ChatHistory, IdentifiedLanguage
+from .llm_prompts import (
+ RAG,
+ RAG_FAILURE_MESSAGE,
+ ChatHistory,
+ IdentifiedLanguage,
+ IdentifiedScript,
+)
from .utils import (
_ask_llm_async,
append_messages_to_chat_history,
+ format_prompt,
get_chat_response,
remove_json_markdown,
)
@@ -24,6 +31,7 @@ async def get_llm_rag_answer(
context: str,
metadata: dict | None = None,
original_language: IdentifiedLanguage,
+ original_script: IdentifiedScript,
question: str,
) -> RAG:
"""Get an answer from the LLM model using RAG.
@@ -36,6 +44,8 @@ async def get_llm_rag_answer(
Additional metadata to provide to the LLM model.
original_language
The original language of the question.
+ original_script
+ The script in which the original question was written.
question
The question to ask the LLM model.
@@ -46,7 +56,11 @@ async def get_llm_rag_answer(
"""
metadata = metadata or {}
- prompt = RAG.prompt.format(context=context, original_language=original_language)
+ prompt = RAG.prompt.format(
+ context=context,
+ original_language=original_language,
+ original_script=original_script,
+ )
result = await _ask_llm_async(
json_=True,
@@ -75,6 +89,7 @@ async def get_llm_rag_answer_with_chat_history(
message_type: str,
metadata: dict | None = None,
original_language: IdentifiedLanguage,
+ original_script: IdentifiedScript,
question: str,
session_id: str,
) -> tuple[RAG, list[dict[str, str | None]]]:
@@ -112,24 +127,20 @@ async def get_llm_rag_answer_with_chat_history(
failure_message=RAG_FAILURE_MESSAGE,
message_type=message_type,
original_language=original_language,
+ original_script=original_script,
)
)
- content = (
- question
- + f""""\n\n
- ADDITIONAL RELEVANT INFORMATION BELOW
- =====================================
- {context}
-
- ADDITIONAL RELEVANT INFORMATION ABOVE
- =====================================
- """
+ user_message_with_context = format_prompt(
+ prompt=f"""{question}\n\n
+
+ {context}
+ """
)
content = await get_chat_response(
chat_history=chat_history,
chat_params=chat_params,
- message_params=content,
+ message_params=user_message_with_context,
session_id=session_id,
json_=True,
metadata=metadata or {},
diff --git a/core_backend/app/llm_call/process_input.py b/core_backend/app/llm_call/process_input.py
index 9a30ffdeb..c6da6a5b4 100644
--- a/core_backend/app/llm_call/process_input.py
+++ b/core_backend/app/llm_call/process_input.py
@@ -3,6 +3,8 @@
from functools import wraps
from typing import Any, Callable, Optional
+from pydantic import ValidationError
+
from ..config import (
LITELLM_MODEL_LANGUAGE_DETECT,
LITELLM_MODEL_PARAPHRASE,
@@ -17,14 +19,17 @@
)
from ..utils import setup_logger
from .llm_prompts import (
+ LANGUAGE_ID_PROMPT,
PARAPHRASE_FAILED_MESSAGE,
PARAPHRASE_PROMPT,
TRANSLATE_FAILED_MESSAGE,
TRANSLATE_PROMPT,
IdentifiedLanguage,
+ IdentifiedScript,
+ LanguageIdentificationResponse,
SafetyClassification,
)
-from .utils import _ask_llm_async
+from .utils import _ask_llm_async, remove_json_markdown
logger = setup_logger(name="INPUT RAILS")
@@ -84,7 +89,7 @@ async def _identify_language(
query_refined: QueryRefined,
response: QueryResponse | QueryResponseError,
) -> tuple[QueryRefined, QueryResponse | QueryResponseError]:
- """Identify the language of the question.
+ """Identify the language and script of the question.
Parameters
----------
@@ -104,29 +109,45 @@ async def _identify_language(
if isinstance(response, QueryResponseError):
return query_refined, response
- llm_identified_lang = await _ask_llm_async(
+ json_str = await _ask_llm_async(
+ json_=True,
litellm_model=LITELLM_MODEL_LANGUAGE_DETECT,
metadata=metadata,
- system_message=IdentifiedLanguage.get_prompt(),
- user_message=query_refined.query_text,
+ system_message=LANGUAGE_ID_PROMPT,
+ # Always use the original query text for language and script detection
+ user_message=query_refined.query_text_original,
)
- identified_lang = getattr(
- IdentifiedLanguage, llm_identified_lang, IdentifiedLanguage.UNSUPPORTED
- )
+ cleaned_json_str = remove_json_markdown(text=json_str)
+ try:
+ lang_info = LanguageIdentificationResponse.model_validate_json(cleaned_json_str)
+ identified_lang = IdentifiedLanguage(lang_info.language.upper())
+ identified_script = IdentifiedScript(lang_info.script.upper())
+ except ValidationError:
+ identified_lang = IdentifiedLanguage.UNSUPPORTED
+ identified_script = IdentifiedScript.LATIN
+
query_refined.original_language = identified_lang
+ query_refined.original_script = identified_script
+
response.debug_info["original_query"] = query_refined.query_text_original
response.debug_info["original_language"] = identified_lang
+ response.debug_info["original_script"] = identified_script
processed_response = _process_identified_language_response(
- identified_language=identified_lang, response=response
+ identified_language=identified_lang,
+ identified_script=identified_script,
+ response=response,
)
return query_refined, processed_response
def _process_identified_language_response(
- *, identified_language: IdentifiedLanguage, response: QueryResponse
+ *,
+ identified_language: IdentifiedLanguage,
+ identified_script: IdentifiedScript,
+ response: QueryResponse,
) -> QueryResponse | QueryResponseError:
"""Process the identified language and return the response.
@@ -134,6 +155,8 @@ def _process_identified_language_response(
----------
identified_language
The identified language.
+ identified_script
+ The identified script.
response
The response object.
@@ -144,23 +167,33 @@ def _process_identified_language_response(
"""
supported_languages_list = IdentifiedLanguage.get_supported_languages()
+ supported_scripts_list = IdentifiedScript.get_supported_scripts()
- if identified_language in supported_languages_list:
- return response
+ language_ok = identified_language in supported_languages_list
+ script_ok = identified_script in supported_scripts_list
- supported_languages = ", ".join(supported_languages_list)
+ supported_languages_str = ", ".join(supported_languages_list)
+ supported_scripts_str = ", ".join(supported_scripts_list)
- match identified_language:
- case IdentifiedLanguage.UNINTELLIGIBLE:
+ if language_ok and script_ok:
+ return response
+ elif language_ok and not script_ok:
+ error_message = (
+ "Unsupported script. "
+ + f"Only the following scripts are supported: {supported_scripts_str}"
+ )
+ error_type: ErrorType = ErrorType.UNSUPPORTED_SCRIPT
+ else: # regardless of script, language is not "ok"
+ if identified_language == IdentifiedLanguage.UNINTELLIGIBLE:
error_message = (
"Unintelligible input. "
- + f"The following languages are supported: {supported_languages}."
+ + f"The following languages are supported: {supported_languages_str}."
)
- error_type: ErrorType = ErrorType.UNINTELLIGIBLE_INPUT
- case _:
+ error_type = ErrorType.UNINTELLIGIBLE_INPUT
+ else:
error_message = (
"Unsupported language. Only the following languages "
- + f"are supported: {supported_languages}."
+ + f"are supported: {supported_languages_str}."
)
error_type = ErrorType.UNSUPPORTED_LANGUAGE
@@ -177,8 +210,8 @@ def _process_identified_language_response(
error_response.debug_info.update(response.debug_info)
logger.info(
- f"LANGUAGE IDENTIFICATION FAILED due to {identified_language.value} "
- f"language on query id: {str(response.query_id)}"
+ f"LANGUAGE IDENTIFICATION FAILED due to {error_message} "
+ f"on query id: {str(response.query_id)}"
)
return error_response
@@ -224,9 +257,10 @@ async def wrapper(
The appropriate response object.
"""
- query_refined, response = await _translate_question(
- query_refined=query_refined, response=response
- )
+ if not query_refined.chat_query_params:
+ query_refined, response = await _translate_question(
+ query_refined=query_refined, response=response
+ )
response = await func(query_refined, response, *args, **kwargs)
return response
@@ -464,6 +498,7 @@ async def wrapper(
query_refined, response = await _paraphrase_question(
query_refined=query_refined, response=response
)
+
response = await func(query_refined, response, *args, **kwargs)
return response
diff --git a/core_backend/app/llm_call/process_output.py b/core_backend/app/llm_call/process_output.py
index a4671030b..2a569f0e1 100644
--- a/core_backend/app/llm_call/process_output.py
+++ b/core_backend/app/llm_call/process_output.py
@@ -84,6 +84,9 @@ async def generate_llm_query_response(
if query_refined.original_language is None:
logger.warning("No original_language found in the query.")
return response, chat_history
+ if query_refined.original_script is None:
+ logger.warning("No original_script found in the query.")
+ return response, chat_history
context = get_context_string_from_search_results(
search_results=response.search_results
@@ -98,6 +101,7 @@ async def generate_llm_query_response(
message_type=message_type,
metadata=metadata,
original_language=query_refined.original_language,
+ original_script=query_refined.original_script,
question=query_refined.query_text_original,
session_id=chat_query_params["session_id"],
)
@@ -106,6 +110,7 @@ async def generate_llm_query_response(
context=context,
metadata=metadata,
original_language=query_refined.original_language,
+ original_script=query_refined.original_script,
question=query_refined.query_text_original, # Use the original query text
)
diff --git a/core_backend/app/question_answer/routers.py b/core_backend/app/question_answer/routers.py
index 9d301cdc4..e6091edb1 100644
--- a/core_backend/app/question_answer/routers.py
+++ b/core_backend/app/question_answer/routers.py
@@ -843,6 +843,9 @@ async def get_user_query_and_response(
query_text_original=user_query.query_text,
workspace_id=workspace_id,
)
+
+ # In case of a chat query, use the optimized query as the base query_text.
+ # Note that for language identification, we use query_text_original.
if user_query_refined.chat_query_params:
user_query_refined.query_text = user_query_refined.chat_query_params.pop(
"search_query"
@@ -1076,8 +1079,8 @@ async def init_user_query_and_chat_histories(
"chat_history": user_assistant_chat_history,
"chat_params": chat_params,
"message_type": search_query_json_response["message_type"],
- "redis_client": redis_client,
"search_query": search_query_json_response["query"],
+ "redis_client": redis_client,
"session_id": session_id,
}
user_query.generate_llm_response = True
diff --git a/core_backend/app/question_answer/schemas.py b/core_backend/app/question_answer/schemas.py
index 8904e2c36..c434b28ee 100644
--- a/core_backend/app/question_answer/schemas.py
+++ b/core_backend/app/question_answer/schemas.py
@@ -6,7 +6,7 @@
from pydantic import BaseModel, ConfigDict, Field
from pydantic.json_schema import SkipJsonSchema
-from ..llm_call.llm_prompts import IdentifiedLanguage
+from ..llm_call.llm_prompts import IdentifiedLanguage, IdentifiedScript
from ..schemas import FeedbackSentiment, QuerySearchResult
@@ -23,6 +23,7 @@ class ErrorType(str, Enum):
UNABLE_TO_TRANSLATE = "unable_to_translate"
UNINTELLIGIBLE_INPUT = "unintelligible_input"
UNSUPPORTED_LANGUAGE = "unsupported_language"
+ UNSUPPORTED_SCRIPT = "unsupported_script"
class QueryBase(BaseModel):
@@ -49,6 +50,7 @@ class QueryRefined(QueryBase):
generate_tts: bool = Field(False)
original_language: IdentifiedLanguage | None = None
+ original_script: IdentifiedScript | None = None
query_text_original: str
workspace_id: int
diff --git a/core_backend/app/question_answer/utils.py b/core_backend/app/question_answer/utils.py
index 029d7194c..f972e46dc 100644
--- a/core_backend/app/question_answer/utils.py
+++ b/core_backend/app/question_answer/utils.py
@@ -23,6 +23,8 @@ def get_context_string_from_search_results(
for key, result in search_results.items():
if not isinstance(result, QuerySearchResult):
result = QuerySearchResult(**result)
- context_list.append(f"{key}. {result.title}\n{result.text}")
+ context_list.append(
+ f" \n**{result.title}**\n\n{result.text}\n"
+ )
context_string = "\n\n".join(context_list)
return context_string
diff --git a/core_backend/tests/api/conftest.py b/core_backend/tests/api/conftest.py
index ca0a53a0b..d0e8b15bc 100644
--- a/core_backend/tests/api/conftest.py
+++ b/core_backend/tests/api/conftest.py
@@ -35,6 +35,7 @@
RAG,
AlignmentScore,
IdentifiedLanguage,
+ IdentifiedScript,
)
from core_backend.app.question_answer.models import (
ContentFeedbackDB,
@@ -1703,7 +1704,9 @@ async def mock_identify_language(
"""
query_refined.original_language = IdentifiedLanguage.ENGLISH
+ query_refined.original_script = IdentifiedScript.LATIN
response.debug_info["original_language"] = "ENGLISH"
+ response.debug_info["original_script"] = "LATIN"
return query_refined, response
diff --git a/core_backend/tests/api/test_question_answer.py b/core_backend/tests/api/test_question_answer.py
index 163e77574..43e9b7ef5 100644
--- a/core_backend/tests/api/test_question_answer.py
+++ b/core_backend/tests/api/test_question_answer.py
@@ -10,7 +10,10 @@
from fastapi import status
from fastapi.testclient import TestClient
-from core_backend.app.llm_call.llm_prompts import AlignmentScore, IdentifiedLanguage
+from core_backend.app.llm_call.llm_prompts import (
+ AlignmentScore,
+ IdentifiedLanguage,
+)
from core_backend.app.llm_call.process_input import (
_classify_safety,
_identify_language,
@@ -1045,20 +1048,25 @@ def user_query_refined(self, request: pytest.FixtureRequest) -> QueryRefined:
)
@pytest.mark.parametrize(
- "identified_lang_str,should_error,expected_error_type",
+ "identified_lang_str,identified_script_str,should_error,expected_error_type",
[
- ("ENGLISH", False, None),
- ("HINDI", False, None),
- ("UNINTELLIGIBLE", True, ErrorType.UNINTELLIGIBLE_INPUT),
- ("GIBBERISH", True, ErrorType.UNSUPPORTED_LANGUAGE),
- ("UNSUPPORTED", True, ErrorType.UNSUPPORTED_LANGUAGE),
- ("SOME_UNSUPPORTED_LANG", True, ErrorType.UNSUPPORTED_LANGUAGE),
- ("don't kow", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("ENGLISH", "LATIN", False, None),
+ ("HINDI", "DEVANAGARI", False, None),
+ ("UNINTELLIGIBLE", "LATIN", True, ErrorType.UNINTELLIGIBLE_INPUT),
+ ("UNINTELLIGIBLE", "UNKNOWN", True, ErrorType.UNINTELLIGIBLE_INPUT),
+ ("ENGLISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_SCRIPT),
+ ("ENGLISH", "Some unsupported script", True, ErrorType.UNSUPPORTED_SCRIPT),
+ ("GIBBERISH", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("GIBBERISH", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("UNSUPPORTED", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("SOME_UNSUPPORTED_LANG", "UNKNOWN", True, ErrorType.UNSUPPORTED_LANGUAGE),
+ ("don't kow", "LATIN", True, ErrorType.UNSUPPORTED_LANGUAGE),
],
)
async def test_language_identify_error(
self,
identified_lang_str: str,
+ identified_script_str: str,
should_error: bool,
expected_error_type: ErrorType,
monkeypatch: pytest.MonkeyPatch,
@@ -1084,6 +1092,7 @@ async def test_language_identify_error(
generate_llm_response=False,
generate_tts=False,
original_language=None,
+ original_script=None,
query_text="This is a basic query",
query_text_original="This is a query original",
workspace_id=124,
@@ -1104,10 +1113,12 @@ async def mock_ask_llm( # pylint: disable=W0613
Returns
-------
str
- The identified language string.
+ The identified language and script model json string.
"""
- return identified_lang_str
+ return f"""
+ {{"language": "{identified_lang_str}", "script": "{identified_script_str}"}}
+ """.strip()
monkeypatch.setattr(
"core_backend.app.llm_call.process_input._ask_llm_async", mock_ask_llm
@@ -1233,6 +1244,7 @@ async def mock_ask_llm( # pylint: disable=W0613
generate_llm_response=False,
generate_tts=False,
original_language=None,
+ original_script=None,
query_text="This is a basic query",
query_text_original="This is a query original",
workspace_id=124,
diff --git a/core_backend/tests/rails/data/language_identification.yaml b/core_backend/tests/rails/data/language_identification.yaml
index a4d3ddb34..4b28c20e2 100644
--- a/core_backend/tests/rails/data/language_identification.yaml
+++ b/core_backend/tests/rails/data/language_identification.yaml
@@ -2,59 +2,73 @@
# improve this with a native speaker. These might be too "pure".
HAUSA:
- - Ina da yara biyu masu hanci
- - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi
- - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa?
- - Menene wannan?
- - Sannun ku da zuwa #h/t: Fola from here on
- - Ni yarinya ne
- - Zo ka chi abunchi
- - Ina kwana Maman mu
- - Wannan shago na ne
+ LATIN:
+ - Ina da yara biyu masu hanci
+ - Jiya ina jin barci akan kujera yau kuma bayana yayi zafi
+ - Shin ya zama al'ada a gare ku don jin zafi a duk lokacin da kuka yi atishawa?
+ - Menene wannan?
+ - Sannun ku da zuwa #h/t: Fola from here on
+ - Ni yarinya ne
+ - Zo ka chi abunchi
+ - Ina kwana Maman mu
+ - Wannan shago na ne
ENGLISH:
- - I have two children. You see I girl, what is the probability the other is also a girl?
- - No idea
- - Why you say that?
+ LATIN:
+ - I have two children. You see I girl, what is the probability the other is also a girl?
+ - No idea
+ - Why you say that?
XHOSA:
- - Umama ngugqirha
- - Utata ngumongikazi
- - Ukuba intamo yam yayifuna ukwenza oko?
- - Iintsana zikhala kakhulu, huh?
+ LATIN:
+ - Umama ngugqirha
+ - Utata ngumongikazi
+ - Ukuba intamo yam yayifuna ukwenza oko?
+ - Iintsana zikhala kakhulu, huh?
YORUBA: #h/t: Fola
- - Ni bo ló ti ri owo yen?
- - Eyin melo ni e wa ni be?
- - Ki ni itumo oruko ẹ?
- - Ki ni o jẹ lánà?
- - Omo Ibadan ni mi
+ LATIN:
+ - Ni bo ló ti ri owo yen?
+ - Eyin melo ni e wa ni be?
+ - Ki ni itumo oruko ẹ?
+ - Ki ni o jẹ lánà?
+ - Omo Ibadan ni mi
IGBO: #h/t: Fola
- - agụụ na-agụ m
- - agam aga ahia echi
- - ị hụla ngozi? ana m achọ ya.
- - m na-aga ọrụ
+ LATIN:
+ - agụụ na-agụ m
+ - agam aga ahia echi
+ - ị hụla ngozi? ana m achọ ya.
+ - m na-aga ọrụ
KOREAN:
- - 애가 둘이예요
- - ㅋㅋㅋㅋㅋㅋ
- - 아이들이 많이 울어요ㅠ
- - 이 프로젝트 애칭은 ask-a-question이야.
+ KOREAN:
+ - 애가 둘이예요
+ - ㅋㅋㅋㅋㅋㅋ
+ - 아이들이 많이 울어요ㅠ
+ - 이 프로젝트 애칭은 ask-a-question이야.
ZULU:
- - Ngingumama
- - Ingabe uyi-bot noma ungumuntu?
- - Ngifuna ukwenza lokhu?
- - Izingane zikhala kakhulu, hhe
+ LATIN:
+ - Ngingumama
+ - Ingabe uyi-bot noma ungumuntu?
+ - Ngifuna ukwenza lokhu?
+ - Izingane zikhala kakhulu, hhe
AFRIKAANS:
- - Ek het hierdie goddelose dal gemaak
- - Is covid nog 'n ding?
- - My hond het my huiswerk geëet
- - Het jy al gebraaide roomys probeer?
+ LATIN:
+ - Ek het hierdie goddelose dal gemaak
+ - Is covid nog 'n ding?
+ - My hond het my huiswerk geëet
+ - Het jy al gebraaide roomys probeer?
HINDI: #h/t: Sid
- - is ka matlab kya hai?
- - kabhi kabhi mere dil mein
- - अंत में सभी लोग नाश्ता करने जाएं
- - गब्बर सिंह कह के गया जो डर गया वो मर गया
+ LATIN:
+ - is ka matlab kya hai?
+ - kabhi kabhi mere dil mein
+ DEVANAGARI:
+ - अंत में सभी लोग नाश्ता करने जाएं
+ - गब्बर सिंह कह के गया जो डर गया वो मर गया
+MARATHI:
+ LATIN:
+ - Portal chi link aahe
UNINTELLIGIBLE:
- - sdfsdf sss dyhnel jjj
- - hs dsfsg xd ewwo ddfs
- - Heghlu'meH QaQ jajvam
- - yIHuchQo', 'ej jIHvaD yIqemchu'mo'
- - \%^*# levels; 91011 AQGs!!!
- - 1234 AQI WHO? 5678
+ UNKNOWN:
+ - sdfsdf sss dyhnel jjj
+ - hs dsfsg xd ewwo ddfs
+ - Heghlu'meH QaQ jajvam
+ - yIHuchQo', 'ej jIHvaD yIqemchu'mo'
+ - \%^*# levels; 91011 AQGs!!!
+ - 1234 AQI WHO? 5678
diff --git a/core_backend/tests/rails/test_language_identification.py b/core_backend/tests/rails/test_language_identification.py
index 9b30b2e9a..6744d8216 100644
--- a/core_backend/tests/rails/test_language_identification.py
+++ b/core_backend/tests/rails/test_language_identification.py
@@ -5,7 +5,7 @@
import pytest
import yaml
-from core_backend.app.llm_call.llm_prompts import IdentifiedLanguage
+from core_backend.app.llm_call.llm_prompts import IdentifiedLanguage, IdentifiedScript
from core_backend.app.llm_call.process_input import _identify_language
from core_backend.app.question_answer.schemas import QueryRefined, QueryResponse
@@ -22,19 +22,38 @@ def available_languages() -> list[str]:
return list(IdentifiedLanguage)
-def read_test_data(file: str) -> list[tuple[str, str]]:
+@pytest.fixture(scope="module")
+def available_scripts() -> list[str]:
+ """Returns a list of available scripts."""
+
+ return list(IdentifiedScript)
+
+
+def read_test_data(file: str) -> list[tuple[str, str, str]]:
"""Reads test data from file and returns a list of strings."""
file_path = Path(__file__).parent / file
with open(file_path, "r", encoding="utf-8") as f:
content = yaml.safe_load(f)
- return [(key, value) for key, values in content.items() for value in values]
-
-
-@pytest.mark.parametrize("expected_label, content", read_test_data(LANGUAGE_FILE))
+ data = [
+ (language, script, text)
+ for language, script_dict in content.items()
+ for script, texts in script_dict.items()
+ for text in texts
+ ]
+ return data
+
+
+@pytest.mark.parametrize(
+ "expected_language,expected_script,content", read_test_data(LANGUAGE_FILE)
+)
async def test_language_identification(
- available_languages: list[str], expected_label: str, content: str
+ available_languages: list[str],
+ available_scripts: list[str],
+ expected_language: str,
+ expected_script: str,
+ content: str,
) -> None:
"""Test language identification."""
@@ -53,8 +72,15 @@ async def test_language_identification(
search_results=None,
session_id=None,
)
- if expected_label not in available_languages:
- expected_label = "UNSUPPORTED"
+
+ if expected_language not in available_languages:
+ expected_language = "UNSUPPORTED"
+
+ if expected_script not in available_scripts:
+ expected_script = "UNKNOWN"
+
_, response = await _identify_language(query_refined=question, response=response)
- assert response.debug_info["original_language"] == expected_label
+ assert response.debug_info["original_language"] == expected_language
+ if expected_language not in ("UNINTELLIGIBLE", "UNSUPPORTED"):
+ assert response.debug_info["original_script"] == expected_script