IDinsight · suzinyou · Apr 2, 2025 · Apr 2, 2025 · Apr 9, 2025 · Apr 9, 2025
@@ -348,57 +348,6 @@
         "line_number": 15
       }
     ],
-    "core_backend/tests/api/conftest.py": [
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "407c6798fe20fd5d75de4a233c156cc0fce510e3",
-        "is_verified": false,
-        "line_number": 46
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "42553e798bc193bcf25368b5e53ec7cd771483a7",
-        "is_verified": false,
-        "line_number": 47
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "9fb7fe1217aed442b04c0f5e43b5d5a7d3287097",
-        "is_verified": false,
-        "line_number": 50
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "767ef7376d44bb6e52b390ddcd12c1cb1b3902a4",
-        "is_verified": false,
-        "line_number": 51
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "70240b5d0947cc97447de496284791c12b2e678a",
-        "is_verified": false,
-        "line_number": 56
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "80fea3e25cb7e28550d13af9dfda7a9bd08c1a78",
-        "is_verified": false,
-        "line_number": 57
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "3465834d516797458465ae4ed2c62e7020032c4e",
-        "is_verified": false,
-        "line_number": 317
-      }
-    ],
     "core_backend/tests/api/test.env": [
       {
         "type": "Secret Keyword",
@@ -448,14 +397,14 @@
         "filename": "core_backend/tests/api/test_question_answer.py",
         "hashed_secret": "1d2be5ef28a76e2207456e7eceabe1219305e43d",
         "is_verified": false,
-        "line_number": 294
+        "line_number": 418
       },
       {
         "type": "Secret Keyword",
         "filename": "core_backend/tests/api/test_question_answer.py",
         "hashed_secret": "6367c48dd193d56ea7b0baad25b19455e529f5ee",
         "is_verified": false,
-        "line_number": 653
+        "line_number": 1018
       }
     ],
     "core_backend/tests/api/test_user_tools.py": [
@@ -473,7 +422,7 @@
         "filename": "core_backend/tests/rails/test_language_identification.py",
         "hashed_secret": "051b2c1d98174fabc4749641c4f4f4660556441e",
         "is_verified": false,
-        "line_number": 48
+        "line_number": 69
       }
     ],
     "core_backend/tests/rails/test_paraphrasing.py": [
@@ -581,5 +530,5 @@
       }
     ]
   },
-  "generated_at": "2025-01-24T13:35:08Z"
+  "generated_at": "2025-04-10T13:44:48Z"
 }
@@ -103,7 +103,7 @@
 {context}
 
 IMPORTANT NOTES ON THE "answer" FIELD:
-- Answer in the language of the question ({original_language}).
+- Answer in the language {original_language} in the script {original_script}.
 - Answer should be concise, to the point, and no longer than 80 words.
 - Do not include any information that is not present in the REFERENCE TEXT.
 """
@@ -182,6 +182,58 @@ class AlignmentScore(BaseModel):
     model_config = ConfigDict(strict=True)
 
 
+CHAT_RESPONSE_PROMPT = """\
+You are an AI assistant designed to help users with their \
+questions/concerns. You interact with users via a chat interface. You will \
+be provided with ADDITIONAL RELEVANT INFORMATION that can address the \
+user's questions/concerns.
+
+BEFORE answering the user's LATEST MESSAGE, follow these steps:
+
+1. Review the conversation history to ensure that you understand the \
+context in which the user's LATEST MESSAGE is being asked.
+2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you \
+understand the most useful information related to the user's LATEST \
+MESSAGE.
+
+When you have completed the above steps, you will then write a JSON, whose \
+TypeScript Interface is given below:
+
+interface Response {{
+    extracted_info: string[];
+    answer: string;
+}}
+
+For "extracted_info", extract from the provided ADDITIONAL RELEVANT \
+INFORMATION the most useful information related to the LATEST MESSAGE asked \
+by the user, and list them one by one. If no useful information is found, \
+return an empty list.
+
+For "answer", understand the conversation history, ADDITIONAL RELEVANT \
+INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to \
+the user's LATEST MESSAGE. If no useful information was found in the \
+either the conversation history or the ADDITIONAL RELEVANT INFORMATION, \
+respond with {failure_message}.
+
+EXAMPLE RESPONSES:
+{{"extracted_info": ["Pineapples are a blend of pinecones and apples.", \
+"Pineapples have the shape of a pinecone."], \
+"answer": "The 'pine-' from pineapples likely come from the fact that \
+pineapples are a hybrid of pinecones and apples and its pinecone-like \
+shape."}}
+{{"extracted_info": [], "answer": "{failure_message}"}}
+
+IMPORTANT NOTES ON THE "answer" FIELD:
+- Keep in mind that the user is asking a {message_type} question.
+- Answer in the language {original_language} in the script {original_script}.
+- Answer should be concise and to the point.
+- Do not include any information that is not present in the ADDITIONAL \
+RELEVANT INFORMATION.
+
+Only output the JSON response, without any additional text.\
+"""
+
+
 class ChatHistory:
     """Contains the prompts and models for the chat history task."""
 
@@ -216,7 +268,7 @@ class ChatHistory:
 
             {{
                 "message_type": "The type of the user's LATEST MESSAGE. List of valid
-                options are: {valid_message_types},
+                options are: {valid_message_types}",
                 "query": "The vector database query that you have constructed based on
                 the user's LATEST MESSAGE and the conversation history."
             }}
@@ -227,62 +279,7 @@ class ChatHistory:
         ),
         prompt_kws={"valid_message_types": _valid_message_types},
     )
-    system_message_generate_response = format_prompt(
-        prompt=textwrap.dedent(
-            """You are an AI assistant designed to help users with their
-            questions/concerns. You interact with users via a chat interface. You will
-            be provided with ADDITIONAL RELEVANT INFORMATION that can address the
-            user's questions/concerns.
-
-            BEFORE answering the user's LATEST MESSAGE, follow these steps:
-
-            1. Review the conversation history to ensure that you understand the
-            context in which the user's LATEST MESSAGE is being asked.
-            2. Review the provided ADDITIONAL RELEVANT INFORMATION to ensure that you
-            understand the most useful information related to the user's LATEST
-            MESSAGE.
-
-            When you have completed the above steps, you will then write a JSON, whose
-            TypeScript Interface is given below:
-
-            interface Response {{
-                extracted_info: string[];
-                answer: string;
-            }}
-
-            For "extracted_info", extract from the provided ADDITIONAL RELEVANT
-            INFORMATION the most useful information related to the LATEST MESSAGE asked
-            by the user, and list them one by one. If no useful information is found,
-            return an empty list.
-
-            For "answer", understand the conversation history, ADDITIONAL RELEVANT
-            INFORMATION, and the user's LATEST MESSAGE, and then provide an answer to
-            the user's LATEST MESSAGE. If no useful information was found in the
-            either the conversation history or the ADDITIONAL RELEVANT INFORMATION,
-            respond with {failure_message}.
-
-            EXAMPLE RESPONSES:
-            {{"extracted_info": [
-                "Pineapples are a blend of pinecones and apples.",
-                "Pineapples have the shape of a pinecone."
-                ],
-              "answer": "The 'pine-' from pineapples likely come from the fact that
-               pineapples are a hybrid of pinecones and apples and its pinecone-like
-               shape."
-            }}
-            {{"extracted_info": [], "answer": "{failure_message}"}}
-
-            IMPORTANT NOTES ON THE "answer" FIELD:
-            - Keep in mind that the user is asking a {message_type} question.
-            - Answer in the language of the question ({original_language}).
-            - Answer should be concise and to the point.
-            - Do not include any information that is not present in the ADDITIONAL
-            RELEVANT INFORMATION.
-
-            Only output the JSON response, without any additional text.
-            """
-        )
-    )
+    system_message_generate_response = CHAT_RESPONSE_PROMPT
 
     class ChatHistoryConstructSearchQuery(BaseModel):
         """Pydantic model for the output of the construct search query chat history."""
@@ -337,22 +334,21 @@ class IdentifiedLanguage(str, Enum):
     ENGLISH = "ENGLISH"
     FRENCH = "FRENCH"
     HINDI = "HINDI"
+    MARATHI = "MARATHI"
     SWAHILI = "SWAHILI"
     UNINTELLIGIBLE = "UNINTELLIGIBLE"
     UNSUPPORTED = "UNSUPPORTED"
+
     # XHOSA = "XHOSA"
     # ZULU = "ZULU"
-
     @classmethod
     def get_supported_languages(cls) -> list[str]:
         """Return a list of supported languages.
-
         Returns
         -------
         list[str]
             A list of supported languages.
         """
-
         return [
             lang
             for lang in cls._member_names_
@@ -377,31 +373,98 @@ def _missing_(cls, value: str) -> IdentifiedLanguage:  # type: ignore[override]
 
         return cls.UNSUPPORTED
 
+
+class IdentifiedScript(str, Enum):
+    """Script used in the user's input."""
+
+    LATIN = "LATIN"
+    DEVANAGARI = "DEVANAGARI"
+    BENGALI = "BENGALI"
+    TAMIL = "TAMIL"
+    TELUGU = "TELUGU"
+    KANNADA = "KANNADA"
+    MALAYALAM = "MALAYALAM"
+    GUJARATI = "GUJARATI"
+    # GURMUKHI = "GURMUKHI"
+    # ORIYA = "ORIYA"
+    # SINHALA = "SINHALA"
+    # MYANMAR = "MYANMAR"
+    # ETHIOPIC = "ETHIOPIC"
+    # GEORGIAN = "GEORGIAN"
+    # ARMENIAN = "ARMENIAN"
+    # HEBREW = "HEBREW"
+    # GREEK = "GREEK"
+    # TIBETAN = "TIBETAN"
+    # MONGOLIAN = "MONGOLIAN"
+    # KHMER = "KHMER"
+    # LAO = "LAO"
+    # VIETNAMESE = "VIETNAMESE"
+    # THAI_LAO = "THAI_LAO"
+    UNKNOWN = "UNKNOWN"  # Fallback member returned by _missing_ for any other value.
+
     @classmethod
-    def get_prompt(cls) -> str:
-        """Return the prompt for the language identification bot.
+    def get_supported_scripts(cls) -> list[str]:
+        """Return a list of supported scripts.
+        Returns
+        -------
+        list[str]
+            A list of supported scripts.
+        """
+        return [script for script in cls._member_names_ if script != "UNKNOWN"]
 
+    @classmethod
+    def _missing_(cls, value: str) -> IdentifiedScript:  # type: ignore[override]
+        """If script identified is not one of the supported scripts, it is
+        classified as UNKNOWN.
+        Parameters
+        ----------
+        value
+            The script identified.
         Returns
         -------
-        str
-            The prompt for the language identification bot.
+        IdentifiedScript
+            The identified script (i.e., UNKNOWN).
         """
+        return cls.UNKNOWN
 
-        return textwrap.dedent(
-            f"""
-            You are a high-performing language identification bot that classifies the
-            language of the user input into one of {", ".join(cls._member_names_)}.
-
-            If the user input is
-            1. in one of the supported languages, then respond with that language.
-            2. written in a mix of languages, then respond with the dominant language.
-            3. in a real language but not a supported language, then respond with
-            UNSUPPORTED.
-            4. unintelligible or gibberish, then respond with UNINTELLIGIBLE.
-
-            Answer should be a single word and strictly one of
-            [{", ".join(cls._member_names_)}]"""
-        ).strip()
+
+class LanguageIdentificationResponse(BaseModel):
+    """Pydantic model for the language identification response."""
+
+    language: IdentifiedLanguage  # One of the IdentifiedLanguage members.
+    script: IdentifiedScript  # One of the IdentifiedScript members.
+
+    model_config = ConfigDict(strict=True)  # Enable pydantic strict validation.
+
+
+LANGUAGE_ID_PROMPT = (
+    f"""\
+You are a high-performing language identification bot that classifies the \
+language and script of the user input.
+
+For each input, identify:
+1. The language (must be one of {", ".join(IdentifiedLanguage._member_names_)})
+2. The script (must be one of {", ".join(IdentifiedScript._member_names_)})
+
+If the user input is:
+1. in one of the supported languages, respond with that language and its script
+2. written in a mix of languages, respond with the dominant language and its script
+3. in a real language but not a supported language, respond with UNSUPPORTED and \
+its script
+4. unintelligible or gibberish, respond with UNINTELLIGIBLE and Latin"""
+    + """
+Examples:
+"How many beds are there?" -> {{"language": "ENGLISH", "script": "LATIN"}}
+"vahaan kitane bistar hain?" -> {{"language": "HINDI", "script": "LATIN"}}
+"वहाँ कितने बिस्तर हैं?" -> {{"language": "HINDI", "script": "DEVANAGARI"}}
+"Bonjour, comment allez-vous?" -> {{"language": "FRENCH", "script": "LATIN"}}
+"Jambo, habari gani?" -> {{"language": "SWAHILI", "script": "LATIN"}}
+"asdfjkl" -> {{"language": "UNINTELLIGIBLE", "script": "LATIN"}}
+"مرحبا كيف حالك" -> {{"language": "UNSUPPORTED", "script": "ARABIC"}}
+
+Respond with a JSON object containing "language" and "script" keys.
+"""
+)
 
 
 class RAG(BaseModel):