SimulatorML · w3ban · May 4, 2023 · May 9, 2023 · Jun 12, 2023 · Jun 12, 2023
diff --git a/.github/workflows/code_quality_pipline.yml b/.github/workflows/code_quality_pipeline.yml
rename from .github/workflows/code_quality_pipline.yml
rename to .github/workflows/code_quality_pipeline.yml
diff --git a/.gitignore b/.gitignore
@@ -105,7 +105,7 @@ ENV/
 .vscode/
 .idea/
 
-src/models/bert_classfier/artifacts
+src/models/bert_classifier/artifacts
 
 # .csv
 *.csv

diff --git a/README.md b/README.md
@@ -19,7 +19,7 @@ Developed by students of the [Simulator ML (Karpov.Courses)](https://karpov.cour
 
 ### Launch
 
-1) `git clone https://github.com/uberkinder/Nometa-Bot.git`
+1) `git clone https://github.com/SimulatorML/Nometa-Bot.git`
 2) Create a telegram bot and get a token through https://t.me/BotFather
 
 Choose a launch option and continue with the steps

diff --git a/exploration/data_parser.py b/exploration/data_parser.py
@@ -6,12 +6,24 @@
 import pandas as pd
 from telethon import TelegramClient
 
-API_ID = os.getenv("API_ID") # YOUR API_ID FROM https://my.telegram.org/apps
-API_HASH = os.getenv("API_HASH") # YOUR API_HASH FROM https://my.telegram.org/apps
+API_ID = os.getenv("API_ID")  # YOUR API_ID FROM https://my.telegram.org/apps
+API_HASH = os.getenv("API_HASH")  # YOUR API_HASH FROM https://my.telegram.org/apps
 
 
-def find_nometa_replies(data: dict):
-    """Finds all nometa_replies in json-like data"""
+def find_nometa_replies(data: dict) -> List[int]:
+    """
+    Finds all replies to messages containing 'nometa' in json-like data.
+
+    Parameters
+    ----------
+    data : dict
+        JSON-like data containing chat messages.
+
+    Returns
+    -------
+    List[int]
+        List of message IDs that are replies to 'nometa' messages.
+    """
     id_messages_with_nometa = []
     for info_message in data['messages']:
         message = str(info_message['text'])
@@ -26,7 +38,18 @@ def find_nometa_replies(data: dict):
 
 
 def exctract_nometa(data: dict, id_messages_with_nometa: List[int], path: str = 'data.csv'):
-    """Exctract all questions and converts it to .cvs"""
+    """
+    Extracts all questions and converts them to a .csv file.
+
+    Parameters
+    ----------
+    data : dict
+        JSON-like data containing chat messages.
+    id_messages_with_nometa : List[int]
+        List of message IDs that are replies to 'nometa' messages.
+    path : str, optional
+        Path for saving the CSV file, by default 'data.csv'.
+    """
     messages_with_nometa = pd.DataFrame(columns=['text'])
     for info_message in data['messages']:
         if info_message['id'] in id_messages_with_nometa:
@@ -40,7 +63,16 @@ def exctract_nometa(data: dict, id_messages_with_nometa: List[int], path: str =
 
 
 def parse_json(source: List[str] = None, path: str = 'data.csv'):
-    """Parse data from telegram chats using api/json"""
+    """
+    Parses data from telegram chats using JSON files and converts to CSV.
+
+    Parameters
+    ----------
+    source : List[str], optional
+        List of paths to JSON files. The default is None, but the list is iterated directly, so callers must pass one.
+    path : str, optional
+        Path for saving the CSV file, by default 'data.csv'.
+    """
     for file in source:
         with open(file, encoding="utf-8") as f:
             data = json.load(f)
@@ -51,11 +83,28 @@ def parse_json(source: List[str] = None, path: str = 'data.csv'):
 
 async def parse_api(source: List[str],
                     api_id: int,
-                    api_hash=str,
+                    api_hash: str,
                     path: str = "data.csv",
                     word: str = "nometa",
                     limit: int = 10000000):
-    """Parses data from telegram chats using TG API"""
+    """
+    Parses data from telegram chats using TG API and converts to CSV.
+
+    Parameters
+    ----------
+    source : List[str]
+        List of chat IDs.
+    api_id : int
+        Your Telegram API ID.
+    api_hash : str
+        Your Telegram API hash.
+    path : str, optional
+        Path for saving the CSV file, by default 'data.csv'.
+    word : str, optional
+        Keyword to search for in messages, by default 'nometa'.
+    limit : int, optional
+        Maximum number of messages to retrieve, by default 10000000.
+    """
     meta_messages_ids = []
     meta_messages = []
     async with TelegramClient('my', api_id, api_hash) as client:

diff --git a/src/app/bot.py b/src/app/bot.py
@@ -7,9 +7,16 @@
                            check_question_pattern,
                            check_question_with_tfidf_model)
 from src.app.constants import GROUP_MESSAGES
+from src.app.constants import PRIVATE_MESSAGES_POS
+from src.app.constants import PRIVATE_MESSAGES_NEG
 
-
+# Load your Bot Token and Channel ID from environment variables
 BOT_TOKEN = os.getenv("BOT_TOKEN")  # YOUR BOT_TOKEN FROM @BotFather
+try:
+    CHANNEL_ID = os.getenv("CHANNEL_ID")  # CHANNEL_ID FOR COLLECTING DATA
+except:
+    CHANNEL_ID = None
+# Create a Bot instance and a Dispatcher
 bot = Bot(token=BOT_TOKEN)
 dp = Dispatcher(bot)
 
@@ -20,30 +27,67 @@
 
 class BotMetaMessageChecker:
     """
-    TODO: add descriptions
+    A class to manage message checking and handling for the bot.
     """
+
     def start(self):
+        """
+        Start polling for new messages using the Dispatcher.
+        """
         executor.start_polling(dp, skip_updates=True)
 
 
+@dp.message_handler(commands=['start'])
+async def welcome(message: types.Message):
+    """
+    Handle the /start command.
+    """
+    if message.chat.type == "private":
+        await message.answer("Привет! Для того чтобы начать, просто напиши мне какой-нибудь вопрос или сообщение.")
+
+
 @dp.message_handler()
 async def check_message(message: types.Message):
     """
-    Функция обрабатывает сообщения пользователей.
-    В личном чате на каждое сообщение отвечает классом,
-    к которому оно относится:
-        - Сообщение без вопроса
-        - Мета-вопрос
-        - Обычный вопрос
-    В групповом чате отвечает только на мета-вопросы.
+    Process user messages and respond accordingly.
+
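+    In a private chat, always reply, choosing the text by the prediction:
+        - truthy prediction: a random PRIVATE_MESSAGES_POS reply (meta-question)
+        - falsy prediction: a random PRIVATE_MESSAGES_NEG reply (regular question)
+        - there is no separate branch for non-question messages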
+
+    In a group chat, only respond to meta-questions.
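+    Responds to help questions. Group messages with a photo are never flagged; if CHANNEL_ID is set, the info string is also sent to that channel.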
+
+    Parameters
+    ----------
+    message : types.Message
+        The incoming message to process.
     """
+
+    prediction, info = message_check(message.text)
+
     if message.chat.type == 'private':
-        if message_check(message.text):
-            await message.reply('Это мета-вопрос.')
+        if prediction:
+            await message.reply(
+                random.choice(PRIVATE_MESSAGES_POS), parse_mode='html'
+            )
         else:
-            await message.reply('Это обычный вопрос.')
+            await message.reply(
+                random.choice(PRIVATE_MESSAGES_NEG), parse_mode='html'
+            )
     else:
-        if message_check(message.text):
+        if message.photo:
+            prediction = 0
+        if prediction:
             await message.reply(
                 random.choice(GROUP_MESSAGES), parse_mode='html'
             )
+    if CHANNEL_ID:
+        await bot.send_message(chat_id=CHANNEL_ID, text=f"Message from user @{message.from_user.username}\n {info}")
+
+
+# Instantiate the BotMetaMessageChecker class and start polling
+if __name__ == '__main__':
+    bot_meta_checker = BotMetaMessageChecker()
+    bot_meta_checker.start()
diff --git a/src/app/constants.py b/src/app/constants.py
@@ -15,3 +15,17 @@
     'кто юзал',
     'можно задать вопрос',
 ]
+
+
+PRIVATE_MESSAGES_POS = [
+    """Это <a href='https://nometa.xyz/ru.html'>мета-вопрос</a>. Старайтесь задавать свои вопрос напрямую""",
+    """Это <a href='https://nometa.xyz/ru.html'>мета-вопрос</a>. Просто задайте свой вопрос напрямую.""",
+    """Это <a href='https://nometa.xyz/ru.html'>мета-вопрос</a>. Продолжайте задавать мне вопросы, я постараюсь помочь!"""
+]
+
+PRIVATE_MESSAGES_NEG = [
+    """Это обычный вопрос. Напоминаю, что я идентифицирую мета-вопросы. Пожалуйста, задайте мета-вопрос или другой вопрос, чтобы проверить мои возможности.""",
+    """Это обычный вопрос. Напоминаю, что моя задача — идентифицировать мета-вопросы. Пожалуйста, продолжайте тестирование.""",
+    """Это обычный вопрос. Я здесь для определения мета-вопросов. Пожалуйста, продолжайте проверку."""
+
+]
diff --git a/src/app/utils.py b/src/app/utils.py
@@ -1,33 +1,43 @@
+from typing import Tuple
+
 from src.app.constants import PATTERNS_META_QUESTIONS
 from src.models.tfidf_text_classifier.model import TfidfTextClassifier
-from src.models.bert_classfier.model import BertClassifier
-
+from src.models.bert_classifier.model import BertClassifier  # Updated import name
 
 def check_question_pattern(message: str) -> bool:
     """
-    Функция проверяет, является ли сообщение мета-вопросом.
+    Check if a message is a meta-question.
 
     Parameters
     ----------
     message : str
-        Сообщенние от пользователя.
+        User's message.
 
     Returns
     -------
     bool
-        True, если сообщение - мета вопрос.
-        False, если сообщение - обычный вопрос.
+        True if the message is a meta-question.
+        False if the message is a regular question.
     """
     message = message.lower()
     for meta_question in PATTERNS_META_QUESTIONS:
         if meta_question in message:
             return True
     return False
 
-
 def check_question_with_tfidf_model(message: str) -> bool:
     """
-    TODO: add descriptions
+    Check question using the TF-IDF model.
+
+    Parameters
+    ----------
+    message : str
+        User's message.
+
+    Returns
+    -------
+    bool
+        True if the message is classified as a question, otherwise False.
     """
     model = TfidfTextClassifier()
     model.load_model(
@@ -38,14 +48,35 @@ def check_question_with_tfidf_model(message: str) -> bool:
 
     return bool(prediction)
 
-
-def check_question_with_rubert_clf(message: str) -> bool:
+def check_question_with_rubert_clf(message: str) -> Tuple[bool, str]:
     """
-    TODO: add descriptions
+    Check question using the RuBERT classifier.
+
+    Parameters
+    ----------
+    message : str
+        User's message.
+
+    Returns
+    -------
+    Tuple[bool, str]
+        A tuple containing:
+        - The classifier's prediction as a bool; always False when the message has no "?" or is 10 characters or shorter.
+        - Information about the prediction and details of the classification.
     """
-    if len(message) > 20:
-        model = BertClassifier(model_path="../src/models/bert_classfier/artifacts")
+    if "?" in message and len(message) > 10:
+        model = BertClassifier(model_path="../src/models/bert_classifier/artifacts")  # Updated path
         prediction = model.predict(message)
+        score = model.predict_proba(message)
+        info = f"""
+Message: {message}\n
+Predict: {prediction}\n
+Logit: {score}\n
+Current threshold: {model.threshold}"""
     else:
         prediction = 0
-    return bool(prediction)
+        info = f"""
+Predict: {prediction}
+Note: {message} not a question
+"""
+    return bool(prediction), info
-Original file line number
+Diff line change
@@ Expand Up / @@ -105,7 +105,7 @@ ENV/ @@
     .vscode/
     .idea/
-    src/models/bert_classfier/artifacts
+    src/models/bert_classifier/artifacts
     # .csv
     *.csv
@@ Expand Down @@