Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
6b837f2
add new answers (#17)
lzvsk May 4, 2023
069f97c
Pylint (#18)
lzvsk May 9, 2023
5217a83
Hotfixes (#23)
w3ban Jun 12, 2023
75fd967
:pencil: update: add less toxic messages
w3ban Jun 12, 2023
3273523
Merge branch 'main' into develop
w3ban Jun 12, 2023
12f8b0e
fix messages
Jun 12, 2023
e322fa6
Merge branch 'main' into develop
lzvsk Jun 12, 2023
c44e6f9
Model tuning (#27)
w3ban Jun 13, 2023
232b1d8
:wrench: fix: fix model parameters:
w3ban Jun 14, 2023
aa84734
Merge branch 'main' into develop
w3ban Jun 14, 2023
a6fb233
Inference fixes (#30)
w3ban Jul 5, 2023
a84bad9
Merge branch 'main' into develop
w3ban Jul 5, 2023
96ec13c
Predict collection (#33)
w3ban Aug 6, 2023
51841d9
:shirt: refactor: refactor code in app module
w3ban Aug 11, 2023
9abfec9
:shirt: refactor: refactor code in metrics module
w3ban Aug 11, 2023
c998ff5
:shirt: refactor: refactor code in app module
w3ban Aug 11, 2023
10e267b
:shirt: refactor: refactor code in bert_classifier module
w3ban Aug 11, 2023
87c471b
:shirt: refactor: refactor data parser
w3ban Aug 11, 2023
a8d0b90
:shirt: refactor: refactor in tfidf_classifier module
w3ban Aug 11, 2023
c553e02
:truck: update rename code quality pipeline
w3ban Aug 11, 2023
2c4061b
:truck: update rename bert_classifier folder
w3ban Aug 11, 2023
44e8e85
:wrench: fix conflicts
w3ban Aug 11, 2023
20d3ac3
:pencil: update .gitignorefile
w3ban Aug 11, 2023
48ffce1
:truck: rename: rename gitworkflow yml
w3ban Aug 11, 2023
48031f5
Code refactor (#36)
w3ban Aug 11, 2023
99a61a1
Bot improvement (#38)
w3ban Aug 15, 2023
0a171b5
:pencil: refactor: update bot functions and answers
w3ban Sep 7, 2023
fac43bc
:pencil: docs: update README.md
w3ban Sep 7, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ ENV/
.vscode/
.idea/

src/models/bert_classfier/artifacts
src/models/bert_classifier/artifacts

# .csv
*.csv
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Developed by students of the [Simulator ML (Karpov.Courses)](https://karpov.cour

### Launch

1) `git clone https://github.com/uberkinder/Nometa-Bot.git`
1) `git clone https://github.com/SimulatorML/Nometa-Bot.git`
2) Create a telegram bot and get a token through https://t.me/BotFather

Choose a launch option and continue with the steps
Expand Down
65 changes: 57 additions & 8 deletions exploration/data_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,24 @@
import pandas as pd
from telethon import TelegramClient

API_ID = os.getenv("API_ID") # YOUR API_ID FROM https://my.telegram.org/apps
API_HASH = os.getenv("API_HASH") # YOUR API_HASH FROM https://my.telegram.org/apps
API_ID = os.getenv("API_ID") # YOUR API_ID FROM https://my.telegram.org/apps
API_HASH = os.getenv("API_HASH") # YOUR API_HASH FROM https://my.telegram.org/apps


def find_nometa_replies(data: dict):
"""Finds all nometa_replies in json-like data"""
def find_nometa_replies(data: dict) -> List[int]:
"""
Finds all replies to messages containing 'nometa' in json-like data.

Parameters
----------
data : dict
JSON-like data containing chat messages.

Returns
-------
List[int]
List of message IDs that are replies to 'nometa' messages.
"""
id_messages_with_nometa = []
for info_message in data['messages']:
message = str(info_message['text'])
Expand All @@ -26,7 +38,18 @@ def find_nometa_replies(data: dict):


def exctract_nometa(data: dict, id_messages_with_nometa: List[int], path: str = 'data.csv'):
"""Exctract all questions and converts it to .cvs"""
"""
Extracts all questions and converts them to a .csv file.

Parameters
----------
data : dict
JSON-like data containing chat messages.
id_messages_with_nometa : List[int]
List of message IDs that are replies to 'nometa' messages.
path : str, optional
Path for saving the CSV file, by default 'data.csv'.
"""
messages_with_nometa = pd.DataFrame(columns=['text'])
for info_message in data['messages']:
if info_message['id'] in id_messages_with_nometa:
Expand All @@ -40,7 +63,16 @@ def exctract_nometa(data: dict, id_messages_with_nometa: List[int], path: str =


def parse_json(source: List[str] = None, path: str = 'data.csv'):
"""Parse data from telegram chats using api/json"""
"""
Parses data from telegram chats using JSON files and converts to CSV.

Parameters
----------
source : List[str], optional
List of paths to JSON files, by default None.
path : str, optional
Path for saving the CSV file, by default 'data.csv'.
"""
for file in source:
with open(file, encoding="utf-8") as f:
data = json.load(f)
Expand All @@ -51,11 +83,28 @@ def parse_json(source: List[str] = None, path: str = 'data.csv'):

async def parse_api(source: List[str],
api_id: int,
api_hash=str,
api_hash: str,
path: str = "data.csv",
word: str = "nometa",
limit: int = 10000000):
"""Parses data from telegram chats using TG API"""
"""
Parses data from telegram chats using TG API and converts to CSV.

Parameters
----------
source : List[str]
List of chat IDs.
api_id : int
Your Telegram API ID.
api_hash : str
Your Telegram API hash.
path : str, optional
Path for saving the CSV file, by default 'data.csv'.
word : str, optional
Keyword to search for in messages, by default 'nometa'.
limit : int, optional
Maximum number of messages to retrieve, by default 10000000.
"""
meta_messages_ids = []
meta_messages = []
async with TelegramClient('my', api_id, api_hash) as client:
Expand Down
70 changes: 57 additions & 13 deletions src/app/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,16 @@
check_question_pattern,
check_question_with_tfidf_model)
from src.app.constants import GROUP_MESSAGES
from src.app.constants import PRIVATE_MESSAGES_POS
from src.app.constants import PRIVATE_MESSAGES_NEG


# Load the bot token and the optional data-collection channel from the
# environment.
BOT_TOKEN = os.getenv("BOT_TOKEN")  # YOUR BOT_TOKEN FROM @BotFather
# os.getenv returns None for a missing variable instead of raising, so no
# try/except is needed; a falsy CHANNEL_ID is checked before any message is
# forwarded to the collection channel.
CHANNEL_ID = os.getenv("CHANNEL_ID")  # CHANNEL_ID FOR COLLECTING DATA
# Create a Bot instance and a Dispatcher
bot = Bot(token=BOT_TOKEN)
dp = Dispatcher(bot)

Expand All @@ -20,30 +27,67 @@

class BotMetaMessageChecker:
    """
    Thin wrapper that launches the bot.

    The message handlers are registered on the module-level Dispatcher;
    this class only exposes the call that starts polling.
    """

    def start(self) -> None:
        """
        Run the Dispatcher's polling loop.

        Updates that are already pending when polling starts are skipped
        (``skip_updates=True``).
        """
        executor.start_polling(dp, skip_updates=True)


@dp.message_handler(commands=['start'])
async def welcome(message: types.Message):
    """
    Greet the user in response to the /start command.

    The greeting is sent only in private chats; in any other chat type
    the command is ignored.

    Parameters
    ----------
    message : types.Message
        The incoming /start message.
    """
    if message.chat.type != "private":
        return
    await message.answer("Привет! Для того чтобы начать, просто напиши мне какой-нибудь вопрос или сообщение.")


@dp.message_handler()
async def check_message(message: types.Message):
"""
Функция обрабатывает сообщения пользователей.
В личном чате на каждое сообщение отвечает классом,
к которому оно относится:
- Сообщение без вопроса
- Мета-вопрос
- Обычный вопрос
В групповом чате отвечает только на мета-вопросы.
Process user messages and respond accordingly.

In a private chat, respond based on the type of question:
- Non-question message
- Meta-question
- Regular question

In a group chat, only respond to meta-questions.
Responds to help questions

Parameters
----------
message : types.Message
The incoming message to process.
"""

prediction, info = message_check(message.text)

if message.chat.type == 'private':
if message_check(message.text):
await message.reply('Это мета-вопрос.')
if prediction:
await message.reply(
random.choice(PRIVATE_MESSAGES_POS), parse_mode='html'
)
else:
await message.reply('Это обычный вопрос.')
await message.reply(
random.choice(PRIVATE_MESSAGES_NEG), parse_mode='html'
)
else:
if message_check(message.text):
if message.photo:
prediction = 0
if prediction:
await message.reply(
random.choice(GROUP_MESSAGES), parse_mode='html'
)
if CHANNEL_ID:
await bot.send_message(chat_id=CHANNEL_ID, text=f"Message from user @{message.from_user.username}\n {info}")


# Script entry point: when this module is executed directly, create the
# checker and start polling for updates.
if __name__ == '__main__':
    bot_meta_checker = BotMetaMessageChecker()
    bot_meta_checker.start()
14 changes: 14 additions & 0 deletions src/app/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,17 @@
'кто юзал',
'можно задать вопрос',
]


# Replies for private chats when the message is classified as a meta-question.
# The strings contain HTML links, so they must be sent with parse_mode='html'.
PRIVATE_MESSAGES_POS = [
    """Это <a href='https://nometa.xyz/ru.html'>мета-вопрос</a>. Старайтесь задавать свои вопросы напрямую""",
    """Это <a href='https://nometa.xyz/ru.html'>мета-вопрос</a>. Просто задайте свой вопрос напрямую.""",
    """Это <a href='https://nometa.xyz/ru.html'>мета-вопрос</a>. Продолжайте задавать мне вопросы, я постараюсь помочь!""",
]

# Replies for private chats when the message is NOT classified as a
# meta-question.
PRIVATE_MESSAGES_NEG = [
    """Это обычный вопрос. Напоминаю, что я идентифицирую мета-вопросы. Пожалуйста, задайте мета-вопрос или другой вопрос, чтобы проверить мои возможности.""",
    """Это обычный вопрос. Напоминаю, что моя задача — идентифицировать мета-вопросы. Пожалуйста, продолжайте тестирование.""",
    """Это обычный вопрос. Я здесь для определения мета-вопросов. Пожалуйста, продолжайте проверку.""",
]
59 changes: 45 additions & 14 deletions src/app/utils.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,43 @@
from typing import Tuple

from src.app.constants import PATTERNS_META_QUESTIONS
from src.models.tfidf_text_classifier.model import TfidfTextClassifier
from src.models.bert_classfier.model import BertClassifier

from src.models.bert_classifier.model import BertClassifier # Updated import name

def check_question_pattern(message: str) -> bool:
    """
    Tell whether a message matches one of the known meta-question patterns.

    The comparison is a plain substring search on the lower-cased message
    against every entry of PATTERNS_META_QUESTIONS.

    Parameters
    ----------
    message : str
        User's message.

    Returns
    -------
    bool
        True if at least one pattern occurs in the message.
        False otherwise.
    """
    lowered = message.lower()
    return any(pattern in lowered for pattern in PATTERNS_META_QUESTIONS)


def check_question_with_tfidf_model(message: str) -> bool:
"""
TODO: add descriptions
Check question using the TF-IDF model.

Parameters
----------
message : str
User's message.

Returns
-------
bool
True if the message is classified as a question, otherwise False.
"""
model = TfidfTextClassifier()
model.load_model(
Expand All @@ -38,14 +48,35 @@ def check_question_with_tfidf_model(message: str) -> bool:

return bool(prediction)


# Classifier instances keyed by artifact path, so the model is built once per
# process instead of once per incoming message.
_RUBERT_MODEL_CACHE = {}


def _get_rubert_model(model_path: str):
    """Return the BertClassifier for *model_path*, creating it on first use."""
    model = _RUBERT_MODEL_CACHE.get(model_path)
    if model is None:
        # NOTE(review): presumably the constructor loads the weights from
        # model_path, which is why the instance is reused -- confirm against
        # BertClassifier.
        model = BertClassifier(model_path=model_path)
        _RUBERT_MODEL_CACHE[model_path] = model
    return model


def check_question_with_rubert_clf(message: str) -> Tuple[bool, str]:
    """
    Check question using the RuBERT classifier.

    The model is consulted only for text that contains a question mark and
    is longer than 10 characters. Anything else, including a missing
    (non-string) text, is reported as negative without running the model.

    Parameters
    ----------
    message : str
        User's message.

    Returns
    -------
    Tuple[bool, str]
        A tuple containing:
        - The truthiness of the classifier's prediction (False when the
          model was not run).
        - A human-readable report: the message, prediction, score and the
          model's current threshold, or a note that the message was not
          treated as a question.
    """
    # isinstance guard: a non-text input would otherwise fail on the
    # `"?" in message` test with a TypeError.
    if isinstance(message, str) and "?" in message and len(message) > 10:
        # The path is relative, so it is resolved against the process's
        # working directory.
        model = _get_rubert_model("../src/models/bert_classifier/artifacts")
        prediction = model.predict(message)
        score = model.predict_proba(message)
        info = (
            f"\nMessage: {message}\n\n"
            f"Predict: {prediction}\n\n"
            f"Logit: {score}\n\n"
            f"Current threshold: {model.threshold}"
        )
    else:
        prediction = 0
        info = f"\nPredict: {prediction}\nNote: {message} not a question\n"
    return bool(prediction), info
Loading