Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,9 @@ app/mirror_bot.py
/config
.DS_store
.history
/.venv/
/.venv/
/bin
/lib
/lib64
/share
pyvenv.cfg
2 changes: 1 addition & 1 deletion prompts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ spam_classification_prompt_karpov_courses: |
Evaluation Criteria:

1. Message Content:
- Spam: Unsolicited ads or job offers, phishing links or Telegram-links (e.g, t.me/link, @link or telegra.ph/link), scams, adult content, religious content, messages encouraging private communication.
- Spam: Unsolicited ads or job offers, phishing links or Telegram-links (e.g, t.me/link, @link or telegra.ph/link), scams, adult content, religious content, messages encouraging private communication. Any questions or discussions about VPN.
- Not Spam: Relevant discussions, code sharing, variety of questions and bug reports(including those about Karpov.Courses), help requests, providing assistance, links from recognized and reputable sources. Messages related to Karpov.Courses (e.g., course enrollment, link requests, course content questions) and 'karpov' related links or mentions (e.g., 'https://karpov.courses', 'https://lab.karpov.courses', '@karpov_anatoly') are always considered relevant and <not-spam>.
- Language: Messages can be in Russian or English and may contain profanity.
- Keep in mind that we encourage open discussions about different topics (e.g. technology, economics) and also some messages might be out of context.
Expand Down
23 changes: 22 additions & 1 deletion src/models/rule_based_model_prod.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ def __init__(self):
"name": "contains_special_characters",
"check": self._check_special_characters,
},
{"name": "check_len_message", "check": self._check_len_message},
{
"name": "contains_words_fuzzy_not_enough",
"check": self._check_words_fuzzy_not_enough,
Expand Down Expand Up @@ -93,11 +92,20 @@ def predict(self, X):
logger.info("Predicting...")
total_score = 0.0
name_features = ""
detected_features = set()
for rule in self.rules:
temp_score, temp_name_features = rule["check"](X.iloc[0, :])
detected_features.add(rule["name"]) if temp_score > 0 else None
total_score += temp_score
name_features += temp_name_features
total_score_normalized = self._normalize_score(total_score, threshold=1)
if len(X.iloc[0, :]["text"].split()) < 2 and all(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

мне кажется если хочешь не штрафовать за короткие сообщения, то просто убери правило _len_msg

Такие костыли усложняют код и нелогичны

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Сделал так, потому что короткие сообщения тоже могут быть спамовыми, e.g. тг-линк + картинка. Или сообщения из одного стоп-ворда. Однако в основном короткие сообщения это ок и штрафовать их не нужно. Ориентировался на треин/тест/некоторые собранные примеры. Так убирались некоторые FP, однако явный спам все еще блокировался (не рос FN). Просто убирать правило коротких сообщений не приносит профита.

x not in detected_features for x in ["contains_url",
"contains_telegram_link",
"contains_stop_word",
"contains_spam_word"]):
total_score_normalized = 0
name_features = "0.0 сообщение слишком короткое\n"

return total_score_normalized, name_features

Expand Down Expand Up @@ -396,6 +404,19 @@ def _contains_emoji(self, message):
"\U0001F911", # Money-Mouth Face
"\U00002728", # Sparkles
"\U0001F6A8", # Police Cars Revolving Light
"\U000027a1", # Right Arrow
"\U00002B05", # Left Arrow
"\U0001F680", # Rocket
"\U0001F4E9", # Envelope with Arrow
"\U0001F525", # Fire
"\U0001F514", # Bell
"\U0001F5F3", # Ballot Box with Ballot
"\U0001F310", # Globe with Meridians
"\U0001F4B8", # Money with Wings
"\U0001F4B2", # Heavy Dollar Sign
"\U00002935", # Right Arrow Curving down
"\U0001F51E", # NO ONE UNDER EIGHTEEN SYMBOL
"\U0001F50D", # Left-Pointing Magnifying Glass
]
emoji_pattern = re.compile("|".join(emojis))

Expand Down
45 changes: 24 additions & 21 deletions src/models/rule_based_model_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ def __init__(self):
"name": "contains_special_characters",
"check": self._check_special_characters,
},
{"name": "check_len_message", "check": self._check_len_message},
{
"name": "contains_words_fuzzy_not_enough",
"check": self._check_words_fuzzy_not_enough,
Expand Down Expand Up @@ -104,11 +103,11 @@ def predict(self, X):
"""
logger.info("Predicting...")
pred_scores = []
name_features = ""
name_features = []
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Изменил, так как раньше возвращалась общая строка для всего датасета

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

О, спасибо

for index in tqdm(range(len(X))):
message = X.iloc[index, :]
score, temp_name_features = self._predict_message(message)
name_features += temp_name_features
name_features.append(temp_name_features)
pred_scores.append(score)

return pred_scores, name_features
Expand All @@ -125,11 +124,20 @@ def _predict_message(self, message):
"""
total_score = 0.0
name_features = ""
detected_features = set()
for rule in self.rules:
temp_score, temp_name_features = rule["check"](message)
detected_features.add(rule["name"]) if temp_score > 0 else None
total_score += temp_score
name_features += temp_name_features
total_score_normalized = self._normalize_score(total_score, threshold=1)
if len(message["text"].split()) < 2 and all(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ну тут опять так же

на какого рода сообщений это нацелено? прям конкретно усложняет код, а цели я особо не вижу пока

x not in detected_features for x in ["contains_url",
"contains_telegram_link",
"contains_stop_word",
"contains_spam_word"]):
total_score_normalized = 0
name_features = "[0.0] - Cообщение слишком короткое\n"

return total_score_normalized, name_features

Expand Down Expand Up @@ -336,24 +344,6 @@ def _check_special_characters(self, message):

return score, feature

def _check_len_message(self, message):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

из модели для валидации удалил правило, а в prod оставил

уже тогда и из продовой удалить это правило

"""
Calculate the score for the length of the message.

Parameters:
message (dict): A dictionary containing the message text.

Returns:
float: The score for the length of the message.
"""
score = 0.0
feature = ""
if len(message["text"]) < 5 and len(message["text"]) != 0:
score += 0.1
feature = "[+0.1] - Сообщение чересчур короткое\n"

return score, feature

def _check_words_fuzzy_not_enough(self, message):
"""
Calculate the score for a given message based on the presence of words in the 'words_fuzzy_not_enough' list.
Expand Down Expand Up @@ -428,6 +418,19 @@ def _contains_emoji(self, message):
"\U0001F911", # Money-Mouth Face
"\U00002728", # Sparkles
"\U0001F6A8", # Police Cars Revolving Light
"\U000027a1", # Right Arrow
"\U00002B05", # Left Arrow
"\U0001F680", # Rocket
"\U0001F4E9", # Envelope with Arrow
"\U0001F525", # Fire
"\U0001F514", # Bell
"\U0001F5F3", # Ballot Box with Ballot
"\U0001F310", # Globe with Meridians
"\U0001F4B8", # Money with Wings
"\U0001F4B2", # Heavy Dollar Sign
"\U00002935", # Right Arrow Curving down
"\U0001F51E", # NO ONE UNDER EIGHTEEN SYMBOL
"\U0001F50D", # Left-Pointing Magnifying Glass
]
emoji_pattern = re.compile("|".join(emojis))

Expand Down