-
Notifications
You must be signed in to change notification settings - Fork 10
Improving model #18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Improving model #18
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,4 +19,9 @@ app/mirror_bot.py | |
| /config | ||
| .DS_store | ||
| .history | ||
| /.venv/ | ||
| /.venv/ | ||
| /bin | ||
| /lib | ||
| /lib64 | ||
| /share | ||
| pyvenv.cfg | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -62,7 +62,6 @@ def __init__(self): | |
| "name": "contains_special_characters", | ||
| "check": self._check_special_characters, | ||
| }, | ||
| {"name": "check_len_message", "check": self._check_len_message}, | ||
| { | ||
| "name": "contains_words_fuzzy_not_enough", | ||
| "check": self._check_words_fuzzy_not_enough, | ||
|
|
@@ -104,11 +103,11 @@ def predict(self, X): | |
| """ | ||
| logger.info("Predicting...") | ||
| pred_scores = [] | ||
| name_features = "" | ||
| name_features = [] | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Изменил, так как раньше возвращалась общая строка для всего датасета.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. О, спасибо. |
||
| for index in tqdm(range(len(X))): | ||
| message = X.iloc[index, :] | ||
| score, temp_name_features = self._predict_message(message) | ||
| name_features += temp_name_features | ||
| name_features.append(temp_name_features) | ||
| pred_scores.append(score) | ||
|
|
||
| return pred_scores, name_features | ||
|
|
@@ -125,11 +124,20 @@ def _predict_message(self, message): | |
| """ | ||
| total_score = 0.0 | ||
| name_features = "" | ||
| detected_features = set() | ||
| for rule in self.rules: | ||
| temp_score, temp_name_features = rule["check"](message) | ||
| detected_features.add(rule["name"]) if temp_score > 0 else None | ||
| total_score += temp_score | ||
| name_features += temp_name_features | ||
| total_score_normalized = self._normalize_score(total_score, threshold=1) | ||
| if len(message["text"].split()) < 2 and all( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ну тут опять же: на какого рода сообщения это нацелено? Прям конкретно усложняет код, а цели я особо не вижу пока. |
||
| x not in detected_features for x in ["contains_url", | ||
| "contains_telegram_link", | ||
| "contains_stop_word", | ||
| "contains_spam_word"]): | ||
| total_score_normalized = 0 | ||
| name_features = "[0.0] - Cообщение слишком короткое\n" | ||
|
|
||
| return total_score_normalized, name_features | ||
|
|
||
|
|
@@ -336,24 +344,6 @@ def _check_special_characters(self, message): | |
|
|
||
| return score, feature | ||
|
|
||
| def _check_len_message(self, message): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Из модели для валидации удалил правило, а в prod оставил; уже тогда и из продовой удалить это правило. |
||
| """ | ||
| Calculate the score for the length of the message. | ||
|
|
||
| Parameters: | ||
| message (dict): A dictionary containing the message text. | ||
|
|
||
| Returns: | ||
| float: The score for the length of the message. | ||
| """ | ||
| score = 0.0 | ||
| feature = "" | ||
| if len(message["text"]) < 5 and len(message["text"]) != 0: | ||
| score += 0.1 | ||
| feature = "[+0.1] - Сообщение чересчур короткое\n" | ||
|
|
||
| return score, feature | ||
|
|
||
| def _check_words_fuzzy_not_enough(self, message): | ||
| """ | ||
| Calculate the score for a given message based on the presence of words in the 'words_fuzzy_not_enough' list. | ||
|
|
@@ -428,6 +418,19 @@ def _contains_emoji(self, message): | |
| "\U0001F911", # Money-Mouth Face | ||
| "\U00002728", # Sparkles | ||
| "\U0001F6A8", # Police Cars Revolving Light | ||
| "\U000027a1", # Right Arrow | ||
| "\U00002B05", # Left Arrow | ||
| "\U0001F680", # Rocket | ||
| "\U0001F4E9", # Envelope with Arrow | ||
| "\U0001F525", # Fire | ||
| "\U0001F514", # Bell | ||
| "\U0001F5F3", # Ballot Box with Ballot | ||
| "\U0001F310", # Globe with Meridians | ||
| "\U0001F4B8", # Money with Wings | ||
| "\U0001F4B2", # Heavy Dollar Sign | ||
| "\U00002935", # Right Arrow Curving down | ||
| "\U0001F51E", # NO ONE UNDER EIGHTEEN SYMBOL | ||
| "\U0001F50D", # Left-Pointing Magnifying Glass | ||
| ] | ||
| emoji_pattern = re.compile("|".join(emojis)) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Мне кажется, если хочешь не штрафовать за короткие сообщения, то просто убери правило _len_msg.
Такие костыли усложняют код и нелогичны.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Сделал так, потому что короткие сообщения тоже могут быть спамовыми, e.g. тг-линк + картинка. Или сообщения из одного стоп-ворда. Однако в основном короткие сообщения — это ок, и штрафовать их не нужно. Ориентировался на трейн/тест/некоторые собранные примеры. Так убирались некоторые FP, однако явный спам все еще блокировался (не рос FN). Просто убирать правило коротких сообщений не приносит профита.