diff --git a/.gitignore b/.gitignore index 074e251..fc93236 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,9 @@ app/mirror_bot.py /config .DS_store .history -/.venv/ \ No newline at end of file +/.venv/ +/bin +/lib +/lib64 +/share +pyvenv.cfg \ No newline at end of file diff --git a/prompts.yml b/prompts.yml index 711cd21..c4e73cb 100644 --- a/prompts.yml +++ b/prompts.yml @@ -6,7 +6,7 @@ spam_classification_prompt_karpov_courses: | Evaluation Criteria: 1. Message Content: - - Spam: Unsolicited ads or job offers, phishing links or Telegram-links (e.g, t.me/link, @link or telegra.ph/link), scams, adult content, religious content, messages encouraging private communication. + - Spam: Unsolicited ads or job offers, phishing links or Telegram-links (e.g, t.me/link, @link or telegra.ph/link), scams, adult content, religious content, messages encouraging private communication. Any questions or discussions about VPN. - Not Spam: Relevant discussions, code sharing, variety of questions and bug reports(including those about Karpov.Courses), help requests, providing assistance, links from recognized and reputable sources. Messages related to Karpov.Courses (e.g., course enrollment, link requests, course content questions) and 'karpov' related links or mentions (e.g., 'https://karpov.courses', 'https://lab.karpov.courses', '@karpov_anatoly') are always considered relevant and . - Language: Messages can be in Russian or English and may contain profanity. - Keep in mind that we encourage open discussions about different topics (e.g. techonology, economics) and also some messages might be out of context. diff --git a/src/models/rule_based_model_prod.py b/src/models/rule_based_model_prod.py index a123399..f2665c8 100644 --- a/src/models/rule_based_model_prod.py +++ b/src/models/rule_based_model_prod.py @@ -64,7 +64,6 @@ def __init__(self): "name": "contains_special_characters", "check": self._check_special_characters, }, - {"name": "check_len_message", "check": self._check_len_message}, { "name": "contains_words_fuzzy_not_enough", "check": self._check_words_fuzzy_not_enough, @@ -93,11 +92,20 @@ def predict(self, X): logger.info("Predicting...") total_score = 0.0 name_features = "" + detected_features = set() for rule in self.rules: temp_score, temp_name_features = rule["check"](X.iloc[0, :]) + detected_features.add(rule["name"]) if temp_score > 0 else None total_score += temp_score name_features += temp_name_features total_score_normalized = self._normalize_score(total_score, threshold=1) + if len(X.iloc[0, :]["text"].split()) < 2 and all( + x not in detected_features for x in ["contains_url", + "contains_telegram_link", + "contains_stop_word", + "contains_spam_word"]): + total_score_normalized = 0 + name_features = "0.0 сообщение слишком короткое\n" return total_score_normalized, name_features @@ -396,6 +404,19 @@ def _contains_emoji(self, message): "\U0001F911", # Money-Mouth Face "\U00002728", # Sparkles "\U0001F6A8", # Police Cars Revolving Light + "\U000027a1", # Right Arrow + "\U00002B05", # Left Arrow + "\U0001F680", # Rocket + "\U0001F4E9", # Envelope with Arrow + "\U0001F525", # Fire + "\U0001F514", # Bell + "\U0001F5F3", # Ballot Box with Ballot + "\U0001F310", # Globe with Meridians + "\U0001F4B8", # Money with Wings + "\U0001F4B2", # Heavy Dollar Sign + "\U00002935", # Right Arrow Curving down + "\U0001F51E", # NO ONE UNDER EIGHTEEN SYMBOL + "\U0001F50D", # Left-Pointing Magnifying Glass ] emoji_pattern = re.compile("|".join(emojis)) diff --git a/src/models/rule_based_model_validation.py b/src/models/rule_based_model_validation.py index e849510..db2151d 100644 --- a/src/models/rule_based_model_validation.py +++ b/src/models/rule_based_model_validation.py @@ -62,7 +62,6 @@ def __init__(self): "name": "contains_special_characters", "check": self._check_special_characters, }, - {"name": "check_len_message", "check": self._check_len_message}, { "name": "contains_words_fuzzy_not_enough", "check": self._check_words_fuzzy_not_enough, @@ -104,11 +103,11 @@ def predict(self, X): """ logger.info("Predicting...") pred_scores = [] - name_features = "" + name_features = [] for index in tqdm(range(len(X))): message = X.iloc[index, :] score, temp_name_features = self._predict_message(message) - name_features += temp_name_features + name_features.append(temp_name_features) pred_scores.append(score) return pred_scores, name_features @@ -125,11 +124,20 @@ def _predict_message(self, message): """ total_score = 0.0 name_features = "" + detected_features = set() for rule in self.rules: temp_score, temp_name_features = rule["check"](message) + detected_features.add(rule["name"]) if temp_score > 0 else None total_score += temp_score name_features += temp_name_features total_score_normalized = self._normalize_score(total_score, threshold=1) + if len(message["text"].split()) < 2 and all( + x not in detected_features for x in ["contains_url", + "contains_telegram_link", + "contains_stop_word", + "contains_spam_word"]): + total_score_normalized = 0 + name_features = "[0.0] - Cообщение слишком короткое\n" return total_score_normalized, name_features @@ -336,24 +344,6 @@ def _check_special_characters(self, message): return score, feature - def _check_len_message(self, message): - """ - Calculate the score for the length of the message. - - Parameters: - message (dict): A dictionary containing the message text. - - Returns: - float: The score for the length of the message. - """ - score = 0.0 - feature = "" - if len(message["text"]) < 5 and len(message["text"]) != 0: - score += 0.1 - feature = "[+0.1] - Сообщение чересчур короткое\n" - - return score, feature - def _check_words_fuzzy_not_enough(self, message): """ Calculate the score for a given message based on the presence of words in the 'words_fuzzy_not_enough' list. @@ -428,6 +418,19 @@ def _contains_emoji(self, message): "\U0001F911", # Money-Mouth Face "\U00002728", # Sparkles "\U0001F6A8", # Police Cars Revolving Light + "\U000027a1", # Right Arrow + "\U00002B05", # Left Arrow + "\U0001F680", # Rocket + "\U0001F4E9", # Envelope with Arrow + "\U0001F525", # Fire + "\U0001F514", # Bell + "\U0001F5F3", # Ballot Box with Ballot + "\U0001F310", # Globe with Meridians + "\U0001F4B8", # Money with Wings + "\U0001F4B2", # Heavy Dollar Sign + "\U00002935", # Right Arrow Curving down + "\U0001F51E", # NO ONE UNDER EIGHTEEN SYMBOL + "\U0001F50D", # Left-Pointing Magnifying Glass ] emoji_pattern = re.compile("|".join(emojis))