From 645163b285e125e02af5028ac5280a3a7e1a3d51 Mon Sep 17 00:00:00 2001 From: Artem Date: Wed, 28 Feb 2024 23:29:47 +0300 Subject: [PATCH 1/4] update gitignore --- .gitignore | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 074e251..fc93236 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,9 @@ app/mirror_bot.py /config .DS_store .history -/.venv/ \ No newline at end of file +/.venv/ +/bin +/lib +/lib64 +/share +pyvenv.cfg \ No newline at end of file From ae5a14c83ecb3b9ff8f03fd0585c7fe936bca3c9 Mon Sep 17 00:00:00 2001 From: Artem Date: Mon, 18 Mar 2024 18:19:00 +0300 Subject: [PATCH 2/4] Add vpn to gpt prompt --- prompts.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompts.yml b/prompts.yml index 711cd21..c4e73cb 100644 --- a/prompts.yml +++ b/prompts.yml @@ -6,7 +6,7 @@ spam_classification_prompt_karpov_courses: | Evaluation Criteria: 1. Message Content: - - Spam: Unsolicited ads or job offers, phishing links or Telegram-links (e.g, t.me/link, @link or telegra.ph/link), scams, adult content, religious content, messages encouraging private communication. + - Spam: Unsolicited ads or job offers, phishing links or Telegram-links (e.g, t.me/link, @link or telegra.ph/link), scams, adult content, religious content, messages encouraging private communication. Any questions or discussions about VPN. - Not Spam: Relevant discussions, code sharing, variety of questions and bug reports(including those about Karpov.Courses), help requests, providing assistance, links from recognized and reputable sources. Messages related to Karpov.Courses (e.g., course enrollment, link requests, course content questions) and 'karpov' related links or mentions (e.g., 'https://karpov.courses', 'https://lab.karpov.courses', '@karpov_anatoly') are always considered relevant and . - Language: Messages can be in Russian or English and may contain profanity. 
- Keep in mind that we encourage open discussions about different topics (e.g. techonology, economics) and also some messages might be out of context. From e65c2336d839312125bddc52c616c6ab5adae8f9 Mon Sep 17 00:00:00 2001 From: Artem Date: Wed, 20 Mar 2024 16:50:02 +0300 Subject: [PATCH 3/4] add more dangerous emoji --- src/models/rule_based_model_prod.py | 14 ++++++++++++++ src/models/rule_based_model_validation.py | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/src/models/rule_based_model_prod.py b/src/models/rule_based_model_prod.py index a123399..db0047d 100644 --- a/src/models/rule_based_model_prod.py +++ b/src/models/rule_based_model_prod.py @@ -396,6 +396,20 @@ def _contains_emoji(self, message): "\U0001F911", # Money-Mouth Face "\U00002728", # Sparkles "\U0001F6A8", # Police Cars Revolving Light + "\U000027a1", # Right Arrow + "\U00002B05", # Left Arrow + "\U0001F680", # Rocket + "\U0001F4E9", # Envelope with Arrow + "\U0001F525", # Fire + "\U0001F514", # Bell + "\U0001F60E", # Smiling Face with Sunglasses + "\U0001F5F3", # Ballot Box with Ballot + "\U0001F310", # Globe with Meridians + "\U0001F4B8", # Money with Wings + "\U0001F4B2", # Heavy Dollar Sign + "\U00002935", # Right Arrow Curving down + "\U0001F51E", # NO ONE UNDER EIGHTEEN SYMBOL + "\U0001F50D", # Left-Pointing Magnifying Glass ] emoji_pattern = re.compile("|".join(emojis)) diff --git a/src/models/rule_based_model_validation.py b/src/models/rule_based_model_validation.py index e849510..7aff1da 100644 --- a/src/models/rule_based_model_validation.py +++ b/src/models/rule_based_model_validation.py @@ -428,6 +428,20 @@ def _contains_emoji(self, message): "\U0001F911", # Money-Mouth Face "\U00002728", # Sparkles "\U0001F6A8", # Police Cars Revolving Light + "\U000027a1", # Right Arrow + "\U00002B05", # Left Arrow + "\U0001F680", # Rocket + "\U0001F4E9", # Envelope with Arrow + "\U0001F525", # Fire + "\U0001F514", # Bell + "\U0001F60E", # Smiling Face with Sunglasses + 
"\U0001F5F3", # Ballot Box with Ballot + "\U0001F310", # Globe with Meridians + "\U0001F4B8", # Money with Wings + "\U0001F4B2", # Heavy Dollar Sign + "\U00002935", # Right Arrow Curving down + "\U0001F51E", # NO ONE UNDER EIGHTEEN SYMBOL + "\U0001F50D", # Left-Pointing Magnifying Glass ] emoji_pattern = re.compile("|".join(emojis)) From cbbe594a4c3da4e2593e582b15ae38d8a3bbca20 Mon Sep 17 00:00:00 2001 From: Artem Date: Thu, 21 Mar 2024 23:59:53 +0300 Subject: [PATCH 4/4] update rules --- src/models/rule_based_model_prod.py | 11 ++++++-- src/models/rule_based_model_validation.py | 33 ++++++++--------------- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/src/models/rule_based_model_prod.py b/src/models/rule_based_model_prod.py index db0047d..f2665c8 100644 --- a/src/models/rule_based_model_prod.py +++ b/src/models/rule_based_model_prod.py @@ -64,7 +64,6 @@ def __init__(self): "name": "contains_special_characters", "check": self._check_special_characters, }, - {"name": "check_len_message", "check": self._check_len_message}, { "name": "contains_words_fuzzy_not_enough", "check": self._check_words_fuzzy_not_enough, @@ -93,11 +92,20 @@ def predict(self, X): logger.info("Predicting...") total_score = 0.0 name_features = "" + detected_features = set() for rule in self.rules: temp_score, temp_name_features = rule["check"](X.iloc[0, :]) + detected_features.add(rule["name"]) if temp_score > 0 else None total_score += temp_score name_features += temp_name_features total_score_normalized = self._normalize_score(total_score, threshold=1) + if len(X.iloc[0, :]["text"].split()) < 2 and all( + x not in detected_features for x in ["contains_url", + "contains_telegram_link", + "contains_stop_word", + "contains_spam_word"]): + total_score_normalized = 0 + name_features = "0.0 сообщение слишком короткое\n" return total_score_normalized, name_features @@ -402,7 +410,6 @@ def _contains_emoji(self, message): "\U0001F4E9", # Envelope with Arrow "\U0001F525", # Fire 
"\U0001F514", # Bell - "\U0001F60E", # Smiling Face with Sunglasses "\U0001F5F3", # Ballot Box with Ballot "\U0001F310", # Globe with Meridians "\U0001F4B8", # Money with Wings diff --git a/src/models/rule_based_model_validation.py b/src/models/rule_based_model_validation.py index 7aff1da..db2151d 100644 --- a/src/models/rule_based_model_validation.py +++ b/src/models/rule_based_model_validation.py @@ -62,7 +62,6 @@ def __init__(self): "name": "contains_special_characters", "check": self._check_special_characters, }, - {"name": "check_len_message", "check": self._check_len_message}, { "name": "contains_words_fuzzy_not_enough", "check": self._check_words_fuzzy_not_enough, @@ -104,11 +103,11 @@ def predict(self, X): """ logger.info("Predicting...") pred_scores = [] - name_features = "" + name_features = [] for index in tqdm(range(len(X))): message = X.iloc[index, :] score, temp_name_features = self._predict_message(message) - name_features += temp_name_features + name_features.append(temp_name_features) pred_scores.append(score) return pred_scores, name_features @@ -125,11 +124,20 @@ def _predict_message(self, message): """ total_score = 0.0 name_features = "" + detected_features = set() for rule in self.rules: temp_score, temp_name_features = rule["check"](message) + detected_features.add(rule["name"]) if temp_score > 0 else None total_score += temp_score name_features += temp_name_features total_score_normalized = self._normalize_score(total_score, threshold=1) + if len(message["text"].split()) < 2 and all( + x not in detected_features for x in ["contains_url", + "contains_telegram_link", + "contains_stop_word", + "contains_spam_word"]): + total_score_normalized = 0 + name_features = "[0.0] - Cообщение слишком короткое\n" return total_score_normalized, name_features @@ -336,24 +344,6 @@ def _check_special_characters(self, message): return score, feature - def _check_len_message(self, message): - """ - Calculate the score for the length of the message. 
- - Parameters: - message (dict): A dictionary containing the message text. - - Returns: - float: The score for the length of the message. - """ - score = 0.0 - feature = "" - if len(message["text"]) < 5 and len(message["text"]) != 0: - score += 0.1 - feature = "[+0.1] - Сообщение чересчур короткое\n" - - return score, feature - def _check_words_fuzzy_not_enough(self, message): """ Calculate the score for a given message based on the presence of words in the 'words_fuzzy_not_enough' list. @@ -434,7 +424,6 @@ def _contains_emoji(self, message): "\U0001F4E9", # Envelope with Arrow "\U0001F525", # Fire "\U0001F514", # Bell - "\U0001F60E", # Smiling Face with Sunglasses "\U0001F5F3", # Ballot Box with Ballot "\U0001F310", # Globe with Meridians "\U0001F4B8", # Money with Wings