SimulatorML · AmarskiyArtem · Feb 28, 2024 · Mar 18, 2024 · Mar 20, 2024 · Mar 21, 2024
diff --git a/.gitignore b/.gitignore
@@ -19,4 +19,9 @@ app/mirror_bot.py
 /config
 .DS_store
 .history
-/.venv/
+/.venv/
+/bin
+/lib
+/lib64
+/share
+pyvenv.cfg
diff --git a/prompts.yml b/prompts.yml
@@ -6,7 +6,7 @@ spam_classification_prompt_karpov_courses: |
   Evaluation Criteria:
 
   1. Message Content:
-  - Spam: Unsolicited ads or job offers, phishing links or Telegram-links (e.g, t.me/link, @link or telegra.ph/link), scams, adult content, religious content, messages encouraging private communication.
+  - Spam: Unsolicited ads or job offers, phishing links or Telegram-links (e.g., t.me/link, @link or telegra.ph/link), scams, adult content, religious content, messages encouraging private communication, and any questions or discussions about VPNs.
   - Not Spam: Relevant discussions, code sharing, variety of questions and bug reports(including those about Karpov.Courses), help requests, providing assistance, links from recognized and reputable sources. Messages related to Karpov.Courses (e.g., course enrollment, link requests, course content questions) and 'karpov' related links or mentions (e.g., 'https://karpov.courses', 'https://lab.karpov.courses', '@karpov_anatoly') are always considered relevant and <not-spam>.
   - Language: Messages can be in Russian or English and may contain profanity.
   - Keep in mind that we encourage open discussions about different topics (e.g. techonology, economics) and also some messages might be out of context.

diff --git a/src/models/rule_based_model_prod.py b/src/models/rule_based_model_prod.py
@@ -64,7 +64,6 @@ def __init__(self):
                 "name": "contains_special_characters",
                 "check": self._check_special_characters,
             },
-            {"name": "check_len_message", "check": self._check_len_message},
             {
                 "name": "contains_words_fuzzy_not_enough",
                 "check": self._check_words_fuzzy_not_enough,
@@ -93,11 +92,20 @@ def predict(self, X):
         logger.info("Predicting...")
         total_score = 0.0
         name_features = ""
+        detected_features = set()
         for rule in self.rules:
             temp_score, temp_name_features = rule["check"](X.iloc[0, :])
+            detected_features.add(rule["name"]) if temp_score > 0 else None
             total_score += temp_score
             name_features += temp_name_features
         total_score_normalized = self._normalize_score(total_score, threshold=1)
+        if len(X.iloc[0, :]["text"].split()) < 2 and all(
+            x not in detected_features for x in ["contains_url", 
+                                             "contains_telegram_link",
+                                             "contains_stop_word",
+                                             "contains_spam_word"]):
+            total_score_normalized = 0
+            name_features = "0.0 сообщение слишком короткое\n"
 
         return total_score_normalized, name_features
 
@@ -396,6 +404,19 @@ def _contains_emoji(self, message):
             "\U0001F911",  # Money-Mouth Face
             "\U00002728",  # Sparkles
             "\U0001F6A8",  # Police Cars Revolving Light
+            "\U000027a1",  # Right Arrow
+            "\U00002B05",  # Left Arrow
+            "\U0001F680",  # Rocket
+            "\U0001F4E9",  # Envelope with Arrow
+            "\U0001F525",  # Fire
+            "\U0001F514",  # Bell
+            "\U0001F5F3",  # Ballot Box with Ballot
+            "\U0001F310",  # Globe with Meridians
+            "\U0001F4B8",  # Money with Wings
+            "\U0001F4B2",  # Heavy Dollar Sign
+            "\U00002935",  # Right Arrow Curving down
+            "\U0001F51E",  # NO ONE UNDER EIGHTEEN SYMBOL
+            "\U0001F50D",  # Left-Pointing Magnifying Glass
         ]
         emoji_pattern = re.compile("|".join(emojis))
 

diff --git a/src/models/rule_based_model_validation.py b/src/models/rule_based_model_validation.py
@@ -62,7 +62,6 @@ def __init__(self):
                 "name": "contains_special_characters",
                 "check": self._check_special_characters,
             },
-            {"name": "check_len_message", "check": self._check_len_message},
             {
                 "name": "contains_words_fuzzy_not_enough",
                 "check": self._check_words_fuzzy_not_enough,
@@ -104,11 +103,11 @@ def predict(self, X):
         """
         logger.info("Predicting...")
         pred_scores = []
-        name_features = ""
+        name_features = []
         for index in tqdm(range(len(X))):
             message = X.iloc[index, :]
             score, temp_name_features = self._predict_message(message)
-            name_features += temp_name_features
+            name_features.append(temp_name_features)
             pred_scores.append(score)
 
         return pred_scores, name_features
@@ -125,11 +124,20 @@ def _predict_message(self, message):
         """
         total_score = 0.0
         name_features = ""
+        detected_features = set()
         for rule in self.rules:
             temp_score, temp_name_features = rule["check"](message)
+            detected_features.add(rule["name"]) if temp_score > 0 else None
             total_score += temp_score
             name_features += temp_name_features
         total_score_normalized = self._normalize_score(total_score, threshold=1)
+        if len(message["text"].split()) < 2 and all(
+            x not in detected_features for x in ["contains_url", 
+                                             "contains_telegram_link",
+                                             "contains_stop_word",
+                                             "contains_spam_word"]):
+            total_score_normalized = 0
+            name_features = "[0.0] - Cообщение слишком короткое\n"
 
         return total_score_normalized, name_features
 
@@ -336,24 +344,6 @@ def _check_special_characters(self, message):
 
         return score, feature
 
-    def _check_len_message(self, message):
-        """
-        Calculate the score for the length of the message.
-
-        Parameters:
-            message (dict): A dictionary containing the message text.
-
-        Returns:
-            float: The score for the length of the message.
-        """
-        score = 0.0
-        feature = ""
-        if len(message["text"]) < 5 and len(message["text"]) != 0:
-            score += 0.1
-            feature = "[+0.1] - Сообщение чересчур короткое\n"
-
-        return score, feature
-
     def _check_words_fuzzy_not_enough(self, message):
         """
         Calculate the score for a given message based on the presence of words in the 'words_fuzzy_not_enough' list.
@@ -428,6 +418,19 @@ def _contains_emoji(self, message):
             "\U0001F911",  # Money-Mouth Face
             "\U00002728",  # Sparkles
             "\U0001F6A8",  # Police Cars Revolving Light
+            "\U000027a1",  # Right Arrow
+            "\U00002B05",  # Left Arrow
+            "\U0001F680",  # Rocket
+            "\U0001F4E9",  # Envelope with Arrow
+            "\U0001F525",  # Fire
+            "\U0001F514",  # Bell
+            "\U0001F5F3",  # Ballot Box with Ballot
+            "\U0001F310",  # Globe with Meridians
+            "\U0001F4B8",  # Money with Wings
+            "\U0001F4B2",  # Heavy Dollar Sign
+            "\U00002935",  # Right Arrow Curving down
+            "\U0001F51E",  # NO ONE UNDER EIGHTEEN SYMBOL
+            "\U0001F50D",  # Left-Pointing Magnifying Glass
         ]
         emoji_pattern = re.compile("|".join(emojis))