ccprocessor · darkrush · Mar 10, 2025 · Mar 7, 2025 · Mar 7, 2025 · Mar 7, 2025
diff --git a/llm_web_kit/model/code_detector.py b/llm_web_kit/model/code_detector.py
@@ -7,12 +7,10 @@
 
 from llm_web_kit.config.cfg_reader import load_config
 from llm_web_kit.libs.logger import mylogger as logger
-from llm_web_kit.model.resource_utils.download_assets import (
-    CACHE_DIR, download_auto_file)
-from llm_web_kit.model.resource_utils.singleton_resource_manager import \
-    singleton_resource_manager
-from llm_web_kit.model.resource_utils.unzip_ext import (get_unzip_dir,
-                                                        unzip_local_file)
+from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
+                                              get_unzip_dir,
+                                              singleton_resource_manager,
+                                              unzip_local_file)
 
 
 class CodeClassification:
@@ -139,13 +137,17 @@
     if str_len > 10000:
         logger.warning('Content string is too long, truncate to 10000 characters')
         start_idx = (str_len - 10000) // 2
-        content_str = content_str[start_idx:start_idx + 10000]
+        content_str = content_str[start_idx : start_idx + 10000]
 
     # check if the content string contains latex environment
     if detect_latex_env(content_str):
-        logger.warning('Content string contains latex environment, may be misclassified')
+        logger.warning(
+            'Content string contains latex environment, may be misclassified'
+        )
 
-    def decide_code_by_prob_v3(predictions: Tuple[str], probabilities: Tuple[float]) -> float:
+    def decide_code_by_prob_v3(
+        predictions: Tuple[str], probabilities: Tuple[float]
+    ) -> float:
         idx = predictions.index('__label__1')
         true_prob = probabilities[idx]
         return true_prob
@@ -154,7 +156,9 @@
         predictions, probabilities = code_detect.predict(content_str)
         result = decide_code_by_prob_v3(predictions, probabilities)
     else:
-        raise ValueError(f'Unsupported version: {code_detect.version}. Supported versions: {[CODE_CL_SUPPORTED_VERSIONS]}')
+        raise ValueError(
+            f'Unsupported version: {code_detect.version}. Supported versions: {[CODE_CL_SUPPORTED_VERSIONS]}'
+        )
     return result
 
 

diff --git a/llm_web_kit/model/html_layout_cls.py b/llm_web_kit/model/html_layout_cls.py
@@ -4,10 +4,8 @@
 from llm_web_kit.config.cfg_reader import load_config
 from llm_web_kit.libs.logger import mylogger as logger
 from llm_web_kit.model.html_classify.model import Markuplm
-from llm_web_kit.model.resource_utils.download_assets import (
-    CACHE_DIR, download_auto_file)
-from llm_web_kit.model.resource_utils.unzip_ext import (get_unzip_dir,
-                                                        unzip_local_file)
+from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
+                                              get_unzip_dir, unzip_local_file)
 
 
 class HTMLLayoutClassifier:

diff --git a/llm_web_kit/model/lang_id.py b/llm_web_kit/model/lang_id.py
@@ -6,10 +6,8 @@
 
 from llm_web_kit.config.cfg_reader import load_config
 from llm_web_kit.libs.logger import mylogger as logger
-from llm_web_kit.model.resource_utils.download_assets import (
-    CACHE_DIR, download_auto_file)
-from llm_web_kit.model.resource_utils.singleton_resource_manager import \
-    singleton_resource_manager
+from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
+                                              singleton_resource_manager)
 
 language_dict = {
     'srp': 'sr', 'swe': 'sv', 'dan': 'da', 'ita': 'it', 'spa': 'es', 'pes': 'fa', 'slk': 'sk', 'hun': 'hu', 'bul': 'bg', 'cat': 'ca',

diff --git a/llm_web_kit/model/libgomp.so.1 b/llm_web_kit/model/libgomp.so.1
diff --git a/llm_web_kit/model/policical.py b/llm_web_kit/model/policical.py
@@ -2,30 +2,34 @@
 from typing import Any, Dict, Tuple
 
 import fasttext
-from transformers import AutoTokenizer
 
 from llm_web_kit.config.cfg_reader import load_config
 from llm_web_kit.exception.exception import ModelInputException
 from llm_web_kit.input.datajson import DataJson
 from llm_web_kit.libs.logger import mylogger as logger
-from llm_web_kit.model.resource_utils.download_assets import (
-    CACHE_DIR, download_auto_file)
-from llm_web_kit.model.resource_utils.singleton_resource_manager import \
-    singleton_resource_manager
-from llm_web_kit.model.resource_utils.unzip_ext import (get_unzip_dir,
-                                                        unzip_local_file)
+from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
+                                              get_unzip_dir,
+                                              singleton_resource_manager,
+                                              unzip_local_file)
 
 
 class PoliticalDetector:
 
     def __init__(self, model_path: str = None):
+        # Import AutoTokenizer locally instead of at module level: HF_HOME has to be set to
+        # CACHE_DIR first, and a top-level import placed after that assignment fails isort.
+        os.environ['HF_HOME'] = CACHE_DIR
+        from transformers import AutoTokenizer
+
         if not model_path:
             model_path = self.auto_download()
         model_bin_path = os.path.join(model_path, 'model.bin')
         tokenizer_path = os.path.join(model_path, 'internlm2-chat-20b')
 
         self.model = fasttext.load_model(model_bin_path)
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, trust_remote_code=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_path, use_fast=False, trust_remote_code=True
+        )
 
     def auto_download(self):
         """Default download the 24m7.zip model."""
@@ -46,15 +50,19 @@
             if not os.path.exists(zip_path):
                 logger.info(f'zip_path: {zip_path} does not exist')
                 logger.info(f'downloading {political_24m7_s3}')
-                zip_path = download_auto_file(political_24m7_s3, zip_path, political_24m7_md5)
+                zip_path = download_auto_file(
+                    political_24m7_s3, zip_path, political_24m7_md5
+                )
             logger.info(f'unzipping {zip_path}')
             unzip_path = unzip_local_file(zip_path, unzip_path)
         return unzip_path
 
     def predict(self, text: str) -> Tuple[str, float]:
         text = text.replace('\n', ' ')
         input_ids = self.tokenizer(text)['input_ids']
-        predictions, probabilities = self.model.predict(' '.join([str(i) for i in input_ids]), k=-1)
+        predictions, probabilities = self.model.predict(
+            ' '.join([str(i) for i in input_ids]), k=-1
+        )
 
         return predictions, probabilities
 
@@ -77,13 +85,17 @@
     return singleton_resource_manager.get_resource('political_detect')
 
 
-def decide_political_by_prob(predictions: Tuple[str], probabilities: Tuple[float]) -> float:
+def decide_political_by_prob(
+    predictions: Tuple[str], probabilities: Tuple[float]
+) -> float:
     idx = predictions.index('__label__normal')
     normal_score = probabilities[idx]
     return normal_score
 
 
-def decide_political_func(content_str: str, political_detect: PoliticalDetector) -> float:
+def decide_political_func(
+    content_str: str, political_detect: PoliticalDetector
+) -> float:
     # Limit the length of the content to 2560000
     content_str = content_str[:2560000]
     predictions, probabilities = political_detect.predict(content_str)
@@ -111,7 +123,9 @@
     test_cases.append('hello, nice to meet you!')
     test_cases.append('你好，唔該幫我一個忙？')
     test_cases.append('Bawo ni? Mo nife Yoruba. ')
-    test_cases.append('你好，我很高兴见到你，请多多指教！你今天吃饭了吗？hello, nice to meet you!')
+    test_cases.append(
+        '你好，我很高兴见到你，请多多指教！你今天吃饭了吗？hello, nice to meet you!'
+    )
     test_cases.append('איך בין אַ גרויסער פֿאַן פֿון די וויסנשאַפֿט. מיר האָבן פֿיל צו לערנען.')
     test_cases.append('გამარჯობა, როგორ ხარ? მე ვარ კარგად, მადლობა.')
     test_cases.append('გამარჯობა, როგორ ხართ? ეს ჩემი ქვეყანაა, საქართველო.')

diff --git a/llm_web_kit/model/porn_detector.py b/llm_web_kit/model/porn_detector.py
@@ -7,24 +7,28 @@
 
 from llm_web_kit.config.cfg_reader import load_config
 from llm_web_kit.libs.logger import mylogger as logger
-from llm_web_kit.model.resource_utils.download_assets import (
-    CACHE_DIR, download_auto_file)
-from llm_web_kit.model.resource_utils.unzip_ext import (get_unzip_dir,
-                                                        unzip_local_file)
+from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
+                                              get_unzip_dir, unzip_local_file)
 
 
-class BertModel():
+class BertModel:
     def __init__(self, model_path: str = None) -> None:
         if not model_path:
             model_path = self.auto_download()
-        self.model = AutoModelForSequenceClassification.from_pretrained(os.path.join(model_path, 'porn_classifier/classifier_hf'))
-        with open(os.path.join(model_path, 'porn_classifier/extra_parameters.json')) as reader:
+        self.model = AutoModelForSequenceClassification.from_pretrained(
+            os.path.join(model_path, 'porn_classifier/classifier_hf')
+        )
+        with open(
+            os.path.join(model_path, 'porn_classifier/extra_parameters.json')
+        ) as reader:
             model_config = json.load(reader)
 
         self.cls_index = int(model_config.get('cls_index', 1))
         self.use_sigmoid = bool(model_config.get('use_sigmoid', False))
         self.max_tokens = int(model_config.get('max_tokens', 512))
-        self.remain_tail = min(self.max_tokens - 1, int(model_config.get('remain_tail', -1)))
+        self.remain_tail = min(
+            self.max_tokens - 1, int(model_config.get('remain_tail', -1))
+        )
         self.device = model_config.get('device', 'cpu')
 
         self.model.eval()
@@ -33,7 +37,9 @@
         if hasattr(self.model, 'to_bettertransformer'):
             self.model = self.model.to_bettertransformer()
 
-        self.tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_path, 'porn_classifier/classifier_hf'))
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            os.path.join(model_path, 'porn_classifier/classifier_hf')
+        )
         self.tokenizer_config = {
             'padding': True,
             'truncation': self.remain_tail <= 0,
@@ -86,22 +92,36 @@
                 length = tokens_id.index(self.tokenizer.sep_token_id) + 1
                 # 如果tokens的长度小于等于max_tokens，则直接在尾部补0，不需要截断
                 if length <= self.max_tokens:
-                    tokens = tokens_id[:length] + [self.tokenizer.pad_token_id] * (self.max_tokens - length)
+                    tokens = tokens_id[:length] + [self.tokenizer.pad_token_id] * (
+                        self.max_tokens - length
+                    )
                     attn = [1] * length + [0] * (self.max_tokens - length)
                 # 如果tokens的长度大于max_tokens，则需要取头部max_tokens-remain_tail个tokens和尾部remain_tail个tokens
                 else:
                     head_length = self.max_tokens - self.remain_tail
                     tail_length = self.remain_tail
-                    tokens = tokens_id[:head_length] + tokens_id[length - tail_length : length]
+                    tokens = (
+                        tokens_id[:head_length]
+                        + tokens_id[length - tail_length : length]
+                    )
                     attn = [1] * self.max_tokens
 
                 # 将处理后的tokens添加到新的inputs列表中
-                processed_inputs.append({'input_ids': torch.tensor(tokens), 'attention_mask': torch.tensor(attn)})
+                processed_inputs.append(
+                    {
+                        'input_ids': torch.tensor(tokens),
+                        'attention_mask': torch.tensor(attn),
+                    }
+                )
 
             # 将所有inputs整合成一个batch
             inputs = {
-                'input_ids': torch.cat([inp['input_ids'].unsqueeze(0) for inp in processed_inputs]),
-                'attention_mask': torch.cat([inp['attention_mask'].unsqueeze(0) for inp in processed_inputs]),
+                'input_ids': torch.cat(
+                    [inp['input_ids'].unsqueeze(0) for inp in processed_inputs]
+                ),
+                'attention_mask': torch.cat(
+                    [inp['attention_mask'].unsqueeze(0) for inp in processed_inputs]
+                ),
             }
         inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}
         return {'inputs': inputs}

diff --git a/llm_web_kit/model/quality_model.py b/llm_web_kit/model/quality_model.py
@@ -19,10 +19,8 @@
     stats_html_entity, stats_ngram_mini, stats_punctuation_end_sentence,
     stats_stop_words, stats_unicode)
 from llm_web_kit.model.basic_functions.utils import div_zero
-from llm_web_kit.model.resource_utils.download_assets import (
-    CACHE_DIR, download_auto_file)
-from llm_web_kit.model.resource_utils.unzip_ext import (get_unzip_dir,
-                                                        unzip_local_file)
+from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
+                                              get_unzip_dir, unzip_local_file)
 
 _global_quality_model = {}
 _model_resource_map = {

diff --git a/llm_web_kit/model/resource_utils/__init__.py b/llm_web_kit/model/resource_utils/__init__.py
@@ -0,0 +1,6 @@
+from .download_assets import download_auto_file
+from .singleton_resource_manager import singleton_resource_manager
+from .unzip_ext import get_unzip_dir, unzip_local_file
+from .utils import CACHE_DIR, CACHE_TMP_DIR
+
+__all__ = ['download_auto_file', 'unzip_local_file', 'get_unzip_dir', 'CACHE_DIR', 'CACHE_TMP_DIR', 'singleton_resource_manager']