Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
288ffda
feat: 添加文件锁处理功能并优化文件删除逻辑
darkrush Mar 7, 2025
2ac2c53
Merge branch 'dev' of github.com:ccprocessor/llm-webkit-mirror into f…
darkrush Mar 7, 2025
78669e2
feat: 添加多进程文件处理测试用例,优化锁文件处理逻辑
darkrush Mar 7, 2025
6916243
feat: 添加临时缓存目录支持,优化文件下载和解压逻辑
darkrush Mar 7, 2025
c859c87
feat: 优化模型加载和下载逻辑,添加缓存目录环境变量支持
darkrush Mar 7, 2025
26b0a46
feat: 设置HF_HOME环境变量以支持Hugging Face缓存目录
darkrush Mar 7, 2025
d54b4f3
feat: 修复模型初始化中的tokenizer路径引用,确保正确加载
darkrush Mar 7, 2025
a43ce1a
feat: 重构资源管理,统一CACHE_DIR的引用,优化代码结构
darkrush Mar 10, 2025
ae5ba22
doc: 为下载功能添加注释和函数代码
darkrush Mar 10, 2025
f977d49
feat: 为文件处理函数添加详细文档注释,优化代码可读性
darkrush Mar 10, 2025
d655704
feat: 增强文件验证和解压功能,添加路径存在性检查以提高错误处理能力
darkrush Mar 10, 2025
3af5951
test: 增加文件校验测试用例,添加临时文件处理和异常情况测试
darkrush Mar 10, 2025
650aa09
feat: 重构资源管理,统一资源导入方式,优化代码结构
darkrush Mar 10, 2025
981d8e3
Merge branch 'dev' of github.com:ccprocessor/llm-webkit-mirror into f…
darkrush Mar 10, 2025
c2ed5ce
feat: 优化缓存目录创建,确保目录存在时不报错
darkrush Mar 10, 2025
18172c7
feat: 添加 libgomp.so.1 二进制文件
darkrush Mar 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions llm_web_kit/model/code_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,10 @@

from llm_web_kit.config.cfg_reader import load_config
from llm_web_kit.libs.logger import mylogger as logger
from llm_web_kit.model.resource_utils.download_assets import (
CACHE_DIR, download_auto_file)
from llm_web_kit.model.resource_utils.singleton_resource_manager import \
singleton_resource_manager
from llm_web_kit.model.resource_utils.unzip_ext import (get_unzip_dir,
unzip_local_file)
from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
get_unzip_dir,
singleton_resource_manager,
unzip_local_file)


class CodeClassification:
Expand Down Expand Up @@ -139,13 +137,17 @@
if str_len > 10000:
logger.warning('Content string is too long, truncate to 10000 characters')
start_idx = (str_len - 10000) // 2
content_str = content_str[start_idx:start_idx + 10000]
content_str = content_str[start_idx : start_idx + 10000]

Check warning on line 140 in llm_web_kit/model/code_detector.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/model/code_detector.py#L140

Added line #L140 was not covered by tests

# check if the content string contains latex environment
if detect_latex_env(content_str):
logger.warning('Content string contains latex environment, may be misclassified')
logger.warning(

Check warning on line 144 in llm_web_kit/model/code_detector.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/model/code_detector.py#L144

Added line #L144 was not covered by tests
'Content string contains latex environment, may be misclassified'
)

def decide_code_by_prob_v3(predictions: Tuple[str], probabilities: Tuple[float]) -> float:
def decide_code_by_prob_v3(
predictions: Tuple[str], probabilities: Tuple[float]
) -> float:
idx = predictions.index('__label__1')
true_prob = probabilities[idx]
return true_prob
Expand All @@ -154,7 +156,9 @@
predictions, probabilities = code_detect.predict(content_str)
result = decide_code_by_prob_v3(predictions, probabilities)
else:
raise ValueError(f'Unsupported version: {code_detect.version}. Supported versions: {[CODE_CL_SUPPORTED_VERSIONS]}')
raise ValueError(

Check warning on line 159 in llm_web_kit/model/code_detector.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/model/code_detector.py#L159

Added line #L159 was not covered by tests
f'Unsupported version: {code_detect.version}. Supported versions: {[CODE_CL_SUPPORTED_VERSIONS]}'
)
return result


Expand Down
6 changes: 2 additions & 4 deletions llm_web_kit/model/html_layout_cls.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@
from llm_web_kit.config.cfg_reader import load_config
from llm_web_kit.libs.logger import mylogger as logger
from llm_web_kit.model.html_classify.model import Markuplm
from llm_web_kit.model.resource_utils.download_assets import (
CACHE_DIR, download_auto_file)
from llm_web_kit.model.resource_utils.unzip_ext import (get_unzip_dir,
unzip_local_file)
from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
get_unzip_dir, unzip_local_file)


class HTMLLayoutClassifier:
Expand Down
6 changes: 2 additions & 4 deletions llm_web_kit/model/lang_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@

from llm_web_kit.config.cfg_reader import load_config
from llm_web_kit.libs.logger import mylogger as logger
from llm_web_kit.model.resource_utils.download_assets import (
CACHE_DIR, download_auto_file)
from llm_web_kit.model.resource_utils.singleton_resource_manager import \
singleton_resource_manager
from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
singleton_resource_manager)

language_dict = {
'srp': 'sr', 'swe': 'sv', 'dan': 'da', 'ita': 'it', 'spa': 'es', 'pes': 'fa', 'slk': 'sk', 'hun': 'hu', 'bul': 'bg', 'cat': 'ca',
Expand Down
Binary file added llm_web_kit/model/libgomp.so.1
Binary file not shown.
40 changes: 27 additions & 13 deletions llm_web_kit/model/policical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,34 @@
from typing import Any, Dict, Tuple

import fasttext
from transformers import AutoTokenizer

from llm_web_kit.config.cfg_reader import load_config
from llm_web_kit.exception.exception import ModelInputException
from llm_web_kit.input.datajson import DataJson
from llm_web_kit.libs.logger import mylogger as logger
from llm_web_kit.model.resource_utils.download_assets import (
CACHE_DIR, download_auto_file)
from llm_web_kit.model.resource_utils.singleton_resource_manager import \
singleton_resource_manager
from llm_web_kit.model.resource_utils.unzip_ext import (get_unzip_dir,
unzip_local_file)
from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
get_unzip_dir,
singleton_resource_manager,
unzip_local_file)


class PoliticalDetector:

def __init__(self, model_path: str = None):
# import AutoTokenizer here to avoid isort error
# must set the HF_HOME to the CACHE_DIR at this point
os.environ['HF_HOME'] = CACHE_DIR
from transformers import AutoTokenizer

if not model_path:
model_path = self.auto_download()
model_bin_path = os.path.join(model_path, 'model.bin')
tokenizer_path = os.path.join(model_path, 'internlm2-chat-20b')

self.model = fasttext.load_model(model_bin_path)
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, trust_remote_code=True)
self.tokenizer = AutoTokenizer.from_pretrained(
tokenizer_path, use_fast=False, trust_remote_code=True
)

def auto_download(self):
"""Default download the 24m7.zip model."""
Expand All @@ -46,15 +50,19 @@
if not os.path.exists(zip_path):
logger.info(f'zip_path: {zip_path} does not exist')
logger.info(f'downloading {political_24m7_s3}')
zip_path = download_auto_file(political_24m7_s3, zip_path, political_24m7_md5)
zip_path = download_auto_file(

Check warning on line 53 in llm_web_kit/model/policical.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/model/policical.py#L53

Added line #L53 was not covered by tests
political_24m7_s3, zip_path, political_24m7_md5
)
logger.info(f'unzipping {zip_path}')
unzip_path = unzip_local_file(zip_path, unzip_path)
return unzip_path

def predict(self, text: str) -> Tuple[str, float]:
text = text.replace('\n', ' ')
input_ids = self.tokenizer(text)['input_ids']
predictions, probabilities = self.model.predict(' '.join([str(i) for i in input_ids]), k=-1)
predictions, probabilities = self.model.predict(
' '.join([str(i) for i in input_ids]), k=-1
)

return predictions, probabilities

Expand All @@ -77,13 +85,17 @@
return singleton_resource_manager.get_resource('political_detect')


def decide_political_by_prob(predictions: Tuple[str], probabilities: Tuple[float]) -> float:
def decide_political_by_prob(
predictions: Tuple[str], probabilities: Tuple[float]
) -> float:
idx = predictions.index('__label__normal')
normal_score = probabilities[idx]
return normal_score


def decide_political_func(content_str: str, political_detect: PoliticalDetector) -> float:
def decide_political_func(
content_str: str, political_detect: PoliticalDetector
) -> float:
# Limit the length of the content to 2560000
content_str = content_str[:2560000]
predictions, probabilities = political_detect.predict(content_str)
Expand Down Expand Up @@ -111,7 +123,9 @@
test_cases.append('hello, nice to meet you!')
test_cases.append('你好,唔該幫我一個忙?')
test_cases.append('Bawo ni? Mo nife Yoruba. ')
test_cases.append('你好,我很高兴见到你,请多多指教!你今天吃饭了吗?hello, nice to meet you!')
test_cases.append(

Check warning on line 126 in llm_web_kit/model/policical.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/model/policical.py#L126

Added line #L126 was not covered by tests
'你好,我很高兴见到你,请多多指教!你今天吃饭了吗?hello, nice to meet you!'
)
test_cases.append('איך בין אַ גרויסער פֿאַן פֿון די וויסנשאַפֿט. מיר האָבן פֿיל צו לערנען.')
test_cases.append('გამარჯობა, როგორ ხარ? მე ვარ კარგად, მადლობა.')
test_cases.append('გამარჯობა, როგორ ხართ? ეს ჩემი ქვეყანაა, საქართველო.')
Expand Down
48 changes: 34 additions & 14 deletions llm_web_kit/model/porn_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,28 @@

from llm_web_kit.config.cfg_reader import load_config
from llm_web_kit.libs.logger import mylogger as logger
from llm_web_kit.model.resource_utils.download_assets import (
CACHE_DIR, download_auto_file)
from llm_web_kit.model.resource_utils.unzip_ext import (get_unzip_dir,
unzip_local_file)
from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
get_unzip_dir, unzip_local_file)


class BertModel():
class BertModel:
def __init__(self, model_path: str = None) -> None:
if not model_path:
model_path = self.auto_download()
self.model = AutoModelForSequenceClassification.from_pretrained(os.path.join(model_path, 'porn_classifier/classifier_hf'))
with open(os.path.join(model_path, 'porn_classifier/extra_parameters.json')) as reader:
self.model = AutoModelForSequenceClassification.from_pretrained(
os.path.join(model_path, 'porn_classifier/classifier_hf')
)
with open(
os.path.join(model_path, 'porn_classifier/extra_parameters.json')
) as reader:
model_config = json.load(reader)

self.cls_index = int(model_config.get('cls_index', 1))
self.use_sigmoid = bool(model_config.get('use_sigmoid', False))
self.max_tokens = int(model_config.get('max_tokens', 512))
self.remain_tail = min(self.max_tokens - 1, int(model_config.get('remain_tail', -1)))
self.remain_tail = min(
self.max_tokens - 1, int(model_config.get('remain_tail', -1))
)
self.device = model_config.get('device', 'cpu')

self.model.eval()
Expand All @@ -33,7 +37,9 @@
if hasattr(self.model, 'to_bettertransformer'):
self.model = self.model.to_bettertransformer()

self.tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_path, 'porn_classifier/classifier_hf'))
self.tokenizer = AutoTokenizer.from_pretrained(
os.path.join(model_path, 'porn_classifier/classifier_hf')
)
self.tokenizer_config = {
'padding': True,
'truncation': self.remain_tail <= 0,
Expand Down Expand Up @@ -86,22 +92,36 @@
length = tokens_id.index(self.tokenizer.sep_token_id) + 1
# 如果tokens的长度小于等于max_tokens,则直接在尾部补0,不需要截断
if length <= self.max_tokens:
tokens = tokens_id[:length] + [self.tokenizer.pad_token_id] * (self.max_tokens - length)
tokens = tokens_id[:length] + [self.tokenizer.pad_token_id] * (

Check warning on line 95 in llm_web_kit/model/porn_detector.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/model/porn_detector.py#L95

Added line #L95 was not covered by tests
self.max_tokens - length
)
attn = [1] * length + [0] * (self.max_tokens - length)
# 如果tokens的长度大于max_tokens,则需要取头部max_tokens-remain_tail个tokens和尾部remain_tail个tokens
else:
head_length = self.max_tokens - self.remain_tail
tail_length = self.remain_tail
tokens = tokens_id[:head_length] + tokens_id[length - tail_length : length]
tokens = (
tokens_id[:head_length]
+ tokens_id[length - tail_length : length]
)
attn = [1] * self.max_tokens

# 将处理后的tokens添加到新的inputs列表中
processed_inputs.append({'input_ids': torch.tensor(tokens), 'attention_mask': torch.tensor(attn)})
processed_inputs.append(
{
'input_ids': torch.tensor(tokens),
'attention_mask': torch.tensor(attn),
}
)

# 将所有inputs整合成一个batch
inputs = {
'input_ids': torch.cat([inp['input_ids'].unsqueeze(0) for inp in processed_inputs]),
'attention_mask': torch.cat([inp['attention_mask'].unsqueeze(0) for inp in processed_inputs]),
'input_ids': torch.cat(
[inp['input_ids'].unsqueeze(0) for inp in processed_inputs]
),
'attention_mask': torch.cat(
[inp['attention_mask'].unsqueeze(0) for inp in processed_inputs]
),
}
inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}
return {'inputs': inputs}
Expand Down
6 changes: 2 additions & 4 deletions llm_web_kit/model/quality_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,8 @@
stats_html_entity, stats_ngram_mini, stats_punctuation_end_sentence,
stats_stop_words, stats_unicode)
from llm_web_kit.model.basic_functions.utils import div_zero
from llm_web_kit.model.resource_utils.download_assets import (
CACHE_DIR, download_auto_file)
from llm_web_kit.model.resource_utils.unzip_ext import (get_unzip_dir,
unzip_local_file)
from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file,
get_unzip_dir, unzip_local_file)

_global_quality_model = {}
_model_resource_map = {
Expand Down
6 changes: 6 additions & 0 deletions llm_web_kit/model/resource_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from .download_assets import download_auto_file
from .singleton_resource_manager import singleton_resource_manager
from .unzip_ext import get_unzip_dir, unzip_local_file
from .utils import CACHE_DIR, CACHE_TMP_DIR

__all__ = ['download_auto_file', 'unzip_local_file', 'get_unzip_dir', 'CACHE_DIR', 'CACHE_TMP_DIR', 'singleton_resource_manager']
Loading