From 4fbab4bb8a1e852488c575753691a7adb48b1f3d Mon Sep 17 00:00:00 2001 From: huyc Date: Tue, 11 Mar 2025 21:04:23 +0800 Subject: [PATCH 1/2] revise --- docs/llm_web_kit/model/lang_id.md | 15 +++++++-------- llm_web_kit/model/lang_id.py | 15 +++++++-------- tests/llm_web_kit/model/test_lang_id.py | 6 +++--- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md index f8492313..91a1d1af 100644 --- a/docs/llm_web_kit/model/lang_id.md +++ b/docs/llm_web_kit/model/lang_id.md @@ -11,10 +11,6 @@ huggingface版本: "common":{ "cache_path": "~/.llm_web_kit_cache" }, - "lang-id-176": { - "download_path": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", - "md5": "01810bc59c6a3d2b79c79e6336612f65" - }, "lang-id-218": { "download_path": "https://huggingface.co/facebook/fasttext-language-identification/resolve/main/model.bin?download=true", "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" @@ -29,10 +25,6 @@ s3版本: "common":{ "cache_path": "~/.llm_web_kit_cache" }, - "lang-id-176": { - "download_path": "s3://web-parse-huawei/shared_resource/language/lid176.bin", - "md5": "01810bc59c6a3d2b79c79e6336612f65" - }, "lang-id-218": { "download_path": "s3://web-parse-huawei/shared_resource/language/lid218e.bin", "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" @@ -52,10 +44,17 @@ print(update_language_by_str(text)) ## 运行时间 使用单cpu进行推理 + 共有 2099 条数据 + 总 token 数: 379375 + 平均 token 数: 180.74 + 载入数据时间: 0.0214 秒 + 语言识别时间: 2.4313 秒 + 总时间: 2.4527 秒 + 处理速度: 863.33 条/秒 diff --git a/llm_web_kit/model/lang_id.py b/llm_web_kit/model/lang_id.py index 7b08c04f..e5e546bf 100644 --- a/llm_web_kit/model/lang_id.py +++ b/llm_web_kit/model/lang_id.py @@ -6,8 +6,10 @@ from llm_web_kit.config.cfg_reader import load_config from llm_web_kit.libs.logger import mylogger as logger -from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file, - singleton_resource_manager) +from llm_web_kit.model.resource_utils.download_assets import ( + CACHE_DIR, download_auto_file) +from llm_web_kit.model.resource_utils.singleton_resource_manager import \ + singleton_resource_manager language_dict = { 'srp': 'sr', 'swe': 'sv', 'dan': 'da', 'ita': 'it', 'spa': 'es', 'pes': 'fa', 'slk': 'sk', 'hun': 'hu', 'bul': 'bg', 'cat': 'ca', @@ -245,12 +247,9 @@ def decide_language_func(content_str: str, lang_detect: LanguageIdentification) language_details = None if lang_detect.version == '218.bin': first_pred = predictions[0] - match = re.match(r'^__label__([a-z]+)_[A-Za-z]+$', first_pred) - if match: - lang_code = match.group(1) - else: - lang_code = first_pred.replace('__label__', '').split('_')[0] - language_details = lang_code + # Extract the full label (e.g., __label__eng_Latn -> eng_Latn) + if first_pred.startswith('__label__'): + language_details = first_pred.replace('__label__', '') return { 'language': language, diff --git a/tests/llm_web_kit/model/test_lang_id.py b/tests/llm_web_kit/model/test_lang_id.py index 3ff5ea5d..2c1b1f96 100644 --- a/tests/llm_web_kit/model/test_lang_id.py +++ b/tests/llm_web_kit/model/test_lang_id.py @@ -131,7 +131,7 @@ def test_decide_language_func(): lang_detect.version = '218.bin' lang_detect.predict.return_value = (['__label__eng_Latn', '__label__zho_Hans'], [0.6, 0.4]) result = decide_language_func('test text', lang_detect) - assert result == {'language': 'en', 'language_details': 'eng'} + assert result == {'language': 'en', 'language_details': 'eng_Latn'} # Test for empty string result = decide_language_func('', lang_detect) @@ -144,7 +144,7 @@ def test_update_language_by_str(): # 设置模拟函数的返回值 mock_get_singleton_lang_detect.return_value = MagicMock() - mock_decide_language_func.return_value = {'language': 'en', 'language_details': 'eng'} + mock_decide_language_func.return_value = {'language': 'en', 'language_details': 'eng_Latn'} # 调用被测函数 result = update_language_by_str('test text') @@ -152,7 +152,7 @@ def test_update_language_by_str(): # 验证返回结果 expected_result = { 'language': 'en', - 'language_details': 'eng' + 'language_details': 'eng_Latn' } assert result == expected_result, f'Expected {expected_result}, but got {result}' print('Test passed!') From 8dadb063b7d15ec62cb4ae11bfb8a94394458560 Mon Sep 17 00:00:00 2001 From: huyc Date: Wed, 12 Mar 2025 21:23:50 +0800 Subject: [PATCH 2/2] revise cache_dir --- llm_web_kit/model/lang_id.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llm_web_kit/model/lang_id.py b/llm_web_kit/model/lang_id.py index e5e546bf..92121af7 100644 --- a/llm_web_kit/model/lang_id.py +++ b/llm_web_kit/model/lang_id.py @@ -6,10 +6,8 @@ from llm_web_kit.config.cfg_reader import load_config from llm_web_kit.libs.logger import mylogger as logger -from llm_web_kit.model.resource_utils.download_assets import ( - CACHE_DIR, download_auto_file) -from llm_web_kit.model.resource_utils.singleton_resource_manager import \ - singleton_resource_manager +from llm_web_kit.model.resource_utils import (CACHE_DIR, download_auto_file, + singleton_resource_manager) language_dict = { 'srp': 'sr', 'swe': 'sv', 'dan': 'da', 'ita': 'it', 'spa': 'es', 'pes': 'fa', 'slk': 'sk', 'hun': 'hu', 'bul': 'bg', 'cat': 'ca',