diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md index f8492313..40857518 100644 --- a/docs/llm_web_kit/model/lang_id.md +++ b/docs/llm_web_kit/model/lang_id.md @@ -11,10 +11,6 @@ huggingface版本: "common":{ "cache_path": "~/.llm_web_kit_cache" }, - "lang-id-176": { - "download_path": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", - "md5": "01810bc59c6a3d2b79c79e6336612f65" - }, "lang-id-218": { "download_path": "https://huggingface.co/facebook/fasttext-language-identification/resolve/main/model.bin?download=true", "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" @@ -29,10 +25,6 @@ s3版本: "common":{ "cache_path": "~/.llm_web_kit_cache" }, - "lang-id-176": { - "download_path": "s3://web-parse-huawei/shared_resource/language/lid176.bin", - "md5": "01810bc59c6a3d2b79c79e6336612f65" - }, "lang-id-218": { "download_path": "s3://web-parse-huawei/shared_resource/language/lid218e.bin", "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" diff --git a/llm_web_kit/model/lang_id.py b/llm_web_kit/model/lang_id.py index a2e898ce..e5e546bf 100644 --- a/llm_web_kit/model/lang_id.py +++ b/llm_web_kit/model/lang_id.py @@ -247,12 +247,9 @@ def decide_language_func(content_str: str, lang_detect: LanguageIdentification) language_details = None if lang_detect.version == '218.bin': first_pred = predictions[0] - match = re.match(r'^__label__([a-z]+)_[A-Za-z]+$', first_pred) - if match: - lang_code = match.group(1) - else: - lang_code = first_pred.replace('__label__', '').split('_')[0] - language_details = lang_code + # Extract the full label (e.g., __label__eng_Latn -> eng_Latn) + if first_pred.startswith('__label__'): + language_details = first_pred.replace('__label__', '') return { 'language': language, diff --git a/tests/llm_web_kit/model/test_lang_id.py b/tests/llm_web_kit/model/test_lang_id.py index 3ff5ea5d..2c1b1f96 100644 --- a/tests/llm_web_kit/model/test_lang_id.py +++ b/tests/llm_web_kit/model/test_lang_id.py @@ -131,7 +131,7 @@ def test_decide_language_func(): lang_detect.version = '218.bin' lang_detect.predict.return_value = (['__label__eng_Latn', '__label__zho_Hans'], [0.6, 0.4]) result = decide_language_func('test text', lang_detect) - assert result == {'language': 'en', 'language_details': 'eng'} + assert result == {'language': 'en', 'language_details': 'eng_Latn'} # Test for empty string result = decide_language_func('', lang_detect) @@ -144,7 +144,7 @@ def test_update_language_by_str(): # 设置模拟函数的返回值 mock_get_singleton_lang_detect.return_value = MagicMock() - mock_decide_language_func.return_value = {'language': 'en', 'language_details': 'eng'} + mock_decide_language_func.return_value = {'language': 'en', 'language_details': 'eng_Latn'} # 调用被测函数 result = update_language_by_str('test text') @@ -152,7 +152,7 @@ def test_update_language_by_str(): # 验证返回结果 expected_result = { 'language': 'en', - 'language_details': 'eng' + 'language_details': 'eng_Latn' } assert result == expected_result, f'Expected {expected_result}, but got {result}' print('Test passed!')