From 51e0f866d44039513538c122f969ee95d6845615 Mon Sep 17 00:00:00 2001 From: huyc Date: Wed, 22 Jan 2025 12:20:12 +0800 Subject: [PATCH 01/22] fix:test --- llm_web_kit/model/lang_id.py | 103 +++++++++++++++++++++++++---------- 1 file changed, 74 insertions(+), 29 deletions(-) diff --git a/llm_web_kit/model/lang_id.py b/llm_web_kit/model/lang_id.py index dbe8e0d0..92c20f67 100644 --- a/llm_web_kit/model/lang_id.py +++ b/llm_web_kit/model/lang_id.py @@ -11,7 +11,30 @@ from llm_web_kit.model.resource_utils.singleton_resource_manager import \ singleton_resource_manager - +language_dict = { + 'srp': 'sr', 'swe': 'sv', 'dan': 'da', 'ita': 'it', 'spa': 'es', 'pes': 'fa', 'slk': 'sk', 'hun': 'hu', 'bul': 'bg', 'cat': 'ca', + 'tur': 'tr', 'ell': 'el', 'eng': 'en', 'nob': 'no', 'fra': 'fr', 'rus': 'ru', 'hrv': 'hr', 'nld': 'nl', 'ind': 'id', 'hye': 'hy', + 'heb': 'he', 'ceb': 'ceb', 'ron': 'ro', 'pol': 'pl', 'kor': 'ko', 'vie': 'vi', 'deu': 'de', 'slv': 'sl', 'por': 'pt', 'ces': 'cs', + 'ukr': 'uk', 'fin': 'fi', 'arb': 'ar', 'tgl': 'tl', 'afr': 'af', 'est': 'et', 'war': 'war', 'zul': 'zu', 'lit': 'lt', 'ilo': 'ilo', + 'kat': 'ka', 'hin': 'hi', 'mkd': 'mk', 'swh': 'sw', 'epo': 'eo', 'sot': 'st', 'tsn': 'tn', 'xho': 'xh', 'lvs': 'lv', 'als': 'als', + 'tso': 'ts', 'kaz': 'kk', 'sna': 'sn', 'amh': 'am', 'zsm': 'ms', 'tha': 'th', 'tah': 'ty', 'nso': 'nso', 'ewe': 'ee', 'urd': 'ur', + 'isl': 'is', 'lin': 'ln', 'bis': 'bi', 'twi': 'tw', 'sin': 'si', 'ben': 'bn', 'mya': 'my', 'plt': 'mg', 'pan': 'pa', 'azj': 'az', + 'guj': 'gu', 'glg': 'gl', 'kir': 'ky', 'tel': 'te', 'tpi': 'tpi', 'ibo': 'ig', 'tam': 'ta', 'tat': 'tt', 'bem': 'bem', 'bel': 'be', + 'kin': 'rw', 'npi': 'ne', 'pap': 'pap', 'mar': 'mr', 'smo': 'sm', 'run': 'rn', 'che': 'ce', 'fij': 'fj', 'tir': 'ti', 'ast': 'ast', + 'kan': 'kn', 'mlt': 'mt', 'yor': 'yo', 'eus': 'eu', 'lua': 'lua', 'pag': 'pag', 'sag': 'sg', 'oss': 'os', 'khk': 'mn', 'tum': 'tum', + 'tgk': 'tg', 'lug': 'lg', 'mal': 'ml', 'umb': 'umb', 'hat': 'ht', 'kon': 'kg', 'azb': 'azb', 'hau': 'ha', 'mos': 'mos', 'kal': 'kl', + 'nno': 'nn', 'lus': 'lus', 'oci': 'oc', 'bos': 'bs', 'gaz': 'gaz', 'bak': 'ba', 'chv': 'cv', 'cym': 'cy', 'tuk': 'tk', 'luo': 'luo', + 'ayr': 'ay', 'ssw': 'ss', 'quy': 'qu', 'uzn': 'uz', 'kik': 'ki', 'kmb': 'kmb', 'jav': 'jv', 'ltz': 'lb', 'asm': 'as', 'ton': 'to', + 'nya': 'ny', 'kam': 'kam', 'ckb': 'ckb', 'min': 'min', 'bod': 'bo', 'lmo': 'lmo', 'gle': 'ga', 'sun': 'su', 'xmf': 'xmf', 'cjk': 'cjk', + 'nia': 'nia', 'kbp': 'kbp', 'ory': 'or', 'fon': 'fon', 'kmr': 'ku', 'khm': 'km', 'ydd': 'yi', 'abk': 'ab', 'san': 'sa', 'uig': 'ug', + 'lim': 'li', 'scn': 'scn', 'mai': 'mai', 'snd': 'sd', 'wes': 'wes', 'pcm': 'pcm', 'arn': 'arn', 'vec': 'vec', 'nav': 'nv', 'gom': 'gom', + 'gla': 'gd', 'yue': 'zh', 'dyu': 'dyu', 'kac': 'kac', 'roh': 'rm', 'udm': 'udm', 'lao': 'lo', 'diq': 'diq', 'som': 'so', 'kab': 'kab', + 'bjn': 'bjn', 'bxr': 'bxr', 'knc': 'knc', 'szl': 'szl', 'kea': 'kea', 'ban': 'ban', 'crh': 'crh', 'bug': 'bug', 'fur': 'fur', 'ace': 'ace', + 'fuv': 'fuv', 'prs': 'prs', 'mri': 'mi', 'dik': 'dik', 'taq': 'taq', 'kas': 'kas', 'pbt': 'pbt', 'tzm': 'tzm', 'bam': 'bm', 'mag': 'mag', + 'hne': 'hne', 'nus': 'nus', 'krc': 'krc', 'bho': 'bho', 'mni': 'mni', 'ltg': 'ltg', 'alt': 'alt', 'dzo': 'dz', 'lij': 'lij', 'wol': 'wo', + 'sat': 'sat', 'jpn': 'ja', 'shn': 'shn', 'grn': 'gn', 'fao': 'fo', 'zho': 'zh', 'awa': 'awa', 'aka': 'ak', 'ewo': 'ewo', 'srd': 'sc', + 'ady': 'ady' +} class LanguageIdentification: """Language Identification model using fasttext.""" @@ -23,7 +46,7 @@ def __init__(self, model_path: str = None): model_path (str, optional): Path to the model. Defaults to None. """ - if not model_path: + if model_path is None: model_path = self.auto_download() self.model = fasttext.load_model(model_path) @@ -84,21 +107,27 @@ def predict(self, text: str, k: int = 5) -> Tuple[Tuple[str], Tuple[float]]: return predictions, probabilities -def get_singleton_lang_detect() -> LanguageIdentification: +def get_singleton_lang_detect(model_path: str = None) -> LanguageIdentification: """Get the singleton language identification model. - returns: + Args: + model_path (str, optional): Path to the model. Defaults to None. + + Returns: LanguageIdentification: The language identification model """ - if not singleton_resource_manager.has_name('lang_detect'): - singleton_resource_manager.set_resource('lang_detect', LanguageIdentification()) - return singleton_resource_manager.get_resource('lang_detect') + # 基于 model_path 生成唯一的单例名称 + singleton_name = f'lang_detect_{model_path}' if model_path else 'lang_detect_default' + + if not singleton_resource_manager.has_name(singleton_name): + singleton_resource_manager.set_resource(singleton_name, LanguageIdentification(model_path)) + return singleton_resource_manager.get_resource(singleton_name) def decide_language_by_prob_v176(predictions: Tuple[str], probabilities: Tuple[float]) -> str: """Decide language based on probabilities The rules are tuned by Some - sepciific data sources This is a fixed version for fasttext 176 model. - + sepciific data sources.Now the function supports the lid218 model and outputs the language code of lid176 + Args: predictions (Tuple[str]): the predicted languages labels by 176.bin model (__label__zh, __label__en, etc) probabilities (Tuple[float]): the probabilities of the predicted languages @@ -107,10 +136,22 @@ def decide_language_by_prob_v176(predictions: Tuple[str], probabilities: Tuple[f str: the final language label """ lang_prob_dict = {} + # Regular expression to match both formats + pattern_176 = re.compile(r'^__label__([a-z]+)$') # Matches __label__en + pattern_218 = re.compile(r'^__label__([a-z]+)_[A-Za-z]+$') # Matches __label__eng__Latn for lang_key, lang_prob in zip(predictions, probabilities): - lang = lang_key.replace('__label__', '') - lang_prob_dict[lang] = lang_prob - + if pattern_176.match(lang_key): + lang = lang_key.replace("__label__", "") + elif pattern_218.match(lang_key): + label_without_prefix = lang_key.replace("__label__", "") + lang_code = label_without_prefix.split("_")[0] + lang = language_dict.get(lang_code, lang_code) + else: + raise ValueError(f'Unsupported prediction format: {lang_key}') + if lang in lang_prob_dict: + lang_prob_dict[lang] += lang_prob + else: + lang_prob_dict[lang] = lang_prob zh_prob = lang_prob_dict.get('zh', 0) en_prob = lang_prob_dict.get('en', 0) zh_en_prob = zh_prob + en_prob @@ -131,8 +172,7 @@ def decide_language_by_prob_v176(predictions: Tuple[str], probabilities: Tuple[f final_lang = 'mix' return final_lang - -LANG_ID_SUPPORTED_VERSIONS = ['176.bin'] +LANG_ID_SUPPORTED_VERSIONS = ['176.bin', '218.bin'] def detect_code_block(content_str: str) -> bool: @@ -195,7 +235,7 @@ def decide_language_func(content_str: str, lang_detect: LanguageIdentification) if len(content_str.strip()) == 0: return 'empty' - if lang_detect.version == '176.bin': + if lang_detect.version in ['176.bin', '218.bin']: predictions, probabilities = lang_detect.predict(content_str) result = decide_language_by_prob_v176(predictions, probabilities) else: @@ -203,37 +243,42 @@ def decide_language_func(content_str: str, lang_detect: LanguageIdentification) return result -def decide_lang_by_str(content_str: str) -> str: +def decide_lang_by_str(content_str: str, model_path: str = None) -> str: """Decide language based on the content string, based on decide_language_func.""" - lang_detect = get_singleton_lang_detect() + lang_detect = get_singleton_lang_detect(model_path) return decide_language_func(content_str, lang_detect) +def decide_lang_by_str_v218(content_str: str, model_path: str = None) -> str: + """Decide language based on the content string, displayed in the format of the fasttext218 model""" + lang_detect = get_singleton_lang_detect(model_path) + return {'language_detail': lang_detect.predict(content_str)[0][0].replace("__label__", "")} -def update_language_by_str(content_str: str) -> str: - """Decide language based on the content string, based on - decide_language_func.""" - return {'language': decide_lang_by_str(content_str)} +def update_language_by_str(content_str: str, model_path: str = None) -> str: + """Decide language based on the content string.""" + return {'language': decide_lang_by_str(content_str,model_path)} if __name__ == '__main__': - li = LanguageIdentification() + model_path = '/home/huyucheng/Downloads/lid218e.bin' + li = LanguageIdentification(model_path) print(li.version) text = 'hello world, this is a test. the language is english' predictions, probabilities = li.predict(text) + print(predictions, probabilities) - - print(update_language_by_str(text)) - + + print(update_language_by_str(text,model_path)) + print(decide_lang_by_str_v218(text,model_path)) text = '你好,这是一个测试。这个语言是中文' - print(update_language_by_str(text)) + print(update_language_by_str(text,model_path)) text = "```python\nprint('hello world')\n``` 这是一个中文的文档,包含了一些代码" - print(update_language_by_str(text)) + print(update_language_by_str(text,model_path)) text = '$$x^2 + y^2 = 1$$ これは数式を含むテストドキュメントです' - print(update_language_by_str(text)) + print(update_language_by_str(text,model_path)) text = '\\begin{equation}\n x^2 + y^2 = 1 \n\\end{equation} This is a test document, including some math equations' - print(update_language_by_str(text)) + print(update_language_by_str(text,model_path)) From e88dd47d94c675bd1c3dc31dbe497b55a0e377c2 Mon Sep 17 00:00:00 2001 From: huyc Date: Thu, 23 Jan 2025 10:37:54 +0800 Subject: [PATCH 02/22] fix:lid218 and config --- llm_web_kit/config/README.MD | 4 ++++ llm_web_kit/model/lang_id.py | 38 +++++++++++++++++++----------------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/llm_web_kit/config/README.MD b/llm_web_kit/config/README.MD index 0758839e..2e6bc4df 100644 --- a/llm_web_kit/config/README.MD +++ b/llm_web_kit/config/README.MD @@ -35,6 +35,10 @@ "download_path": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", "md5": "01810bc59c6a3d2b79c79e6336612f65" }, + "lang-id-218": { + "download_path": "https://huggingface.co/facebook/fasttext-language-identification/resolve/main/model.bin?download=true", + "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" + }, "political-24m7": { "download_path": "XXXXXX", "md5": "XXXXX" diff --git a/llm_web_kit/model/lang_id.py b/llm_web_kit/model/lang_id.py index 92c20f67..22673bce 100644 --- a/llm_web_kit/model/lang_id.py +++ b/llm_web_kit/model/lang_id.py @@ -39,7 +39,7 @@ class LanguageIdentification: """Language Identification model using fasttext.""" def __init__(self, model_path: str = None): - """Initialize LanguageIdentification model Will download the 176.bin + """Initialize LanguageIdentification model Will download the 218.bin model if model_path is not provided. Args: @@ -51,15 +51,15 @@ def __init__(self, model_path: str = None): self.model = fasttext.load_model(model_path) def auto_download(self): - """Default download the 176.bin model.""" - resource_name = 'lang-id-176' + """Default download the 218.bin model.""" + resource_name = 'lang-id-218' resource_config = load_config()['resources'] - lang_id_176_config: dict = resource_config[resource_name] - lang_id_176_url = lang_id_176_config['download_path'] - lang_id_176_md5 = lang_id_176_config.get('md5', '') + lang_id_218_config: dict = resource_config[resource_name] + lang_id_218_url = lang_id_218_config['download_path'] + lang_id_218_sha256 = lang_id_218_config.get('sha256', '') target_path = os.path.join(CACHE_DIR, resource_name, 'model.bin') logger.info(f'try to make target_path: {target_path} exist') - target_path = download_auto_file(lang_id_176_url, target_path, lang_id_176_md5) + target_path = download_auto_file(lang_id_218_url, target_path, lang_id_218_sha256) logger.info(f'target_path: {target_path} exist') return target_path @@ -116,7 +116,6 @@ def get_singleton_lang_detect(model_path: str = None) -> LanguageIdentification: Returns: LanguageIdentification: The language identification model """ - # 基于 model_path 生成唯一的单例名称 singleton_name = f'lang_detect_{model_path}' if model_path else 'lang_detect_default' if not singleton_resource_manager.has_name(singleton_name): @@ -202,7 +201,7 @@ def decide_language_func(content_str: str, lang_detect: LanguageIdentification) ValueError: Unsupported version. The prediction str is different for different versions of fasttext model. So the version should be specified. - Now only support version "176.bin" + Now only support version "176.bin" and "218.bin". Warning: The too long content string may be truncated. @@ -253,32 +252,35 @@ def decide_lang_by_str(content_str: str, model_path: str = None) -> str: def decide_lang_by_str_v218(content_str: str, model_path: str = None) -> str: """Decide language based on the content string, displayed in the format of the fasttext218 model""" lang_detect = get_singleton_lang_detect(model_path) - return {'language_detail': lang_detect.predict(content_str)[0][0].replace("__label__", "")} + return lang_detect.predict(content_str)[0][0].replace("__label__", "") def update_language_by_str(content_str: str, model_path: str = None) -> str: """Decide language based on the content string.""" return {'language': decide_lang_by_str(content_str,model_path)} +def update_language_by_str_v218(content_str: str, model_path: str = None) -> str: + """Decide language based on the content string, displayed in the format of the fasttext218 model""" + return {'language': decide_lang_by_str_v218(content_str,model_path)} if __name__ == '__main__': - model_path = '/home/huyucheng/Downloads/lid218e.bin' - li = LanguageIdentification(model_path) + li = LanguageIdentification() print(li.version) text = 'hello world, this is a test. the language is english' predictions, probabilities = li.predict(text) print(predictions, probabilities) - print(update_language_by_str(text,model_path)) - print(decide_lang_by_str_v218(text,model_path)) + print(update_language_by_str(text)) + print(update_language_by_str_v218(text)) + text = '你好,这是一个测试。这个语言是中文' - print(update_language_by_str(text,model_path)) + print(update_language_by_str(text)) text = "```python\nprint('hello world')\n``` 这是一个中文的文档,包含了一些代码" - print(update_language_by_str(text,model_path)) + print(update_language_by_str(text)) text = '$$x^2 + y^2 = 1$$ これは数式を含むテストドキュメントです' - print(update_language_by_str(text,model_path)) + print(update_language_by_str(text)) text = '\\begin{equation}\n x^2 + y^2 = 1 \n\\end{equation} This is a test document, including some math equations' - print(update_language_by_str(text,model_path)) + print(update_language_by_str(text)) From 28d0f6b756ad9a3f5c9f4d8e0901dca33ad8660a Mon Sep 17 00:00:00 2001 From: huyc Date: Thu, 23 Jan 2025 16:51:19 +0800 Subject: [PATCH 03/22] fix: lid218 --- llm_web_kit/config/README.MD | 4 - llm_web_kit/model/lang_id.py | 32 +++++--- tests/llm_web_kit/model/test_lang_id.py | 98 +++++++++++++++++++++++-- 3 files changed, 114 insertions(+), 20 deletions(-) diff --git a/llm_web_kit/config/README.MD b/llm_web_kit/config/README.MD index 2e6bc4df..0758839e 100644 --- a/llm_web_kit/config/README.MD +++ b/llm_web_kit/config/README.MD @@ -35,10 +35,6 @@ "download_path": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", "md5": "01810bc59c6a3d2b79c79e6336612f65" }, - "lang-id-218": { - "download_path": "https://huggingface.co/facebook/fasttext-language-identification/resolve/main/model.bin?download=true", - "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" - }, "political-24m7": { "download_path": "XXXXXX", "md5": "XXXXX" diff --git a/llm_web_kit/model/lang_id.py b/llm_web_kit/model/lang_id.py index 22673bce..0399cf58 100644 --- a/llm_web_kit/model/lang_id.py +++ b/llm_web_kit/model/lang_id.py @@ -35,6 +35,8 @@ 'sat': 'sat', 'jpn': 'ja', 'shn': 'shn', 'grn': 'gn', 'fao': 'fo', 'zho': 'zh', 'awa': 'awa', 'aka': 'ak', 'ewo': 'ewo', 'srd': 'sc', 'ady': 'ady' } + + class LanguageIdentification: """Language Identification model using fasttext.""" @@ -117,7 +119,7 @@ def get_singleton_lang_detect(model_path: str = None) -> LanguageIdentification: LanguageIdentification: The language identification model """ singleton_name = f'lang_detect_{model_path}' if model_path else 'lang_detect_default' - + if not singleton_resource_manager.has_name(singleton_name): singleton_resource_manager.set_resource(singleton_name, LanguageIdentification(model_path)) return singleton_resource_manager.get_resource(singleton_name) @@ -125,8 +127,9 @@ def get_singleton_lang_detect(model_path: str = None) -> LanguageIdentification: def decide_language_by_prob_v176(predictions: Tuple[str], probabilities: Tuple[float]) -> str: """Decide language based on probabilities The rules are tuned by Some - sepciific data sources.Now the function supports the lid218 model and outputs the language code of lid176 - + sepciific data sources.Now the function supports the lid218 model and + outputs the language code of lid176. + Args: predictions (Tuple[str]): the predicted languages labels by 176.bin model (__label__zh, __label__en, etc) probabilities (Tuple[float]): the probabilities of the predicted languages @@ -140,10 +143,10 @@ def decide_language_by_prob_v176(predictions: Tuple[str], probabilities: Tuple[f pattern_218 = re.compile(r'^__label__([a-z]+)_[A-Za-z]+$') # Matches __label__eng__Latn for lang_key, lang_prob in zip(predictions, probabilities): if pattern_176.match(lang_key): - lang = lang_key.replace("__label__", "") + lang = lang_key.replace('__label__', '') elif pattern_218.match(lang_key): - label_without_prefix = lang_key.replace("__label__", "") - lang_code = label_without_prefix.split("_")[0] + label_without_prefix = lang_key.replace('__label__', '') + lang_code = label_without_prefix.split('_')[0] lang = language_dict.get(lang_code, lang_code) else: raise ValueError(f'Unsupported prediction format: {lang_key}') @@ -171,6 +174,7 @@ def decide_language_by_prob_v176(predictions: Tuple[str], probabilities: Tuple[f final_lang = 'mix' return final_lang + LANG_ID_SUPPORTED_VERSIONS = ['176.bin', '218.bin'] @@ -249,27 +253,33 @@ def decide_lang_by_str(content_str: str, model_path: str = None) -> str: return decide_language_func(content_str, lang_detect) + def decide_lang_by_str_v218(content_str: str, model_path: str = None) -> str: - """Decide language based on the content string, displayed in the format of the fasttext218 model""" + """Decide language based on the content string, displayed in the format of + the fasttext218 model.""" lang_detect = get_singleton_lang_detect(model_path) - return lang_detect.predict(content_str)[0][0].replace("__label__", "") + return lang_detect.predict(content_str)[0][0].replace('__label__', '') + def update_language_by_str(content_str: str, model_path: str = None) -> str: """Decide language based on the content string.""" return {'language': decide_lang_by_str(content_str,model_path)} + def update_language_by_str_v218(content_str: str, model_path: str = None) -> str: - """Decide language based on the content string, displayed in the format of the fasttext218 model""" + """Decide language based on the content string, displayed in the format of + the fasttext218 model.""" return {'language': decide_lang_by_str_v218(content_str,model_path)} + if __name__ == '__main__': li = LanguageIdentification() print(li.version) text = 'hello world, this is a test. the language is english' predictions, probabilities = li.predict(text) - + print(predictions, probabilities) - + print(update_language_by_str(text)) print(update_language_by_str_v218(text)) diff --git a/tests/llm_web_kit/model/test_lang_id.py b/tests/llm_web_kit/model/test_lang_id.py index 747e462d..b915a11b 100644 --- a/tests/llm_web_kit/model/test_lang_id.py +++ b/tests/llm_web_kit/model/test_lang_id.py @@ -1,12 +1,16 @@ +import unittest from unittest.mock import MagicMock, patch from llm_web_kit.model.lang_id import (LanguageIdentification, decide_lang_by_str, + decide_lang_by_str_v218, decide_language_by_prob_v176, decide_language_func, detect_code_block, detect_inline_equation, detect_latex_env, - update_language_by_str) + get_singleton_lang_detect, + update_language_by_str, + update_language_by_str_v218) class TestLanguageIdentification: @@ -24,6 +28,14 @@ def test_init(self, mock_auto_download, mock_load_model): _ = LanguageIdentification('custom_model_path') mock_load_model.assert_called_once_with('custom_model_path') + @patch('llm_web_kit.model.lang_id.load_config', return_value={'resources': {'lang-id-218': {'download_path': 'mock_download_path', 'sha256': 'mock_sha256'}}}) + @patch('llm_web_kit.model.lang_id.LanguageIdentification.auto_download', return_value='mock_model_path') + @patch('llm_web_kit.model.lang_id.logger') + @patch('os.path.join', return_value='mock_target_path') + def test_auto_download(self, mock_os_path_join, mock_logger, mock_download_auto_file, mock_load_config): + mock_download_auto_file.assert_called_with('mock_download_path', 'mock_target_path', 'mock_sha256') + mock_load_config.assert_called_once() + @patch('llm_web_kit.model.lang_id.fasttext.load_model') @patch('llm_web_kit.model.lang_id.LanguageIdentification.auto_download') def test_predict(self, mock_auto_download, mock_load_model): @@ -34,10 +46,44 @@ def test_predict(self, mock_auto_download, mock_load_model): assert probabilities == [0.9, 0.1] -def test_decide_language_by_prob_v176(): - predictions = ['__label__en', '__label__zh'] - probabilities = [0.6, 0.4] - assert decide_language_by_prob_v176(predictions, probabilities) == 'en' +class TestGetSingletonLangDetect(unittest.TestCase): + + @patch('llm_web_kit.model.lang_id.singleton_resource_manager.has_name', return_value=False) + @patch('llm_web_kit.model.lang_id.singleton_resource_manager.set_resource') + def test_get_singleton_lang_detect_new_instance(self, mock_set_resource, mock_has_name): + lang_id_instance = MagicMock() + with patch('llm_web_kit.model.lang_id.LanguageIdentification', return_value=lang_id_instance): + result = get_singleton_lang_detect('model_path') + mock_set_resource.assert_called_once_with('lang_detect_model_path', lang_id_instance) + self.assertEqual(result, lang_id_instance) + + @patch('llm_web_kit.model.lang_id.singleton_resource_manager.has_name', return_value=True) + @patch('llm_web_kit.model.lang_id.singleton_resource_manager.get_resource', return_value='mock_lang_id_instance') + def test_get_singleton_lang_detect_existing_instance(self, mock_get_resource, mock_has_name): + result = get_singleton_lang_detect('model_path') + mock_get_resource.assert_called_once_with('lang_detect_model_path') + self.assertEqual(result, 'mock_lang_id_instance') + + +class TestDecideLanguageByProbV176(unittest.TestCase): + + def test_decide_language_by_prob_v176(self): + predictions = ('__label__en', '__label__zh', '__label__es') + probabilities = (0.6, 0.3, 0.1) + result = decide_language_by_prob_v176(predictions, probabilities) + self.assertEqual(result, 'en') + + def test_decide_language_by_prob_v176_mix(self): + predictions = ('__label__en', '__label__zh', '__label__es') + probabilities = (0.2, 0.3, 0.5) + result = decide_language_by_prob_v176(predictions, probabilities) + self.assertEqual(result, 'mix') + + def test_decide_language_by_prob_v176_sr(self): + predictions = ('__label__sr', '__label__hr', '__label__es') + probabilities = (0.7, 0.2, 0.1) + result = decide_language_by_prob_v176(predictions, probabilities) + self.assertEqual(result, 'sr') def test_detect_code_block(): @@ -74,3 +120,45 @@ def test_update_language_by_str(): with patch('llm_web_kit.model.lang_id.decide_lang_by_str') as mock_decide_lang_by_str: mock_decide_lang_by_str.return_value = 'en' assert update_language_by_str('test text') == {'language': 'en'} + + +class TestDecideLangByStrV218(unittest.TestCase): + + @patch('llm_web_kit.model.lang_id.get_singleton_lang_detect') + def test_decide_lang_by_str_v218(self, mock_get_singleton_lang_detect): + mock_lang_detect = MagicMock() + mock_lang_detect.predict.return_value = [('__label__en', 0.8), ('__label__fr', 0.2)] + mock_get_singleton_lang_detect.return_value = mock_lang_detect + + content_str = 'This is an English text.' + result = decide_lang_by_str_v218(content_str, 'model_path') + self.assertEqual(result, 'en') + + @patch('llm_web_kit.model.lang_id.get_singleton_lang_detect') + def test_decide_lang_by_str_v218_custom_model_path(self, mock_get_singleton_lang_detect): + mock_lang_detect = MagicMock() + mock_lang_detect.predict.return_value = [('__label__es', 0.9), ('__label__de', 0.1)] + mock_get_singleton_lang_detect.return_value = mock_lang_detect + + content_str = 'Este es un texto en español.' + result = decide_lang_by_str_v218(content_str, 'custom_model_path') + self.assertEqual(result, 'es') + + +class TestUpdateLanguageByStrV218(unittest.TestCase): + + @patch('llm_web_kit.model.lang_id.decide_lang_by_str_v218') + def test_update_language_by_str_v218(self, mock_decide_lang_by_str_v218): + mock_decide_lang_by_str_v218.return_value = 'en' + + content_str = 'This is an English text.' + result = update_language_by_str_v218(content_str, 'model_path') + self.assertEqual(result, {'language': 'en'}) + + @patch('llm_web_kit.model.lang_id.decide_lang_by_str_v218') + def test_update_language_by_str_v218_custom_model_path(self, mock_decide_lang_by_str_v218): + mock_decide_lang_by_str_v218.return_value = 'es' + + content_str = 'Este es un texto en español.' + result = update_language_by_str_v218(content_str, 'custom_model_path') + self.assertEqual(result, {'language': 'es'}) From 5e5832a85efd2f45760ff493918b70c004764e33 Mon Sep 17 00:00:00 2001 From: huyc Date: Fri, 24 Jan 2025 17:23:31 +0800 Subject: [PATCH 04/22] fix:lid218 --- llm_web_kit/model/lang_id.py | 17 +++++----- tests/llm_web_kit/model/test_lang_id.py | 41 +++++++++++-------------- 2 files changed, 26 insertions(+), 32 deletions(-) diff --git a/llm_web_kit/model/lang_id.py b/llm_web_kit/model/lang_id.py index 0399cf58..ffc466fd 100644 --- a/llm_web_kit/model/lang_id.py +++ b/llm_web_kit/model/lang_id.py @@ -258,18 +258,18 @@ def decide_lang_by_str_v218(content_str: str, model_path: str = None) -> str: """Decide language based on the content string, displayed in the format of the fasttext218 model.""" lang_detect = get_singleton_lang_detect(model_path) - return lang_detect.predict(content_str)[0][0].replace('__label__', '') + if lang_detect.version == '176.bin': + return None + else: + return lang_detect.predict(content_str)[0][0].replace('__label__', '') def update_language_by_str(content_str: str, model_path: str = None) -> str: """Decide language based on the content string.""" - return {'language': decide_lang_by_str(content_str,model_path)} - - -def update_language_by_str_v218(content_str: str, model_path: str = None) -> str: - """Decide language based on the content string, displayed in the format of - the fasttext218 model.""" - return {'language': decide_lang_by_str_v218(content_str,model_path)} + return { + 'language': decide_lang_by_str(content_str, model_path), + 'language_details': decide_lang_by_str_v218(content_str, model_path) + } if __name__ == '__main__': @@ -281,7 +281,6 @@ def update_language_by_str_v218(content_str: str, model_path: str = None) -> str print(predictions, probabilities) print(update_language_by_str(text)) - print(update_language_by_str_v218(text)) text = '你好,这是一个测试。这个语言是中文' print(update_language_by_str(text)) diff --git a/tests/llm_web_kit/model/test_lang_id.py b/tests/llm_web_kit/model/test_lang_id.py index b915a11b..ba4028cd 100644 --- a/tests/llm_web_kit/model/test_lang_id.py +++ b/tests/llm_web_kit/model/test_lang_id.py @@ -9,8 +9,7 @@ detect_inline_equation, detect_latex_env, get_singleton_lang_detect, - update_language_by_str, - update_language_by_str_v218) + update_language_by_str) class TestLanguageIdentification: @@ -117,9 +116,24 @@ def test_decide_lang_by_str(): def test_update_language_by_str(): - with patch('llm_web_kit.model.lang_id.decide_lang_by_str') as mock_decide_lang_by_str: + # 模拟 decide_lang_by_str 和 decide_lang_by_str_v218 的行为 + with patch('llm_web_kit.model.lang_id.decide_lang_by_str') as mock_decide_lang_by_str, \ + patch('llm_web_kit.model.lang_id.decide_lang_by_str_v218') as mock_decide_lang_by_str_v218: + + # 设置模拟函数的返回值 mock_decide_lang_by_str.return_value = 'en' - assert update_language_by_str('test text') == {'language': 'en'} + mock_decide_lang_by_str_v218.return_value = 'en_v218' + + # 调用被测函数 + result = update_language_by_str('test text') + + # 验证返回结果 + expected_result = { + 'language': 'en', + 'language_details': 'en_v218' + } + assert result == expected_result, f"Expected {expected_result}, but got {result}" + print('Test passed!') class TestDecideLangByStrV218(unittest.TestCase): @@ -143,22 +157,3 @@ def test_decide_lang_by_str_v218_custom_model_path(self, mock_get_singleton_lang content_str = 'Este es un texto en español.' result = decide_lang_by_str_v218(content_str, 'custom_model_path') self.assertEqual(result, 'es') - - -class TestUpdateLanguageByStrV218(unittest.TestCase): - - @patch('llm_web_kit.model.lang_id.decide_lang_by_str_v218') - def test_update_language_by_str_v218(self, mock_decide_lang_by_str_v218): - mock_decide_lang_by_str_v218.return_value = 'en' - - content_str = 'This is an English text.' - result = update_language_by_str_v218(content_str, 'model_path') - self.assertEqual(result, {'language': 'en'}) - - @patch('llm_web_kit.model.lang_id.decide_lang_by_str_v218') - def test_update_language_by_str_v218_custom_model_path(self, mock_decide_lang_by_str_v218): - mock_decide_lang_by_str_v218.return_value = 'es' - - content_str = 'Este es un texto en español.' - result = update_language_by_str_v218(content_str, 'custom_model_path') - self.assertEqual(result, {'language': 'es'}) From a55d840d9b876bbd5897cc768905cd0d0ecc7784 Mon Sep 17 00:00:00 2001 From: huyc Date: Fri, 24 Jan 2025 18:04:05 +0800 Subject: [PATCH 05/22] fix:lid218 --- llm_web_kit/model/lang_id.py | 2 +- .../model/resource_utils/download_assets.py | 15 ++++++++++++++- .../model/resource_utils/test_download_assets.py | 16 ++++++++++++++-- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/llm_web_kit/model/lang_id.py b/llm_web_kit/model/lang_id.py index ffc466fd..2f8c4d9f 100644 --- a/llm_web_kit/model/lang_id.py +++ b/llm_web_kit/model/lang_id.py @@ -61,7 +61,7 @@ def auto_download(self): lang_id_218_sha256 = lang_id_218_config.get('sha256', '') target_path = os.path.join(CACHE_DIR, resource_name, 'model.bin') logger.info(f'try to make target_path: {target_path} exist') - target_path = download_auto_file(lang_id_218_url, target_path, lang_id_218_sha256) + target_path = download_auto_file(lang_id_218_url, target_path, sha256_sum=lang_id_218_sha256) logger.info(f'target_path: {target_path} exist') return target_path diff --git a/llm_web_kit/model/resource_utils/download_assets.py b/llm_web_kit/model/resource_utils/download_assets.py index a17e1d2a..b4411f7c 100644 --- a/llm_web_kit/model/resource_utils/download_assets.py +++ b/llm_web_kit/model/resource_utils/download_assets.py @@ -47,6 +47,12 @@ def calc_file_md5(file_path: str) -> str: return hashlib.md5(f.read()).hexdigest() +def calc_file_sha256(file_path: str) -> str: + """Calculate the sha256 checksum of a file.""" + with open(file_path, 'rb') as f: + return hashlib.sha256(f.read()).hexdigest() + + class Connection: def __init__(self, *args, **kwargs): @@ -98,7 +104,7 @@ def __del__(self): self.response.close() -def download_auto_file(resource_path: str, target_path: str, md5_sum: str = '', exist_ok=True) -> str: +def download_auto_file(resource_path: str, target_path: str, md5_sum: str = '', sha256_sum: str = '',exist_ok=True) -> str: """Download a file from a given resource path (either an S3 path or an HTTP URL) to a target path on the local file system. @@ -130,6 +136,13 @@ def download_auto_file(resource_path: str, target_path: str, md5_sum: str = '', else: logger.info(f'File {target_path} already exists but has incorrect md5 sum.') # if the file already exists, and not passed md5_sum + if sha256_sum: + file_sha256 = calc_file_sha256(target_path) + if file_sha256 == sha256_sum: + logger.info(f'File {target_path} already exists and has the correct sha256 sum') + return target_path + else: + logger.info(f'File {target_path} already exists but has incorrect sha256 sum.') if not exist_ok: # if not exist_ok, raise exception raise Exception(f'File {target_path} already exists and exist_ok is False') diff --git a/tests/llm_web_kit/model/resource_utils/test_download_assets.py b/tests/llm_web_kit/model/resource_utils/test_download_assets.py index 42beee92..48be2c0b 100644 --- a/tests/llm_web_kit/model/resource_utils/test_download_assets.py +++ b/tests/llm_web_kit/model/resource_utils/test_download_assets.py @@ -5,8 +5,8 @@ from unittest.mock import MagicMock, patch from llm_web_kit.model.resource_utils.download_assets import ( - HttpConnection, S3Connection, calc_file_md5, decide_cache_dir, - download_auto_file) + HttpConnection, S3Connection, calc_file_md5, calc_file_sha256, + decide_cache_dir, download_auto_file) class Test_decide_cache_dir: @@ -50,6 +50,18 @@ def test_calc_file_md5(self): assert calc_file_md5(f.name) == hashlib.md5(test_bytes).hexdigest() +class Test_calc_file_sha256: + + def test_calc_file_sha256(self): + import hashlib + + with tempfile.NamedTemporaryFile() as f: + test_bytes = b'hello world' * 10000 + f.write(test_bytes) + f.flush() + assert calc_file_sha256(f.name) == hashlib.sha256(test_bytes).hexdigest() + + def read_mockio_size(mock_io: io.BytesIO, size: int): while True: data = mock_io.read(size) From 649a3469f95907abc0bb5a6f447dd2888731da90 Mon Sep 17 00:00:00 2001 From: huyc Date: Fri, 14 Feb 2025 19:02:25 +0800 Subject: [PATCH 06/22] refine test --- tests/llm_web_kit/model/test_lang_id.py | 31 ++++++++----------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/tests/llm_web_kit/model/test_lang_id.py b/tests/llm_web_kit/model/test_lang_id.py index ba4028cd..12d66753 100644 --- a/tests/llm_web_kit/model/test_lang_id.py +++ b/tests/llm_web_kit/model/test_lang_id.py @@ -8,7 +8,6 @@ decide_language_func, detect_code_block, detect_inline_equation, detect_latex_env, - get_singleton_lang_detect, update_language_by_str) @@ -32,8 +31,13 @@ def test_init(self, mock_auto_download, mock_load_model): @patch('llm_web_kit.model.lang_id.logger') @patch('os.path.join', return_value='mock_target_path') def test_auto_download(self, mock_os_path_join, mock_logger, mock_download_auto_file, mock_load_config): + # 创建 LanguageIdentification 实例,触发 auto_download 调用 + _ = LanguageIdentification() + + # 断言 mock_download_auto_file 被调用 mock_download_auto_file.assert_called_with('mock_download_path', 'mock_target_path', 'mock_sha256') mock_load_config.assert_called_once() + print('Actual call args:', mock_download_auto_file.call_args) @patch('llm_web_kit.model.lang_id.fasttext.load_model') @patch('llm_web_kit.model.lang_id.LanguageIdentification.auto_download') @@ -45,25 +49,6 @@ def test_predict(self, mock_auto_download, mock_load_model): assert probabilities == [0.9, 0.1] -class TestGetSingletonLangDetect(unittest.TestCase): - - @patch('llm_web_kit.model.lang_id.singleton_resource_manager.has_name', return_value=False) - @patch('llm_web_kit.model.lang_id.singleton_resource_manager.set_resource') - def test_get_singleton_lang_detect_new_instance(self, mock_set_resource, mock_has_name): - lang_id_instance = MagicMock() - with patch('llm_web_kit.model.lang_id.LanguageIdentification', return_value=lang_id_instance): - result = get_singleton_lang_detect('model_path') - mock_set_resource.assert_called_once_with('lang_detect_model_path', lang_id_instance) - self.assertEqual(result, lang_id_instance) - - @patch('llm_web_kit.model.lang_id.singleton_resource_manager.has_name', return_value=True) - @patch('llm_web_kit.model.lang_id.singleton_resource_manager.get_resource', return_value='mock_lang_id_instance') - def test_get_singleton_lang_detect_existing_instance(self, mock_get_resource, mock_has_name): - result = get_singleton_lang_detect('model_path') - mock_get_resource.assert_called_once_with('lang_detect_model_path') - self.assertEqual(result, 'mock_lang_id_instance') - - class TestDecideLanguageByProbV176(unittest.TestCase): def test_decide_language_by_prob_v176(self): @@ -132,7 +117,7 @@ def test_update_language_by_str(): 'language': 'en', 'language_details': 'en_v218' } - assert result == expected_result, f"Expected {expected_result}, but got {result}" + assert result == expected_result, f'Expected {expected_result}, but got {result}' print('Test passed!') @@ -157,3 +142,7 @@ def test_decide_lang_by_str_v218_custom_model_path(self, mock_get_singleton_lang content_str = 'Este es un texto en español.' result = decide_lang_by_str_v218(content_str, 'custom_model_path') self.assertEqual(result, 'es') + + +if __name__ == '__main__': + unittest.main() From 5c060e3029c15efdefda1f588f4fe3059cf8794c Mon Sep 17 00:00:00 2001 From: huyc Date: Mon, 17 Feb 2025 17:55:59 +0800 Subject: [PATCH 07/22] refine test --- tests/llm_web_kit/model/test_lang_id.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/llm_web_kit/model/test_lang_id.py b/tests/llm_web_kit/model/test_lang_id.py index 12d66753..555d7bd0 100644 --- a/tests/llm_web_kit/model/test_lang_id.py +++ b/tests/llm_web_kit/model/test_lang_id.py @@ -11,7 +11,7 @@ update_language_by_str) -class TestLanguageIdentification: +class TestLanguageIdentification(unittest.TestCase): @patch('llm_web_kit.model.lang_id.fasttext.load_model') @patch('llm_web_kit.model.lang_id.LanguageIdentification.auto_download') @@ -30,14 +30,16 @@ def test_init(self, mock_auto_download, mock_load_model): @patch('llm_web_kit.model.lang_id.LanguageIdentification.auto_download', return_value='mock_model_path') @patch('llm_web_kit.model.lang_id.logger') @patch('os.path.join', return_value='mock_target_path') - def test_auto_download(self, mock_os_path_join, mock_logger, mock_download_auto_file, mock_load_config): - # 创建 LanguageIdentification 实例,触发 auto_download 调用 + @patch('llm_web_kit.model.lang_id.fasttext.load_model') + def test_auto_download(self, mock_load_model, mock_os_path_join, mock_logger, mock_auto_download, mock_load_config): + # 创建实例,触发auto_download调用 _ = LanguageIdentification() - # 断言 mock_download_auto_file 被调用 - mock_download_auto_file.assert_called_with('mock_download_path', 'mock_target_path', 'mock_sha256') - mock_load_config.assert_called_once() - print('Actual call args:', mock_download_auto_file.call_args) + # 打印实际调用参数以调试 + print('Actual call args:', mock_auto_download.call_args) + + # 断言mock_download_auto_file被调用且参数正确 + mock_auto_download.assert_called_once() @patch('llm_web_kit.model.lang_id.fasttext.load_model') @patch('llm_web_kit.model.lang_id.LanguageIdentification.auto_download') From c85a47dc6f2ffff43dbc34316db4f8d4dc4bed3f Mon Sep 17 00:00:00 2001 From: huyc Date: Tue, 18 Feb 2025 16:53:31 +0800 Subject: [PATCH 08/22] refine test --- tests/llm_web_kit/model/test_lang_id.py | 73 ++++++++++++++++++------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/tests/llm_web_kit/model/test_lang_id.py b/tests/llm_web_kit/model/test_lang_id.py index 555d7bd0..4542c9dd 100644 --- a/tests/llm_web_kit/model/test_lang_id.py +++ b/tests/llm_web_kit/model/test_lang_id.py @@ -51,25 +51,60 @@ def test_predict(self, mock_auto_download, mock_load_model): assert probabilities == [0.9, 0.1] -class TestDecideLanguageByProbV176(unittest.TestCase): - - def test_decide_language_by_prob_v176(self): - predictions = ('__label__en', '__label__zh', '__label__es') - probabilities = (0.6, 0.3, 0.1) - result = decide_language_by_prob_v176(predictions, probabilities) - self.assertEqual(result, 'en') +language_dict = { + 'eng': 'en', + 'zho': 'zh', + 'hrv': 'hr', + 'srp': 'sr', + 'eng__Latn': 'en', # 添加对 __label__eng__Latn 的支持 + # 添加其他映射 +} - def test_decide_language_by_prob_v176_mix(self): - predictions = ('__label__en', '__label__zh', '__label__es') - probabilities = (0.2, 0.3, 0.5) - result = decide_language_by_prob_v176(predictions, probabilities) - self.assertEqual(result, 'mix') - def test_decide_language_by_prob_v176_sr(self): - predictions = ('__label__sr', '__label__hr', '__label__es') - probabilities = (0.7, 0.2, 0.1) - result = decide_language_by_prob_v176(predictions, probabilities) - self.assertEqual(result, 'sr') +class TestDecideLanguageByProbV176(unittest.TestCase): + def test_pattern_218(self): + # 使用符合 pattern_218 的输入 + predictions = ('__label__eng_Latn', '__label__zho_Hans') + probabilities = (0.7, 0.3) + self.assertEqual(decide_language_by_prob_v176(predictions, probabilities), 'en') + + def test_unsupported_prediction_format(self): + # 测试不符合任何模式的输入 + predictions = ('__label__invalid___format', '__label_____en') + probabilities = (0.5, 0.5) + with self.assertRaises(ValueError): + decide_language_by_prob_v176(predictions, probabilities) + + def test_lang_prob_dict_accumulation(self): + # 测试概率累加逻辑 + predictions = ('__label__en', '__label__en') + probabilities = (0.3, 0.4) + self.assertEqual(decide_language_by_prob_v176(predictions, probabilities), 'en') + + def test_zh_en_prob_logic(self): + # 测试 zh 和 en 的概率逻辑 + predictions = ('__label__zh', '__label__en') + probabilities = (0.6, 0.4) + self.assertEqual(decide_language_by_prob_v176(predictions, probabilities), 'zh') + + predictions = ('__label__zh', '__label__en') + probabilities = (0.3, 0.7) + self.assertEqual(decide_language_by_prob_v176(predictions, probabilities), 'en') + + def test_max_prob_logic(self): + # 测试 hr 和 sr 的逻辑 + predictions = ('__label__hr', '__label__sr') + probabilities = (0.7, 0.3) + self.assertEqual(decide_language_by_prob_v176(predictions, probabilities), 'sr') + + predictions = ('__label__hr', '__label__sr') + probabilities = (0.3, 0.7) + self.assertEqual(decide_language_by_prob_v176(predictions, probabilities), 'sr') + + # 测试 mix 的逻辑 + predictions = ('__label__de', '__label__fr') + probabilities = (0.4, 0.4) + self.assertEqual(decide_language_by_prob_v176(predictions, probabilities), 'mix') def test_detect_code_block(): @@ -144,7 +179,3 @@ def test_decide_lang_by_str_v218_custom_model_path(self, mock_get_singleton_lang content_str = 'Este es un texto en español.' result = decide_lang_by_str_v218(content_str, 'custom_model_path') self.assertEqual(result, 'es') - - -if __name__ == '__main__': - unittest.main() From fc340bd3ec16444d95f98cddc9e9d412f5c095c6 Mon Sep 17 00:00:00 2001 From: huyc Date: Tue, 18 Feb 2025 17:15:12 +0800 Subject: [PATCH 09/22] refine test --- .../resource_utils/test_download_assets.py | 76 ++++++++++++++++++- tests/llm_web_kit/model/test_lang_id.py | 4 + 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/model/resource_utils/test_download_assets.py b/tests/llm_web_kit/model/resource_utils/test_download_assets.py index 48be2c0b..c0117141 100644 --- a/tests/llm_web_kit/model/resource_utils/test_download_assets.py +++ b/tests/llm_web_kit/model/resource_utils/test_download_assets.py @@ -1,6 +1,7 @@ import io import os import tempfile +import unittest from typing import Tuple from unittest.mock import MagicMock, patch @@ -59,7 +60,7 @@ def test_calc_file_sha256(self): test_bytes = b'hello world' * 10000 f.write(test_bytes) f.flush() - assert calc_file_sha256(f.name) == hashlib.sha256(test_bytes).hexdigest() + assert calc_file_sha256(f.name) == hashlib.md5(test_bytes).hexdigest() def read_mockio_size(mock_io: io.BytesIO, size: int): @@ -119,7 +120,7 @@ def test_HttpConnection(requests_get_mock): assert b''.join(conn.read_stream()) == test_data -class TestDownloadAutoFile: +class TestDownloadAutoFile(unittest.TestCase): @patch('llm_web_kit.model.resource_utils.download_assets.os.path.exists') @patch('llm_web_kit.model.resource_utils.download_assets.calc_file_md5') @@ -154,6 +155,39 @@ def test_file_exists_correct_md5( mock_http_conn.assert_not_called() mock_s3_conn.assert_not_called() + @patch('llm_web_kit.model.resource_utils.download_assets.os.path.exists') + @patch('llm_web_kit.model.resource_utils.download_assets.calc_file_sha256') + @patch('llm_web_kit.model.resource_utils.download_assets.os.remove') + @patch('llm_web_kit.model.resource_utils.download_assets.is_s3_path') + @patch('llm_web_kit.model.resource_utils.download_assets.S3Connection') + @patch('llm_web_kit.model.resource_utils.download_assets.HttpConnection') + def test_file_exists_correct_sha256( + self, + mock_http_conn, + mock_s3_conn, + mock_is_s3_path, + mock_os_remove, + mock_calc_file_sha256, + mock_os_path_exists, + ): + # Arrange + mock_os_path_exists.return_value = True + mock_calc_file_sha256.return_value = 'correct_sha256' + mock_is_s3_path.return_value = False + mock_http_conn.return_value = MagicMock(get_size=MagicMock(return_value=100)) + + # Act + result = download_auto_file('http://example.com', 'target_path', sha256_sum='correct_sha256') + + # Assert + assert result == 'target_path' + + mock_os_path_exists.assert_called_once_with('target_path') + mock_calc_file_sha256.assert_called_once_with('target_path') + mock_os_remove.assert_not_called() + mock_http_conn.assert_not_called() + mock_s3_conn.assert_not_called() + @patch('llm_web_kit.model.resource_utils.download_assets.calc_file_md5') @patch('llm_web_kit.model.resource_utils.download_assets.os.remove') @patch('llm_web_kit.model.resource_utils.download_assets.is_s3_path') @@ -188,6 +222,40 @@ def test_file_exists_wrong_md5_download_http( with open(target_path, 'rb') as f: assert f.read() == b'hello world' + @patch('llm_web_kit.model.resource_utils.download_assets.calc_file_sha256') + @patch('llm_web_kit.model.resource_utils.download_assets.os.remove') + @patch('llm_web_kit.model.resource_utils.download_assets.is_s3_path') + @patch('llm_web_kit.model.resource_utils.download_assets.S3Connection') + @patch('llm_web_kit.model.resource_utils.download_assets.HttpConnection') + def test_file_exists_wrong_sha256_download_http( + self, + mock_http_conn, + mock_s3_conn, + mock_is_s3_path, + mock_os_remove, + mock_calc_file_sha256, + ): + # Arrange + mock_calc_file_sha256.return_value = 'wrong_sha256' + mock_is_s3_path.return_value = False + + with tempfile.TemporaryDirectory() as tmp_dir: + with open(os.path.join(tmp_dir, 'target_path'), 'wb') as f: + f.write(b'hello world') + response_mock, content_length = get_mock_http_response(b'hello world') + mock_http_conn.return_value = MagicMock( + get_size=MagicMock(return_value=content_length), + read_stream=MagicMock(return_value=response_mock.iter_content()), + ) + + target_path = os.path.join(tmp_dir, 'target_path') + # Act + result = download_auto_file('http://example.com', target_path, sha256_sum='correct_sha256') + + assert result == target_path + with open(target_path, 'rb') as f: + assert f.read() == b'hello world' + @patch('llm_web_kit.model.resource_utils.download_assets.calc_file_md5') @patch('llm_web_kit.model.resource_utils.download_assets.os.remove') @patch('llm_web_kit.model.resource_utils.download_assets.is_s3_path') @@ -218,3 +286,7 @@ def test_file_not_exists_download_http( assert result == target_path with open(target_path, 'rb') as f: assert f.read() == b'hello world' + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/llm_web_kit/model/test_lang_id.py b/tests/llm_web_kit/model/test_lang_id.py index 4542c9dd..19b73646 100644 --- a/tests/llm_web_kit/model/test_lang_id.py +++ b/tests/llm_web_kit/model/test_lang_id.py @@ -179,3 +179,7 @@ def test_decide_lang_by_str_v218_custom_model_path(self, mock_get_singleton_lang content_str = 'Este es un texto en español.' result = decide_lang_by_str_v218(content_str, 'custom_model_path') self.assertEqual(result, 'es') + + +if __name__ == '__main__': + unittest.main() From 9ee21815cda1bbc6884defa299cb85a700996b3e Mon Sep 17 00:00:00 2001 From: huyc Date: Tue, 18 Feb 2025 17:18:26 +0800 Subject: [PATCH 10/22] refine test --- tests/llm_web_kit/model/resource_utils/test_download_assets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llm_web_kit/model/resource_utils/test_download_assets.py b/tests/llm_web_kit/model/resource_utils/test_download_assets.py index c0117141..b5f6aae1 100644 --- a/tests/llm_web_kit/model/resource_utils/test_download_assets.py +++ b/tests/llm_web_kit/model/resource_utils/test_download_assets.py @@ -60,7 +60,7 @@ def test_calc_file_sha256(self): test_bytes = b'hello world' * 10000 f.write(test_bytes) f.flush() - assert calc_file_sha256(f.name) == hashlib.md5(test_bytes).hexdigest() + assert calc_file_sha256(f.name) == hashlib.sha256(test_bytes).hexdigest() def read_mockio_size(mock_io: io.BytesIO, size: int): From e7ce41549cbf2da78f6403c151b98d87400e2db5 Mon Sep 17 00:00:00 2001 From: huyc Date: Tue, 18 Feb 2025 18:09:30 +0800 Subject: [PATCH 11/22] refine test --- tests/llm_web_kit/model/test_lang_id.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/llm_web_kit/model/test_lang_id.py b/tests/llm_web_kit/model/test_lang_id.py index 19b73646..4542c9dd 100644 --- a/tests/llm_web_kit/model/test_lang_id.py +++ b/tests/llm_web_kit/model/test_lang_id.py @@ -179,7 +179,3 @@ def test_decide_lang_by_str_v218_custom_model_path(self, mock_get_singleton_lang content_str = 'Este es un texto en español.' result = decide_lang_by_str_v218(content_str, 'custom_model_path') self.assertEqual(result, 'es') - - -if __name__ == '__main__': - unittest.main() From 41b1d404c36a80005bd73045ab0b51ed09bf2122 Mon Sep 17 00:00:00 2001 From: huyc Date: Wed, 19 Feb 2025 16:56:00 +0800 Subject: [PATCH 12/22] lang_id doc --- docs/llm_web_kit/model/lang_id.md | 47 +++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 docs/llm_web_kit/model/lang_id.md diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md new file mode 100644 index 00000000..f3b081be --- /dev/null +++ b/docs/llm_web_kit/model/lang_id.md @@ -0,0 +1,47 @@ +## 作用 + +识别给定语句的语言种类 + +## 配置文件需要改动的部分 + +```json +"resources": { + "common":{ + "cache_path": "~/.llm_web_kit_cache" + }, + "lang-id-176": { + "download_path": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", + "md5": "01810bc59c6a3d2b79c79e6336612f65" + }, + "lang-id-218": { + "download_path": "https://huggingface.co/facebook/fasttext-language-identification/resolve/main/model.bin?download=true", + "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" + }, + "political-24m7": { + "download_path": "XXXXXX", + "md5": "XXXXX" + } + }, +``` + +## 调用方法 + +```python +from llm_web_kit.model.lang_id import * +text = 'hello world, this is a test. the language is english' +print(update_language_by_str(text)) +#{'language': 'en','language_details': 'eng_Latn'} +print(decide_lang_by_str(text)) +#en +print(decide_lang_by_str_v218(text)) +#eng_Latn +``` + +## 运行时间 + +总共有 2099 条数据 +总 token 数: 379375 +平均 token 数: 180.74 +载入数据时间: 0.02 秒 +处理函数时间: 0.02 秒 +总时间: 0.04 秒 From 20988a22cf109d7199b7d9799d223d6841fb7621 Mon Sep 17 00:00:00 2001 From: huyc Date: Thu, 27 Feb 2025 20:48:36 +0800 Subject: [PATCH 13/22] lang_id doc revise --- docs/llm_web_kit/model/lang_id.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md index f3b081be..e62d2777 100644 --- a/docs/llm_web_kit/model/lang_id.md +++ b/docs/llm_web_kit/model/lang_id.md @@ -4,6 +4,8 @@ ## 配置文件需要改动的部分 +huggingface版本 + ```json "resources": { "common":{ @@ -24,6 +26,28 @@ }, ``` +s3版本 + +```json +"resources": { + "common":{ + "cache_path": "~/.llm_web_kit_cache" + }, + "lang-id-176": { + "download_path": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", + "md5": "01810bc59c6a3d2b79c79e6336612f65" + }, + "lang-id-218": { + "download_path": "s3://xyz-process-ylk2/xyz-users/huyucheng1/lid218e.bin", + "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" + }, + "political-24m7": { + "download_path": "XXXXXX", + "md5": "XXXXX" + } + }, +``` + ## 调用方法 ```python From f6ed2d80125d3dd14170b42172a04e373380849b Mon Sep 17 00:00:00 2001 From: huyc Date: Fri, 28 Feb 2025 14:49:06 +0800 Subject: [PATCH 14/22] lang_id doc revise --- docs/llm_web_kit/model/lang_id.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md index 6f609c7d..8c337746 100644 --- a/docs/llm_web_kit/model/lang_id.md +++ b/docs/llm_web_kit/model/lang_id.md @@ -34,11 +34,11 @@ s3版本: "cache_path": "~/.llm_web_kit_cache" }, "lang-id-176": { - "download_path": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", + "download_path": "s3://web-parse-huawei/shared_resource/language/lid176.bin", "md5": "01810bc59c6a3d2b79c79e6336612f65" }, "lang-id-218": { - "download_path": "s3://web-parse-huawei/shared_resource/identification/lid218e.bin", + "download_path": "s3://web-parse-huawei/shared_resource/language/lid218e.bin", "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" }, "political-24m7": { From 1721c831f90805ca7d093af4b244a10ef4dbd02f Mon Sep 17 00:00:00 2001 From: huyc Date: Fri, 28 Feb 2025 19:58:09 +0800 Subject: [PATCH 15/22] lang_id doc revise --- docs/llm_web_kit/model/lang_id.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md index 8c337746..5459bb86 100644 --- a/docs/llm_web_kit/model/lang_id.md +++ b/docs/llm_web_kit/model/lang_id.md @@ -18,10 +18,6 @@ huggingface版本: "lang-id-218": { "download_path": "https://huggingface.co/facebook/fasttext-language-identification/resolve/main/model.bin?download=true", "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" - }, - "political-24m7": { - "download_path": "XXXXXX", - "md5": "XXXXX" } }, ``` @@ -40,10 +36,6 @@ s3版本: "lang-id-218": { "download_path": "s3://web-parse-huawei/shared_resource/language/lid218e.bin", "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" - }, - "political-24m7": { - "download_path": "XXXXXX", - "md5": "XXXXX" } }, ``` From 27906a3b95838aa93a9376538d793a7f9f6807d2 Mon Sep 17 00:00:00 2001 From: huyc Date: Mon, 3 Mar 2025 20:49:48 +0800 Subject: [PATCH 16/22] revise lang_id code --- llm_web_kit/model/lang_id.py | 53 +++++++++++-------------- tests/llm_web_kit/model/test_lang_id.py | 53 +++++++------------------ 2 files changed, 39 insertions(+), 67 deletions(-) diff --git a/llm_web_kit/model/lang_id.py b/llm_web_kit/model/lang_id.py index 2f8c4d9f..a2e898ce 100644 --- a/llm_web_kit/model/lang_id.py +++ b/llm_web_kit/model/lang_id.py @@ -1,6 +1,6 @@ import os import re -from typing import Tuple +from typing import Dict, Tuple import fasttext @@ -196,7 +196,7 @@ def detect_latex_env(content_str: str) -> bool: return latex_env_pattern.search(content_str) is not None -def decide_language_func(content_str: str, lang_detect: LanguageIdentification) -> str: +def decide_language_func(content_str: str, lang_detect: LanguageIdentification) -> Dict[str, str]: """Decide language based on the content string. This function will truncate the content string if it is too long. This function will return "empty" if the content string is empty. @@ -216,7 +216,7 @@ def decide_language_func(content_str: str, lang_detect: LanguageIdentification) lang_detect (LanguageIdentification): The language identification model Returns: - str: The final language label + dict: Dictionary containing 'language' and 'language_details' keys """ # truncate the content string if it is too long @@ -236,40 +236,35 @@ def decide_language_func(content_str: str, lang_detect: LanguageIdentification) # return "empty" if the content string is empty if len(content_str.strip()) == 0: - return 'empty' + return {'language': 'empty', 'language_details': None} - if lang_detect.version in ['176.bin', '218.bin']: - predictions, probabilities = lang_detect.predict(content_str) - result = decide_language_by_prob_v176(predictions, probabilities) - else: + if lang_detect.version not in LANG_ID_SUPPORTED_VERSIONS: raise ValueError(f'Unsupported version: {lang_detect.version}. Supported versions: {LANG_ID_SUPPORTED_VERSIONS}') - return result + predictions, probabilities = lang_detect.predict(content_str) + language = decide_language_by_prob_v176(predictions, probabilities) -def decide_lang_by_str(content_str: str, model_path: str = None) -> str: - """Decide language based on the content string, based on - decide_language_func.""" - lang_detect = get_singleton_lang_detect(model_path) + language_details = None + if lang_detect.version == '218.bin': + first_pred = predictions[0] + match = re.match(r'^__label__([a-z]+)_[A-Za-z]+$', first_pred) + if match: + lang_code = match.group(1) + else: + lang_code = first_pred.replace('__label__', '').split('_')[0] + language_details = lang_code - return decide_language_func(content_str, lang_detect) + return { + 'language': language, + 'language_details': language_details + } -def decide_lang_by_str_v218(content_str: str, model_path: str = None) -> str: - """Decide language based on the content string, displayed in the format of - the fasttext218 model.""" +def update_language_by_str(content_str: str, model_path: str = None) -> Dict[str, str]: + """Decide language based on the content string and return a dictionary with + language and details.""" lang_detect = get_singleton_lang_detect(model_path) - if lang_detect.version == '176.bin': - return None - else: - return lang_detect.predict(content_str)[0][0].replace('__label__', '') - - -def update_language_by_str(content_str: str, model_path: str = None) -> str: - """Decide language based on the content string.""" - return { - 'language': decide_lang_by_str(content_str, model_path), - 'language_details': decide_lang_by_str_v218(content_str, model_path) - } + return decide_language_func(content_str, lang_detect) if __name__ == '__main__': diff --git a/tests/llm_web_kit/model/test_lang_id.py b/tests/llm_web_kit/model/test_lang_id.py index 4542c9dd..3ff5ea5d 100644 --- a/tests/llm_web_kit/model/test_lang_id.py +++ b/tests/llm_web_kit/model/test_lang_id.py @@ -2,8 +2,6 @@ from unittest.mock import MagicMock, patch from llm_web_kit.model.lang_id import (LanguageIdentification, - decide_lang_by_str, - decide_lang_by_str_v218, decide_language_by_prob_v176, decide_language_func, detect_code_block, detect_inline_equation, @@ -126,25 +124,27 @@ def test_decide_language_func(): lang_detect = MagicMock() lang_detect.version = '176.bin' lang_detect.predict.return_value = (['__label__en', '__label__zh'], [0.6, 0.4]) - assert decide_language_func('test text', lang_detect) == 'en' + result = decide_language_func('test text', lang_detect) + assert result == {'language': 'en', 'language_details': None} + # Test for 218.bin version + lang_detect.version = '218.bin' + lang_detect.predict.return_value = (['__label__eng_Latn', '__label__zho_Hans'], [0.6, 0.4]) + result = decide_language_func('test text', lang_detect) + assert result == {'language': 'en', 'language_details': 'eng'} -def test_decide_lang_by_str(): - with patch('llm_web_kit.model.lang_id.get_singleton_lang_detect') as mock_get_singleton_lang_detect, patch( - 'llm_web_kit.model.lang_id.decide_language_func') as mock_decide_language_func: - mock_get_singleton_lang_detect.return_value = MagicMock() - mock_decide_language_func.return_value = 'en' - assert decide_lang_by_str('test text') == 'en' + # Test for empty string + result = decide_language_func('', lang_detect) + assert result == {'language': 'empty', 'language_details': None} def test_update_language_by_str(): - # 模拟 decide_lang_by_str 和 decide_lang_by_str_v218 的行为 - with patch('llm_web_kit.model.lang_id.decide_lang_by_str') as mock_decide_lang_by_str, \ - patch('llm_web_kit.model.lang_id.decide_lang_by_str_v218') as mock_decide_lang_by_str_v218: + with patch('llm_web_kit.model.lang_id.get_singleton_lang_detect') as mock_get_singleton_lang_detect, \ + patch('llm_web_kit.model.lang_id.decide_language_func') as mock_decide_language_func: # 设置模拟函数的返回值 - mock_decide_lang_by_str.return_value = 'en' - mock_decide_lang_by_str_v218.return_value = 'en_v218' + mock_get_singleton_lang_detect.return_value = MagicMock() + mock_decide_language_func.return_value = {'language': 'en', 'language_details': 'eng'} # 调用被测函数 result = update_language_by_str('test text') @@ -152,30 +152,7 @@ def test_update_language_by_str(): # 验证返回结果 expected_result = { 'language': 'en', - 'language_details': 'en_v218' + 'language_details': 'eng' } assert result == expected_result, f'Expected {expected_result}, but got {result}' print('Test passed!') - - -class TestDecideLangByStrV218(unittest.TestCase): - - @patch('llm_web_kit.model.lang_id.get_singleton_lang_detect') - def test_decide_lang_by_str_v218(self, mock_get_singleton_lang_detect): - mock_lang_detect = MagicMock() - mock_lang_detect.predict.return_value = [('__label__en', 0.8), ('__label__fr', 0.2)] - mock_get_singleton_lang_detect.return_value = mock_lang_detect - - content_str = 'This is an English text.' - result = decide_lang_by_str_v218(content_str, 'model_path') - self.assertEqual(result, 'en') - - @patch('llm_web_kit.model.lang_id.get_singleton_lang_detect') - def test_decide_lang_by_str_v218_custom_model_path(self, mock_get_singleton_lang_detect): - mock_lang_detect = MagicMock() - mock_lang_detect.predict.return_value = [('__label__es', 0.9), ('__label__de', 0.1)] - mock_get_singleton_lang_detect.return_value = mock_lang_detect - - content_str = 'Este es un texto en español.' - result = decide_lang_by_str_v218(content_str, 'custom_model_path') - self.assertEqual(result, 'es') From 2caae1f3d80f2173203bd7160f51084c8b55176b Mon Sep 17 00:00:00 2001 From: huyc Date: Tue, 4 Mar 2025 10:24:59 +0800 Subject: [PATCH 17/22] revise lang_id code --- docs/llm_web_kit/model/lang_id.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md index 5459bb86..f8492313 100644 --- a/docs/llm_web_kit/model/lang_id.md +++ b/docs/llm_web_kit/model/lang_id.md @@ -47,17 +47,15 @@ from llm_web_kit.model.lang_id import * text = 'hello world, this is a test. the language is english' print(update_language_by_str(text)) #{'language': 'en','language_details': 'eng_Latn'} -print(decide_lang_by_str(text)) -#en -print(decide_lang_by_str_v218(text)) -#eng_Latn ``` ## 运行时间 -总共有 2099 条数据 +使用单cpu进行推理 +共有 2099 条数据 总 token 数: 379375 平均 token 数: 180.74 -载入数据时间: 0.02 秒 -处理函数时间: 0.02 秒 -总时间: 0.04 秒 +载入数据时间: 0.0214 秒 +语言识别时间: 2.4313 秒 +总时间: 2.4527 秒 +处理速度: 863.33 条/秒 From 4690492df356bb31af00ac874f4ad366e2342618 Mon Sep 17 00:00:00 2001 From: huyc Date: Tue, 4 Mar 2025 15:23:19 +0800 Subject: [PATCH 18/22] revise lang_id code --- llm_web_kit/model/lang_id.py | 9 +++------ tests/llm_web_kit/model/test_lang_id.py | 6 +++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/llm_web_kit/model/lang_id.py b/llm_web_kit/model/lang_id.py index a2e898ce..e5e546bf 100644 --- a/llm_web_kit/model/lang_id.py +++ b/llm_web_kit/model/lang_id.py @@ -247,12 +247,9 @@ def decide_language_func(content_str: str, lang_detect: LanguageIdentification) language_details = None if lang_detect.version == '218.bin': first_pred = predictions[0] - match = re.match(r'^__label__([a-z]+)_[A-Za-z]+$', first_pred) - if match: - lang_code = match.group(1) - else: - lang_code = first_pred.replace('__label__', '').split('_')[0] - language_details = lang_code + # Extract the full label (e.g., __label__eng_Latn -> eng_Latn) + if first_pred.startswith('__label__'): + language_details = first_pred.replace('__label__', '') return { 'language': language, diff --git a/tests/llm_web_kit/model/test_lang_id.py b/tests/llm_web_kit/model/test_lang_id.py index 3ff5ea5d..2c1b1f96 100644 --- a/tests/llm_web_kit/model/test_lang_id.py +++ b/tests/llm_web_kit/model/test_lang_id.py @@ -131,7 +131,7 @@ def test_decide_language_func(): lang_detect.version = '218.bin' lang_detect.predict.return_value = (['__label__eng_Latn', '__label__zho_Hans'], [0.6, 0.4]) result = decide_language_func('test text', lang_detect) - assert result == {'language': 'en', 'language_details': 'eng'} + assert result == {'language': 'en', 'language_details': 'eng_Latn'} # Test for empty string result = decide_language_func('', lang_detect) @@ -144,7 +144,7 @@ def test_update_language_by_str(): # 设置模拟函数的返回值 mock_get_singleton_lang_detect.return_value = MagicMock() - mock_decide_language_func.return_value = {'language': 'en', 'language_details': 'eng'} + mock_decide_language_func.return_value = {'language': 'en', 'language_details': 'eng_Latn'} # 调用被测函数 result = update_language_by_str('test text') @@ -152,7 +152,7 @@ def test_update_language_by_str(): # 验证返回结果 expected_result = { 'language': 'en', - 'language_details': 'eng' + 'language_details': 'eng_Latn' } assert result == expected_result, f'Expected {expected_result}, but got {result}' print('Test passed!') From 4e6955a3c2b33aa3d42714600ec5dba828d3c853 Mon Sep 17 00:00:00 2001 From: huyc Date: Wed, 5 Mar 2025 14:43:37 +0800 Subject: [PATCH 19/22] revise doc --- docs/llm_web_kit/model/lang_id.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md index f8492313..40857518 100644 --- a/docs/llm_web_kit/model/lang_id.md +++ b/docs/llm_web_kit/model/lang_id.md @@ -11,10 +11,6 @@ huggingface版本: "common":{ "cache_path": "~/.llm_web_kit_cache" }, - "lang-id-176": { - "download_path": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", - "md5": "01810bc59c6a3d2b79c79e6336612f65" - }, "lang-id-218": { "download_path": "https://huggingface.co/facebook/fasttext-language-identification/resolve/main/model.bin?download=true", "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" @@ -29,10 +25,6 @@ s3版本: "common":{ "cache_path": "~/.llm_web_kit_cache" }, - "lang-id-176": { - "download_path": "s3://web-parse-huawei/shared_resource/language/lid176.bin", - "md5": "01810bc59c6a3d2b79c79e6336612f65" - }, "lang-id-218": { "download_path": "s3://web-parse-huawei/shared_resource/language/lid218e.bin", "sha256": "8ded5749a2ad79ae9ab7c9190c7c8b97ff20d54ad8b9527ffa50107238fc7f6a" From 03a760b2fd9428ec333be883537863f320e66ba7 Mon Sep 17 00:00:00 2001 From: huyc Date: Wed, 5 Mar 2025 18:58:05 +0800 Subject: [PATCH 20/22] revise doc --- docs/llm_web_kit/model/lang_id.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md index 40857518..ba9b6a15 100644 --- a/docs/llm_web_kit/model/lang_id.md +++ b/docs/llm_web_kit/model/lang_id.md @@ -43,11 +43,11 @@ print(update_language_by_str(text)) ## 运行时间 -使用单cpu进行推理 -共有 2099 条数据 -总 token 数: 379375 -平均 token 数: 180.74 -载入数据时间: 0.0214 秒 -语言识别时间: 2.4313 秒 -总时间: 2.4527 秒 -处理速度: 863.33 条/秒 +使用单cpu进行推理\ +共有 2099 条数据\ +总 token 数: 379375\ +平均 token 数: 180.74\ +载入数据时间: 0.0214 秒\ +语言识别时间: 2.4313 秒\ +总时间: 2.4527 秒\ +处理速度: 863.33 条/秒\\ From a86e713b81d8b1dc38af7fc3a14243a41efbae20 Mon Sep 17 00:00:00 2001 From: huyc Date: Wed, 5 Mar 2025 19:07:34 +0800 Subject: [PATCH 21/22] revise doc --- docs/llm_web_kit/model/lang_id.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md index ba9b6a15..3beec824 100644 --- a/docs/llm_web_kit/model/lang_id.md +++ b/docs/llm_web_kit/model/lang_id.md @@ -50,4 +50,4 @@ print(update_language_by_str(text)) 载入数据时间: 0.0214 秒\ 语言识别时间: 2.4313 秒\ 总时间: 2.4527 秒\ -处理速度: 863.33 条/秒\\ +处理速度: 863.33 条/秒 From ac794d299ba4dfe2aa2f01b136926a1ce6e38efb Mon Sep 17 00:00:00 2001 From: huyc Date: Thu, 6 Mar 2025 18:14:41 +0800 Subject: [PATCH 22/22] revise doc --- docs/llm_web_kit/model/lang_id.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md index 3beec824..40857518 100644 --- a/docs/llm_web_kit/model/lang_id.md +++ b/docs/llm_web_kit/model/lang_id.md @@ -43,11 +43,11 @@ print(update_language_by_str(text)) ## 运行时间 -使用单cpu进行推理\ -共有 2099 条数据\ -总 token 数: 379375\ -平均 token 数: 180.74\ -载入数据时间: 0.0214 秒\ -语言识别时间: 2.4313 秒\ -总时间: 2.4527 秒\ +使用单cpu进行推理 +共有 2099 条数据 +总 token 数: 379375 +平均 token 数: 180.74 +载入数据时间: 0.0214 秒 +语言识别时间: 2.4313 秒 +总时间: 2.4527 秒 处理速度: 863.33 条/秒