From a963d725f3699807116f8a2d22477b3ad5c4614b Mon Sep 17 00:00:00 2001 From: drunkpig Date: Wed, 19 Mar 2025 11:55:55 +0800 Subject: [PATCH 1/2] feat: simple user api to extract html to markdown --- README.md | 19 +++++ llm_web_kit/config/__init__.py | 0 llm_web_kit/config/cfg_reader.py | 15 +++- llm_web_kit/extractor/html/extractor.py | 2 +- llm_web_kit/libs/logger.py | 2 +- llm_web_kit/model/resource_utils/utils.py | 3 +- llm_web_kit/simple.py | 60 ++++++++++++++ tests/llm_web_kit/config/test_cfg_reader.py | 24 ++++++ tests/llm_web_kit/test_simple.py | 86 +++++++++++++++++++++ 9 files changed, 206 insertions(+), 5 deletions(-) create mode 100644 llm_web_kit/config/__init__.py create mode 100644 llm_web_kit/simple.py create mode 100644 tests/llm_web_kit/config/test_cfg_reader.py create mode 100644 tests/llm_web_kit/test_simple.py diff --git a/README.md b/README.md index 53b1f449..c4e0f717 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,25 @@ llm-web-kit is a python library that .. ## Quick Start +```python +from llm_web_kit.simple import extract_html_to_md +import traceback +from loguru import logger + +def extract(url:str, html:str) -> str: + try: + nlp_md = extract_html_to_md(url, html) + return nlp_md + except Exception as e: + logger.exception(e) + return None + +if __name__=="__main__": + url = "" + html = "" + markdown = extract(url, html) +``` + ## Usage # TODO diff --git a/llm_web_kit/config/__init__.py b/llm_web_kit/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/llm_web_kit/config/cfg_reader.py b/llm_web_kit/config/cfg_reader.py index 24aa9948..bd295d27 100644 --- a/llm_web_kit/config/cfg_reader.py +++ b/llm_web_kit/config/cfg_reader.py @@ -1,12 +1,13 @@ import os import commentjson as json +from loguru import logger from llm_web_kit.exception.exception import ModelResourceException from llm_web_kit.libs.path_lib import get_py_pkg_root_dir -def load_config() -> dict: +def load_config(suppress_error: bool = False) -> dict: """Load the configuration file for the web kit. First try to read the configuration file from the environment variable LLM_WEB_KIT_CFG_PATH. If the environment variable is not set, use the default configuration file @@ -27,12 +28,24 @@ def load_config() -> dict: if env_cfg_path: cfg_path = env_cfg_path if not os.path.exists(cfg_path): + if suppress_error: + return {} + + logger.warning( + f'environment variable LLM_WEB_KIT_CFG_PATH points to a non-exist file: {cfg_path}' + ) raise ModelResourceException( f'environment variable LLM_WEB_KIT_CFG_PATH points to a non-exist file: {cfg_path}' ) else: cfg_path = os.path.expanduser('~/.llm-web-kit.jsonc') if not os.path.exists(cfg_path): + if suppress_error: + return {} + + logger.warning( + f'{cfg_path} does not exist, please create one or set environment variable LLM_WEB_KIT_CFG_PATH to a valid file path' + ) raise ModelResourceException( f'{cfg_path} does not exist, please create one or set environment variable LLM_WEB_KIT_CFG_PATH to a valid file path' ) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 1194b71c..da5f4d92 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -379,5 +379,5 @@ def __get_custom_rule(self) -> dict: Returns: """ - config = load_config() + config = load_config(suppress_error=True) return config.get('magic-html-custom-rule', {}) diff --git a/llm_web_kit/libs/logger.py b/llm_web_kit/libs/logger.py index 610d1182..4b72c45b 100644 --- a/llm_web_kit/libs/logger.py +++ b/llm_web_kit/libs/logger.py @@ -16,7 +16,7 @@ def init_logger(config: dict = None): logger_cfg = config.get('logger', []) if not logger_cfg: - logger_cfg = load_config().get('logger', []) + logger_cfg = load_config(suppress_error=True).get('logger', []) if not logger_cfg: return logger diff --git a/llm_web_kit/model/resource_utils/utils.py b/llm_web_kit/model/resource_utils/utils.py index 42dacbf5..58947ac0 100644 --- a/llm_web_kit/model/resource_utils/utils.py +++ b/llm_web_kit/model/resource_utils/utils.py @@ -14,12 +14,11 @@ def decide_cache_dir(): if 'WEB_KIT_CACHE_DIR' in os.environ: cache_dir = os.environ['WEB_KIT_CACHE_DIR'] - try: config = load_config() cache_dir = config['resources']['common']['cache_path'] except Exception: - pass + pass # ignore this exception if cache_dir.startswith('~/'): cache_dir = os.path.expanduser(cache_dir) diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py new file mode 100644 index 00000000..4ccc91cd --- /dev/null +++ b/llm_web_kit/simple.py @@ -0,0 +1,60 @@ +"""predefined simple user functions.""" + +import uuid +from datetime import datetime + +from llm_web_kit.config.cfg_reader import load_pipe_tpl +from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory +from llm_web_kit.input.datajson import DataJson + + +class ExtractorType: + HTML = 'html' + PDF = 'pdf' + EBOOK = 'ebook' + + +class ExtractorFactory: + """factory class for extractor.""" + html_extractor = None + pdf_extractor = None + ebook_extractor = None + + @staticmethod + def get_extractor(extractor_type: ExtractorType): + if extractor_type == ExtractorType.HTML: + if ExtractorFactory.html_extractor is None: + extractor_cfg = load_pipe_tpl('html') + chain = ExtractSimpleFactory.create(extractor_cfg) + ExtractorFactory.html_extractor = chain + return ExtractorFactory.html_extractor + else: + raise ValueError(f'Invalid extractor type: {extractor_type}') + + +def __extract_html(url:str, html_content: str) -> DataJson: + extractor = ExtractorFactory.get_extractor(ExtractorType.HTML) + input_data_dict = { + 'track_id': str(uuid.uuid4()), + 'url': url, + 'html': html_content, + 'dataset_name': 'llm-web-kit-quickstart', + 'data_source_category': 'HTML', + 'file_bytes': len(html_content), + 'meta_info': {'input_datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')} + } + d = DataJson(input_data_dict) + result = extractor.extract(d) + return result + + +def extract_html_to_md(url:str, html_content: str) -> str: + """extract html to markdown without images.""" + result = __extract_html(url, html_content) + return result.get_content_list().to_mm_md() + + +def extract_html_to_mm_md(url:str, html_content: str) -> str: + """extract html to markdown with images.""" + result = __extract_html(url, html_content) + return result.get_content_list().to_mm_md() diff --git a/tests/llm_web_kit/config/test_cfg_reader.py b/tests/llm_web_kit/config/test_cfg_reader.py new file mode 100644 index 00000000..3dba7551 --- /dev/null +++ b/tests/llm_web_kit/config/test_cfg_reader.py @@ -0,0 +1,24 @@ + +import os +import unittest + +from llm_web_kit.config.cfg_reader import load_config +from llm_web_kit.exception.exception import ModelResourceException + + +class TestCfgReader(unittest.TestCase): + """Test cases for the config reader module.""" + def test_get_config_path(self): + """Test the get_config_path function with different scenarios.""" + # Test when environment variable is set + # Test with non-existent file path in environment variable + os.environ['LLM_WEB_KIT_CFG_PATH'] = '/path/to/nonexistent/config.jsonc' + with self.assertRaises(ModelResourceException): + load_config() + + # Test with suppress_error=True + config = load_config(suppress_error=True) + assert config == {} + + # Clean up environment variable + del os.environ['LLM_WEB_KIT_CFG_PATH'] diff --git a/tests/llm_web_kit/test_simple.py b/tests/llm_web_kit/test_simple.py new file mode 100644 index 00000000..b435c8a5 --- /dev/null +++ b/tests/llm_web_kit/test_simple.py @@ -0,0 +1,86 @@ +import unittest +from unittest.mock import MagicMock, patch + +from llm_web_kit.input.datajson import DataJson +from llm_web_kit.simple import (ExtractorFactory, ExtractorType, + extract_html_to_md, extract_html_to_mm_md) + + +class TestSimple(unittest.TestCase): + def setUp(self): + self.url = 'https://example.com' + self.html_content = '

Test Content

This is a test paragraph.

' + + @patch('llm_web_kit.simple.ExtractSimpleFactory.create') + @patch('llm_web_kit.simple.load_pipe_tpl') + def test_extractor_factory(self, mock_load_pipe_tpl, mock_create): + # Setup mocks + mock_chain = MagicMock() + mock_create.return_value = mock_chain + mock_load_pipe_tpl.return_value = {'config': 'test'} + + # Test HTML extractor creation + extractor = ExtractorFactory.get_extractor(ExtractorType.HTML) + self.assertEqual(extractor, mock_chain) + mock_load_pipe_tpl.assert_called_once_with('html') + mock_create.assert_called_once_with({'config': 'test'}) + + # Test caching - should reuse the same extractor + ExtractorFactory.get_extractor(ExtractorType.HTML) + # Verify the mocks were not called again + mock_load_pipe_tpl.assert_called_once() + mock_create.assert_called_once() + + # Test invalid extractor type + with self.assertRaises(ValueError): + ExtractorFactory.get_extractor('invalid_type') + + @patch('llm_web_kit.simple.ExtractorFactory.get_extractor') + def test_extract_html_to_md(self, mock_get_extractor): + # Setup mock + mock_extractor = MagicMock() + mock_result = MagicMock() + mock_content_list = MagicMock() + mock_content_list.to_mm_md.return_value = '# Test Content\n\nThis is a test paragraph.' + mock_result.get_content_list.return_value = mock_content_list + mock_extractor.extract.return_value = mock_result + mock_get_extractor.return_value = mock_extractor + + # Test extract_html_to_md + result = extract_html_to_md(self.url, self.html_content) + self.assertEqual(result, '# Test Content\n\nThis is a test paragraph.') + + # Verify the mock was called with correct parameters + mock_get_extractor.assert_called_once_with(ExtractorType.HTML) + mock_extractor.extract.assert_called_once() + # Verify DataJson was created with correct data + call_args = mock_extractor.extract.call_args[0][0] + self.assertIsInstance(call_args, DataJson) + self.assertEqual(call_args.get('url'), self.url) + self.assertEqual(call_args.get('html_content'), self.html_content) + self.assertEqual(call_args.get('dataset_name'), 'llm-web-kit-quickstart') + self.assertEqual(call_args.get('data_source_category'), 'HTML') + + @patch('llm_web_kit.simple.ExtractorFactory.get_extractor') + def test_extract_html_to_mm_md(self, mock_get_extractor): + # Setup mock + mock_extractor = MagicMock() + mock_result = MagicMock() + mock_content_list = MagicMock() + mock_content_list.to_mm_md.return_value = '# Test Content\n\nThis is a test paragraph.' + mock_result.get_content_list.return_value = mock_content_list + mock_extractor.extract.return_value = mock_result + mock_get_extractor.return_value = mock_extractor + + # Test extract_html_to_mm_md + result = extract_html_to_mm_md(self.url, self.html_content) + self.assertEqual(result, '# Test Content\n\nThis is a test paragraph.') + + # Verify the mock was called with correct parameters + mock_get_extractor.assert_called_once_with(ExtractorType.HTML) + mock_extractor.extract.assert_called_once() + # Verify DataJson was created with correct data + call_args = mock_extractor.extract.call_args[0][0] + self.assertIsInstance(call_args, DataJson) + self.assertEqual(call_args.get('url'), self.url) + self.assertEqual(call_args.get('html_content'), self.html_content) From ebc6dda37fd526f1cd4ed608516e0ed6f9ee3f9e Mon Sep 17 00:00:00 2001 From: drunkpig Date: Wed, 19 Mar 2025 13:39:49 +0800 Subject: [PATCH 2/2] feat: user simple api for extract html --- README.md | 1 + llm_web_kit/simple.py | 3 +- tests/llm_web_kit/test_simple.py | 83 +++----------------------------- 3 files changed, 11 insertions(+), 76 deletions(-) diff --git a/README.md b/README.md index c4e0f717..cb24d1d2 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ from loguru import logger def extract(url:str, html:str) -> str: try: nlp_md = extract_html_to_md(url, html) + # or mm_nlp_md = extract_html_to_mm_md(url, html) return nlp_md except Exception as e: logger.exception(e) diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py index 4ccc91cd..b483b130 100644 --- a/llm_web_kit/simple.py +++ b/llm_web_kit/simple.py @@ -51,10 +51,11 @@ def __extract_html(url:str, html_content: str) -> DataJson: def extract_html_to_md(url:str, html_content: str) -> str: """extract html to markdown without images.""" result = __extract_html(url, html_content) - return result.get_content_list().to_mm_md() + return result.get_content_list().to_nlp_md() def extract_html_to_mm_md(url:str, html_content: str) -> str: """extract html to markdown with images.""" + result = __extract_html(url, html_content) return result.get_content_list().to_mm_md() diff --git a/tests/llm_web_kit/test_simple.py b/tests/llm_web_kit/test_simple.py index b435c8a5..dc8ac8ab 100644 --- a/tests/llm_web_kit/test_simple.py +++ b/tests/llm_web_kit/test_simple.py @@ -1,86 +1,19 @@ import unittest -from unittest.mock import MagicMock, patch -from llm_web_kit.input.datajson import DataJson -from llm_web_kit.simple import (ExtractorFactory, ExtractorType, - extract_html_to_md, extract_html_to_mm_md) +from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md class TestSimple(unittest.TestCase): def setUp(self): self.url = 'https://example.com' - self.html_content = '

Test Content

This is a test paragraph.

' + self.html_content = '

Test Content

This is a test paragraph.

Test Image' - @patch('llm_web_kit.simple.ExtractSimpleFactory.create') - @patch('llm_web_kit.simple.load_pipe_tpl') - def test_extractor_factory(self, mock_load_pipe_tpl, mock_create): + def test_extractor_factory(self): # Setup mocks - mock_chain = MagicMock() - mock_create.return_value = mock_chain - mock_load_pipe_tpl.return_value = {'config': 'test'} + md = extract_html_to_md(self.url, self.html_content) + self.assertEqual(md, '# Test Content\n\nThis is a test paragraph.\n') - # Test HTML extractor creation - extractor = ExtractorFactory.get_extractor(ExtractorType.HTML) - self.assertEqual(extractor, mock_chain) - mock_load_pipe_tpl.assert_called_once_with('html') - mock_create.assert_called_once_with({'config': 'test'}) - - # Test caching - should reuse the same extractor - ExtractorFactory.get_extractor(ExtractorType.HTML) - # Verify the mocks were not called again - mock_load_pipe_tpl.assert_called_once() - mock_create.assert_called_once() - - # Test invalid extractor type - with self.assertRaises(ValueError): - ExtractorFactory.get_extractor('invalid_type') - - @patch('llm_web_kit.simple.ExtractorFactory.get_extractor') - def test_extract_html_to_md(self, mock_get_extractor): + def test_extract_html_to_mm_md(self): # Setup mock - mock_extractor = MagicMock() - mock_result = MagicMock() - mock_content_list = MagicMock() - mock_content_list.to_mm_md.return_value = '# Test Content\n\nThis is a test paragraph.' - mock_result.get_content_list.return_value = mock_content_list - mock_extractor.extract.return_value = mock_result - mock_get_extractor.return_value = mock_extractor - - # Test extract_html_to_md - result = extract_html_to_md(self.url, self.html_content) - self.assertEqual(result, '# Test Content\n\nThis is a test paragraph.') - - # Verify the mock was called with correct parameters - mock_get_extractor.assert_called_once_with(ExtractorType.HTML) - mock_extractor.extract.assert_called_once() - # Verify DataJson was created with correct data - call_args = mock_extractor.extract.call_args[0][0] - self.assertIsInstance(call_args, DataJson) - self.assertEqual(call_args.get('url'), self.url) - self.assertEqual(call_args.get('html_content'), self.html_content) - self.assertEqual(call_args.get('dataset_name'), 'llm-web-kit-quickstart') - self.assertEqual(call_args.get('data_source_category'), 'HTML') - - @patch('llm_web_kit.simple.ExtractorFactory.get_extractor') - def test_extract_html_to_mm_md(self, mock_get_extractor): - # Setup mock - mock_extractor = MagicMock() - mock_result = MagicMock() - mock_content_list = MagicMock() - mock_content_list.to_mm_md.return_value = '# Test Content\n\nThis is a test paragraph.' - mock_result.get_content_list.return_value = mock_content_list - mock_extractor.extract.return_value = mock_result - mock_get_extractor.return_value = mock_extractor - - # Test extract_html_to_mm_md - result = extract_html_to_mm_md(self.url, self.html_content) - self.assertEqual(result, '# Test Content\n\nThis is a test paragraph.') - - # Verify the mock was called with correct parameters - mock_get_extractor.assert_called_once_with(ExtractorType.HTML) - mock_extractor.extract.assert_called_once() - # Verify DataJson was created with correct data - call_args = mock_extractor.extract.call_args[0][0] - self.assertIsInstance(call_args, DataJson) - self.assertEqual(call_args.get('url'), self.url) - self.assertEqual(call_args.get('html_content'), self.html_content) + mm_md = extract_html_to_mm_md(self.url, self.html_content) + self.assertEqual(mm_md, '# Test Content\n\nThis is a test paragraph.\n\n![Test Image]( "")\n')