From a963d725f3699807116f8a2d22477b3ad5c4614b Mon Sep 17 00:00:00 2001
From: drunkpig <xuchao@pjlab.org.cn>
Date: Wed, 19 Mar 2025 11:55:55 +0800
Subject: [PATCH 1/2] feat: simple user api to extract html to markdown

---
 README.md                                   | 19 +++++
 llm_web_kit/config/__init__.py              |  0
 llm_web_kit/config/cfg_reader.py            | 15 +++-
 llm_web_kit/extractor/html/extractor.py     |  2 +-
 llm_web_kit/libs/logger.py                  |  2 +-
 llm_web_kit/model/resource_utils/utils.py   |  3 +-
 llm_web_kit/simple.py                       | 60 ++++++++++++++
 tests/llm_web_kit/config/test_cfg_reader.py | 24 ++++++
 tests/llm_web_kit/test_simple.py            | 86 +++++++++++++++++++++
 9 files changed, 206 insertions(+), 5 deletions(-)
 create mode 100644 llm_web_kit/config/__init__.py
 create mode 100644 llm_web_kit/simple.py
 create mode 100644 tests/llm_web_kit/config/test_cfg_reader.py
 create mode 100644 tests/llm_web_kit/test_simple.py

diff --git a/README.md b/README.md
index 53b1f449..c4e0f717 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,25 @@ llm-web-kit is a python library that ..
 
 ## Quick Start
 
+```python
+from llm_web_kit.simple import extract_html_to_md
+import traceback
+from loguru import logger
+
+def extract(url:str, html:str) -> str:
+    try:
+        nlp_md = extract_html_to_md(url, html)
+        return nlp_md
+    except Exception as e:
+        logger.exception(e)
+    return None
+
+if __name__=="__main__":
+    url = ""
+    html = ""
+    markdown = extract(url, html)
+```
+
 ## Usage
 
 # TODO
diff --git a/llm_web_kit/config/__init__.py b/llm_web_kit/config/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/llm_web_kit/config/cfg_reader.py b/llm_web_kit/config/cfg_reader.py
index 24aa9948..bd295d27 100644
--- a/llm_web_kit/config/cfg_reader.py
+++ b/llm_web_kit/config/cfg_reader.py
@@ -1,12 +1,13 @@
 import os
 
 import commentjson as json
+from loguru import logger
 
 from llm_web_kit.exception.exception import ModelResourceException
 from llm_web_kit.libs.path_lib import get_py_pkg_root_dir
 
 
-def load_config() -> dict:
+def load_config(suppress_error: bool = False) -> dict:
     """Load the configuration file for the web kit. First try to read the
     configuration file from the environment variable LLM_WEB_KIT_CFG_PATH. If
     the environment variable is not set, use the default configuration file
@@ -27,12 +28,24 @@ def load_config() -> dict:
     if env_cfg_path:
         cfg_path = env_cfg_path
         if not os.path.exists(cfg_path):
+            if suppress_error:
+                return {}
+
+            logger.warning(
+                f'environment variable LLM_WEB_KIT_CFG_PATH points to a non-exist file: {cfg_path}'
+            )
             raise ModelResourceException(
                 f'environment variable LLM_WEB_KIT_CFG_PATH points to a non-exist file: {cfg_path}'
             )
     else:
         cfg_path = os.path.expanduser('~/.llm-web-kit.jsonc')
         if not os.path.exists(cfg_path):
+            if suppress_error:
+                return {}
+
+            logger.warning(
+                f'{cfg_path} does not exist, please create one or set environment variable LLM_WEB_KIT_CFG_PATH to a valid file path'
+            )
             raise ModelResourceException(
                 f'{cfg_path} does not exist, please create one or set environment variable LLM_WEB_KIT_CFG_PATH to a valid file path'
             )
diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py
index 1194b71c..da5f4d92 100644
--- a/llm_web_kit/extractor/html/extractor.py
+++ b/llm_web_kit/extractor/html/extractor.py
@@ -379,5 +379,5 @@ def __get_custom_rule(self) -> dict:
         Returns:
 
         """
-        config = load_config()
+        config = load_config(suppress_error=True)
         return config.get('magic-html-custom-rule', {})
diff --git a/llm_web_kit/libs/logger.py b/llm_web_kit/libs/logger.py
index 610d1182..4b72c45b 100644
--- a/llm_web_kit/libs/logger.py
+++ b/llm_web_kit/libs/logger.py
@@ -16,7 +16,7 @@ def init_logger(config: dict = None):
         logger_cfg = config.get('logger', [])
 
     if not logger_cfg:
-        logger_cfg = load_config().get('logger', [])
+        logger_cfg = load_config(suppress_error=True).get('logger', [])
 
     if not logger_cfg:
         return logger
diff --git a/llm_web_kit/model/resource_utils/utils.py b/llm_web_kit/model/resource_utils/utils.py
index 42dacbf5..58947ac0 100644
--- a/llm_web_kit/model/resource_utils/utils.py
+++ b/llm_web_kit/model/resource_utils/utils.py
@@ -14,12 +14,11 @@ def decide_cache_dir():
 
     if 'WEB_KIT_CACHE_DIR' in os.environ:
         cache_dir = os.environ['WEB_KIT_CACHE_DIR']
-
     try:
         config = load_config()
         cache_dir = config['resources']['common']['cache_path']
     except Exception:
-        pass
+        pass  # best effort: config is optional here, keep the current cache_dir
 
     if cache_dir.startswith('~/'):
         cache_dir = os.path.expanduser(cache_dir)
diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py
new file mode 100644
index 00000000..4ccc91cd
--- /dev/null
+++ b/llm_web_kit/simple.py
@@ -0,0 +1,60 @@
+"""predefined simple user functions."""
+
+import uuid
+from datetime import datetime
+
+from llm_web_kit.config.cfg_reader import load_pipe_tpl
+from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
+from llm_web_kit.input.datajson import DataJson
+
+
+class ExtractorType:
+    HTML = 'html'
+    PDF = 'pdf'
+    EBOOK = 'ebook'
+
+
+class ExtractorFactory:
+    """factory class for extractor."""
+    html_extractor = None
+    pdf_extractor = None
+    ebook_extractor = None
+
+    @staticmethod
+    def get_extractor(extractor_type: ExtractorType):
+        if extractor_type == ExtractorType.HTML:
+            if ExtractorFactory.html_extractor is None:
+                extractor_cfg = load_pipe_tpl('html')
+                chain = ExtractSimpleFactory.create(extractor_cfg)
+                ExtractorFactory.html_extractor = chain
+            return ExtractorFactory.html_extractor
+        else:
+            raise ValueError(f'Invalid extractor type: {extractor_type}')
+
+
+def __extract_html(url:str, html_content: str) -> DataJson:
+    extractor = ExtractorFactory.get_extractor(ExtractorType.HTML)
+    input_data_dict = {
+        'track_id': str(uuid.uuid4()),
+        'url': url,
+        'html': html_content,
+        'dataset_name': 'llm-web-kit-quickstart',
+        'data_source_category': 'HTML',
+        'file_bytes': len(html_content),
+        'meta_info': {'input_datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+    }
+    d = DataJson(input_data_dict)
+    result = extractor.extract(d)
+    return result
+
+
+def extract_html_to_md(url:str, html_content: str) -> str:
+    """extract html to markdown without images."""
+    result = __extract_html(url, html_content)
+    return result.get_content_list().to_mm_md()
+
+
+def extract_html_to_mm_md(url:str, html_content: str) -> str:
+    """extract html to markdown with images."""
+    result = __extract_html(url, html_content)
+    return result.get_content_list().to_mm_md()
diff --git a/tests/llm_web_kit/config/test_cfg_reader.py b/tests/llm_web_kit/config/test_cfg_reader.py
new file mode 100644
index 00000000..3dba7551
--- /dev/null
+++ b/tests/llm_web_kit/config/test_cfg_reader.py
@@ -0,0 +1,24 @@
+
+import os
+import unittest
+
+from llm_web_kit.config.cfg_reader import load_config
+from llm_web_kit.exception.exception import ModelResourceException
+
+
+class TestCfgReader(unittest.TestCase):
+    """Test cases for the config reader module."""
+    def test_get_config_path(self):
+        """Test load_config with a non-existent config path, with and without suppress_error."""
+        # Test when environment variable is set
+        # Test with non-existent file path in environment variable
+        os.environ['LLM_WEB_KIT_CFG_PATH'] = '/path/to/nonexistent/config.jsonc'
+        with self.assertRaises(ModelResourceException):
+            load_config()
+
+        # Test with suppress_error=True
+        config = load_config(suppress_error=True)
+        assert config == {}
+
+        # Clean up environment variable
+        del os.environ['LLM_WEB_KIT_CFG_PATH']
diff --git a/tests/llm_web_kit/test_simple.py b/tests/llm_web_kit/test_simple.py
new file mode 100644
index 00000000..b435c8a5
--- /dev/null
+++ b/tests/llm_web_kit/test_simple.py
@@ -0,0 +1,86 @@
+import unittest
+from unittest.mock import MagicMock, patch
+
+from llm_web_kit.input.datajson import DataJson
+from llm_web_kit.simple import (ExtractorFactory, ExtractorType,
+                                extract_html_to_md, extract_html_to_mm_md)
+
+
+class TestSimple(unittest.TestCase):
+    def setUp(self):
+        self.url = 'https://example.com'
+        self.html_content = '<html><body><h1>Test Content</h1><p>This is a test paragraph.</p></body></html>'
+
+    @patch('llm_web_kit.simple.ExtractSimpleFactory.create')
+    @patch('llm_web_kit.simple.load_pipe_tpl')
+    def test_extractor_factory(self, mock_load_pipe_tpl, mock_create):
+        # Setup mocks
+        mock_chain = MagicMock()
+        mock_create.return_value = mock_chain
+        mock_load_pipe_tpl.return_value = {'config': 'test'}
+
+        # Test HTML extractor creation
+        extractor = ExtractorFactory.get_extractor(ExtractorType.HTML)
+        self.assertEqual(extractor, mock_chain)
+        mock_load_pipe_tpl.assert_called_once_with('html')
+        mock_create.assert_called_once_with({'config': 'test'})
+
+        # Test caching - should reuse the same extractor
+        ExtractorFactory.get_extractor(ExtractorType.HTML)
+        # Verify the mocks were not called again
+        mock_load_pipe_tpl.assert_called_once()
+        mock_create.assert_called_once()
+
+        # Test invalid extractor type
+        with self.assertRaises(ValueError):
+            ExtractorFactory.get_extractor('invalid_type')
+
+    @patch('llm_web_kit.simple.ExtractorFactory.get_extractor')
+    def test_extract_html_to_md(self, mock_get_extractor):
+        # Setup mock
+        mock_extractor = MagicMock()
+        mock_result = MagicMock()
+        mock_content_list = MagicMock()
+        mock_content_list.to_mm_md.return_value = '# Test Content\n\nThis is a test paragraph.'
+        mock_result.get_content_list.return_value = mock_content_list
+        mock_extractor.extract.return_value = mock_result
+        mock_get_extractor.return_value = mock_extractor
+
+        # Test extract_html_to_md
+        result = extract_html_to_md(self.url, self.html_content)
+        self.assertEqual(result, '# Test Content\n\nThis is a test paragraph.')
+
+        # Verify the mock was called with correct parameters
+        mock_get_extractor.assert_called_once_with(ExtractorType.HTML)
+        mock_extractor.extract.assert_called_once()
+        # Verify DataJson was created with correct data
+        call_args = mock_extractor.extract.call_args[0][0]
+        self.assertIsInstance(call_args, DataJson)
+        self.assertEqual(call_args.get('url'), self.url)
+        self.assertEqual(call_args.get('html_content'), self.html_content)
+        self.assertEqual(call_args.get('dataset_name'), 'llm-web-kit-quickstart')
+        self.assertEqual(call_args.get('data_source_category'), 'HTML')
+
+    @patch('llm_web_kit.simple.ExtractorFactory.get_extractor')
+    def test_extract_html_to_mm_md(self, mock_get_extractor):
+        # Setup mock
+        mock_extractor = MagicMock()
+        mock_result = MagicMock()
+        mock_content_list = MagicMock()
+        mock_content_list.to_mm_md.return_value = '# Test Content\n\nThis is a test paragraph.'
+        mock_result.get_content_list.return_value = mock_content_list
+        mock_extractor.extract.return_value = mock_result
+        mock_get_extractor.return_value = mock_extractor
+
+        # Test extract_html_to_mm_md
+        result = extract_html_to_mm_md(self.url, self.html_content)
+        self.assertEqual(result, '# Test Content\n\nThis is a test paragraph.')
+
+        # Verify the mock was called with correct parameters
+        mock_get_extractor.assert_called_once_with(ExtractorType.HTML)
+        mock_extractor.extract.assert_called_once()
+        # Verify DataJson was created with correct data
+        call_args = mock_extractor.extract.call_args[0][0]
+        self.assertIsInstance(call_args, DataJson)
+        self.assertEqual(call_args.get('url'), self.url)
+        self.assertEqual(call_args.get('html_content'), self.html_content)

From ebc6dda37fd526f1cd4ed608516e0ed6f9ee3f9e Mon Sep 17 00:00:00 2001
From: drunkpig <xuchao@pjlab.org.cn>
Date: Wed, 19 Mar 2025 13:39:49 +0800
Subject: [PATCH 2/2] feat: user simple api for extract html

---
 README.md                        |  1 +
 llm_web_kit/simple.py            |  3 +-
 tests/llm_web_kit/test_simple.py | 83 +++-----------------------------
 3 files changed, 11 insertions(+), 76 deletions(-)

diff --git a/README.md b/README.md
index c4e0f717..cb24d1d2 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,7 @@ from loguru import logger
 def extract(url:str, html:str) -> str:
     try:
         nlp_md = extract_html_to_md(url, html)
+        # for markdown that keeps images, import and call extract_html_to_mm_md(url, html) instead
         return nlp_md
     except Exception as e:
         logger.exception(e)
diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py
index 4ccc91cd..b483b130 100644
--- a/llm_web_kit/simple.py
+++ b/llm_web_kit/simple.py
@@ -51,10 +51,11 @@ def __extract_html(url:str, html_content: str) -> DataJson:
 def extract_html_to_md(url:str, html_content: str) -> str:
     """extract html to markdown without images."""
     result = __extract_html(url, html_content)
-    return result.get_content_list().to_mm_md()
+    return result.get_content_list().to_nlp_md()
 
 
 def extract_html_to_mm_md(url:str, html_content: str) -> str:
     """extract html to markdown with images."""
+
     result = __extract_html(url, html_content)
     return result.get_content_list().to_mm_md()
diff --git a/tests/llm_web_kit/test_simple.py b/tests/llm_web_kit/test_simple.py
index b435c8a5..dc8ac8ab 100644
--- a/tests/llm_web_kit/test_simple.py
+++ b/tests/llm_web_kit/test_simple.py
@@ -1,86 +1,19 @@
 import unittest
-from unittest.mock import MagicMock, patch
 
-from llm_web_kit.input.datajson import DataJson
-from llm_web_kit.simple import (ExtractorFactory, ExtractorType,
-                                extract_html_to_md, extract_html_to_mm_md)
+from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md
 
 
 class TestSimple(unittest.TestCase):
     def setUp(self):
         self.url = 'https://example.com'
-        self.html_content = '<html><body><h1>Test Content</h1><p>This is a test paragraph.</p></body></html>'
+        self.html_content = '<html><body><h1>Test Content</h1><p>This is a test paragraph.</p><img src="https://example.com/image.jpg" alt="Test Image" /></body></html>'
 
-    @patch('llm_web_kit.simple.ExtractSimpleFactory.create')
-    @patch('llm_web_kit.simple.load_pipe_tpl')
-    def test_extractor_factory(self, mock_load_pipe_tpl, mock_create):
+    def test_extractor_factory(self):
         # Setup mocks
-        mock_chain = MagicMock()
-        mock_create.return_value = mock_chain
-        mock_load_pipe_tpl.return_value = {'config': 'test'}
+        md = extract_html_to_md(self.url, self.html_content)
+        self.assertEqual(md, '# Test Content\n\nThis is a test paragraph.\n')
 
-        # Test HTML extractor creation
-        extractor = ExtractorFactory.get_extractor(ExtractorType.HTML)
-        self.assertEqual(extractor, mock_chain)
-        mock_load_pipe_tpl.assert_called_once_with('html')
-        mock_create.assert_called_once_with({'config': 'test'})
-
-        # Test caching - should reuse the same extractor
-        ExtractorFactory.get_extractor(ExtractorType.HTML)
-        # Verify the mocks were not called again
-        mock_load_pipe_tpl.assert_called_once()
-        mock_create.assert_called_once()
-
-        # Test invalid extractor type
-        with self.assertRaises(ValueError):
-            ExtractorFactory.get_extractor('invalid_type')
-
-    @patch('llm_web_kit.simple.ExtractorFactory.get_extractor')
-    def test_extract_html_to_md(self, mock_get_extractor):
+    def test_extract_html_to_mm_md(self):
         # Setup mock
-        mock_extractor = MagicMock()
-        mock_result = MagicMock()
-        mock_content_list = MagicMock()
-        mock_content_list.to_mm_md.return_value = '# Test Content\n\nThis is a test paragraph.'
-        mock_result.get_content_list.return_value = mock_content_list
-        mock_extractor.extract.return_value = mock_result
-        mock_get_extractor.return_value = mock_extractor
-
-        # Test extract_html_to_md
-        result = extract_html_to_md(self.url, self.html_content)
-        self.assertEqual(result, '# Test Content\n\nThis is a test paragraph.')
-
-        # Verify the mock was called with correct parameters
-        mock_get_extractor.assert_called_once_with(ExtractorType.HTML)
-        mock_extractor.extract.assert_called_once()
-        # Verify DataJson was created with correct data
-        call_args = mock_extractor.extract.call_args[0][0]
-        self.assertIsInstance(call_args, DataJson)
-        self.assertEqual(call_args.get('url'), self.url)
-        self.assertEqual(call_args.get('html_content'), self.html_content)
-        self.assertEqual(call_args.get('dataset_name'), 'llm-web-kit-quickstart')
-        self.assertEqual(call_args.get('data_source_category'), 'HTML')
-
-    @patch('llm_web_kit.simple.ExtractorFactory.get_extractor')
-    def test_extract_html_to_mm_md(self, mock_get_extractor):
-        # Setup mock
-        mock_extractor = MagicMock()
-        mock_result = MagicMock()
-        mock_content_list = MagicMock()
-        mock_content_list.to_mm_md.return_value = '# Test Content\n\nThis is a test paragraph.'
-        mock_result.get_content_list.return_value = mock_content_list
-        mock_extractor.extract.return_value = mock_result
-        mock_get_extractor.return_value = mock_extractor
-
-        # Test extract_html_to_mm_md
-        result = extract_html_to_mm_md(self.url, self.html_content)
-        self.assertEqual(result, '# Test Content\n\nThis is a test paragraph.')
-
-        # Verify the mock was called with correct parameters
-        mock_get_extractor.assert_called_once_with(ExtractorType.HTML)
-        mock_extractor.extract.assert_called_once()
-        # Verify DataJson was created with correct data
-        call_args = mock_extractor.extract.call_args[0][0]
-        self.assertIsInstance(call_args, DataJson)
-        self.assertEqual(call_args.get('url'), self.url)
-        self.assertEqual(call_args.get('html_content'), self.html_content)
+        mm_md = extract_html_to_mm_md(self.url, self.html_content)
+        self.assertEqual(mm_md, '# Test Content\n\nThis is a test paragraph.\n\n![Test Image]( "")\n')