ccprocessor · drunkpig · Mar 19, 2025 · Mar 19, 2025 · Mar 19, 2025
diff --git a/README.md b/README.md
@@ -75,6 +75,26 @@ llm-web-kit is a python library that ..
 
 ## Quick Start
 
+```python
+from typing import Optional
+
+from loguru import logger
+from llm_web_kit.simple import extract_html_to_md
+
+def extract(url: str, html: str) -> Optional[str]:
+    """Return html as markdown, or None on failure (extract_html_to_mm_md keeps images)."""
+    try:
+        return extract_html_to_md(url, html)
+    except Exception as e:
+        logger.exception(e)
+        return None
+
+if __name__=="__main__":
+    url = ""
+    html = ""
+    markdown = extract(url, html)
+```
+
 ## Usage
 
 # TODO

diff --git a/llm_web_kit/config/__init__.py b/llm_web_kit/config/__init__.py
diff --git a/llm_web_kit/config/cfg_reader.py b/llm_web_kit/config/cfg_reader.py
@@ -1,12 +1,13 @@
 import os
 
 import commentjson as json
+from loguru import logger
 
 from llm_web_kit.exception.exception import ModelResourceException
 from llm_web_kit.libs.path_lib import get_py_pkg_root_dir
 
 
-def load_config() -> dict:
+def load_config(suppress_error: bool = False) -> dict:
     """Load the configuration file for the web kit. First try to read the
     configuration file from the environment variable LLM_WEB_KIT_CFG_PATH. If
     the environment variable is not set, use the default configuration file
@@ -27,12 +28,24 @@
     if env_cfg_path:
         cfg_path = env_cfg_path
         if not os.path.exists(cfg_path):
+            if suppress_error:
+                return {}
+
+            logger.warning(
+                f'environment variable LLM_WEB_KIT_CFG_PATH points to a non-exist file: {cfg_path}'
+            )
             raise ModelResourceException(
                 f'environment variable LLM_WEB_KIT_CFG_PATH points to a non-exist file: {cfg_path}'
             )
     else:
         cfg_path = os.path.expanduser('~/.llm-web-kit.jsonc')
         if not os.path.exists(cfg_path):
+            if suppress_error:
+                return {}
+
+            logger.warning(
+                f'{cfg_path} does not exist, please create one or set environment variable LLM_WEB_KIT_CFG_PATH to a valid file path'
+            )
             raise ModelResourceException(
                 f'{cfg_path} does not exist, please create one or set environment variable LLM_WEB_KIT_CFG_PATH to a valid file path'
             )

diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py
@@ -379,5 +379,5 @@ def __get_custom_rule(self) -> dict:
         Returns:
 
         """
-        config = load_config()
+        config = load_config(suppress_error=True)
         return config.get('magic-html-custom-rule', {})
diff --git a/llm_web_kit/libs/logger.py b/llm_web_kit/libs/logger.py
@@ -16,7 +16,7 @@ def init_logger(config: dict = None):
         logger_cfg = config.get('logger', [])
 
     if not logger_cfg:
-        logger_cfg = load_config().get('logger', [])
+        logger_cfg = load_config(suppress_error=True).get('logger', [])
 
     if not logger_cfg:
         return logger

diff --git a/llm_web_kit/model/resource_utils/utils.py b/llm_web_kit/model/resource_utils/utils.py
@@ -14,12 +14,11 @@ def decide_cache_dir():
 
     if 'WEB_KIT_CACHE_DIR' in os.environ:
         cache_dir = os.environ['WEB_KIT_CACHE_DIR']
-
     try:
         config = load_config()
         cache_dir = config['resources']['common']['cache_path']
     except Exception:
-        pass
+        pass  # ignore this exception
 
     if cache_dir.startswith('~/'):
         cache_dir = os.path.expanduser(cache_dir)

diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py
@@ -0,0 +1,61 @@
+"""predefined simple user functions."""
+
+import uuid
+from datetime import datetime
+
+from llm_web_kit.config.cfg_reader import load_pipe_tpl
+from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
+from llm_web_kit.input.datajson import DataJson
+
+
+class ExtractorType:
+    HTML = 'html'  # the only value ExtractorFactory.get_extractor accepts
+    PDF = 'pdf'  # declared only; get_extractor raises ValueError for it
+    EBOOK = 'ebook'  # declared only; get_extractor raises ValueError for it
+
+
+class ExtractorFactory:
+    """Factory that builds the HTML extractor chain once and caches it on the class."""
+    html_extractor = None  # cached chain, created on the first HTML request
+    pdf_extractor = None  # placeholder; never assigned in this class
+    ebook_extractor = None  # placeholder; never assigned in this class
+
+    @staticmethod
+    def get_extractor(extractor_type: ExtractorType):  # NOTE(review): callers pass the str constants of ExtractorType
+        if extractor_type == ExtractorType.HTML:
+            if ExtractorFactory.html_extractor is None:  # first request: build the chain
+                extractor_cfg = load_pipe_tpl('html')  # presumably the pipeline template for html; confirm in cfg_reader
+                chain = ExtractSimpleFactory.create(extractor_cfg)
+                ExtractorFactory.html_extractor = chain  # reused by every later call
+            return ExtractorFactory.html_extractor
+        else:
+            raise ValueError(f'Invalid extractor type: {extractor_type}')  # includes PDF and EBOOK
+
+
+def __extract_html(url:str, html_content: str) -> DataJson:  # shared by both extract_html_to_* helpers
+    extractor = ExtractorFactory.get_extractor(ExtractorType.HTML)
+    input_data_dict = {
+        'track_id': str(uuid.uuid4()),  # fresh random id for every call
+        'url': url,
+        'html': html_content,
+        'dataset_name': 'llm-web-kit-quickstart',
+        'data_source_category': 'HTML',
+        'file_bytes': len(html_content),  # NOTE(review): len() of a str counts characters, not encoded bytes
+        'meta_info': {'input_datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  # naive local time
+    }
+    d = DataJson(input_data_dict)
+    result = extractor.extract(d)
+    return result
+
+
+def extract_html_to_md(url:str, html_content: str) -> str:
+    """Extract html_content and return markdown without images (to_nlp_md)."""
+    result = __extract_html(url, html_content)
+    return result.get_content_list().to_nlp_md()
+
+
+def extract_html_to_mm_md(url:str, html_content: str) -> str:
+    """Extract html_content and return markdown with images (to_mm_md)."""
+
+    result = __extract_html(url, html_content)
+    return result.get_content_list().to_mm_md()
diff --git a/tests/llm_web_kit/config/test_cfg_reader.py b/tests/llm_web_kit/config/test_cfg_reader.py
@@ -0,0 +1,24 @@
+
+import os
+import unittest
+
+from llm_web_kit.config.cfg_reader import load_config
+from llm_web_kit.exception.exception import ModelResourceException
+
+
+class TestCfgReader(unittest.TestCase):
+    """Test cases for the config reader module."""
+    def test_get_config_path(self):
+        """load_config raises for a missing LLM_WEB_KIT_CFG_PATH file unless suppressed."""
+        # Point the environment variable at a file that does not exist
+        os.environ['LLM_WEB_KIT_CFG_PATH'] = '/path/to/nonexistent/config.jsonc'
+        # Registered cleanup runs even if an assertion below fails, so the
+        # bogus path cannot leak into other tests
+        self.addCleanup(os.environ.pop, 'LLM_WEB_KIT_CFG_PATH', None)
+
+        # Without suppress_error the missing file is reported as an exception
+        with self.assertRaises(ModelResourceException):
+            load_config()
+
+        # With suppress_error=True an empty config is returned instead
+        self.assertEqual(load_config(suppress_error=True), {})
diff --git a/tests/llm_web_kit/test_simple.py b/tests/llm_web_kit/test_simple.py
@@ -0,0 +1,19 @@
+import unittest
+
+from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md
+
+
+class TestSimple(unittest.TestCase):
+    def setUp(self):
+        self.url = 'https://example.com'
+        self.html_content = '<html><body><h1>Test Content</h1><p>This is a test paragraph.</p><img src="https://example.com/image.jpg" alt="Test Image" /></body></html>'
+
+    def test_extractor_factory(self):
+        # Nothing is mocked: this calls the real extract_html_to_md; the <img> is absent from the expected markdown
+        md = extract_html_to_md(self.url, self.html_content)
+        self.assertEqual(md, '# Test Content\n\nThis is a test paragraph.\n')
+
+    def test_extract_html_to_mm_md(self):
+        # Nothing is mocked; NOTE(review): the expected image link has an empty target, confirm this is intended
+        mm_md = extract_html_to_mm_md(self.url, self.html_content)
+        self.assertEqual(mm_md, '# Test Content\n\nThis is a test paragraph.\n\n![Test Image]( "")\n')