Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,26 @@ llm-web-kit is a python library that ..

## Quick Start

```python
from llm_web_kit.simple import extract_html_to_md
import traceback
from loguru import logger

def extract(url: str, html: str) -> str:
    """Convert one HTML page to markdown.

    Args:
        url: address the page was fetched from.
        html: raw HTML source of the page.

    Returns:
        The markdown produced by ``extract_html_to_md``; ``None`` when the
        extraction raises (the exception is logged, not propagated).
    """
    try:
        # or: extract_html_to_mm_md(url, html)
        return extract_html_to_md(url, html)
    except Exception as err:
        # Log the full traceback and fall through to the None result.
        logger.exception(err)
    return None

if __name__ == "__main__":
    # Placeholders: fill in the page address and its HTML source.
    url = ""
    html = ""
    # Holds the markdown text, or None if the extraction failed.
    markdown = extract(url, html)
```

## Usage

# TODO
Expand Down
Empty file added llm_web_kit/config/__init__.py
Empty file.
15 changes: 14 additions & 1 deletion llm_web_kit/config/cfg_reader.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import os

import commentjson as json
from loguru import logger

from llm_web_kit.exception.exception import ModelResourceException
from llm_web_kit.libs.path_lib import get_py_pkg_root_dir


def load_config() -> dict:
def load_config(suppress_error: bool = False) -> dict:
"""Load the configuration file for the web kit. First try to read the
configuration file from the environment variable LLM_WEB_KIT_CFG_PATH. If
the environment variable is not set, use the default configuration file
Expand All @@ -27,12 +28,24 @@
if env_cfg_path:
cfg_path = env_cfg_path
if not os.path.exists(cfg_path):
if suppress_error:
return {}

logger.warning(
f'environment variable LLM_WEB_KIT_CFG_PATH points to a non-exist file: {cfg_path}'
)
raise ModelResourceException(
f'environment variable LLM_WEB_KIT_CFG_PATH points to a non-exist file: {cfg_path}'
)
else:
cfg_path = os.path.expanduser('~/.llm-web-kit.jsonc')
if not os.path.exists(cfg_path):
if suppress_error:
return {}

logger.warning(

Check warning on line 46 in llm_web_kit/config/cfg_reader.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/config/cfg_reader.py#L46

Added line #L46 was not covered by tests
f'{cfg_path} does not exist, please create one or set environment variable LLM_WEB_KIT_CFG_PATH to a valid file path'
)
raise ModelResourceException(
f'{cfg_path} does not exist, please create one or set environment variable LLM_WEB_KIT_CFG_PATH to a valid file path'
)
Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,5 +379,5 @@ def __get_custom_rule(self) -> dict:
Returns:

"""
config = load_config()
config = load_config(suppress_error=True)
return config.get('magic-html-custom-rule', {})
2 changes: 1 addition & 1 deletion llm_web_kit/libs/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def init_logger(config: dict = None):
logger_cfg = config.get('logger', [])

if not logger_cfg:
logger_cfg = load_config().get('logger', [])
logger_cfg = load_config(suppress_error=True).get('logger', [])

if not logger_cfg:
return logger
Expand Down
3 changes: 1 addition & 2 deletions llm_web_kit/model/resource_utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,11 @@ def decide_cache_dir():

if 'WEB_KIT_CACHE_DIR' in os.environ:
cache_dir = os.environ['WEB_KIT_CACHE_DIR']

try:
config = load_config()
cache_dir = config['resources']['common']['cache_path']
except Exception:
pass
pass # ignore this exception

if cache_dir.startswith('~/'):
cache_dir = os.path.expanduser(cache_dir)
Expand Down
61 changes: 61 additions & 0 deletions llm_web_kit/simple.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""predefined simple user functions."""

import uuid
from datetime import datetime

from llm_web_kit.config.cfg_reader import load_pipe_tpl
from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
from llm_web_kit.input.datajson import DataJson


class ExtractorType:
    """String identifiers for the kinds of extractor that can be requested."""

    HTML = 'html'
    PDF = 'pdf'
    EBOOK = 'ebook'


class ExtractorFactory:
    """Factory that builds an extractor chain on first use and then reuses it."""

    # Class-level cache slots. Only ``html_extractor`` is ever filled in by
    # ``get_extractor``; the other two stay ``None``.
    html_extractor = None
    pdf_extractor = None
    ebook_extractor = None

    @staticmethod
    def get_extractor(extractor_type: ExtractorType):
        """Return the shared extractor chain for ``extractor_type``.

        The HTML chain is created from the ``'html'`` pipeline template the
        first time it is requested and cached on the class afterwards.

        Raises:
            ValueError: if ``extractor_type`` is not ``ExtractorType.HTML``.
        """
        # Guard clause: nothing but HTML is supported here.
        if extractor_type != ExtractorType.HTML:
            raise ValueError(f'Invalid extractor type: {extractor_type}')

        cached_chain = ExtractorFactory.html_extractor
        if cached_chain is None:
            pipe_cfg = load_pipe_tpl('html')
            cached_chain = ExtractSimpleFactory.create(pipe_cfg)
            ExtractorFactory.html_extractor = cached_chain
        return cached_chain

Check warning on line 32 in llm_web_kit/simple.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/simple.py#L32

Added line #L32 was not covered by tests


def __extract_html(url: str, html_content: str) -> DataJson:
    """Run the HTML extractor chain over one page.

    Args:
        url: address of the page.
        html_content: HTML source of the page.

    Returns:
        Whatever the HTML extractor chain's ``extract`` returns for the
        ``DataJson`` built from the arguments.
    """
    extractor = ExtractorFactory.get_extractor(ExtractorType.HTML)

    # ``len()`` of a str counts characters, not bytes, which under-reports the
    # size of any page containing non-ASCII text. Report the UTF-8 encoded
    # size instead; bytes-like input keeps its plain length.
    if isinstance(html_content, str):
        size_in_bytes = len(html_content.encode('utf-8', errors='replace'))
    else:
        size_in_bytes = len(html_content)

    input_data_dict = {
        'track_id': str(uuid.uuid4()),  # fresh random id per call
        'url': url,
        'html': html_content,
        'dataset_name': 'llm-web-kit-quickstart',
        'data_source_category': 'HTML',
        'file_bytes': size_in_bytes,
        'meta_info': {'input_datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
    }
    d = DataJson(input_data_dict)
    result = extractor.extract(d)
    return result


def extract_html_to_md(url: str, html_content: str) -> str:
    """Convert an HTML page to markdown that leaves images out."""
    extracted = __extract_html(url, html_content)
    content_list = extracted.get_content_list()
    return content_list.to_nlp_md()


def extract_html_to_mm_md(url: str, html_content: str) -> str:
    """Convert an HTML page to markdown that keeps images."""
    extracted = __extract_html(url, html_content)
    content_list = extracted.get_content_list()
    return content_list.to_mm_md()
24 changes: 24 additions & 0 deletions tests/llm_web_kit/config/test_cfg_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

import os
import unittest

from llm_web_kit.config.cfg_reader import load_config
from llm_web_kit.exception.exception import ModelResourceException


class TestCfgReader(unittest.TestCase):
    """Test cases for the config reader module."""

    def test_get_config_path(self):
        """Check load_config when LLM_WEB_KIT_CFG_PATH names a missing file."""
        from unittest import mock

        missing_cfg_env = {
            'LLM_WEB_KIT_CFG_PATH': '/path/to/nonexistent/config.jsonc',
        }
        # patch.dict puts os.environ back exactly as it was when the block
        # exits, even if an assertion fails, so the variable cannot leak into
        # other tests and a value set by the developer is not lost.
        with mock.patch.dict(os.environ, missing_cfg_env):
            # Default behaviour: a missing file is an error.
            with self.assertRaises(ModelResourceException):
                load_config()

            # With suppress_error=True an empty config is returned instead.
            self.assertEqual(load_config(suppress_error=True), {})
19 changes: 19 additions & 0 deletions tests/llm_web_kit/test_simple.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import unittest

from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md


class TestSimple(unittest.TestCase):
    """End-to-end checks for the helpers in ``llm_web_kit.simple``."""

    def setUp(self):
        # A small page with a heading, a paragraph and an image.
        self.url = 'https://example.com'
        self.html_content = '<html><body><h1>Test Content</h1><p>This is a test paragraph.</p><img src="https://example.com/image.jpg" alt="Test Image" /></body></html>'

    def test_extractor_factory(self):
        # Text-only markdown: the image does not appear in the output.
        expected_md = '# Test Content\n\nThis is a test paragraph.\n'
        actual_md = extract_html_to_md(self.url, self.html_content)
        self.assertEqual(actual_md, expected_md)

    def test_extract_html_to_mm_md(self):
        # Markdown with images: the image is rendered as an image link.
        expected_mm_md = '# Test Content\n\nThis is a test paragraph.\n\n![Test Image]( "")\n'
        actual_mm_md = extract_html_to_mm_md(self.url, self.html_content)
        self.assertEqual(actual_mm_md, expected_mm_md)