From 45fa4c304871843813a3a66e04e350b6dacdbdcb Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Tue, 26 Aug 2025 17:40:52 +0800
Subject: [PATCH 01/12] feat: add html parse api

---
 llm_web_kit/api/README.md                |  79 ++++++++++++++++
 llm_web_kit/api/__init__.py              |   7 ++
 llm_web_kit/api/dependencies.py          |  66 +++++++++++++
 llm_web_kit/api/main.py                  |  74 +++++++++++++++
 llm_web_kit/api/models/__init__.py       |  13 +++
 llm_web_kit/api/models/request.py        |  34 +++++++
 llm_web_kit/api/models/response.py       |  99 ++++++++++++++++++++
 llm_web_kit/api/requirements.txt         |  21 +++++
 llm_web_kit/api/routers/__init__.py      |   8 ++
 llm_web_kit/api/routers/htmls.py         |  88 ++++++++++++++++++
 llm_web_kit/api/run_server.py            |  29 ++++++
 llm_web_kit/api/services/__init__.py     |   8 ++
 llm_web_kit/api/services/html_service.py | 112 +++++++++++++++++++++++
 13 files changed, 638 insertions(+)
 create mode 100644 llm_web_kit/api/README.md
 create mode 100644 llm_web_kit/api/__init__.py
 create mode 100644 llm_web_kit/api/dependencies.py
 create mode 100644 llm_web_kit/api/main.py
 create mode 100644 llm_web_kit/api/models/__init__.py
 create mode 100644 llm_web_kit/api/models/request.py
 create mode 100644 llm_web_kit/api/models/response.py
 create mode 100644 llm_web_kit/api/requirements.txt
 create mode 100644 llm_web_kit/api/routers/__init__.py
 create mode 100644 llm_web_kit/api/routers/htmls.py
 create mode 100644 llm_web_kit/api/run_server.py
 create mode 100644 llm_web_kit/api/services/__init__.py
 create mode 100644 llm_web_kit/api/services/html_service.py
diff --git a/llm_web_kit/api/README.md b/llm_web_kit/api/README.md
new file mode 100644
index 00000000..69cd4de5
--- /dev/null
+++ b/llm_web_kit/api/README.md
@@ -0,0 +1,79 @@
+# LLM Web Kit API
+
+基于 FastAPI 的 LLM Web Kit API 服务，提供 HTML 解析功能。
+
+## 功能特性
+
+- 🚀 基于 FastAPI 的高性能 Web API
+- 📄 HTML 内容解析与结构化输出
+- 🔗 支持 URL 和 HTML 字符串输入
+- 📁 支持 HTML 文件上传
+- 📚 自动生成的 API 文档
+- 🔧 可配置的解析选项
+
+## 快速开始
+
+```bash
+pip install -r llm_web_kit/api/requirements.txt
+python llm_web_kit/api/run_server.py
+```
+
+- Swagger UI: http://127.0.0.1:8000/docs
+- ReDoc: http://127.0.0.1:8000/redoc
+
+## API 端点
+
+### HTML 解析
+
+POST /api/v1/html/parse
+
+请求示例：
+
+```json
+{
+  "html_content": "<html><body><h1>Hello World</h1></body></html>",
+  "options": {
+    "clean_html": true
+  }
+}
+```
+
+### 文件上传解析
+
+POST /api/v1/html/upload
+
+```bash
+curl -s -X POST "http://127.0.0.1:8000/api/v1/html/upload" \
+  -F "file=@/path/to/file.html"
+```
+
+### 服务状态
+
+GET /api/v1/html/status
+
+## 返回结构示例
+
+```json
+{
+  "success": true,
+  "message": "HTML 解析成功",
+  "timestamp": "2025-08-26T16:45:43.140638",
+  "data": {
+    "layout_file_list": [],
+    "typical_raw_html": "<html><body><h1>Hello World</h1></body></html>",
+    "typical_raw_tag_html": "<html><body><h1 _item_id=\"1\">Hello World</h1></body></html>\n",
+    "llm_response": {
+      "item_id 1": 0,
+      "item_id 2": 1
+    },
+    "typical_main_html": "<html></html>",
+    "html_target_list": []
+  },
+  "metadata": null
+}
+```
+
+## 常见问题
+
+- 422 错误：确认请求头 `Content-Type: application/json`，并确保请求体 JSON 合法。
+- 依赖缺失：`pip install -r llm_web_kit/api/requirements.txt`。
diff --git a/llm_web_kit/api/__init__.py b/llm_web_kit/api/__init__.py
new file mode 100644
index 00000000..c2601bff
--- /dev/null
+++ b/llm_web_kit/api/__init__.py
@@ -0,0 +1,7 @@
+"""LLM Web Kit API 模块.
+
+提供基于 FastAPI 的 Web API 接口，用于处理 HTML 解析和内容提取功能。
+"""
+
+__version__ = "1.0.0"
+__author__ = "LLM Web Kit Team"
diff --git a/llm_web_kit/api/dependencies.py b/llm_web_kit/api/dependencies.py
new file mode 100644
index 00000000..fe6f5136
--- /dev/null
+++ b/llm_web_kit/api/dependencies.py
@@ -0,0 +1,66 @@
+"""API 依赖项管理.
+
+包含 FastAPI 应用的依赖项、配置管理和共享服务。
+"""
+
+import logging
+from functools import lru_cache
+from typing import Optional
+
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+logger = logging.getLogger(__name__)
+
+
+class Settings(BaseSettings):
+    """Application settings (pydantic-settings; also reads a `.env` file)."""
+
+    # API metadata
+    api_title: str = "LLM Web Kit API"
+    api_version: str = "1.0.0"
+    api_description: str = "基于 LLM 的 Web 内容解析和提取 API 服务"
+
+    # Server settings
+    host: str = "0.0.0.0"
+    port: int = 8000
+    debug: bool = False
+
+    # Logging settings
+    log_level: str = "INFO"
+
+    # Model settings
+    model_path: Optional[str] = None
+    max_content_length: int = 10 * 1024 * 1024  # 10MB
+
+    # Cache settings
+    cache_ttl: int = 3600  # 1 hour
+
+    # pydantic v2 style configuration
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        case_sensitive=False
+    )
+
+
+@lru_cache()
+def get_settings() -> Settings:
+    """获取应用配置单例."""
+    return Settings()
+
+
+def get_logger(name: str = __name__) -> logging.Logger:
+    """Return the logger for `name`, adding a stream handler on first use."""
+    logger = logging.getLogger(name)
+    if not logger.handlers:
+        handler = logging.StreamHandler()
+        handler.setFormatter(logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        ))
+        logger.addHandler(handler)
+        # setLevel() rejects lower-case names such as "info" with ValueError.
+        logger.setLevel(get_settings().log_level.upper())
+    return logger
+
+
+# 全局依赖项
+settings = get_settings()
diff --git a/llm_web_kit/api/main.py b/llm_web_kit/api/main.py
new file mode 100644
index 00000000..77d2716c
--- /dev/null
+++ b/llm_web_kit/api/main.py
@@ -0,0 +1,74 @@
+"""FastAPI 应用主入口.
+
+提供 LLM Web Kit 的 Web API 服务，包括 HTML 解析、内容提取等功能。
+"""
+
+import uvicorn
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+
+from .dependencies import get_logger, get_settings
+from .routers import htmls
+
+settings = get_settings()
+logger = get_logger(__name__)
+
+
+# 创建 FastAPI 应用实例（元数据读取自 Settings）
+app = FastAPI(
+    title=settings.api_title,
+    description=settings.api_description,
+    version=settings.api_version,
+    docs_url="/docs",
+    redoc_url="/redoc"
+)
+
+# CORS: wildcard origins are only valid without credentials (see FastAPI docs).
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # restrict to specific domains in production
+    allow_credentials=False,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# 注册路由
+app.include_router(htmls.router, prefix="/api/v1", tags=["HTML 处理"])
+
+
+@app.get("/")
+async def root():
+    """根路径，返回服务状态信息."""
+    return {
+        "message": "LLM Web Kit API 服务运行中",
+        "version": settings.api_version,
+        "status": "healthy"
+    }
+
+
+@app.get("/health")
+async def health_check():
+    """健康检查端点."""
+    return {"status": "healthy", "service": "llm-web-kit-api"}
+
+
+@app.exception_handler(Exception)
+async def global_exception_handler(request, exc):
+    """Log any unhandled exception with its traceback and return a JSON 500."""
+    logger.error(f"未处理的异常: {exc}", exc_info=exc)
+    return JSONResponse(
+        status_code=500,
+        content={"detail": "服务器内部错误", "error": str(exc)}
+    )
+
+
+if __name__ == "__main__":
+    # 开发环境运行
+    uvicorn.run(
+        "llm_web_kit.api.main:app",
+        host=settings.host,
+        port=settings.port,
+        reload=True,
+        log_level=(settings.log_level or "INFO").lower()
+    )
diff --git a/llm_web_kit/api/models/__init__.py b/llm_web_kit/api/models/__init__.py
new file mode 100644
index 00000000..8f1a1ad6
--- /dev/null
+++ b/llm_web_kit/api/models/__init__.py
@@ -0,0 +1,13 @@
+"""Pydantic 模型模块.
+
+包含所有 API 请求和响应的数据模型定义。
+"""
+
+from .request import HTMLParseRequest
+from .response import ErrorResponse, HTMLParseResponse
+
+__all__ = [
+    "HTMLParseRequest",
+    "HTMLParseResponse",
+    "ErrorResponse"
+]
diff --git a/llm_web_kit/api/models/request.py b/llm_web_kit/api/models/request.py
new file mode 100644
index 00000000..6b2d4362
--- /dev/null
+++ b/llm_web_kit/api/models/request.py
@@ -0,0 +1,34 @@
+"""请求数据模型.
+
+定义 API 请求的数据结构和验证规则。
+"""
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class HTMLParseRequest(BaseModel):
+    """HTML 解析请求模型."""
+
+    html_content: Optional[str] = Field(
+        None,
+        description="HTML 内容字符串",
+        max_length=10485760  # 10MB
+    )
+
+    options: Optional[Dict[str, Any]] = Field(
+        default_factory=dict,
+        description="解析选项配置"
+    )
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "html_content": "<html><body><h1>Hello World</h1></body></html>",
+                "options": {
+                    "clean_html": True
+                }
+            }
+        }
+    )
diff --git a/llm_web_kit/api/models/response.py b/llm_web_kit/api/models/response.py
new file mode 100644
index 00000000..99fbf98f
--- /dev/null
+++ b/llm_web_kit/api/models/response.py
@@ -0,0 +1,99 @@
+"""响应数据模型.
+
+定义 API 响应的数据结构和格式。
+"""
+
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class ErrorResponse(BaseModel):
+    """错误响应模型."""
+
+    success: bool = Field(False, description="请求是否成功")
+    error: str = Field(..., description="错误信息")
+    detail: Optional[str] = Field(None, description="详细错误信息")
+    timestamp: datetime = Field(default_factory=datetime.now, description="错误发生时间")
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "success": False,
+                "error": "HTML 解析失败",
+                "detail": "无效的 HTML 格式",
+                "timestamp": "2024-01-01T12:00:00"
+            }
+        }
+    )
+
+
+class BaseResponse(BaseModel):
+    """基础响应模型."""
+
+    success: bool = Field(..., description="请求是否成功")
+    message: str = Field(..., description="响应消息")
+    timestamp: datetime = Field(default_factory=datetime.now, description="响应时间")
+
+
+class HTMLParseData(BaseModel):
+    """HTML 解析结果的结构化数据."""
+    layout_file_list: List[str] = Field(default_factory=list, description="布局文件列表")
+    typical_raw_html: Optional[str] = Field(None, description="原始 HTML")
+    typical_raw_tag_html: Optional[str] = Field(None, description="带标签标注的原始 HTML")
+    llm_response: Dict[str, int] = Field(default_factory=dict, description="LLM 项目打标结果")
+    typical_main_html: Optional[str] = Field(None, description="解析得到的主体 HTML")
+    html_target_list: List[Any] = Field(default_factory=list, description="正文候选/目标列表")
+
+
+class HTMLParseResponse(BaseResponse):
+    """HTML 解析响应模型."""
+
+    data: Optional[HTMLParseData] = Field(None, description="解析结果数据")
+    metadata: Optional[Dict[str, Any]] = Field(None, description="元数据信息")
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "success": True,
+                "message": "HTML 解析成功",
+                "timestamp": "2025-08-26T16:45:43.140638",
+                "data": {
+                    "layout_file_list": [],
+                    "typical_raw_html": "<html><body><h1>Hello World</h1></body></html>",
+                    "typical_raw_tag_html": "<html><body><h1 _item_id=\"1\">Hello World</h1></body></html>\n",
+                    "llm_response": {
+                        "item_id 1": 0,
+                        "item_id 9": 1
+                    },
+                    "typical_main_html": "<html></html>",
+                    "html_target_list": []
+                },
+                "metadata": None
+            }
+        }
+    )
+
+
+class ServiceStatusResponse(BaseResponse):
+    """服务状态响应模型."""
+
+    service: str = Field(..., description="服务名称")
+    version: str = Field(..., description="服务版本")
+    status: str = Field(..., description="服务状态")
+    uptime: Optional[float] = Field(None, description="运行时间（秒）")
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "success": True,
+                "message": "服务状态正常",
+                "timestamp": "2024-01-01T12:00:00",
+                "service": "HTML Processing Service",
+                "version": "1.0.0",
+                "status": "running",
+                "uptime": 3600.5
+            }
+        }
+    )
diff --git a/llm_web_kit/api/requirements.txt b/llm_web_kit/api/requirements.txt
new file mode 100644
index 00000000..1cbb5459
--- /dev/null
+++ b/llm_web_kit/api/requirements.txt
@@ -0,0 +1,21 @@
+
+# HTTP 客户端
+aiohttp>=3.9.0
+
+# HTML 解析
+beautifulsoup4>=4.12.0
+# FastAPI 相关依赖
+fastapi>=0.104.0
+lxml>=4.9.0
+pydantic>=2.0.0
+pydantic-settings>=2.0.0
+
+# 日志和配置
+python-dotenv>=1.0.0
+
+# 数据处理
+python-multipart>=0.0.6
+
+# 类型提示支持
+typing-extensions>=4.8.0
+uvicorn[standard]>=0.24.0
diff --git a/llm_web_kit/api/routers/__init__.py b/llm_web_kit/api/routers/__init__.py
new file mode 100644
index 00000000..4a3f1567
--- /dev/null
+++ b/llm_web_kit/api/routers/__init__.py
@@ -0,0 +1,8 @@
+"""路由模块.
+
+包含所有 API 路由定义，按功能模块组织。
+"""
+
+from . import htmls
+
+__all__ = ["htmls"]
diff --git a/llm_web_kit/api/routers/htmls.py b/llm_web_kit/api/routers/htmls.py
new file mode 100644
index 00000000..4a58f8dd
--- /dev/null
+++ b/llm_web_kit/api/routers/htmls.py
@@ -0,0 +1,88 @@
+"""HTML 处理路由.
+
+提供 HTML 解析、内容提取等功能的 API 端点。
+"""
+
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
+from fastapi.responses import JSONResponse
+
+from ..dependencies import get_logger, get_settings
+from ..models.request import HTMLParseRequest
+from ..models.response import HTMLParseResponse
+from ..services.html_service import HTMLService
+
+logger = get_logger(__name__)
+settings = get_settings()
+
+router = APIRouter()
+
+
+@router.post("/html/parse", response_model=HTMLParseResponse)
+async def parse_html(
+    request: HTMLParseRequest,
+    html_service: HTMLService = Depends(HTMLService)
+):
+    """解析 HTML 内容.
+
+    接收 HTML 字符串并返回解析后的结构化内容。
+    """
+    # Missing/empty content is a client error: answer 400 here instead of
+    # letting the service's ValueError be rewrapped as a 500 below.
+    if not request.html_content:
+        raise HTTPException(status_code=400, detail="必须提供 HTML 内容")
+
+    try:
+        logger.info(f"开始解析 HTML，内容长度: {len(request.html_content) if request.html_content else 0}")
+        result = await html_service.parse_html(
+            html_content=request.html_content,
+            options=request.options
+        )
+
+        return HTMLParseResponse(success=True, data=result, message="HTML 解析成功")
+    except Exception as e:
+        logger.error(f"HTML 解析失败: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"HTML 解析失败: {str(e)}")
+
+
+@router.post("/html/upload")
+async def upload_html_file(
+    file: UploadFile = File(...),
+    html_service: HTMLService = Depends(HTMLService)
+):
+    """上传 HTML 文件进行解析.
+
+    支持上传 HTML 文件，自动解析并返回结果。
+    """
+    # Validate before the try block: inside it, the broad except below would
+    # catch this 400 and re-raise it as a 500. A missing filename is rejected.
+    if not (file.filename or "").endswith(('.html', '.htm')):
+        raise HTTPException(status_code=400, detail="只支持 HTML 文件")
+
+    try:
+        content = await file.read()
+        html_content = content.decode('utf-8')
+        logger.info(f"上传 HTML 文件: {file.filename}, 大小: {len(content)} bytes")
+        result = await html_service.parse_html(html_content=html_content)
+
+        return JSONResponse(content={
+            "success": True,
+            "data": result,
+            "message": "HTML 文件解析成功",
+            "filename": file.filename
+        })
+    except Exception as e:
+        logger.error(f"HTML 文件解析失败: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"HTML 文件解析失败: {str(e)}")
+
+
+@router.get("/html/status")
+async def get_service_status():
+    """获取服务状态.
+
+    返回 HTML 处理服务的当前状态信息。
+    """
+    return {
+        "service": "HTML Processing Service",
+        "status": "running",
+        "version": settings.api_version
+    }
diff --git a/llm_web_kit/api/run_server.py b/llm_web_kit/api/run_server.py
new file mode 100644
index 00000000..739b14fa
--- /dev/null
+++ b/llm_web_kit/api/run_server.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+"""API 服务器启动脚本.
+
+用于启动 LLM Web Kit API 服务。
+"""
+
+import os
+import sys
+
+import uvicorn
+
+# 添加项目根目录到 Python 路径
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from llm_web_kit.api.dependencies import get_settings
+
+if __name__ == "__main__":
+    settings = get_settings()
+    print("启动 LLM Web Kit API 服务器...")
+    print(f"API 文档地址: http://{settings.host}:{settings.port}/docs")
+    print(f"ReDoc 文档地址: http://{settings.host}:{settings.port}/redoc")
+
+    uvicorn.run(
+        "llm_web_kit.api.main:app",
+        host=settings.host,
+        port=settings.port,
+        reload=True,
+        log_level=(settings.log_level or "INFO").lower()
+    )
diff --git a/llm_web_kit/api/services/__init__.py b/llm_web_kit/api/services/__init__.py
new file mode 100644
index 00000000..7e6717b1
--- /dev/null
+++ b/llm_web_kit/api/services/__init__.py
@@ -0,0 +1,8 @@
+"""服务层模块.
+
+包含业务逻辑服务，桥接原有项目功能。
+"""
+
+from .html_service import HTMLService
+
+__all__ = ["HTMLService"]
diff --git a/llm_web_kit/api/services/html_service.py b/llm_web_kit/api/services/html_service.py
new file mode 100644
index 00000000..ebedd60e
--- /dev/null
+++ b/llm_web_kit/api/services/html_service.py
@@ -0,0 +1,112 @@
+"""HTML 处理服务.
+
+桥接原有项目的 HTML 解析和内容提取功能，提供统一的 API 接口。
+"""
+
+from typing import Any, Dict, Optional
+
+from ..dependencies import get_logger, get_settings
+
+logger = get_logger(__name__)
+settings = get_settings()
+
+
+class HTMLService:
+    """HTML 处理服务类."""
+
+    def __init__(self):
+        """初始化 HTML 服务."""
+        # 目前使用简化管线
+        pass
+
+    def _init_components(self):
+        """兼容保留（当前未使用）"""
+        return None
+
+    async def parse_html(
+        self,
+        html_content: Optional[str] = None,
+        options: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """解析 HTML 内容."""
+        try:
+            if not html_content:
+                raise ValueError("必须提供 HTML 内容")
+
+            # 延迟导入，避免模块导入期异常导致服务类不可用
+            try:
+                from llm_web_kit.input.pre_data_json import (PreDataJson,
+                                                             PreDataJsonKey)
+                from llm_web_kit.main_html_parser.parser.tag_mapping import \
+                    MapItemToHtmlTagsParser
+                from llm_web_kit.main_html_parser.simplify_html.simplify_html import \
+                    simplify_html
+            except Exception as import_err:
+                logger.error(f"依赖导入失败: {import_err}")
+                raise
+
+            # 简化网页
+            try:
+                simplified_html, typical_raw_tag_html, _ = simplify_html(html_content)
+            except Exception as e:
+                logger.error(f"简化网页失败: {e}")
+                raise
+
+            # 模型推理
+            llm_response = await self._parse_with_model(simplified_html, options)
+
+            # 结果映射
+            pre_data = PreDataJson({})
+            pre_data[PreDataJsonKey.TYPICAL_RAW_HTML] = html_content
+            pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] = typical_raw_tag_html
+            pre_data[PreDataJsonKey.LLM_RESPONSE] = llm_response
+            parser = MapItemToHtmlTagsParser({})
+            pre_data = parser.parse_single(pre_data)
+
+            # 将 PreDataJson 转为标准 dict，避免响应模型校验错误
+            return dict(pre_data.items())
+
+        except Exception as e:
+            logger.error(f"HTML 解析失败: {e}")
+            raise
+
+    async def _parse_with_model(self, html_content: str, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        return {
+            "item_id 1": 0,
+            "item_id 2": 0,
+            "item_id 3": 0,
+            "item_id 4": 0,
+            "item_id 5": 0,
+            "item_id 6": 0,
+            "item_id 7": 0,
+            "item_id 8": 0,
+            "item_id 9": 1,
+            "item_id 10": 0,
+            "item_id 11": 0,
+            "item_id 12": 0,
+            "item_id 13": 0,
+            "item_id 14": 0,
+            "item_id 15": 0,
+            "item_id 16": 0,
+            "item_id 17": 0,
+            "item_id 18": 0,
+            "item_id 19": 0,
+            "item_id 20": 0,
+            "item_id 21": 0,
+            "item_id 22": 0,
+            "item_id 23": 0,
+            "item_id 24": 0,
+            "item_id 25": 0,
+            "item_id 26": 0,
+            "item_id 27": 0,
+            "item_id 28": 0,
+            "item_id 29": 0,
+            "item_id 30": 0,
+            "item_id 31": 0,
+            "item_id 32": 0,
+            "item_id 33": 0,
+            "item_id 34": 0,
+            "item_id 35": 0,
+            "item_id 36": 0,
+            "item_id 37": 0
+        }

From ea8133bb7fc37bcb50f557e5f6c31e91b9b9b298 Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Tue, 26 Aug 2025 17:58:42 +0800
Subject: [PATCH 02/12] feat: add html parse api

---
 llm_web_kit/api/requirements.txt | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/llm_web_kit/api/requirements.txt b/llm_web_kit/api/requirements.txt
index 1cbb5459..68e22885 100644
--- a/llm_web_kit/api/requirements.txt
+++ b/llm_web_kit/api/requirements.txt
@@ -2,20 +2,12 @@
 # HTTP 客户端
 aiohttp>=3.9.0
 
-# HTML 解析
-beautifulsoup4>=4.12.0
 # FastAPI 相关依赖
 fastapi>=0.104.0
-lxml>=4.9.0
-pydantic>=2.0.0
 pydantic-settings>=2.0.0
 
-# 日志和配置
-python-dotenv>=1.0.0
-
 # 数据处理
 python-multipart>=0.0.6
 
 # 类型提示支持
-typing-extensions>=4.8.0
 uvicorn[standard]>=0.24.0

From 9717f0fc400d8ad13d4b9cbc2c7fb2c0ad3d42c5 Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Tue, 26 Aug 2025 18:00:04 +0800
Subject: [PATCH 03/12] feat: add html parse api

---
 llm_web_kit/api/requirements.txt | 5 ++++-
 requirements/runtime.txt         | 7 +++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/llm_web_kit/api/requirements.txt b/llm_web_kit/api/requirements.txt
index 68e22885..3ede5656 100644
--- a/llm_web_kit/api/requirements.txt
+++ b/llm_web_kit/api/requirements.txt
@@ -1,11 +1,14 @@
-
 # HTTP 客户端
 aiohttp>=3.9.0
 
 # FastAPI 相关依赖
 fastapi>=0.104.0
+pydantic>=2.0.0
 pydantic-settings>=2.0.0
 
+# 日志和配置
+python-dotenv>=1.0.0
+
 # 数据处理
 python-multipart>=0.0.6
 
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index f6ddacfe..ba3671e1 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -1,8 +1,10 @@
+aiohttp==3.12.15
 beautifulsoup4>=4.12.2
 boto3==1.28.43
 cairosvg==2.7.1
 click==8.1.8
 commentjson==0.9.0
+fastapi==0.116.1
 fasttext-wheel==0.9.2
 filelock==3.16.1
 html-alg-lib==2.0.2
@@ -22,9 +24,14 @@ orjson==3.11.0
 overrides==7.7.0
 py-asciimath==0.3.0
 pyahocorasick==2.0.0
+pydantic==2.11.7
+pydantic-settings==2.10.1
+python-dotenv==1.1.1
+python-multipart==0.0.20
 scikit-learn>=1.6.1
 selectolax==0.3.33
 torch>=2.3.0
 tqdm==4.67.1
 transformers==4.40.2
+uvicorn[standard]==0.35.0
 xformers>=0.0.27

From 0f90ca0d46ff8a2f78fc0db0a442ceb5bf55e46f Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Wed, 27 Aug 2025 16:40:32 +0800
Subject: [PATCH 04/12] feat: api module

---
 llm_web_kit/api/services/html_service.py      |  50 +--
 llm_web_kit/api/services/inference_service.py | 315 ++++++++++++++++++
 2 files changed, 325 insertions(+), 40 deletions(-)
 create mode 100644 llm_web_kit/api/services/inference_service.py

diff --git a/llm_web_kit/api/services/html_service.py b/llm_web_kit/api/services/html_service.py
index ebedd60e..37ab5a8b 100644
--- a/llm_web_kit/api/services/html_service.py
+++ b/llm_web_kit/api/services/html_service.py
@@ -17,7 +17,12 @@ class HTMLService:
     def __init__(self):
         """初始化 HTML 服务."""
         # 目前使用简化管线
-        pass
+        try:
+            from .inference_service import InferenceService
+            self._inference_service = InferenceService()
+        except Exception as e:
+            logger.warning(f"InferenceService 初始化失败（将在首次调用时再尝试）：{e}")
+            self._inference_service = None
 
     def _init_components(self):
         """兼容保留（当前未使用）"""
@@ -71,42 +76,7 @@ async def parse_html(
             raise
 
     async def _parse_with_model(self, html_content: str, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
-        return {
-            "item_id 1": 0,
-            "item_id 2": 0,
-            "item_id 3": 0,
-            "item_id 4": 0,
-            "item_id 5": 0,
-            "item_id 6": 0,
-            "item_id 7": 0,
-            "item_id 8": 0,
-            "item_id 9": 1,
-            "item_id 10": 0,
-            "item_id 11": 0,
-            "item_id 12": 0,
-            "item_id 13": 0,
-            "item_id 14": 0,
-            "item_id 15": 0,
-            "item_id 16": 0,
-            "item_id 17": 0,
-            "item_id 18": 0,
-            "item_id 19": 0,
-            "item_id 20": 0,
-            "item_id 21": 0,
-            "item_id 22": 0,
-            "item_id 23": 0,
-            "item_id 24": 0,
-            "item_id 25": 0,
-            "item_id 26": 0,
-            "item_id 27": 0,
-            "item_id 28": 0,
-            "item_id 29": 0,
-            "item_id 30": 0,
-            "item_id 31": 0,
-            "item_id 32": 0,
-            "item_id 33": 0,
-            "item_id 34": 0,
-            "item_id 35": 0,
-            "item_id 36": 0,
-            "item_id 37": 0
-        }
+        if self._inference_service is None:
+            from .inference_service import InferenceService
+            self._inference_service = InferenceService()
+        return await self._inference_service.inference(html_content, options or {})
diff --git a/llm_web_kit/api/services/inference_service.py b/llm_web_kit/api/services/inference_service.py
new file mode 100644
index 00000000..ed7ef03e
--- /dev/null
+++ b/llm_web_kit/api/services/inference_service.py
@@ -0,0 +1,315 @@
+from dataclasses import dataclass
+
+import torch
+from transformers import AutoTokenizer
+
+# vLLM 作为可选依赖：导入失败时保持模块可用，实际使用时再报错
+try:
+    from vllm import LLM, SamplingParams
+    _VLLM_IMPORT_ERROR = None
+except Exception as _e:  # noqa: N816
+    LLM = None  # type: ignore
+    SamplingParams = None  # type: ignore
+    _VLLM_IMPORT_ERROR = _e
+import json
+import re
+from enum import Enum
+from typing import List
+
+
+@dataclass
+class InferenceConfig:
+    model_path: str = "/share/liukaiwen/models/qwen3-0.6b/checkpoint-3296"
+    data_path: str = "/fs-computility/llmit_d/shared/liumengjie/NeuScraper_cc/benchmark_process/benchmark_with_alg_800.jsonl"
+    output_path: str = "/share/liukaiwen/test_results"
+    use_logits_processor: bool = True
+    num_workers: int = 8
+    max_tokens: int = 32768
+    temperature: float = 0
+    top_p: float = 0.95
+    max_output_tokens: int = 8192
+    tensor_parallel_size: int = 1
+    dtype: str = "bfloat16"
+    template: bool = True
+
+
+config = InferenceConfig(
+    model_path="/share/liukaiwen/models/qwen3-0.6b/checkpoint-3296",  # checkpoint-3296路径
+    output_path="/share/liukaiwen/test_results",
+    use_logits_processor=True,  # 启用逻辑处理器确保JSON格式输出
+    num_workers=8,              # 并行工作进程数
+    max_tokens=35000,           # 最大输入token数
+    temperature=0,              # 确定性输出
+    top_p=0.95,
+    max_output_tokens=8192,     # 最大输出token数
+    tensor_parallel_size=1,     # 张量并行大小
+    template=True               # 启用聊天模板
+)
+
+
+PROMPT = """As a front-end engineering expert in HTML, your task is to analyze the given HTML structure and accurately classify elements with the _item_id attribute as either "main" (primary content) or "other" (supplementary content). Your goal is to precisely extract the primary content of the page, ensuring that only the most relevant information is labeled as "main" while excluding navigation, metadata, and other non-essential elements.
+Guidelines for Classification:
+Primary Content ("main")
+Elements that constitute the core content of the page should be classified as "main". These typically include:
+✅ For Articles, News, and Blogs:
+The main text body of the article, blog post, or news content.
+Images embedded within the main content that contribute to the article.
+✅ For Forums & Discussion Threads:
+The original post in the thread.
+Replies and discussions that are part of the main conversation.
+✅ For Q&A Websites:
+The question itself posted by a user.
+Answers to the question and replies to answers that contribute to the discussion.
+✅ For Other Content-Based Pages:
+Any rich text, paragraphs, or media that serve as the primary focus of the page.
+Supplementary Content ("other")
+Elements that do not contribute to the primary content but serve as navigation, metadata, or supporting information should be classified as "other". These include:
+❌ Navigation & UI Elements:
+Menus, sidebars, footers, breadcrumbs, and pagination links.
+"Skip to content" links and accessibility-related text.
+❌ Metadata & User Information:
+Article titles, author names, timestamps, and view counts.
+Like counts, vote counts, and other engagement metrics.
+❌ Advertisements & Promotional Content:
+Any section labeled as "Advertisement" or "Sponsored".
+Social media sharing buttons, follow prompts, and external links.
+❌ Related & Suggested Content:
+"Read More", "Next Article", "Trending Topics", and similar sections.
+Lists of related articles, tags, and additional recommendations.
+Task Instructions:
+You will be provided with a simplified HTML structure containing elements with an _item_id attribute. Your job is to analyze each element's function and determine whether it should be classified as "main" or "other".
+Response Format:
+Return a JSON object where each key is the _item_id value, and the corresponding value is either "main" or "other", as in the following example:
+{{"1": "other","2": "main","3": "other"}}
+🚨 Important Notes:
+Do not include any explanations in the output—only return the JSON.
+Ensure high accuracy by carefully distinguishing between primary content and supplementary content.
+Err on the side of caution—if an element seems uncertain, classify it as "other" unless it clearly belongs to the main content.
+
+Input HTML:
+{alg_html}
+
+Output format should be a JSON-formatted string representing a dictionary where keys are item_id strings and values are either 'main' or 'other'. Make sure to include ALL item_ids from the input HTML.
+"""
+
+
+def create_prompt(alg_html: str) -> str:
+    return PROMPT.format(alg_html=alg_html)
+
+
+def add_template(prompt: str, tokenizer: AutoTokenizer) -> str:
+    messages = [
+        {"role": "user", "content": prompt}
+    ]
+    chat_prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=True  # Switches between thinking and non-thinking modes. Default is True.
+    )
+    return chat_prompt
+
+
+class State(Enum):
+    Left_bracket = 0
+    Right_bracket = 1
+    Space_quote = 2
+    Quote_colon_quote = 3
+    Quote_comma = 4
+    Main_other = 5
+    Number = 6
+
+
+class Token_state:
+    def __init__(self, model_path):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        token_id_map = {
+            State.Left_bracket: ["{"],
+            State.Right_bracket: ["}"],
+            State.Space_quote: [' "'],
+            State.Quote_colon_quote: ['":"'],
+            State.Quote_comma: ['",'],
+            State.Main_other: ["main", "other"],
+            State.Number: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"],
+        }
+        self.token_id_map = {k: [self.tokenizer.encode(v)[0] for v in token_id_map[k]] for k in token_id_map}
+
+    def mask_other_logits(self, logits: torch.Tensor, remained_ids: List[int]):
+        remained_logits = {ids: logits[ids].item() for ids in remained_ids}
+        new_logits = torch.ones_like(logits) * -float('inf')
+        for id in remained_ids:
+            new_logits[id] = remained_logits[id]
+        return new_logits
+
+    def calc_max_count(self, prompt_token_ids: List[int]):
+        pattern_list = [716, 1203, 842, 428]
+        for idx in range(len(prompt_token_ids) - len(pattern_list), -1, -1):
+            if all(prompt_token_ids[idx + i] == pattern_list[i] for i in range(len(pattern_list))):
+                num_idx = idx + len(pattern_list)
+                num_ids = []
+                while num_idx < len(prompt_token_ids) and prompt_token_ids[num_idx] in self.token_id_map[State.Number]:
+                    num_ids.append(prompt_token_ids[num_idx])
+                    num_idx += 1
+                # return int(self.tokenizer.decode(num_ids)) + 1
+                return int(self.tokenizer.decode(num_ids))
+        return 1
+
+    def find_last_complete_number(self, input_ids: List[int]):
+        if not input_ids:
+            return -1, "null", -1
+
+        tail_number_ids = []
+        last_idx = len(input_ids) - 1
+        while last_idx >= 0 and input_ids[last_idx] in self.token_id_map[State.Number]:
+            tail_number_ids.insert(0, input_ids[last_idx])
+            last_idx -= 1
+
+        tail_number = int(self.tokenizer.decode(tail_number_ids)) if tail_number_ids else -1
+
+        while last_idx >= 0 and input_ids[last_idx] not in self.token_id_map[State.Number]:
+            last_idx -= 1
+
+        if last_idx < 0:
+            return tail_number, "tail", tail_number
+
+        last_number_ids = []
+        while last_idx >= 0 and input_ids[last_idx] in self.token_id_map[State.Number]:
+            last_number_ids.insert(0, input_ids[last_idx])
+            last_idx -= 1
+
+        last_number = int(self.tokenizer.decode(last_number_ids))
+
+        if tail_number == last_number + 1:
+            return tail_number, "tail", tail_number
+        return last_number, "non_tail", tail_number
+
+    def process_logit(self, prompt_token_ids: List[int], input_ids: List[int], logits: torch.Tensor):
+        if not input_ids:
+            return self.mask_other_logits(logits, self.token_id_map[State.Left_bracket])
+
+        last_token = input_ids[-1]
+
+        if last_token == self.token_id_map[State.Right_bracket][0]:
+            return self.mask_other_logits(logits, [151645])
+        elif last_token == self.token_id_map[State.Left_bracket][0]:
+            return self.mask_other_logits(logits, self.token_id_map[State.Space_quote])
+        elif last_token == self.token_id_map[State.Space_quote][0]:
+            last_number, _, _ = self.find_last_complete_number(input_ids)
+            # next_char = str(last_number + 1)[0]
+            if last_number == -1:
+                next_char = '1'
+            else:
+                next_char = str(last_number + 1)[0]
+
+            return self.mask_other_logits(logits, self.tokenizer.encode(next_char))
+        elif last_token in self.token_id_map[State.Number]:
+            last_number, state, tail_number = self.find_last_complete_number(input_ids)
+            if state == "tail":
+                return self.mask_other_logits(logits, self.token_id_map[State.Quote_colon_quote])
+            else:
+                next_str = str(last_number + 1)
+                next_char = next_str[len(str(tail_number))]
+                return self.mask_other_logits(logits, self.tokenizer.encode(next_char))
+        elif last_token == self.token_id_map[State.Quote_colon_quote][0]:
+            return self.mask_other_logits(logits, self.token_id_map[State.Main_other])
+        elif last_token in self.token_id_map[State.Main_other]:
+            return self.mask_other_logits(logits, self.token_id_map[State.Quote_comma])
+        elif last_token == self.token_id_map[State.Quote_comma][0]:
+            last_number, _, _ = self.find_last_complete_number(input_ids)
+            max_count = self.calc_max_count(prompt_token_ids)
+            if last_number >= max_count:
+                return self.mask_other_logits(logits, self.token_id_map[State.Right_bracket])
+            else:
+                return self.mask_other_logits(logits, self.token_id_map[State.Space_quote])
+
+        return logits
+
+
+def reformat_map(text):
+    try:
+        data = json.loads(text)
+        return {"item_id " + k: 1 if v == "main" else 0 for k, v in data.items()}
+    except json.JSONDecodeError:
+        return {}
+
+
+def main(simplified_html: str, model: object, tokenizer: object):
+    # tokenizer = AutoTokenizer.from_pretrained("/share/liukaiwen/models/qwen3-0.6b/checkpoint-3296", trust_remote_code=True)
+    # simplified_html = simplify_html(ori_html)
+    # print("sim_html length", len(simplified_html))
+    if SamplingParams is None:
+        raise RuntimeError(
+            "当前环境未安装 vLLM 或安装失败，无法执行模型推理。建议在 Linux+NVIDIA GPU 环境安装 vLLM，"
+            "或在 API 中使用占位/替代推理实现。原始导入错误: {}".format(_VLLM_IMPORT_ERROR)
+        )
+    prompt = create_prompt(simplified_html)
+    chat_prompt = add_template(prompt, tokenizer)
+
+    if config.use_logits_processor:
+        token_state = Token_state(config.model_path)
+        sampling_params = SamplingParams(
+            temperature=config.temperature,
+            top_p=config.top_p,
+            max_tokens=config.max_output_tokens,
+            logits_processors=[token_state.process_logit]
+        )
+    else:
+        sampling_params = SamplingParams(
+            temperature=config.temperature,
+            top_p=config.top_p,
+            max_tokens=config.max_output_tokens
+        )
+
+    output = model.generate(chat_prompt, sampling_params)
+    output_json = clean_output(output)
+    return output_json
+
+
+def clean_output(output):
+    prediction = output[0].outputs[0].text
+
+    # Extract JSON from prediction
+    start_idx = prediction.rfind("{")
+    end_idx = prediction.rfind("}") + 1
+
+    if start_idx != -1 and end_idx != -1:
+        json_str = prediction[start_idx:end_idx]
+        json_str = re.sub(r',\s*}', '}', json_str)  # Clean JSON
+        try:
+            json.loads(json_str)  # Validate
+        except Exception:
+            json_str = "{}"
+    else:
+        json_str = "{}"
+
+    return json_str
+
+
+class InferenceService:
+    """对外暴露的推理服务封装，供 HTMLService 调用。"""
+    async def inference(self, simplified_html: str, options: dict | None = None) -> dict:
+        """占位实现：请在此处接入真实模型；保持返回 {"item_id N": 0/1} 的结构。"""
+        return {
+            "item_id 1": 0,
+            "item_id 2": 0,
+            "item_id 3": 0,
+            "item_id 4": 0,
+            "item_id 5": 0,
+            "item_id 6": 0,
+            "item_id 7": 0,
+            "item_id 8": 0,
+            "item_id 9": 1
+        }
+
+
+if __name__ == "__main__":
+    tokenizer = AutoTokenizer.from_pretrained("D:/test_data/网页抽取/models/checkpoint-3296/checkpoint-3296/", trust_remote_code=True)
+    model = LLM(model=config.model_path,
+            trust_remote_code=True,
+            dtype=config.dtype,
+            tensor_parallel_size=config.tensor_parallel_size)
+
+    simplified_html = "<html><body><h1>Hello World</h1></body></html>"
+    response_json = main(simplified_html, model, tokenizer)
+    llm_response_dict = reformat_map(response_json)
+    print(llm_response_dict)

From f26d10f9c0e3c7325ed60d34d407c575dafece66 Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Thu, 28 Aug 2025 15:04:43 +0800
Subject: [PATCH 05/12] feat: add html parse api with some model params changed
 for test

---
 llm_web_kit/api/dependencies.py               |  12 ++
 llm_web_kit/api/main.py                       |  13 +-
 llm_web_kit/api/services/html_service.py      |  12 +-
 llm_web_kit/api/services/inference_service.py | 182 +++++++++++++++---
 4 files changed, 183 insertions(+), 36 deletions(-)

diff --git a/llm_web_kit/api/dependencies.py b/llm_web_kit/api/dependencies.py
index fe6f5136..32eadeb6 100644
--- a/llm_web_kit/api/dependencies.py
+++ b/llm_web_kit/api/dependencies.py
@@ -64,3 +64,15 @@ def get_logger(name: str = __name__) -> logging.Logger:
 
 # 全局依赖项
 settings = get_settings()
+
+# InferenceService 单例
+_inference_service_singleton = None
+
+
+def get_inference_service():
+    """获取 InferenceService 单例."""
+    global _inference_service_singleton
+    if _inference_service_singleton is None:
+        from .services.inference_service import InferenceService
+        _inference_service_singleton = InferenceService()
+    return _inference_service_singleton
diff --git a/llm_web_kit/api/main.py b/llm_web_kit/api/main.py
index 77d2716c..18f71663 100644
--- a/llm_web_kit/api/main.py
+++ b/llm_web_kit/api/main.py
@@ -8,7 +8,7 @@
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 
-from .dependencies import get_logger, get_settings
+from .dependencies import get_inference_service, get_logger, get_settings
 from .routers import htmls
 
 settings = get_settings()
@@ -53,6 +53,17 @@ async def health_check():
     return {"status": "healthy", "service": "llm-web-kit-api"}
 
 
+@app.on_event("startup")
+async def app_startup():
+    """应用启动时预热模型，避免首个请求冷启动延迟."""
+    try:
+        service = get_inference_service()
+        await service.warmup()
+        logger.info("InferenceService 模型预热完成")
+    except Exception as e:
+        logger.warning(f"InferenceService 预热失败（服务仍可运行，将在首次请求时再初始化）: {e}")
+
+
 @app.exception_handler(Exception)
 async def global_exception_handler(request, exc):
     """全局异常处理器."""
diff --git a/llm_web_kit/api/services/html_service.py b/llm_web_kit/api/services/html_service.py
index 37ab5a8b..5c3a3760 100644
--- a/llm_web_kit/api/services/html_service.py
+++ b/llm_web_kit/api/services/html_service.py
@@ -5,7 +5,7 @@
 
 from typing import Any, Dict, Optional
 
-from ..dependencies import get_logger, get_settings
+from ..dependencies import get_inference_service, get_logger, get_settings
 
 logger = get_logger(__name__)
 settings = get_settings()
@@ -16,12 +16,11 @@ class HTMLService:
 
     def __init__(self):
         """初始化 HTML 服务."""
-        # 目前使用简化管线
+        # 目前使用简化管线；使用全局单例的 InferenceService，避免重复初始化模型
         try:
-            from .inference_service import InferenceService
-            self._inference_service = InferenceService()
+            self._inference_service = get_inference_service()
         except Exception as e:
-            logger.warning(f"InferenceService 初始化失败（将在首次调用时再尝试）：{e}")
+            logger.warning(f"InferenceService 获取失败（将在首次调用时再尝试）：{e}")
             self._inference_service = None
 
     def _init_components(self):
@@ -77,6 +76,5 @@ async def parse_html(
 
     async def _parse_with_model(self, html_content: str, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
         if self._inference_service is None:
-            from .inference_service import InferenceService
-            self._inference_service = InferenceService()
+            self._inference_service = get_inference_service()
         return await self._inference_service.inference(html_content, options or {})
diff --git a/llm_web_kit/api/services/inference_service.py b/llm_web_kit/api/services/inference_service.py
index ed7ef03e..3a2d7bf2 100644
--- a/llm_web_kit/api/services/inference_service.py
+++ b/llm_web_kit/api/services/inference_service.py
@@ -1,25 +1,18 @@
-from dataclasses import dataclass
-
-import torch
-from transformers import AutoTokenizer
-
 # vLLM 作为可选依赖：导入失败时保持模块可用，实际使用时再报错
-try:
-    from vllm import LLM, SamplingParams
-    _VLLM_IMPORT_ERROR = None
-except Exception as _e:  # noqa: N816
-    LLM = None  # type: ignore
-    SamplingParams = None  # type: ignore
-    _VLLM_IMPORT_ERROR = _e
 import json
 import re
+from dataclasses import dataclass
 from enum import Enum
 from typing import List
 
+import torch
+from transformers import AutoTokenizer
+from vllm import LLM, SamplingParams
+
 
 @dataclass
 class InferenceConfig:
-    model_path: str = "/share/liukaiwen/models/qwen3-0.6b/checkpoint-3296"
+    model_path: str = "/mnt/installers/checkpoint-3296"
     data_path: str = "/fs-computility/llmit_d/shared/liumengjie/NeuScraper_cc/benchmark_process/benchmark_with_alg_800.jsonl"
     output_path: str = "/share/liukaiwen/test_results"
     use_logits_processor: bool = True
@@ -29,16 +22,17 @@ class InferenceConfig:
     top_p: float = 0.95
     max_output_tokens: int = 8192
     tensor_parallel_size: int = 1
-    dtype: str = "bfloat16"
+    # 正式环境修改为bfloat16
+    dtype: str = "float16"
     template: bool = True
 
 
 config = InferenceConfig(
-    model_path="/share/liukaiwen/models/qwen3-0.6b/checkpoint-3296",  # checkpoint-3296路径
+    model_path="/mnt/installers/checkpoint-3296",  # checkpoint-3296路径
     output_path="/share/liukaiwen/test_results",
     use_logits_processor=True,  # 启用逻辑处理器确保JSON格式输出
     num_workers=8,              # 并行工作进程数
-    max_tokens=35000,           # 最大输入token数
+    max_tokens=26000,           # 最大输入token数
     temperature=0,              # 确定性输出
     top_p=0.95,
     max_output_tokens=8192,     # 最大输出token数
@@ -240,7 +234,7 @@ def main(simplified_html: str, model: object, tokenizer: object):
     if SamplingParams is None:
         raise RuntimeError(
             "当前环境未安装 vLLM 或安装失败，无法执行模型推理。建议在 Linux+NVIDIA GPU 环境安装 vLLM，"
-            "或在 API 中使用占位/替代推理实现。原始导入错误: {}".format(_VLLM_IMPORT_ERROR)
+            "或在 API 中使用占位/替代推理实现。原始导入错误: {}".format("_VLLM_IMPORT_ERROR")
         )
     prompt = create_prompt(simplified_html)
     chat_prompt = add_template(prompt, tokenizer)
@@ -287,8 +281,117 @@ def clean_output(output):
 
 class InferenceService:
     """对外暴露的推理服务封装，供 HTMLService 调用。"""
+
+    def __init__(self):
+        """初始化推理服务，延迟加载模型."""
+        self._llm = None
+        self._tokenizer = None
+        self._initialized = False
+        self._init_lock = None  # 用于异步初始化锁
+
+    async def warmup(self):
+        """在服务启动阶段主动预热模型（异步初始化）。"""
+        await self._ensure_initialized()
+
+    async def _ensure_initialized(self):
+        """确保模型已初始化（异步安全）"""
+        if not self._initialized:
+            if self._init_lock is None:
+                import asyncio
+                self._init_lock = asyncio.Lock()
+
+            async with self._init_lock:
+                if not self._initialized:  # 双重检查
+                    await self._init_model()
+                    self._initialized = True
+
+    async def _init_model(self):
+        """初始化模型和tokenizer."""
+        try:
+            if SamplingParams is None:
+                raise RuntimeError(
+                    "当前环境未安装 vLLM 或安装失败，无法执行模型推理。建议在 Linux+NVIDIA GPU 环境安装 vLLM，"
+                    "或在 API 中使用占位/替代推理实现。原始导入错误: {}".format("_VLLM_IMPORT_ERROR")
+                )
+
+            # 初始化 tokenizer
+            self._tokenizer = AutoTokenizer.from_pretrained(
+                config.model_path,
+                trust_remote_code=True
+            )
+
+            # 初始化 LLM 模型
+            self._llm = LLM(
+                model=config.model_path,
+                trust_remote_code=True,
+                dtype=config.dtype,
+                tensor_parallel_size=config.tensor_parallel_size,
+                # 正式环境删掉
+                max_model_len=config.max_tokens,  # 减少序列长度避免内存不足
+            )
+
+            print(f"模型初始化成功: {config.model_path}")
+
+        except Exception as e:
+            print(f"模型初始化失败: {e}")
+            # 如果模型初始化失败，保持为 None，后续调用会返回占位结果
+            self._llm = None
+            self._tokenizer = None
+
     async def inference(self, simplified_html: str, options: dict | None = None) -> dict:
-        """占位实现：请在此处接入真实模型；保持返回 {"item_id N": 0/1} 的结构。"""
+        """执行推理，如果模型未初始化则返回占位结果."""
+        try:
+            await self._ensure_initialized()
+
+            if self._llm is None or self._tokenizer is None:
+                print("模型未初始化，返回占位结果")
+                return self._get_placeholder_result()
+
+            # 执行真实推理
+            return await self._run_real_inference(simplified_html, options)
+
+        except Exception as e:
+            print(f"推理过程出错: {e}")
+            return self._get_placeholder_result()
+
+    async def _run_real_inference(self, simplified_html: str, options: dict | None = None) -> dict:
+        """执行真实的模型推理."""
+        try:
+            # 创建 prompt
+            prompt = create_prompt(simplified_html)
+            chat_prompt = add_template(prompt, self._tokenizer)
+
+            # 设置采样参数
+            if config.use_logits_processor:
+                token_state = Token_state(config.model_path)
+                sampling_params = SamplingParams(
+                    temperature=config.temperature,
+                    top_p=config.top_p,
+                    max_tokens=config.max_output_tokens,
+                    logits_processors=[token_state.process_logit]
+                )
+            else:
+                sampling_params = SamplingParams(
+                    temperature=config.temperature,
+                    top_p=config.top_p,
+                    max_tokens=config.max_output_tokens
+                )
+
+            # 执行推理
+            output = self._llm.generate(chat_prompt, sampling_params)
+            output_json = clean_output(output)
+
+            # 格式化结果
+            result = reformat_map(output_json)
+            print(f"推理完成，结果: {result}")
+            return result
+
+        except Exception as e:
+            print(f"真实推理失败: {e}")
+            return self._get_placeholder_result()
+
+    def _get_placeholder_result(self) -> dict:
+        """返回占位结果."""
         return {
             "item_id 1": 0,
             "item_id 2": 0,
@@ -303,13 +406,36 @@ async def inference(self, simplified_html: str, options: dict | None = None) ->
 
 
 if __name__ == "__main__":
-    tokenizer = AutoTokenizer.from_pretrained("D:/test_data/网页抽取/models/checkpoint-3296/checkpoint-3296/", trust_remote_code=True)
-    model = LLM(model=config.model_path,
-            trust_remote_code=True,
-            dtype=config.dtype,
-            tensor_parallel_size=config.tensor_parallel_size)
-
-    simplified_html = "<html><body><h1>Hello World</h1></body></html>"
-    response_json = main(simplified_html, model, tokenizer)
-    llm_response_dict = reformat_map(response_json)
-    print(llm_response_dict)
+    config = InferenceConfig(
+        model_path="/mnt/installers/checkpoint-3296",
+        output_path="/share/liukaiwen/test_results",
+        use_logits_processor=True,
+        num_workers=8,
+        max_tokens=26000,
+        temperature=0,
+        top_p=0.95,
+        max_output_tokens=8192,
+        tensor_parallel_size=1,
+        template=True,
+    )
+    try:
+        tokenizer = AutoTokenizer.from_pretrained("/mnt/installers/checkpoint-3296", trust_remote_code=True)
+        model = LLM(model=config.model_path,
+                trust_remote_code=True,
+                dtype=config.dtype,
+                # 设置最大模型长度
+                max_model_len=config.max_tokens,
+                tensor_parallel_size=config.tensor_parallel_size)
+
+        simplified_html = "<html><body><h1>Hello World</h1></body></html>"
+        response_json = main(simplified_html, model, tokenizer)
+        llm_response_dict = reformat_map(response_json)
+        print(llm_response_dict)
+    except Exception:
+        raise
+    finally:
+        import torch.distributed as dist
+
+        # 在程序结束前添加
+        if dist.is_initialized():
+            dist.destroy_process_group()

From c6e78e33c05435c8f3ad4ba40386340e146a8a93 Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Wed, 3 Sep 2025 11:25:09 +0800
Subject: [PATCH 06/12] <feat>: add single parse api

---
 llm_web_kit/api/README.md        |  6 +++---
 llm_web_kit/api/routers/htmls.py | 13 ++++++-------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/llm_web_kit/api/README.md b/llm_web_kit/api/README.md
index 69cd4de5..e3491da1 100644
--- a/llm_web_kit/api/README.md
+++ b/llm_web_kit/api/README.md
@@ -61,13 +61,13 @@ GET /api/v1/html/status
   "data": {
     "layout_file_list": [],
     "typical_raw_html": "<html><body><h1>Hello World</h1></body></html>",
-    "typical_raw_tag_html": "<html><body><h1 _item_id=\"1\">Hello World</h1></body></html>\n",
+    "typical_raw_tag_html": "<html><body><h1 _item_id=\"1\">Hello World</h1><h2 _item_id=\"2\">not main content</h2></body></html>\n",
     "llm_response": {
       "item_id 1": 0,
       "item_id 2": 1
     },
-    "typical_main_html": "<html></html>",
-    "html_target_list": []
+    "typical_main_html": "<html><body><h1 _item_id=\"1\">Hello World</h1></body></html>",
+    "html_target_list": ["Hello World"]
   },
   "metadata": null
 }
diff --git a/llm_web_kit/api/routers/htmls.py b/llm_web_kit/api/routers/htmls.py
index 4a58f8dd..5ec17f62 100644
--- a/llm_web_kit/api/routers/htmls.py
+++ b/llm_web_kit/api/routers/htmls.py
@@ -4,7 +4,6 @@
 """
 
 from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
-from fastapi.responses import JSONResponse
 
 from ..dependencies import get_logger, get_settings
 from ..models.request import HTMLParseRequest
@@ -64,12 +63,12 @@ async def upload_html_file(
 
         result = await html_service.parse_html(html_content=html_content)
 
-        return JSONResponse(content={
-            "success": True,
-            "data": result,
-            "message": "HTML 文件解析成功",
-            "filename": file.filename
-        })
+        return HTMLParseResponse(
+            success=True,
+            data=result,
+            message="HTML 文件解析成功",
+            filename=file.filename
+        )
     except Exception as e:
         logger.error(f"HTML 文件解析失败: {str(e)}")
         raise HTTPException(status_code=500, detail=f"HTML 文件解析失败: {str(e)}")

From c7b0f5a46b237e30bc8fc66d40b35b96da00192c Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Thu, 11 Sep 2025 16:36:05 +0800
Subject: [PATCH 07/12] <fix>: fix match failure when one HTML has too many
 identical ids, fix incomplete HTML tags that cause structure chaos, and fix
 the natural language detection method for Chinese

---
 llm_web_kit/api/services/html_service.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llm_web_kit/api/services/html_service.py b/llm_web_kit/api/services/html_service.py
index 5c3a3760..72373049 100644
--- a/llm_web_kit/api/services/html_service.py
+++ b/llm_web_kit/api/services/html_service.py
@@ -30,6 +30,7 @@ def _init_components(self):
     async def parse_html(
         self,
         html_content: Optional[str] = None,
+        url: Optional[str] = None,
         options: Optional[Dict[str, Any]] = None
     ) -> Dict[str, Any]:
         """解析 HTML 内容."""

From 4362931843024d5d186cc12ede4c6e26c423a490 Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Thu, 11 Sep 2025 16:37:30 +0800
Subject: [PATCH 08/12] feat: add html parse api

---
 llm_web_kit/api/routers/htmls.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llm_web_kit/api/routers/htmls.py b/llm_web_kit/api/routers/htmls.py
index 5ec17f62..fd4c1c35 100644
--- a/llm_web_kit/api/routers/htmls.py
+++ b/llm_web_kit/api/routers/htmls.py
@@ -30,6 +30,7 @@ async def parse_html(
 
         result = await html_service.parse_html(
             html_content=request.html_content,
+            url=request.url,
             options=request.options
         )
 

From acb0bc9ae7842f1a03adf5b7c0c700522639d52c Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Thu, 11 Sep 2025 16:40:58 +0800
Subject: [PATCH 09/12] feat: add html parse api

---
 llm_web_kit/api/README.md         | 28 +++++++++++++++++++++++++++-
 llm_web_kit/api/models/request.py |  6 ++++++
 llm_web_kit/api/requirements.txt  |  5 +++++
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/llm_web_kit/api/README.md b/llm_web_kit/api/README.md
index e3491da1..700e2de3 100644
--- a/llm_web_kit/api/README.md
+++ b/llm_web_kit/api/README.md
@@ -13,6 +13,16 @@
 
 ## 快速开始
 
+配置环境变量
+
+```bash
+export MODEL_PATH=""
+```
+
+或者配置文件.llm-web-kit.jsonc添加“model_path”
+
+安装依赖
+
 ```bash
 pip install -r requirements.txt
 python llm_web_kit/api/run_server.py
@@ -29,6 +39,20 @@ POST /api/v1/html/parse
 
 请求示例：
 
+```bash
+curl -s -X POST "http://127.0.0.1:8000/api/v1/html/parse" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "html_content": "<html><body><h1>Hello World</h1></body></html>",
+    "url": "https://helloworld.com/hello",
+    "options": {
+      "clean_html": true
+    }
+  }'
+```
+
+或直接发送以下 JSON 作为请求体：
+
 ```json
 {
   "html_content": "<html><body><h1>Hello World</h1></body></html>",
@@ -51,7 +75,9 @@ curl -s -X POST "http://127.0.0.1:8000/api/v1/html/upload" \
 
 GET /api/v1/html/status
 
-## 返回结构示例
+## 返回结构示例（/api/v1/html/parse 与 /api/v1/html/upload 成功返回）
+
+以下示例为 HTML 解析成功时的统一响应结构：
 
 ```json
 {
diff --git a/llm_web_kit/api/models/request.py b/llm_web_kit/api/models/request.py
index 6b2d4362..712f7e07 100644
--- a/llm_web_kit/api/models/request.py
+++ b/llm_web_kit/api/models/request.py
@@ -17,6 +17,12 @@ class HTMLParseRequest(BaseModel):
         max_length=10485760  # 10MB
     )
 
+    url: Optional[str] = Field(
+        None,
+        description="url 地址",
+        max_length=10485760  # 10MB
+    )
+
     options: Optional[Dict[str, Any]] = Field(
         default_factory=dict,
         description="解析选项配置"
diff --git a/llm_web_kit/api/requirements.txt b/llm_web_kit/api/requirements.txt
index 3ede5656..c7cbd56f 100644
--- a/llm_web_kit/api/requirements.txt
+++ b/llm_web_kit/api/requirements.txt
@@ -11,6 +11,11 @@ python-dotenv>=1.0.0
 
 # 数据处理
 python-multipart>=0.0.6
+torch==2.6.0
+transformers==4.52.4
 
 # 类型提示支持
 uvicorn[standard]>=0.24.0
+
+# 模型推理
+vllm==0.8.5.post1

From 4a97137b1c9397362a0c615594e66862d90db2d8 Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Thu, 11 Sep 2025 16:42:33 +0800
Subject: [PATCH 10/12] feat: add html parse api

---
 llm_web_kit/api/services/inference_service.py |  94 ++++++++++--------
 llm_web_kit/config/pipe_tpl/model.jsonc       | Bin 0 -> 112 bytes
 2 files changed, 50 insertions(+), 44 deletions(-)
 create mode 100644 llm_web_kit/config/pipe_tpl/model.jsonc

diff --git a/llm_web_kit/api/services/inference_service.py b/llm_web_kit/api/services/inference_service.py
index 3a2d7bf2..484a31cf 100644
--- a/llm_web_kit/api/services/inference_service.py
+++ b/llm_web_kit/api/services/inference_service.py
@@ -1,6 +1,8 @@
 # vLLM 作为可选依赖：导入失败时保持模块可用，实际使用时再报错
 import json
+import os
 import re
+import time
 from dataclasses import dataclass
 from enum import Enum
 from typing import List
@@ -9,12 +11,18 @@
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
 
+from llm_web_kit.config.cfg_reader import load_config
+
+from ..dependencies import get_logger
+
+logger = get_logger(__name__)
+
 
 @dataclass
 class InferenceConfig:
-    model_path: str = "/mnt/installers/checkpoint-3296"
-    data_path: str = "/fs-computility/llmit_d/shared/liumengjie/NeuScraper_cc/benchmark_process/benchmark_with_alg_800.jsonl"
-    output_path: str = "/share/liukaiwen/test_results"
+    model_path: str = ""
+    data_path: str = ""
+    output_path: str = ""
     use_logits_processor: bool = True
     num_workers: int = 8
     max_tokens: int = 32768
@@ -28,19 +36,18 @@ class InferenceConfig:
 
 
 config = InferenceConfig(
-    model_path="/mnt/installers/checkpoint-3296",  # checkpoint-3296路径
-    output_path="/share/liukaiwen/test_results",
+    model_path="",  # checkpoint-3296路径
+    output_path="",
     use_logits_processor=True,  # 启用逻辑处理器确保JSON格式输出
-    num_workers=8,              # 并行工作进程数
-    max_tokens=26000,           # 最大输入token数
-    temperature=0,              # 确定性输出
+    num_workers=8,  # 并行工作进程数
+    max_tokens=26000,  # 最大输入token数
+    temperature=0,  # 确定性输出
     top_p=0.95,
-    max_output_tokens=8192,     # 最大输出token数
-    tensor_parallel_size=1,     # 张量并行大小
-    template=True               # 启用聊天模板
+    max_output_tokens=8192,  # 最大输出token数
+    tensor_parallel_size=1,  # 张量并行大小
+    template=True  # 启用聊天模板
 )
 
-
 PROMPT = """As a front-end engineering expert in HTML, your task is to analyze the given HTML structure and accurately classify elements with the _item_id attribute as either "main" (primary content) or "other" (supplementary content). Your goal is to precisely extract the primary content of the page, ensuring that only the most relevant information is labeled as "main" while excluding navigation, metadata, and other non-essential elements.
 Guidelines for Classification:
 Primary Content ("main")
@@ -227,7 +234,7 @@ def reformat_map(text):
         return {}
 
 
-def main(simplified_html: str, model: object, tokenizer: object):
+def main(simplified_html: str, model: object, tokenizer: object, model_path: str):
     # tokenizer = AutoTokenizer.from_pretrained("/share/liukaiwen/models/qwen3-0.6b/checkpoint-3296", trust_remote_code=True)
     # simplified_html = simplify_html(ori_html)
     # print("sim_html length", len(simplified_html))
@@ -240,7 +247,7 @@ def main(simplified_html: str, model: object, tokenizer: object):
     chat_prompt = add_template(prompt, tokenizer)
 
     if config.use_logits_processor:
-        token_state = Token_state(config.model_path)
+        token_state = Token_state(model_path)
         sampling_params = SamplingParams(
             temperature=config.temperature,
             top_p=config.top_p,
@@ -288,6 +295,7 @@ def __init__(self):
         self._tokenizer = None
         self._initialized = False
         self._init_lock = None  # 用于异步初始化锁
+        self._model_path = None
 
     async def warmup(self):
         """在服务启动阶段主动预热模型（异步初始化）。"""
@@ -308,6 +316,11 @@ async def _ensure_initialized(self):
     async def _init_model(self):
         """初始化模型和tokenizer."""
         try:
+            llm_config = load_config(suppress_error=True)
+            self.model_path = os.environ['MODEL_PATH'] if 'MODEL_PATH' in os.environ else llm_config.get('model_path',
+                                                                                                         None)
+            if self.model_path is None:
+                raise RuntimeError("model_path为空，未配置模型路径")
             if SamplingParams is None:
                 raise RuntimeError(
                     "当前环境未安装 vLLM 或安装失败，无法执行模型推理。建议在 Linux+NVIDIA GPU 环境安装 vLLM，"
@@ -316,13 +329,13 @@ async def _init_model(self):
 
             # 初始化 tokenizer
             self._tokenizer = AutoTokenizer.from_pretrained(
-                config.model_path,
+                self.model_path,
                 trust_remote_code=True
             )
 
             # 初始化 LLM 模型
             self._llm = LLM(
-                model=config.model_path,
+                model=self.model_path,
                 trust_remote_code=True,
                 dtype=config.dtype,
                 tensor_parallel_size=config.tensor_parallel_size,
@@ -330,10 +343,10 @@ async def _init_model(self):
                 max_model_len=config.max_tokens,  # 减少序列长度避免内存不足
             )
 
-            print(f"模型初始化成功: {config.model_path}")
+            logger.info(f"模型初始化成功: {self.model_path}")
 
         except Exception as e:
-            print(f"模型初始化失败: {e}")
+            logger.error(f"模型初始化失败: {e}")
             # 如果模型初始化失败，保持为 None，后续调用会返回占位结果
             self._llm = None
             self._tokenizer = None
@@ -344,14 +357,14 @@ async def inference(self, simplified_html: str, options: dict | None = None) ->
             await self._ensure_initialized()
 
             if self._llm is None or self._tokenizer is None:
-                print("模型未初始化，返回占位结果")
+                logger.error("模型未初始化，返回占位结果")
                 return self._get_placeholder_result()
 
             # 执行真实推理
             return await self._run_real_inference(simplified_html, options)
 
         except Exception as e:
-            print(f"推理过程出错: {e}")
+            logger.error(f"推理过程出错: {e}")
             return self._get_placeholder_result()
 
     async def _run_real_inference(self, simplified_html: str, options: dict | None = None) -> dict:
@@ -363,7 +376,7 @@ async def _run_real_inference(self, simplified_html: str, options: dict | None =
 
             # 设置采样参数
             if config.use_logits_processor:
-                token_state = Token_state(config.model_path)
+                token_state = Token_state(self.model_path)
                 sampling_params = SamplingParams(
                     temperature=config.temperature,
                     top_p=config.top_p,
@@ -378,37 +391,29 @@ async def _run_real_inference(self, simplified_html: str, options: dict | None =
                 )
 
             # 执行推理
+            start_time = time.time()
             output = self._llm.generate(chat_prompt, sampling_params)
+            end_time = time.time()
             output_json = clean_output(output)
 
             # 格式化结果
             result = reformat_map(output_json)
-            print(f"推理完成，结果: {result}")
+            logger.info(f"推理完成，结果: {result}, 耗时: {end_time - start_time}秒")
             return result
 
         except Exception as e:
-            print(f"真实推理失败: {e}")
+            logger.error(f"真实推理失败: {e}")
             return self._get_placeholder_result()
 
     def _get_placeholder_result(self) -> dict:
         """返回占位结果."""
-        return {
-            "item_id 1": 0,
-            "item_id 2": 0,
-            "item_id 3": 0,
-            "item_id 4": 0,
-            "item_id 5": 0,
-            "item_id 6": 0,
-            "item_id 7": 0,
-            "item_id 8": 0,
-            "item_id 9": 1
-        }
+        return {}
 
 
 if __name__ == "__main__":
     config = InferenceConfig(
-        model_path="/mnt/installers/checkpoint-3296",
-        output_path="/share/liukaiwen/test_results",
+        model_path="",
+        output_path="",
         use_logits_processor=True,
         num_workers=8,
         max_tokens=26000,
@@ -419,18 +424,19 @@ def _get_placeholder_result(self) -> dict:
         template=True,
     )
     try:
-        tokenizer = AutoTokenizer.from_pretrained("/mnt/installers/checkpoint-3296", trust_remote_code=True)
-        model = LLM(model=config.model_path,
-                trust_remote_code=True,
-                dtype=config.dtype,
-                # 设置最大模型长度
-                max_model_len=config.max_tokens,
-                tensor_parallel_size=config.tensor_parallel_size)
+        llm_config = load_config(suppress_error=True)
+        model_path = llm_config.get('model_path', None)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = LLM(model=model_path,
+                    trust_remote_code=True,
+                    dtype=config.dtype,
+                    # 设置最大模型长度
+                    max_model_len=config.max_tokens,
+                    tensor_parallel_size=config.tensor_parallel_size)
 
         simplified_html = "<html><body><h1>Hello World</h1></body></html>"
         response_json = main(simplified_html, model, tokenizer)
         llm_response_dict = reformat_map(response_json)
-        print(llm_response_dict)
     except Exception:
         raise
     finally:
diff --git a/llm_web_kit/config/pipe_tpl/model.jsonc b/llm_web_kit/config/pipe_tpl/model.jsonc
new file mode 100644
index 0000000000000000000000000000000000000000..ff77efe59c74b645692716043e597767dacfc319
GIT binary patch
literal 112
zcmW-Z(F#C75Jk`PmVc0UZBdG!(Z&{8R_vSnI&PY}Gjs2md7leIt}0lGjCcY)JC4L8
oxBD}zbdzJ5)lW@MTo8R|vkYRrbkSw}nM6rVMN6YEhMP?K0o1S)rT_o{

literal 0
HcmV?d00001


From d4382a4099a086ac4af84a734b7f8326210dd94d Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Thu, 11 Sep 2025 16:51:08 +0800
Subject: [PATCH 11/12] feat: add html parse api

---
 llm_web_kit/api/routers/htmls.py              | 30 ++++----
 llm_web_kit/api/services/html_service.py      | 10 +--
 llm_web_kit/api/services/inference_service.py | 72 +++++++++----------
 3 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/llm_web_kit/api/routers/htmls.py b/llm_web_kit/api/routers/htmls.py
index fd4c1c35..0f69074a 100644
--- a/llm_web_kit/api/routers/htmls.py
+++ b/llm_web_kit/api/routers/htmls.py
@@ -16,7 +16,7 @@
 router = APIRouter()
 
 
-@router.post("/html/parse", response_model=HTMLParseResponse)
+@router.post('/html/parse', response_model=HTMLParseResponse)
 async def parse_html(
     request: HTMLParseRequest,
     html_service: HTMLService = Depends(HTMLService)
@@ -26,7 +26,7 @@ async def parse_html(
     接收 HTML 字符串并返回解析后的结构化内容。
     """
     try:
-        logger.info(f"开始解析 HTML，内容长度: {len(request.html_content) if request.html_content else 0}")
+        logger.info(f'开始解析 HTML，内容长度: {len(request.html_content) if request.html_content else 0}')
 
         result = await html_service.parse_html(
             html_content=request.html_content,
@@ -37,14 +37,14 @@ async def parse_html(
         return HTMLParseResponse(
             success=True,
             data=result,
-            message="HTML 解析成功"
+            message='HTML 解析成功'
         )
     except Exception as e:
-        logger.error(f"HTML 解析失败: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"HTML 解析失败: {str(e)}")
+        logger.error(f'HTML 解析失败: {str(e)}')
+        raise HTTPException(status_code=500, detail=f'HTML 解析失败: {str(e)}')
 
 
-@router.post("/html/upload")
+@router.post('/html/upload')
 async def upload_html_file(
     file: UploadFile = File(...),
     html_service: HTMLService = Depends(HTMLService)
@@ -55,34 +55,34 @@ async def upload_html_file(
     """
     try:
         if not file.filename.endswith(('.html', '.htm')):
-            raise HTTPException(status_code=400, detail="只支持 HTML 文件")
+            raise HTTPException(status_code=400, detail='只支持 HTML 文件')
 
         content = await file.read()
         html_content = content.decode('utf-8')
 
-        logger.info(f"上传 HTML 文件: {file.filename}, 大小: {len(content)} bytes")
+        logger.info(f'上传 HTML 文件: {file.filename}, 大小: {len(content)} bytes')
 
         result = await html_service.parse_html(html_content=html_content)
 
         return HTMLParseResponse(
             success=True,
             data=result,
-            message="HTML 文件解析成功",
+            message='HTML 文件解析成功',
             filename=file.filename
         )
     except Exception as e:
-        logger.error(f"HTML 文件解析失败: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"HTML 文件解析失败: {str(e)}")
+        logger.error(f'HTML 文件解析失败: {str(e)}')
+        raise HTTPException(status_code=500, detail=f'HTML 文件解析失败: {str(e)}')
 
 
-@router.get("/html/status")
+@router.get('/html/status')
 async def get_service_status():
     """获取服务状态.
 
     返回 HTML 处理服务的当前状态信息。
     """
     return {
-        "service": "HTML Processing Service",
-        "status": "running",
-        "version": "1.0.0"
+        'service': 'HTML Processing Service',
+        'status': 'running',
+        'version': '1.0.0'
     }
diff --git a/llm_web_kit/api/services/html_service.py b/llm_web_kit/api/services/html_service.py
index 72373049..64b740ac 100644
--- a/llm_web_kit/api/services/html_service.py
+++ b/llm_web_kit/api/services/html_service.py
@@ -20,7 +20,7 @@ def __init__(self):
         try:
             self._inference_service = get_inference_service()
         except Exception as e:
-            logger.warning(f"InferenceService 获取失败（将在首次调用时再尝试）：{e}")
+            logger.warning(f'InferenceService 获取失败（将在首次调用时再尝试）：{e}')
             self._inference_service = None
 
     def _init_components(self):
@@ -36,7 +36,7 @@ async def parse_html(
         """解析 HTML 内容."""
         try:
             if not html_content:
-                raise ValueError("必须提供 HTML 内容")
+                raise ValueError('必须提供 HTML 内容')
 
             # 延迟导入，避免模块导入期异常导致服务类不可用
             try:
@@ -47,14 +47,14 @@ async def parse_html(
                 from llm_web_kit.main_html_parser.simplify_html.simplify_html import \
                     simplify_html
             except Exception as import_err:
-                logger.error(f"依赖导入失败: {import_err}")
+                logger.error(f'依赖导入失败: {import_err}')
                 raise
 
             # 简化网页
             try:
                 simplified_html, typical_raw_tag_html, _ = simplify_html(html_content)
             except Exception as e:
-                logger.error(f"简化网页失败: {e}")
+                logger.error(f'简化网页失败: {e}')
                 raise
 
             # 模型推理
@@ -72,7 +72,7 @@ async def parse_html(
             return dict(pre_data.items())
 
         except Exception as e:
-            logger.error(f"HTML 解析失败: {e}")
+            logger.error(f'HTML 解析失败: {e}')
             raise
 
     async def _parse_with_model(self, html_content: str, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
diff --git a/llm_web_kit/api/services/inference_service.py b/llm_web_kit/api/services/inference_service.py
index 484a31cf..095404d9 100644
--- a/llm_web_kit/api/services/inference_service.py
+++ b/llm_web_kit/api/services/inference_service.py
@@ -20,9 +20,9 @@
 
 @dataclass
 class InferenceConfig:
-    model_path: str = ""
-    data_path: str = ""
-    output_path: str = ""
+    model_path: str = ''
+    data_path: str = ''
+    output_path: str = ''
     use_logits_processor: bool = True
     num_workers: int = 8
     max_tokens: int = 32768
@@ -31,13 +31,13 @@ class InferenceConfig:
     max_output_tokens: int = 8192
     tensor_parallel_size: int = 1
     # 正式环境修改为bfloat16
-    dtype: str = "float16"
+    dtype: str = 'float16'
     template: bool = True
 
 
 config = InferenceConfig(
-    model_path="",  # checkpoint-3296路径
-    output_path="",
+    model_path='',  # checkpoint-3296路径
+    output_path='',
     use_logits_processor=True,  # 启用逻辑处理器确保JSON格式输出
     num_workers=8,  # 并行工作进程数
     max_tokens=26000,  # 最大输入token数
@@ -100,7 +100,7 @@ def create_prompt(alg_html: str) -> str:
 
 def add_template(prompt: str, tokenizer: AutoTokenizer) -> str:
     messages = [
-        {"role": "user", "content": prompt}
+        {'role': 'user', 'content': prompt}
     ]
     chat_prompt = tokenizer.apply_chat_template(
         messages,
@@ -125,13 +125,13 @@ class Token_state:
     def __init__(self, model_path):
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         token_id_map = {
-            State.Left_bracket: ["{"],
-            State.Right_bracket: ["}"],
+            State.Left_bracket: ['{'],
+            State.Right_bracket: ['}'],
             State.Space_quote: [' "'],
             State.Quote_colon_quote: ['":"'],
             State.Quote_comma: ['",'],
-            State.Main_other: ["main", "other"],
-            State.Number: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"],
+            State.Main_other: ['main', 'other'],
+            State.Number: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
         }
         self.token_id_map = {k: [self.tokenizer.encode(v)[0] for v in token_id_map[k]] for k in token_id_map}
 
@@ -157,7 +157,7 @@ def calc_max_count(self, prompt_token_ids: List[int]):
 
     def find_last_complete_number(self, input_ids: List[int]):
         if not input_ids:
-            return -1, "null", -1
+            return -1, 'null', -1
 
         tail_number_ids = []
         last_idx = len(input_ids) - 1
@@ -171,7 +171,7 @@ def find_last_complete_number(self, input_ids: List[int]):
             last_idx -= 1
 
         if last_idx < 0:
-            return tail_number, "tail", tail_number
+            return tail_number, 'tail', tail_number
 
         last_number_ids = []
         while last_idx >= 0 and input_ids[last_idx] in self.token_id_map[State.Number]:
@@ -181,8 +181,8 @@ def find_last_complete_number(self, input_ids: List[int]):
         last_number = int(self.tokenizer.decode(last_number_ids))
 
         if tail_number == last_number + 1:
-            return tail_number, "tail", tail_number
-        return last_number, "non_tail", tail_number
+            return tail_number, 'tail', tail_number
+        return last_number, 'non_tail', tail_number
 
     def process_logit(self, prompt_token_ids: List[int], input_ids: List[int], logits: torch.Tensor):
         if not input_ids:
@@ -205,7 +205,7 @@ def process_logit(self, prompt_token_ids: List[int], input_ids: List[int], logit
             return self.mask_other_logits(logits, self.tokenizer.encode(next_char))
         elif last_token in self.token_id_map[State.Number]:
             last_number, state, tail_number = self.find_last_complete_number(input_ids)
-            if state == "tail":
+            if state == 'tail':
                 return self.mask_other_logits(logits, self.token_id_map[State.Quote_colon_quote])
             else:
                 next_str = str(last_number + 1)
@@ -229,7 +229,7 @@ def process_logit(self, prompt_token_ids: List[int], input_ids: List[int], logit
 def reformat_map(text):
     try:
         data = json.loads(text)
-        return {"item_id " + k: 1 if v == "main" else 0 for k, v in data.items()}
+        return {'item_id ' + k: 1 if v == 'main' else 0 for k, v in data.items()}
     except json.JSONDecodeError:
         return {}
 
@@ -240,8 +240,8 @@ def main(simplified_html: str, model: object, tokenizer: object, model_path: str
     # print("sim_html length", len(simplified_html))
     if SamplingParams is None:
         raise RuntimeError(
-            "当前环境未安装 vLLM 或安装失败，无法执行模型推理。建议在 Linux+NVIDIA GPU 环境安装 vLLM，"
-            "或在 API 中使用占位/替代推理实现。原始导入错误: {}".format("_VLLM_IMPORT_ERROR")
+            '当前环境未安装 vLLM 或安装失败，无法执行模型推理。建议在 Linux+NVIDIA GPU 环境安装 vLLM，' +
+            '或在 API 中使用占位/替代推理实现。原始导入错误: {}'.format('_VLLM_IMPORT_ERROR')
         )
     prompt = create_prompt(simplified_html)
     chat_prompt = add_template(prompt, tokenizer)
@@ -270,8 +270,8 @@ def clean_output(output):
     prediction = output[0].outputs[0].text
 
     # Extract JSON from prediction
-    start_idx = prediction.rfind("{")
-    end_idx = prediction.rfind("}") + 1
+    start_idx = prediction.rfind('{')
+    end_idx = prediction.rfind('}') + 1
 
     if start_idx != -1 and end_idx != -1:
         json_str = prediction[start_idx:end_idx]
@@ -279,9 +279,9 @@ def clean_output(output):
         try:
             json.loads(json_str)  # Validate
         except Exception:
-            json_str = "{}"
+            json_str = '{}'
     else:
-        json_str = "{}"
+        json_str = '{}'
 
     return json_str
 
@@ -320,11 +320,11 @@ async def _init_model(self):
             self.model_path = os.environ['MODEL_PATH'] if 'MODEL_PATH' in os.environ else llm_config.get('model_path',
                                                                                                          None)
             if self.model_path is None:
-                raise RuntimeError("model_path为空，未配置模型路径")
+                raise RuntimeError('model_path为空，未配置模型路径')
             if SamplingParams is None:
                 raise RuntimeError(
-                    "当前环境未安装 vLLM 或安装失败，无法执行模型推理。建议在 Linux+NVIDIA GPU 环境安装 vLLM，"
-                    "或在 API 中使用占位/替代推理实现。原始导入错误: {}".format("_VLLM_IMPORT_ERROR")
+                    '当前环境未安装 vLLM 或安装失败，无法执行模型推理。建议在 Linux+NVIDIA GPU 环境安装 vLLM，' +
+                    '或在 API 中使用占位/替代推理实现。原始导入错误: {}'.format('_VLLM_IMPORT_ERROR')
                 )
 
             # 初始化 tokenizer
@@ -343,10 +343,10 @@ async def _init_model(self):
                 max_model_len=config.max_tokens,  # 减少序列长度避免内存不足
             )
 
-            logger.info(f"模型初始化成功: {self.model_path}")
+            logger.info(f'模型初始化成功: {self.model_path}')
 
         except Exception as e:
-            logger.error(f"模型初始化失败: {e}")
+            logger.error(f'模型初始化失败: {e}')
             # 如果模型初始化失败，保持为 None，后续调用会返回占位结果
             self._llm = None
             self._tokenizer = None
@@ -357,14 +357,14 @@ async def inference(self, simplified_html: str, options: dict | None = None) ->
             await self._ensure_initialized()
 
             if self._llm is None or self._tokenizer is None:
-                logger.error("模型未初始化，返回占位结果")
+                logger.error('模型未初始化，返回占位结果')
                 return self._get_placeholder_result()
 
             # 执行真实推理
             return await self._run_real_inference(simplified_html, options)
 
         except Exception as e:
-            logger.error(f"推理过程出错: {e}")
+            logger.error(f'推理过程出错: {e}')
             return self._get_placeholder_result()
 
     async def _run_real_inference(self, simplified_html: str, options: dict | None = None) -> dict:
@@ -398,11 +398,11 @@ async def _run_real_inference(self, simplified_html: str, options: dict | None =
 
             # 格式化结果
             result = reformat_map(output_json)
-            logger.info(f"推理完成，结果: {result}, 耗时: {end_time - start_time}秒")
+            logger.info(f'推理完成，结果: {result}, 耗时: {end_time - start_time}秒')
             return result
 
         except Exception as e:
-            logger.error(f"真实推理失败: {e}")
+            logger.error(f'真实推理失败: {e}')
             return self._get_placeholder_result()
 
     def _get_placeholder_result(self) -> dict:
@@ -410,10 +410,10 @@ def _get_placeholder_result(self) -> dict:
         return {}
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     config = InferenceConfig(
-        model_path="",
-        output_path="",
+        model_path='',
+        output_path='',
         use_logits_processor=True,
         num_workers=8,
         max_tokens=26000,
@@ -434,7 +434,7 @@ def _get_placeholder_result(self) -> dict:
                     max_model_len=config.max_tokens,
                     tensor_parallel_size=config.tensor_parallel_size)
 
-        simplified_html = "<html><body><h1>Hello World</h1></body></html>"
+        simplified_html = '<html><body><h1>Hello World</h1></body></html>'
         response_json = main(simplified_html, model, tokenizer)
         llm_response_dict = reformat_map(response_json)
     except Exception:

From 82886cc15189395cd3bbc8c1d1ffb47644526bf2 Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Thu, 11 Sep 2025 17:25:44 +0800
Subject: [PATCH 12/12] feat: add html parse api

---
 llm_web_kit/api/models/request.py        | 1 +
 llm_web_kit/api/services/html_service.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llm_web_kit/api/models/request.py b/llm_web_kit/api/models/request.py
index 712f7e07..6fdb7269 100644
--- a/llm_web_kit/api/models/request.py
+++ b/llm_web_kit/api/models/request.py
@@ -32,6 +32,7 @@ class HTMLParseRequest(BaseModel):
         json_schema_extra={
             "example": {
                 "html_content": "<html><body><h1>Hello World</h1></body></html>",
+                "url": "https://helloworld.com/hello",
                 "options": {
                     "clean_html": True
                 }
diff --git a/llm_web_kit/api/services/html_service.py b/llm_web_kit/api/services/html_service.py
index 64b740ac..46c6d4a3 100644
--- a/llm_web_kit/api/services/html_service.py
+++ b/llm_web_kit/api/services/html_service.py
@@ -52,7 +52,7 @@ async def parse_html(
 
             # 简化网页
             try:
-                simplified_html, typical_raw_tag_html, _ = simplify_html(html_content)
+                simplified_html, typical_raw_tag_html = simplify_html(html_content)
             except Exception as e:
                 logger.error(f'简化网页失败: {e}')
                 raise
@@ -72,7 +72,7 @@ async def parse_html(
             return dict(pre_data.items())
 
         except Exception as e:
-            logger.error(f'HTML 解析失败: {e}')
+            logger.error(f'HTML解析失败: {e}')
             raise
 
     async def _parse_with_model(self, html_content: str, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: