diff --git a/llm_web_kit/api/README.md b/llm_web_kit/api/README.md
new file mode 100644
index 00000000..700e2de3
--- /dev/null
+++ b/llm_web_kit/api/README.md
@@ -0,0 +1,105 @@
+# LLM Web Kit API
+
+基于 FastAPI 的 LLM Web Kit API 服务,提供 HTML 解析功能。
+
+## 功能特性
+
+- 🚀 基于 FastAPI 的高性能 Web API
+- 📄 HTML 内容解析与结构化输出
+- 🔗 支持 URL 和 HTML 字符串输入
+- 📁 支持 HTML 文件上传
+- 📚 自动生成的 API 文档
+- 🔧 可配置的解析选项
+
+## 快速开始
+
+配置环境变量
+
+```bash
+export MODEL_PATH=""
+```
+
+或者在配置文件 .llm-web-kit.jsonc 中添加 "model_path" 配置项
+
+安装依赖
+
+```bash
+pip install -r requirements.txt
+python llm_web_kit/api/run_server.py
+```
+
+- Swagger UI: http://127.0.0.1:8000/docs
+- ReDoc: http://127.0.0.1:8000/redoc
+
+## API 端点
+
+### HTML 解析
+
+POST /api/v1/html/parse
+
+请求示例:
+
+```bash
+curl -s -X POST "http://127.0.0.1:8000/api/v1/html/parse" \
+ -H "Content-Type: application/json" \
+ -d '{
+    "html_content": "<p>Hello World</p>",
+ "url": "https://helloworld.com/hello",
+ "options": {
+ "clean_html": true
+ }
+ }'
+```
+
+或直接发送以下 JSON 作为请求体:
+
+```json
+{
+  "html_content": "<p>Hello World</p>",
+ "options": {
+ "clean_html": true
+ }
+}
+```
+
+### 文件上传解析
+
+POST /api/v1/html/upload
+
+```bash
+curl -s -X POST "http://127.0.0.1:8000/api/v1/html/upload" \
+ -F "file=@/path/to/file.html"
+```
+
+### 服务状态
+
+GET /api/v1/html/status
+
+## 返回结构示例(/api/v1/html/parse 与 /api/v1/html/upload 成功返回)
+
+以下示例为 HTML 解析成功时的统一响应结构:
+
+```json
+{
+ "success": true,
+ "message": "HTML 解析成功",
+ "timestamp": "2025-08-26T16:45:43.140638",
+ "data": {
+ "layout_file_list": [],
+    "typical_raw_html": "<p>Hello World</p>",
+    "typical_raw_tag_html": "<p _item_id=\"1\">Hello World</p><p _item_id=\"2\">not main content</p>\n",
+    "llm_response": {
+      "item_id 1": 0,
+      "item_id 2": 1
+    },
+    "typical_main_html": "<p>Hello World</p>",
+    "html_target_list": ["Hello World"]
+ },
+ "metadata": null
+}
+```
+
+## 常见问题
+
+- 422 错误:确认请求头 `Content-Type: application/json`,并确保请求体 JSON 合法。
+- 依赖缺失:`pip install -r llm_web_kit/api/requirements.txt`。
diff --git a/llm_web_kit/api/__init__.py b/llm_web_kit/api/__init__.py
new file mode 100644
index 00000000..c2601bff
--- /dev/null
+++ b/llm_web_kit/api/__init__.py
@@ -0,0 +1,7 @@
+"""LLM Web Kit API 模块.
+
+提供基于 FastAPI 的 Web API 接口,用于处理 HTML 解析和内容提取功能。
+"""
+
+__version__ = "1.0.0"
+__author__ = "LLM Web Kit Team"
diff --git a/llm_web_kit/api/dependencies.py b/llm_web_kit/api/dependencies.py
new file mode 100644
index 00000000..32eadeb6
--- /dev/null
+++ b/llm_web_kit/api/dependencies.py
@@ -0,0 +1,78 @@
+"""API 依赖项管理.
+
+包含 FastAPI 应用的依赖项、配置管理和共享服务。
+"""
+
+import logging
+from functools import lru_cache
+from typing import Optional
+
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+logger = logging.getLogger(__name__)
+
+
+class Settings(BaseSettings):
+    """Application configuration loaded from environment variables / .env.
+
+    Field names map case-insensitively to environment variables
+    (e.g. ``PORT`` -> ``port``) via pydantic-settings.
+    """
+
+    # API metadata (shown in the OpenAPI docs)
+    api_title: str = "LLM Web Kit API"
+    api_version: str = "1.0.0"
+    api_description: str = "基于 LLM 的 Web 内容解析和提取 API 服务"
+
+    # Server binding
+    host: str = "0.0.0.0"
+    port: int = 8000
+    debug: bool = False
+
+    # Logging
+    log_level: str = "INFO"
+
+    # Model configuration
+    model_path: Optional[str] = None
+    max_content_length: int = 10 * 1024 * 1024  # 10MB request-body cap
+
+    # Cache TTL in seconds (1 hour)
+    cache_ttl: int = 3600
+
+    # pydantic v2 settings configuration
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        case_sensitive=False
+    )
+
+
+@lru_cache()
+def get_settings() -> Settings:
+    """Return the application Settings singleton (cached by lru_cache)."""
+    return Settings()
+
+
+def get_logger(name: str = __name__) -> logging.Logger:
+    """Return a logger with a stream handler and the configured log level.
+
+    The handler is attached only once per logger name, so repeated calls
+    with the same ``name`` do not duplicate output.
+    """
+    logger = logging.getLogger(name)
+    if not logger.handlers:
+        handler = logging.StreamHandler()
+        formatter = logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        )
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+        logger.setLevel(get_settings().log_level)
+    return logger
+
+
+# 全局依赖项
+settings = get_settings()
+
+# InferenceService 单例
+_inference_service_singleton = None
+
+
+def get_inference_service():
+    """Return the process-wide InferenceService singleton.
+
+    The import is deferred so that importing this module does not pull in
+    heavy model dependencies (torch / vllm) at startup.
+    """
+    global _inference_service_singleton
+    if _inference_service_singleton is None:
+        from .services.inference_service import InferenceService
+        _inference_service_singleton = InferenceService()
+    return _inference_service_singleton
diff --git a/llm_web_kit/api/main.py b/llm_web_kit/api/main.py
new file mode 100644
index 00000000..18f71663
--- /dev/null
+++ b/llm_web_kit/api/main.py
@@ -0,0 +1,85 @@
+"""FastAPI 应用主入口.
+
+提供 LLM Web Kit 的 Web API 服务,包括 HTML 解析、内容提取等功能。
+"""
+
+import uvicorn
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+
+from .dependencies import get_inference_service, get_logger, get_settings
+from .routers import htmls
+
+settings = get_settings()
+logger = get_logger(__name__)
+
+
+# 创建 FastAPI 应用实例(元数据读取自 Settings)
+app = FastAPI(
+ title=settings.api_title,
+ description=settings.api_description,
+ version=settings.api_version,
+ docs_url="/docs",
+ redoc_url="/redoc"
+)
+
+# 添加 CORS 中间件
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"], # 在生产环境中应该限制具体域名
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
+
+# 注册路由
+app.include_router(htmls.router, prefix="/api/v1", tags=["HTML 处理"])
+
+
+@app.get("/")
+async def root():
+    """Root endpoint: report service liveness and API version."""
+    return {
+        "message": "LLM Web Kit API 服务运行中",
+        "version": settings.api_version,
+        "status": "healthy"
+    }
+
+
+@app.get("/health")
+async def health_check():
+    """Health-check endpoint for load balancers / orchestration probes."""
+    return {"status": "healthy", "service": "llm-web-kit-api"}
+
+
+@app.on_event("startup")
+async def app_startup():
+    """Warm up the model at startup to avoid first-request cold-start latency.
+
+    NOTE(review): ``@app.on_event`` is deprecated in recent FastAPI
+    releases; consider migrating to the ``lifespan`` context-manager API.
+    """
+    try:
+        service = get_inference_service()
+        await service.warmup()
+        logger.info("InferenceService 模型预热完成")
+    except Exception as e:
+        logger.warning(f"InferenceService 预热失败(服务仍可运行,将在首次请求时再初始化): {e}")
+
+
+@app.exception_handler(Exception)
+async def global_exception_handler(request, exc):
+ """全局异常处理器."""
+ logger.error(f"未处理的异常: {exc}")
+ return JSONResponse(
+ status_code=500,
+ content={"detail": "服务器内部错误", "error": str(exc)}
+ )
+
+
+if __name__ == "__main__":
+ # 开发环境运行
+ uvicorn.run(
+ "llm_web_kit.api.main:app",
+ host=settings.host,
+ port=settings.port,
+ reload=True,
+ log_level=(settings.log_level or "INFO").lower()
+ )
diff --git a/llm_web_kit/api/models/__init__.py b/llm_web_kit/api/models/__init__.py
new file mode 100644
index 00000000..8f1a1ad6
--- /dev/null
+++ b/llm_web_kit/api/models/__init__.py
@@ -0,0 +1,13 @@
+"""Pydantic 模型模块.
+
+包含所有 API 请求和响应的数据模型定义。
+"""
+
+from .request import HTMLParseRequest
+from .response import ErrorResponse, HTMLParseResponse
+
+__all__ = [
+ "HTMLParseRequest",
+ "HTMLParseResponse",
+ "ErrorResponse"
+]
diff --git a/llm_web_kit/api/models/request.py b/llm_web_kit/api/models/request.py
new file mode 100644
index 00000000..6fdb7269
--- /dev/null
+++ b/llm_web_kit/api/models/request.py
@@ -0,0 +1,41 @@
+"""请求数据模型.
+
+定义 API 请求的数据结构和验证规则。
+"""
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class HTMLParseRequest(BaseModel):
+ """HTML 解析请求模型."""
+
+ html_content: Optional[str] = Field(
+ None,
+ description="HTML 内容字符串",
+ max_length=10485760 # 10MB
+ )
+
+ url: Optional[str] = Field(
+ None,
+ description="url 地址",
+ max_length=10485760 # 10MB
+ )
+
+ options: Optional[Dict[str, Any]] = Field(
+ default_factory=dict,
+ description="解析选项配置"
+ )
+
+ model_config = ConfigDict(
+ json_schema_extra={
+ "example": {
+ "html_content": "Hello World
",
+ "url": "https://helloworld.com/hello",
+ "options": {
+ "clean_html": True
+ }
+ }
+ }
+ )
diff --git a/llm_web_kit/api/models/response.py b/llm_web_kit/api/models/response.py
new file mode 100644
index 00000000..99fbf98f
--- /dev/null
+++ b/llm_web_kit/api/models/response.py
@@ -0,0 +1,99 @@
+"""响应数据模型.
+
+定义 API 响应的数据结构和格式。
+"""
+
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class ErrorResponse(BaseModel):
+    """Error response model returned on failed requests."""
+
+    success: bool = Field(False, description="请求是否成功")
+    error: str = Field(..., description="错误信息")
+    detail: Optional[str] = Field(None, description="详细错误信息")
+    timestamp: datetime = Field(default_factory=datetime.now, description="错误发生时间")
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "success": False,
+                "error": "HTML 解析失败",
+                "detail": "无效的 HTML 格式",
+                "timestamp": "2024-01-01T12:00:00"
+            }
+        }
+    )
+
+
+class BaseResponse(BaseModel):
+    """Common fields shared by all successful API responses."""
+
+    success: bool = Field(..., description="请求是否成功")
+    message: str = Field(..., description="响应消息")
+    timestamp: datetime = Field(default_factory=datetime.now, description="响应时间")
+
+
+class HTMLParseData(BaseModel):
+    """Structured payload of an HTML parse result."""
+
+    layout_file_list: List[str] = Field(default_factory=list, description="布局文件列表")
+    typical_raw_html: Optional[str] = Field(None, description="原始 HTML")
+    typical_raw_tag_html: Optional[str] = Field(None, description="带标签标注的原始 HTML")
+    llm_response: Dict[str, int] = Field(default_factory=dict, description="LLM 项目打标结果")
+    typical_main_html: Optional[str] = Field(None, description="解析得到的主体 HTML")
+    html_target_list: List[Any] = Field(default_factory=list, description="正文候选/目标列表")
+
+
+class HTMLParseResponse(BaseResponse):
+ """HTML 解析响应模型."""
+
+ data: Optional[HTMLParseData] = Field(None, description="解析结果数据")
+ metadata: Optional[Dict[str, Any]] = Field(None, description="元数据信息")
+
+ model_config = ConfigDict(
+ json_schema_extra={
+ "example": {
+ "success": True,
+ "message": "HTML 解析成功",
+ "timestamp": "2025-08-26T16:45:43.140638",
+ "data": {
+ "layout_file_list": [],
+ "typical_raw_html": "Hello World
",
+ "typical_raw_tag_html": "Hello World
\n",
+ "llm_response": {
+ "item_id 1": 0,
+ "item_id 9": 1
+ },
+ "typical_main_html": "",
+ "html_target_list": []
+ },
+ "metadata": None
+ }
+ }
+ )
+
+
+class ServiceStatusResponse(BaseResponse):
+    """Service status response model.
+
+    NOTE(review): ``uptime`` is never populated by the /html/status route —
+    confirm whether it should be wired up or removed.
+    """
+
+    service: str = Field(..., description="服务名称")
+    version: str = Field(..., description="服务版本")
+    status: str = Field(..., description="服务状态")
+    uptime: Optional[float] = Field(None, description="运行时间(秒)")
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "success": True,
+                "message": "服务状态正常",
+                "timestamp": "2024-01-01T12:00:00",
+                "service": "HTML Processing Service",
+                "version": "1.0.0",
+                "status": "running",
+                "uptime": 3600.5
+            }
+        }
+    )
diff --git a/llm_web_kit/api/requirements.txt b/llm_web_kit/api/requirements.txt
new file mode 100644
index 00000000..c7cbd56f
--- /dev/null
+++ b/llm_web_kit/api/requirements.txt
@@ -0,0 +1,21 @@
+# HTTP 客户端
+aiohttp>=3.9.0
+
+# FastAPI 相关依赖
+fastapi>=0.104.0
+pydantic>=2.0.0
+pydantic-settings>=2.0.0
+
+# 日志和配置
+python-dotenv>=1.0.0
+
+# 数据处理
+python-multipart>=0.0.6
+torch==2.6.0
+transformers==4.52.4
+
+# ASGI 服务器
+uvicorn[standard]>=0.24.0
+
+# 模型推理
+vllm==0.8.5.post1
diff --git a/llm_web_kit/api/routers/__init__.py b/llm_web_kit/api/routers/__init__.py
new file mode 100644
index 00000000..4a3f1567
--- /dev/null
+++ b/llm_web_kit/api/routers/__init__.py
@@ -0,0 +1,8 @@
+"""路由模块.
+
+包含所有 API 路由定义,按功能模块组织。
+"""
+
+from . import htmls
+
+__all__ = ["htmls"]
diff --git a/llm_web_kit/api/routers/htmls.py b/llm_web_kit/api/routers/htmls.py
new file mode 100644
index 00000000..0f69074a
--- /dev/null
+++ b/llm_web_kit/api/routers/htmls.py
@@ -0,0 +1,88 @@
+"""HTML 处理路由.
+
+提供 HTML 解析、内容提取等功能的 API 端点。
+"""
+
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
+
+from ..dependencies import get_logger, get_settings
+from ..models.request import HTMLParseRequest
+from ..models.response import HTMLParseResponse
+from ..services.html_service import HTMLService
+
+logger = get_logger(__name__)
+settings = get_settings()
+
+router = APIRouter()
+
+
+@router.post('/html/parse', response_model=HTMLParseResponse)
+async def parse_html(
+ request: HTMLParseRequest,
+ html_service: HTMLService = Depends(HTMLService)
+):
+ """解析 HTML 内容.
+
+ 接收 HTML 字符串并返回解析后的结构化内容。
+ """
+ try:
+ logger.info(f'开始解析 HTML,内容长度: {len(request.html_content) if request.html_content else 0}')
+
+ result = await html_service.parse_html(
+ html_content=request.html_content,
+ url=request.url,
+ options=request.options
+ )
+
+ return HTMLParseResponse(
+ success=True,
+ data=result,
+ message='HTML 解析成功'
+ )
+ except Exception as e:
+ logger.error(f'HTML 解析失败: {str(e)}')
+ raise HTTPException(status_code=500, detail=f'HTML 解析失败: {str(e)}')
+
+
+@router.post('/html/upload')
+async def upload_html_file(
+ file: UploadFile = File(...),
+ html_service: HTMLService = Depends(HTMLService)
+):
+ """上传 HTML 文件进行解析.
+
+ 支持上传 HTML 文件,自动解析并返回结果。
+ """
+ try:
+ if not file.filename.endswith(('.html', '.htm')):
+ raise HTTPException(status_code=400, detail='只支持 HTML 文件')
+
+ content = await file.read()
+ html_content = content.decode('utf-8')
+
+ logger.info(f'上传 HTML 文件: {file.filename}, 大小: {len(content)} bytes')
+
+ result = await html_service.parse_html(html_content=html_content)
+
+ return HTMLParseResponse(
+ success=True,
+ data=result,
+ message='HTML 文件解析成功',
+ filename=file.filename
+ )
+ except Exception as e:
+ logger.error(f'HTML 文件解析失败: {str(e)}')
+ raise HTTPException(status_code=500, detail=f'HTML 文件解析失败: {str(e)}')
+
+
+@router.get('/html/status')
+async def get_service_status():
+    """获取服务状态.
+
+    返回 HTML 处理服务的当前状态信息。
+
+    NOTE(review): the version string is hard-coded here and duplicates
+    ``settings.api_version``; consider reading it from settings.
+    """
+    return {
+        'service': 'HTML Processing Service',
+        'status': 'running',
+        'version': '1.0.0'
+    }
diff --git a/llm_web_kit/api/run_server.py b/llm_web_kit/api/run_server.py
new file mode 100644
index 00000000..739b14fa
--- /dev/null
+++ b/llm_web_kit/api/run_server.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+"""API 服务器启动脚本.
+
+用于启动 LLM Web Kit API 服务。
+"""
+
+import os
+import sys
+
+import uvicorn
+
+# 添加项目根目录到 Python 路径
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from llm_web_kit.api.dependencies import get_settings
+
+if __name__ == "__main__":
+    settings = get_settings()
+    print("启动 LLM Web Kit API 服务器...")
+    print(f"API 文档地址: http://{settings.host}:{settings.port}/docs")
+    print(f"ReDoc 文档地址: http://{settings.host}:{settings.port}/redoc")
+
+    # NOTE(review): reload=True is a development convenience; disable it
+    # (or make it configurable) for production deployments.
+    uvicorn.run(
+        "llm_web_kit.api.main:app",
+        host=settings.host,
+        port=settings.port,
+        reload=True,
+        log_level=(settings.log_level or "INFO").lower()
+    )
diff --git a/llm_web_kit/api/services/__init__.py b/llm_web_kit/api/services/__init__.py
new file mode 100644
index 00000000..7e6717b1
--- /dev/null
+++ b/llm_web_kit/api/services/__init__.py
@@ -0,0 +1,8 @@
+"""服务层模块.
+
+包含业务逻辑服务,桥接原有项目功能。
+"""
+
+from .html_service import HTMLService
+
+__all__ = ["HTMLService"]
diff --git a/llm_web_kit/api/services/html_service.py b/llm_web_kit/api/services/html_service.py
new file mode 100644
index 00000000..46c6d4a3
--- /dev/null
+++ b/llm_web_kit/api/services/html_service.py
@@ -0,0 +1,81 @@
+"""HTML 处理服务.
+
+桥接原有项目的 HTML 解析和内容提取功能,提供统一的 API 接口。
+"""
+
+from typing import Any, Dict, Optional
+
+from ..dependencies import get_inference_service, get_logger, get_settings
+
+logger = get_logger(__name__)
+settings = get_settings()
+
+
+class HTMLService:
+    """HTML processing service bridging the project's parsing pipeline."""
+
+    def __init__(self):
+        """Initialize with the shared InferenceService singleton.
+
+        Reusing the global singleton avoids re-initializing the model
+        for every request-scoped HTMLService instance.
+        """
+        try:
+            self._inference_service = get_inference_service()
+        except Exception as e:
+            logger.warning(f'InferenceService 获取失败(将在首次调用时再尝试):{e}')
+            self._inference_service = None
+
+    def _init_components(self):
+        """Kept for compatibility (currently unused)."""
+        return None
+
+    async def parse_html(
+        self,
+        html_content: Optional[str] = None,
+        url: Optional[str] = None,
+        options: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """Parse HTML content and return the structured result dict.
+
+        Raises ValueError when ``html_content`` is empty.
+
+        NOTE(review): ``url`` is accepted but not used anywhere in the
+        pipeline below — confirm whether it should influence parsing.
+        """
+        try:
+            if not html_content:
+                raise ValueError('必须提供 HTML 内容')
+
+            # Deferred imports: a failing heavy dependency should not make
+            # this class unimportable at module import time.
+            try:
+                from llm_web_kit.input.pre_data_json import (PreDataJson,
+                                                             PreDataJsonKey)
+                from llm_web_kit.main_html_parser.parser.tag_mapping import \
+                    MapItemToHtmlTagsParser
+                from llm_web_kit.main_html_parser.simplify_html.simplify_html import \
+                    simplify_html
+            except Exception as import_err:
+                logger.error(f'依赖导入失败: {import_err}')
+                raise
+
+            # Step 1: simplify the page into labeled candidate items.
+            try:
+                simplified_html, typical_raw_tag_html = simplify_html(html_content)
+            except Exception as e:
+                logger.error(f'简化网页失败: {e}')
+                raise
+
+            # Step 2: model inference over the simplified HTML.
+            llm_response = await self._parse_with_model(simplified_html, options)
+
+            # Step 3: map the model's item labels back onto the raw HTML.
+            pre_data = PreDataJson({})
+            pre_data[PreDataJsonKey.TYPICAL_RAW_HTML] = html_content
+            pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] = typical_raw_tag_html
+            pre_data[PreDataJsonKey.LLM_RESPONSE] = llm_response
+            parser = MapItemToHtmlTagsParser({})
+            pre_data = parser.parse_single(pre_data)
+
+            # Convert PreDataJson into a plain dict so the response model validates.
+            return dict(pre_data.items())
+
+        except Exception as e:
+            logger.error(f'HTML解析失败: {e}')
+            raise
+
+    async def _parse_with_model(self, html_content: str, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        # Lazily (re)acquire the singleton if construction failed in __init__.
+        if self._inference_service is None:
+            self._inference_service = get_inference_service()
+        return await self._inference_service.inference(html_content, options or {})
diff --git a/llm_web_kit/api/services/inference_service.py b/llm_web_kit/api/services/inference_service.py
new file mode 100644
index 00000000..095404d9
--- /dev/null
+++ b/llm_web_kit/api/services/inference_service.py
@@ -0,0 +1,447 @@
+# vLLM 作为可选依赖:导入失败时保持模块可用,实际使用时再报错
+import json
+import os
+import re
+import time
+from dataclasses import dataclass
+from enum import Enum
+from typing import List
+
+import torch
+from transformers import AutoTokenizer
+from vllm import LLM, SamplingParams
+
+from llm_web_kit.config.cfg_reader import load_config
+
+from ..dependencies import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class InferenceConfig:
+    """Inference configuration for the main-content classification model."""
+
+    model_path: str = ''
+    data_path: str = ''
+    output_path: str = ''
+    use_logits_processor: bool = True
+    num_workers: int = 8
+    max_tokens: int = 32768
+    temperature: float = 0
+    top_p: float = 0.95
+    max_output_tokens: int = 8192
+    tensor_parallel_size: int = 1
+    # switch to bfloat16 in production
+    dtype: str = 'float16'
+    template: bool = True
+
+
+# Module-level inference configuration used by main() and InferenceService.
+config = InferenceConfig(
+    model_path='',  # path to checkpoint-3296
+    output_path='',
+    use_logits_processor=True,  # constrain decoding to valid JSON output
+    num_workers=8,  # parallel worker count
+    max_tokens=26000,  # max input tokens
+    temperature=0,  # deterministic output
+    top_p=0.95,
+    max_output_tokens=8192,  # max output tokens
+    tensor_parallel_size=1,  # tensor-parallel degree
+    template=True  # enable chat template
+)
+
+PROMPT = """As a front-end engineering expert in HTML, your task is to analyze the given HTML structure and accurately classify elements with the _item_id attribute as either "main" (primary content) or "other" (supplementary content). Your goal is to precisely extract the primary content of the page, ensuring that only the most relevant information is labeled as "main" while excluding navigation, metadata, and other non-essential elements.
+Guidelines for Classification:
+Primary Content ("main")
+Elements that constitute the core content of the page should be classified as "main". These typically include:
+✅ For Articles, News, and Blogs:
+The main text body of the article, blog post, or news content.
+Images embedded within the main content that contribute to the article.
+✅ For Forums & Discussion Threads:
+The original post in the thread.
+Replies and discussions that are part of the main conversation.
+✅ For Q&A Websites:
+The question itself posted by a user.
+Answers to the question and replies to answers that contribute to the discussion.
+✅ For Other Content-Based Pages:
+Any rich text, paragraphs, or media that serve as the primary focus of the page.
+Supplementary Content ("other")
+Elements that do not contribute to the primary content but serve as navigation, metadata, or supporting information should be classified as "other". These include:
+❌ Navigation & UI Elements:
+Menus, sidebars, footers, breadcrumbs, and pagination links.
+"Skip to content" links and accessibility-related text.
+❌ Metadata & User Information:
+Article titles, author names, timestamps, and view counts.
+Like counts, vote counts, and other engagement metrics.
+❌ Advertisements & Promotional Content:
+Any section labeled as "Advertisement" or "Sponsored".
+Social media sharing buttons, follow prompts, and external links.
+❌ Related & Suggested Content:
+"Read More", "Next Article", "Trending Topics", and similar sections.
+Lists of related articles, tags, and additional recommendations.
+Task Instructions:
+You will be provided with a simplified HTML structure containing elements with an _item_id attribute. Your job is to analyze each element's function and determine whether it should be classified as "main" or "other".
+Response Format:
+Return a JSON object where each key is the _item_id value, and the corresponding value is either "main" or "other", as in the following example:
+{{"1": "other","2": "main","3": "other"}}
+🚨 Important Notes:
+Do not include any explanations in the output—only return the JSON.
+Ensure high accuracy by carefully distinguishing between primary content and supplementary content.
+Err on the side of caution—if an element seems uncertain, classify it as "other" unless it clearly belongs to the main content.
+
+Input HTML:
+{alg_html}
+
+Output format should be a JSON-formatted string representing a dictionary where keys are item_id strings and values are either 'main' or 'other'. Make sure to include ALL item_ids from the input HTML.
+"""
+
+
+def create_prompt(alg_html: str) -> str:
+    """Fill the classification prompt template with the simplified HTML."""
+    return PROMPT.format(alg_html=alg_html)
+
+
+def add_template(prompt: str, tokenizer: AutoTokenizer) -> str:
+    """Wrap the raw prompt in the model's chat template (generation-ready)."""
+    messages = [
+        {'role': 'user', 'content': prompt}
+    ]
+    chat_prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=True  # switches between thinking / non-thinking modes; default True
+    )
+    return chat_prompt
+
+
+class State(Enum):
+    """Decoding states for the JSON-constraining logits processor."""
+
+    Left_bracket = 0
+    Right_bracket = 1
+    Space_quote = 2
+    Quote_colon_quote = 3
+    Quote_comma = 4
+    Main_other = 5
+    Number = 6
+
+
+class Token_state:
+    """Stateful logits processor constraining generation to the expected
+    JSON shape ``{" 1":"main"," 2":"other",...}`` with increasing item keys.
+    """
+
+    def __init__(self, model_path):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        # Map each decoding state to the literal strings allowed next.
+        token_id_map = {
+            State.Left_bracket: ['{'],
+            State.Right_bracket: ['}'],
+            State.Space_quote: [' "'],
+            State.Quote_colon_quote: ['":"'],
+            State.Quote_comma: ['",'],
+            State.Main_other: ['main', 'other'],
+            State.Number: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
+        }
+        # NOTE(review): only the FIRST token id of each encoding is kept —
+        # assumes each literal encodes to a single token; confirm for the
+        # target tokenizer.
+        self.token_id_map = {k: [self.tokenizer.encode(v)[0] for v in token_id_map[k]] for k in token_id_map}
+
+    def mask_other_logits(self, logits: torch.Tensor, remained_ids: List[int]):
+        """Return logits with everything except ``remained_ids`` set to -inf."""
+        remained_logits = {ids: logits[ids].item() for ids in remained_ids}
+        new_logits = torch.ones_like(logits) * -float('inf')
+        for id in remained_ids:
+            new_logits[id] = remained_logits[id]
+        return new_logits
+
+    def calc_max_count(self, prompt_token_ids: List[int]):
+        """Scan the prompt backwards for the last item number after a marker.
+
+        NOTE(review): ``pattern_list`` holds hard-coded token ids of a fixed
+        marker that precedes the item count in the prompt — verify these ids
+        against the deployed tokenizer.
+        """
+        pattern_list = [716, 1203, 842, 428]
+        for idx in range(len(prompt_token_ids) - len(pattern_list), -1, -1):
+            if all(prompt_token_ids[idx + i] == pattern_list[i] for i in range(len(pattern_list))):
+                num_idx = idx + len(pattern_list)
+                num_ids = []
+                while num_idx < len(prompt_token_ids) and prompt_token_ids[num_idx] in self.token_id_map[State.Number]:
+                    num_ids.append(prompt_token_ids[num_idx])
+                    num_idx += 1
+                # return int(self.tokenizer.decode(num_ids)) + 1
+                return int(self.tokenizer.decode(num_ids))
+        return 1
+
+    def find_last_complete_number(self, input_ids: List[int]):
+        """Return (last complete number, position kind, trailing number).
+
+        Walks backwards over generated ids; 'tail' means the number at the
+        very end of the output is already complete.
+        """
+        if not input_ids:
+            return -1, 'null', -1
+
+        # Collect the digit run at the very end of the output, if any.
+        tail_number_ids = []
+        last_idx = len(input_ids) - 1
+        while last_idx >= 0 and input_ids[last_idx] in self.token_id_map[State.Number]:
+            tail_number_ids.insert(0, input_ids[last_idx])
+            last_idx -= 1
+
+        tail_number = int(self.tokenizer.decode(tail_number_ids)) if tail_number_ids else -1
+
+        # Skip non-digit tokens between the tail number and the previous one.
+        while last_idx >= 0 and input_ids[last_idx] not in self.token_id_map[State.Number]:
+            last_idx -= 1
+
+        if last_idx < 0:
+            return tail_number, 'tail', tail_number
+
+        # Collect the previous complete number.
+        last_number_ids = []
+        while last_idx >= 0 and input_ids[last_idx] in self.token_id_map[State.Number]:
+            last_number_ids.insert(0, input_ids[last_idx])
+            last_idx -= 1
+
+        last_number = int(self.tokenizer.decode(last_number_ids))
+
+        if tail_number == last_number + 1:
+            return tail_number, 'tail', tail_number
+        return last_number, 'non_tail', tail_number
+
+    def process_logit(self, prompt_token_ids: List[int], input_ids: List[int], logits: torch.Tensor):
+        """vLLM logits processor: allow only tokens valid in the current state.
+
+        Enforced grammar: '{' (' "' NUMBER '":"' main|other '",')* '}'.
+        """
+        if not input_ids:
+            return self.mask_other_logits(logits, self.token_id_map[State.Left_bracket])
+
+        last_token = input_ids[-1]
+
+        if last_token == self.token_id_map[State.Right_bracket][0]:
+            # NOTE(review): 151645 is presumably the model's <|im_end|>/EOS
+            # token id (Qwen family) — confirm for the deployed model.
+            return self.mask_other_logits(logits, [151645])
+        elif last_token == self.token_id_map[State.Left_bracket][0]:
+            return self.mask_other_logits(logits, self.token_id_map[State.Space_quote])
+        elif last_token == self.token_id_map[State.Space_quote][0]:
+            last_number, _, _ = self.find_last_complete_number(input_ids)
+            # next_char = str(last_number + 1)[0]
+            if last_number == -1:
+                next_char = '1'
+            else:
+                next_char = str(last_number + 1)[0]
+
+            return self.mask_other_logits(logits, self.tokenizer.encode(next_char))
+        elif last_token in self.token_id_map[State.Number]:
+            last_number, state, tail_number = self.find_last_complete_number(input_ids)
+            if state == 'tail':
+                return self.mask_other_logits(logits, self.token_id_map[State.Quote_colon_quote])
+            else:
+                # Emit the next digit of a multi-digit item number.
+                next_str = str(last_number + 1)
+                next_char = next_str[len(str(tail_number))]
+                return self.mask_other_logits(logits, self.tokenizer.encode(next_char))
+        elif last_token == self.token_id_map[State.Quote_colon_quote][0]:
+            return self.mask_other_logits(logits, self.token_id_map[State.Main_other])
+        elif last_token in self.token_id_map[State.Main_other]:
+            return self.mask_other_logits(logits, self.token_id_map[State.Quote_comma])
+        elif last_token == self.token_id_map[State.Quote_comma][0]:
+            last_number, _, _ = self.find_last_complete_number(input_ids)
+            max_count = self.calc_max_count(prompt_token_ids)
+            if last_number >= max_count:
+                return self.mask_other_logits(logits, self.token_id_map[State.Right_bracket])
+            else:
+                return self.mask_other_logits(logits, self.token_id_map[State.Space_quote])
+
+        return logits
+
+
+def reformat_map(text):
+    """Convert the model's JSON string to ``{'item_id <k>': 1|0}``.
+
+    'main' maps to 1, anything else to 0; invalid JSON yields {}.
+    """
+    try:
+        data = json.loads(text)
+        return {'item_id ' + k: 1 if v == 'main' else 0 for k, v in data.items()}
+    except json.JSONDecodeError:
+        return {}
+
+
+def main(simplified_html: str, model: object, tokenizer: object, model_path: str):
+ # tokenizer = AutoTokenizer.from_pretrained("/share/liukaiwen/models/qwen3-0.6b/checkpoint-3296", trust_remote_code=True)
+ # simplified_html = simplify_html(ori_html)
+ # print("sim_html length", len(simplified_html))
+ if SamplingParams is None:
+ raise RuntimeError(
+ '当前环境未安装 vLLM 或安装失败,无法执行模型推理。建议在 Linux+NVIDIA GPU 环境安装 vLLM,' +
+ '或在 API 中使用占位/替代推理实现。原始导入错误: {}'.format('_VLLM_IMPORT_ERROR')
+ )
+ prompt = create_prompt(simplified_html)
+ chat_prompt = add_template(prompt, tokenizer)
+
+ if config.use_logits_processor:
+ token_state = Token_state(model_path)
+ sampling_params = SamplingParams(
+ temperature=config.temperature,
+ top_p=config.top_p,
+ max_tokens=config.max_output_tokens,
+ logits_processors=[token_state.process_logit]
+ )
+ else:
+ sampling_params = SamplingParams(
+ temperature=config.temperature,
+ top_p=config.top_p,
+ max_tokens=config.max_output_tokens
+ )
+
+ output = model.generate(chat_prompt, sampling_params)
+ output_json = clean_output(output)
+ return output_json
+
+
+def clean_output(output):
+ prediction = output[0].outputs[0].text
+
+ # Extract JSON from prediction
+ start_idx = prediction.rfind('{')
+ end_idx = prediction.rfind('}') + 1
+
+ if start_idx != -1 and end_idx != -1:
+ json_str = prediction[start_idx:end_idx]
+ json_str = re.sub(r',\s*}', '}', json_str) # Clean JSON
+ try:
+ json.loads(json_str) # Validate
+ except Exception:
+ json_str = '{}'
+ else:
+ json_str = '{}'
+
+ return json_str
+
+
+class InferenceService:
+    """Inference service wrapper exposed to HTMLService.
+
+    Lazily loads the tokenizer and vLLM model on first use; falls back to
+    a placeholder (empty) result when the model is unavailable.
+    """
+
+    def __init__(self):
+        """Set up lazy-initialization state; no model is loaded here."""
+        self._llm = None
+        self._tokenizer = None
+        self._initialized = False
+        self._init_lock = None  # created lazily so __init__ needs no event loop
+        # NOTE(review): ``_model_path`` is never used — _init_model assigns
+        # ``self.model_path`` instead; the two names should be unified.
+        self._model_path = None
+
+    async def warmup(self):
+        """Eagerly initialize the model at service startup (async)."""
+        await self._ensure_initialized()
+
+    async def _ensure_initialized(self):
+        """Initialize the model exactly once, guarded by an asyncio lock."""
+        if not self._initialized:
+            if self._init_lock is None:
+                import asyncio
+                self._init_lock = asyncio.Lock()
+
+            async with self._init_lock:
+                if not self._initialized:  # double-checked under the lock
+                    await self._init_model()
+                    self._initialized = True
+
+    async def _init_model(self):
+        """Load tokenizer and vLLM model from MODEL_PATH env or config file."""
+        try:
+            llm_config = load_config(suppress_error=True)
+            self.model_path = os.environ['MODEL_PATH'] if 'MODEL_PATH' in os.environ else llm_config.get('model_path',
+                                                                                                          None)
+            if self.model_path is None:
+                raise RuntimeError('model_path为空,未配置模型路径')
+            # NOTE(review): dead guard — SamplingParams is imported
+            # unconditionally at module top, so it can never be None here,
+            # and '_VLLM_IMPORT_ERROR' is a literal string, not a variable.
+            if SamplingParams is None:
+                raise RuntimeError(
+                    '当前环境未安装 vLLM 或安装失败,无法执行模型推理。建议在 Linux+NVIDIA GPU 环境安装 vLLM,' +
+                    '或在 API 中使用占位/替代推理实现。原始导入错误: {}'.format('_VLLM_IMPORT_ERROR')
+                )
+
+            # Tokenizer
+            self._tokenizer = AutoTokenizer.from_pretrained(
+                self.model_path,
+                trust_remote_code=True
+            )
+
+            # vLLM engine
+            self._llm = LLM(
+                model=self.model_path,
+                trust_remote_code=True,
+                dtype=config.dtype,
+                tensor_parallel_size=config.tensor_parallel_size,
+                # remove in production
+                max_model_len=config.max_tokens,  # cap context length to avoid OOM
+            )
+
+            logger.info(f'模型初始化成功: {self.model_path}')
+
+        except Exception as e:
+            logger.error(f'模型初始化失败: {e}')
+            # Leave model unset; later calls return the placeholder result.
+            self._llm = None
+            self._tokenizer = None
+
+    async def inference(self, simplified_html: str, options: dict | None = None) -> dict:
+        """Run inference; return a placeholder result if the model is unavailable."""
+        try:
+            await self._ensure_initialized()
+
+            if self._llm is None or self._tokenizer is None:
+                logger.error('模型未初始化,返回占位结果')
+                return self._get_placeholder_result()
+
+            # Real inference path
+            return await self._run_real_inference(simplified_html, options)
+
+        except Exception as e:
+            logger.error(f'推理过程出错: {e}')
+            return self._get_placeholder_result()
+
+    async def _run_real_inference(self, simplified_html: str, options: dict | None = None) -> dict:
+        """Build the prompt, generate with vLLM, and map the output to labels."""
+        try:
+            # Prompt construction
+            prompt = create_prompt(simplified_html)
+            chat_prompt = add_template(prompt, self._tokenizer)
+
+            # Sampling parameters (optionally JSON-constrained decoding)
+            if config.use_logits_processor:
+                token_state = Token_state(self.model_path)
+                sampling_params = SamplingParams(
+                    temperature=config.temperature,
+                    top_p=config.top_p,
+                    max_tokens=config.max_output_tokens,
+                    logits_processors=[token_state.process_logit]
+                )
+            else:
+                sampling_params = SamplingParams(
+                    temperature=config.temperature,
+                    top_p=config.top_p,
+                    max_tokens=config.max_output_tokens
+                )
+
+            # Generate; NOTE(review): this is a blocking call on the event loop
+            start_time = time.time()
+            output = self._llm.generate(chat_prompt, sampling_params)
+            end_time = time.time()
+            output_json = clean_output(output)
+
+            # Map JSON labels to {'item_id <k>': 0|1}
+            result = reformat_map(output_json)
+            logger.info(f'推理完成,结果: {result}, 耗时: {end_time - start_time}秒')
+            return result
+
+        except Exception as e:
+            logger.error(f'真实推理失败: {e}')
+            return self._get_placeholder_result()
+
+    def _get_placeholder_result(self) -> dict:
+        """Placeholder result when the model cannot run."""
+        return {}
+
+
+if __name__ == '__main__':
+ config = InferenceConfig(
+ model_path='',
+ output_path='',
+ use_logits_processor=True,
+ num_workers=8,
+ max_tokens=26000,
+ temperature=0,
+ top_p=0.95,
+ max_output_tokens=8192,
+ tensor_parallel_size=1,
+ template=True,
+ )
+ try:
+ llm_config = load_config(suppress_error=True)
+ model_path = llm_config.get('model_path', None)
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+ model = LLM(model=model_path,
+ trust_remote_code=True,
+ dtype=config.dtype,
+ # 设置最大模型长度
+ max_model_len=config.max_tokens,
+ tensor_parallel_size=config.tensor_parallel_size)
+
+ simplified_html = 'Hello World
'
+ response_json = main(simplified_html, model, tokenizer)
+ llm_response_dict = reformat_map(response_json)
+ except Exception:
+ raise
+ finally:
+ import torch.distributed as dist
+
+ # 在程序结束前添加
+ if dist.is_initialized():
+ dist.destroy_process_group()
diff --git a/llm_web_kit/config/pipe_tpl/model.jsonc b/llm_web_kit/config/pipe_tpl/model.jsonc
new file mode 100644
index 00000000..ff77efe5
Binary files /dev/null and b/llm_web_kit/config/pipe_tpl/model.jsonc differ
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index f6ddacfe..ba3671e1 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -1,8 +1,10 @@
+aiohttp==3.12.15
beautifulsoup4>=4.12.2
boto3==1.28.43
cairosvg==2.7.1
click==8.1.8
commentjson==0.9.0
+fastapi==0.116.1
fasttext-wheel==0.9.2
filelock==3.16.1
html-alg-lib==2.0.2
@@ -22,9 +24,14 @@ orjson==3.11.0
overrides==7.7.0
py-asciimath==0.3.0
pyahocorasick==2.0.0
+pydantic==2.11.7
+pydantic-settings==2.10.1
+python-dotenv==1.1.1
+python-multipart==0.0.20
scikit-learn>=1.6.1
selectolax==0.3.33
torch>=2.3.0
tqdm==4.67.1
transformers==4.40.2
+uvicorn[standard]==0.35.0
xformers>=0.0.27