diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 4c51bda0..e28f088b 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -2,6 +2,7 @@ from typing import List, Tuple import commentjson as json +from lxml.html import HtmlElement from overrides import override from llm_web_kit.config.cfg_reader import load_config @@ -20,6 +21,7 @@ from llm_web_kit.extractor.html.recognizer.title import TitleRecognizer from llm_web_kit.extractor.html.recognizer.video import VideoRecognizer from llm_web_kit.input.datajson import ContentList, DataJson +from llm_web_kit.libs.doc_element_type import DocElementType from llm_web_kit.libs.html_utils import element_to_html, html_to_element from llm_web_kit.libs.path_lib import get_py_pkg_root_dir @@ -92,12 +94,12 @@ def _do_extract(self, data_json: DataJson) -> DataJson: page_layout_type:str = data_json.get('page_layout_type', HTMLPageLayoutType.LAYOUT_ARTICLE) # 默认是文章类型 main_html, method, title = self._extract_main_html(raw_html, base_url, page_layout_type) - parsed_html = [(main_html,raw_html)] + main_html_element = html_to_element(main_html) + parsed_html = [(main_html_element, raw_html)] for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: parsed_html = extract_func(base_url, parsed_html, raw_html) - content_list:ContentList = self._export_to_content_list(base_url, parsed_html, raw_html) data_json['content_list'] = content_list data_json['title'] = title @@ -119,7 +121,7 @@ def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) - dict_result = self.__magic_html_extractor.extract(raw_html, base_url=base_url, precision=False, html_type=page_layout_type) return dict_result['html'], dict_result['xp_num'], dict_result.get('title', '') - def _extract_code(self, base_url:str, html_lst:List[Tuple[str,str]], 
raw_html:str) -> List[Tuple[str,str]]: + def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: """从html文本中提取代码. Args: @@ -256,43 +258,43 @@ def __is_valid_node(self, node: dict) -> bool: if not node: raise HtmlFileExtractorException('node is empty') node_type = node.get('type') - valid_types = {'list', 'code', 'equation-interline', 'image', 'table', 'title', 'paragraph'} + valid_types = {DocElementType.TITLE, DocElementType.LIST, DocElementType.CODE, DocElementType.EQUATION_INTERLINE, DocElementType.IMAGE, DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.PARAGRAPH} if node_type not in valid_types: raise HtmlFileExtractorException(f'Invalid node type: {node_type}') # 检查列表类型的节点 - if node.get('type') == 'list': + if node.get('type') == DocElementType.LIST: items = node.get('content', {}).get('items', []) # 过滤掉None、空列表,以及只包含None或空值的列表 return bool(items) and any( isinstance(item, (dict, list)) and bool(item) for item in items) # 检测code类型的节点 - if node.get('type') == 'code': + if node.get('type') == DocElementType.CODE: code_content = node.get('content', {}).get('code_content') # 如果代码内容为None或空字符串,则视为无效节点 return bool(code_content and code_content.strip()) # 检测行间公式类型的节点 - if node.get('type') == 'equation-interline': + if node.get('type') == DocElementType.EQUATION_INTERLINE: math_content = node.get('content', {}).get('math_content') # 如果公式内容为None或空字符串,则视为无效节点 return bool(math_content and math_content.strip()) # 检测image类型的节点 - if node.get('type') == 'image': + if node.get('type') == DocElementType.IMAGE: content = node.get('content', {}) # 检查url、path或data字段是否至少有一个不为空 return bool(content.get('url') or content.get('path') or content.get('data')) # 检测table类型的节点 - if node.get('type') == 'table': + if node.get('type') == DocElementType.SIMPLE_TABLE or node.get('type') == DocElementType.COMPLEX_TABLE: html = node.get('content', {}).get('html') # 
如果表格的html内容为None或空字符串,则视为无效节点 return bool(html and html.strip()) # 检测title类型的节点 - if node.get('type') == 'title': + if node.get('type') == DocElementType.TITLE: title_content = node.get('content', {}).get('title_content') # 如果标题内容为None或空字符串,则视为无效节点 return bool(title_content and title_content.strip()) # 检测段落类型的节点 - if node.get('type') == 'paragraph': + if node.get('type') == DocElementType.PARAGRAPH: content = node.get('content', []) # 检查content列表是否存在且不为空,并且至少有一个非空的内容项 return bool(content) and any( @@ -301,7 +303,7 @@ def __is_valid_node(self, node: dict) -> bool: ) return True - def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> ContentList: + def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> ContentList: """将解析结果存入content_list格式中. Args: @@ -318,7 +320,9 @@ def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], r ccnode_html, cc_tag = self.__get_cc_node(parsed_html) parser:BaseHTMLElementRecognizer = self.__to_content_list_mapper.get(cc_tag) if parser: - node = parser.to_content_list_node(base_url, ccnode_html, raw_html) + raw_html_str = element_to_html(raw_html) + # raw_html_str = raw_html + node = parser.to_content_list_node(base_url, ccnode_html, raw_html_str) if node and self.__is_valid_node(node): one_page.append(node) else: @@ -326,7 +330,7 @@ def _export_to_content_list(self, base_url:str, html_lst:List[Tuple[str,str]], r content_list = ContentList([one_page]) # 对于网页来说仅有一页,如果多页,则剩下的每个都是一个论坛的回复 return content_list - def __get_cc_node(self, html:str) -> (str, str): + def __get_cc_node(self, html:HtmlElement) -> Tuple[HtmlElement, str]: """获取html文本的根标签名。只获取一个,如果html文本中包含多个cc标签,则抛异常。 Args: @@ -335,7 +339,8 @@ def __get_cc_node(self, html:str) -> (str, str): Returns: str: 根标签名 """ - el = html_to_element(html) + # el = html_to_element(html) + el = html if el.tag in self.__to_content_list_mapper.keys(): return html, el.tag else: @@ -346,7 
+351,8 @@ def __get_cc_node(self, html:str) -> (str, str): raise HtmlFileExtractorException(f'html文本中没有cc标签: {html}') if len(nodes) > 3: raise HtmlFileExtractorException(f'html文本中包含多个cc标签: {html}') - return element_to_html(nodes[0]), nodes[0].tag + # return element_to_html(nodes[0]), nodes[0].tag + return nodes[0], nodes[0].tag def __build_extractor(self): """ diff --git a/llm_web_kit/extractor/html/recognizer/audio.py b/llm_web_kit/extractor/html/recognizer/audio.py index 24acc343..f9e74a7b 100644 --- a/llm_web_kit/extractor/html/recognizer/audio.py +++ b/llm_web_kit/extractor/html/recognizer/audio.py @@ -1,5 +1,6 @@ from typing import List, Tuple +from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import \ @@ -9,7 +10,7 @@ class AudioRecognizer(BaseHTMLElementRecognizer): """解析音频元素.""" @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: """父类,解析音频元素. Args: @@ -22,5 +23,15 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raise NotImplementedError @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: + """ + 把音频元素转换为content list node. 
+ Args: + base_url: + parsed_content: + raw_html_segment: + + Returns: + + """ raise NotImplementedError diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index 2cd91a19..08d3f9e2 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -86,10 +86,10 @@ class MATH_TYPE_PATTERN: ['\\[', '\\]'], ['$$', '$$'], ['[tex]', '[/tex]'], # 这个网站自定义的分割,https://www.physicsforums.com/threads/turning-to-a-single-logarithm-then-simply.269419/ - ['\\begin{equation}', '\\end{equation}'], - ['\\begin{align}', '\\end{align}'], - ['\\begin{alignat}', '\\end{alignat}'], - ['\\begin{array}', '\\end{array}'], + # ['\\begin{equation}', '\\end{equation}'], + # ['\\begin{align}', '\\end{align}'], + # ['\\begin{alignat}', '\\end{alignat}'], + # ['\\begin{array}', '\\end{array}'], # 添加通用的begin/end匹配 ['\\begin{.*?}', '\\end{.*?}'], ], diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py index 4a638fee..fe5744a7 100644 --- a/llm_web_kit/extractor/html/recognizer/cccode.py +++ b/llm_web_kit/extractor/html/recognizer/cccode.py @@ -7,19 +7,17 @@ tag_pre_code) from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) -from llm_web_kit.libs.html_utils import element_to_html, html_to_element class CodeRecognizer(BaseHTMLElementRecognizer): """解析代码元素.""" - @override def recognize( self, base_url: str, - main_html_lst: List[Tuple[str, str]], - raw_html: str, - ) -> List[Tuple[str, str]]: + main_html_lst: List[Tuple[HtmlElement, HtmlElement]], + raw_html: str + ) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析代码元素. Args: @@ -38,7 +36,8 @@ def recognize( if self.is_cc_html(html): rtn.append((html, raw_html)) continue - root: HtmlElement = html_to_element(html) + # root: HtmlElement = html_to_element(html) + root = html while True: # 最常见: #
@@ -77,31 +76,36 @@ def remove_empty_code(r: HtmlElement): remove_empty_code(x) remove_empty_code(root) - - html_str: str = element_to_html(root) - - rtn.extend(BaseHTMLElementRecognizer.html_split_by_tags(html_str, CCTag.CC_CODE)) - + # html_str: str = element_to_html(root) + rtn.extend(BaseHTMLElementRecognizer.html_split_by_tags(root, CCTag.CC_CODE)) return rtn @override - def to_content_list_node(self, base_url:str, parsed_content: str, raw_html_segment:str) -> dict: - code_node: HtmlElement = html_to_element(parsed_content) + def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict: + """ + 把代码元素转换为content list node. + Args: + base_url: + parsed_content: HtmlElement对象 + raw_html_segment: + + Returns: + """ d = { 'type': 'code', # "bbox": [], 'raw_content': raw_html_segment, - 'inline': code_node.get('inline', 'false') == 'true', + 'inline': parsed_content.get('inline', 'false') == 'true', 'content': { - 'code_content': code_node.text, + 'code_content': parsed_content.text, }, } - if lang := code_node.get('language', None): + if lang := parsed_content.get('language', None): d['content']['language'] = lang - if by := code_node.get('by', None): + if by := parsed_content.get('by', None): d['content']['by'] = by return d diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index 91ba602d..62701417 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -26,7 +26,7 @@ def __init__(self): self.cm = CCMATH() @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_html: str) -> List[Tuple[str, str]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析数学公式元素. 
Args: @@ -56,7 +56,7 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm return result @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md 例如代码的返回格式: @@ -78,7 +78,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm Returns: dict: content_list_node """ - tree = self._build_html_tree(parsed_content) + tree = parsed_content if tree is None: raise HtmlMathRecognizerException(f'Failed to load html: {parsed_content}') @@ -125,7 +125,7 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe """ # node是从cc_html中解析出来的lxml节点 self.cm.url = base_url - tree = self._build_html_tree(cc_html) + tree = cc_html math_render_type = math_render.get_render_type() if tree is None: raise HtmlMathRecognizerException(f'Failed to load html: {cc_html}') @@ -171,20 +171,20 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe # 保存处理后的html # with open('math_physicsforums_1_processed.html', 'w') as f: # f.write(self._element_to_html(tree)) - return self.html_split_by_tags(self._element_to_html(tree), [CCTag.CC_MATH_INTERLINE]) + return self.html_split_by_tags(tree, [CCTag.CC_MATH_INTERLINE]) - def process_mathjax_html(self, cc_html: str, o_html: str, math_render: BaseMathRender, base_url: str) -> List[Tuple[str, str]]: + def process_mathjax_html(self, cc_html: HtmlElement, o_html: HtmlElement, math_render: BaseMathRender, base_url: str) -> List[Tuple[HtmlElement, HtmlElement]]: """处理mathjax有自定义标识符的数学公式.""" self.cm.url = base_url try: - tree = self._build_html_tree(cc_html) + tree = cc_html math_render.find_math(tree) # with open('math_physicsforums_1_processed.html', 'w') as f: # 
f.write(self._element_to_html(tree)) except Exception as e: raise HtmlMathMathjaxRenderRecognizerException(f'处理mathjax有自定义标识符的数学公式失败: {e}') - return self.html_split_by_tags(self._element_to_html(tree), [CCTag.CC_MATH_INTERLINE]) + return self.html_split_by_tags(tree, [CCTag.CC_MATH_INTERLINE]) if __name__ == '__main__': diff --git a/llm_web_kit/extractor/html/recognizer/code/tag_code.py b/llm_web_kit/extractor/html/recognizer/code/tag_code.py index 98d9aa3f..760d312b 100644 --- a/llm_web_kit/extractor/html/recognizer/code/tag_code.py +++ b/llm_web_kit/extractor/html/recognizer/code/tag_code.py @@ -1,3 +1,5 @@ +from typing import Optional + from lxml.html import HtmlElement from llm_web_kit.extractor.html.recognizer.code.common import ( @@ -29,111 +31,6 @@ def __is_all_chars_in_code_element(node: HtmlElement) -> bool: return full_text == code_text -def __group_code_by_distance( - root: HtmlElement, - node_paths: list[list[str]], - dist: list[list[int]], -) -> list[str]: - father = list(range(len(node_paths))) - - def get_father(x: int) -> int: - if father[x] == x: - return x - father[x] = get_father(father[x]) - return father[x] - - edges: list[tuple[int, int, int]] = [] - root_paths: list[list[str]] = [] - for i in range(len(node_paths)): - root_paths.append(node_paths[i]) - for j in range(i + 1, len(node_paths)): - edges.append((dist[i][j], i, j)) - edges = sorted(edges) - - used_edge = 0 - meet = set() - for edge in edges: - _, i, j = edge - i = get_father(i) - j = get_father(j) - if i != j and (i, j) not in meet: - common_node_idx = min(len(root_paths[i]), len(root_paths[j])) - for idx, (x, y) in enumerate(zip(root_paths[i], root_paths[j])): - if idx == 0: - continue - if x != y: - common_node_idx = idx - break - maybe_tree_root = __get_html_element(root, root_paths[i][:common_node_idx]) - - if len(maybe_tree_root.xpath(f'.//{CCTag.CC_CODE}|.//{CCTag.CC_CODE_INLINE}')) > 0: - meet.add((i, j)) - continue - - if not __is_all_chars_in_code_element(maybe_tree_root): 
- meet.add((i, j)) - continue - - root_paths[i] = root_paths[i][:common_node_idx] - used_edge += 1 - father[j] = i - - root_paths = [ - root_path for i, root_path in enumerate(root_paths) if i == get_father(i) - ] - - removed = set() - root_paths_joined = sorted( - list(set(['/'.join(root_path) for root_path in root_paths])) - ) - for x in root_paths_joined: - for y in root_paths_joined: - if len(x) < len(y) and y.startswith(x): - removed.add(y) - return [x for x in root_paths_joined if x not in removed] - - -def __compute_distance_matrix(node_paths: list[list[str]]) -> list[list[int]]: - """ - 计算节点路径的距离矩阵,具体步骤: - 1. 创建距离矩阵,计算每两个节点之间的距离 - 2. 距离计算方法:从共同祖先节点到两个节点的路径长度之和 - 例如: - 节点1路径:/html/body/div/code - 节点2路径:/html/body/pre/code - 共同祖先到 body,距离为 2(div->code) + 2(pre->code) = 4 - 节点1和节点2的距离为 4 - - 距离矩阵(对称矩阵): - [0, 1, 2, 3], - [1, 0, 1, 2], - [2, 1, 0, 1], - [3, 2, 1, 0] - - Args: - node_paths: 节点路径 - - Returns: - list[list[int]]: 距离矩阵 - """ - def get_lca_depth(path1: list[str], path2: list[str]) -> int: - for i, (x, y) in enumerate(zip(path1, path2)): - if x != y: - return i - return min(len(path1), len(path2)) - - n = len(node_paths) - dist = [[0] * n for _ in range(n)] - - for i in range(n): - for j in range(i + 1, n): - lca_depth = get_lca_depth(node_paths[i], node_paths[j]) - d = len(node_paths[i]) + len(node_paths[j]) - 2 * lca_depth - dist[i][j] = dist[j][i] = d - - return dist - - def __get_code_node_paths(html_el: HtmlElement) -> list[list[str]]: """获取 html_el 中所有 code 标签的路径 只获取最外层的code标签, 如果code标签内还有code标签,则不获取。 @@ -223,6 +120,49 @@ def __detect_inline_code(root: HtmlElement, node_paths: list[list[str]]) -> tupl return new_node_paths, inline_code +def __group_code(root: HtmlElement, node_paths: list[list[str]]) -> list[str]: + root_paths = [] + + def next_parent(code_node: HtmlElement, code_tags: int) -> tuple[Optional[HtmlElement], int]: + parent: Optional[HtmlElement] = code_node.getparent() + while parent is not None: + new_code_tags = 
len(parent.xpath('.//code')) + if new_code_tags == code_tags: + parent = parent.getparent() + else: + return parent, new_code_tags + return None, 0 + + while len(node_paths): + code_node = __get_html_element(root, node_paths[0]) + code_tags = len(code_node.xpath('.//code')) + + parent, new_code_tags = next_parent(code_node, code_tags) + while parent is not None: + if not __is_all_chars_in_code_element(parent): + break + + if len(parent.xpath(f'.//{CCTag.CC_CODE}|.//{CCTag.CC_CODE_INLINE}')) > 0: + break + + code_node = parent + code_tags = new_code_tags + + parent, new_code_tags = next_parent(code_node, code_tags) + + root_path = code_node.getroottree().getpath(code_node) + root_paths.append(root_path) + + new_node_path = [] + for node_path in node_paths: + if '/'.join(node_path).startswith(root_path): + continue + new_node_path.append(node_path) + node_paths = new_node_path + + return root_paths + + def modify_tree(root: HtmlElement) -> None: """将 html 树中所有 code 标签转换为代码块. @@ -239,8 +179,8 @@ def modify_tree(root: HtmlElement) -> None: elif len(node_paths) == 1: tree_roots = ['/'.join(node_paths[0])] else: - dist_matrix = __compute_distance_matrix(node_paths) # 计算距离矩阵 - tree_roots = __group_code_by_distance(root, node_paths, dist_matrix) # 根据距离矩阵,对code标签进行分组 + tree_roots = __group_code(root, node_paths) # 根据距离矩阵,对code标签进行分组 + tree_roots = sorted(tree_roots) nodes = __get_code_blocks_nodes(root, tree_roots) # 获取所有需要被转换为代码块的节点,并进行标签替换 for node in nodes: diff --git a/llm_web_kit/extractor/html/recognizer/image.py b/llm_web_kit/extractor/html/recognizer/image.py index 7be5b862..af362110 100644 --- a/llm_web_kit/extractor/html/recognizer/image.py +++ b/llm_web_kit/extractor/html/recognizer/image.py @@ -18,7 +18,7 @@ class ImageRecognizer(BaseHTMLElementRecognizer): IMG_LABEL = ['.jpg', '.jpeg', '.png', '.gft', '.webp', '.bmp', '.svg', 'data:image', '.gif'] # '.pdf' @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> 
dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md 例如代码的返回格式: @@ -43,7 +43,8 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm Returns: dict: content_list_node """ - html_obj = self._build_html_tree(parsed_content) + # html_obj = self._build_html_tree(parsed_content) + html_obj = parsed_content if html_obj.tag == CCTag.CC_IMAGE: return self.__ccimg_to_content_list(raw_html_segment, html_obj) @@ -66,7 +67,7 @@ def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) return result @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_html: str) -> List[Tuple[str, str]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析图片元素. 
Args: @@ -88,9 +89,10 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm ccimg_html.append(html_li) return ccimg_html - def __parse_html_img(self, base_url: str, html_str: Tuple[str, str]) -> List[Tuple[str, str]]: + def __parse_html_img(self, base_url: str, html_str: Tuple[HtmlElement, HtmlElement]) -> List[Tuple[HtmlElement, HtmlElement]]: """解析html,获取img标签.""" - html_obj = self._build_html_tree(html_str[0]) + # html_obj = self._build_html_tree(html_str[0]) + html_obj = html_str[0] image_related_selectors = [ '//*[contains(@class, "image-embed") or contains(@id, "image-embed")]', # 可能包含嵌入图片的自定义标签 '//*[starts-with(@src, "data:image/") and not(self::img)]', @@ -168,7 +170,8 @@ def __parse_img_elements(self, base_url: str, img_elements: HtmlElement, html_ob self._replace_element(elem, new_ccimage) if is_valid_img: - updated_html = self._element_to_html(html_obj) + # updated_html = self._element_to_html(html_obj) + updated_html = html_obj return (updated_html, img_tag) else: return (None, None) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 7694ba1a..2615f60e 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,7 +1,7 @@ import json from typing import Any, List, Tuple -from lxml.etree import _Element as HtmlElement +from lxml.html import HtmlElement from overrides import override from llm_web_kit.exception.exception import HtmlListRecognizerException @@ -13,7 +13,7 @@ class ListRecognizer(BaseHTMLElementRecognizer): """解析列表元素.""" - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """专化为列表元素的解析. 
Args: @@ -23,6 +23,8 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm Returns: """ + if not isinstance(parsed_content, HtmlElement): + raise HtmlListRecognizerException(f'parsed_content 必须是 HtmlElement 类型,而不是 {type(parsed_content)}') ordered, content_list, _, list_nest_level = self.__get_attribute(parsed_content) ele_node = { 'type': DocElementType.LIST, @@ -37,7 +39,7 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm return ele_node @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_html: str) -> List[Tuple[str, str]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析列表元素. Args: @@ -57,7 +59,7 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[str, str]], raw_htm new_html_lst.extend(lst) return new_html_lst - def _extract_list(self, raw_html: str) -> List[Tuple[str, str]]: + def _extract_list(self, raw_html: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: """提取列表元素. 不支持嵌套列表,如果有嵌套的情况,则内部列表将作为一个单独的段落,内部列表的每个列表项作为一个单独的句子,使用句号结尾。 列表在html中有以下几个标签: @@ -70,12 +72,13 @@ def _extract_list(self, raw_html: str) -> List[Tuple[str, str]]: Returns: List[Tuple[str, str]]: 列表元素, 第一个str是xxx, 第二个str是原始的html内容 """ - tree = self._build_html_tree(raw_html) + # tree = self._build_html_tree(raw_html) + tree = raw_html self.__do_extract_list(tree) # 最后切割html - new_html = self._element_to_html(tree) + # new_html = self._element_to_html(tree) + new_html = tree lst = self.html_split_by_tags(new_html, CCTag.CC_LIST) - return lst def __do_extract_list(self, root:HtmlElement) -> None: @@ -219,7 +222,7 @@ def __extract_list_item_text_recusive(el: HtmlElement) -> list[list]: return text_paragraph - def __get_attribute(self, html:str) -> Tuple[bool, dict, str]: + def __get_attribute(self, html:HtmlElement) -> Tuple[bool, dict, str]: """获取element的属性. 
Args: @@ -228,7 +231,8 @@ def __get_attribute(self, html:str) -> Tuple[bool, dict, str]: Returns: Tuple[str]: 第一个元素是是否有序; 第二个元素是个python list,内部是文本和行内公式,具体格式参考list的content_list定义。第三个元素是列表原始的html内容 """ - ele = self._build_html_tree(html) + # ele = self._build_html_tree(html) + ele = html if ele is not None and ele.tag == CCTag.CC_LIST: ordered = ele.attrib.get('ordered', 'False') in ['True', 'true'] content_list = json.loads(ele.text) diff --git a/llm_web_kit/extractor/html/recognizer/recognizer.py b/llm_web_kit/extractor/html/recognizer/recognizer.py index 736b3637..6ab1a5ef 100644 --- a/llm_web_kit/extractor/html/recognizer/recognizer.py +++ b/llm_web_kit/extractor/html/recognizer/recognizer.py @@ -29,7 +29,7 @@ class BaseHTMLElementRecognizer(ABC): """基本的元素解析类.""" @abstractmethod - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析html中的元素. Args: @@ -38,11 +38,12 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raw_html: 原始完整的html Returns: + List[Tuple[HtmlElement, HtmlElement]]: 处理后的HTML元素列表 """ raise NotImplementedError @abstractmethod - def to_content_list_node(self, base_url:str, parsed_content: str, raw_html_segment:str) -> dict: + def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict: """将content转换成content_list_node. 
每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md 例如代码的返回格式: @@ -119,7 +120,7 @@ def _replace_element(self, element:HtmlElement, cc_element:HtmlElement) -> None: replace_element(element, cc_element) @staticmethod - def html_split_by_tags(html_segment: str, split_tag_names:str | list) -> List[Tuple[str,str]]: + def html_split_by_tags(root: HtmlElement, split_tag_names:str | list) -> List[Tuple[HtmlElement,HtmlElement]]: """根据split_tag_name将html分割成不同的部分. Args: @@ -127,7 +128,7 @@ def html_split_by_tags(html_segment: str, split_tag_names:str | list) -> List[Tu split_tag_names: str|list: 分割标签名, 例如 'p' 或者 'div' 或者 ['p', 'div'] """ copy_attri = True # 是否copy 父节点的属性 - root = html_to_element(html_segment) + # root = html_to_element(html_segment) if isinstance(split_tag_names, str): # 如果参数是str,转换成list split_tag_names = [split_tag_names] @@ -184,7 +185,8 @@ def __split_node(elem: HtmlElement): for sub_elem in elem: if sub_elem.tag in split_tag_names: # previous elements - nodes = raw_nodes = element_to_html(path[0]) + # nodes = raw_nodes = element_to_html(path[0]) + nodes = raw_nodes = path[0] if not __is_element_text_empty(path[0]): yield nodes, raw_nodes @@ -196,7 +198,11 @@ def __split_node(elem: HtmlElement): if not html_source_segment: mylogger.error(f'{sub_elem.tag} has no html attribute') # TODO raise exception - nodes, raw_nodes = element_to_html(path[0]), html_source_segment + # nodes, raw_nodes = element_to_html(path[0]), html_source_segment + if html_source_segment: + nodes, raw_nodes = path[0], html_to_element(html_source_segment) + else: + nodes, raw_nodes = path[0], None # if not __is_element_text_empty(path[0]): yield nodes, raw_nodes # 这个地方无需检查是否为空,因为这个是分割元素,必须返还 @@ -213,7 +219,8 @@ def __split_node(elem: HtmlElement): copied.tail = elem.tail if not path: - nodes = raw_nodes = element_to_html(copied) + nodes = raw_nodes = copied + # raw_nodes = element_to_html(copied) if not __is_element_text_empty(copied): yield nodes, 
raw_nodes @@ -221,30 +228,38 @@ def __split_node(elem: HtmlElement): return rtn @staticmethod - def is_cc_html(html: str, tag_name: str | list = None) -> bool: + def is_cc_html(el: HtmlElement, tag_name: str | list = None) -> bool: """判断html片段是否是cc标签. 判断的时候由于自定义ccmath等标签可能会含有父标签,因此要逐层判断tagname. 含有父html 完整路径的如:...,这种情况也会被识别为cc标签. - TODO 保证进来的cc标签没有父标签,只有一个根标签。 + Args: - html: str: html片段 + el: str|HtmlElement: html片段或HtmlElement对象 tag_name: str|list: cc标签,如ccmath, cccode, 如果指定了那么就只检查这几个标签是否在html里,否则检查所有cc标签 """ - # cc标签是指自定义标签,例如等,输入html片段,判断是否是cc标签 - el = html_to_element(html) if el is None: return False + # 默认cc标签列表 + default_tag_names = [ + CCTag.CC_CODE, CCTag.CC_MATH_INTERLINE, CCTag.CC_IMAGE, CCTag.CC_VIDEO, + CCTag.CC_AUDIO, CCTag.CC_TABLE, CCTag.CC_LIST, CCTag.CC_TEXT, CCTag.CC_TITLE + ] + + # 确定需要检查的标签集合 if tag_name: if isinstance(tag_name, str): - tag_to_check = [tag_name] + tags = {tag_name} else: - tag_to_check = tag_name + tags = set(tag_name) else: - tag_to_check = [CCTag.CC_CODE, CCTag.CC_MATH_INTERLINE, CCTag.CC_IMAGE, CCTag.CC_VIDEO, CCTag.CC_AUDIO, CCTag.CC_TABLE, CCTag.CC_LIST, CCTag.CC_TEXT, CCTag.CC_TITLE] + tags = set(default_tag_names) + + # 如果当前元素的标签匹配,直接返回True + if el.tag in tags: + return True - for tag in tag_to_check: - if el.tag == tag or el.xpath(f'.//{tag}') : - return True - return False + # 构建XPath表达式,检查子元素是否包含目标标签 + xpath_expr = ' or '.join([f'self::{tag}' for tag in tags]) + return bool(el.xpath(f'.//*[{xpath_expr}]')) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index db9351a0..3effe18a 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -9,6 +9,7 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType +from llm_web_kit.libs.html_utils import remove_element class 
TableRecognizer(BaseHTMLElementRecognizer): @@ -16,12 +17,13 @@ class TableRecognizer(BaseHTMLElementRecognizer): def __init__(self): super().__init__() + self.math_recognizer = MathRecognizer() @override def recognize(self, base_url: str, - main_html_lst: List[Tuple[str, str]], - raw_html: str) -> List[Tuple[str, str]]: + main_html_lst: List[Tuple[HtmlElement, HtmlElement]], + raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析表格元素. Args: @@ -30,6 +32,7 @@ def recognize(self, raw_html: 原始完整的html Returns: + List[Tuple[HtmlElement, HtmlElement]]: 处理后的HTML元素列表 """ final_result = list() for cc_html, o_html in main_html_lst: @@ -41,66 +44,59 @@ def recognize(self, return final_result @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: - if not parsed_content: - raise HtmlTableRecognizerException(f'table parsed_content{parsed_content}为空') + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: + if not isinstance(parsed_content, HtmlElement): + raise HtmlTableRecognizerException(f'parsed_content 必须是 HtmlElement 类型,而不是 {type(parsed_content)}') + table_type, table_nest_level, table_body = self.__get_attribute(parsed_content) + + # 确保 table_body 不为 None 且是字符串类型 + html_content = table_body if table_body is not None else '' + # 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串 + if table_type: + cc_table_type = DocElementType.COMPLEX_TABLE + else: + cc_table_type = DocElementType.SIMPLE_TABLE d = { - 'type': DocElementType.TABLE, - # "bbox": [], + 'type': cc_table_type, 'raw_content': raw_html_segment, 'content': { - 'html': table_body, - }, + 'html': html_content, + 'is_complex': table_type, + 'table_nest_level': table_nest_level + } } - d['content']['is_complex'] = table_type - d['content']['table_nest_level'] = table_nest_level return d - def __is_contain_cc_html(self, cc_html: str) -> bool: + def __is_contain_cc_html(self, cc_html: HtmlElement) -> 
bool: """判断html片段是否是cc标签.""" return BaseHTMLElementRecognizer.is_cc_html(cc_html) - def __is_table_empty(self, table) -> bool: - """检查表格是否为空(递归检查嵌套元素) - - :param table: lxml.html.HtmlElement 对象,表示一个 元素 - :return: 如果表格为空,返回 True;否则返回 False - """ - def is_element_empty(elem): - # 检查元素本身的文本内容 - if elem.text and elem.text.strip(): - return False - # 检查所有子元素 - for child in elem.iterchildren(): - # 如果是嵌套表格,递归检查表格是否为空 - if child.tag == 'table': - if not self.__is_table_empty(child): - return False - # 其他元素需要递归检查 - elif not is_element_empty(child): - return False - # 检查尾部文本(如 后的文本) - if elem.tail and elem.tail.strip(): - return False - return True - - # 检查所有单元格 - for cell in table.xpath('.//td | .//th'): - # 检查单元格内容 + def __is_table_empty(self, table: HtmlElement) -> bool: + """table是否为空.""" + # 合并单元格查询 + cells = table.xpath('.//td | .//th') + for cell in cells: if cell.text and cell.text.strip(): return False - # 递归检查子元素 - if not is_element_empty(cell): - return False + stack = [cell] + while stack: + elem = stack.pop() + if elem.text and elem.text.strip(): + return False + if elem.tail and elem.tail.strip(): + return False + # 添加子元素到栈中(倒序保证处理顺序) + stack.extend(reversed(elem.getchildren())) return True - def __is_simple_table(self, tree) -> bool: + def __is_simple_table(self, tree: HtmlElement) -> bool: """处理table元素,判断是是否复杂:是否包含合并单元格.""" - cells = tree.xpath('.//td') + tree.xpath('.//th') + print('tree', self._element_to_html(tree)) + cells = tree.xpath('.//td | .//th') for cell in cells: - colspan_str = cell.get('colspan', '1') - rowspan_str = cell.get('rowspan', '1') + colspan_str = cell.get('colspan', '1').strip('"\'\\') + rowspan_str = cell.get('rowspan', '1').strip('"\'\\') try: colspan = int(colspan_str) rowspan = int(rowspan_str) @@ -111,34 +107,44 @@ def __is_simple_table(self, tree) -> bool: return False return True - def __is_table_nested(self, element) -> int: - """计算表格的嵌套层级(非表格返回0,根据原始table判断的.""" + def __is_table_nested(self, element: HtmlElement) -> int: + 
"""计算表格的嵌套层级.""" if element.tag != 'table': return 0 - # 获取当前表格下所有的表格(包括自身) - all_tables = [element] + element.xpath('.//table') - max_level = 1 # 初始层级为1(当前表格) - # 计算每个表格的层级,取最大值 - for table in all_tables: - ancestor_count = len(table.xpath('ancestor::table')) - level = ancestor_count + 1 - max_level = max(max_level, level) + + # 初始化栈结构:存储(当前元素, 当前层级) + stack = [(element, 1)] + max_level = 1 + + # 深度优先遍历 + while stack: + current, current_level = stack.pop() + # 更新最大层级 + max_level = max(max_level, current_level) + # 遍历子元素(倒序保证处理顺序) + for child in reversed(current.getchildren()): + if child.tag == 'table': + # 遇到子表格时层级+1 + stack.append((child, current_level + 1)) + else: + # 非表格元素保持当前层级 + stack.append((child, current_level)) return max_level - def __extract_tables(self, ele: str) -> List[Tuple[str, str]]: + def __extract_tables(self, tree: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: """提取html中的table元素.""" - tree = self._build_html_tree(ele) self.__do_extract_tables(tree) - new_html = self._element_to_html(tree) + new_html = tree lst = self.html_split_by_tags(new_html, CCTag.CC_TABLE) return lst def __get_table_type(self, child: HtmlElement) -> str: """获取table的类型.""" + assert isinstance(child, HtmlElement) empty_flag = self.__is_table_empty(child) - level = self.__is_table_nested(child) if empty_flag: return 'empty' + level = self.__is_table_nested(child) # 是否跨行跨列 flag = (self.__is_simple_table(child) and level < 2) if flag: @@ -149,16 +155,16 @@ def __get_table_type(self, child: HtmlElement) -> str: def __check_table_include_math_code(self, raw_html: HtmlElement): """检查table中的内容,包括普通文本、数学公式和代码.""" - math_html = self._element_to_html(raw_html) - math_recognizer = MathRecognizer() - math_res_parts = math_recognizer.recognize( + math_raw_html = self._element_to_html(raw_html) + math_html = raw_html + math_res_parts = self.math_recognizer.recognize( base_url='', main_html_lst=[(math_html, math_html)], - raw_html=math_html + raw_html=math_raw_html ) result = [] 
for math_item in math_res_parts: - ele_item = self._build_html_tree(math_item[0]) + ele_item = math_item[0] def process_node(node): """处理行内公式、行间公式、行间代码、行内代码.""" @@ -216,16 +222,16 @@ def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None: # 处理非表格元素 math_res = self.__check_table_include_math_code(child) parse_res.extend(math_res) - elem.remove(child) + remove_element(child) # 将非表格内容拼接后放在表格前面 if parse_res: - elem.text = ' '.join(parse_res) + (elem.text or '') + elem.text = ' '.join(parse_res) else: # 没有嵌套表格,直接简化 math_res = self.__check_table_include_math_code(elem) parse_res.extend(math_res) for item in list(elem.iterchildren()): - elem.remove(item) + remove_element(item) if parse_res: elem.text = ' '.join(parse_res) return @@ -244,7 +250,7 @@ def __get_table_body(self, table_type, table_nest_level, table_root): table_root.attrib.clear() table_root.attrib.update(cleaned_attrs) # text进行strip操作,tail保留(部分内容留在tail中) - for elem in chain([table_root], table_root.iterdescendants()): + for elem in chain([table_root], table_root.iterchildren()): if elem.text is not None: elem.text = elem.text.strip().replace('\\n', '') if elem.tail is not None: @@ -273,9 +279,9 @@ def __do_extract_tables(self, root: HtmlElement) -> None: for child in root.iterchildren(): self.__do_extract_tables(child) - def __get_attribute(self, html: str) -> Tuple[bool, Any, Any]: + def __get_attribute(self, ele: HtmlElement) -> Tuple[bool, Any, Any]: """获取element的属性.""" - ele = self._build_html_tree(html) + # ele = self._build_html_tree(html) if ele is not None and ele.tag == CCTag.CC_TABLE: table_type = ele.attrib.get('table_type') table_nest_level = ele.attrib.get('table_nest_level') @@ -283,7 +289,7 @@ def __get_attribute(self, html: str) -> Tuple[bool, Any, Any]: table_body = ele.text return table_flag, table_nest_level, table_body else: - raise HtmlTableRecognizerException(f'{html}中没有cctable标签') + raise HtmlTableRecognizerException(f'{ele}中没有cctable标签') def 
__get_content_list_table_type(self, table_type): """complex|simple 转为True|False.""" diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index f39ffb19..a47a393d 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -2,14 +2,14 @@ import string from typing import List, Tuple -from lxml import etree +from lxml import html from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType -from llm_web_kit.libs.html_utils import element_to_html +from llm_web_kit.libs.html_utils import element_to_html, html_to_element special_symbols = [ # TODO 从文件读取 '®', # 注册商标符号 @@ -42,27 +42,28 @@ class TextParagraphRecognizer(BaseHTMLElementRecognizer): """解析文本段落元素.""" @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """ 把文本段落元素转换为content list node. Args: base_url: - parsed_content: + parsed_content: 可能是字符串或HtmlElement对象 raw_html_segment: Returns: """ - el = self._build_html_tree(parsed_content) + # 如果是字符串则转换为HtmlElement,否则直接使用 + el = parsed_content node = { 'type': DocElementType.PARAGRAPH, - 'raw_content': el.attrib.get('html', ''), + 'raw_content': raw_html_segment, 'content': json.loads(el.text), } return node @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, HtmlElement | str]], raw_html:str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析文本段落元素. 
Args: @@ -73,31 +74,32 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: Returns: """ new_html_lst = [] - for html, raw_html in main_html_lst: - if self.is_cc_html(html): - new_html_lst.append((html, raw_html)) + for html_element, raw_html_element in main_html_lst: + # 如果是字符串则转换为 HtmlElement + if self.is_cc_html(html_element): + new_html_lst.append((html_element, raw_html_element)) else: - root_el = self._build_html_tree(html) - lst = list(self.__extract_paragraphs(root_el)) - # 然后对lst[Element, raw_html] 进行处理. 提出Element里的文字,做成<>标签 + lst = list(self.__extract_paragraphs(html_element)) new_lst = self.__to_cctext_lst(lst) new_html_lst.extend(new_lst) return new_html_lst - def __to_cctext_lst(self, lst: List[Tuple[HtmlElement, str]]) -> List[Tuple[str, str]]: + def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]]) -> List[Tuple[HtmlElement, HtmlElement]]: """将lst[Element, raw_html] 进行处理. 提出Element里的文字,做成<>标签. Args: - lst: List[Tuple[HtmlElement, str]]: Element和raw_html组成的列表 + lst: List[Tuple[HtmlElement | str, HtmlElement | str]]: Element和raw_html组成的列表 """ new_lst = [] for el, raw_html in lst: - para_text = self.__get_paragraph_text(el) - if para_text: - cctext_el = self._build_cc_element(CCTag.CC_TEXT, json.dumps(para_text, ensure_ascii=False, indent=4), '', html=raw_html) - cc_node_html = self._element_to_html(cctext_el) - new_lst.append((cc_node_html, raw_html)) + # 如果是字符串则转换为 HtmlElement + el_element = html_to_element(el) if isinstance(el, str) else el + raw_html_element = html_to_element(raw_html) if isinstance(raw_html, str) else raw_html + para_text = self.__get_paragraph_text(el_element) + if para_text: + cctext_el = self._build_cc_element(CCTag.CC_TEXT, json.dumps(para_text, ensure_ascii=False, indent=4), '', html=element_to_html(raw_html_element)) + new_lst.append((cctext_el, raw_html_element)) return new_lst def __combine_text(self, text1:str, text2:str, lang='en') -> str: @@ -172,7 +174,7 @@ def 
__extract_paragraphs(self, root: HtmlElement): 解析后的文本段落元素 """ path: List[HtmlElement] = [] - parser = etree.HTMLParser(collect_ids=False, encoding='utf-8', remove_comments=True, remove_pis=True) + parser = html.HTMLParser(collect_ids=False, encoding='utf-8', remove_comments=True, remove_pis=True) def is_contain_readable_text(text): return text.strip() if text else text @@ -223,12 +225,18 @@ def helper(elem: HtmlElement): path[-1].append(copied) path.append(copied) + # elem直接有text,则直接添加返回 + if has_direct_text(elem): + rebuild_path() + path[-1].append(copy_helper(elem)) + yield path[0], path[0] + rebuild_path() for sub_elem in elem: if has_direct_text(sub_elem) or (sub_elem.tag == 'p' and has_text(sub_elem)): rebuild_path() path[-1].append(copy_helper(sub_elem)) - yield path[0], element_to_html(path[0]) - + # yield path[0], element_to_html(path[0]) + yield path[0], path[0] # detach the yielded tree rebuild_path() continue diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index 8f2043db..9b18bdb4 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -1,6 +1,7 @@ from typing import List, Tuple -from lxml.etree import _Element as HtmlElement +# from lxml.etree import _Element as HtmlElement +from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import ( @@ -12,7 +13,7 @@ class TitleRecognizer(BaseHTMLElementRecognizer): """解析多级标题元素.""" @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: """将html转换成content_list_node. 
Args: @@ -37,8 +38,8 @@ def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segm return cctitle_content_node @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: - """父类,解析多级标题元素. + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: + """父类,解析标题元素. Args: base_url: str: 基础url @@ -46,9 +47,12 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raw_html: 原始完整的html Returns: + List[Tuple[HtmlElement, HtmlElement]]: 处理后的HTML元素列表 """ new_html_lst = [] for html, raw_html in main_html_lst: + if isinstance(html, str): + html = self._build_html_tree(html) if self.is_cc_html(html): new_html_lst.append((html, raw_html)) else: @@ -56,22 +60,19 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: new_html_lst.extend(lst) return new_html_lst - def _extract_title(self, raw_html:str) -> List[Tuple[str,str]]: - """ - 提取多级标题元素 + def _extract_title(self, raw_html: HtmlElement) -> List[Tuple[HtmlElement, HtmlElement]]: + """提取多级标题元素 Args: - raw_html: + raw_html: HtmlElement对象 Returns: - List[Tuple[str,str]]: 多级标题元素, 第一个str是xxx, 第二个str是原始的html内容 - + List[Tuple[HtmlElement, HtmlElement]]: 多级标题元素列表 """ - tree = self._build_html_tree(raw_html) - self.__do_extract_title(tree) # 遍历这个tree, 找到所有h1, h2, h3, h4, h5, h6标签, 并得到其对应的原始的html片段 + tree = raw_html + self.__do_extract_title(tree) # 遍历这个tree, 找到所有h1, h2, h3, h4, h5, h6标签 # 最后切割html - new_html = self._element_to_html(tree) + new_html = tree lst = self.html_split_by_tags(new_html, CCTag.CC_TITLE) - return lst def __do_extract_title(self, root:HtmlElement) -> None: @@ -137,9 +138,10 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li return ' '.join(blk for blk in blks if blk) - def __get_attribute(self, html:str) -> Tuple[int, str]: + def __get_attribute(self, 
html:HtmlElement) -> Tuple[int, str]: """获取element的属性.""" - ele = self._build_html_tree(html) + # ele = self._build_html_tree(html) + ele = html # 找到cctitle标签 if ele is not None: level = ele.attrib.get('level') diff --git a/llm_web_kit/extractor/html/recognizer/video.py b/llm_web_kit/extractor/html/recognizer/video.py index 227736a1..bed7df5a 100644 --- a/llm_web_kit/extractor/html/recognizer/video.py +++ b/llm_web_kit/extractor/html/recognizer/video.py @@ -1,5 +1,6 @@ from typing import List, Tuple +from lxml.html import HtmlElement from overrides import override from llm_web_kit.extractor.html.recognizer.recognizer import \ @@ -9,7 +10,7 @@ class VideoRecognizer(BaseHTMLElementRecognizer): """解析视元素.""" @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: """父类,解析视频元素. Args: @@ -22,5 +23,5 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[str,str]], raw_html: raise NotImplementedError @override - def to_content_list_node(self, base_url: str, parsed_content: str, raw_html_segment: str) -> dict: + def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: raise NotImplementedError diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 49b36d0a..26246b58 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -5,8 +5,10 @@ from overrides import override +from llm_web_kit.exception.exception import ExtractorChainInputException from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType -from llm_web_kit.libs.html_utils import (get_element_text, html_to_element, +from llm_web_kit.libs.html_utils import (element_to_html, get_element_text, + html_to_element, html_to_markdown_table, table_cells_count) @@ -51,11 +53,13 @@ def 
__init__(self): self.__list_item_start = '-' # md里的列表项前缀 self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀 self.__md_special_chars = ['#', '`', ] # TODO: 先去掉$,会影响行内公式,后面再处理 + self.__nodes_document_type = [DocElementType.MM_NODE_LIST, DocElementType.PARAGRAPH, DocElementType.LIST, DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.TITLE, DocElementType.IMAGE, DocElementType.AUDIO, DocElementType.VIDEO, DocElementType.CODE, DocElementType.EQUATION_INTERLINE] + self.__inline_types_document_type = [ParagraphTextType.EQUATION_INLINE, ParagraphTextType.CODE_INLINE] def to_html(self): raise NotImplementedError('This method must be implemented by the subclass.') - def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST): + def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST, exclude_inline_types=[]): """把content_list转化为txt格式. Args: @@ -68,7 +72,7 @@ def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST): for page in content_lst: for content_lst_node in page: if content_lst_node['type'] not in exclude_nodes: - txt_content = self.__content_lst_node_2_txt(content_lst_node) + txt_content = self.__content_lst_node_2_txt(content_lst_node, exclude_inline_types) if txt_content and len(txt_content) > 0: text_blocks.append(txt_content) @@ -76,7 +80,7 @@ def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST): txt = txt.strip() + self.__text_end # 加上结尾换行符 return txt - def __to_md(self, exclude_nodes=[]): + def __to_md(self, exclude_nodes=[], exclude_inline_types=[]): """把content_list转化为md格式. 
Args: @@ -89,7 +93,7 @@ def __to_md(self, exclude_nodes=[]): for page in content_lst: for content_lst_node in page: if content_lst_node['type'] not in exclude_nodes: - txt_content = self.__content_lst_node_2_md(content_lst_node) + txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types) if txt_content and len(txt_content) > 0: md_blocks.append(txt_content) @@ -97,15 +101,31 @@ def __to_md(self, exclude_nodes=[]): md = md.strip() + self.__text_end # 加上结尾换行符 return md - def to_nlp_md(self, MM_NODE_LIST=[]): - if MM_NODE_LIST: - md = self.__to_md(exclude_nodes=MM_NODE_LIST) - else: - md = self.__to_md(exclude_nodes=DocElementType.MM_NODE_LIST) + def __validate_exclude_nodes(self, exclude_nodes, exclude_inline_types): + if isinstance(exclude_nodes, str): + exclude_nodes = [exclude_nodes] + if isinstance(exclude_inline_types, str): + exclude_inline_types = [exclude_inline_types] + if not isinstance(exclude_nodes, list): + raise ExtractorChainInputException('exclude_nodes must be a list type.') + if not isinstance(exclude_inline_types, list): + raise ExtractorChainInputException('exclude_inline_types must be a list type.') + for node in exclude_nodes: + if node not in self.__nodes_document_type: + raise ExtractorChainInputException(f'exclude_nodes contains invalid element type: {node}') + for inline_type in exclude_inline_types: + if inline_type not in self.__inline_types_document_type: + raise ExtractorChainInputException(f'exclude_inline_types contains invalid inline type: {inline_type}') + return exclude_nodes, exclude_inline_types + + def to_nlp_md(self, exclude_nodes=[], exclude_inline_types=[]): + exclude_nodes, exclude_inline_types = self.__validate_exclude_nodes(exclude_nodes, exclude_inline_types) + md = self.__to_md(exclude_nodes + DocElementType.MM_NODE_LIST, exclude_inline_types) return md - def to_mm_md(self): - md = self.__to_md() + def to_mm_md(self, exclude_nodes=[], exclude_inline_types=[]): + 
self.__validate_exclude_nodes(exclude_nodes, exclude_inline_types) + md = self.__to_md(exclude_nodes, exclude_inline_types) return md def to_main_html(self) -> str: @@ -121,9 +141,11 @@ def to_main_html(self) -> str: for page in content_lst: for content_lst_node in page: raw_html = content_lst_node['raw_content'] - if raw_html: - html += raw_html - + if isinstance(raw_html, str): + html_segment = raw_html # 直接使用字符串 + else: + html_segment = element_to_html(raw_html) # 转换HtmlElement为字符串 + html += html_segment return html def to_json(self, pretty=False) -> str: @@ -140,7 +162,7 @@ def to_dict(self) -> dict: def _get_data(self) -> List[Dict]: raise NotImplementedError('This method must be implemented by the subclass.') - def __content_lst_node_2_md(self, content_lst_node: dict) -> str: + def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: list = []) -> str: """把content_list里定义的每种元素块转化为markdown格式. Args: @@ -202,7 +224,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: return md_title elif node_type == DocElementType.PARAGRAPH: paragraph_el_lst = content_lst_node['content'] - one_para = self.__join_one_para(paragraph_el_lst) + one_para = self.__join_one_para(paragraph_el_lst, exclude_inline_types) return one_para elif node_type == DocElementType.LIST: items_paras = [] @@ -210,7 +232,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: for item_idx, item in enumerate(content_lst_node['content']['items']): paras_of_item = [] for para in item: - one_para = self.__join_one_para(para) + one_para = self.__join_one_para(para, exclude_inline_types) paras_of_item.append(one_para) # 由于markdown的列表项里可以有多个段落,这里拼装成md列表段落格式 list_prefix = f'{item_idx + 1}.' 
if is_ordered else self.__list_item_start # 有序列表和无需列表前缀 @@ -218,7 +240,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: items_paras.append(item_paras_md) md_list = '\n'.join(items_paras) return md_list - elif node_type == DocElementType.TABLE: + elif node_type == DocElementType.SIMPLE_TABLE: # 对文本格式来说,普通表格直接转为md表格,复杂表格返还原始html html_table = content_lst_node['content']['html'] if html_table is not None: @@ -227,12 +249,15 @@ def __content_lst_node_2_md(self, content_lst_node: dict) -> str: if cells_count <= 1: # 单个单元格的表格,直接返回文本 text = get_element_text(html_to_element(html_table)).strip() return text - is_complex = content_lst_node['content']['is_complex'] - if is_complex: - return html_table - else: - md_table = html_to_markdown_table(html_table) - return md_table + md_table = html_to_markdown_table(html_table) + return md_table + else: + return '' + elif node_type == DocElementType.COMPLEX_TABLE: + html_table = content_lst_node['content']['html'] + if html_table is not None: + html_table = html_table.strip() + return html_table else: return '' else: @@ -274,7 +299,7 @@ def __para_2_md_list_item(self, paras_of_item: list, list_prefix: str) -> str: return md_list_item - def __content_lst_node_2_txt(self, content_lst_node: dict) -> str: + def __content_lst_node_2_txt(self, content_lst_node: dict, exclude_inline_types=[]) -> str: """把content_list里定义的每种元素块转化为纯文本格式. 
Args: @@ -330,35 +355,38 @@ def __content_lst_node_2_txt(self, content_lst_node: dict) -> str: return title_content elif node_type == DocElementType.PARAGRAPH: paragraph_el_lst = content_lst_node['content'] - one_para = self.__join_one_para(paragraph_el_lst) + one_para = self.__join_one_para(paragraph_el_lst, exclude_inline_types) return one_para elif node_type == DocElementType.LIST: items_paras = [] for item in content_lst_node['content']['items']: paras_of_item = [] for para in item: - one_para = self.__join_one_para(para) + one_para = self.__join_one_para(para, exclude_inline_types) paras_of_item.append(one_para) items_paras.append(paras_of_item) items_paras = [self.__txt_para_splitter.join(item) for item in items_paras] return self.__txt_para_splitter.join(items_paras) # 对于txt格式来说一个列表项里多个段落没啥问题,但是对于markdown来说,多个段落要合并成1个,否则md格式无法表达。 - elif node_type == DocElementType.TABLE: + elif node_type == DocElementType.SIMPLE_TABLE: # 对文本格式来说,普通表格直接转为md表格,复杂表格返还原始html html_table = content_lst_node['content']['html'] if html_table is not None: html_table = html_table.strip() - is_complex = content_lst_node['content']['is_complex'] - if is_complex: - return html_table - else: - md_table = html_to_markdown_table(html_table) - return md_table + md_table = html_to_markdown_table(html_table) + return md_table + else: + return '' + elif node_type == DocElementType.COMPLEX_TABLE: + html_table = content_lst_node['content']['html'] + if html_table is not None: + html_table = html_table.strip() + return html_table else: return '' else: raise ValueError(f'content_lst_node contains invalid element type: {node_type}') # TODO: 自定义异常 - def __join_one_para(self, para: list) -> str: + def __join_one_para(self, para: list, exclude_inline_types: list = []) -> str: """把一个段落的元素块连接起来. 
Args: @@ -368,6 +396,8 @@ def __join_one_para(self, para: list) -> str: """ one_para = [] for el in para: + if el['t'] in exclude_inline_types: + continue if el['t'] == ParagraphTextType.TEXT: c = el['c'] if not c or not c.strip(): @@ -393,10 +423,10 @@ def _validate(self, json_obj: dict): json_obj (dict): _description_ """ if not isinstance(json_obj, dict): - raise ValueError('json_obj must be a dict type.') + raise ExtractorChainInputException('json_obj must be a dict type.') if DataJsonKey.CONTENT_LIST in json_obj: if not isinstance(json_obj.get(DataJsonKey.CONTENT_LIST, ''), list): - raise ValueError('content_list must be a list type.') + raise ExtractorChainInputException('content_list must be a list type.') class ContentList(StructureMapper): diff --git a/llm_web_kit/libs/doc_element_type.py b/llm_web_kit/libs/doc_element_type.py index c3c63fdb..dd962ed7 100644 --- a/llm_web_kit/libs/doc_element_type.py +++ b/llm_web_kit/libs/doc_element_type.py @@ -8,7 +8,8 @@ class ParagraphTextType(object): class DocElementType(object): PARAGRAPH = 'paragraph' LIST = 'list' - TABLE = 'table' + SIMPLE_TABLE = 'simple_table' + COMPLEX_TABLE = 'complex_table' EQUATION_INTERLINE = 'equation-interline' CODE = 'code' TITLE = 'title' diff --git a/llm_web_kit/libs/statics.py b/llm_web_kit/libs/statics.py index df640617..006cdcc8 100644 --- a/llm_web_kit/libs/statics.py +++ b/llm_web_kit/libs/statics.py @@ -94,10 +94,10 @@ def process_list_items(items, parent_type): elif element_type == DocElementType.LIST: # 使用递归函数处理列表项 process_list_items(element['content']['items'], DocElementType.LIST) - elif element_type == DocElementType.TABLE: + elif element_type == DocElementType.COMPLEX_TABLE: # 统计复杂表格数量 if element.get('content', {}).get('is_complex', False): - item_type = f'{DocElementType.TABLE}.complex' + item_type = f'{DocElementType.COMPLEX_TABLE}.complex' current_count = self.statics.get(item_type, 0) self.statics[item_type] = current_count + 1 diff --git a/llm_web_kit/tools/cli.py 
b/llm_web_kit/tools/cli.py index c2260e1f..ca8a41b4 100644 --- a/llm_web_kit/tools/cli.py +++ b/llm_web_kit/tools/cli.py @@ -58,7 +58,6 @@ def cli(input_path, output_path, debug_mode): extractor = HTMLFileFormatExtractor({}) data_e = extractor.extract(DataJson(input_data)) output_json = data_e.to_json() - if output_path: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/tests/llm_web_kit/cli_sdk/test_cli_sdk.py b/tests/llm_web_kit/cli_sdk/test_cli_sdk.py index 6aad22ba..f0085a69 100644 --- a/tests/llm_web_kit/cli_sdk/test_cli_sdk.py +++ b/tests/llm_web_kit/cli_sdk/test_cli_sdk.py @@ -73,11 +73,12 @@ def test_process_html_file_path(self, runner, json_with_file_path, tmp_path): def test_stdout_output(self, runner, json_with_html_path): """测试输出到标准输出.""" + print('json_with_html_path', json_with_html_path) result = runner.invoke(cli, ['-i', str(json_with_html_path)]) assert result.exit_code == 0 assert result.output - + print('result.output', result.output) output_data = json.loads(result.output) assert 'content_list' in output_data assert isinstance(output_data['content_list'], list) diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/content_list_empty.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/content_list_empty.html new file mode 100644 index 00000000..30fce8a5 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/content_list_empty.html @@ -0,0 +1 @@ +北京大平层,奶油风浪漫到家!
\n-
\n设计案例: 168m轻法式大平层设计
\n项目地址:北京市大兴区
\n-
\n在这个168平方米的轻法式大平层设计中,全屋以浪漫的奶白色为主色调,搭配驼色,营造出空间的呼吸感。客餐厅一体设计,地面铺满柔光砖,裸调的高级质感扑面而来。
\n
\n转角沙发与充满设计感的小型休闲椅相搭配,家居格调瞬间提升。威尼斯棕大理石餐桌的加入,为餐厅增添了更多的层次感和温柔。坐在沙发上,可以一览餐厅和厨房的空间,增加了互动性。
\n
\n墙面采用暖白色,搭配一些局部的原木色护墙板,让空间的视觉效果更加灵动,不易产生疲劳感。阳光透过窗户洒进室内,整个空间显得格外治愈,喜欢这种明亮纯粹的家。 diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/exclude_complex_table.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/exclude_complex_table.html new file mode 100644 index 00000000..e0b1d2bb --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/exclude_complex_table.html @@ -0,0 +1,528 @@ +\n\n \n \n\n \n\n\n + +\n \n WikiProcessors – smartmontools\n \n + \n + \n + \n + \n + \n + \n + \n + \n + \n + \n + \n + \n \n + \n + \n + \n + \n + \n + \n + + \n + \n + + \n +\n + +\n\t
\n\t\t
\n\t\t\t\n\n +
\n
\n

smartmontools +

\n
\n
\n
\n \n \n \n
\n \n + \n
\n \n
\n \n
\n

Context Navigation

\n \n +
\n +
\n
\n
\n
\n + + + \n + + \n
Version 3 (modified by trac, 5 years ago)\n (diff)
\n

\n--\n

\n\n
\n
\n \n
+

Wiki Processors

\n

\nProcessors are WikiMacros designed to provide alternative markup + formats for the Wiki engine. Processors + can be thought of as macro functions to process user-edited text. \n

\n

+ \nWiki processors can be used in any Wiki text throughout Trac, such as:\n

\n +

Using Processors

\n

\nTo use a processor on a block of + text, first delimit the lines using a Wiki code block:\n

\n +
{{{\nThe lines\nthat should be processed...\n}}}\n
+

\nImmediately after the {{{ or on the line just below, add #! + followed by the processor name:\n

\n +
{{{\n#!processorname\nThe lines\nthat should be processed...\n}}}\n
+

\nThis is the \"shebang\" notation, familiar to most UNIX users.\n

\n

\nBesides + their content, some Wiki processors can also accept parameters, which are then + given as key=value pairs after the processor name and on the same line. If + value has to contain space, as it's often the case for the style parameter, + a quoted string can be used (key=\"value with space\").\n

\n

\nAs some + processors are meant to process Wiki markup, it's quite possible to nest + processor blocks.\nYou may want to indent the content of nested blocks for increased + clarity, this extra indentation will be ignored when processing the content.\n

\n

Examples

\n\n + + + + + + + \n + + + \n + + \n + + + \n + + \n + + +
Wiki Markup Display \n
+
\n

\nExample 1: Inserting raw + HTML\n

\n
\n +
+
{{{\n#!html\n<h1 style=\"color: grey\">This is raw HTML</h1>\n}}}\n
+
+

This is raw HTML

\n +
+
\n

\nExample 2: Highlighted + Python code in a <div> block with custom style\n

\n +
\n +
+
{{{#!div style=\"background: #ffd; border: 3px ridge\"\n\nThis is an example of embedded \"code\" block:\n\n  {{{\n  #!python\n  def hello():\n      return \"world\"\n  }}}\n\n}}}\n
+
+
+

\nThis is an example of embedded \"code\" block:\n

\n
+
+
def hello():\n    return \"world\"\n
+
+
+
+
+
\n

\nExample 3: Searching tickets + from a wiki page, by keywords.\n

\n
\n +
+
{{{\n#!html\n<form action=\"/query\" method=\"get\"><div>\n<input type=\"text\" name=\"keywords\" value=\"~\" size=\"30\"/>\n<input type=\"submit\" value=\"Search by Keywords\"/>\n<!-- To control what fields show up use hidden fields\n<input type=\"hidden\" name=\"col\" value=\"id\"/>\n<input type=\"hidden\" name=\"col\" value=\"summary\"/>\n<input type=\"hidden\" name=\"col\" value=\"status\"/>\n<input type=\"hidden\" name=\"col\" value=\"milestone\"/>\n<input type=\"hidden\" name=\"col\" value=\"version\"/>\n<input type=\"hidden\" name=\"col\" value=\"owner\"/>\n<input type=\"hidden\" name=\"col\" value=\"priority\"/>\n<input type=\"hidden\" name=\"col\" value=\"component\"/>\n-->\n</div></form>\n}}}\n
+
+
+
\n\n\n
+
\n +
\n

Available Processors

\n

\nThe following + processors are included in the Trac distribution:\n

\n\n + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
#!default Present the text verbatim in a preformatted text block. This is the same as + specifying no processor name (and no #!). \n
#!comment Do not process the text in this section, i.e. contents exist only in the plain + text - not in the rendered page. \n
#!rtl Introduce a Right-To-Left block with appropriate CSS direction and styling. + (since 0.12.2) \n
\n
HTML + related \n
#!html Insert custom HTML in a wiki page. \n
#!htmlcomment Insert an HTML comment in a wiki page. (since 0.12) \n
Note that #!html blocks have to be self-contained, i.e. + you can't start an HTML element in one block and close it later in a second + block. Use the following processors for achieving a similar effect. \n
#!div Wrap wiki content inside a <div> element. \n
#!span Wrap wiki content inside a <span> element. \n
#!td Wrap wiki content inside a <td> element. (since 0.12) \n
#!th Wrap wiki content inside a <th> element. (since 0.12) \n
#!tr Can optionally be used for wrapping #!td and #!th + blocks, either for specifying row attributes or better visual grouping. + (since 0.12) \n
#!table Can optionally be used for wrapping #!tr, #!td and + #!th blocks, for specifying table attributes. One current + limitation however is that tables cannot be nested. (since 0.12) \n +
See WikiHtml for example usage + and more details about these processors. \n
\n
Other Markups \n
#!rst Trac support for Restructured Text. See WikiRestructuredText. \n
#!textile Supported if Textile is installed. See a Textile reference. \n
\n
Code Highlighting Support + \n
#!c
#!cpp + (C++)
#!python
+ #!perl
#!ruby +
#!php
+ #!asp
#!java +
#!js (Javascript)
+ #!sql
#!xml + (XML or HTML)
#!sh (Bourne/Bash shell) +
etc.
Trac includes processors to provide inline syntax highlighting for source code + in various languages.

Trac relies on Pygments for + syntax coloring.

See TracSyntaxColoring for information + about which languages are supported and how to enable support for more + languages. \n
\n
\n

\nSince 1.1.2 the default, coding highlighting and MIME-type processors support + the argument lineno for adding line numbering to the code block. When a + value is specified, as in lineno=3, the numbering will start at the + specified value. When used in combination with the lineno argument, the + marks argument is also supported for highlighting lines. A single line + number, set of line numbers and range of line numbers are allowed. For example, + marks=3, marks=3-6, marks=3,5,7 and + marks=3-5,7 are all allowed. The specified values are relative to the + numbered lines, so if lineno=2 is specified to start the line numbering at + 2, marks=2 will result in the first line being highlighted.\n

\n

+ \nUsing the MIME type as processor, it is possible to syntax-highlight the same + languages that are supported when browsing source code.\n

\n\n + + + + + + + + + + + +
MIME Type Processors \n
+

\nSome examples:\n

\n +
{{{#!text/html\n<h1>text</h1>\n}}}\n
+
+

\nThe result will be syntax highlighted HTML code:\n

\n
+
+
<h1>text</h1>\n
+
+
+

\nThe same is valid for all other mime types + supported.\n

\n +
+
{{{#!diff\n--- Version 55\n+++ Version 56\n@@ -115,8 +115,9 @@\n     name='TracHelloWorld', version='1.0',\n     packages=find_packages(exclude=['*.tests*']),\n-    entry_points = \"\"\"\n-        [trac.plugins]\n-        helloworld = myplugs.helloworld\n-    \"\"\",\n+    entry_points = {\n+        'trac.plugins': [\n+            'helloworld = myplugs.helloworld',\n+        ],\n+    },\n )\n}}}\n
+
+

\n#!diff has a particularly nice renderer:\n +

\n
+
\n\n
    \n \n
  • \n

    \n + Version\n \n

    \n \n \n \n \n \n + \n + + + + \n \n \n \n \n \n + \n \n \n \n \n \n + \n\n \n \n \n \n \n \n + + \n \n \n \n + + \n \n \n \n \n + + \n \n \n \n + \n \n\n \n\n \n\n \n\n \n + + \n \n\n \n\n \n\n \n\n \n \n \n \n \n + \n + + \n + + \n + + \n + + \n + + + \n + + \n + + \n + \n \n \n + + \n + + \n + + \n + + \n + + \n + + \n + + \n + + \n + + \n + \n \n \n \n\n \n + + \n \n\n \n \n \n \n \n \n + + \n \n \n \n + \n \n\n \n\n \n\n \n\n \n \n \n \n +
    \n \n \n \n \n \n  
    115115    + name='TracHelloWorld', version='1.0', +
    116116    + packages=find_packages(exclude=['*.tests*']), +
    117     entry_points = + \"\"\"
    118         + [trac.plugins]
    119         + helloworld = myplugs.helloworld
    120     \"\"\", +
     117    entry_points = + {
     118        + 'trac.plugins': [
     119        +     'helloworld = + myplugs.helloworld',
     120        + ],
     121    },
    121 + 122)
    \n
  • \n \n
\n\n
+
+
\n

\nLine numbers can be added to code blocks and lines can be highlighted + (since 1.1.2).\n

\n +
{{{#!python lineno=3 marks=3,9-10,16\ndef expand_markup(stream, ctxt=None):\n    \"\"\"A Genshi stream filter for expanding `genshi.Markup` events.\n\n    Note: Expansion may not be possible if the fragment is badly\n    formed, or partial.\n    \"\"\"\n    for event in stream:\n        if isinstance(event[1], Markup):\n            try:\n                for subevent in HTML(event[1]):\n                    yield subevent\n            except ParseError:\n                yield event\n        else:\n            yield event\n}}}\n
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Line 
3def expand_markup(stream, ctxt=None):
4    \"\"\"A Genshi stream filter for expanding + `genshi.Markup` events.
5
6    Note: Expansion may not be possible if the + fragment is badly
7    formed, or partial.
8    \"\"\"
9    for event in stream:
10        if isinstance(event[1], Markup):
11            try:
12                for subevent in HTML(event[1]):
13                    yield subevent
14            except ParseError:
15                yield event
16        else:
17            yield event
+
+

\nFor more processor macros developed and/or contributed by users, visit the Trac + Hacks community site.\n

\n

\nDeveloping processors is no different from + Wiki macros. In fact, they work the same way, only the usage syntax differs. See WikiMacros#DevelopingCustomMacros + for more information.\n

\n +
\n

\nSee also: WikiMacros, WikiHtml, WikiRestructuredText, TracSyntaxColoring, WikiFormatting, TracGuide\n

\n +
\n \n \n \n \n
\n \n\n \n + + \n + + \n
\n

Download in other formats:

\n \n
\n + \n
+
\n \"Trac\n

Powered by Trac + 1.2.5
\n By Edgewall Software. +

\n

Validator: Check + XHTML

\n +
\n\n\t\t\n \n\n
\n +\n + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_is_short.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_is_short.html new file mode 100644 index 00000000..a7126065 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_is_short.html @@ -0,0 +1 @@ +\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nA plain blog about politics: Acceptable\n\n\n\n\n\n\n\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n\n
\n
\n
\n
\n
\n
\n

Monday, December 5, 2011

\n
\n
\n
\n\n\n\n

\nAcceptable\n

\n
\n
\n
\n
\nSince I've commented quite a bit on polling that as I read it shows Mitt Romney broadly acceptable to most Republican voters, I definitely need to say something about a new poll today that doesn't exactly show that. Gallup got around to actually asking that very question (\"Please tell me if you would find ___ to be an acceptable  nominee for president from the Republican Party, or not\"). The answers mostly showed the weakness of the field, with six of the eight candidates asked about scoring well below 50% acceptable. But the clear most-acceptable candidate is Newt Gingrich, with a 62/34 acceptable/not acceptable ratio, while Romney is only at 54/41.
\n
\nThere are a lot of ways to look at this, but overall it's certainly a piece of evidence that the anti-Romney vote is, well, around 40%. Only a piece of evidence, however. It's not clear how hard these kinds of numbers might be, in either direction. On the positive side, it seems unlikely that Newt would remain over 60% once more Republicans know that he's been lobbying for Freddie Mac, and supported the individual mandate on health insurance, and made a climate change ad with Nancy Pelosi, and all the rest of it. On the other hand, it's certainly possible that the \"unacceptable\" answers are awful soft, for Romney and for everyone else.
\n
\nIn particular, as Greg pointed out, Romney only does three points better on the \"acceptable\" scale with moderate Republicans than does Newt. This isn't the first indication we've had that Romney isn't doing as well with moderate Republicans as one would think he should be. Whether that means he has some room to grow or that he's just not an appealing politician is, I guess, still entirely up in the air at this point.
\n
\nI still overall don't see a low cap on Romney's support, but of course all the evidence counts, and polling in general begins to be a little more important the closer we get to actual voting. I'll be continuing to track anything more we get on this one.\n
\n
\n\n
\n
\n\n

14 comments:

\n
\n\n \n\n\n\n", + src=\"https://whatsknow.com/wp-content/cache/autoptimize/js/autoptimize_fe6b5f33f1d030f29a946c59f754e0ce.js\"> \n\n\n\n \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_code.py b/tests/llm_web_kit/extractor/html/recognizer/test_code.py index ea7a4fb7..ba8b3f38 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_code.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_code.py @@ -245,12 +245,12 @@ def test_code_rec(self): base_url = test_case['input'][1] print(base_url) raw_html = raw_html_path.read_text() - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - parts = [ - part[0] - for part in parts - if CCTag.CC_CODE in part[0] or CCTag.CC_CODE_INLINE in part[0] - ] + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + # parts = [ + # part[0] + # for part in parts + # if CCTag.CC_CODE in part[0] or CCTag.CC_CODE_INLINE in part[0] + # ] # for part in parts: # part_el = html_to_element(part) # answer = get_element_text(part_el).strip() @@ -259,7 +259,7 @@ def test_code_rec(self): # print("--------------------------------------------------") answers = [] for part in parts: - part_el = html_to_element(part) + part_el = part[0] cccodes = part_el.xpath(f'.//{CCTag.CC_CODE}') + part_el.xpath( f'.//{CCTag.CC_CODE_INLINE}' ) @@ -532,4 +532,4 @@ def test_lineno_4(self):
""" # 无须检查内容,只要不爆错就可以了 - _ = self.rec.recognize('', [(html, html)], html) + _ = self.rec.recognize('', [(html_to_element(html), html_to_element(html))], html) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_image.py b/tests/llm_web_kit/extractor/html/recognizer/test_image.py index ab3cd733..9f374848 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_image.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_image.py @@ -3,6 +3,7 @@ from llm_web_kit.extractor.html.recognizer.image import ImageRecognizer from llm_web_kit.extractor.html.recognizer.recognizer import CCTag +from llm_web_kit.libs.html_utils import html_to_element TEST_CASES_HTML = [ { @@ -98,7 +99,7 @@ def test_recognize(self): raw_html_path = base_dir.joinpath(test_case['input']) base_url = test_case['base_url'] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.img_recognizer.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.img_recognizer.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), test_case['expected']) ccimg_datas = [ccimg[0] for ccimg in parts if CCTag.CC_IMAGE in ccimg[0] and 'by="svg"' not in ccimg[0]] if ccimg_datas: @@ -109,7 +110,7 @@ def test_recognize(self): def test_to_content_list_node(self): for test_case in TEST_CC_CASE: try: - res = self.img_recognizer.to_content_list_node(test_case['url'], test_case['parsed_content'], + res = self.img_recognizer.to_content_list_node(test_case['url'], html_to_element(test_case['parsed_content']), test_case['html']) self.assertEqual(res, test_case['expected']) self.assertEqual(res['content']['alt'], test_case['alt']) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_list.py b/tests/llm_web_kit/extractor/html/recognizer/test_list.py index 2cc10aac..0696618f 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_list.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_list.py @@ -2,6 +2,7 @@ 
import unittest from llm_web_kit.extractor.html.recognizer.list import ListRecognizer +from llm_web_kit.libs.html_utils import html_to_element class TestSimpleListRecognize(unittest.TestCase): @@ -17,10 +18,10 @@ def setUp(self): self.__complex_list_content = file.read() def test_simple_list(self): - html_part = self.__list_recognize.recognize('http://url.com', [(self.__simple_list_content, self.__complex_list_content)], self.__simple_list_content) + html_part = self.__list_recognize.recognize('http://url.com', [(html_to_element(self.__simple_list_content), html_to_element(self.__complex_list_content))], self.__simple_list_content) assert len(html_part) == 6 def test_complex_list(self): # TODO: Fix this test - html_part = self.__list_recognize.recognize('http://url.com', [(self.__simple_list_content, self.__complex_list_content)], self.__complex_list_content) + html_part = self.__list_recognize.recognize('http://url.com', [(html_to_element(self.__simple_list_content), html_to_element(self.__complex_list_content))], self.__complex_list_content) assert len(html_part) == 6 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index fa9d7614..eb5fbbf4 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -4,7 +4,7 @@ from llm_web_kit.exception.exception import HtmlMathRecognizerException from llm_web_kit.extractor.html.recognizer.ccmath import CCMATH, MathRecognizer from llm_web_kit.extractor.html.recognizer.recognizer import CCTag -from llm_web_kit.libs.html_utils import html_to_element +from llm_web_kit.libs.html_utils import element_to_html, html_to_element TEST_CASES = [ # 基本公式测试用例 @@ -365,17 +365,14 @@ def test_math_recognizer(self): with self.subTest(input=test_case['input'], raw_html=test_case['raw_html']): output_html = self.math_recognizer.recognize( 'https://www.baidu.com', - test_case['input'], + 
[(html_to_element(test_case['input'][0][0]), html_to_element(test_case['input'][0][1]))], test_case['raw_html'] ) - print(output_html) expect_len = len(test_case['expected']) self.assertEqual(len(output_html), len(test_case['expected']), msg=f'result is: {len(output_html)}, expected is: {expect_len}') for i in range(len(output_html)): expect = test_case['expected'][i][0] - print(output_html[i][0]) - print(expect) - self.assertEqual(output_html[i][0], expect, msg=f'result is: {output_html[i][0]}, expected is: {expect}') + self.assertEqual(element_to_html(output_html[i][0]), expect, msg=f'result is: {output_html[i][0]}, expected is: {expect}') def test_math_recognizer_html(self): for test_case in TEST_CASES_HTML: @@ -383,7 +380,7 @@ def test_math_recognizer_html(self): # print('raw_html_path::::::::', raw_html_path) base_url = test_case['base_url'] raw_html = raw_html_path.read_text() - parts = self.math_recognizer.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.math_recognizer.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) # print(parts) # 将parts列表中第一个元素拼接保存到文件,带随机数 # import random @@ -391,8 +388,10 @@ def test_math_recognizer_html(self): # for part in parts: # f.write(str(part[0])) # 检查行间公式抽取正确性 - parts = [part[0] for part in parts if CCTag.CC_MATH_INTERLINE in part[0]] - print(len(parts)) + new_parts = [] + for part in parts: + new_parts.append((element_to_html(part[0]), element_to_html(part[1]))) + parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]] expect_text = base_dir.joinpath(test_case['expected']).read_text().strip() expect_formulas = [formula for formula in expect_text.split('\n') if formula] self.assertEqual(len(parts), len(expect_formulas)) @@ -410,10 +409,7 @@ def test_math_recognizer_html(self): # self.write_to_html(answers, test_case['input'][0]) # 检查行内公式抽取正确性 if test_case.get('expected_inline', None): - print('expected_inline::::::::', 
test_case['expected_inline']) parts = [part[0] for part in parts if CCTag.CC_MATH_INLINE in part[0]] - print(len(parts)) - print(parts) def write_to_html(self, answers, file_name): file_name = file_name.split('.')[0] @@ -427,9 +423,11 @@ def test_to_content_list_node(self): with self.subTest(input=test_case['input']): output_node = self.math_recognizer.to_content_list_node( test_case['input'][0], - test_case['input'][1], + html_to_element(test_case['input'][1]), test_case['input'][2] ) + print('output_node::::::::', output_node) + print(test_case['expected']) self.assertEqual(output_node, test_case['expected']) # 测试没有ccmath标签的情况 @@ -441,7 +439,7 @@ def test_to_content_list_node(self): with self.assertRaises(HtmlMathRecognizerException) as exc_info: self.math_recognizer.to_content_list_node( invalid_content[0], - invalid_content[1], + html_to_element(invalid_content[1]), invalid_content[2] ) self.assertIn('No ccmath element found in content', str(exc_info.exception)) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_para.py b/tests/llm_web_kit/extractor/html/recognizer/test_para.py index 42e988bd..adb38c59 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_para.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_para.py @@ -20,21 +20,21 @@ def test_recognize_simple_para(self): html = f.read() # 执行识别 - result = self.recognizer.recognize('', [(html, html)], html) + result = self.recognizer.recognize('', [(html_to_element(html), html_to_element(html))], html) # 验证结果 self.assertEqual(len(result), 2) # 应该识别出2个段落 # 验证第一个段落 first_para = result[0][0] - ccel = html_to_element(first_para) + ccel = first_para jso = json.loads(ccel.text) self.assertEqual(jso[0]['c'], '质量方程') self.assertEqual(jso[0]['t'], 'text') # 验证第二个段落 second_para = result[1][0] - text = html_to_element(second_para).text + text = second_para.text jso = json.loads(text) self.assertEqual(jso[0]['c'], '爱因斯坦的方程') self.assertEqual(jso[0]['t'], 'text') diff --git 
a/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py b/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py index 7bedf512..86b303e6 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_recognizer.py @@ -3,6 +3,7 @@ from llm_web_kit.extractor.html.recognizer.recognizer import \ BaseHTMLElementRecognizer +from llm_web_kit.libs.html_utils import element_to_html, html_to_element class TestBaseHTMLElementRecognizer(unittest.TestCase): @@ -10,51 +11,50 @@ def test_html_split_by_tags_1(self): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/image.html', 'r') as file: html_content = file.read() - result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, ['img']) + result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), ['img']) assert len(result) == 7 def test_html_split_by_tags_2(self): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/cccode.html', 'r') as file: html_content = file.read() - result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, ['cccode']) + result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), ['cccode']) assert len(result) == 3 def test_html_split_by_tags_3(self): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/raw_html_attr.html', 'r') as file: html_content = file.read() - - result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, ['ccmath']) + result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), ['ccmath']) assert len(result) == 2 - assert result[0][1] == '$E=MC^2$' + assert element_to_html(result[0][1]) == '$E=MC^2$' def test_html_split_by_tags_with_parent_nodes(self): """测试是否能够正确带上父节点.""" with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/with_parent_nodes.html', 'r') as file: html_content = file.read() - result_with_parent = 
BaseHTMLElementRecognizer.html_split_by_tags(html_content, 'cccode') + result_with_parent = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), 'cccode') assert len(result_with_parent) == 7 - assert result_with_parent[0][0] == """
+ assert element_to_html(result_with_parent[0][0]) == """
这里是text 这里是span
""" - assert result_with_parent[2][0] == '
print("BBBBBB")
' - assert result_with_parent[3][0] == """
+ assert element_to_html(result_with_parent[2][0]) == '
print("BBBBBB")
' + assert element_to_html(result_with_parent[3][0]) == """
这里是tail

这里是div text 这里是span2

""" - result = BaseHTMLElementRecognizer.html_split_by_tags(html_content, 'cccode') + result = BaseHTMLElementRecognizer.html_split_by_tags(html_to_element(html_content), 'cccode') assert len(result) == 7 def test_is_cctag(self): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/iscctag.html', 'r') as file: html_content = file.read() - assert BaseHTMLElementRecognizer.is_cc_html(html_content, 'cccode') - assert BaseHTMLElementRecognizer.is_cc_html(html_content, 'ccmath') - assert BaseHTMLElementRecognizer.is_cc_html(html_content, 'ccimage') - assert not BaseHTMLElementRecognizer.is_cc_html(html_content, 'ccvideo') - assert not BaseHTMLElementRecognizer.is_cc_html(html_content, 'cctitle') - assert BaseHTMLElementRecognizer.is_cc_html(html_content, ['cccode', 'ccxxx']) + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'cccode') + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'ccmath') + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'ccimage') + assert not BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'ccvideo') + assert not BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), 'cctitle') + assert BaseHTMLElementRecognizer.is_cc_html(html_to_element(html_content), ['cccode', 'ccxxx']) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 6e91c85e..2470c060 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -45,7 +45,7 @@ def test_involve_cctale(self): raw_html_path = base_dir.joinpath(test_case['input'][0]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text() - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) 
self.assertEqual(len(parts), 4) def test_not_involve_table(self): @@ -54,7 +54,7 @@ def test_not_involve_table(self): raw_html_path = base_dir.joinpath(test_case['input'][1]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), 1) def test_only_involve_table(self): @@ -63,9 +63,9 @@ def test_only_involve_table(self): raw_html_path = base_dir.joinpath(test_case['input'][2]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text() - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), 2) - table_body = html_to_element(parts[1][0]).text_content() + table_body = parts[1][0].text_content() assert table_body == r'
Mrs S Hindle
ShowCCRCC
Driffield 5th October 2006CH. Ricksbury Royal HeroCH. Keyingham Branwell
Manchester 16th January 2008CH. Lochbuie GeordieMerryoth Maeve
Darlington 20th September 2009CH. Maibee Make BelieveCH. Loranka Just Like Heaven JW
Blackpool 22nd June 2012CH. Loranka Sherrie BabyDear Magic Touch De La Fi Au Songeur
Welsh Kennel Club 2014Brymarden Carolina SunriseCh. Wandris Evan Elp Us
Welsh Kennel Club 2014Ch. Charnell Clematis of SalegreenCH. Byermoor Queens Maid
' def test_table_include_img_label(self): @@ -74,9 +74,9 @@ def test_table_include_img_label(self): raw_html_path = base_dir.joinpath(test_case['input'][6]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text() - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 - simple_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + simple_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] simple_table_type = simple_table_tag.attrib assert simple_table_type['table_type'] == 'simple' @@ -86,9 +86,9 @@ def test_cc_simple_table(self): raw_html_path = base_dir.joinpath(test_case['input'][7]) base_url = test_case['input'][8] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 - content = html_to_element(parts[1][0]).text_content() + content = parts[1][0].text_content() assert content == r'
Рейтинг:Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 ₽
' def test_cc_complex_table(self): @@ -97,11 +97,11 @@ def test_cc_complex_table(self): raw_html_path = base_dir.joinpath(test_case['input'][8]) base_url = test_case['input'][8] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 - content = html_to_element(parts[1][0]).text_content() + content = parts[1][0].text_content() assert content == r'
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
' - table_type = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + table_type = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] assert table_type.attrib['table_type'] == 'complex' def test_simple_complex_table(self): @@ -110,12 +110,12 @@ def test_simple_complex_table(self): raw_html_path = base_dir.joinpath(test_case['input'][3]) base_url = test_case['input'][1] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - simple_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + simple_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] simple_table_type = simple_table_tag.attrib assert simple_table_type['table_type'] == 'simple' assert simple_table_type == {'table_type': 'simple', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n
12
34
\n\n'} - complex_table_tag = html_to_element(parts[2][0]).xpath(f'.//{CCTag.CC_TABLE}')[0] + complex_table_tag = parts[2][0].xpath(f'.//{CCTag.CC_TABLE}')[0] complex_table_type = complex_table_tag.attrib assert complex_table_type['table_type'] == 'complex' assert complex_table_type == {'table_type': 'complex', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n \n \n \n \n \n
123
4
567
\n '} @@ -127,9 +127,10 @@ def test_table_to_content_list_node_simple(self): base_url = test_case['input'][1] raw_html = raw_html_path.read_text(encoding='utf-8') parsed_content = raw_html - result = self.rec.to_content_list_node(base_url, parsed_content, raw_html) + result = self.rec.to_content_list_node(base_url, html_to_element(parsed_content), raw_html) expect = base_dir.joinpath(test_case['expected'][0]) expect_json = expect.read_text(encoding='utf-8') + print(result) assert result['type'] == json.loads(expect_json)['type'] assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex'] assert result['raw_content'] == json.loads(expect_json)['raw_content'] @@ -142,7 +143,7 @@ def test_table_to_content_list_node_complex(self): raw_html_path = base_dir.joinpath(test_case['input'][5]) expect_path = base_dir.joinpath(test_case['expected'][1]) raw_html = raw_html_path.read_text(encoding='utf-8') - result = self.rec.to_content_list_node(expect_path, raw_html, raw_html) + result = self.rec.to_content_list_node(expect_path, html_to_element(raw_html), raw_html) fr = open(expect_path, 'r', encoding='utf-8') expect_result = json.loads(fr.read()) assert result == expect_result @@ -153,9 +154,11 @@ def test_table_involve_equation(self): raw_html_path = base_dir.joinpath(test_case['input'][9]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') - print(complex_table_tag[0].text) + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') + assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\displaystyle np}$${\displaystyle np(1-p)}$
Geometric distribution${\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}$${\displaystyle {\frac {1}{p}}}$${\displaystyle {\frac {(1-p)}{p^{2}}}}$
Normal distribution${\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}$${\displaystyle \mu }$${\displaystyle \sigma ^{2}}$
Uniform distribution (continuous)${\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}xb\end{cases}}}$${\displaystyle {\frac {a+b}{2}}}$${\displaystyle {\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}$${\displaystyle {\frac {1}{\lambda }}}$${\displaystyle {\frac {1}{\lambda ^{2}}}}$
Poisson distribution${\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}$${\displaystyle \lambda }$${\displaystyle \lambda }$
' + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\displaystyle np}$${\displaystyle np(1-p)}$
Geometric distribution${\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}$${\displaystyle {\frac {1}{p}}}$${\displaystyle {\frac {(1-p)}{p^{2}}}}$
Normal distribution${\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}$${\displaystyle \mu }$${\displaystyle \sigma ^{2}}$
Uniform distribution (continuous)${\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}xb\end{cases}}}$${\displaystyle {\frac {a+b}{2}}}$${\displaystyle {\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}$${\displaystyle {\frac {1}{\lambda }}}$${\displaystyle {\frac {1}{\lambda ^{2}}}}$
Poisson distribution${\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}$${\displaystyle \lambda }$${\displaystyle \lambda }$
' def test_table_involve_after_code(self): @@ -164,8 +167,8 @@ def test_table_involve_after_code(self): raw_html_path = base_dir.joinpath(test_case['input'][10]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - assert html_to_element(parts[0][0]).xpath(f'.//{CCTag.CC_TABLE}')[0].text is None + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + assert parts[0][0].xpath(f'.//{CCTag.CC_TABLE}')[0].text is None @unittest.skip(reason='在code模块解决了table嵌套多行代码问题') def test_table_involve_code(self): @@ -174,8 +177,8 @@ def test_table_involve_code(self): raw_html_path = base_dir.joinpath(test_case['input'][11]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content.strip('\n') @@ -187,8 +190,8 @@ def test_table_involve_complex_code(self): raw_html_path = base_dir.joinpath(test_case['input'][12]) base_url = 'https://en.m.wikipedia.org/wiki/Variance' raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.rec.recognize(base_url, [(raw_html, raw_html)], raw_html) - complex_table_tag = html_to_element(parts[1][0]).xpath(f'.//{CCTag.CC_TABLE}') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') expect_path = 
base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content.strip('\n') diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index a9d368a1..7c85ddd7 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -9,6 +9,7 @@ BaseHTMLElementRecognizer from llm_web_kit.extractor.html.recognizer.text import TextParagraphRecognizer from llm_web_kit.input.datajson import DataJson +from llm_web_kit.libs.html_utils import element_to_html, html_to_element class TestTextParagraphRecognize(unittest.TestCase): @@ -28,8 +29,8 @@ def test_text_1(self): assert self.text_recognize._TextParagraphRecognizer__combine_text('知识乱象\n', '中共中央政治局召开会议审议《成-2020年10月16日新闻联播', 'zh')[:7] == '知识乱象\n中共' - result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) - assert result[909][0][1413:1422] == '知识乱象\\n 中共' + result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) + assert '知识乱象\\n 中共' in element_to_html(result[908][0]) def test_text_2(self): """ @@ -150,8 +151,8 @@ def test_text_7(self): """ with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text7.html', 'r') as file: html_content = file.read() - result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) - assert '1) A man takes 5 hrs and 45 mins to walk to a certain place and ride back' in result[0][0] and BaseHTMLElementRecognizer.is_cc_html(result[0][0]) + result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) + assert '1) A man takes 5 hrs and 45 mins to walk to a certain place and ride back' 
in element_to_html(result[0][0]) and BaseHTMLElementRecognizer.is_cc_html(result[0][0]) def test_text_8(self): """ @@ -162,8 +163,8 @@ def test_text_8(self): """ with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text8.html', 'r') as file: html_content = file.read() - result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) - assert "40xy' -ln(x^8) = 0\\n\\n\\nInitial Condition: y(1)=31" in result[0][0] and BaseHTMLElementRecognizer.is_cc_html(result[0][0]) + result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) + assert "40xy' -ln(x^8) = 0\\n\\n\\nInitial Condition: y(1)=31" in element_to_html(result[0][0]) and BaseHTMLElementRecognizer.is_cc_html(result[0][0]) def test_text_9(self): """ @@ -174,8 +175,8 @@ def test_text_9(self): """ with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text9.html', 'r') as file: html_content = file.read() - result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) - assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in result[50][0] and BaseHTMLElementRecognizer.is_cc_html(result[50][0]) + result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) + assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. 
Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html(result[50][0]) and BaseHTMLElementRecognizer.is_cc_html(result[50][0]) def test_text_10(self): """ diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_title.py b/tests/llm_web_kit/extractor/html/recognizer/test_title.py index d3eedc2d..8cc8eeeb 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_title.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_title.py @@ -4,6 +4,7 @@ import pytest from llm_web_kit.extractor.html.recognizer.title import TitleRecognizer +from llm_web_kit.libs.html_utils import element_to_html @pytest.fixture @@ -17,9 +18,9 @@ def test_title_recognizer(title_recognizer): result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 10 - assert result[0][0] == """大模型好,大模型棒1""" - assert result[6][0] == """大模型好,大模型棒5 大模型很棒""" @@ -27,5 +28,5 @@ def test_title_tails_and_levels(title_recognizer): html_content = """

TEST:import *TEST

Tail

aaa

""" result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 2 - assert result[0][0] == '
TEST: `import *` TEST
' + assert element_to_html(result[0][0]) == '
TEST: `import *` TEST
' pass diff --git a/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py b/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py index da4a4d7e..cd7196c9 100644 --- a/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py +++ b/tests/llm_web_kit/extractor/html/test_ContentListStaticsPostExtractor.py @@ -54,7 +54,7 @@ def setUp(self): } }, { - 'type': 'table', + 'type': 'complex_table', 'raw_content': '', 'content': { 'html': '
12
', @@ -75,5 +75,5 @@ def test_content_list_statics_post_extractor(self): self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('paragraph.text'), 2) self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('paragraph.equation-inline'), 1) self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('equation-interline'), 1) - self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('table'), 1) - self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('table.complex'), 1) + self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('complex_table'), 1) + self.assertEqual(data_json.get(DataJsonKey.METAINFO, {}).get(DataJsonKey.STATICS, {}).get('complex_table.complex'), 1) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 01fb611a..271bcfdd 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -60,7 +60,7 @@ def setUp(self): for line in f: self.data_json.append(json.loads(line.strip())) - assert len(self.data_json) == 21 + assert len(self.data_json) == 24 # Config for HTML extraction self.config = load_pipe_tpl('html-test') @@ -105,13 +105,13 @@ def test_html_pipeline(self): # 然后是simple table html_content = html_content_list[4] - self.assertEqual(html_content['type'], DocElementType.TABLE) + self.assertEqual(html_content['type'], DocElementType.SIMPLE_TABLE) self.assertEqual(html_content['content']['is_complex'], False) assert html_content['content']['html'].startswith('' not in content_txt + assert '' not in content_txt + + def test_para_is_short(self): + """测试para识别后内容太短.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[22] + input_data = DataJson(test_data) + 
result = chain.extract(input_data) + content_txt = result.get_content_list().to_nlp_md() + print('content_txt', content_txt) + assert len(content_txt) == 3983 + def test_xml_tag(self): """测试xml标签.""" chain = ExtractSimpleFactory.create(self.config) self.assertIsNotNone(chain) - test_data = self.data_json[20] + test_data = self.data_json[23] input_data = DataJson(test_data) result = chain.extract(input_data) result_md = result.get_content_list().to_mm_md() diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py index c967e330..413a657a 100644 --- a/tests/llm_web_kit/input/test_datajson.py +++ b/tests/llm_web_kit/input/test_datajson.py @@ -2,7 +2,9 @@ import pytest +from llm_web_kit.exception.exception import ExtractorChainInputException from llm_web_kit.input.datajson import ContentList, DataJson, DataJsonKey +from llm_web_kit.libs.doc_element_type import DocElementType def test_datajson_init(): @@ -98,14 +100,71 @@ def test_datajson_serialization(): def test_datajson_validation(): # Test invalid input type - with pytest.raises(ValueError): + with pytest.raises(ExtractorChainInputException): DataJson([]) # List instead of dict # Test invalid content_list type - with pytest.raises(ValueError): + with pytest.raises(ExtractorChainInputException): DataJson({DataJsonKey.CONTENT_LIST: 'invalid'}) # String instead of list +def test_datajson_exclude_nodes_to_nlp_md(): + data = { + DataJsonKey.DATASET_NAME: 'test_dataset', + DataJsonKey.FILE_FORMAT: 'html', + DataJsonKey.CONTENT_LIST: [[{ + 'type': 'simple_table', + 'raw_content': "
Title: T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors: T.J., Byrne
Fewer, Michael
Keywords: T.J. Byrne
Cottages
Poor Law Commission
Issue Date: 2011
2011
Description: T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI: https://hdl.handle.net/10599/5719
Appears in Collections:Published Items
T.J. Byrne Collection
", + 'content': { + 'html': "
Title:T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors:T.J., Byrne Fewer, Michael
Keywords:T.J. Byrne Cottages Poor Law Commission
Issue Date:2011 2011
Description:T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI:https://hdl.handle.net/10599/5719
Appears in Collections:Published Items T.J. Byrne Collection
", + 'is_complex': False, + 'table_nest_level': '1' + } + }]] + } + datajson = DataJson(data) + md = datajson.get_content_list().to_nlp_md(exclude_nodes=DocElementType.COMPLEX_TABLE) + assert '' not in md + + +def test_datajson_exclude_nodes_to_mmd(): + data = { + DataJsonKey.DATASET_NAME: 'test_dataset', + DataJsonKey.FILE_FORMAT: 'html', + DataJsonKey.CONTENT_LIST: [[{ + 'type': 'simple_table', + 'raw_content': "
Title: T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors: T.J., Byrne
Fewer, Michael
Keywords: T.J. Byrne
Cottages
Poor Law Commission
Issue Date: 2011
2011
Description: T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI: https://hdl.handle.net/10599/5719
Appears in Collections:Published Items
T.J. Byrne Collection
", + 'content': { + 'html': "
Title:T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors:T.J., Byrne Fewer, Michael
Keywords:T.J. Byrne Cottages Poor Law Commission
Issue Date:2011 2011
Description:T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI:https://hdl.handle.net/10599/5719
Appears in Collections:Published Items T.J. Byrne Collection
", + 'is_complex': False, + 'table_nest_level': '1' + } + }, { + 'type': 'complex_table', + 'raw_content': "
Title: T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors: T.J., Byrne
Fewer, Michael
Keywords: T.J. Byrne
Cottages
Poor Law Commission
Issue Date: 2011
2011
Description: T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI: https://hdl.handle.net/10599/5719
Appears in Collections:Published Items
T.J. Byrne Collection
", + 'content': { + 'html': "
Title:T.J. Byrne, Slide of floor plan, Poor Law Commission cottage, 1872.
Authors:T.J., Byrne Fewer, Michael
Keywords:T.J. Byrne Cottages Poor Law Commission
Issue Date:2011 2011
Description:T.J. Byrne's slide of a one storey cottage, labelled 'Mr Barney's Plan', recommended by the Poor Law Commission, 1872.
URI:https://hdl.handle.net/10599/5719
Appears in Collections:Published Items T.J. Byrne Collection
", + 'is_complex': True, + 'table_nest_level': '1' + } + }, { + 'type': 'image', + 'raw_content': "\"Curtindo", + 'content': { + 'url': 'https://naproadavida.com/wp-content/uploads/2020/11/20201024-Airbnb-SP-Consolacao_getaway_manha_Sony-1.jpg', + 'data': None, + 'alt': 'Curtindo o apartamento com piscina no centro de SP. ', + 'title': 'Curtindo o apartamento com piscina no centro de SP. ', + 'caption': None + } + }]] + } + datajson = DataJson(data) + md = datajson.get_content_list().to_mm_md(exclude_nodes=DocElementType.COMPLEX_TABLE) + assert '' not in md + assert 'Curtindo o apartamento com piscina no centro de SP.' in md + + def test_data_json_deepcopy(): """从一个外部dict构建datajson, 改变datajson,不改变外部dict.""" d = {'track_id': '32266dfa-c335-45c5-896e-56f057889d28', @@ -174,7 +233,7 @@ def test_data_json_to_nlp_md(): } }, { - 'type': 'table', + 'type': 'simple_table', 'raw_content': '
\n\t\t\t\tMaandag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDinsdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tWoensdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDonderdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tVrijdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZaterdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZondag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
', 'content': { 'html': '
Maandag-
Dinsdag-
Woensdag-
Donderdag-
Vrijdag-
Zaterdag-
Zondag-
', @@ -205,7 +264,7 @@ def test_default_exclude(): def test_custom_exclude(): datajson = DataJson(d) - md = datajson.get_content_list().to_nlp_md(MM_NODE_LIST=['table']) + md = datajson.get_content_list().to_nlp_md(exclude_nodes=[DocElementType.COMPLEX_TABLE, DocElementType.SIMPLE_TABLE]) assert 'Ziet u iets wat niet hoort of niet klopt?' in md assert 'Openingstijden' in md assert 'Maandag' not in md