diff --git a/llm_web_kit/extractor/html/recognizer/image.py b/llm_web_kit/extractor/html/recognizer/image.py
index 30f8241d..53f612dc 100644
--- a/llm_web_kit/extractor/html/recognizer/image.py
+++ b/llm_web_kit/extractor/html/recognizer/image.py
@@ -157,8 +157,7 @@ def __parse_img_elements(self, base_url: str, img_elements: HtmlElement, html_ob
                 'html': raw_img_html,  # 保留原始 <img> 标签作为属性值
                 'format': 'url',  # 指定图片格式，url|base
             }
-            if elem.text and elem.text.strip():
-                attributes['caption'] = elem.text.strip()
+            attributes['caption'] = elem.xpath('normalize-space()')
             if tag in ['embed', 'object', 'iframe', 'video', 'audio', 'canvas']:
                 if not [img_elem for img_elem in self.IMG_LABEL if
                         img_elem in raw_img_html.lower()]:
diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
index 3ded4c2b..bf970f93 100644
--- a/llm_web_kit/input/datajson.py
+++ b/llm_web_kit/input/datajson.py
@@ -298,7 +298,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
             else:
                 image_caption = ''
 
-            image_des = image_title if image_title else image_caption if image_caption else ''
+            image_des = image_title if image_title else ''
             # 优先使用data, 其次path.其中data是base64编码的图片，path是图片的url
             if image_data:
                 if image_des:
@@ -310,7 +310,13 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
                     image = f'![{image_alt}]({image_path} "{image_des}")'
                 else:
                     image = f'![{image_alt}]({image_path})'
-            return image
+
+            if image_caption:
+                image_with_caption = f'{image}\n\n{image_caption}'
+            else:
+                image_with_caption = image
+
+            return image_with_caption
         elif node_type == DocElementType.AUDIO:
             return ''  # TODO: 音频格式
         elif node_type == DocElementType.VIDEO:
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_image.py b/tests/llm_web_kit/extractor/html/recognizer/test_image.py
index 73a4c3a0..6396c6d1 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_image.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_image.py
@@ -350,3 +350,36 @@ def test_complex_heading_image_removal(self):
             img_in_p.extend(p.xpath('.//img'))
 
         self.assertEqual(len(img_in_p), 0, '段落中不应该有img标签')
+
+    def test_image_caption(self):  # <figure> with an <img> and a nested <figcaption>: the caption attribute must carry the figcaption text
+        complex_html = """
+        <figure class="wp-block-image size-full" data-anno-uid="anno-uid-q4azamuwlp"><img alt=""
+                                                                                          cc-select="true"
+                                                                                          class="wp-image-163239 mark-selected"
+                                                                                          data-anno-uid="anno-uid-trx696xmwg"
+                                                                                          decoding="async"
+                                                                                          height="899"
+                                                                                          loading="lazy"
+                                                                                          sizes="(max-width: 900px) 100vw, 730px"
+                                                                                          src="https://www.ask.com/wp-content/uploads/sites/3/2022/09/87f86f75062a6a7084c4d95d06e502ea.png"
+                                                                                          srcset="https://www.ask.com/wp-content/uploads/sites/3/2022/09/87f86f75062a6a7084c4d95d06e502ea.png?resize=900,505 900w, https://www.ask.com/wp-content/uploads/sites/3/2022/09/87f86f75062a6a7084c4d95d06e502ea.png?resize=730,410 730w, https://www.ask.com/wp-content/uploads/sites/3/2022/09/87f86f75062a6a7084c4d95d06e502ea.png?resize=500,280 500w, https://www.ask.com/wp-content/uploads/sites/3/2022/09/87f86f75062a6a7084c4d95d06e502ea.png?resize=370,207 370w"
+                                                                                          style=""
+                                                                                          width="1600">
+            <figcaption data-anno-uid="anno-uid-6qfcte0y2eh">
+                <marked-text cc-select="true" class="mark-selected"
+                             data-anno-uid="anno-uid-5ssd5opgc1e">Roger Moore in
+                </marked-text>
+                <em cc-select="true" class="mark-selected" data-anno-uid="anno-uid-lp7dtyaq7gl"> For
+                    Your Eyes Only</em>
+                <marked-tail cc-select="true" class="mark-selected"
+                             data-anno-uid="anno-uid-1qugevcscpa" style="">. Photo Courtesy: United
+                    Artists/Everett Collection
+                </marked-tail>
+            </figcaption>
+        </figure>
+        """
+        element = html_to_element(complex_html)  # parse the fixture into an element tree
+        base_url = 'http://example.com'
+        parts = self.img_recognizer.recognize(base_url, [(element, element)], complex_html)  # same element passed as both tuple members
+        html = element_to_html(parts[0][0])  # serialize the first member of the first returned part
+        self.assertIn('caption="Roger Moore in For Your Eyes Only . Photo Courtesy: United Artists/Everett Collection', html)
diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py
index a5f68540..5255efe9 100644
--- a/tests/llm_web_kit/extractor/test_extractor_chain.py
+++ b/tests/llm_web_kit/extractor/test_extractor_chain.py
@@ -105,7 +105,7 @@ def test_html_pipeline(self):
         self.assertEqual(html_content['content']['title'], 'image-title')
         self.assertEqual(html_content['content']['alt'], 'image-alt')
         self.assertEqual(html_content['content']['url'], 'https://www.test.com/test.png')
-        self.assertEqual(html_content['content']['caption'], None)
+        self.assertEqual(html_content['content']['caption'], '')
 
         # 然后是simple table
         html_content = html_content_list[4]