Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions llm_web_kit/extractor/html/recognizer/text.py
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')` 原来的text1内容是带转义的吗,这里直接换成不转义的字符`\n`吗?

Copy link
Copy Markdown
Contributor Author

@LollipopsAndWine LollipopsAndWine Mar 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里把文本中的双斜杠`\\n`换成单斜杠`\n`是因为要保留换行,只有单斜杠`\n`才能生成有换行的文本,然后生成markdown时有换行效果

Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,15 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
text2: str: 第二段文本
lang: str: 语言 TODO 实现根据语言连接文本的不同方式, 还有就是一些特殊符号开头的连接不加空格。
"""
text1 = text1.strip() if text1 else ''
text2 = text2.strip() if text2 else ''
text1 = text1.strip(' ') if text1 else ''
text2 = text2.strip(' ') if text2 else ''
if lang == 'zh':
return text1.strip() + text2.strip()
txt = text1 + text2
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')
else:
words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols else ' '
txt = text1 + words_sep + text2
return txt.strip()
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')

def __get_paragraph_text(self, root: HtmlElement) -> List[dict]:
"""
Expand All @@ -140,13 +141,15 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
para_text.append({'c':el.text, 't':ParagraphTextType.EQUATION_INLINE})
elif el.tag == CCTag.CC_CODE_INLINE:
if text:
para_text.append({'c':text, 't':ParagraphTextType.TEXT})
para_text.append({'c': text, 't': ParagraphTextType.TEXT})
text = ''
para_text.append({'c':el.text, 't':ParagraphTextType.CODE_INLINE})
para_text.append({'c': el.text, 't': ParagraphTextType.CODE_INLINE})
elif el.tag in ['br']:
text += '\n'
else:
if el.text and el.text.strip():
text = self.__combine_text(text, el.text.strip())
for child in el.getchildren():
for child in el:
text = __get_paragraph_text_recusive(child, text)

if el.tail and el.tail.strip():
Expand Down
Loading