Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,15 @@
text2: str: 第二段文本
lang: str: 语言 TODO 实现根据语言连接文本的不同方式, 还有就是一些特殊符号开头的连接不加空格。
"""
text1 = text1.strip() if text1 else ''
text2 = text2.strip() if text2 else ''
text1 = text1.strip(' ') if text1 else ''
text2 = text2.strip(' ') if text2 else ''
if lang == 'zh':
return text1.strip() + text2.strip()
txt = text1 + text2
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')

Check warning on line 115 in llm_web_kit/extractor/html/recognizer/text.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/extractor/html/recognizer/text.py#L114-L115

Added lines #L114 - L115 were not covered by tests
else:
words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols else ' '
txt = text1 + words_sep + text2
return txt.strip()
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')

def __get_paragraph_text(self, root: HtmlElement) -> List[dict]:
"""
Expand All @@ -140,13 +141,15 @@
para_text.append({'c':el.text, 't':ParagraphTextType.EQUATION_INLINE})
elif el.tag == CCTag.CC_CODE_INLINE:
if text:
para_text.append({'c':text, 't':ParagraphTextType.TEXT})
para_text.append({'c': text, 't': ParagraphTextType.TEXT})
text = ''
para_text.append({'c':el.text, 't':ParagraphTextType.CODE_INLINE})
para_text.append({'c': el.text, 't': ParagraphTextType.CODE_INLINE})
elif el.tag in ['br']:
text += '\n'
else:
if el.text and el.text.strip():
text = self.__combine_text(text, el.text.strip())
for child in el.getchildren():
for child in el:
text = __get_paragraph_text_recusive(child, text)

if el.tail and el.tail.strip():
Expand Down
Loading