Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions llm_web_kit/extractor/html/recognizer/constant.py

This file was deleted.

8 changes: 3 additions & 5 deletions llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType
from llm_web_kit.libs.html_utils import element_to_html

from .constant import LINE_BREAK_UNIX, LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR

special_symbols = [ # TODO 从文件读取
'®', # 注册商标符号
'™', # 商标符号
Expand Down Expand Up @@ -114,11 +112,11 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
text2 = text2.strip(' ') if text2 else ''
if lang == 'zh':
txt = text1 + text2
return txt.strip().replace(LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR).replace(LINE_BREAK_UNIX, PARAGRAPH_SEPARATOR)
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')
else:
words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols else ' '
txt = text1 + words_sep + text2
return txt.strip().replace(LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR).replace(LINE_BREAK_UNIX, PARAGRAPH_SEPARATOR)
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')

def __get_paragraph_text(self, root: HtmlElement) -> List[dict]:
"""
Expand Down Expand Up @@ -147,7 +145,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
text = ''
para_text.append({'c': el.text, 't': ParagraphTextType.CODE_INLINE})
elif el.tag in ['br']:
text += PARAGRAPH_SEPARATOR
text += '\n'
else:
if el.text and el.text.strip():
text = self.__combine_text(text, el.text.strip())
Expand Down
Loading