Skip to content
This repository was archived by the owner on Apr 17, 2024. It is now read-only.
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions parser/check_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ def search_for_strings(html_soup, output):
search_for_strings(element, output)


def random_substring(s, n=30):
    """Return a randomly positioned substring of ``s`` that is ``n`` characters long.

    Args:
        s: The string to sample from.
        n: Desired length of the substring (defaults to 30).

    Returns:
        A contiguous slice of ``s`` of length ``n``. If ``s`` has ``n`` or
        fewer characters the whole string is returned, and a non-positive
        ``n`` yields an empty string.
    """
    if n <= 0:
        return ""

    length = len(s)
    if length <= n:
        # Nothing to choose from: the only substring that fits is s itself.
        # (Also avoids random.randint raising ValueError on an empty range.)
        return s

    # The last valid start index depends on the requested length n, not on a
    # fixed 30, otherwise slices near the end of s come back shorter than n.
    start_char = random.randint(0, length - n)

    return s[start_char : start_char + n]


def select_random_text_from_file(path, n):
html = Path(path).read_text()
html_soup = BeautifulSoup(html, "html.parser")
Expand All @@ -30,6 +38,9 @@ def select_random_text_from_file(path, n):

output = list(filter(lambda x: len(x) > 40, output))

# Get a random substring of each string
output = list(map(random_substring, output))

if n == "all":
return output
else:
Expand Down
33 changes: 33 additions & 0 deletions parser/lman_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import os
import subprocess
from urllib.parse import urlparse
import bs4
from bs4 import BeautifulSoup
from pprint import pprint, pformat
from html_to_dita import htmlToDITA
Expand Down Expand Up @@ -565,6 +566,38 @@ def process_generic_file_pagelayer(self, dita_soup, page, topic_id, filename="")
# insert rest of converted content
dita_section.extend(converted_bits)

def is_empty_p_element(el):
    """Tell whether ``el`` is a ``<p>`` element with no visible content.

    An element qualifies when its tag name is ``p``, its text is empty once
    surrounding whitespace is stripped, and ``find_all()`` reports no
    descendant elements. ``None`` never qualifies.
    """
    if el is None:
        return False

    # Short-circuit order matters: text and children are only inspected
    # once the tag name has been confirmed to be "p".
    return el.name == "p" and el.text.strip() == "" and len(el.find_all()) == 0

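# Check for repeated <p>&nbsp;</p> elements: flag an empty <p> that is followed by
# at least four more empty <p> siblings (text nodes and other tags in between do not reset the count)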
p_elements = page.find_all("p")
empty_p_elements = list(filter(is_empty_p_element, p_elements))
found = False
for el in empty_p_elements:
count = 0
for next_sibling in el.next_siblings:
if next_sibling is None:
break
elif type(next_sibling) is bs4.element.NavigableString:
continue
elif is_empty_p_element(next_sibling):
count += 1

if count >= 4:
found = True
break

if found:
logging.warning(
f"Found string of repeated <p>&nbsp;</p> elements in div with ID {page.get('id')} in file {filename}"
)
break

return dita_section

def find_first_page_layer(self, top_to_div_mapping, html_soup):
Expand Down
6 changes: 3 additions & 3 deletions parser/parser_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,9 @@ def generate_top_to_div_mapping(
# exited in an earlier if statement), so we check if there are some elements without top values
# and raise a warning if so
if len(elements_without_top_value) > 0 and len(html_soup.find_all(recursive=False)) > 1:
logging.warning(
f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}"
)
# logging.warning(
# f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}"
# )
return [(0, html_soup)]

return top_to_div_mapping
Expand Down