diff --git a/parser/check_files.py b/parser/check_files.py
index 8c9c5617..b50976e9 100644
--- a/parser/check_files.py
+++ b/parser/check_files.py
@@ -21,6 +21,17 @@ def search_for_strings(html_soup, output):
             search_for_strings(element, output)
 
 
+def random_substring(s, n=30):
+    """Return a random substring of s of length n (or all of s if len(s) <= n)."""
+    length = len(s)
+
+    # Use n (not a hard-coded 30) so the start range matches the slice length,
+    # and clamp at 0 so randint never gets an empty range for short strings.
+    start_char = random.randint(0, max(0, length - n))
+
+    return s[start_char : start_char + n]
+
+
 def select_random_text_from_file(path, n):
     html = Path(path).read_text()
     html_soup = BeautifulSoup(html, "html.parser")
@@ -30,6 +41,9 @@ def select_random_text_from_file(path, n):
     output = list(filter(lambda x: len(x) > 40, output))
 
+    # Get a random substring of each string
+    output = list(map(random_substring, output))
+
     if n == "all":
         return output
     else:
diff --git a/parser/lman_parser.py b/parser/lman_parser.py
index 5a6ecb02..5dbd44ab 100644
--- a/parser/lman_parser.py
+++ b/parser/lman_parser.py
@@ -8,6 +8,7 @@ import os
 import subprocess
 from urllib.parse import urlparse
 
+import bs4
 from bs4 import BeautifulSoup
 from pprint import pprint, pformat
 from html_to_dita import htmlToDITA
@@ -565,6 +566,38 @@ def process_generic_file_pagelayer(self, dita_soup, page, topic_id, filename="")
         # insert rest of converted content
         dita_section.extend(converted_bits)
 
+        def is_empty_p_element(el):
+            # True only for a <p> with no text content and no child elements.
+            if el is None:
+                return False
+            elif el.name == "p" and el.text.strip() == "" and len(el.find_all()) == 0:
+                return True
+            else:
+                return False
+
+        # Check for runs of repeated empty <p> elements
+        p_elements = page.find_all("p")
+        empty_p_elements = list(filter(is_empty_p_element, p_elements))
+        found = False
+        for el in empty_p_elements:
+            count = 0
+            for next_sibling in el.next_siblings:
+                if next_sibling is None:
+                    break
+                elif type(next_sibling) is bs4.element.NavigableString:
+                    continue
+                elif is_empty_p_element(next_sibling):
+                    count += 1
+
+                if count >= 4:
+                    found = True
+                    break
+
+            if found:
+                logging.warning(
+                    f"Found string of repeated empty <p> elements in div with ID {page.get('id')} in file (unknown)"
+                )
+                break
+
         return dita_section
 
     def find_first_page_layer(self, top_to_div_mapping, html_soup):
diff --git a/parser/parser_utils.py b/parser/parser_utils.py
index 8f473226..cc445422 100644
--- a/parser/parser_utils.py
+++ b/parser/parser_utils.py
@@ -191,9 +191,10 @@ def generate_top_to_div_mapping(
     # exited in an earlier if statement), so we check if there are some elements without top values
     # and raise a warning if so
     if len(elements_without_top_value) > 0 and len(html_soup.find_all(recursive=False)) > 1:
-        logging.warning(
-            f"Elements with no top value found inside element with ID {html_soup.get('id')} in file (unknown)"
-        )
+        # Downgraded from warning to debug: fires routinely and is only useful when debugging.
+        logging.debug(
+            f"Elements with no top value found inside element with ID {html_soup.get('id')} in file (unknown)"
+        )
         return [(0, html_soup)]
 
     return top_to_div_mapping