From a09ae5dc372a8e06a9d5c8e1f32269d731ad16de Mon Sep 17 00:00:00 2001 From: Robin Wilson Date: Wed, 22 Nov 2023 16:36:53 +0000 Subject: [PATCH 1/3] Change check_files.py script to get a random 30 char substring of the text and check that exists in the target file. Fixes #546 --- parser/check_files.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/parser/check_files.py b/parser/check_files.py index 8c9c5617..b50976e9 100644 --- a/parser/check_files.py +++ b/parser/check_files.py @@ -21,6 +21,14 @@ def search_for_strings(html_soup, output): search_for_strings(element, output) +def random_substring(s, n=30): + length = len(s) + + start_char = random.randint(0, length - 30) + + return s[start_char : start_char + n] + + def select_random_text_from_file(path, n): html = Path(path).read_text() html_soup = BeautifulSoup(html, "html.parser") @@ -30,6 +38,9 @@ def select_random_text_from_file(path, n): output = list(filter(lambda x: len(x) > 40, output)) + # Get a random substring of each string + output = list(map(random_substring, output)) + if n == "all": return output else: From ac2287e31ad281eb63d0fd64214852b1111357bd Mon Sep 17 00:00:00 2001 From: Robin Wilson Date: Thu, 23 Nov 2023 09:42:23 +0000 Subject: [PATCH 2/3] Identify where multiple <p>&nbsp;</p>
tags have been added and give warning. Also remove warning for old way of detecting mixed top/non-top content. Fixes #548. --- parser/lman_parser.py | 34 ++++++++++++++++++++++++++++++++++ parser/parser_utils.py | 6 +++--- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/parser/lman_parser.py b/parser/lman_parser.py index 5a6ecb02..8265f394 100644 --- a/parser/lman_parser.py +++ b/parser/lman_parser.py @@ -8,6 +8,7 @@ import os import subprocess from urllib.parse import urlparse +import bs4 from bs4 import BeautifulSoup from pprint import pprint, pformat from html_to_dita import htmlToDITA @@ -565,6 +566,39 @@ def process_generic_file_pagelayer(self, dita_soup, page, topic_id, filename="") # insert rest of converted content dita_section.extend(converted_bits) + def is_empty_p_element(el): + if el is None: + return False + elif el.name == "p" and el.text.strip() == "" and len(el.find_all()) == 0: + return True + else: + return False + + def next_sibling_tag(el): + next_sib = el.next_sibling + while type(next_sib) is bs4.element.NavigableString: + next_sib = next_sib.next_sibling + + return next_sib + + # Check for repeated <p>&nbsp;</p>
elements + p_elements = page.find_all("p") + empty_p_elements = list(filter(is_empty_p_element, p_elements)) + + found = False + for el in empty_p_elements: + count = 0 + while is_empty_p_element(next_sibling_tag(el)): + count += 1 + if count >= 4: + found = True + break + if found: + logging.warning( + f"Found string of repeated <p>&nbsp;</p>
elements in div with ID {page.get('id')} in file {filename}" + ) + break + return dita_section def find_first_page_layer(self, top_to_div_mapping, html_soup): diff --git a/parser/parser_utils.py b/parser/parser_utils.py index 8f473226..cc445422 100644 --- a/parser/parser_utils.py +++ b/parser/parser_utils.py @@ -191,9 +191,9 @@ def generate_top_to_div_mapping( # exited in an earlier if statement), so we check if there are some elements without top values # and raise a warning if so if len(elements_without_top_value) > 0 and len(html_soup.find_all(recursive=False)) > 1: - logging.warning( - f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}" - ) + # logging.warning( + # f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}" + # ) return [(0, html_soup)] return top_to_div_mapping From 89ef3cb9d0768e1c3350631b3ed6b0a67c8404e3 Mon Sep 17 00:00:00 2001 From: Robin Wilson Date: Thu, 23 Nov 2023 10:05:59 +0000 Subject: [PATCH 3/3] Fix checking for multiple empty <p>
tags, as we were getting false positives. --- parser/lman_parser.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/parser/lman_parser.py b/parser/lman_parser.py index 8265f394..5dbd44ab 100644 --- a/parser/lman_parser.py +++ b/parser/lman_parser.py @@ -574,25 +574,24 @@ def is_empty_p_element(el): else: return False - def next_sibling_tag(el): - next_sib = el.next_sibling - while type(next_sib) is bs4.element.NavigableString: - next_sib = next_sib.next_sibling - - return next_sib - # Check for repeated <p>&nbsp;</p>
elements p_elements = page.find_all("p") empty_p_elements = list(filter(is_empty_p_element, p_elements)) - found = False for el in empty_p_elements: count = 0 - while is_empty_p_element(next_sibling_tag(el)): - count += 1 + for next_sibling in el.next_siblings: + if next_sibling is None: + break + elif type(next_sibling) is bs4.element.NavigableString: + continue + elif is_empty_p_element(next_sibling): + count += 1 + if count >= 4: found = True break + if found: logging.warning( f"Found string of repeated <p>&nbsp;</p>
elements in div with ID {page.get('id')} in file {filename}"