DeepBlueCLtd · robintw · Nov 22, 2023 · Nov 23, 2023 · Nov 23, 2023
diff --git a/parser/check_files.py b/parser/check_files.py
@@ -21,6 +21,14 @@ def search_for_strings(html_soup, output):
             search_for_strings(element, output)
 
 
+def random_substring(s, n=30):
+    """Return a random n-character slice of s (all of s if it is shorter)."""
+    # Bound the start by n, not a hard-coded 30, so every slice has n characters;
+    # the max() clamp stops randint raising ValueError when s is shorter than n.
+    start_char = random.randint(0, max(len(s) - n, 0))
+    return s[start_char : start_char + n]
+
+
 def select_random_text_from_file(path, n):
     html = Path(path).read_text()
     html_soup = BeautifulSoup(html, "html.parser")
@@ -30,6 +38,9 @@ def select_random_text_from_file(path, n):
 
     output = list(filter(lambda x: len(x) > 40, output))
 
+    # Get a random substring of each string
+    output = list(map(random_substring, output))
+
     if n == "all":
         return output
     else:

diff --git a/parser/lman_parser.py b/parser/lman_parser.py
@@ -8,6 +8,7 @@
 import os
 import subprocess
 from urllib.parse import urlparse
+import bs4
 from bs4 import BeautifulSoup
 from pprint import pprint, pformat
 from html_to_dita import htmlToDITA
@@ -565,6 +566,38 @@ def process_generic_file_pagelayer(self, dita_soup, page, topic_id, filename="")
         # insert rest of converted content
         dita_section.extend(converted_bits)
 
+        def is_empty_p_element(el):
+            """Return True for a <p> tag with only whitespace text and no child tags."""
+            if el is None:
+                return False
+            if el.name != "p" or el.text.strip() != "":
+                return False
+            return len(el.find_all()) == 0
+
+        # Check for repeated <p>&nbsp;</p> elements
+        p_elements = page.find_all("p")
+        empty_p_elements = list(filter(is_empty_p_element, p_elements))
+        found = False
+        for el in empty_p_elements:
+            count = 0
+            for next_sibling in el.next_siblings:
+                if next_sibling is None:
+                    break
+                elif type(next_sibling) is bs4.element.NavigableString:
+                    continue
+                elif is_empty_p_element(next_sibling):
+                    count += 1
+
+                if count >= 4:
+                    found = True
+                    break
+
+            if found:
+                logging.warning(
+                    f"Found string of repeated <p>&nbsp;</p> elements in div with ID {page.get('id')} in file {filename}"
+                )
+                break
+
         return dita_section
 
     def find_first_page_layer(self, top_to_div_mapping, html_soup):

diff --git a/parser/parser_utils.py b/parser/parser_utils.py
@@ -191,9 +191,9 @@ def generate_top_to_div_mapping(
     # exited in an earlier if statement), so we check if there are some elements without top values
     # and raise a warning if so
     if len(elements_without_top_value) > 0 and len(html_soup.find_all(recursive=False)) > 1:
-        logging.warning(
-            f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}"
-        )
+        # logging.warning(
+        #     f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}"
+        # )
         return [(0, html_soup)]
 
     return top_to_div_mapping