From a09ae5dc372a8e06a9d5c8e1f32269d731ad16de Mon Sep 17 00:00:00 2001
From: Robin Wilson <robin@rtwilson.com>
Date: Wed, 22 Nov 2023 16:36:53 +0000
Subject: [PATCH 1/3] Change check_files.py script to get a random 30 char
 substring of the text and check that it exists in the target file. Fixes #546

---
 parser/check_files.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
diff --git a/parser/check_files.py b/parser/check_files.py
index 8c9c5617..b50976e9 100644
--- a/parser/check_files.py
+++ b/parser/check_files.py
@@ -21,6 +21,14 @@ def search_for_strings(html_soup, output):
             search_for_strings(element, output)
 
 
+def random_substring(s, n=30):
+    """Return a random n-character slice of s (all of s if it is shorter than n)."""
+    # Bound the start by n (not a fixed 30) so the slice always fits inside s;
+    # max() keeps randint's range valid when s is shorter than n.
+    start_char = random.randint(0, max(len(s) - n, 0))
+    return s[start_char : start_char + n]
+
+
 def select_random_text_from_file(path, n):
     html = Path(path).read_text()
     html_soup = BeautifulSoup(html, "html.parser")
@@ -30,6 +38,9 @@ def select_random_text_from_file(path, n):
 
     output = list(filter(lambda x: len(x) > 40, output))
 
+    # Get a random substring of each string
+    output = list(map(random_substring, output))
+
     if n == "all":
         return output
     else:

From ac2287e31ad281eb63d0fd64214852b1111357bd Mon Sep 17 00:00:00 2001
From: Robin Wilson <robin@rtwilson.com>
Date: Thu, 23 Nov 2023 09:42:23 +0000
Subject: [PATCH 2/3] Identify where multiple <p>&nbsp;</p> tags have been
 added and give a warning. Also remove the warning for the old way of detecting
 mixed top/non-top content. Fixes #548.

---
 parser/lman_parser.py  | 34 ++++++++++++++++++++++++++++++++++
 parser/parser_utils.py |  6 +++---
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/parser/lman_parser.py b/parser/lman_parser.py
index 5a6ecb02..8265f394 100644
--- a/parser/lman_parser.py
+++ b/parser/lman_parser.py
@@ -8,6 +8,7 @@
 import os
 import subprocess
 from urllib.parse import urlparse
+import bs4
 from bs4 import BeautifulSoup
 from pprint import pprint, pformat
 from html_to_dita import htmlToDITA
@@ -565,6 +566,39 @@ def process_generic_file_pagelayer(self, dita_soup, page, topic_id, filename="")
         # insert rest of converted content
         dita_section.extend(converted_bits)
 
+        def is_empty_p_element(el):
+            if el is None:
+                return False
+            elif el.name == "p" and el.text.strip() == "" and len(el.find_all()) == 0:
+                return True
+            else:
+                return False
+
+        def next_sibling_tag(el):
+            next_sib = el.next_sibling
+            while type(next_sib) is bs4.element.NavigableString:
+                next_sib = next_sib.next_sibling
+
+            return next_sib
+
+        # Check for repeated <p>&nbsp;</p> elements
+        p_elements = page.find_all("p")
+        empty_p_elements = list(filter(is_empty_p_element, p_elements))
+
+        found = False
+        for el in empty_p_elements:
+            count = 0
+            while is_empty_p_element(next_sibling_tag(el)):
+                count += 1
+                if count >= 4:
+                    found = True
+                    break
+            if found:
+                logging.warning(
+                    f"Found string of repeated <p>&nbsp;</p> elements in div with ID {page.get('id')} in file {filename}"
+                )
+                break
+
         return dita_section
 
     def find_first_page_layer(self, top_to_div_mapping, html_soup):
diff --git a/parser/parser_utils.py b/parser/parser_utils.py
index 8f473226..cc445422 100644
--- a/parser/parser_utils.py
+++ b/parser/parser_utils.py
@@ -191,9 +191,9 @@ def generate_top_to_div_mapping(
     # exited in an earlier if statement), so we check if there are some elements without top values
     # and raise a warning if so
     if len(elements_without_top_value) > 0 and len(html_soup.find_all(recursive=False)) > 1:
-        logging.warning(
-            f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}"
-        )
+        # logging.warning(
+        #     f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}"
+        # )
         return [(0, html_soup)]
 
     return top_to_div_mapping

From 89ef3cb9d0768e1c3350631b3ed6b0a67c8404e3 Mon Sep 17 00:00:00 2001
From: Robin Wilson <robin@rtwilson.com>
Date: Thu, 23 Nov 2023 10:05:59 +0000
Subject: [PATCH 3/3] Fix checking for multiple empty <p> tags, as we were
 getting false positives.

---
 parser/lman_parser.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/parser/lman_parser.py b/parser/lman_parser.py
index 8265f394..5dbd44ab 100644
--- a/parser/lman_parser.py
+++ b/parser/lman_parser.py
@@ -574,25 +574,24 @@ def is_empty_p_element(el):
             else:
                 return False
 
-        def next_sibling_tag(el):
-            next_sib = el.next_sibling
-            while type(next_sib) is bs4.element.NavigableString:
-                next_sib = next_sib.next_sibling
-
-            return next_sib
-
         # Check for repeated <p>&nbsp;</p> elements
         p_elements = page.find_all("p")
         empty_p_elements = list(filter(is_empty_p_element, p_elements))
-
         found = False
         for el in empty_p_elements:
             count = 0
-            while is_empty_p_element(next_sibling_tag(el)):
-                count += 1
+            for next_sibling in el.next_siblings:
+                if next_sibling is None:
+                    break
+                elif type(next_sibling) is bs4.element.NavigableString:
+                    continue
+                elif is_empty_p_element(next_sibling):
+                    count += 1
+
                 if count >= 4:
                     found = True
                     break
+
             if found:
                 logging.warning(
                     f"Found string of repeated <p>&nbsp;</p> elements in div with ID {page.get('id')} in file {filename}"