"""Progressive lint for XML-family files.

Only lines changed relative to a git base ref are touched: stray
Windows-1252 bytes are replaced, and each changed line is re-indented
and re-wrapped according to its XML nesting depth.  Lines inside
<programlisting> elements and all unchanged lines are written back
byte-for-byte.
"""

import subprocess
import sys
import os
import xml.sax
import textwrap

# --- Configuration ---
CONFIG = {
    "indent_char": " ",        # Use " " for spaces, "\t" for tabs
    "indent_size": 4,          # How many spaces per level?
    "max_line_length": 120,    # Maximum characters per line
    "extensions": ('.xml', '.xsd', '.wsdl'),
    # Map of { Bad_Byte : Replacement_Bytes }.
    # NOTE(review): these raw bytes also occur as UTF-8 continuation bytes
    # (e.g. em dash U+2014 is b'\xe2\x80\x94'), so blind replacement can
    # corrupt well-formed UTF-8 on changed lines; the UnicodeDecodeError
    # fallback below then restores the original bytes, but consider trying
    # a UTF-8 decode before substituting -- TODO confirm intent.
    "replacements": {
        b'\x85': b'...',           # Horizontal ellipsis
        b'\x91': b"'",             # Left single quote
        b'\x92': b"'",             # Right single quote
        b'\x93': b'"',             # Left double quote
        b'\x94': b'"',             # Right double quote
        b'\x95': b'*',             # Bullet
        b'\x96': b'-',             # En dash
        b'\x97': b'-',             # Em dash
        b'\x84': b'"',             # Low double quote
        b'\x99': b'\xe2\x84\xa2',  # Trademark (TM)
    }
}


class DepthMapHandler(xml.sax.ContentHandler):
    """SAX handler that records the nesting depth of each source line and
    which lines fall inside <programlisting> elements (protected from
    reformatting)."""

    def __init__(self):
        self.line_depths = {}         # {1-based line: depth of first event seen there}
        self.protected_lines = set()  # lines inside <programlisting>...</programlisting>
        self.current_depth = 0
        self.locator = None
        self.in_programlisting = False
        self.programlisting_start = -1

    def setDocumentLocator(self, locator):
        self.locator = locator

    def startElement(self, name, attrs):
        line_num = self.locator.getLineNumber()
        # Only the first event on a line defines that line's depth.
        if line_num not in self.line_depths:
            self.line_depths[line_num] = self.current_depth
        self.current_depth += 1

        if name == 'programlisting':
            self.in_programlisting = True
            self.programlisting_start = line_num

    def endElement(self, name):
        self.current_depth -= 1
        line_num = self.locator.getLineNumber()
        if line_num not in self.line_depths:
            self.line_depths[line_num] = self.current_depth

        if name == 'programlisting' and self.in_programlisting:
            self.in_programlisting = False
            self.protected_lines.update(range(self.programlisting_start, line_num + 1))

    def characters(self, content):
        # Whitespace-only character data does not pin a depth to a line.
        if content.strip():
            line_num = self.locator.getLineNumber()
            if line_num not in self.line_depths:
                self.line_depths[line_num] = self.current_depth


def get_changed_lines(file_path, base_ref):
    """Return the set of 1-based line numbers of *file_path* changed
    relative to the merge base with *base_ref*.

    Uses git's triple-dot syntax so the comparison is against the point
    where the branch diverged from base_ref; changes that landed on the
    base afterwards are not reported as ours.
    """
    cmd = ["git", "diff", "-U0", f"{base_ref}...", "--", file_path]
    try:
        # errors='replace' prevents a crash when diffing files whose bad
        # encoding is exactly what this script is trying to fix.
        output = subprocess.check_output(cmd).decode("utf-8", errors="replace")
    except subprocess.CalledProcessError:
        return set()

    changed_lines = set()
    for line in output.splitlines():
        if line.startswith("@@"):
            # Hunk header: @@ -old_start,old_count +new_start,new_count @@
            try:
                plus_part = line.split("+")[1].split()[0]
                if "," in plus_part:
                    start, count = map(int, plus_part.split(","))
                else:
                    start, count = int(plus_part), 1

                if count > 0:
                    changed_lines.update(range(start, start + count))
            except (IndexError, ValueError):
                continue
    return changed_lines


def smart_wrap(content, indent_str, max_len):
    """Collapse runs of whitespace in *content* and wrap it with
    *indent_str* so no produced line exceeds *max_len* (unless a single
    word is itself longer).  The result always ends with a newline."""
    clean_content = ' '.join(content.split())

    full_line = indent_str + clean_content
    if len(full_line) <= max_len:
        return full_line + "\n"

    wrapped_lines = textwrap.wrap(
        clean_content,
        width=max_len,
        initial_indent=indent_str,
        subsequent_indent=indent_str,
        break_long_words=False,
        break_on_hyphens=False
    )
    return "\n".join(wrapped_lines) + "\n"


def process_file(file_path, base_ref):
    """Clean and reformat the changed lines of one file, in place."""
    # 1. Get changed lines (from the dirty file on disk, vs the merge base).
    changed_lines = get_changed_lines(file_path, base_ref)
    if not changed_lines:
        return

    # 2. Read file as BINARY so the byte replacements are applied safely.
    with open(file_path, "rb") as f:
        original_lines_bytes = f.readlines()

    # 3. Clean-in-memory copy: byte replacements on changed lines only.
    cleaned_lines_bytes = list(original_lines_bytes)
    for i in range(len(cleaned_lines_bytes)):
        line_num = i + 1
        if line_num in changed_lines:
            current_line = cleaned_lines_bytes[i]
            for bad_byte, replacement in CONFIG["replacements"].items():
                current_line = current_line.replace(bad_byte, replacement)
            cleaned_lines_bytes[i] = current_line

    # 4. Parse the cleaned content for the line -> depth map.  Parsing the
    # raw file could crash the parser on the very bytes replaced above.
    full_clean_content = b"".join(cleaned_lines_bytes)

    handler = DepthMapHandler()
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    try:
        xml.sax.parseString(full_clean_content, handler)
    except xml.sax.SAXException:
        print(f"Skipping {file_path}: Malformed XML.")
        return

    # 5. Format and reconstruct.
    final_output_lines = []
    for i, line_bytes in enumerate(cleaned_lines_bytes):
        line_num = i + 1

        # Protected (<programlisting>): keep the ORIGINAL bytes untouched
        # (not the cleaned buffer -- code samples must not be altered).
        if line_num in handler.protected_lines:
            final_output_lines.append(original_lines_bytes[i])
            continue

        # Changed and depth known: decode -> re-indent/re-wrap -> encode.
        if line_num in changed_lines and line_num in handler.line_depths:
            try:
                line_text = line_bytes.decode('utf-8')
            except UnicodeDecodeError:
                # Still undecodable after cleaning: keep original bytes.
                final_output_lines.append(original_lines_bytes[i])
                continue

            depth = handler.line_depths[line_num]
            correct_indent = CONFIG["indent_char"] * (CONFIG["indent_size"] * depth)
            formatted_text = smart_wrap(line_text, correct_indent, CONFIG["max_line_length"])
            final_output_lines.append(formatted_text.encode('utf-8'))
        elif line_num in changed_lines:
            # Changed but the parser assigned no depth (blank line, or the
            # interior of a multi-line construct): keep the byte-cleaned
            # content so the replacements from step 3 are not silently lost.
            final_output_lines.append(cleaned_lines_bytes[i])
        else:
            # Unchanged: original bytes from disk, byte-for-byte.
            final_output_lines.append(original_lines_bytes[i])

    # 6. Write back as BINARY.
    with open(file_path, "wb") as f:
        f.writelines(final_output_lines)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python progressive_xml_lint.py <base_ref>")
        sys.exit(1)

    base_ref = sys.argv[1]

    try:
        # Triple dots: only files changed since the merge base with base_ref.
        diff_cmd = ["git", "diff", "--name-only", f"{base_ref}..."]
        files = subprocess.check_output(diff_cmd).decode("utf-8").splitlines()

        target_files = [
            f for f in files
            if f.lower().endswith(CONFIG["extensions"]) and os.path.exists(f)
        ]

        for file_path in target_files:
            process_file(file_path, base_ref)

    except Exception as e:
        # Top-level boundary: report and signal failure to CI.
        print(f"Error: {e}")
        sys.exit(1)
name: source-lint-and-check
run-name: "Source lint and check"

on:
  push:
    paths:
      - '**.wsdl'
      - '**.xsd'
      - '**.xml'
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  source-lint:
    strategy:
      fail-fast: true
    runs-on: ubuntu-24.04
    permissions:
      contents: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Check out the branch that triggered the workflow.
          ref: ${{ github.ref }}
          # Full history: the lint script diffs against the merge base
          # (git triple-dot), which a shallow (depth-1) clone cannot resolve.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Run progressive lint script
        # The script requires a base ref as its first argument (it exits
        # with status 1 otherwise) and modifies files in place.
        run: python .github/scripts/progressive_lint.py origin/${{ github.event.repository.default_branch }}

      - name: Commit and push changes
        uses: EndBug/add-and-commit@v9
        with:
          # EndBug automatically checks if files changed. If no files changed, it does nothing.
          message: 'chore: apply progressive linting fixes'
          committer_name: GitHub Actions
          committer_email: actions@github.com
          # By default, this action pushes back to the branch checked out by actions/checkout

  gsoap-wsdl-check:
    needs: source-lint
    strategy:
      fail-fast: true
    runs-on: ubuntu-24.04
    steps:
      - name: Check that remote files are reachable
        id: vars
        run: |
          export MISSING_FILES=0
          curl --head http://docs.oasis-open.org/wsn/b-2.xsd || export MISSING_FILES=$((MISSING_FILES+1))
          echo "MISSING_FILES=$MISSING_FILES" >> $GITHUB_OUTPUT
          echo "MISSING_FILES=$MISSING_FILES"
      - name: Install missing software
        if: ${{ steps.vars.outputs.MISSING_FILES == 0 }}
        run: |
          sudo apt update && sudo apt install gcc g++ curl autoconf automake cmake bison flex libssl-dev zlib1g-dev make
      - name: Checkout tools repo
        uses: actions/checkout@v6
        if: ${{ steps.vars.outputs.MISSING_FILES == 0 }}
        with:
          repository: onvif/gsoap-wsdl-checker
          path: gsoap-wsdl-checker
          ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
      - name: Get branch names.
        if: ${{ steps.vars.outputs.MISSING_FILES == 0 }}
        id: branch-names
        uses: tj-actions/branch-names@v9.0.2
      - name: Install gsoap
        if: ${{ steps.vars.outputs.MISSING_FILES == 0 }}
        run: |
          cd gsoap-wsdl-checker
          cmake CMakeLists.txt -DBRANCH=${{ steps.branch-names.outputs.current_branch }}
          make gsoap
      - name: Compile the WSDL files
        if: ${{ steps.vars.outputs.MISSING_FILES == 0 }}
        run: |
          cd gsoap-wsdl-checker
          make

  dotnet-wsdl-check:
    needs: source-lint
    strategy:
      fail-fast: true
    runs-on: ubuntu-24.04
    # NOTE(review): the steps of this job are outside the visible patch
    # hunks (the rename diff shows only its header) -- restore them from
    # the original dotnet-wsdl-check.yaml; verify against the repository.
- """ clean_content = ' '.join(content.split()) full_line = indent_str + clean_content @@ -112,20 +109,17 @@ def smart_wrap(content, indent_str, max_len): return "\n".join(wrapped_lines) + "\n" def process_file(file_path, base_ref): - # 1. Get changed lines (from the dirty file on disk) + # 1. Get changed lines (from the dirty file on disk, relative to merge base) changed_lines = get_changed_lines(file_path, base_ref) if not changed_lines: return # 2. Read file as BINARY - # We must operate in binary to handle the specific byte replacements safely. with open(file_path, "rb") as f: original_lines_bytes = f.readlines() - # 3. Create a 'Clean-in-Memory' version - # We apply the byte replacements ONLY to the changed lines. - # This buffer will be used for parsing (to avoid crashes) and formatting. - cleaned_lines_bytes = list(original_lines_bytes) # Make a copy + # 3. Create a 'Clean-in-Memory' version for parser + cleaned_lines_bytes = list(original_lines_bytes) for i in range(len(cleaned_lines_bytes)): line_num = i + 1 @@ -135,16 +129,13 @@ def process_file(file_path, base_ref): current_line = current_line.replace(bad_byte, replacement) cleaned_lines_bytes[i] = current_line - # 4. Parse the Cleaned Content for Depth - # We join the cleaned lines into a single byte stream for the parser. - # This prevents the parser from crashing on the user's bad bytes. + # 4. Parse the Cleaned Content full_clean_content = b"".join(cleaned_lines_bytes) handler = DepthMapHandler() parser = xml.sax.make_parser() parser.setContentHandler(handler) try: - # xml.sax can parse bytes directly! 
xml.sax.parseString(full_clean_content, handler) except xml.sax.SAXException: print(f"Skipping {file_path}: Malformed XML.") @@ -156,31 +147,27 @@ def process_file(file_path, base_ref): for i, line_bytes in enumerate(cleaned_lines_bytes): line_num = i + 1 - # IF Protected (programlisting): Keep the bytes exactly as they are + # Protected Block: Keep original bytes exactly if line_num in handler.protected_lines: - final_output_lines.append(line_bytes) + final_output_lines.append(original_lines_bytes[i]) continue - # IF Changed: Decode -> Format -> Encode + # Changed Block: Format if line_num in changed_lines and line_num in handler.line_depths: try: - # Decode to string for text wrapping (should be safe now after cleaning) line_text = line_bytes.decode('utf-8') except UnicodeDecodeError: - # Fallback: If it still fails, keep original bytes - final_output_lines.append(line_bytes) + final_output_lines.append(original_lines_bytes[i]) continue depth = handler.line_depths[line_num] correct_indent = CONFIG["indent_char"] * (CONFIG["indent_size"] * depth) - # Smart wrap returns a string, we must encode back to bytes formatted_text = smart_wrap(line_text, correct_indent, CONFIG["max_line_length"]) final_output_lines.append(formatted_text.encode('utf-8')) else: - # IF Unchanged: Use the ORIGINAL bytes from disk (Step 2) - # This ensures we don't accidentally touch bytes in the rest of the file. + # Unchanged Block: Keep original bytes exactly final_output_lines.append(original_lines_bytes[i]) # 6. Write back as BINARY @@ -195,7 +182,8 @@ def process_file(file_path, base_ref): base_ref = sys.argv[1] try: - diff_cmd = ["git", "diff", "--name-only", base_ref] + # Use triple dots to compare against the merge base + diff_cmd = ["git", "diff", "--name-only", f"{base_ref}..."] files = subprocess.check_output(diff_cmd).decode("utf-8").splitlines() target_files = [