diff --git a/.github/workflows/staging-check.yml b/.github/workflows/staging-check.yml new file mode 100644 index 0000000..15b7049 --- /dev/null +++ b/.github/workflows/staging-check.yml @@ -0,0 +1,34 @@ +name: Validate output against staging + +on: + pull_request: + branches: + - prod + +jobs: + validate_against_staging: + permissions: + pull-requests: write + + runs-on: ubuntu-latest + + env: + PYTHONDEVMODE: 1 + + steps: + - uses: actions/checkout@v6 + + - name: Run deployment script + run: | + pip install lxml + mkdir tmp/ + python tools/convert.py -a -d tmp ispdb/* + + - name: Compare output against staging + run: | + pip install requests + python tools/compare_out_files.py -b https://autoconfig-stage.thunderbird.net/v1.1/ tmp/ + + - name: Calculate generated_files.json diff with prod + run: | + python tools/calculate_generated_files_diff.py -b https://autoconfig.thunderbird.net/v1.1 -t ${{ secrets.GITHUB_TOKEN }} -r ${{ github.repository }} -n ${{ github.event.pull_request.number }} tmp/ diff --git a/.github/workflows/validate-config.yml b/.github/workflows/validate-config.yml index 8ebb138..d41af32 100644 --- a/.github/workflows/validate-config.yml +++ b/.github/workflows/validate-config.yml @@ -6,15 +6,29 @@ jobs: validate_config: runs-on: ubuntu-latest + env: + PYTHONDEVMODE: 1 + steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v6 + + - name: BOM check + run: | + ! 
grep -rlI $'\xEF\xBB\xBF' ispdb - - name: BOM check - uses: arma-actions/bom-check@v1.1 - with: - path: ispdb + - name: Validate file extensions + run: | + set -o pipefail + shopt -s extglob nullglob + files=( ispdb/!(*.xml) ) + if (( ${#files[*]} )); then + for file in "${files[@]}"; do + printf '::error file=%s::File name "%s" does not end in .xml – Please rename!\n' "$file" "$file" + done + exit 1 + fi - - name: Validate file extensions - run: | - set -o pipefail - ls -1 ispdb/ | grep -v '\.xml$' | awk '{print "::error file=ispdb/"$0"::File name \"ispdb/"$0"\" does not end in .xml – Please rename!"}' && exit 1 || true + - name: Validate XML content + run: | + pip install lxml + python tools/validate.py ispdb/* diff --git a/README.md b/README.md index 77cf705..707883c 100644 --- a/README.md +++ b/README.md @@ -6,3 +6,5 @@ source files for the Thunderbird ISP database. For documentation regarding either topic, please refer to [this repository's wiki](https://github.com/thunderbird/autoconfig/wiki). + +This ISPDB was created by Ben Bucksch. diff --git a/ispdb/naver.com.xml b/ispdb/naver.com.xml index bd11fcf..78a4fa9 100644 --- a/ispdb/naver.com.xml +++ b/ispdb/naver.com.xml @@ -25,6 +25,6 @@ %EMAILADDRESS% password-encrypted - + - + \ No newline at end of file diff --git a/tools/calculate_generated_files_diff.py b/tools/calculate_generated_files_diff.py new file mode 100644 index 0000000..204730f --- /dev/null +++ b/tools/calculate_generated_files_diff.py @@ -0,0 +1,84 @@ +import argparse +import difflib +import os.path +import requests + +GENERATED_FILES_NAME = "generated_files.json" + +GITHUB_COMMENT_TEMPLATE_WITH_DIFF = """This PR will cause the following changes to the production `generated_files.json` file: + +
+ +Expand to view diff + + +```diff +{deltas} +``` + +
+""" + +GITHUB_COMMENT_TEMPLATE_NO_DIFF = ( + "This PR will not cause any change to the production `generated_files.json` file." +) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-b", metavar="base_url", help="base URL serving ISPDB files") + parser.add_argument("-t", metavar="api_token", help="Github API token") + parser.add_argument("-r", metavar="repo", help="Github repository") + parser.add_argument("-n", metavar="number", help="The Github PR or issue number") + parser.add_argument( + "folder", help="the folder containing the local ISPDB files to compare" + ) + + args = parser.parse_args() + + # Strip out any trailing slash in the base URL so we don't accidentally end + # up doubling it. + base_url: str = args.b.strip("/") + + resp = requests.get(f"{base_url}/{GENERATED_FILES_NAME}") + + # At the time of writing, all of the domains in ISPDB are made up of ASCII + # characters, but that might not stay true forever. + resp.encoding = "utf-8" + + with open(os.path.join(args.folder, GENERATED_FILES_NAME), "r") as fp: + local_list = fp.readlines() + + # We call the local version "staging" as a shortcut, because by this + # time we expect to have already validated that the local files match staging. + deltas = list( + difflib.unified_diff( + resp.text.splitlines(keepends=True), + local_list, + fromfile="current", + tofile="new", + ) + ) + + comment = ( + GITHUB_COMMENT_TEMPLATE_WITH_DIFF.format(deltas="".join(deltas)) + if len(deltas) > 0 + else GITHUB_COMMENT_TEMPLATE_NO_DIFF + ) + + # Create the comment via the Github API. 
+ # See + resp = requests.post( + f"https://api.github.com/repos/{args.r}/issues/{args.n}/comments", + headers={ + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {args.t}", + }, + json={"body": comment}, + ) + + print(f"Posted comment {resp.json()["html_url"]}") + + +if __name__ == "__main__": + main() diff --git a/tools/compare_out_files.py b/tools/compare_out_files.py new file mode 100644 index 0000000..4118c5e --- /dev/null +++ b/tools/compare_out_files.py @@ -0,0 +1,134 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import argparse +import difflib +import json +import os.path +import sys +import requests +from typing import Dict, List + +GENERATED_FILES_NAME = "generated_files.json" + + +def get_and_compare(file_name: str, base_url: str, local_folder: str) -> str: + """Reads a local file and compare it with its remote copy before returning + its content. + + Returns: + The file's content as served by the remote server, decoded as UTF-8 + text. + + Raises: + RuntimeError if the local file's content doesn't match the remote copy. + """ + resp = requests.get(f"{base_url}/{file_name}") + + # The response might not include an content-type header, and there are some + # non-ASCII characters in our XML files (e.g. in display names), so we need + # to explicitly tell `resp` what its encoding is. 
+ resp.encoding = "utf-8" + + with open(os.path.join(local_folder, file_name), "r") as fp: + local_list = fp.readlines() + + deltas = list( + difflib.unified_diff( + local_list, + resp.text.splitlines(keepends=True), + fromfile="local", + tofile="remote", + ) + ) + + if len(deltas) > 0: + print(f"Diff deltas:\n\n{"".join(deltas)}", file=sys.stderr) + raise RuntimeError("local file list does not match staging copy") + + return resp.text + + +def get_file_list(base_url: str, local_folder: str) -> List[str]: + """Gets the list of files to compare. + + Returns: + The list of file names as per the `generated_files.json` file. + + Raises: + RuntimeError if the local `generated_files.json` file does not match the + remote copy. + """ + file_list = get_and_compare(GENERATED_FILES_NAME, base_url, local_folder) + return json.loads(file_list) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-b", metavar="base_url", help="base URL serving ISPDB files") + parser.add_argument( + "folder", help="the folder containing the local ISPDB files to compare" + ) + + args = parser.parse_args() + + # Strip out any trailing slash in the base URL so we don't accidentally end + # up doubling it. + base_url: str = args.b.strip("/") + + print("Fetching and comparing file list") + + listed_files = get_file_list(base_url, args.folder) + + failed_files: Dict[str, Exception] = {} + for file in listed_files: + print(f"Fetching and comparing {file}") + + try: + get_and_compare(file, base_url, args.folder) + except Exception as e: + print(f"Comparison failed for file {file}: {e}", file=sys.stderr) + failed_files[file] = e + + if len(failed_files) > 0: + # Print the failed files, preceded by an empty line to separate them + # from the previous logs. 
print("\nComparing the following file(s) has failed:", file=sys.stderr) + + for file, exc in failed_files.items(): + print(f"{file}: {exc}", file=sys.stderr) + + # Check if we can find files that exist in the local directory but aren't + # listed in `generated_files.json`. We could also do this check in the other + # direction (i.e. check if a file in `generated_files.json` is missing from + # the local directory), but if a file from the list is missing then trying + # to open it earlier will have raised an exception and will already cause + # the script to fail. + local_files = os.listdir(args.folder) + + # Make sure we don't try to find the JSON list file in itself. + local_files.remove(GENERATED_FILES_NAME) + + unknown_files = [] + for local_file in local_files: + if local_file not in listed_files: + unknown_files.append(local_file) + + if len(unknown_files) > 0: + print("\nUnknown file(s) in local directory:", file=sys.stderr) + + for file in unknown_files: + print(file, file=sys.stderr) + else: + print("No unknown files found") + + # Fail the script if either a comparison has failed or we found an unknown + # file. We could fail earlier, but it's more helpful for troubleshooting if + # we have the script point out as many issues in one run as possible. 
if len(failed_files) > 0 or len(unknown_files) > 0: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tools/convert.py b/tools/convert.py index 1f3b5f9..1491608 100644 --- a/tools/convert.py +++ b/tools/convert.py @@ -27,15 +27,16 @@ # ***** END LICENSE BLOCK ***** import argparse -import codecs import os.path -import stat import sys +from typing import Dict, List +import json + import lxml.etree as ET def read_config(file, convertTime): - return (ET.parse(file), max(os.stat(file).st_mtime, convertTime)) + return (ET.parse(file), max(os.path.getmtime(file), convertTime)) def doc_to_bytestring(doc): @@ -47,59 +48,102 @@ def print_config(doc): def write_config(outData, time, filename=None): - if os.path.exists(filename) and os.stat(filename).st_mtime >= time: + if os.path.exists(filename) and os.path.getmtime(filename) >= time: return - print("Writing %s" % filename) - file = codecs.open(filename, "wb") - file.write(outData) - file.write(b'\n') - file.close() + print(f"Writing {filename}") + with open(filename, "wb") as file: + file.write(outData) + file.write(b"\n") -def write_domains(doc, time, output_dir="."): +def write_domains(doc, time, output_dir=".") -> List[str]: outData = doc_to_bytestring(doc) - for d in doc.findall("//domain"): - write_config(outData, time, output_dir + "/" + d.text) + + domains: List[str] = [] + + for d in doc.getroot().findall(".//domain"): + write_config(outData, time, os.path.join(output_dir, d.text)) + domains.append(d.text) + + return domains def main(): # parse command line options parser = argparse.ArgumentParser() - parser.add_argument("-d", metavar="dir", - help="output directory") - parser.add_argument("-a", action="store_true", - help="write configuration files for all domains") - parser.add_argument("file", nargs="*", - help="input file(s) to process, wildcards allowed") - args = parser.parse_args(sys.argv[1:]) + parser.add_argument("-d", metavar="dir", help="output directory") + parser.add_argument( + 
"-a", action="store_true", help="write configuration files for all domains" + ) + parser.add_argument( + "file", nargs="*", help="input file(s) to process, wildcards allowed" + ) + args = parser.parse_args() # process arguments - convertTime = os.stat(sys.argv[0]).st_mtime - is_dir = stat.S_ISDIR + convertTime = os.path.getmtime(sys.argv[0]) - for f in args.file: - if is_dir(os.stat(f).st_mode): - continue - - if f == "README": - continue + # Record the files that failed to be processed and the errors related to + # them. + failed_files: Dict[str, Exception] = {} - doc, time = read_config(f, convertTime) + # Record the names of the files written so we can list them in a file later. + out_file_names: List[str] = [] - if args.a: - if args.d: - write_domains(doc, time, args.d) + for f in args.file: + try: + if os.path.isdir(f): + continue + + if f == "README": + continue + + print(f"Processing {f}") + + doc, time = read_config(f, convertTime) + + if args.a: + if args.d: + domains = write_domains(doc, time, args.d) + out_file_names.extend(domains) + else: + print("When you want to write domain files you") + print("should also specify an output directory") + print("using -d dir") + parser.print_usage() + sys.exit(2) + elif args.d: + outData = doc_to_bytestring(doc) + filename = os.path.basename(f) + write_config(outData, time, os.path.join(args.d, filename)) + out_file_names.append(filename) else: - print("When you want to write domain files you") - print("should also specify an output directory") - print("using -d dir") - parser.print_usage() - exit(2) - elif args.d: - outData = doc_to_bytestring(doc) - write_config(outData, time, args.d + "/" + os.path.basename(f)) - else: - print_config(doc) + print_config(doc) + except Exception as e: + print(f"File {f} could not be processed: {e}") + failed_files[f] = e + + if len(failed_files) > 0: + # Print the failed files, preceded by an empty line to separate them + # from the previous logs. 
print() + print("Processing the following file(s) has failed:") + + for file, exc in failed_files.items(): + print(f"{file}: {exc}") + + sys.exit(1) + + # Sort the list for idempotency. + out_file_names.sort() + + # Write the list of output files in a JSON file. This file will mostly be + # used in CI processes. We generate it with indentation so that other CI + # scripts can produce a more legible output. + out_file_list_path = os.path.join(args.d, "generated_files.json") + with open(out_file_list_path, "w") as fp: + print(f"Writing list of {len(out_file_names)} files") + fp.write(json.dumps(out_file_names, indent=2)) if __name__ == "__main__": diff --git a/tools/validate.py b/tools/validate.py new file mode 100644 index 0000000..5a04456 --- /dev/null +++ b/tools/validate.py @@ -0,0 +1,52 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# This script checks all input files to validate that their content is valid +# XML. It is meant to run in the CI for PRs against the autoconfig repository. + +import argparse +import os +import sys +from typing import List + +from lxml import etree + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "file", nargs="*", help="input file(s) to process, wildcards allowed" + ) + args = parser.parse_args() + + # Defining `files` here isn't strictly necessary, but the extra typing + # (which we can't really get otherwise) helps with maintenance. + files: List[str] = args.file + + # The exit code. Stays 0 unless we encounter a file that doesn't parse. + ret = 0 + + for f in files: + # Filter out directories and non-XML files + if os.path.isdir(f): + print(f"Ignoring directory {f}") + continue + + if not f.endswith(".xml"): + print(f"Ignoring non-XML file {f}") + continue + + # Try parsing the file. 
If this did not work, print the error and set + # the exit code to 1. + try: + etree.parse(f) + except Exception as e: + print(f"File {f} did not parse: {e}") + ret = 1 + + sys.exit(ret) + + +if __name__ == "__main__": + main()