From 4c5459c8edceed4497029b19b579b36a57025140 Mon Sep 17 00:00:00 2001
From: Winter Deng
Date: Thu, 9 Oct 2025 15:55:34 +0800
Subject: [PATCH 1/6] classify flags into 'general', 'linear', and 'nn'
 categories, and reorder some flags in main.py

---
 docs/cli/classifier.py | 137 +++++++++++++++++++++++++++++++++++++++++
 docs/cli/genflags.py   |  50 +++++++++++----
 main.py                | 123 +++++++++++++++++++-----------------
 3 files changed, 242 insertions(+), 68 deletions(-)
 create mode 100644 docs/cli/classifier.py

diff --git a/docs/cli/classifier.py b/docs/cli/classifier.py
new file mode 100644
index 00000000..31f2e8fe
--- /dev/null
+++ b/docs/cli/classifier.py
@@ -0,0 +1,137 @@
+import os
+import sys
+import glob
+import re
+from pathlib import Path
+from collections import defaultdict
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+lib_path = os.path.abspath(os.path.join(current_dir, "..", ".."))
+sys.path.insert(0, lib_path)
+
+def classify_file_category(path):
+
+    relative_path = Path(path).relative_to(lib_path)
+    return_path = relative_path.as_posix()
+    filename = Path(*relative_path.parts[1:]).as_posix() if len(relative_path.parts) > 1 else return_path
+
+    if filename.startswith("linear"):
+        category = "linear"
+    elif filename.startswith("torch") or filename.startswith("nn"):
+        category = "nn"
+    else:
+        category = "general"
+    return category, return_path
+
+
+def fetch_option_flags(flags):
+    # flags = genflags.parser.flags
+    flag_list = []
+
+    for flag in flags:
+        flag_list.append(
+            {
+                "name": flag["name"].replace("\\", ""),
+                "instruction": flag["name"].split("-")[-1],
+                "description": flag["description"]
+            }
+        )
+
+    return flag_list
+
+
+def fetch_all_files():
+    main_files = [
+        os.path.join(lib_path, "linear_trainer.py"),
+        os.path.join(lib_path, "torch_trainer.py")
+    ]
+    lib_files = glob.glob(os.path.join(lib_path, "libmultilabel/**/*.py"), recursive=True)
+    file_set = set(map(os.path.abspath, main_files + lib_files))
+    return file_set
+
+
+def find_config_usages_in_file(file_path, allowed_keys):
+    pattern = re.compile(r'\bconfig\.([a-zA-Z_][a-zA-Z0-9_]*)')
+    detailed_results = {}
+    try:
+        with open(file_path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+    except (IOError, UnicodeDecodeError):
+        return []
+
+    category, path = classify_file_category(file_path)
+
+    for i, line in enumerate(lines, start=1):
+        matches = pattern.findall(line)
+        for key in matches:
+            if key in allowed_keys:
+                if key not in detailed_results:
+                    detailed_results[key] = {"file": path, "lines": []}
+                detailed_results[key]["lines"].append(str(i))
+
+    return detailed_results
+
+
+def move_duplicates_together(data, keep):
+    all_keys = list(data.keys())
+    duplicates = set()
+
+    for i, key1 in enumerate(all_keys):
+        for key2 in all_keys[i+1:]:
+            duplicates |= data[key1] & data[key2]
+
+    data[keep] |= duplicates
+
+    for key in all_keys:
+        if key != keep:
+            data[key] -= duplicates
+
+    return data
+
+
+def classify(raw_flags):
+
+    category_set = {"general": set(), "linear": set(), "nn": set()}
+    flags = fetch_option_flags(raw_flags)
+    allowed_keys = set(flag["instruction"] for flag in flags)
+    file_set = fetch_all_files()
+    usage_map = defaultdict(list)
+    collected = {}
+
+    for file_path in file_set:
+        detailed_results = find_config_usages_in_file(file_path, allowed_keys)
+        if detailed_results:
+            usage_map[file_path] = set(detailed_results.keys())
+            for k, v in detailed_results.items():
+                if k not in collected:
+                    collected[k] = []
+                collected[k].append(v)
+
+    for path, keys in usage_map.items():
+        category, path = classify_file_category(path)
+        category_set[category] = category_set[category].union(keys)
+
+    category_set = move_duplicates_together(category_set, "general")
+
+    for flag in flags:
+        for k, v in category_set.items():
+            for i in v:
+                if flag["instruction"] == i:
+                    flag["category"] = k
+        if "category" not in flag:
+            flag["category"] = "general"
+
+    result = {}
+    for flag in flags:
+        if flag["category"] not in result:
+            result[flag["category"]] = []
+        result[flag["category"]].append({"name": flag["name"].replace("--", r"\-\-"), "description": flag["description"]})
+
+    result["details"] = []
+    for k, v in collected.items():
+        result["details"].append({"name": k, "file": v[0]["file"], "location": ", ".join(v[0]["lines"])})
+        if len(v) > 1:
+            for i in v[1:]:
+                result["details"].append({"name": "", "file": i["file"], "location": ", ".join(i["lines"])})
+
+    return result
diff --git a/docs/cli/genflags.py b/docs/cli/genflags.py
index 006991e8..e6f409c2 100644
--- a/docs/cli/genflags.py
+++ b/docs/cli/genflags.py
@@ -2,8 +2,11 @@ import os
 
 sys.path.insert(1, os.path.join(sys.path[0], "..", ".."))
 
+
 import main
 
+from classifier import classify
+
 
 class FakeParser(dict):
     def __init__(self):
@@ -29,21 +32,42 @@ def add_argument(
 parser.add_argument("-c", "--config", help="Path to configuration file")
 main.add_all_arguments(parser)
 
+classified = classify(parser.flags)
 
-def width(key):
-    return max(map(lambda f: len(f[key]), parser.flags))
+def width_title(key, title):
+    return max(map(lambda f: len(f[key]), classified[title]))
 
+def print_table(title, flags, intro):
+    print()
+    print(intro)
+    print()
 
-wn = width("name")
-wd = width("description")
+    wn = width_title("name", title)
+    wd = width_title("description", title)
 
-print(
-    """..
-    Do not modify this file. This file is generated by genflags.py.\n"""
+    print("=" * wn, "=" * wd)
+    print("Name".ljust(wn), "Description".ljust(wd))
+    print("=" * wn, "=" * wd)
+    for flag in flags:
+        print(flag["name"].ljust(wn), flag["description"].ljust(wd))
+    print("=" * wn, "=" * wd)
+    print()
+
+print_table(
+    "general",
+    classified["general"],
+    intro="**General options**:\n\
+Common configurations shared across both linear and neural network trainers."
+)
+print_table(
+    "linear",
+    classified["linear"],
+    intro="**Linear options**:\n\
+Configurations specific to linear trainer."
 )
-print("=" * wn, "=" * wd)
-print("Name".ljust(wn), "Description".ljust(wd))
-print("=" * wn, "=" * wd)
-for flag in parser.flags:
-    print(flag["name"].ljust(wn), flag["description"].ljust(wd))
-print("=" * wn, "=" * wd)
+print_table(
+    "nn",
+    classified["nn"],
+    intro="**Neural network options**:\n\
+Configurations specific to torch (neural networks) trainer."
+)
\ No newline at end of file
diff --git a/main.py b/main.py
index 70907edf..7a523f1f 100644
--- a/main.py
+++ b/main.py
@@ -11,21 +11,50 @@ def add_all_arguments(parser):
-    # path / directory
+
     parser.add_argument(
-        "--result_dir", default="./runs", help="The directory to save checkpoints and logs (default: %(default)s)"
+        "-h",
+        "--help",
+        action="help",
+        help="Quickstart: https://www.csie.ntu.edu.tw/~cjlin/libmultilabel/cli/quickstart.html",
     )
+    parser.add_argument("--seed", type=int, help="Random seed (default: %(default)s)")
+
+    # choose model (linear / nn)
+    parser.add_argument("--linear", action="store_true", help="Train linear model")
+
+    # others
+    parser.add_argument("--cpu", action="store_true", help="Disable CUDA")
+    parser.add_argument("--silent", action="store_true", help="Enable silent mode")
+    parser.add_argument(
+        "--data_workers", type=int, default=4, help="Use multi-cpu core for data pre-processing (default: %(default)s)"
+    )
+    parser.add_argument(
+        "--embed_cache_dir",
+        type=str,
+        help="For parameter search only: path to a directory for storing embeddings for multiple runs. (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--eval", action="store_true", help="Only run evaluation on the test set (default: %(default)s)"
+    )
+    parser.add_argument("--checkpoint_path", help="The checkpoint to warm-up with (default: %(default)s)")
+
     # data
-    parser.add_argument("--data_name", default="unnamed_data", help="Dataset name (default: %(default)s)")
+    parser.add_argument(
+        "--data_name",
+        default="unnamed_data",
+        help="Dataset name for generating the output directory (default: %(default)s)",
+    )
     parser.add_argument("--training_file", help="Path to training data (default: %(default)s)")
     parser.add_argument("--val_file", help="Path to validation data (default: %(default)s)")
-    parser.add_argument("--test_file", help="Path to test data (default: %(default)s")
+    parser.add_argument("--test_file", help="Path to test data (default: %(default)s)")
+    parser.add_argument("--label_file", type=str, help="Path to a file holding all labels (default: %(default)s)")
     parser.add_argument(
         "--val_size",
         type=float,
         default=0.2,
-        help="Training-validation split: a ratio in [0, 1] or an integer for the size of the validation set (default: %(default)s).",
+        help="Training-validation split: a ratio in [0, 1] or an integer for the size of the validation set (default: %(default)s)",
     )
     parser.add_argument(
         "--min_vocab_freq",
@@ -67,8 +96,24 @@ def add_all_arguments(parser):
         help="Whether to add the special tokens for inputs of the transformer-based language model. (default: %(default)s)",
     )
 
+    # model
+    parser.add_argument("--model_name", default="unnamed_model", help="Model to be used (default: %(default)s)")
+    parser.add_argument(
+        "--init_weight", default="kaiming_uniform", help="Weight initialization to be used (default: %(default)s)"
+    )
+    parser.add_argument(
+        "--loss_function", default="binary_cross_entropy_with_logits", help="Loss function (default: %(default)s)"
+    )
+
+    # pretrained vocab / embeddings
+    parser.add_argument("--vocab_file", type=str, help="Path to a file holding vocabularies (default: %(default)s)")
+    parser.add_argument(
+        "--embed_file",
+        type=str,
+        help="Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding (default: %(default)s)",
+    )
+
     # train
-    parser.add_argument("--seed", type=int, help="Random seed (default: %(default)s)")
     parser.add_argument(
         "--epochs", type=int, default=10000, help="The number of epochs to train (default: %(default)s)"
     )
@@ -109,15 +154,6 @@ def add_all_arguments(parser):
         help="Whether the embeddings of each word is normalized to a unit vector (default: %(default)s)",
     )
 
-    # model
-    parser.add_argument("--model_name", default="unnamed_model", help="Model to be used (default: %(default)s)")
-    parser.add_argument(
-        "--init_weight", default="kaiming_uniform", help="Weight initialization to be used (default: %(default)s)"
-    )
-    parser.add_argument(
-        "--loss_function", default="binary_cross_entropy_with_logits", help="Loss function (default: %(default)s)"
-    )
-
     # eval
     parser.add_argument(
         "--eval_batch_size", type=int, default=256, help="Size of evaluating batches (default: %(default)s)"
     )
@@ -138,28 +174,6 @@ def add_all_arguments(parser):
         "--val_metric", default="P@1", help="The metric to select the best model for testing (default: %(default)s)"
     )
 
-    # pretrained vocab / embeddings
-    parser.add_argument("--vocab_file", type=str, help="Path to a file holding vocabuaries (default: %(default)s)")
-    parser.add_argument(
-        "--embed_file", type=str, help="Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding (default: %(default)s)"
-    )
-    parser.add_argument("--label_file", type=str, help="Path to a file holding all labels (default: %(default)s)")
-
-    # log
-    parser.add_argument(
-        "--save_k_predictions",
-        type=int,
-        nargs="?",
-        const=100,
-        default=0,
-        help="Save top k predictions on test set. k=%(const)s if not specified. (default: %(default)s)",
-    )
-    parser.add_argument(
-        "--predict_out_path",
-        default="./predictions.txt",
-        help="Path to the output file holding label results (default: %(default)s)",
-    )
-
     # auto-test
     parser.add_argument(
         "--limit_train_batches",
@@ -180,24 +194,27 @@ def add_all_arguments(parser):
         help="Percentage of test dataset to use for auto-testing (default: %(default)s)",
     )
 
-    # others
-    parser.add_argument("--cpu", action="store_true", help="Disable CUDA")
-    parser.add_argument("--silent", action="store_true", help="Enable silent mode")
+    # log
     parser.add_argument(
-        "--data_workers", type=int, default=4, help="Use multi-cpu core for data pre-processing (default: %(default)s)"
+        "--save_k_predictions",
+        type=int,
+        nargs="?",
+        const=100,
+        default=0,
+        help="Save top k predictions on test set. k=%(const)s if not specified. (default: %(default)s)",
    )
     parser.add_argument(
-        "--embed_cache_dir",
-        type=str,
-        help="For parameter search only: path to a directory for storing embeddings for multiple runs. (default: %(default)s)",
+        "--predict_out_path",
+        default="./predictions.txt",
+        help="Path to the output file holding label results (default: %(default)s)",
     )
+
+    # path / directory
     parser.add_argument(
-        "--eval", action="store_true", help="Only run evaluation on the test set (default: %(default)s)"
+        "--result_dir", default="./runs", help="The directory to save checkpoints and logs (default: %(default)s)"
     )
-    parser.add_argument("--checkpoint_path", help="The checkpoint to warm-up with (default: %(default)s)")
 
     # linear options
-    parser.add_argument("--linear", action="store_true", help="Train linear model")
     parser.add_argument(
         "--data_format",
         type=str,
@@ -224,7 +241,10 @@ def add_all_arguments(parser):
         "--tree_max_depth", type=int, default=10, help="Maximum depth of the tree (default: %(default)s)"
     )
     parser.add_argument(
-        "--tree_ensemble_models", type=int, default=1, help="Number of models in the tree ensemble (default: %(default)s)"
+        "--tree_ensemble_models",
+        type=int,
+        default=1,
+        help="Number of models in the tree ensemble (default: %(default)s)",
     )
     parser.add_argument(
         "--beam_width",
@@ -239,13 +259,6 @@ def add_all_arguments(parser):
         default=8,
         help="the maximal number of labels inside a cluster (default: %(default)s)",
     )
-    parser.add_argument(
-        "-h",
-        "--help",
-        action="help",
-        help="If you are trying to specify network config such as dropout or activation or config of the learning rate scheduler, use a yaml file instead. "
-        "See example configs in example_config",
-    )
 
 
 def get_config():
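For readers following along: each print_table call added above emits one reStructuredText "simple table". The following standalone sketch reproduces that table logic on two made-up flag records (the names and descriptions are illustrative, not the real classified output):

    # Sketch of the print_table layout with hypothetical flag records.
    flags = [
        {"name": r"\-\-linear", "description": "Train linear model"},
        {"name": r"\-\-seed", "description": "Random seed (default: %(default)s)"},
    ]
    wn = max(len(f["name"]) for f in flags)         # width of the Name column
    wd = max(len(f["description"]) for f in flags)  # width of the Description column

    print("=" * wn, "=" * wd)
    print("Name".ljust(wn), "Description".ljust(wd))
    print("=" * wn, "=" * wd)
    for flag in flags:
        print(flag["name"].ljust(wn), flag["description"].ljust(wd))
    print("=" * wn, "=" * wd)

which prints roughly:

    ========== ==================================
    Name       Description
    ========== ==================================
    \-\-linear Train linear model
    \-\-seed   Random seed (default: %(default)s)
    ========== ==================================

The escaped \-\- keeps Sphinx's smart-quotes transform from collapsing the leading double dash into a single dash character.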
From 38fc479a9d641f41e6d8ceda42f3304f7cb247e5 Mon Sep 17 00:00:00 2001
From: Winter Deng
Date: Thu, 9 Oct 2025 15:57:12 +0800
Subject: [PATCH 2/6] reformat changed code

---
 docs/cli/classifier.py | 28 ++++++++++++++--------------
 docs/cli/genflags.py   | 11 +++++++----
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/docs/cli/classifier.py b/docs/cli/classifier.py
index 31f2e8fe..09d4dc81 100644
--- a/docs/cli/classifier.py
+++ b/docs/cli/classifier.py
@@ -9,6 +9,7 @@ lib_path = os.path.abspath(os.path.join(current_dir, "..", ".."))
 sys.path.insert(0, lib_path)
 
+
 def classify_file_category(path):
 
     relative_path = Path(path).relative_to(lib_path)
@@ -30,28 +31,25 @@ def fetch_option_flags(flags):
 
     for flag in flags:
         flag_list.append(
-            {
-                "name": flag["name"].replace("\\", ""),
-                "instruction": flag["name"].split("-")[-1],
-                "description": flag["description"]
-            }
-        )
+            {
+                "name": flag["name"].replace("\\", ""),
+                "instruction": flag["name"].split("-")[-1],
+                "description": flag["description"],
+            }
+        )
 
     return flag_list
 
 
 def fetch_all_files():
-    main_files = [
-        os.path.join(lib_path, "linear_trainer.py"),
-        os.path.join(lib_path, "torch_trainer.py")
-    ]
+    main_files = [os.path.join(lib_path, "linear_trainer.py"), os.path.join(lib_path, "torch_trainer.py")]
     lib_files = glob.glob(os.path.join(lib_path, "libmultilabel/**/*.py"), recursive=True)
     file_set = set(map(os.path.abspath, main_files + lib_files))
     return file_set
 
 
 def find_config_usages_in_file(file_path, allowed_keys):
-    pattern = re.compile(r'\bconfig\.([a-zA-Z_][a-zA-Z0-9_]*)')
+    pattern = re.compile(r"\bconfig\.([a-zA-Z_][a-zA-Z0-9_]*)")
     detailed_results = {}
     try:
         with open(file_path, "r", encoding="utf-8") as f:
@@ -77,7 +75,7 @@ def move_duplicates_together(data, keep):
     duplicates = set()
 
     for i, key1 in enumerate(all_keys):
-        for key2 in all_keys[i+1:]:
+        for key2 in all_keys[i + 1 :]:
             duplicates |= data[key1] & data[key2]
 
     data[keep] |= duplicates
@@ -99,7 +97,7 @@ def classify(raw_flags):
     collected = {}
 
     for file_path in file_set:
-        detailed_results = find_config_usages_in_file(file_path, allowed_keys)
+        detailed_results = find_config_usages_in_file(file_path, allowed_keys)
         if detailed_results:
             usage_map[file_path] = set(detailed_results.keys())
             for k, v in detailed_results.items():
@@ -125,7 +123,9 @@ def classify(raw_flags):
     for flag in flags:
         if flag["category"] not in result:
             result[flag["category"]] = []
-        result[flag["category"]].append({"name": flag["name"].replace("--", r"\-\-"), "description": flag["description"]})
+        result[flag["category"]].append(
+            {"name": flag["name"].replace("--", r"\-\-"), "description": flag["description"]}
+        )
 
     result["details"] = []
     for k, v in collected.items():
diff --git a/docs/cli/genflags.py b/docs/cli/genflags.py
index e6f409c2..c2036a75 100644
--- a/docs/cli/genflags.py
+++ b/docs/cli/genflags.py
@@ -34,9 +34,11 @@ def add_argument(
 
 classified = classify(parser.flags)
 
+
 def width_title(key, title):
     return max(map(lambda f: len(f[key]), classified[title]))
 
+
 def print_table(title, flags, intro):
     print()
     print(intro)
     print()
@@ -53,21 +55,22 @@ def print_table(title, flags, intro):
     print("=" * wn, "=" * wd)
     print()
 
+
 print_table(
     "general",
     classified["general"],
     intro="**General options**:\n\
-Common configurations shared across both linear and neural network trainers."
+Common configurations shared across both linear and neural network trainers.",
 )
 print_table(
     "linear",
     classified["linear"],
     intro="**Linear options**:\n\
-Configurations specific to linear trainer."
+Configurations specific to linear trainer.",
 )
 print_table(
     "nn",
     classified["nn"],
     intro="**Neural network options**:\n\
-Configurations specific to torch (neural networks) trainer."
-)
\ No newline at end of file
+Configurations specific to torch (neural networks) trainer.",
+)
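classify(parser.flags) depends on the FakeParser defined near the top of docs/cli/genflags.py, which these diffs only show as hunk context. As a rough guide, a stand-in consistent with how classifier.py consumes the records (dicts with an escaped "name" and a "description" taken from help=) might look like this; the real definition may differ:

    # Hypothetical stand-in for genflags.py's FakeParser: it records each
    # add_argument() call instead of building a real argparse parser.
    class FakeParser(dict):
        def __init__(self):
            self.flags = []

        def add_argument(self, *names, **kwargs):
            # Keep long options and escape the leading dashes for RST output,
            # matching the "\-\-name" records that fetch_option_flags() unescapes.
            long_names = [n.replace("--", r"\-\-") for n in names if n.startswith("--")]
            if long_names:
                self.flags.append({"name": ", ".join(long_names), "description": kwargs.get("help", "")})

    parser = FakeParser()
    parser.add_argument("--seed", type=int, help="Random seed (default: %(default)s)")
    print(parser.flags)  # [{'name': '\\-\\-seed', 'description': 'Random seed (default: %(default)s)'}]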
From cd7554dc37b7a0f20b69dbb94de26302521a71e7 Mon Sep 17 00:00:00 2001
From: Winter Deng
Date: Sat, 11 Oct 2025 00:18:15 +0800
Subject: [PATCH 3/6] - change settings in docs/conf.py to prevent writing
 sg_execution_times.rst in Sphinx 5
 - optimize code in docs/cli/classifier.py
 - reformat the above scripts

---
 docs/cli/classifier.py | 26 ++++++++++++++++++++++----
 docs/conf.py           |  1 +
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/docs/cli/classifier.py b/docs/cli/classifier.py
index 09d4dc81..5fd088f3 100644
--- a/docs/cli/classifier.py
+++ b/docs/cli/classifier.py
@@ -26,7 +26,6 @@ def classify_file_category(path):
 
 
 def fetch_option_flags(flags):
-    # flags = genflags.parser.flags
     flag_list = []
 
     for flag in flags:
@@ -42,7 +41,11 @@ def fetch_option_flags(flags):
 
 
 def fetch_all_files():
-    main_files = [os.path.join(lib_path, "linear_trainer.py"), os.path.join(lib_path, "torch_trainer.py")]
+    main_files = [
+        os.path.join(lib_path, "main.py"),
+        os.path.join(lib_path, "linear_trainer.py"),
+        os.path.join(lib_path, "torch_trainer.py"),
+    ]
     lib_files = glob.glob(os.path.join(lib_path, "libmultilabel/**/*.py"), recursive=True)
     file_set = set(map(os.path.abspath, main_files + lib_files))
     return file_set
@@ -57,7 +60,18 @@ def find_config_usages_in_file(file_path, allowed_keys):
     except (IOError, UnicodeDecodeError):
         return []
 
-    category, path = classify_file_category(file_path)
+    _, path = classify_file_category(file_path)
+
+    if file_path.endswith("main.py"):
+        for idx in range(len(lines)):
+            if lines[idx].startswith("def main("):
+                lines = lines[idx:]
+                main_start = idx
+                break
+        for i, line in enumerate(lines[1:], start=1):
+            if line and line[0] not in (" ", "\t") and line.strip() != "":
+                lines = lines[:i]
+                break
 
     for i, line in enumerate(lines, start=1):
         matches = pattern.findall(line)
@@ -65,7 +79,10 @@ def find_config_usages_in_file(file_path, allowed_keys):
             if key in allowed_keys:
                 if key not in detailed_results:
                     detailed_results[key] = {"file": path, "lines": []}
-                detailed_results[key]["lines"].append(str(i))
+                if file_path.endswith("main.py"):
+                    detailed_results[key]["lines"].append(str(i + main_start))
+                else:
+                    detailed_results[key]["lines"].append(str(i))
 
     return detailed_results
 
@@ -123,6 +140,7 @@ def classify(raw_flags):
     for flag in flags:
         if flag["category"] not in result:
             result[flag["category"]] = []
+
         result[flag["category"]].append(
             {"name": flag["name"].replace("--", r"\-\-"), "description": flag["description"]}
         )
diff --git a/docs/conf.py b/docs/conf.py
index 2d39be73..d07438e1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -49,6 +49,7 @@
     "examples_dirs": "./examples",  # path to your example scripts
     "gallery_dirs": "auto_examples",  # path to where to save gallery generated output
     "plot_gallery": False,
+    "write_computation_times": False,
 }
 
 # bibtex files
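The main.py special case added above narrows the scan to the body of main(), so config.<key> references inside add_all_arguments() are not counted as usages, while main_start keeps the reported line numbers file-absolute. A toy run of the same windowing on made-up file contents:

    # Toy demonstration of the "def main(" windowing from the patch above.
    lines = [
        "def add_all_arguments(parser):\n",     # fake file, line 1
        "    parser.add_argument('--seed')\n",  # line 2
        "def main(config):\n",                  # line 3
        "    print(config.seed)\n",             # line 4
        "def other():\n",                       # line 5: first top-level line after main()
    ]
    for idx in range(len(lines)):
        if lines[idx].startswith("def main("):
            lines = lines[idx:]  # drop everything before main()
            main_start = idx     # 0-based offset for restoring absolute numbers
            break
    for i, line in enumerate(lines[1:], start=1):
        if line and line[0] not in (" ", "\t") and line.strip() != "":
            lines = lines[:i]    # cut at the next top-level statement
            break
    print(lines)       # ['def main(config):\n', '    print(config.seed)\n']
    print(main_start)  # 2, so sliced line i maps back to file line i + main_start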
From b50a14d7be37a36bae8c2737383be67cf8e479cb Mon Sep 17 00:00:00 2001
From: Winter Deng
Date: Mon, 3 Nov 2025 11:08:33 +0800
Subject: [PATCH 4/6] - optimize code in docs/cli/classifier.py (functions:
 classify_file_category, find_config_usages_in_file, move_duplicates_together,
 classify)
 - reformat the above script

---
 docs/cli/classifier.py | 120 ++++++++++++-----------------------------
 1 file changed, 35 insertions(+), 85 deletions(-)

diff --git a/docs/cli/classifier.py b/docs/cli/classifier.py
index 5fd088f3..ad253cb4 100644
--- a/docs/cli/classifier.py
+++ b/docs/cli/classifier.py
@@ -5,24 +5,20 @@
 from pathlib import Path
 from collections import defaultdict
 
-current_dir = os.path.dirname(os.path.abspath(__file__))
-lib_path = os.path.abspath(os.path.join(current_dir, "..", ".."))
-sys.path.insert(0, lib_path)
+lib_path = Path.cwd().parent
+sys.path.insert(0, str(lib_path))
 
 
 def classify_file_category(path):
 
     relative_path = Path(path).relative_to(lib_path)
-    return_path = relative_path.as_posix()
-    filename = Path(*relative_path.parts[1:]).as_posix() if len(relative_path.parts) > 1 else return_path
+    filename = "/".join(relative_path.parts[1:]) or relative_path.as_posix()
 
     if filename.startswith("linear"):
-        category = "linear"
-    elif filename.startswith("torch") or filename.startswith("nn"):
-        category = "nn"
-    else:
-        category = "general"
-    return category, return_path
+        return "linear"
+    if filename.startswith(("torch", "nn")):
+        return "nn"
+    return "general"
 
 
 def fetch_option_flags(flags):
@@ -51,105 +47,59 @@ def fetch_all_files():
     return file_set
 
 
-def find_config_usages_in_file(file_path, allowed_keys):
+def find_config_usages_in_file(file_path, allowed_keys, category_set):
     pattern = re.compile(r"\bconfig\.([a-zA-Z_][a-zA-Z0-9_]*)")
-    detailed_results = {}
-    try:
-        with open(file_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-    except (IOError, UnicodeDecodeError):
-        return []
 
-    _, path = classify_file_category(file_path)
+    with open(file_path, "r", encoding="utf-8") as f:
+        lines = f.readlines()
 
     if file_path.endswith("main.py"):
         for idx in range(len(lines)):
             if lines[idx].startswith("def main("):
                 lines = lines[idx:]
-                main_start = idx
                 break
-        for i, line in enumerate(lines[1:], start=1):
-            if line and line[0] not in (" ", "\t") and line.strip() != "":
-                lines = lines[:i]
-                break
-
-    for i, line in enumerate(lines, start=1):
-        matches = pattern.findall(line)
-        for key in matches:
-            if key in allowed_keys:
-                if key not in detailed_results:
-                    detailed_results[key] = {"file": path, "lines": []}
-                if file_path.endswith("main.py"):
-                    detailed_results[key]["lines"].append(str(i + main_start))
-                else:
-                    detailed_results[key]["lines"].append(str(i))
-
-    return detailed_results
+    all_str = " ".join(lines)
+    matches = set(pattern.findall(all_str)) & allowed_keys
+    category = classify_file_category(file_path)
+    for key in matches:
+        category_set[category].add(key)
 
 
-def move_duplicates_together(data, keep):
-    all_keys = list(data.keys())
-    duplicates = set()
-
-    for i, key1 in enumerate(all_keys):
-        for key2 in all_keys[i + 1 :]:
-            duplicates |= data[key1] & data[key2]
-
-    data[keep] |= duplicates
-
-    for key in all_keys:
-        if key != keep:
-            data[key] -= duplicates
+def move_duplicates_together(data):
+    duplicates = (data["general"] & data["linear"]) | (data["general"] & data["nn"]) | (data["linear"] & data["nn"])
 
+    data["general"].update(duplicates)
+    data["linear"] -= duplicates
+    data["nn"] -= duplicates
     return data
 
 
 def classify(raw_flags):
-
     category_set = {"general": set(), "linear": set(), "nn": set()}
+
     flags = fetch_option_flags(raw_flags)
     allowed_keys = set(flag["instruction"] for flag in flags)
     file_set = fetch_all_files()
-    usage_map = defaultdict(list)
-    collected = {}
 
     for file_path in file_set:
-        detailed_results = find_config_usages_in_file(file_path, allowed_keys)
-        if detailed_results:
-            usage_map[file_path] = set(detailed_results.keys())
-            for k, v in detailed_results.items():
-                if k not in collected:
-                    collected[k] = []
-                collected[k].append(v)
-
-    for path, keys in usage_map.items():
-        category, path = classify_file_category(path)
-        category_set[category] = category_set[category].union(keys)
+        find_config_usages_in_file(file_path, allowed_keys, category_set)
 
-    category_set = move_duplicates_together(category_set, "general")
+    category_set = move_duplicates_together(category_set)
 
-    for flag in flags:
-        for k, v in category_set.items():
-            for i in v:
-                if flag["instruction"] == i:
-                    flag["category"] = k
-        if "category" not in flag:
-            flag["category"] = "general"
-
-    result = {}
-    for flag in flags:
-        if flag["category"] not in result:
-            result[flag["category"]] = []
+    result = defaultdict(list)
+    for flag in raw_flags:
+        instr = flag["name"].replace("\\", "").split("-")[-1]
+        flag_name = flag["name"].replace("--", r"\-\-")
 
-        result[flag["category"]].append(
-            {"name": flag["name"].replace("--", r"\-\-"), "description": flag["description"]}
-        )
+        matched = False
+        for category, keys in category_set.items():
+            if instr in keys:
+                result[category].append({"name": flag_name, "description": flag["description"]})
+                matched = True
+                break
 
-    result["details"] = []
-    for k, v in collected.items():
-        result["details"].append({"name": k, "file": v[0]["file"], "location": ", ".join(v[0]["lines"])})
-        if len(v) > 1:
-            for i in v[1:]:
-                result["details"].append({"name": "", "file": i["file"], "location": ", ".join(i["lines"])})
+        if not matched:
+            result["general"].append({"name": flag_name, "description": flag["description"]})
 
     return result
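After this rewrite, move_duplicates_together hard-codes the three categories: any key claimed by more than one of them is promoted to "general". A quick check of the pairwise-intersection logic with hypothetical key sets:

    data = {
        "general": {"seed"},
        "linear": {"liblinear_options", "val_size"},
        "nn": {"learning_rate", "val_size"},  # "val_size" is used by both trainers
    }
    duplicates = (data["general"] & data["linear"]) | (data["general"] & data["nn"]) | (data["linear"] & data["nn"])
    data["general"].update(duplicates)
    data["linear"] -= duplicates
    data["nn"] -= duplicates
    print(data["general"])  # {'seed', 'val_size'} (set order may vary)
    print(data["linear"])   # {'liblinear_options'}
    print(data["nn"])       # {'learning_rate'}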
From 41d80569871c744771573925149bc8a6aa0657f8 Mon Sep 17 00:00:00 2001
From: Jie-Jyun Liu
Date: Mon, 10 Nov 2025 14:05:59 +0800
Subject: [PATCH 5/6] Apply suggestions from code review

---
 docs/cli/classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/cli/classifier.py b/docs/cli/classifier.py
index ad253cb4..82490640 100644
--- a/docs/cli/classifier.py
+++ b/docs/cli/classifier.py
@@ -85,7 +85,7 @@ def classify(raw_flags):
     for file_path in file_set:
         find_config_usages_in_file(file_path, allowed_keys, category_set)
 
-    category_set = move_duplicates_together(category_set)
+    move_duplicates_together(category_set)
 
     result = defaultdict(list)
     for flag in raw_flags:

From 3d725f5c6a3243d18a30730422e9284477f6f2a5 Mon Sep 17 00:00:00 2001
From: Winter Deng
Date: Mon, 10 Nov 2025 14:11:38 +0800
Subject: [PATCH 6/6] optimize function

---
 docs/cli/classifier.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/docs/cli/classifier.py b/docs/cli/classifier.py
index 82490640..0ff52ad8 100644
--- a/docs/cli/classifier.py
+++ b/docs/cli/classifier.py
@@ -10,7 +10,6 @@
 
 
 def classify_file_category(path):
-
     relative_path = Path(path).relative_to(lib_path)
     filename = "/".join(relative_path.parts[1:]) or relative_path.as_posix()
 
@@ -72,8 +71,6 @@ def move_duplicates_together(data):
     data["linear"] -= duplicates
     data["nn"] -= duplicates
 
-    return data
-
 
 def classify(raw_flags):
     category_set = {"general": set(), "linear": set(), "nn": set()}
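With the whole series applied, classify() returns a defaultdict(list) keyed by category, and any flag whose extracted key never matches a config.<key> usage falls back to "general". A minimal sketch of the final grouping loop on made-up inputs:

    from collections import defaultdict

    # Made-up flag records and a pre-computed category_set; only the shape of
    # the result matters here.
    raw_flags = [
        {"name": "--seed", "description": "Random seed (default: %(default)s)"},
        {"name": "--linear", "description": "Train linear model"},
        {"name": "--epochs", "description": "The number of epochs to train (default: %(default)s)"},
        {"name": "--config", "description": "Path to configuration file"},  # no config.config usage
    ]
    category_set = {"general": {"seed"}, "linear": {"linear"}, "nn": {"epochs"}}

    result = defaultdict(list)
    for flag in raw_flags:
        instr = flag["name"].replace("\\", "").split("-")[-1]
        flag_name = flag["name"].replace("--", r"\-\-")
        matched = False
        for category, keys in category_set.items():
            if instr in keys:
                result[category].append({"name": flag_name, "description": flag["description"]})
                matched = True
                break
        if not matched:
            result["general"].append({"name": flag_name, "description": flag["description"]})

    print([f["name"] for f in result["general"]])  # ['\\-\\-seed', '\\-\\-config']

genflags.py then feeds result["general"], result["linear"], and result["nn"] to print_table, one RST table per category.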