diff --git a/docs/cli/classifier.py b/docs/cli/classifier.py
new file mode 100644
index 00000000..0ff52ad8
--- /dev/null
+++ b/docs/cli/classifier.py
@@ -0,0 +1,102 @@
+import os
+import sys
+import glob
+import re
+from pathlib import Path
+from collections import defaultdict
+
+lib_path = Path.cwd().parent
+sys.path.insert(0, str(lib_path))
+
+
+def classify_file_category(path):
+    relative_path = Path(path).relative_to(lib_path)
+    filename = "/".join(relative_path.parts[1:]) or relative_path.as_posix()
+
+    if filename.startswith("linear"):
+        return "linear"
+    if filename.startswith(("torch", "nn")):
+        return "nn"
+    return "general"
+
+
+def fetch_option_flags(flags):
+    flag_list = []
+
+    for flag in flags:
+        flag_list.append(
+            {
+                "name": flag["name"].replace("\\", ""),
+                "instruction": flag["name"].split("-")[-1],
+                "description": flag["description"],
+            }
+        )
+
+    return flag_list
+
+
+def fetch_all_files():
+    main_files = [
+        os.path.join(lib_path, "main.py"),
+        os.path.join(lib_path, "linear_trainer.py"),
+        os.path.join(lib_path, "torch_trainer.py"),
+    ]
+    lib_files = glob.glob(os.path.join(lib_path, "libmultilabel/**/*.py"), recursive=True)
+    file_set = set(map(os.path.abspath, main_files + lib_files))
+    return file_set
+
+
+def find_config_usages_in_file(file_path, allowed_keys, category_set):
+    pattern = re.compile(r"\bconfig\.([a-zA-Z_][a-zA-Z0-9_]*)")
+
+    with open(file_path, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+
+    if file_path.endswith("main.py"):
+        for idx in range(len(lines)):
+            if lines[idx].startswith("def main("):
+                lines = lines[idx:]
+                break
+    all_str = " ".join(lines)
+    matches = set(pattern.findall(all_str)) & allowed_keys
+
+    category = classify_file_category(file_path)
+    for key in matches:
+        category_set[category].add(key)
+
+
+def move_duplicates_together(data):
+    duplicates = (data["general"] & data["linear"]) | (data["general"] & data["nn"]) | (data["linear"] & data["nn"])
+    data["general"].update(duplicates)
+    data["linear"] -= duplicates
+    data["nn"] -= duplicates
+
+
+def classify(raw_flags):
+    category_set = {"general": set(), "linear": set(), "nn": set()}
+
+    flags = fetch_option_flags(raw_flags)
+    allowed_keys = set(flag["instruction"] for flag in flags)
+    file_set = fetch_all_files()
+
+    for file_path in file_set:
+        find_config_usages_in_file(file_path, allowed_keys, category_set)
+
+    move_duplicates_together(category_set)
+
+    result = defaultdict(list)
+    for flag in raw_flags:
+        instr = flag["name"].replace("\\", "").split("-")[-1]
+        flag_name = flag["name"].replace("--", r"\-\-")
+
+        matched = False
+        for category, keys in category_set.items():
+            if instr in keys:
+                result[category].append({"name": flag_name, "description": flag["description"]})
+                matched = True
+                break
+
+        if not matched:
+            result["general"].append({"name": flag_name, "description": flag["description"]})
+
+    return result
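For orientation before the generator below, here is a hypothetical smoke test of classify(); the flag entries are invented, and fetch_all_files() resolves paths against Path.cwd().parent, so this assumes a working directory one level below the LibMultiLabel root, as in the docs build:

# Hypothetical driver for classify(); not part of the patch.
from classifier import classify

fake_flags = [
    {"name": "--seed", "description": "Random seed"},
    {"name": "--linear", "description": "Train linear model"},
]

# classify() scans main.py, linear_trainer.py, torch_trainer.py and
# libmultilabel/**/*.py for `config.<key>` references, buckets each flag by
# where its key appears, and reST-escapes the flag names (e.g. "\-\-seed").
tables = classify(fake_flags)
for category in ("general", "linear", "nn"):
    for flag in tables[category]:
        print(category, flag["name"], "-", flag["description"])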
diff --git a/docs/cli/genflags.py b/docs/cli/genflags.py
index 006991e8..c2036a75 100644
--- a/docs/cli/genflags.py
+++ b/docs/cli/genflags.py
@@ -2,8 +2,11 @@
 import os
 
 sys.path.insert(1, os.path.join(sys.path[0], "..", ".."))
 
+
 import main
 
+from classifier import classify
+
 class FakeParser(dict):
     def __init__(self):
@@ -29,21 +32,45 @@ def add_argument(
 parser.add_argument("-c", "--config", help="Path to configuration file")
 main.add_all_arguments(parser)
 
+classified = classify(parser.flags)
+
+
+def width_title(key, title):
+    return max(map(lambda f: len(f[key]), classified[title]))
 
-def width(key):
-    return max(map(lambda f: len(f[key]), parser.flags))
 
+def print_table(title, flags, intro):
+    print()
+    print(intro)
+    print()
 
-wn = width("name")
-wd = width("description")
+    wn = width_title("name", title)
+    wd = width_title("description", title)
 
-print(
-    """..
-    Do not modify this file. This file is generated by genflags.py.\n"""
+    print("=" * wn, "=" * wd)
+    print("Name".ljust(wn), "Description".ljust(wd))
+    print("=" * wn, "=" * wd)
+    for flag in flags:
+        print(flag["name"].ljust(wn), flag["description"].ljust(wd))
+    print("=" * wn, "=" * wd)
+    print()
+
+
+print_table(
+    "general",
+    classified["general"],
+    intro="**General options**:\n\
+Common configurations shared across both linear and neural network trainers.",
+)
+print_table(
+    "linear",
+    classified["linear"],
+    intro="**Linear options**:\n\
+Configurations specific to the linear trainer.",
+)
+print_table(
+    "nn",
+    classified["nn"],
+    intro="**Neural network options**:\n\
+Configurations specific to the torch (neural networks) trainer.",
 )
-print("=" * wn, "=" * wd)
-print("Name".ljust(wn), "Description".ljust(wd))
-print("=" * wn, "=" * wd)
-for flag in parser.flags:
-    print(flag["name"].ljust(wn), flag["description"].ljust(wd))
-print("=" * wn, "=" * wd)
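For reference, each print_table() call emits a reST simple table whose column widths come from the longest name and description. Illustratively, if (hypothetically) only --cpu and --silent were classified as general, the generated output would be:

**General options**:
Common configurations shared across both linear and neural network trainers.

========== ==================
Name       Description
========== ==================
\-\-cpu    Disable CUDA
\-\-silent Enable silent mode
========== ==================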
diff --git a/docs/conf.py b/docs/conf.py
index 2d39be73..d07438e1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -49,6 +49,7 @@
     "examples_dirs": "./examples",  # path to your example scripts
     "gallery_dirs": "auto_examples",  # path to where to save gallery generated output
     "plot_gallery": False,
+    "write_computation_times": False,
 }
 
 # bibtex files
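A note on the conf.py change: to my understanding, write_computation_times is the sphinx-gallery option (available in recent releases) that stops the build from emitting sg_execution_times pages, so together with the existing plot_gallery: False the gallery is neither executed nor timed. The relevant block then reads roughly:

sphinx_gallery_conf = {
    "examples_dirs": "./examples",  # path to your example scripts
    "gallery_dirs": "auto_examples",  # path to where to save gallery generated output
    "plot_gallery": False,  # do not execute the examples at build time
    "write_computation_times": False,  # do not write computation-time files
}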
diff --git a/main.py b/main.py
index 70907edf..7a523f1f 100644
--- a/main.py
+++ b/main.py
@@ -11,21 +11,50 @@
 
 
 def add_all_arguments(parser):
-    # path / directory
+
     parser.add_argument(
-        "--result_dir", default="./runs", help="The directory to save checkpoints and logs (default: %(default)s)"
+        "-h",
+        "--help",
+        action="help",
+        help="Quickstart: https://www.csie.ntu.edu.tw/~cjlin/libmultilabel/cli/quickstart.html",
     )
+    parser.add_argument("--seed", type=int, help="Random seed (default: %(default)s)")
+
+    # choose model (linear / nn)
+    parser.add_argument("--linear", action="store_true", help="Train linear model")
+
+    # others
+    parser.add_argument("--cpu", action="store_true", help="Disable CUDA")
+    parser.add_argument("--silent", action="store_true", help="Enable silent mode")
+    parser.add_argument(
+        "--data_workers", type=int, default=4, help="Use multiple CPU cores for data pre-processing (default: %(default)s)"
+    )
+    parser.add_argument(
+        "--embed_cache_dir",
+        type=str,
+        help="For parameter search only: path to a directory for storing embeddings for multiple runs. (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--eval", action="store_true", help="Only run evaluation on the test set (default: %(default)s)"
+    )
+    parser.add_argument("--checkpoint_path", help="The checkpoint to warm up with (default: %(default)s)")
+
 
     # data
-    parser.add_argument("--data_name", default="unnamed_data", help="Dataset name (default: %(default)s)")
+    parser.add_argument(
+        "--data_name",
+        default="unnamed_data",
+        help="Dataset name for generating the output directory (default: %(default)s)",
+    )
     parser.add_argument("--training_file", help="Path to training data (default: %(default)s)")
     parser.add_argument("--val_file", help="Path to validation data (default: %(default)s)")
-    parser.add_argument("--test_file", help="Path to test data (default: %(default)s")
+    parser.add_argument("--test_file", help="Path to test data (default: %(default)s)")
+    parser.add_argument("--label_file", type=str, help="Path to a file holding all labels (default: %(default)s)")
     parser.add_argument(
         "--val_size",
         type=float,
         default=0.2,
-        help="Training-validation split: a ratio in [0, 1] or an integer for the size of the validation set (default: %(default)s).",
+        help="Training-validation split: a ratio in [0, 1] or an integer for the size of the validation set (default: %(default)s)",
     )
     parser.add_argument(
         "--min_vocab_freq",
@@ -67,8 +96,24 @@ def add_all_arguments(parser):
         help="Whether to add the special tokens for inputs of the transformer-based language model. (default: %(default)s)",
     )
 
+    # model
+    parser.add_argument("--model_name", default="unnamed_model", help="Model to be used (default: %(default)s)")
+    parser.add_argument(
+        "--init_weight", default="kaiming_uniform", help="Weight initialization to be used (default: %(default)s)"
+    )
+    parser.add_argument(
+        "--loss_function", default="binary_cross_entropy_with_logits", help="Loss function (default: %(default)s)"
+    )
+
+    # pretrained vocab / embeddings
+    parser.add_argument("--vocab_file", type=str, help="Path to a file holding vocabularies (default: %(default)s)")
+    parser.add_argument(
+        "--embed_file",
+        type=str,
+        help="Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding (default: %(default)s)",
+    )
+
     # train
-    parser.add_argument("--seed", type=int, help="Random seed (default: %(default)s)")
     parser.add_argument(
         "--epochs", type=int, default=10000, help="The number of epochs to train (default: %(default)s)"
     )
@@ -109,15 +154,6 @@ def add_all_arguments(parser):
         help="Whether the embeddings of each word is normalized to a unit vector (default: %(default)s)",
     )
 
-    # model
-    parser.add_argument("--model_name", default="unnamed_model", help="Model to be used (default: %(default)s)")
-    parser.add_argument(
-        "--init_weight", default="kaiming_uniform", help="Weight initialization to be used (default: %(default)s)"
-    )
-    parser.add_argument(
-        "--loss_function", default="binary_cross_entropy_with_logits", help="Loss function (default: %(default)s)"
-    )
-
     # eval
     parser.add_argument(
         "--eval_batch_size", type=int, default=256, help="Size of evaluating batches (default: %(default)s)"
     )
@@ -138,28 +174,6 @@ def add_all_arguments(parser):
         "--val_metric", default="P@1", help="The metric to select the best model for testing (default: %(default)s)"
     )
 
-    # pretrained vocab / embeddings
-    parser.add_argument("--vocab_file", type=str, help="Path to a file holding vocabuaries (default: %(default)s)")
-    parser.add_argument(
-        "--embed_file", type=str, help="Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding (default: %(default)s)"
-    )
-    parser.add_argument("--label_file", type=str, help="Path to a file holding all labels (default: %(default)s)")
-
-    # log
-    parser.add_argument(
-        "--save_k_predictions",
-        type=int,
-        nargs="?",
-        const=100,
-        default=0,
-        help="Save top k predictions on test set. k=%(const)s if not specified. (default: %(default)s)",
-    )
-    parser.add_argument(
-        "--predict_out_path",
-        default="./predictions.txt",
-        help="Path to the output file holding label results (default: %(default)s)",
-    )
-
     # auto-test
     parser.add_argument(
         "--limit_train_batches",
@@ -180,24 +194,27 @@ def add_all_arguments(parser):
         help="Percentage of test dataset to use for auto-testing (default: %(default)s)",
     )
 
-    # others
-    parser.add_argument("--cpu", action="store_true", help="Disable CUDA")
-    parser.add_argument("--silent", action="store_true", help="Enable silent mode")
+    # log
     parser.add_argument(
-        "--data_workers", type=int, default=4, help="Use multi-cpu core for data pre-processing (default: %(default)s)"
+        "--save_k_predictions",
+        type=int,
+        nargs="?",
+        const=100,
+        default=0,
+        help="Save top k predictions on test set. k=%(const)s if not specified. (default: %(default)s)",
    )
     parser.add_argument(
-        "--embed_cache_dir",
-        type=str,
-        help="For parameter search only: path to a directory for storing embeddings for multiple runs. (default: %(default)s)",
+        "--predict_out_path",
+        default="./predictions.txt",
+        help="Path to the output file holding label results (default: %(default)s)",
     )
+
+    # path / directory
     parser.add_argument(
-        "--eval", action="store_true", help="Only run evaluation on the test set (default: %(default)s)"
+        "--result_dir", default="./runs", help="The directory to save checkpoints and logs (default: %(default)s)"
     )
-    parser.add_argument("--checkpoint_path", help="The checkpoint to warm-up with (default: %(default)s)")
 
     # linear options
-    parser.add_argument("--linear", action="store_true", help="Train linear model")
     parser.add_argument(
         "--data_format",
         type=str,
@@ -224,7 +241,10 @@ def add_all_arguments(parser):
         "--tree_max_depth", type=int, default=10, help="Maximum depth of the tree (default: %(default)s)"
     )
     parser.add_argument(
-        "--tree_ensemble_models", type=int, default=1, help="Number of models in the tree ensemble (default: %(default)s)"
+        "--tree_ensemble_models",
+        type=int,
+        default=1,
+        help="Number of models in the tree ensemble (default: %(default)s)",
     )
     parser.add_argument(
         "--beam_width",
@@ -239,13 +259,6 @@ def add_all_arguments(parser):
         default=8,
         help="the maximal number of labels inside a cluster (default: %(default)s)",
     )
-    parser.add_argument(
-        "-h",
-        "--help",
-        action="help",
-        help="If you are trying to specify network config such as dropout or activation or config of the learning rate scheduler, use a yaml file instead. "
-        "See example configs in example_config",
-    )
 
 
 def get_config():
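Two argparse details in the main.py hunks are easy to miss. Re-registering -h/--help inside add_all_arguments only works if the parser is constructed with add_help=False (assumed here; the patch does not show the parser's construction), and --save_k_predictions uses nargs="?" with const so that the bare flag means top-100 while omitting it disables saving. A minimal self-contained sketch:

import argparse

# add_help=False frees up -h/--help so a custom help action can be attached,
# mirroring the new -h/--help entry in add_all_arguments(). With the default
# add_help=True, re-adding -h would raise an argparse conflict error.
parser = argparse.ArgumentParser(prog="demo", add_help=False)
parser.add_argument(
    "-h",
    "--help",
    action="help",
    help="Quickstart: https://www.csie.ntu.edu.tw/~cjlin/libmultilabel/cli/quickstart.html",
)

# nargs="?" + const + default: flag absent -> 0, bare flag -> 100, explicit value -> that value.
parser.add_argument("--save_k_predictions", type=int, nargs="?", const=100, default=0)

print(parser.parse_args([]).save_k_predictions)  # 0
print(parser.parse_args(["--save_k_predictions"]).save_k_predictions)  # 100
print(parser.parse_args(["--save_k_predictions", "5"]).save_k_predictions)  # 5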