diff --git a/experiments/code/ace/lite_llm_generator.py b/experiments/code/ace/lite_llm_generator.py index b758163..2f705d1 100644 --- a/experiments/code/ace/lite_llm_generator.py +++ b/experiments/code/ace/lite_llm_generator.py @@ -24,6 +24,14 @@ RateLimitError, UnprocessableEntityError, ) +from httpx import ( + ConnectTimeout, + ReadTimeout, + TimeoutException, + ConnectError, + ReadError, + WriteError, +) from rich.panel import Panel from appworld import AppWorld @@ -34,6 +42,7 @@ cache = Memory(os.path.join(path_store.cache, "llm_calls"), verbose=0) RETRY_ERROR = ( + # OpenAI exceptions APIConnectionError, APIError, APIResponseValidationError, @@ -48,6 +57,13 @@ PermissionDeniedError, RateLimitError, UnprocessableEntityError, + # httpx exceptions for connection/timeout issues (e.g., SambaNova client) + ConnectTimeout, + ReadTimeout, + TimeoutException, + ConnectError, + ReadError, + WriteError, ) CHAT_COMPLETION = { # These are lambda so set environment variables take effect at runtime "openai": lambda: OpenAI(api_key=os.environ["SAMBANOVA_API_KEY"], base_url="https://api.sambanova.ai/v1").chat.completions.create, @@ -145,7 +161,10 @@ def non_cached_chat_completion( if provider.strip().lower() == "sambanova": from sambanova import SambaNova - client = SambaNova() + import httpx + # Set longer timeout: 60s for connection, 600s for read/write + timeout = httpx.Timeout(60.0, connect=60.0, read=600.0, write=600.0) + client = SambaNova(timeout=timeout) elif provider.strip().lower() == "together": from together import Together client = Together() @@ -310,7 +329,7 @@ def generate( return {"content": "", "tool_calls": [], "cost": 0} success = False - for _ in range(self.max_retries): + for attempt in range(self.max_retries): try: arguments = { "model": self.model, @@ -330,7 +349,9 @@ def generate( print(traceback.format_exc()) exit() - print(f"Encountered LM Error: {exception.message[:200].strip()}...") + error_msg = str(exception)[:200] if str(exception) else
#!/usr/bin/env python3
"""
Analyze task difficulty distribution and instruction similarity from a dataset file.

Usage (run from repository root):
    python3 experiments/curriculum/analyze_dataset.py --dataset train.txt
    python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric cosine:0.8
    python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric oracle
    python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric embedding:0.85 --embedding-model text-embedding-3-large

Available similarity metrics:
    - jaccard:THRESHOLD     word-based set similarity (e.g., jaccard:0.7)
    - cosine:THRESHOLD      TF-IDF cosine similarity (e.g., cosine:0.8)
    - levenshtein:THRESHOLD character-level edit distance (e.g., levenshtein:0.75)
    - oracle                ground-truth clustering via task family IDs; no threshold needed
    - embedding:THRESHOLD   OpenAI embeddings API; requires OPENAI_API_KEY
"""

import argparse
import json
from collections import Counter
from pathlib import Path
from typing import Dict, List, Tuple


def load_dataset(dataset_name: str) -> List[str]:
    """Return the non-empty task IDs listed in data/datasets/<dataset_name>.

    Raises:
        FileNotFoundError: if the dataset file does not exist.
    """
    dataset_file = Path("data/datasets") / dataset_name
    if not dataset_file.exists():
        raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
    with open(dataset_file, 'r') as f:
        return [line.strip() for line in f if line.strip()]


def get_task_difficulty(task_id: str) -> Tuple[str, int, str]:
    """Return (task_id, difficulty, instruction) for a task.

    difficulty comes from data/tasks/<id>/ground_truth/metadata.json and is -1
    when that file is missing or unreadable; instruction comes from
    data/tasks/<id>/specs.json and falls back to "" (a missing instruction is
    non-fatal and does not discard the difficulty already read).
    """
    metadata_path = Path("data/tasks") / task_id / "ground_truth" / "metadata.json"
    specs_path = Path("data/tasks") / task_id / "specs.json"

    try:
        with open(metadata_path, 'r') as f:
            difficulty = json.load(f).get('difficulty', -1)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Warning: Could not read metadata for task {task_id}: {e}")
        return (task_id, -1, "")

    instruction = ""
    try:
        with open(specs_path, 'r') as f:
            instruction = json.load(f).get('instruction', '')
    except (FileNotFoundError, json.JSONDecodeError):
        pass  # instruction is optional

    return (task_id, difficulty, instruction)


def analyze_difficulty_distribution(task_ids: List[str]) -> List[Tuple[str, int, str]]:
    """Resolve every task ID to its (task_id, difficulty, instruction) tuple."""
    return [get_task_difficulty(task_id) for task_id in task_ids]


def print_distribution(task_difficulty_pairs: List[Tuple[str, int, str]]):
    """Print a histogram of difficulty levels.

    Tasks with unknown difficulty (difficulty < 0) get their own row so the
    per-row percentages reconcile with the 100.00% total line (previously the
    unknown tasks counted toward the total but were silently omitted from the
    rows, so the printed percentages did not add up).
    """
    total_tasks = len(task_difficulty_pairs)
    difficulty_counts: Dict[int, int] = {}
    unknown_count = 0
    for _, difficulty, _ in task_difficulty_pairs:
        if difficulty >= 0:
            difficulty_counts[difficulty] = difficulty_counts.get(difficulty, 0) + 1
        else:
            unknown_count += 1

    print("\n" + "=" * 60)
    print("DIFFICULTY DISTRIBUTION")
    print("=" * 60)

    print(f"\n{'Difficulty':<12} {'Count':<10} {'Percentage':<12} {'Bar'}")
    print("-" * 60)

    for difficulty in sorted(difficulty_counts.keys()):
        count = difficulty_counts[difficulty]
        percentage = (count / total_tasks) * 100
        bar = "ā–ˆ" * int(percentage / 2)  # one block per 2%
        print(f"Level {difficulty:<6} {count:<10} {percentage:>6.2f}% {bar}")

    if unknown_count:
        percentage = (unknown_count / total_tasks) * 100
        print(f"{'Unknown':<12} {unknown_count:<10} {percentage:>6.2f}%")

    print("-" * 60)
    print(f"{'Total':<12} {total_tasks:<10} {'100.00%':<12}")
    print()


def print_task_list(task_difficulty_pairs: List[Tuple[str, int, str]]):
    """Print tasks in their original dataset order with difficulty appended."""
    print("\n" + "=" * 60)
    print("TASKS IN ORIGINAL ORDER (with difficulty)")
    print("=" * 60)
    print()

    for task_id, difficulty, _ in task_difficulty_pairs:
        diff_str = f"Level {difficulty}" if difficulty >= 0 else "Unknown"
        print(f"{task_id:<20} -> {diff_str}")

    print()


def _analyze_oracle_similarity(
    task_difficulty_pairs: List[Tuple[str, int, str]],
    threshold: float = 0.7
) -> Dict:
    """Cluster tasks by their ground-truth family ID.

    The threshold argument is accepted for interface parity with the other
    metrics but ignored: family-ID matching is exact, so membership is binary
    (same family -> same cluster).

    Args:
        task_difficulty_pairs: (task_id, difficulty, instruction) tuples.
        threshold: ignored for the oracle metric.

    Returns:
        Summary dict matching analyze_instruction_similarity's shape, with
        one cluster per task family, sorted by cluster size (largest first).
    """
    # Lazy project import keeps the stdlib-only helpers above importable
    # without the similarity_metrics dependency.
    from similarity_metrics import extract_task_family_id

    family_to_tasks: Dict[str, List[Dict]] = {}
    all_tasks_count = 0
    for task_id, difficulty, instruction in task_difficulty_pairs:
        family_id = extract_task_family_id(task_id)
        family_to_tasks.setdefault(family_id, []).append({
            "task_id": task_id,
            "difficulty": difficulty,
            "instruction": instruction,
        })
        all_tasks_count += 1

    detailed_clusters = []
    for idx, (family_id, tasks) in enumerate(family_to_tasks.items()):
        unique_instructions = set(task["instruction"] for task in tasks if task["instruction"])
        detailed_clusters.append({
            "cluster_id": idx,
            "family_id": family_id,
            "size": len(tasks),
            "unique_instructions": len(unique_instructions),
            "tasks": tasks,
        })
    detailed_clusters.sort(key=lambda x: x["size"], reverse=True)

    cluster_sizes = Counter(len(cluster["tasks"]) for cluster in detailed_clusters)

    all_instructions = [instruction for _, _, instruction in task_difficulty_pairs if instruction]
    unique_instructions_total = len(set(all_instructions))
    exact_duplicates = len(all_instructions) - unique_instructions_total

    return {
        "total_instructions": all_tasks_count,
        "exact_duplicates": exact_duplicates,
        "unique_instructions": unique_instructions_total,
        "similarity_metric": "oracle",
        "clustering_threshold": "N/A (oracle uses exact family ID matching)",
        "num_clusters": len(detailed_clusters),
        "cluster_size_distribution": dict(cluster_sizes),
        "clusters": detailed_clusters,
    }
def analyze_instruction_similarity(
    task_difficulty_pairs: List[Tuple[str, int, str]],
    similarity_metric: str = "jaccard",
    threshold: float = 0.7,
    embedding_model: str = "text-embedding-3-large"
) -> Dict:
    """Greedily cluster instructions whose pairwise similarity is >= threshold.

    Args:
        task_difficulty_pairs: (task_id, difficulty, instruction) tuples.
        similarity_metric: jaccard | cosine | levenshtein | oracle | embedding.
        threshold: minimum similarity for two instructions to share a cluster
            (ignored by "oracle", which matches on task family IDs).
        embedding_model: OpenAI model name, used only by "embedding".

    Returns:
        Summary dict with duplicate counts, cluster-size distribution, and
        per-cluster task details sorted by cluster size (largest first).

    Raises:
        EnvironmentError: the "embedding" metric was requested but
            OPENAI_API_KEY is not set (fail fast instead of sending
            api_key=None into the embeddings call).
    """
    # Oracle clusters by task ID directly, no text similarity involved.
    if similarity_metric == "oracle":
        return _analyze_oracle_similarity(task_difficulty_pairs, threshold)

    # Lazy project import keeps the stdlib-only reporting/saving helpers in
    # this module importable without the similarity_metrics dependency.
    from similarity_metrics import (
        calculate_similarity,
        compute_embeddings_batch,
        compute_idf_scores,
    )

    # Map each unique instruction to the tasks that use it.
    instruction_to_tasks: Dict[str, List[Dict]] = {}
    for task_id, difficulty, instruction in task_difficulty_pairs:
        if instruction:
            instruction_to_tasks.setdefault(instruction, []).append(
                {"task_id": task_id, "difficulty": difficulty}
            )

    instructions = list(instruction_to_tasks.keys())
    if not instructions:
        # Empty dataset: return a well-formed, all-zero summary.
        return {
            "total_instructions": 0,
            "exact_duplicates": 0,
            "unique_instructions": 0,
            "similarity_metric": similarity_metric,
            "clustering_threshold": threshold,
            "num_clusters": 0,
            "clusters": []
        }

    unique_instructions = len(instructions)
    total_instructions_count = sum(len(tasks) for tasks in instruction_to_tasks.values())
    exact_duplicates = total_instructions_count - unique_instructions  # duplicate copies

    # Metric-specific precomputation.
    idf_scores = None
    embeddings_cache = None
    api_key = None
    if similarity_metric == "cosine":
        idf_scores = compute_idf_scores(instructions)
    elif similarity_metric == "embedding":
        import os
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            # Fail fast with an actionable message instead of an opaque API error.
            raise EnvironmentError(
                "OPENAI_API_KEY must be set to use the 'embedding' similarity metric"
            )
        print(f"  Computing embeddings for {len(instructions)} unique instructions using model '{embedding_model}'...")
        embeddings_cache = compute_embeddings_batch(instructions, api_key, model=embedding_model)
        print("  Embeddings computed successfully")

    # Greedy single-pass clustering: an instruction joins the first cluster
    # containing any member within `threshold`, otherwise it seeds a new one.
    clusters: List[List[str]] = []
    clustered = set()
    for i, inst1 in enumerate(instructions):
        if inst1 in clustered:
            continue
        cluster = [inst1]
        clustered.add(inst1)
        for inst2 in instructions[i + 1:]:
            if inst2 in clustered:
                continue
            if any(
                calculate_similarity(
                    cluster_inst, inst2, similarity_metric,
                    idf_scores=idf_scores,
                    embeddings_cache=embeddings_cache,
                    api_key=api_key,
                ) >= threshold
                for cluster_inst in cluster
            ):
                cluster.append(inst2)
                clustered.add(inst2)
        clusters.append(cluster)

    cluster_sizes = Counter(len(cluster) for cluster in clusters)

    # Expand instruction clusters back into task-level details.
    detailed_clusters = []
    for idx, cluster in enumerate(clusters):
        cluster_tasks = [
            {"task_id": t["task_id"], "difficulty": t["difficulty"], "instruction": instruction}
            for instruction in cluster
            for t in instruction_to_tasks[instruction]
        ]
        detailed_clusters.append({
            "cluster_id": idx,
            "size": len(cluster_tasks),
            "unique_instructions": len(cluster),
            "tasks": cluster_tasks,
        })
    detailed_clusters.sort(key=lambda x: x["size"], reverse=True)

    return {
        "total_instructions": total_instructions_count,
        "exact_duplicates": exact_duplicates,
        "unique_instructions": unique_instructions,
        "similarity_metric": similarity_metric,
        "clustering_threshold": threshold,
        "num_clusters": len(clusters),
        "cluster_size_distribution": dict(cluster_sizes),
        "clusters": detailed_clusters,
    }


def print_similarity_analysis(similarity_data: Dict):
    """Print the clustering summary produced by analyze_instruction_similarity."""
    print("\n" + "=" * 60)
    print("INSTRUCTION CLUSTERING ANALYSIS")
    print("=" * 60)
    print()

    print(f"Total instructions: {similarity_data['total_instructions']}")
    print(f"Unique instructions: {similarity_data['unique_instructions']}")
    print(f"Exact duplicates (copies): {similarity_data['exact_duplicates']}")
    print()

    print(f"Clustering (metric: {similarity_data['similarity_metric']}, threshold ≄{similarity_data['clustering_threshold']}):")
    print(f"  Number of clusters: {similarity_data['num_clusters']}")
    print(f"  Cluster size distribution: {similarity_data['cluster_size_distribution']}")
    print()

    # Show the 10 largest clusters, each with one example instruction.
    print("Top 10 largest clusters:")
    for cluster in similarity_data['clusters'][:10]:
        print(f"  Cluster {cluster['cluster_id']}: {cluster['size']} tasks ({cluster['unique_instructions']} unique instructions)")
        if cluster['tasks']:
            example = cluster['tasks'][0]['instruction']
            print(f"    Example: {example[:80]}{'...' if len(example) > 80 else ''}")
    print()


def save_analysis(
    task_difficulty_pairs: List[Tuple[str, int, str]],
    dataset_name: str,
    similarity_data: Dict
):
    """Write the full analysis to data/datasets/<dataset_name>_analysis.json.

    Tasks with unknown difficulty (difficulty < 0) are excluded from the
    "tasks" list but still count toward "total_tasks", mirroring the printed
    distribution.
    """
    output_file = Path("data/datasets") / f"{dataset_name}_analysis.json"

    total_tasks = len(task_difficulty_pairs)
    difficulty_counts: Dict[int, int] = {}
    for _, difficulty, _ in task_difficulty_pairs:
        if difficulty >= 0:
            difficulty_counts[difficulty] = difficulty_counts.get(difficulty, 0) + 1

    distribution = {
        f"level_{difficulty}": {
            "count": count,
            "percentage": round((count / total_tasks) * 100, 2),
        }
        for difficulty, count in sorted(difficulty_counts.items())
    }

    tasks_with_difficulty = [
        {"task_id": task_id, "difficulty": difficulty, "instruction": instruction}
        for task_id, difficulty, instruction in task_difficulty_pairs
        if difficulty >= 0
    ]

    output_data = {
        "dataset": dataset_name,
        "total_tasks": total_tasks,
        "distribution": distribution,
        "instruction_similarity": similarity_data,
        "tasks": tasks_with_difficulty,
    }

    with open(output_file, 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"\nAnalysis saved to: {output_file}")


def main():
    """CLI entry point: load a dataset, report difficulty and similarity stats."""
    parser = argparse.ArgumentParser(
        description="Analyze task difficulty distribution from a dataset file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic analysis with default jaccard metric and threshold
  python3 experiments/curriculum/analyze_dataset.py --dataset train.txt

  # Use cosine similarity with custom threshold
  python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric cosine:0.8

  # Use oracle clustering (task family IDs)
  python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric oracle

  # Use embedding similarity with custom model
  python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric embedding:0.85 --embedding-model text-embedding-3-large

Available similarity metrics:
  - jaccard:THRESHOLD (e.g., jaccard:0.7)
  - cosine:THRESHOLD (e.g., cosine:0.8)
  - levenshtein:THRESHOLD (e.g., levenshtein:0.75)
  - oracle (no threshold needed)
  - embedding:THRESHOLD (e.g., embedding:0.85, requires OPENAI_API_KEY)
        """
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset filename (e.g., train.txt, test.txt)"
    )
    parser.add_argument(
        "--similarity-metric",
        type=str,
        default="jaccard:0.7",
        help="Similarity metric with optional threshold in format 'metric:threshold' (e.g., 'cosine:0.8', 'oracle'). Default: jaccard:0.7"
    )
    parser.add_argument(
        "--embedding-model",
        type=str,
        default="text-embedding-3-large",
        help="Embedding model to use when similarity-metric uses 'embedding' (default: text-embedding-3-large). Other OpenAI options: text-embedding-3-small, text-embedding-ada-002"
    )
    args = parser.parse_args()

    # Split "metric:threshold"; "oracle" takes no threshold and ignores it.
    if ':' in args.similarity_metric:
        metric_name, _, raw_threshold = args.similarity_metric.partition(':')
        try:
            threshold = float(raw_threshold)
        except ValueError:
            parser.error(f"Invalid threshold in --similarity-metric argument: {raw_threshold}")
    else:
        metric_name = args.similarity_metric
        if metric_name == "oracle":
            threshold = 0.0  # ignored by the oracle metric
        else:
            threshold = 0.7
            print(f"Note: No threshold specified, using default: {threshold}")

    valid_metrics = ["jaccard", "cosine", "levenshtein", "oracle", "embedding"]
    if metric_name not in valid_metrics:
        parser.error(f"Invalid similarity metric: {metric_name}. Must be one of {valid_metrics}")

    print(f"Loading dataset: {args.dataset}")
    task_ids = load_dataset(args.dataset)
    print(f"Found {len(task_ids)} tasks")

    print("Analyzing difficulty levels...")
    task_difficulty_pairs = analyze_difficulty_distribution(task_ids)

    print_distribution(task_difficulty_pairs)
    print_task_list(task_difficulty_pairs)

    print(f"Analyzing instruction similarity (metric: {metric_name}, threshold: {threshold})...")
    similarity_data = analyze_instruction_similarity(
        task_difficulty_pairs,
        similarity_metric=metric_name,
        threshold=threshold,
        embedding_model=args.embedding_model
    )
    print_similarity_analysis(similarity_data)

    save_analysis(task_difficulty_pairs, Path(args.dataset).stem, similarity_data)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Script to check if two files contain the same data in different orders.
Compares files line by line, ignoring the order of lines.

Equality is decided on the *set* of lines, so duplicate counts do not affect
the verdict (they are reported separately when the sets match).
"""

import argparse
from pathlib import Path
from typing import Set, Tuple


def read_file_lines(file_path: Path) -> Tuple[Set[str], int]:
    """
    Read all lines from a file and return them as a set along with the total count.

    Line terminators are stripped so CRLF and LF files compare equal.

    Args:
        file_path: Path to the file to read

    Returns:
        Tuple of (set of lines, total line count)
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n\r') for line in f]
    return set(lines), len(lines)


def compare_files(file1_path: Path, file2_path: Path, verbose: bool = True) -> bool:
    """
    Compare two files to check if they contain the same data in different orders.

    Args:
        file1_path: Path to the first file
        file2_path: Path to the second file
        verbose: If True, print up to 10 sample lines unique to each side
            when the files differ

    Returns:
        True if files contain the same data, False otherwise
    """
    print(f"Reading {file1_path}...")
    lines1, count1 = read_file_lines(file1_path)

    print(f"Reading {file2_path}...")
    lines2, count2 = read_file_lines(file2_path)

    print(f"\n{file1_path.name}: {count1} total lines, {len(lines1)} unique lines")
    print(f"{file2_path.name}: {count2} total lines, {len(lines2)} unique lines")

    are_same = lines1 == lines2

    if are_same:
        print("\nāœ“ Files contain the SAME data (possibly in different order)")
        if count1 != len(lines1):
            print(f"  Note: File 1 has {count1 - len(lines1)} duplicate lines")
        if count2 != len(lines2):
            print(f"  Note: File 2 has {count2 - len(lines2)} duplicate lines")
    else:
        print("\nāœ— Files contain DIFFERENT data")
        if verbose:
            only_in_file1 = lines1 - lines2
            only_in_file2 = lines2 - lines1
            for name, only in ((file1_path.name, only_in_file1),
                               (file2_path.name, only_in_file2)):
                if only:
                    print(f"\nLines only in {name} ({len(only)} lines):")
                    for line in sorted(only)[:10]:  # Show first 10
                        # Truncate long lines; append an ellipsis only when
                        # something was actually cut off (previously "..." was
                        # printed unconditionally, even for short lines).
                        print(f"  {line[:100]}{'...' if len(line) > 100 else ''}")
                    if len(only) > 10:
                        print(f"  ... and {len(only) - 10} more")

    return are_same


def main():
    """CLI entry point: compare two files under data/datasets/.

    Returns 0 when the files match, 1 on mismatch or error.
    """
    parser = argparse.ArgumentParser(
        description="Check if two files contain the same data in different orders"
    )
    parser.add_argument(
        "file1",
        type=str,
        help="Path to the first file (relative to data/datasets/)"
    )
    parser.add_argument(
        "file2",
        type=str,
        help="Path to the second file (relative to data/datasets/)"
    )
    args = parser.parse_args()

    # Both paths are resolved relative to the datasets folder.
    base_path = Path("data/datasets")
    file1_path = base_path / args.file1
    file2_path = base_path / args.file2

    for path in (file1_path, file2_path):
        if not path.exists():
            print(f"Error: File not found: {path}")
            return 1

    try:
        return 0 if compare_files(file1_path, file2_path) else 1
    except Exception as e:
        print(f"Error comparing files: {e}")
        return 1


if __name__ == "__main__":
    exit(main())
# data_selector.py — selection helpers and similarity clustering.
# The similarity_metrics dependency is imported lazily inside
# cluster_tasks_by_similarity so the pure helpers below remain usable
# (and testable) without it.

import argparse
import json
import random
from pathlib import Path
from typing import Dict, List, Tuple


def load_dataset(dataset_name: str) -> List[str]:
    """Return the non-empty task IDs listed in data/datasets/<dataset_name>.

    Raises:
        FileNotFoundError: if the dataset file does not exist.
    """
    dataset_file = Path("data/datasets") / dataset_name
    if not dataset_file.exists():
        raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
    with open(dataset_file, 'r') as f:
        return [line.strip() for line in f if line.strip()]


def get_task_difficulty(task_id: str) -> int:
    """Return the difficulty level for a task, or -1 if unreadable."""
    metadata_path = Path("data/tasks") / task_id / "ground_truth" / "metadata.json"
    try:
        with open(metadata_path, 'r') as f:
            return json.load(f).get('difficulty', -1)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Warning: Could not read metadata for task {task_id}: {e}")
        return -1


def get_task_instruction(task_id: str) -> str:
    """Return the instruction text for a task, or "" if unreadable."""
    specs_path = Path("data/tasks") / task_id / "specs.json"
    try:
        with open(specs_path, 'r') as f:
            return json.load(f).get('instruction', '')
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Warning: Could not read specs for task {task_id}: {e}")
        return ""


def get_tasks_with_difficulty(task_ids: List[str]) -> List[Tuple[str, int]]:
    """Resolve each task ID to a (task_id, difficulty) tuple, preserving order."""
    return [(task_id, get_task_difficulty(task_id)) for task_id in task_ids]


def filter_by_difficulty(
    task_pairs: List[Tuple[str, int]],
    difficulty_mode: str
) -> List[Tuple[str, int]]:
    """Filter (task_id, difficulty) pairs according to a difficulty mode.

    None keeps every task with a known difficulty. "<level>-only" and
    "<level>-preferred" both keep exactly that level here (the "preferred"
    fallback behaviour is applied later, at selection time). "balanced" and
    "custom" keep all known difficulties so the ratio logic can draw from
    each bucket.

    Raises:
        ValueError: on an unrecognized difficulty mode.
    """
    if difficulty_mode is None:
        return [p for p in task_pairs if p[1] >= 0]

    level_by_mode = {
        "easy-only": 1, "easy-preferred": 1,
        "medium-only": 2, "medium-preferred": 2,
        "hard-only": 3, "hard-preferred": 3,
    }
    if difficulty_mode in level_by_mode:
        wanted = level_by_mode[difficulty_mode]
        return [p for p in task_pairs if p[1] == wanted]
    if difficulty_mode in ("balanced", "custom"):
        return [p for p in task_pairs if p[1] >= 0]
    raise ValueError(f"Unknown difficulty mode: {difficulty_mode}")


def select_tasks(
    task_pairs: List[Tuple[str, int]],
    size: int,
    difficulty_mode: str,
    ratio: str = None
) -> List[Tuple[str, int]]:
    """Select up to `size` tasks according to the difficulty mode.

    "balanced" uses a strict 1:1:1 easy:medium:hard split; "custom" uses
    `ratio` (e.g. "1:2:1"), falling back to 1:1:1 with a warning on parse
    errors. Any other mode simply takes the first `size` tasks (difficulty
    filtering is assumed to have happened already).
    """
    if difficulty_mode not in ("balanced", "custom"):
        # For non-balanced modes, just take first N tasks.
        return task_pairs[:size]

    # Bucket tasks by difficulty level (unknown/other levels are dropped).
    by_difficulty = {1: [], 2: [], 3: []}
    for task_id, diff in task_pairs:
        if diff in by_difficulty:
            by_difficulty[diff].append((task_id, diff))

    if difficulty_mode == "balanced":
        ratio_easy = ratio_medium = ratio_hard = 1
    elif ratio:
        try:
            parts = [int(x) for x in ratio.split(':')]
            if len(parts) != 3:
                raise ValueError("Ratio must have 3 parts (easy:medium:hard)")
            ratio_easy, ratio_medium, ratio_hard = parts
        except ValueError as e:
            print(f"Warning: Invalid ratio '{ratio}', using equal distribution. Error: {e}")
            ratio_easy = ratio_medium = ratio_hard = 1
    else:
        # Default for custom mode if no ratio specified.
        ratio_easy = ratio_medium = ratio_hard = 1

    total_ratio = ratio_easy + ratio_medium + ratio_hard
    count_easy = int(size * ratio_easy / total_ratio)
    count_medium = int(size * ratio_medium / total_ratio)
    count_hard = size - count_easy - count_medium  # remainder goes to hard so counts sum to size

    selected = []
    selected.extend(by_difficulty[1][:count_easy])
    selected.extend(by_difficulty[2][:count_medium])
    selected.extend(by_difficulty[3][:count_hard])
    return selected[:size]  # never exceed the requested size


def cluster_tasks_by_similarity(
    task_ids: List[str],
    similarity_metric: str,
    threshold: float,
    embedding_model: str = "text-embedding-3-large"
) -> List[List[Dict]]:
    """Cluster tasks by instruction similarity, preserving original order.

    Returns:
        List of clusters; each cluster is a list of dicts with 'task_id',
        'difficulty', 'instruction', and 'original_index' keys, sorted by
        original_index within the cluster. Clusters are ordered by the
        smallest original_index they contain ("oracle" clusters are ordered
        by family ID instead).
    """
    # Lazy project import: see module-level note.
    from similarity_metrics import calculate_similarity, compute_embeddings_batch, compute_idf_scores

    print("Reading task instructions and metadata...")

    # Collect tasks with a valid instruction and difficulty, keeping indices.
    task_data = []
    for idx, task_id in enumerate(task_ids):
        difficulty = get_task_difficulty(task_id)
        instruction = get_task_instruction(task_id)
        if instruction and difficulty >= 0:
            task_data.append({
                'task_id': task_id,
                'difficulty': difficulty,
                'instruction': instruction,
                'original_index': idx,
            })

    if not task_data:
        print("Warning: No valid tasks found for clustering")
        return []

    print(f"Found {len(task_data)} valid tasks for clustering")

    # Oracle: cluster by ground-truth task family ID, one cluster per family.
    if similarity_metric == "oracle":
        from similarity_metrics import extract_task_family_id

        family_to_tasks: Dict[str, List[Dict]] = {}
        for task in task_data:
            family_to_tasks.setdefault(extract_task_family_id(task['task_id']), []).append(task)
        return [
            sorted(family_to_tasks[family_id], key=lambda x: x['original_index'])
            for family_id in sorted(family_to_tasks.keys())
        ]

    # Text-similarity metrics: cluster unique instructions.
    instruction_to_tasks: Dict[str, List[Dict]] = {}
    for task in task_data:
        instruction_to_tasks.setdefault(task['instruction'], []).append(task)
    instructions = list(instruction_to_tasks.keys())

    # Metric-specific precomputation.
    idf_scores = None
    embeddings_cache = None
    api_key = None
    if similarity_metric == "cosine":
        print("Computing IDF scores...")
        idf_scores = compute_idf_scores(instructions)
    elif similarity_metric == "embedding":
        print(f"Computing embeddings for {len(instructions)} unique instructions using model '{embedding_model}'...")
        import os
        api_key = os.environ.get("OPENAI_API_KEY")
        embeddings_cache = compute_embeddings_batch(instructions, api_key, model=embedding_model)
        print("Embeddings computed successfully")

    # Greedy clustering: an instruction joins the first cluster containing
    # any member within `threshold`, otherwise it seeds a new cluster.
    clusters: List[List[Dict]] = []
    clustered = set()
    for i, inst1 in enumerate(instructions):
        if inst1 in clustered:
            continue
        cluster_instructions = [inst1]
        clustered.add(inst1)
        for inst2 in instructions[i + 1:]:
            if inst2 in clustered:
                continue
            if any(
                calculate_similarity(
                    cluster_inst, inst2, similarity_metric,
                    idf_scores=idf_scores,
                    embeddings_cache=embeddings_cache,
                    api_key=api_key,
                ) >= threshold
                for cluster_inst in cluster_instructions
            ):
                cluster_instructions.append(inst2)
                clustered.add(inst2)

        # Expand back to task level, sorted by original dataset position.
        cluster_tasks = []
        for instruction in cluster_instructions:
            cluster_tasks.extend(instruction_to_tasks[instruction])
        cluster_tasks.sort(key=lambda x: x['original_index'])
        clusters.append(cluster_tasks)

    clusters.sort(key=lambda cluster: min(task['original_index'] for task in cluster))
    return clusters
+ + Args: + clusters: List of clusters (each cluster is a list of task dicts) + examples_per_cluster: Number of examples to select from each cluster + difficulty_filter: Difficulty mode (easy-only, easy-preferred, medium-only, medium-preferred, + hard-only, hard-preferred, balanced, custom, or None for no filter) + ratio: Custom ratio for balanced/custom mode (e.g., "1:2:1") + + Returns: + Tuple of (selected task dicts sorted by original_index, statistics dict) + + Raises: + ValueError: If using -only mode and not enough samples of that difficulty are available + """ + # Parse difficulty filter mode + if difficulty_filter is None: + # No difficulty filter - select first N from each cluster + selected = [] + for cluster in clusters: + selected.extend(cluster[:examples_per_cluster]) + selected.sort(key=lambda x: x['original_index']) + stats = { + 'exact_matches': len(selected), + 'approximate_matches': {}, + 'total_selected': len(selected) + } + return selected, stats + + # Determine target difficulties and strictness based on filter mode + strict_mode = False + target_difficulties = [] + + if difficulty_filter == "easy-only": + target_difficulties = [1] + strict_mode = True + elif difficulty_filter == "easy-preferred": + target_difficulties = [1] + strict_mode = False + elif difficulty_filter == "medium-only": + target_difficulties = [2] + strict_mode = True + elif difficulty_filter == "medium-preferred": + target_difficulties = [2] + strict_mode = False + elif difficulty_filter == "hard-only": + target_difficulties = [3] + strict_mode = True + elif difficulty_filter == "hard-preferred": + target_difficulties = [3] + strict_mode = False + elif difficulty_filter in ["balanced", "custom"]: + # Parse ratio + if difficulty_filter == "balanced": + ratio_parts = [1, 1, 1] + else: + if ratio: + try: + ratio_parts = [int(x) for x in ratio.split(':')] + if len(ratio_parts) != 3: + print("Warning: Invalid ratio format, using equal distribution") + ratio_parts = [1, 1, 1] + 
except ValueError: + print("Warning: Invalid ratio values, using equal distribution") + ratio_parts = [1, 1, 1] + else: + ratio_parts = [1, 1, 1] + + # Build target list based on ratio + total_ratio = sum(ratio_parts) + for cluster in clusters: + count_easy = int(examples_per_cluster * ratio_parts[0] / total_ratio) + count_medium = int(examples_per_cluster * ratio_parts[1] / total_ratio) + count_hard = examples_per_cluster - count_easy - count_medium + # We'll handle this per cluster, so just mark that we need balanced selection + target_difficulties = None # Special handling for balanced mode + strict_mode = True # Balanced mode is strict by default + + # Statistics tracking + stats = { + 'exact_matches': 0, + 'approximate_matches': {}, # {(requested, actual): count} + 'total_selected': 0 + } + + selected = [] + + for cluster in clusters: + if difficulty_filter in ["balanced", "custom"]: + # For balanced/custom, try to get proportional representation + ratio_parts = [1, 1, 1] if difficulty_filter == "balanced" else [int(x) for x in (ratio or "1:1:1").split(':')] + total_ratio = sum(ratio_parts) + count_easy = int(examples_per_cluster * ratio_parts[0] / total_ratio) + count_medium = int(examples_per_cluster * ratio_parts[1] / total_ratio) + count_hard = examples_per_cluster - count_easy - count_medium + + # Group cluster tasks by difficulty + by_diff = {1: [], 2: [], 3: []} + for task in cluster: + if task['difficulty'] in by_diff: + by_diff[task['difficulty']].append(task) + + # Try to select according to ratio + cluster_selected = [] + needs = {1: count_easy, 2: count_medium, 3: count_hard} + + for diff_level in [1, 2, 3]: + available = by_diff[diff_level] + needed = needs[diff_level] + taken = min(needed, len(available)) + cluster_selected.extend(available[:taken]) + stats['exact_matches'] += taken + needs[diff_level] -= taken + + # If we still need more, try to fill with closest difficulty + if len(cluster_selected) < examples_per_cluster: + for target_diff in 
[1, 2, 3]: + if needs[target_diff] > 0: + # Try adjacent difficulties + for alt_diff in [target_diff - 1, target_diff + 1, target_diff - 2, target_diff + 2]: + if alt_diff in [1, 2, 3] and needs[target_diff] > 0: + # Get tasks not already selected + available = [t for t in by_diff[alt_diff] if t not in cluster_selected] + taken = min(needs[target_diff], len(available)) + if taken > 0: + cluster_selected.extend(available[:taken]) + key = (target_diff, alt_diff) + stats['approximate_matches'][key] = stats['approximate_matches'].get(key, 0) + taken + needs[target_diff] -= taken + + selected.extend(cluster_selected) + stats['total_selected'] += len(cluster_selected) + else: + # For single difficulty modes (-only/-preferred) + cluster_selected = [] + for task in cluster: + if len(cluster_selected) >= examples_per_cluster: + break + if task['difficulty'] in target_difficulties: + cluster_selected.append(task) + stats['exact_matches'] += 1 + + # Check if we got enough samples for strict mode + if strict_mode and len(cluster_selected) < examples_per_cluster: + # In strict mode, raise error if not enough samples + diff_level_name = {1: "easy", 2: "medium", 3: "hard"}[target_difficulties[0]] + raise ValueError( + f"Not enough {diff_level_name} samples in cluster. " + f"Needed {examples_per_cluster}, found {len(cluster_selected)}. " + f"Use '{diff_level_name}-preferred' mode to allow closest matches as fallback." 
+ ) + + # If we need more and in preferred mode, use closest difficulty + if not strict_mode and len(cluster_selected) < examples_per_cluster: + remaining = [t for t in cluster if t not in cluster_selected] + + for task in remaining: + if len(cluster_selected) >= examples_per_cluster: + break + + # Find closest target difficulty + if target_difficulties: + closest_target = min(target_difficulties, key=lambda x: abs(x - task['difficulty'])) + cluster_selected.append(task) + key = (closest_target, task['difficulty']) + stats['approximate_matches'][key] = stats['approximate_matches'].get(key, 0) + 1 + + selected.extend(cluster_selected) + stats['total_selected'] += len(cluster_selected) + + # Sort all selected tasks by original index to preserve overall order + selected.sort(key=lambda x: x['original_index']) + + return selected, stats + + +def order_tasks( + task_pairs: List[Tuple[str, int]], + order_mode: str, + random_seed: int = None +) -> List[Tuple[str, int]]: + """Order tasks based on the specified mode.""" + if order_mode == "original": + return task_pairs + elif order_mode == "easy-to-hard": + return sorted(task_pairs, key=lambda x: (x[1], x[0])) + elif order_mode == "hard-to-easy": + return sorted(task_pairs, key=lambda x: (-x[1], x[0])) + elif order_mode == "random": + if random_seed is not None: + random.seed(random_seed) + shuffled = task_pairs.copy() + random.shuffle(shuffled) + return shuffled + else: + raise ValueError(f"Unknown order mode: {order_mode}") + + +def save_dataset(task_ids: List[str], output_name: str): + """Save selected task IDs to output file.""" + output_file = Path("data/datasets") / output_name + + with open(output_file, 'w') as f: + for task_id in task_ids: + f.write(f"{task_id}\n") + + print(f"Saved {len(task_ids)} tasks to: {output_file}") + + +def print_difficulty_matching_stats(stats: Dict): + """Print statistics about difficulty matching in cluster selection.""" + print("\n" + "=" * 60) + print("DIFFICULTY MATCHING STATISTICS") 
def print_difficulty_matching_stats(stats: Dict):
    """Print statistics about difficulty matching in cluster selection."""
    print("\n" + "=" * 60)
    print("DIFFICULTY MATCHING STATISTICS")
    print("=" * 60)

    total = stats['total_selected']
    print(f"Total tasks selected: {total}")
    if total == 0:
        # Fix: guard against ZeroDivisionError when no tasks were selected
        # (e.g. an empty cluster list in a -preferred mode).
        print("Exact matches: 0")
        print("Approximate matches: 0")
        print()
        return

    print(f"Exact matches: {stats['exact_matches']} ({stats['exact_matches']/total*100:.1f}%)")

    if stats['approximate_matches']:
        total_approx = sum(stats['approximate_matches'].values())
        print(f"Approximate matches: {total_approx} ({total_approx/total*100:.1f}%)")
        print("\nApproximate match breakdown:")
        print(f"  {'Requested':<12} {'Actual':<12} {'Count':<8}")
        print("  " + "-" * 32)

        # Sort by requested difficulty, then actual.
        for (requested, actual), count in sorted(stats['approximate_matches'].items()):
            req_label = f"Level {requested}"
            act_label = f"Level {actual}"
            print(f"  {req_label:<12} {act_label:<12} {count:<8}")
    else:
        print("Approximate matches: 0 (all exact matches!)")
    print()


def print_summary(task_pairs: List[Tuple[str, int]]):
    """Print a per-difficulty breakdown of the selected tasks."""
    total = len(task_pairs)
    if total == 0:
        print("No tasks selected!")
        return

    # Count by difficulty level (1=easy, 2=medium, 3=hard).
    counts = {1: 0, 2: 0, 3: 0}
    for _, diff in task_pairs:
        if diff in counts:
            counts[diff] += 1

    print("\nSelection Summary:")
    print(f"Total tasks: {total}")
    print(f"  Easy   (Level 1): {counts[1]:3d} ({counts[1]/total*100:5.1f}%)")
    print(f"  Medium (Level 2): {counts[2]:3d} ({counts[2]/total*100:5.1f}%)")
    print(f"  Hard   (Level 3): {counts[3]:3d} ({counts[3]/total*100:5.1f}%)")


def main():
    """CLI entry point: select, filter, reorder, and save tasks from a dataset."""
    parser = argparse.ArgumentParser(
        description="Select and reorder tasks from a dataset file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Standard selection with difficulty and size
    python3 experiments/curriculum/data_selector.py --dataset train.txt --output train_subset.txt --size 50 --difficulty balanced --order easy-to-hard

    # Cluster-based selection (ignores --size, respects --difficulty)
    # No difficulty filter - select from all difficulties
    python3 experiments/curriculum/data_selector.py --dataset train.txt --output train_subset.txt --cluster cosine:0.8 --examples-per-cluster 2 --order original

    # Easy-only mode - strict, errors if not enough easy samples
    python3 experiments/curriculum/data_selector.py --dataset train.txt --output train_subset.txt --cluster oracle --examples-per-cluster 3 --difficulty easy-only --order original

    # Easy-preferred mode - prefers easy, falls back to closest if unavailable
    python3 experiments/curriculum/data_selector.py --dataset train.txt --output train_subset.txt --cluster cosine:0.8 --examples-per-cluster 2 --difficulty easy-preferred --order original

    # Balanced mode - tries to maintain 1:1:1 ratio across difficulties
    python3 experiments/curriculum/data_selector.py --dataset train.txt --output train_subset.txt --cluster oracle --examples-per-cluster 3 --difficulty balanced --order easy-to-hard

Available similarity metrics for --cluster:
    - jaccard: Word-based set similarity
    - cosine: TF-IDF cosine similarity
    - levenshtein: Character-level edit distance
    - oracle: Ground truth clustering using task family IDs (threshold ignored)
    - embedding: Semantic similarity using OpenAI embeddings (requires OPENAI_API_KEY)

Difficulty modes:
    - None (default): No filtering, select from all difficulties
    - easy-only/medium-only/hard-only: Strict mode, requires exact matches (throws error if unavailable)
    - easy-preferred/medium-preferred/hard-preferred: Prefers difficulty, allows closest match as fallback
    - balanced: Tries to maintain 1:1:1 ratio across all three difficulty levels
    - custom: Custom ratio specified with --ratio (e.g., --ratio 1:2:1 for easy:medium:hard)
    """
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Input dataset filename (e.g., train.txt)"
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output dataset filename (e.g., train_subset.txt)"
    )
    parser.add_argument(
        "--size",
        type=int,
        required=False,
        help="Number of tasks to select (ignored when using --cluster)"
    )
    parser.add_argument(
        "--difficulty",
        type=str,
        choices=[
            "easy-only", "easy-preferred",
            "medium-only", "medium-preferred",
            "hard-only", "hard-preferred",
            "balanced", "custom"
        ],
        default=None,
        help="Difficulty filter. -only modes require exact matches (error if unavailable). -preferred modes prefer the level but allow closest match. balanced/custom for proportional selection. If not specified, no difficulty filter is applied."
    )
    parser.add_argument(
        "--order",
        type=str,
        choices=["original", "easy-to-hard", "hard-to-easy", "random"],
        default="original",
        help="Task ordering (default: original)"
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Random seed for reproducibility (only used with --order random)"
    )
    parser.add_argument(
        "--ratio",
        type=str,
        default=None,
        help="Difficulty ratio for custom mode (e.g., '1:2:1' for easy:medium:hard)"
    )
    parser.add_argument(
        "--cluster",
        type=str,
        default=None,
        help="Enable cluster-based selection with format 'metric:threshold' (e.g., 'cosine:0.8', 'oracle'). When used, --size is ignored but --difficulty is respected."
    )
    parser.add_argument(
        "--examples-per-cluster",
        type=int,
        default=None,
        help="Number of examples to select from each cluster (required when using --cluster)"
    )
    parser.add_argument(
        "--embedding-model",
        type=str,
        default="text-embedding-3-large",
        help="Embedding model to use when --cluster uses 'embedding' metric (default: text-embedding-3-large)"
    )

    args = parser.parse_args()

    # Validate mutually dependent arguments before doing any work.
    if args.cluster:
        if not args.examples_per_cluster:
            parser.error("--examples-per-cluster is required when using --cluster")
        if args.size:
            print("Warning: --size is ignored when using --cluster mode")
    else:
        if not args.size:
            parser.error("--size is required when not using --cluster mode")
        if args.examples_per_cluster:
            print("Warning: --examples-per-cluster is ignored when not using --cluster mode")

    # Load dataset
    print(f"Loading dataset: {args.dataset}")
    task_ids = load_dataset(args.dataset)
    print(f"Found {len(task_ids)} tasks")

    # Branch based on clustering mode
    if args.cluster:
        # Parse "metric:threshold"; a bare metric name (e.g. "oracle") gets a
        # dummy threshold that the clustering ignores.
        if ':' in args.cluster:
            parts = args.cluster.split(':', 1)
            similarity_metric = parts[0]
            try:
                threshold = float(parts[1])
            except ValueError:
                parser.error(f"Invalid threshold in --cluster argument: {parts[1]}")
        else:
            similarity_metric = args.cluster
            threshold = 0.0  # Will be ignored for oracle

        # Validate similarity metric
        valid_metrics = ["jaccard", "cosine", "levenshtein", "oracle", "embedding"]
        if similarity_metric not in valid_metrics:
            parser.error(f"Invalid similarity metric: {similarity_metric}. Must be one of {valid_metrics}")

        print(f"\n=== CLUSTER-BASED SELECTION ===")
        print(f"Similarity metric: {similarity_metric}")
        if similarity_metric != "oracle":
            print(f"Threshold: {threshold}")
        print(f"Examples per cluster: {args.examples_per_cluster}")
        if args.difficulty:
            print(f"Difficulty filter: {args.difficulty}")
            if args.difficulty == "custom" and args.ratio:
                print(f"Custom ratio: {args.ratio}")
            elif args.difficulty == "balanced":
                print(f"Using balanced distribution (1:1:1)")
        else:
            print(f"Difficulty filter: None (all difficulties)")

        # Cluster tasks
        clusters = cluster_tasks_by_similarity(
            task_ids,
            similarity_metric,
            threshold,
            embedding_model=args.embedding_model
        )

        print(f"\nClustering complete: {len(clusters)} clusters found")

        # Print cluster statistics
        cluster_sizes = [len(cluster) for cluster in clusters]
        print(f"Cluster size range: {min(cluster_sizes)} to {max(cluster_sizes)}")
        print(f"Average cluster size: {sum(cluster_sizes) / len(cluster_sizes):.1f}")

        # Select from clusters with difficulty filtering
        filter_msg = f"with difficulty filter '{args.difficulty}'" if args.difficulty else "without difficulty filter"
        print(f"\nSelecting {args.examples_per_cluster} example(s) from each cluster {filter_msg}...")
        try:
            selected_tasks, difficulty_stats = select_from_clusters(
                clusters,
                args.examples_per_cluster,
                difficulty_filter=args.difficulty,
                ratio=args.ratio
            )
            print(f"Selected {len(selected_tasks)} tasks total")

            # Print difficulty matching statistics only if filter was applied
            if args.difficulty:
                print_difficulty_matching_stats(difficulty_stats)
        except ValueError as e:
            # Strict (-only) modes raise when a cluster cannot satisfy the quota.
            print(f"\nError: {e}")
            return

        # Convert to task_pairs format for ordering and summary
        selected_pairs = [(task['task_id'], task['difficulty']) for task in selected_tasks]

    else:
        # Standard selection mode
        print("\n=== STANDARD SELECTION ===")

        # Get difficulty information
        print("Reading task metadata...")
        task_pairs = get_tasks_with_difficulty(task_ids)

        # Filter by difficulty
        print(f"Filtering by difficulty: {args.difficulty}")
        filtered_pairs = filter_by_difficulty(task_pairs, args.difficulty)
        print(f"After filtering: {len(filtered_pairs)} tasks")

        # Select tasks
        print(f"Selecting {args.size} tasks...")
        if args.difficulty == "custom":
            if args.ratio:
                print(f"Using custom ratio: {args.ratio}")
            else:
                print("Warning: --ratio not specified for custom mode")
        elif args.difficulty == "balanced":
            print("Using balanced distribution (1:1:1)")
        selected_pairs = select_tasks(filtered_pairs, args.size, args.difficulty, args.ratio)

        if len(selected_pairs) < args.size:
            print(f"Warning: Only {len(selected_pairs)} tasks available, requested {args.size}")

    # Order tasks (applies to both modes)
    print(f"\nOrdering tasks: {args.order}")
    ordered_pairs = order_tasks(selected_pairs, args.order, args.seed)

    # Extract task IDs
    selected_task_ids = [task_id for task_id, _ in ordered_pairs]

    # Print summary
    print_summary(ordered_pairs)

    # Save output
    save_dataset(selected_task_ids, args.output)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Similarity metrics for comparing text instructions.

This module provides various text similarity metrics that can be used
for clustering and comparing task instructions.

Available metrics:
- Jaccard similarity: Word-based set similarity
- TF-IDF Cosine similarity: Weighted term similarity with IDF scoring
- Levenshtein similarity: Character-level edit distance similarity
- Oracle similarity: Ground truth similarity based on task family ID
- Embedding similarity: Semantic similarity using sentence embeddings (requires API key)
"""

import math
import os
import re
from collections import Counter, defaultdict
from typing import Dict, List, Optional

# Compiled once at import time; tokenize() is called inside pairwise
# clustering loops, so hoisting the pattern avoids repeated cache lookups.
_WORD_PATTERN = re.compile(r'\b\w+\b')


def extract_task_family_id(task_id: str) -> str:
    """Return the family portion of a task ID (the text before the first '_').

    e.g. "76f2c72_2" -> "76f2c72". An ID with no underscore is returned whole.
    """
    # str.split('_') always yields at least one element, so taking index 0 is
    # exactly equivalent to the defensive "parts[0] if parts else task_id".
    return task_id.split('_')[0]


def tokenize(text: str) -> List[str]:
    """Lowercase *text* and return its alphanumeric word tokens."""
    return _WORD_PATTERN.findall(text.lower())


def calculate_jaccard_similarity(s1: str, s2: str) -> float:
    """Jaccard similarity between the word-token sets of two strings.

    Returns a score in [0.0, 1.0]; an empty string on either side yields 0.0.
    """
    if not s1 or not s2:
        return 0.0

    tokens_a = set(tokenize(s1))
    tokens_b = set(tokenize(s2))

    union_size = len(tokens_a | tokens_b)
    if union_size == 0:
        # Both strings tokenized to nothing (e.g. punctuation only).
        return 0.0
    return len(tokens_a & tokens_b) / union_size
def calculate_cosine_similarity(s1: str, s2: str, idf_scores: Dict[str, float]) -> float:
    """Calculate TF-IDF cosine similarity between two strings.

    Args:
        s1: First string
        s2: Second string
        idf_scores: Dictionary mapping terms to their IDF scores

    Returns:
        Similarity score between 0.0 and 1.0
    """
    if not s1 or not s2:
        return 0.0

    def tfidf_vector(text: str) -> Dict[str, float]:
        # Term frequency weighted by the corpus IDF; unknown terms get IDF 0.
        words = re.findall(r'\b\w+\b', text.lower())
        if not words:
            return {}
        counts = Counter(words)
        total = len(words)
        return {term: (n / total) * idf_scores.get(term, 0) for term, n in counts.items()}

    vec_a = tfidf_vector(s1)
    vec_b = tfidf_vector(s2)

    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0

    # Terms absent from vec_b contribute zero, so iterating vec_a alone
    # computes the same dot product as iterating the union of keys.
    dot = sum(weight * vec_b.get(term, 0) for term, weight in vec_a.items())
    return dot / (norm_a * norm_b)
def calculate_levenshtein_similarity(s1: str, s2: str) -> float:
    """Calculate normalized Levenshtein (edit distance) similarity.

    Args:
        s1: First string
        s2: Second string

    Returns:
        Similarity score between 0.0 and 1.0. As in the original contract,
        an empty string on either side returns 0.0 (including two empties).

    Changes: removed the unreachable ``len == 0`` branches (dead code after
    the emptiness guard) and replaced the full DP matrix with a two-row
    rolling buffer — same distances, O(min(len1, len2)) memory.
    """
    if not s1 or not s2:
        return 0.0

    # Keep the shorter string as the DP row; Levenshtein is symmetric.
    if len(s2) > len(s1):
        s1, s2 = s2, s1
    len1, len2 = len(s1), len(s2)

    previous = list(range(len2 + 1))
    for i, ch1 in enumerate(s1, start=1):
        current = [i] + [0] * len2
        for j, ch2 in enumerate(s2, start=1):
            cost = 0 if ch1 == ch2 else 1
            current[j] = min(
                previous[j] + 1,        # deletion
                current[j - 1] + 1,     # insertion
                previous[j - 1] + cost  # substitution
            )
        previous = current

    distance = previous[len2]
    return 1.0 - distance / max(len1, len2)


def calculate_oracle_similarity(task_id1: str, task_id2: str) -> float:
    """Calculate oracle similarity based on task family ID.

    This is a "ground truth" similarity metric that returns 1.0 if two tasks
    belong to the same family (same prefix before underscore), 0.0 otherwise.

    Args:
        task_id1: First task ID (e.g., "76f2c72_2")
        task_id2: Second task ID (e.g., "76f2c72_3")

    Returns:
        1.0 if same family, 0.0 otherwise
    """
    if not task_id1 or not task_id2:
        return 0.0

    # Family ID is the text before the first underscore (split never returns
    # an empty list, so index 0 is always safe).
    return 1.0 if task_id1.split('_')[0] == task_id2.split('_')[0] else 0.0
def get_embedding(
    text: str,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    model: str = "text-embedding-3-large"
) -> List[float]:
    """Get embedding vector for a text using OpenAI API.

    Args:
        text: Text to embed
        api_key: OpenAI API key (if None, will try to use OPENAI_API_KEY env var)
        base_url: API base URL (default: None, uses OpenAI's default)
        model: Embedding model to use (default: text-embedding-3-large)

    Returns:
        Embedding vector as list of floats

    Raises:
        ImportError: If openai package is not installed
        ValueError: If API key is not provided
        RuntimeError: If the embedding request fails
    """
    try:
        from openai import OpenAI
    except ImportError:
        raise ImportError("openai package required for embedding similarity. Install with: pip install openai")

    # Resolve the key: explicit argument wins, then the environment.
    key = api_key if api_key is not None else os.environ.get("OPENAI_API_KEY")
    if not key:
        raise ValueError(
            "API key required for embedding similarity. "
            "Set OPENAI_API_KEY environment variable or pass api_key parameter."
        )

    client_kwargs = {"api_key": key}
    if base_url:
        client_kwargs["base_url"] = base_url
    client = OpenAI(**client_kwargs)

    try:
        response = client.embeddings.create(
            model=model,
            input=text
        )
        return response.data[0].embedding
    except Exception as e:
        # Wrap any API failure with a hint about valid model names.
        raise RuntimeError(
            f"Failed to get embedding with model '{model}'. "
            f"Error: {str(e)}. "
            f"Available OpenAI models: text-embedding-3-small, text-embedding-3-large, text-embedding-ada-002"
        ) from e
def calculate_embedding_similarity(
    s1: str,
    s2: str,
    embeddings_cache: Optional[Dict[str, List[float]]] = None,
    api_key: Optional[str] = None,
    model: str = "text-embedding-3-large"
) -> float:
    """Calculate cosine similarity between embeddings of two texts.

    Args:
        s1: First string
        s2: Second string
        embeddings_cache: Optional cache of pre-computed embeddings {text: embedding}
        api_key: OpenAI API key
        model: Embedding model used on a cache miss (new, backward-compatible:
            previously a cache miss always fell back to the default model even
            when the cache was built with a different one)

    Returns:
        Cosine similarity score between 0.0 and 1.0
    """
    if not s1 or not s2:
        return 0.0

    def _embedding_for(text: str) -> List[float]:
        # Prefer the cache to avoid an API round-trip; populate it on miss
        # so later comparisons reuse the vector.
        if embeddings_cache and text in embeddings_cache:
            return embeddings_cache[text]
        vector = get_embedding(text, api_key, model=model)
        if embeddings_cache is not None:
            embeddings_cache[text] = vector
        return vector

    emb1 = _embedding_for(s1)
    emb2 = _embedding_for(s2)

    dot_product = sum(a * b for a, b in zip(emb1, emb2))
    magnitude1 = math.sqrt(sum(a * a for a in emb1))
    magnitude2 = math.sqrt(sum(b * b for b in emb2))

    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    # Raw cosine similarity is in [-1, 1]; normalize to [0, 1].
    similarity = dot_product / (magnitude1 * magnitude2)
    return (similarity + 1) / 2


def compute_embeddings_batch(
    texts: List[str],
    api_key: Optional[str] = None,
    model: str = "text-embedding-3-large"
) -> Dict[str, List[float]]:
    """Pre-compute embeddings for a list of texts.

    Args:
        texts: List of texts to embed
        api_key: OpenAI API key
        model: Embedding model to use

    Returns:
        Dictionary mapping each text to its embedding vector
    """
    embeddings_cache = {}
    for i, text in enumerate(texts):
        # Skip empty strings and texts already embedded (duplicates).
        if text and text not in embeddings_cache:
            print(f"  Progress: {i+1}/{len(texts)}", end='\r')
            embeddings_cache[text] = get_embedding(text, api_key, model=model)
    print(f"  Progress: {len(texts)}/{len(texts)} - Complete!")
    return embeddings_cache
def compute_idf_scores(texts: List[str]) -> Dict[str, float]:
    """Compute IDF (Inverse Document Frequency) scores for all terms in a text corpus.

    Args:
        texts: List of text documents

    Returns:
        Dictionary mapping terms to their IDF scores
    """
    num_docs = len(texts)
    if num_docs == 0:
        return {}

    # Document frequency: in how many documents does each term appear?
    doc_freq = defaultdict(int)
    for text in texts:
        for term in set(tokenize(text)):
            doc_freq[term] += 1

    # Every counted frequency is >= 1 by construction, so the previous
    # "if freq > 0" guard was dead code and has been removed.
    return {term: math.log(num_docs / freq) for term, freq in doc_freq.items()}


def calculate_similarity(
    s1: str,
    s2: str,
    metric: str = "jaccard",
    idf_scores: Dict[str, float] = None,
    task_id1: Optional[str] = None,
    task_id2: Optional[str] = None,
    embeddings_cache: Optional[Dict[str, List[float]]] = None,
    api_key: Optional[str] = None
) -> float:
    """Calculate similarity between two strings using the specified metric.

    Args:
        s1: First string (instruction text or task ID for oracle)
        s2: Second string (instruction text or task ID for oracle)
        metric: Similarity metric to use ('jaccard', 'cosine', 'levenshtein', 'oracle', 'embedding')
        idf_scores: IDF scores (required for cosine similarity)
        task_id1: First task ID (required for oracle similarity)
        task_id2: Second task ID (required for oracle similarity)
        embeddings_cache: Cache of embeddings (for embedding similarity)
        api_key: API key for embedding service (for embedding similarity)

    Returns:
        Similarity score between 0.0 and 1.0

    Raises:
        ValueError: If an unknown metric is specified or if required parameters are missing
    """
    if metric == "jaccard":
        return calculate_jaccard_similarity(s1, s2)
    if metric == "cosine":
        if idf_scores is None:
            raise ValueError("IDF scores required for cosine similarity")
        return calculate_cosine_similarity(s1, s2, idf_scores)
    if metric == "levenshtein":
        return calculate_levenshtein_similarity(s1, s2)
    if metric == "oracle":
        if task_id1 is None or task_id2 is None:
            raise ValueError("Task IDs required for oracle similarity")
        return calculate_oracle_similarity(task_id1, task_id2)
    if metric == "embedding":
        return calculate_embedding_similarity(s1, s2, embeddings_cache, api_key)
    raise ValueError(f"Unknown similarity metric: {metric}")