diff --git a/experiments/code/ace/lite_llm_generator.py b/experiments/code/ace/lite_llm_generator.py index b758163..2f705d1 100644 --- a/experiments/code/ace/lite_llm_generator.py +++ b/experiments/code/ace/lite_llm_generator.py @@ -24,6 +24,14 @@ RateLimitError, UnprocessableEntityError, ) +from httpx import ( + ConnectTimeout, + ReadTimeout, + TimeoutException, + ConnectError, + ReadError, + WriteError, +) from rich.panel import Panel from appworld import AppWorld @@ -34,6 +42,7 @@ cache = Memory(os.path.join(path_store.cache, "llm_calls"), verbose=0) RETRY_ERROR = ( + # OpenAI exceptions APIConnectionError, APIError, APIResponseValidationError, @@ -48,6 +57,13 @@ PermissionDeniedError, RateLimitError, UnprocessableEntityError, + # httpx exceptions for connection/timeout issues (e.g., SambaNova client) + ConnectTimeout, + ReadTimeout, + TimeoutException, + ConnectError, + ReadError, + WriteError, ) CHAT_COMPLETION = { # These are lambda so set environment variables take effect at runtime "openai": lambda: OpenAI(api_key=os.environ["SAMBANOVA_API_KEY"], base_url="https://api.sambanova.ai/v1").chat.completions.create, @@ -145,7 +161,10 @@ def non_cached_chat_completion( if provider.strip().lower() == "sambanova": from sambanova import SambaNova - client = SambaNova() + import httpx + # Set longer timeout: 60s for connection, 600s for read/write + timeout = httpx.Timeout(60.0, connect=60.0, read=600.0, write=600.0) + client = SambaNova(timeout=timeout) elif provider.strip().lower() == "together": from together import Together client = Together() @@ -310,7 +329,7 @@ def generate( return {"content": "", "tool_calls": [], "cost": 0} success = False - for _ in range(self.max_retries): + for attempt in range(self.max_retries): try: arguments = { "model": self.model, @@ -330,7 +349,9 @@ def generate( print(traceback.format_exc()) exit() - print(f"Encountered LM Error: {exception.message[:200].strip()}...") + error_msg = str(exception)[:200] if str(exception) else
#!/usr/bin/env python3
"""
Analyze task difficulty distribution and instruction similarity from a dataset file.

Usage (run from repository root):
    python3 experiments/curriculum/analyze_dataset.py --dataset train.txt
    python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric cosine:0.8
    python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric oracle
    python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric embedding:0.85 --embedding-model text-embedding-3-large

Available similarity metrics:
    - jaccard:THRESHOLD     word-based set similarity (e.g., jaccard:0.7)
    - cosine:THRESHOLD      TF-IDF cosine similarity (e.g., cosine:0.8)
    - levenshtein:THRESHOLD character-level edit distance (e.g., levenshtein:0.75)
    - oracle                ground-truth clustering via task family IDs; no threshold needed
    - embedding:THRESHOLD   OpenAI embeddings API; requires OPENAI_API_KEY
"""

import argparse
import json
from collections import Counter
from pathlib import Path
from typing import Dict, List, Tuple


def load_dataset(dataset_name: str) -> List[str]:
    """Return the non-empty task IDs listed in data/datasets/<dataset_name>.

    Raises:
        FileNotFoundError: if the dataset file does not exist.
    """
    dataset_file = Path("data/datasets") / dataset_name
    if not dataset_file.exists():
        raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
    with open(dataset_file, 'r') as f:
        return [line.strip() for line in f if line.strip()]


def get_task_difficulty(task_id: str) -> Tuple[str, int, str]:
    """Return (task_id, difficulty, instruction) for a task.

    difficulty comes from data/tasks/<id>/ground_truth/metadata.json and is -1
    when that file is missing or unreadable; instruction comes from
    data/tasks/<id>/specs.json and falls back to "" (a missing instruction is
    non-fatal and does not discard the difficulty already read).
    """
    metadata_path = Path("data/tasks") / task_id / "ground_truth" / "metadata.json"
    specs_path = Path("data/tasks") / task_id / "specs.json"

    try:
        with open(metadata_path, 'r') as f:
            difficulty = json.load(f).get('difficulty', -1)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Warning: Could not read metadata for task {task_id}: {e}")
        return (task_id, -1, "")

    instruction = ""
    try:
        with open(specs_path, 'r') as f:
            instruction = json.load(f).get('instruction', '')
    except (FileNotFoundError, json.JSONDecodeError):
        pass  # instruction is optional

    return (task_id, difficulty, instruction)


def analyze_difficulty_distribution(task_ids: List[str]) -> List[Tuple[str, int, str]]:
    """Resolve every task ID to its (task_id, difficulty, instruction) tuple."""
    return [get_task_difficulty(task_id) for task_id in task_ids]


def print_distribution(task_difficulty_pairs: List[Tuple[str, int, str]]):
    """Print a histogram of difficulty levels.

    Tasks with unknown difficulty (difficulty < 0) get their own row so the
    per-row percentages reconcile with the 100.00% total line (previously the
    unknown tasks counted toward the total but were silently omitted from the
    rows, so the printed percentages did not add up).
    """
    total_tasks = len(task_difficulty_pairs)
    difficulty_counts: Dict[int, int] = {}
    unknown_count = 0
    for _, difficulty, _ in task_difficulty_pairs:
        if difficulty >= 0:
            difficulty_counts[difficulty] = difficulty_counts.get(difficulty, 0) + 1
        else:
            unknown_count += 1

    print("\n" + "=" * 60)
    print("DIFFICULTY DISTRIBUTION")
    print("=" * 60)

    print(f"\n{'Difficulty':<12} {'Count':<10} {'Percentage':<12} {'Bar'}")
    print("-" * 60)

    for difficulty in sorted(difficulty_counts.keys()):
        count = difficulty_counts[difficulty]
        percentage = (count / total_tasks) * 100
        bar = "ā–ˆ" * int(percentage / 2)  # one block per 2%
        print(f"Level {difficulty:<6} {count:<10} {percentage:>6.2f}% {bar}")

    if unknown_count:
        percentage = (unknown_count / total_tasks) * 100
        print(f"{'Unknown':<12} {unknown_count:<10} {percentage:>6.2f}%")

    print("-" * 60)
    print(f"{'Total':<12} {total_tasks:<10} {'100.00%':<12}")
    print()


def print_task_list(task_difficulty_pairs: List[Tuple[str, int, str]]):
    """Print tasks in their original dataset order with difficulty appended."""
    print("\n" + "=" * 60)
    print("TASKS IN ORIGINAL ORDER (with difficulty)")
    print("=" * 60)
    print()

    for task_id, difficulty, _ in task_difficulty_pairs:
        diff_str = f"Level {difficulty}" if difficulty >= 0 else "Unknown"
        print(f"{task_id:<20} -> {diff_str}")

    print()


def _analyze_oracle_similarity(
    task_difficulty_pairs: List[Tuple[str, int, str]],
    threshold: float = 0.7
) -> Dict:
    """Cluster tasks by their ground-truth family ID.

    The threshold argument is accepted for interface parity with the other
    metrics but ignored: family-ID matching is exact, so membership is binary
    (same family -> same cluster).

    Args:
        task_difficulty_pairs: (task_id, difficulty, instruction) tuples.
        threshold: ignored for the oracle metric.

    Returns:
        Summary dict matching analyze_instruction_similarity's shape, with
        one cluster per task family, sorted by cluster size (largest first).
    """
    # Lazy project import keeps the stdlib-only helpers above importable
    # without the similarity_metrics dependency.
    from similarity_metrics import extract_task_family_id

    family_to_tasks: Dict[str, List[Dict]] = {}
    all_tasks_count = 0
    for task_id, difficulty, instruction in task_difficulty_pairs:
        family_id = extract_task_family_id(task_id)
        family_to_tasks.setdefault(family_id, []).append({
            "task_id": task_id,
            "difficulty": difficulty,
            "instruction": instruction,
        })
        all_tasks_count += 1

    detailed_clusters = []
    for idx, (family_id, tasks) in enumerate(family_to_tasks.items()):
        unique_instructions = set(task["instruction"] for task in tasks if task["instruction"])
        detailed_clusters.append({
            "cluster_id": idx,
            "family_id": family_id,
            "size": len(tasks),
            "unique_instructions": len(unique_instructions),
            "tasks": tasks,
        })
    detailed_clusters.sort(key=lambda x: x["size"], reverse=True)

    cluster_sizes = Counter(len(cluster["tasks"]) for cluster in detailed_clusters)

    all_instructions = [instruction for _, _, instruction in task_difficulty_pairs if instruction]
    unique_instructions_total = len(set(all_instructions))
    exact_duplicates = len(all_instructions) - unique_instructions_total

    return {
        "total_instructions": all_tasks_count,
        "exact_duplicates": exact_duplicates,
        "unique_instructions": unique_instructions_total,
        "similarity_metric": "oracle",
        "clustering_threshold": "N/A (oracle uses exact family ID matching)",
        "num_clusters": len(detailed_clusters),
        "cluster_size_distribution": dict(cluster_sizes),
        "clusters": detailed_clusters,
    }
def analyze_instruction_similarity(
    task_difficulty_pairs: List[Tuple[str, int, str]],
    similarity_metric: str = "jaccard",
    threshold: float = 0.7,
    embedding_model: str = "text-embedding-3-large"
) -> Dict:
    """Greedily cluster instructions whose pairwise similarity is >= threshold.

    Args:
        task_difficulty_pairs: (task_id, difficulty, instruction) tuples.
        similarity_metric: jaccard | cosine | levenshtein | oracle | embedding.
        threshold: minimum similarity for two instructions to share a cluster
            (ignored by "oracle", which matches on task family IDs).
        embedding_model: OpenAI model name, used only by "embedding".

    Returns:
        Summary dict with duplicate counts, cluster-size distribution, and
        per-cluster task details sorted by cluster size (largest first).

    Raises:
        EnvironmentError: the "embedding" metric was requested but
            OPENAI_API_KEY is not set (fail fast instead of sending
            api_key=None into the embeddings call).
    """
    # Oracle clusters by task ID directly, no text similarity involved.
    if similarity_metric == "oracle":
        return _analyze_oracle_similarity(task_difficulty_pairs, threshold)

    # Lazy project import keeps the stdlib-only reporting/saving helpers in
    # this module importable without the similarity_metrics dependency.
    from similarity_metrics import (
        calculate_similarity,
        compute_embeddings_batch,
        compute_idf_scores,
    )

    # Map each unique instruction to the tasks that use it.
    instruction_to_tasks: Dict[str, List[Dict]] = {}
    for task_id, difficulty, instruction in task_difficulty_pairs:
        if instruction:
            instruction_to_tasks.setdefault(instruction, []).append(
                {"task_id": task_id, "difficulty": difficulty}
            )

    instructions = list(instruction_to_tasks.keys())
    if not instructions:
        # Empty dataset: return a well-formed, all-zero summary.
        return {
            "total_instructions": 0,
            "exact_duplicates": 0,
            "unique_instructions": 0,
            "similarity_metric": similarity_metric,
            "clustering_threshold": threshold,
            "num_clusters": 0,
            "clusters": []
        }

    unique_instructions = len(instructions)
    total_instructions_count = sum(len(tasks) for tasks in instruction_to_tasks.values())
    exact_duplicates = total_instructions_count - unique_instructions  # duplicate copies

    # Metric-specific precomputation.
    idf_scores = None
    embeddings_cache = None
    api_key = None
    if similarity_metric == "cosine":
        idf_scores = compute_idf_scores(instructions)
    elif similarity_metric == "embedding":
        import os
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            # Fail fast with an actionable message instead of an opaque API error.
            raise EnvironmentError(
                "OPENAI_API_KEY must be set to use the 'embedding' similarity metric"
            )
        print(f"  Computing embeddings for {len(instructions)} unique instructions using model '{embedding_model}'...")
        embeddings_cache = compute_embeddings_batch(instructions, api_key, model=embedding_model)
        print("  Embeddings computed successfully")

    # Greedy single-pass clustering: an instruction joins the first cluster
    # containing any member within `threshold`, otherwise it seeds a new one.
    clusters: List[List[str]] = []
    clustered = set()
    for i, inst1 in enumerate(instructions):
        if inst1 in clustered:
            continue
        cluster = [inst1]
        clustered.add(inst1)
        for inst2 in instructions[i + 1:]:
            if inst2 in clustered:
                continue
            if any(
                calculate_similarity(
                    cluster_inst, inst2, similarity_metric,
                    idf_scores=idf_scores,
                    embeddings_cache=embeddings_cache,
                    api_key=api_key,
                ) >= threshold
                for cluster_inst in cluster
            ):
                cluster.append(inst2)
                clustered.add(inst2)
        clusters.append(cluster)

    cluster_sizes = Counter(len(cluster) for cluster in clusters)

    # Expand instruction clusters back into task-level details.
    detailed_clusters = []
    for idx, cluster in enumerate(clusters):
        cluster_tasks = [
            {"task_id": t["task_id"], "difficulty": t["difficulty"], "instruction": instruction}
            for instruction in cluster
            for t in instruction_to_tasks[instruction]
        ]
        detailed_clusters.append({
            "cluster_id": idx,
            "size": len(cluster_tasks),
            "unique_instructions": len(cluster),
            "tasks": cluster_tasks,
        })
    detailed_clusters.sort(key=lambda x: x["size"], reverse=True)

    return {
        "total_instructions": total_instructions_count,
        "exact_duplicates": exact_duplicates,
        "unique_instructions": unique_instructions,
        "similarity_metric": similarity_metric,
        "clustering_threshold": threshold,
        "num_clusters": len(clusters),
        "cluster_size_distribution": dict(cluster_sizes),
        "clusters": detailed_clusters,
    }


def print_similarity_analysis(similarity_data: Dict):
    """Print the clustering summary produced by analyze_instruction_similarity."""
    print("\n" + "=" * 60)
    print("INSTRUCTION CLUSTERING ANALYSIS")
    print("=" * 60)
    print()

    print(f"Total instructions: {similarity_data['total_instructions']}")
    print(f"Unique instructions: {similarity_data['unique_instructions']}")
    print(f"Exact duplicates (copies): {similarity_data['exact_duplicates']}")
    print()

    print(f"Clustering (metric: {similarity_data['similarity_metric']}, threshold ≄{similarity_data['clustering_threshold']}):")
    print(f"  Number of clusters: {similarity_data['num_clusters']}")
    print(f"  Cluster size distribution: {similarity_data['cluster_size_distribution']}")
    print()

    # Show the 10 largest clusters, each with one example instruction.
    print("Top 10 largest clusters:")
    for cluster in similarity_data['clusters'][:10]:
        print(f"  Cluster {cluster['cluster_id']}: {cluster['size']} tasks ({cluster['unique_instructions']} unique instructions)")
        if cluster['tasks']:
            example = cluster['tasks'][0]['instruction']
            print(f"    Example: {example[:80]}{'...' if len(example) > 80 else ''}")
    print()


def save_analysis(
    task_difficulty_pairs: List[Tuple[str, int, str]],
    dataset_name: str,
    similarity_data: Dict
):
    """Write the full analysis to data/datasets/<dataset_name>_analysis.json.

    Tasks with unknown difficulty (difficulty < 0) are excluded from the
    "tasks" list but still count toward "total_tasks", mirroring the printed
    distribution.
    """
    output_file = Path("data/datasets") / f"{dataset_name}_analysis.json"

    total_tasks = len(task_difficulty_pairs)
    difficulty_counts: Dict[int, int] = {}
    for _, difficulty, _ in task_difficulty_pairs:
        if difficulty >= 0:
            difficulty_counts[difficulty] = difficulty_counts.get(difficulty, 0) + 1

    distribution = {
        f"level_{difficulty}": {
            "count": count,
            "percentage": round((count / total_tasks) * 100, 2),
        }
        for difficulty, count in sorted(difficulty_counts.items())
    }

    tasks_with_difficulty = [
        {"task_id": task_id, "difficulty": difficulty, "instruction": instruction}
        for task_id, difficulty, instruction in task_difficulty_pairs
        if difficulty >= 0
    ]

    output_data = {
        "dataset": dataset_name,
        "total_tasks": total_tasks,
        "distribution": distribution,
        "instruction_similarity": similarity_data,
        "tasks": tasks_with_difficulty,
    }

    with open(output_file, 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"\nAnalysis saved to: {output_file}")


def main():
    """CLI entry point: load a dataset, report difficulty and similarity stats."""
    parser = argparse.ArgumentParser(
        description="Analyze task difficulty distribution from a dataset file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic analysis with default jaccard metric and threshold
  python3 experiments/curriculum/analyze_dataset.py --dataset train.txt

  # Use cosine similarity with custom threshold
  python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric cosine:0.8

  # Use oracle clustering (task family IDs)
  python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric oracle

  # Use embedding similarity with custom model
  python3 experiments/curriculum/analyze_dataset.py --dataset train.txt --similarity-metric embedding:0.85 --embedding-model text-embedding-3-large

Available similarity metrics:
  - jaccard:THRESHOLD (e.g., jaccard:0.7)
  - cosine:THRESHOLD (e.g., cosine:0.8)
  - levenshtein:THRESHOLD (e.g., levenshtein:0.75)
  - oracle (no threshold needed)
  - embedding:THRESHOLD (e.g., embedding:0.85, requires OPENAI_API_KEY)
        """
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset filename (e.g., train.txt, test.txt)"
    )
    parser.add_argument(
        "--similarity-metric",
        type=str,
        default="jaccard:0.7",
        help="Similarity metric with optional threshold in format 'metric:threshold' (e.g., 'cosine:0.8', 'oracle'). Default: jaccard:0.7"
    )
    parser.add_argument(
        "--embedding-model",
        type=str,
        default="text-embedding-3-large",
        help="Embedding model to use when similarity-metric uses 'embedding' (default: text-embedding-3-large). Other OpenAI options: text-embedding-3-small, text-embedding-ada-002"
    )
    args = parser.parse_args()

    # Split "metric:threshold"; "oracle" takes no threshold and ignores it.
    if ':' in args.similarity_metric:
        metric_name, _, raw_threshold = args.similarity_metric.partition(':')
        try:
            threshold = float(raw_threshold)
        except ValueError:
            parser.error(f"Invalid threshold in --similarity-metric argument: {raw_threshold}")
    else:
        metric_name = args.similarity_metric
        if metric_name == "oracle":
            threshold = 0.0  # ignored by the oracle metric
        else:
            threshold = 0.7
            print(f"Note: No threshold specified, using default: {threshold}")

    valid_metrics = ["jaccard", "cosine", "levenshtein", "oracle", "embedding"]
    if metric_name not in valid_metrics:
        parser.error(f"Invalid similarity metric: {metric_name}. Must be one of {valid_metrics}")

    print(f"Loading dataset: {args.dataset}")
    task_ids = load_dataset(args.dataset)
    print(f"Found {len(task_ids)} tasks")

    print("Analyzing difficulty levels...")
    task_difficulty_pairs = analyze_difficulty_distribution(task_ids)

    print_distribution(task_difficulty_pairs)
    print_task_list(task_difficulty_pairs)

    print(f"Analyzing instruction similarity (metric: {metric_name}, threshold: {threshold})...")
    similarity_data = analyze_instruction_similarity(
        task_difficulty_pairs,
        similarity_metric=metric_name,
        threshold=threshold,
        embedding_model=args.embedding_model
    )
    print_similarity_analysis(similarity_data)

    save_analysis(task_difficulty_pairs, Path(args.dataset).stem, similarity_data)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Script to check if two files contain the same data in different orders.
Compares files line by line, ignoring the order of lines.

Equality is decided on the *set* of lines, so duplicate counts do not affect
the verdict (they are reported separately when the sets match).
"""

import argparse
from pathlib import Path
from typing import Set, Tuple


def read_file_lines(file_path: Path) -> Tuple[Set[str], int]:
    """
    Read all lines from a file and return them as a set along with the total count.

    Line terminators are stripped so CRLF and LF files compare equal.

    Args:
        file_path: Path to the file to read

    Returns:
        Tuple of (set of lines, total line count)
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n\r') for line in f]
    return set(lines), len(lines)


def compare_files(file1_path: Path, file2_path: Path, verbose: bool = True) -> bool:
    """
    Compare two files to check if they contain the same data in different orders.

    Args:
        file1_path: Path to the first file
        file2_path: Path to the second file
        verbose: If True, print up to 10 sample lines unique to each side
            when the files differ

    Returns:
        True if files contain the same data, False otherwise
    """
    print(f"Reading {file1_path}...")
    lines1, count1 = read_file_lines(file1_path)

    print(f"Reading {file2_path}...")
    lines2, count2 = read_file_lines(file2_path)

    print(f"\n{file1_path.name}: {count1} total lines, {len(lines1)} unique lines")
    print(f"{file2_path.name}: {count2} total lines, {len(lines2)} unique lines")

    are_same = lines1 == lines2

    if are_same:
        print("\nāœ“ Files contain the SAME data (possibly in different order)")
        if count1 != len(lines1):
            print(f"  Note: File 1 has {count1 - len(lines1)} duplicate lines")
        if count2 != len(lines2):
            print(f"  Note: File 2 has {count2 - len(lines2)} duplicate lines")
    else:
        print("\nāœ— Files contain DIFFERENT data")
        if verbose:
            only_in_file1 = lines1 - lines2
            only_in_file2 = lines2 - lines1
            for name, only in ((file1_path.name, only_in_file1),
                               (file2_path.name, only_in_file2)):
                if only:
                    print(f"\nLines only in {name} ({len(only)} lines):")
                    for line in sorted(only)[:10]:  # Show first 10
                        # Truncate long lines; append an ellipsis only when
                        # something was actually cut off (previously "..." was
                        # printed unconditionally, even for short lines).
                        print(f"  {line[:100]}{'...' if len(line) > 100 else ''}")
                    if len(only) > 10:
                        print(f"  ... and {len(only) - 10} more")

    return are_same


def main():
    """CLI entry point: compare two files under data/datasets/.

    Returns 0 when the files match, 1 on mismatch or error.
    """
    parser = argparse.ArgumentParser(
        description="Check if two files contain the same data in different orders"
    )
    parser.add_argument(
        "file1",
        type=str,
        help="Path to the first file (relative to data/datasets/)"
    )
    parser.add_argument(
        "file2",
        type=str,
        help="Path to the second file (relative to data/datasets/)"
    )
    args = parser.parse_args()

    # Both paths are resolved relative to the datasets folder.
    base_path = Path("data/datasets")
    file1_path = base_path / args.file1
    file2_path = base_path / args.file2

    for path in (file1_path, file2_path):
        if not path.exists():
            print(f"Error: File not found: {path}")
            return 1

    try:
        return 0 if compare_files(file1_path, file2_path) else 1
    except Exception as e:
        print(f"Error comparing files: {e}")
        return 1


if __name__ == "__main__":
    exit(main())
# data_selector.py — selection helpers and similarity clustering.
# The similarity_metrics dependency is imported lazily inside
# cluster_tasks_by_similarity so the pure helpers below remain usable
# (and testable) without it.

import argparse
import json
import random
from pathlib import Path
from typing import Dict, List, Tuple


def load_dataset(dataset_name: str) -> List[str]:
    """Return the non-empty task IDs listed in data/datasets/<dataset_name>.

    Raises:
        FileNotFoundError: if the dataset file does not exist.
    """
    dataset_file = Path("data/datasets") / dataset_name
    if not dataset_file.exists():
        raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
    with open(dataset_file, 'r') as f:
        return [line.strip() for line in f if line.strip()]


def get_task_difficulty(task_id: str) -> int:
    """Return the difficulty level for a task, or -1 if unreadable."""
    metadata_path = Path("data/tasks") / task_id / "ground_truth" / "metadata.json"
    try:
        with open(metadata_path, 'r') as f:
            return json.load(f).get('difficulty', -1)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Warning: Could not read metadata for task {task_id}: {e}")
        return -1


def get_task_instruction(task_id: str) -> str:
    """Return the instruction text for a task, or "" if unreadable."""
    specs_path = Path("data/tasks") / task_id / "specs.json"
    try:
        with open(specs_path, 'r') as f:
            return json.load(f).get('instruction', '')
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Warning: Could not read specs for task {task_id}: {e}")
        return ""


def get_tasks_with_difficulty(task_ids: List[str]) -> List[Tuple[str, int]]:
    """Resolve each task ID to a (task_id, difficulty) tuple, preserving order."""
    return [(task_id, get_task_difficulty(task_id)) for task_id in task_ids]


def filter_by_difficulty(
    task_pairs: List[Tuple[str, int]],
    difficulty_mode: str
) -> List[Tuple[str, int]]:
    """Filter (task_id, difficulty) pairs according to a difficulty mode.

    None keeps every task with a known difficulty. "<level>-only" and
    "<level>-preferred" both keep exactly that level here (the "preferred"
    fallback behaviour is applied later, at selection time). "balanced" and
    "custom" keep all known difficulties so the ratio logic can draw from
    each bucket.

    Raises:
        ValueError: on an unrecognized difficulty mode.
    """
    if difficulty_mode is None:
        return [p for p in task_pairs if p[1] >= 0]

    level_by_mode = {
        "easy-only": 1, "easy-preferred": 1,
        "medium-only": 2, "medium-preferred": 2,
        "hard-only": 3, "hard-preferred": 3,
    }
    if difficulty_mode in level_by_mode:
        wanted = level_by_mode[difficulty_mode]
        return [p for p in task_pairs if p[1] == wanted]
    if difficulty_mode in ("balanced", "custom"):
        return [p for p in task_pairs if p[1] >= 0]
    raise ValueError(f"Unknown difficulty mode: {difficulty_mode}")


def select_tasks(
    task_pairs: List[Tuple[str, int]],
    size: int,
    difficulty_mode: str,
    ratio: str = None
) -> List[Tuple[str, int]]:
    """Select up to `size` tasks according to the difficulty mode.

    "balanced" uses a strict 1:1:1 easy:medium:hard split; "custom" uses
    `ratio` (e.g. "1:2:1"), falling back to 1:1:1 with a warning on parse
    errors. Any other mode simply takes the first `size` tasks (difficulty
    filtering is assumed to have happened already).
    """
    if difficulty_mode not in ("balanced", "custom"):
        # For non-balanced modes, just take first N tasks.
        return task_pairs[:size]

    # Bucket tasks by difficulty level (unknown/other levels are dropped).
    by_difficulty = {1: [], 2: [], 3: []}
    for task_id, diff in task_pairs:
        if diff in by_difficulty:
            by_difficulty[diff].append((task_id, diff))

    if difficulty_mode == "balanced":
        ratio_easy = ratio_medium = ratio_hard = 1
    elif ratio:
        try:
            parts = [int(x) for x in ratio.split(':')]
            if len(parts) != 3:
                raise ValueError("Ratio must have 3 parts (easy:medium:hard)")
            ratio_easy, ratio_medium, ratio_hard = parts
        except ValueError as e:
            print(f"Warning: Invalid ratio '{ratio}', using equal distribution. Error: {e}")
            ratio_easy = ratio_medium = ratio_hard = 1
    else:
        # Default for custom mode if no ratio specified.
        ratio_easy = ratio_medium = ratio_hard = 1

    total_ratio = ratio_easy + ratio_medium + ratio_hard
    count_easy = int(size * ratio_easy / total_ratio)
    count_medium = int(size * ratio_medium / total_ratio)
    count_hard = size - count_easy - count_medium  # remainder goes to hard so counts sum to size

    selected = []
    selected.extend(by_difficulty[1][:count_easy])
    selected.extend(by_difficulty[2][:count_medium])
    selected.extend(by_difficulty[3][:count_hard])
    return selected[:size]  # never exceed the requested size


def cluster_tasks_by_similarity(
    task_ids: List[str],
    similarity_metric: str,
    threshold: float,
    embedding_model: str = "text-embedding-3-large"
) -> List[List[Dict]]:
    """Cluster tasks by instruction similarity, preserving original order.

    Returns:
        List of clusters; each cluster is a list of dicts with 'task_id',
        'difficulty', 'instruction', and 'original_index' keys, sorted by
        original_index within the cluster. Clusters are ordered by the
        smallest original_index they contain ("oracle" clusters are ordered
        by family ID instead).
    """
    # Lazy project import: see module-level note.
    from similarity_metrics import calculate_similarity, compute_embeddings_batch, compute_idf_scores

    print("Reading task instructions and metadata...")

    # Collect tasks with a valid instruction and difficulty, keeping indices.
    task_data = []
    for idx, task_id in enumerate(task_ids):
        difficulty = get_task_difficulty(task_id)
        instruction = get_task_instruction(task_id)
        if instruction and difficulty >= 0:
            task_data.append({
                'task_id': task_id,
                'difficulty': difficulty,
                'instruction': instruction,
                'original_index': idx,
            })

    if not task_data:
        print("Warning: No valid tasks found for clustering")
        return []

    print(f"Found {len(task_data)} valid tasks for clustering")

    # Oracle: cluster by ground-truth task family ID, one cluster per family.
    if similarity_metric == "oracle":
        from similarity_metrics import extract_task_family_id

        family_to_tasks: Dict[str, List[Dict]] = {}
        for task in task_data:
            family_to_tasks.setdefault(extract_task_family_id(task['task_id']), []).append(task)
        return [
            sorted(family_to_tasks[family_id], key=lambda x: x['original_index'])
            for family_id in sorted(family_to_tasks.keys())
        ]

    # Text-similarity metrics: cluster unique instructions.
    instruction_to_tasks: Dict[str, List[Dict]] = {}
    for task in task_data:
        instruction_to_tasks.setdefault(task['instruction'], []).append(task)
    instructions = list(instruction_to_tasks.keys())

    # Metric-specific precomputation.
    idf_scores = None
    embeddings_cache = None
    api_key = None
    if similarity_metric == "cosine":
        print("Computing IDF scores...")
        idf_scores = compute_idf_scores(instructions)
    elif similarity_metric == "embedding":
        print(f"Computing embeddings for {len(instructions)} unique instructions using model '{embedding_model}'...")
        import os
        api_key = os.environ.get("OPENAI_API_KEY")
        embeddings_cache = compute_embeddings_batch(instructions, api_key, model=embedding_model)
        print("Embeddings computed successfully")

    # Greedy clustering: an instruction joins the first cluster containing
    # any member within `threshold`, otherwise it seeds a new cluster.
    clusters: List[List[Dict]] = []
    clustered = set()
    for i, inst1 in enumerate(instructions):
        if inst1 in clustered:
            continue
        cluster_instructions = [inst1]
        clustered.add(inst1)
        for inst2 in instructions[i + 1:]:
            if inst2 in clustered:
                continue
            if any(
                calculate_similarity(
                    cluster_inst, inst2, similarity_metric,
                    idf_scores=idf_scores,
                    embeddings_cache=embeddings_cache,
                    api_key=api_key,
                ) >= threshold
                for cluster_inst in cluster_instructions
            ):
                cluster_instructions.append(inst2)
                clustered.add(inst2)

        # Expand back to task level, sorted by original dataset position.
        cluster_tasks = []
        for instruction in cluster_instructions:
            cluster_tasks.extend(instruction_to_tasks[instruction])
        cluster_tasks.sort(key=lambda x: x['original_index'])
        clusters.append(cluster_tasks)

    clusters.sort(key=lambda cluster: min(task['original_index'] for task in cluster))
    return clusters
+ + Args: + clusters: List of clusters (each cluster is a list of task dicts) + examples_per_cluster: Number of examples to select from each cluster + difficulty_filter: Difficulty mode (easy-only, easy-preferred, medium-only, medium-preferred, + hard-only, hard-preferred, balanced, custom, or None for no filter) + ratio: Custom ratio for balanced/custom mode (e.g., "1:2:1") + + Returns: + Tuple of (selected task dicts sorted by original_index, statistics dict) + + Raises: + ValueError: If using -only mode and not enough samples of that difficulty are available + """ + # Parse difficulty filter mode + if difficulty_filter is None: + # No difficulty filter - select first N from each cluster + selected = [] + for cluster in clusters: + selected.extend(cluster[:examples_per_cluster]) + selected.sort(key=lambda x: x['original_index']) + stats = { + 'exact_matches': len(selected), + 'approximate_matches': {}, + 'total_selected': len(selected) + } + return selected, stats + + # Determine target difficulties and strictness based on filter mode + strict_mode = False + target_difficulties = [] + + if difficulty_filter == "easy-only": + target_difficulties = [1] + strict_mode = True + elif difficulty_filter == "easy-preferred": + target_difficulties = [1] + strict_mode = False + elif difficulty_filter == "medium-only": + target_difficulties = [2] + strict_mode = True + elif difficulty_filter == "medium-preferred": + target_difficulties = [2] + strict_mode = False + elif difficulty_filter == "hard-only": + target_difficulties = [3] + strict_mode = True + elif difficulty_filter == "hard-preferred": + target_difficulties = [3] + strict_mode = False + elif difficulty_filter in ["balanced", "custom"]: + # Parse ratio + if difficulty_filter == "balanced": + ratio_parts = [1, 1, 1] + else: + if ratio: + try: + ratio_parts = [int(x) for x in ratio.split(':')] + if len(ratio_parts) != 3: + print("Warning: Invalid ratio format, using equal distribution") + ratio_parts = [1, 1, 1] + 
except ValueError: + print("Warning: Invalid ratio values, using equal distribution") + ratio_parts = [1, 1, 1] + else: + ratio_parts = [1, 1, 1] + + # Build target list based on ratio + total_ratio = sum(ratio_parts) + for cluster in clusters: + count_easy = int(examples_per_cluster * ratio_parts[0] / total_ratio) + count_medium = int(examples_per_cluster * ratio_parts[1] / total_ratio) + count_hard = examples_per_cluster - count_easy - count_medium + # We'll handle this per cluster, so just mark that we need balanced selection + target_difficulties = None # Special handling for balanced mode + strict_mode = True # Balanced mode is strict by default + + # Statistics tracking + stats = { + 'exact_matches': 0, + 'approximate_matches': {}, # {(requested, actual): count} + 'total_selected': 0 + } + + selected = [] + + for cluster in clusters: + if difficulty_filter in ["balanced", "custom"]: + # For balanced/custom, try to get proportional representation + ratio_parts = [1, 1, 1] if difficulty_filter == "balanced" else [int(x) for x in (ratio or "1:1:1").split(':')] + total_ratio = sum(ratio_parts) + count_easy = int(examples_per_cluster * ratio_parts[0] / total_ratio) + count_medium = int(examples_per_cluster * ratio_parts[1] / total_ratio) + count_hard = examples_per_cluster - count_easy - count_medium + + # Group cluster tasks by difficulty + by_diff = {1: [], 2: [], 3: []} + for task in cluster: + if task['difficulty'] in by_diff: + by_diff[task['difficulty']].append(task) + + # Try to select according to ratio + cluster_selected = [] + needs = {1: count_easy, 2: count_medium, 3: count_hard} + + for diff_level in [1, 2, 3]: + available = by_diff[diff_level] + needed = needs[diff_level] + taken = min(needed, len(available)) + cluster_selected.extend(available[:taken]) + stats['exact_matches'] += taken + needs[diff_level] -= taken + + # If we still need more, try to fill with closest difficulty + if len(cluster_selected) < examples_per_cluster: + for target_diff in 
[1, 2, 3]: + if needs[target_diff] > 0: + # Try adjacent difficulties + for alt_diff in [target_diff - 1, target_diff + 1, target_diff - 2, target_diff + 2]: + if alt_diff in [1, 2, 3] and needs[target_diff] > 0: + # Get tasks not already selected + available = [t for t in by_diff[alt_diff] if t not in cluster_selected] + taken = min(needs[target_diff], len(available)) + if taken > 0: + cluster_selected.extend(available[:taken]) + key = (target_diff, alt_diff) + stats['approximate_matches'][key] = stats['approximate_matches'].get(key, 0) + taken + needs[target_diff] -= taken + + selected.extend(cluster_selected) + stats['total_selected'] += len(cluster_selected) + else: + # For single difficulty modes (-only/-preferred) + cluster_selected = [] + for task in cluster: + if len(cluster_selected) >= examples_per_cluster: + break + if task['difficulty'] in target_difficulties: + cluster_selected.append(task) + stats['exact_matches'] += 1 + + # Check if we got enough samples for strict mode + if strict_mode and len(cluster_selected) < examples_per_cluster: + # In strict mode, raise error if not enough samples + diff_level_name = {1: "easy", 2: "medium", 3: "hard"}[target_difficulties[0]] + raise ValueError( + f"Not enough {diff_level_name} samples in cluster. " + f"Needed {examples_per_cluster}, found {len(cluster_selected)}. " + f"Use '{diff_level_name}-preferred' mode to allow closest matches as fallback." 
+ ) + + # If we need more and in preferred mode, use closest difficulty + if not strict_mode and len(cluster_selected) < examples_per_cluster: + remaining = [t for t in cluster if t not in cluster_selected] + + for task in remaining: + if len(cluster_selected) >= examples_per_cluster: + break + + # Find closest target difficulty + if target_difficulties: + closest_target = min(target_difficulties, key=lambda x: abs(x - task['difficulty'])) + cluster_selected.append(task) + key = (closest_target, task['difficulty']) + stats['approximate_matches'][key] = stats['approximate_matches'].get(key, 0) + 1 + + selected.extend(cluster_selected) + stats['total_selected'] += len(cluster_selected) + + # Sort all selected tasks by original index to preserve overall order + selected.sort(key=lambda x: x['original_index']) + + return selected, stats + + +def order_tasks( + task_pairs: List[Tuple[str, int]], + order_mode: str, + random_seed: int = None +) -> List[Tuple[str, int]]: + """Order tasks based on the specified mode.""" + if order_mode == "original": + return task_pairs + elif order_mode == "easy-to-hard": + return sorted(task_pairs, key=lambda x: (x[1], x[0])) + elif order_mode == "hard-to-easy": + return sorted(task_pairs, key=lambda x: (-x[1], x[0])) + elif order_mode == "random": + if random_seed is not None: + random.seed(random_seed) + shuffled = task_pairs.copy() + random.shuffle(shuffled) + return shuffled + else: + raise ValueError(f"Unknown order mode: {order_mode}") + + +def save_dataset(task_ids: List[str], output_name: str): + """Save selected task IDs to output file.""" + output_file = Path("data/datasets") / output_name + + with open(output_file, 'w') as f: + for task_id in task_ids: + f.write(f"{task_id}\n") + + print(f"Saved {len(task_ids)} tasks to: {output_file}") + + +def print_difficulty_matching_stats(stats: Dict): + """Print statistics about difficulty matching in cluster selection.""" + print("\n" + "=" * 60) + print("DIFFICULTY MATCHING STATISTICS") 
def print_difficulty_matching_stats(stats: Dict):
    """Print statistics about difficulty matching in cluster selection."""
    print("\n" + "=" * 60)
    print("DIFFICULTY MATCHING STATISTICS")
    print("=" * 60)

    total = stats['total_selected']
    print(f"Total tasks selected: {total}")
    if total == 0:
        # Fix: guard against ZeroDivisionError when no tasks were selected
        # (e.g. an empty cluster list in a -preferred mode).
        print("Exact matches: 0")
        print("Approximate matches: 0")
        print()
        return

    print(f"Exact matches: {stats['exact_matches']} ({stats['exact_matches']/total*100:.1f}%)")

    if stats['approximate_matches']:
        total_approx = sum(stats['approximate_matches'].values())
        print(f"Approximate matches: {total_approx} ({total_approx/total*100:.1f}%)")
        print("\nApproximate match breakdown:")
        print(f"  {'Requested':<12} {'Actual':<12} {'Count':<8}")
        print("  " + "-" * 32)

        # Sort by requested difficulty, then actual.
        for (requested, actual), count in sorted(stats['approximate_matches'].items()):
            req_label = f"Level {requested}"
            act_label = f"Level {actual}"
            print(f"  {req_label:<12} {act_label:<12} {count:<8}")
    else:
        print("Approximate matches: 0 (all exact matches!)")
    print()


def print_summary(task_pairs: List[Tuple[str, int]]):
    """Print a per-difficulty breakdown of the selected tasks."""
    total = len(task_pairs)
    if total == 0:
        print("No tasks selected!")
        return

    # Count by difficulty level (1=easy, 2=medium, 3=hard).
    counts = {1: 0, 2: 0, 3: 0}
    for _, diff in task_pairs:
        if diff in counts:
            counts[diff] += 1

    print("\nSelection Summary:")
    print(f"Total tasks: {total}")
    print(f"  Easy   (Level 1): {counts[1]:3d} ({counts[1]/total*100:5.1f}%)")
    print(f"  Medium (Level 2): {counts[2]:3d} ({counts[2]/total*100:5.1f}%)")
    print(f"  Hard   (Level 3): {counts[3]:3d} ({counts[3]/total*100:5.1f}%)")


def main():
    """CLI entry point: select, filter, reorder, and save tasks from a dataset."""
    parser = argparse.ArgumentParser(
        description="Select and reorder tasks from a dataset file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Standard selection with difficulty and size
    python3 experiments/curriculum/data_selector.py --dataset train.txt --output train_subset.txt --size 50 --difficulty balanced --order easy-to-hard

    # Cluster-based selection (ignores --size, respects --difficulty)
    # No difficulty filter - select from all difficulties
    python3 experiments/curriculum/data_selector.py --dataset train.txt --output train_subset.txt --cluster cosine:0.8 --examples-per-cluster 2 --order original

    # Easy-only mode - strict, errors if not enough easy samples
    python3 experiments/curriculum/data_selector.py --dataset train.txt --output train_subset.txt --cluster oracle --examples-per-cluster 3 --difficulty easy-only --order original

    # Easy-preferred mode - prefers easy, falls back to closest if unavailable
    python3 experiments/curriculum/data_selector.py --dataset train.txt --output train_subset.txt --cluster cosine:0.8 --examples-per-cluster 2 --difficulty easy-preferred --order original

    # Balanced mode - tries to maintain 1:1:1 ratio across difficulties
    python3 experiments/curriculum/data_selector.py --dataset train.txt --output train_subset.txt --cluster oracle --examples-per-cluster 3 --difficulty balanced --order easy-to-hard

Available similarity metrics for --cluster:
    - jaccard: Word-based set similarity
    - cosine: TF-IDF cosine similarity
    - levenshtein: Character-level edit distance
    - oracle: Ground truth clustering using task family IDs (threshold ignored)
    - embedding: Semantic similarity using OpenAI embeddings (requires OPENAI_API_KEY)

Difficulty modes:
    - None (default): No filtering, select from all difficulties
    - easy-only/medium-only/hard-only: Strict mode, requires exact matches (throws error if unavailable)
    - easy-preferred/medium-preferred/hard-preferred: Prefers difficulty, allows closest match as fallback
    - balanced: Tries to maintain 1:1:1 ratio across all three difficulty levels
    - custom: Custom ratio specified with --ratio (e.g., --ratio 1:2:1 for easy:medium:hard)
    """
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Input dataset filename (e.g., train.txt)"
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output dataset filename (e.g., train_subset.txt)"
    )
    parser.add_argument(
        "--size",
        type=int,
        required=False,
        help="Number of tasks to select (ignored when using --cluster)"
    )
    parser.add_argument(
        "--difficulty",
        type=str,
        choices=[
            "easy-only", "easy-preferred",
            "medium-only", "medium-preferred",
            "hard-only", "hard-preferred",
            "balanced", "custom"
        ],
        default=None,
        help="Difficulty filter. -only modes require exact matches (error if unavailable). -preferred modes prefer the level but allow closest match. balanced/custom for proportional selection. If not specified, no difficulty filter is applied."
    )
    parser.add_argument(
        "--order",
        type=str,
        choices=["original", "easy-to-hard", "hard-to-easy", "random"],
        default="original",
        help="Task ordering (default: original)"
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Random seed for reproducibility (only used with --order random)"
    )
    parser.add_argument(
        "--ratio",
        type=str,
        default=None,
        help="Difficulty ratio for custom mode (e.g., '1:2:1' for easy:medium:hard)"
    )
    parser.add_argument(
        "--cluster",
        type=str,
        default=None,
        help="Enable cluster-based selection with format 'metric:threshold' (e.g., 'cosine:0.8', 'oracle'). When used, --size is ignored but --difficulty is respected."
    )
    parser.add_argument(
        "--examples-per-cluster",
        type=int,
        default=None,
        help="Number of examples to select from each cluster (required when using --cluster)"
    )
    parser.add_argument(
        "--embedding-model",
        type=str,
        default="text-embedding-3-large",
        help="Embedding model to use when --cluster uses 'embedding' metric (default: text-embedding-3-large)"
    )

    args = parser.parse_args()

    # Validate mutually dependent arguments before doing any work.
    if args.cluster:
        if not args.examples_per_cluster:
            parser.error("--examples-per-cluster is required when using --cluster")
        if args.size:
            print("Warning: --size is ignored when using --cluster mode")
    else:
        if not args.size:
            parser.error("--size is required when not using --cluster mode")
        if args.examples_per_cluster:
            print("Warning: --examples-per-cluster is ignored when not using --cluster mode")

    # Load dataset
    print(f"Loading dataset: {args.dataset}")
    task_ids = load_dataset(args.dataset)
    print(f"Found {len(task_ids)} tasks")

    # Branch based on clustering mode
    if args.cluster:
        # Parse "metric:threshold"; a bare metric name (e.g. "oracle") gets a
        # dummy threshold that the clustering ignores.
        if ':' in args.cluster:
            parts = args.cluster.split(':', 1)
            similarity_metric = parts[0]
            try:
                threshold = float(parts[1])
            except ValueError:
                parser.error(f"Invalid threshold in --cluster argument: {parts[1]}")
        else:
            similarity_metric = args.cluster
            threshold = 0.0  # Will be ignored for oracle

        # Validate similarity metric
        valid_metrics = ["jaccard", "cosine", "levenshtein", "oracle", "embedding"]
        if similarity_metric not in valid_metrics:
            parser.error(f"Invalid similarity metric: {similarity_metric}. Must be one of {valid_metrics}")

        print(f"\n=== CLUSTER-BASED SELECTION ===")
        print(f"Similarity metric: {similarity_metric}")
        if similarity_metric != "oracle":
            print(f"Threshold: {threshold}")
        print(f"Examples per cluster: {args.examples_per_cluster}")
        if args.difficulty:
            print(f"Difficulty filter: {args.difficulty}")
            if args.difficulty == "custom" and args.ratio:
                print(f"Custom ratio: {args.ratio}")
            elif args.difficulty == "balanced":
                print(f"Using balanced distribution (1:1:1)")
        else:
            print(f"Difficulty filter: None (all difficulties)")

        # Cluster tasks
        clusters = cluster_tasks_by_similarity(
            task_ids,
            similarity_metric,
            threshold,
            embedding_model=args.embedding_model
        )

        print(f"\nClustering complete: {len(clusters)} clusters found")

        # Print cluster statistics
        cluster_sizes = [len(cluster) for cluster in clusters]
        print(f"Cluster size range: {min(cluster_sizes)} to {max(cluster_sizes)}")
        print(f"Average cluster size: {sum(cluster_sizes) / len(cluster_sizes):.1f}")

        # Select from clusters with difficulty filtering
        filter_msg = f"with difficulty filter '{args.difficulty}'" if args.difficulty else "without difficulty filter"
        print(f"\nSelecting {args.examples_per_cluster} example(s) from each cluster {filter_msg}...")
        try:
            selected_tasks, difficulty_stats = select_from_clusters(
                clusters,
                args.examples_per_cluster,
                difficulty_filter=args.difficulty,
                ratio=args.ratio
            )
            print(f"Selected {len(selected_tasks)} tasks total")

            # Print difficulty matching statistics only if filter was applied
            if args.difficulty:
                print_difficulty_matching_stats(difficulty_stats)
        except ValueError as e:
            # Strict (-only) modes raise when a cluster cannot satisfy the quota.
            print(f"\nError: {e}")
            return

        # Convert to task_pairs format for ordering and summary
        selected_pairs = [(task['task_id'], task['difficulty']) for task in selected_tasks]

    else:
        # Standard selection mode
        print("\n=== STANDARD SELECTION ===")

        # Get difficulty information
        print("Reading task metadata...")
        task_pairs = get_tasks_with_difficulty(task_ids)

        # Filter by difficulty
        print(f"Filtering by difficulty: {args.difficulty}")
        filtered_pairs = filter_by_difficulty(task_pairs, args.difficulty)
        print(f"After filtering: {len(filtered_pairs)} tasks")

        # Select tasks
        print(f"Selecting {args.size} tasks...")
        if args.difficulty == "custom":
            if args.ratio:
                print(f"Using custom ratio: {args.ratio}")
            else:
                print("Warning: --ratio not specified for custom mode")
        elif args.difficulty == "balanced":
            print("Using balanced distribution (1:1:1)")
        selected_pairs = select_tasks(filtered_pairs, args.size, args.difficulty, args.ratio)

        if len(selected_pairs) < args.size:
            print(f"Warning: Only {len(selected_pairs)} tasks available, requested {args.size}")

    # Order tasks (applies to both modes)
    print(f"\nOrdering tasks: {args.order}")
    ordered_pairs = order_tasks(selected_pairs, args.order, args.seed)

    # Extract task IDs
    selected_task_ids = [task_id for task_id, _ in ordered_pairs]

    # Print summary
    print_summary(ordered_pairs)

    # Save output
    save_dataset(selected_task_ids, args.output)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Similarity metrics for comparing text instructions.

This module provides various text similarity metrics that can be used
for clustering and comparing task instructions.

Available metrics:
- Jaccard similarity: Word-based set similarity
- TF-IDF Cosine similarity: Weighted term similarity with IDF scoring
- Levenshtein similarity: Character-level edit distance similarity
- Oracle similarity: Ground truth similarity based on task family ID
- Embedding similarity: Semantic similarity using sentence embeddings (requires API key)
"""

import math
import os
import re
from collections import Counter, defaultdict
from typing import Dict, List, Optional

# Compiled once at import time; tokenize() is called inside pairwise
# clustering loops, so hoisting the pattern avoids repeated cache lookups.
_WORD_PATTERN = re.compile(r'\b\w+\b')


def extract_task_family_id(task_id: str) -> str:
    """Return the family portion of a task ID (the text before the first '_').

    e.g. "76f2c72_2" -> "76f2c72". An ID with no underscore is returned whole.
    """
    # str.split('_') always yields at least one element, so taking index 0 is
    # exactly equivalent to the defensive "parts[0] if parts else task_id".
    return task_id.split('_')[0]


def tokenize(text: str) -> List[str]:
    """Lowercase *text* and return its alphanumeric word tokens."""
    return _WORD_PATTERN.findall(text.lower())


def calculate_jaccard_similarity(s1: str, s2: str) -> float:
    """Jaccard similarity between the word-token sets of two strings.

    Returns a score in [0.0, 1.0]; an empty string on either side yields 0.0.
    """
    if not s1 or not s2:
        return 0.0

    tokens_a = set(tokenize(s1))
    tokens_b = set(tokenize(s2))

    union_size = len(tokens_a | tokens_b)
    if union_size == 0:
        # Both strings tokenized to nothing (e.g. punctuation only).
        return 0.0
    return len(tokens_a & tokens_b) / union_size
def calculate_cosine_similarity(s1: str, s2: str, idf_scores: Dict[str, float]) -> float:
    """Calculate TF-IDF cosine similarity between two strings.

    Args:
        s1: First string
        s2: Second string
        idf_scores: Dictionary mapping terms to their IDF scores

    Returns:
        Similarity score between 0.0 and 1.0
    """
    if not s1 or not s2:
        return 0.0

    def tfidf_vector(text: str) -> Dict[str, float]:
        # Term frequency weighted by the corpus IDF; unknown terms get IDF 0.
        words = re.findall(r'\b\w+\b', text.lower())
        if not words:
            return {}
        counts = Counter(words)
        total = len(words)
        return {term: (n / total) * idf_scores.get(term, 0) for term, n in counts.items()}

    vec_a = tfidf_vector(s1)
    vec_b = tfidf_vector(s2)

    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0

    # Terms absent from vec_b contribute zero, so iterating vec_a alone
    # computes the same dot product as iterating the union of keys.
    dot = sum(weight * vec_b.get(term, 0) for term, weight in vec_a.items())
    return dot / (norm_a * norm_b)
def calculate_levenshtein_similarity(s1: str, s2: str) -> float:
    """Calculate normalized Levenshtein (edit distance) similarity.

    Args:
        s1: First string
        s2: Second string

    Returns:
        Similarity score between 0.0 and 1.0. As in the original contract,
        an empty string on either side returns 0.0 (including two empties).

    Changes: removed the unreachable ``len == 0`` branches (dead code after
    the emptiness guard) and replaced the full DP matrix with a two-row
    rolling buffer — same distances, O(min(len1, len2)) memory.
    """
    if not s1 or not s2:
        return 0.0

    # Keep the shorter string as the DP row; Levenshtein is symmetric.
    if len(s2) > len(s1):
        s1, s2 = s2, s1
    len1, len2 = len(s1), len(s2)

    previous = list(range(len2 + 1))
    for i, ch1 in enumerate(s1, start=1):
        current = [i] + [0] * len2
        for j, ch2 in enumerate(s2, start=1):
            cost = 0 if ch1 == ch2 else 1
            current[j] = min(
                previous[j] + 1,        # deletion
                current[j - 1] + 1,     # insertion
                previous[j - 1] + cost  # substitution
            )
        previous = current

    distance = previous[len2]
    return 1.0 - distance / max(len1, len2)


def calculate_oracle_similarity(task_id1: str, task_id2: str) -> float:
    """Calculate oracle similarity based on task family ID.

    This is a "ground truth" similarity metric that returns 1.0 if two tasks
    belong to the same family (same prefix before underscore), 0.0 otherwise.

    Args:
        task_id1: First task ID (e.g., "76f2c72_2")
        task_id2: Second task ID (e.g., "76f2c72_3")

    Returns:
        1.0 if same family, 0.0 otherwise
    """
    if not task_id1 or not task_id2:
        return 0.0

    # Family ID is the text before the first underscore (split never returns
    # an empty list, so index 0 is always safe).
    return 1.0 if task_id1.split('_')[0] == task_id2.split('_')[0] else 0.0
def get_embedding(
    text: str,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    model: str = "text-embedding-3-large"
) -> List[float]:
    """Get embedding vector for a text using OpenAI API.

    Args:
        text: Text to embed
        api_key: OpenAI API key (if None, will try to use OPENAI_API_KEY env var)
        base_url: API base URL (default: None, uses OpenAI's default)
        model: Embedding model to use (default: text-embedding-3-large)

    Returns:
        Embedding vector as list of floats

    Raises:
        ImportError: If openai package is not installed
        ValueError: If API key is not provided
        RuntimeError: If the embedding request fails
    """
    try:
        from openai import OpenAI
    except ImportError:
        raise ImportError("openai package required for embedding similarity. Install with: pip install openai")

    # Resolve the key: explicit argument wins, then the environment.
    key = api_key if api_key is not None else os.environ.get("OPENAI_API_KEY")
    if not key:
        raise ValueError(
            "API key required for embedding similarity. "
            "Set OPENAI_API_KEY environment variable or pass api_key parameter."
        )

    client_kwargs = {"api_key": key}
    if base_url:
        client_kwargs["base_url"] = base_url
    client = OpenAI(**client_kwargs)

    try:
        response = client.embeddings.create(
            model=model,
            input=text
        )
        return response.data[0].embedding
    except Exception as e:
        # Wrap any API failure with a hint about valid model names.
        raise RuntimeError(
            f"Failed to get embedding with model '{model}'. "
            f"Error: {str(e)}. "
            f"Available OpenAI models: text-embedding-3-small, text-embedding-3-large, text-embedding-ada-002"
        ) from e
def calculate_embedding_similarity(
    s1: str,
    s2: str,
    embeddings_cache: Optional[Dict[str, List[float]]] = None,
    api_key: Optional[str] = None,
    model: str = "text-embedding-3-large"
) -> float:
    """Calculate cosine similarity between embeddings of two texts.

    Args:
        s1: First string
        s2: Second string
        embeddings_cache: Optional cache of pre-computed embeddings {text: embedding}
        api_key: OpenAI API key
        model: Embedding model used on a cache miss (new, backward-compatible:
            previously a cache miss always fell back to the default model even
            when the cache was built with a different one)

    Returns:
        Cosine similarity score between 0.0 and 1.0
    """
    if not s1 or not s2:
        return 0.0

    def _embedding_for(text: str) -> List[float]:
        # Prefer the cache to avoid an API round-trip; populate it on miss
        # so later comparisons reuse the vector.
        if embeddings_cache and text in embeddings_cache:
            return embeddings_cache[text]
        vector = get_embedding(text, api_key, model=model)
        if embeddings_cache is not None:
            embeddings_cache[text] = vector
        return vector

    emb1 = _embedding_for(s1)
    emb2 = _embedding_for(s2)

    dot_product = sum(a * b for a, b in zip(emb1, emb2))
    magnitude1 = math.sqrt(sum(a * a for a in emb1))
    magnitude2 = math.sqrt(sum(b * b for b in emb2))

    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    # Raw cosine similarity is in [-1, 1]; normalize to [0, 1].
    similarity = dot_product / (magnitude1 * magnitude2)
    return (similarity + 1) / 2


def compute_embeddings_batch(
    texts: List[str],
    api_key: Optional[str] = None,
    model: str = "text-embedding-3-large"
) -> Dict[str, List[float]]:
    """Pre-compute embeddings for a list of texts.

    Args:
        texts: List of texts to embed
        api_key: OpenAI API key
        model: Embedding model to use

    Returns:
        Dictionary mapping each text to its embedding vector
    """
    embeddings_cache = {}
    for i, text in enumerate(texts):
        # Skip empty strings and texts already embedded (duplicates).
        if text and text not in embeddings_cache:
            print(f"  Progress: {i+1}/{len(texts)}", end='\r')
            embeddings_cache[text] = get_embedding(text, api_key, model=model)
    print(f"  Progress: {len(texts)}/{len(texts)} - Complete!")
    return embeddings_cache
def compute_idf_scores(texts: List[str]) -> Dict[str, float]:
    """Compute IDF (Inverse Document Frequency) scores for all terms in a text corpus.

    Args:
        texts: List of text documents

    Returns:
        Dictionary mapping terms to their IDF scores
    """
    num_docs = len(texts)
    if num_docs == 0:
        return {}

    # Document frequency: in how many documents does each term appear?
    doc_freq = defaultdict(int)
    for text in texts:
        for term in set(tokenize(text)):
            doc_freq[term] += 1

    # Every counted frequency is >= 1 by construction, so the previous
    # "if freq > 0" guard was dead code and has been removed.
    return {term: math.log(num_docs / freq) for term, freq in doc_freq.items()}


def calculate_similarity(
    s1: str,
    s2: str,
    metric: str = "jaccard",
    idf_scores: Dict[str, float] = None,
    task_id1: Optional[str] = None,
    task_id2: Optional[str] = None,
    embeddings_cache: Optional[Dict[str, List[float]]] = None,
    api_key: Optional[str] = None
) -> float:
    """Calculate similarity between two strings using the specified metric.

    Args:
        s1: First string (instruction text or task ID for oracle)
        s2: Second string (instruction text or task ID for oracle)
        metric: Similarity metric to use ('jaccard', 'cosine', 'levenshtein', 'oracle', 'embedding')
        idf_scores: IDF scores (required for cosine similarity)
        task_id1: First task ID (required for oracle similarity)
        task_id2: Second task ID (required for oracle similarity)
        embeddings_cache: Cache of embeddings (for embedding similarity)
        api_key: API key for embedding service (for embedding similarity)

    Returns:
        Similarity score between 0.0 and 1.0

    Raises:
        ValueError: If an unknown metric is specified or if required parameters are missing
    """
    if metric == "jaccard":
        return calculate_jaccard_similarity(s1, s2)
    if metric == "cosine":
        if idf_scores is None:
            raise ValueError("IDF scores required for cosine similarity")
        return calculate_cosine_similarity(s1, s2, idf_scores)
    if metric == "levenshtein":
        return calculate_levenshtein_similarity(s1, s2)
    if metric == "oracle":
        if task_id1 is None or task_id2 is None:
            raise ValueError("Task IDs required for oracle similarity")
        return calculate_oracle_similarity(task_id1, task_id2)
    if metric == "embedding":
        return calculate_embedding_similarity(s1, s2, embeddings_cache, api_key)
    raise ValueError(f"Unknown similarity metric: {metric}")