diff --git a/negate/extract/feature_artwork.py b/negate/extract/feature_artwork.py index 58bc7c5..e6ab265 100644 --- a/negate/extract/feature_artwork.py +++ b/negate/extract/feature_artwork.py @@ -6,16 +6,39 @@ Implements the 39-feature extraction pipeline from: Li & Stamp, "Detecting AI-generated Artwork", arXiv:2504.07078, 2025. -Extended with a dedicated frequency analysis branch (FFT/DCT) that captures -spectral fingerprints left by generative models. +Extended with: + - Dedicated frequency analysis branch (FFT/DCT) for spectral fingerprints + - Enhanced GLCM (multi-angle/distance) per Nirob et al. (2026) + - Full LBP histogram features per Nirob et al. (2026) + - Mid-band frequency analysis per FIRE (CVPR 2025) + - Patch-level consistency features per CINEMAE (2025) + - Multi-scale LBP (8): R=3/P=24 coarse texture + per-scale stats + - Gabor filter bank (18): 4 freq x 4 orient energy + summary stats + - Wavelet packet statistics (12): 2-level Haar detail coefficients + - Color coherence vectors (6): coherent/incoherent pixel ratios per channel + - Edge co-occurrence (8): edge-direction GLCM properties + - Fractal dimension (2): box-counting on grayscale + edge map + - Extended HOG (6): multi-scale HOG + cross-scale ratios + - JPEG ghost detection (4): recompression RMSE at multiple quality levels -Features are grouped into 6 categories: +Features are grouped into 16 categories: - Brightness (2): mean, entropy - Color (23): RGB/HSV histogram statistics - Texture (6): GLCM + LBP - Shape (6): HOG + edge length - Noise (2): noise entropy, SNR - Frequency (10): FFT/DCT spectral analysis + - Enhanced texture (14): multi-angle GLCM, full LBP histogram, DCT block stats + - Patch consistency (6): cross-patch feature variance (CINEMAE-inspired) + - Mid-band frequency (4): fine-grained radial band analysis + - Multi-scale LBP (8): coarse texture descriptors + - Gabor filter bank (18): oriented frequency responses + - Wavelet packets (12): Haar detail coefficient 
statistics + - Color coherence (6): spatial color consistency + - Edge co-occurrence (8): edge direction relationships + - Fractal dimension (2): complexity measures + - Extended HOG (6): multi-scale gradient histograms + - JPEG ghosts (4): recompression artifacts """ from __future__ import annotations @@ -211,19 +234,894 @@ def _frequency_features(gray: NDArray) -> dict[str, float]: } +def _enhanced_texture_features(gray: NDArray) -> dict[str, float]: + """Extended GLCM + full LBP histogram + block DCT (14 features). + + Per Nirob et al. (2026): fusing multiple GLCM angles/distances and + full LBP histogram distributions significantly improves detection. + """ + gray_uint8 = (gray * 255).astype(np.uint8) if gray.max() <= 1 else gray.astype(np.uint8) + + # Multi-angle GLCM: 4 angles × 2 distances, averaged per property + angles = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4] + distances = [1, 3] + glcm = graycomatrix(gray_uint8, distances=distances, angles=angles, levels=256, symmetric=True, normed=True) + + features: dict[str, float] = {} + for prop in ("contrast", "correlation", "energy", "homogeneity"): + vals = graycoprops(glcm, prop) + features[f"glcm_multi_{prop}_mean"] = float(vals.mean()) + features[f"glcm_multi_{prop}_std"] = float(vals.std()) + + # Full LBP histogram (10-bin uniform + variance of spatial LBP) + lbp = local_binary_pattern(gray_uint8, P=8, R=1, method="uniform") + lbp_hist, _ = np.histogram(lbp, bins=10, range=(0, 10), density=True) + features["lbp_hist_kurtosis"] = float(kurtosis(lbp_hist)) + features["lbp_hist_skew"] = float(skew(lbp_hist)) + features["lbp_hist_max"] = float(lbp_hist.max()) + + # Multi-scale LBP: R=2, P=16 captures coarser texture + lbp_coarse = local_binary_pattern(gray_uint8, P=16, R=2, method="uniform") + features["lbp_coarse_entropy"] = float(entropy(np.histogram(lbp_coarse, bins=18)[0] + 1e-10)) + + # Block-level DCT statistics (8x8 blocks, like JPEG) + from scipy.fft import dctn + h, w = gray.shape + block_size = 8 + 
block_energies = [] + for y in range(0, h - block_size, block_size): + for x in range(0, w - block_size, block_size): + block = gray[y:y+block_size, x:x+block_size] + dct_block = dctn(block, type=2, norm="ortho") + # Energy in AC coefficients (exclude DC at [0,0]) + ac_energy = float((dct_block ** 2).sum() - dct_block[0, 0] ** 2) + block_energies.append(ac_energy) + + block_energies = np.array(block_energies) + features["dct_block_energy_mean"] = float(block_energies.mean()) + features["dct_block_energy_std"] = float(block_energies.std()) + + return features + + +def _midband_frequency_features(gray: NDArray) -> dict[str, float]: + """Mid-band frequency analysis (4 features). + + Per FIRE (CVPR 2025): diffusion models specifically fail to accurately + reconstruct mid-band frequency information. This measures the mid-band + energy distribution relative to natural image expectations. + """ + h, w = gray.shape + fft_2d = np.fft.fft2(gray) + fft_shift = np.fft.fftshift(fft_2d) + magnitude = np.abs(fft_shift) + + center_h, center_w = h // 2, w // 2 + y, x = np.ogrid[:h, :w] + radius = np.sqrt((x - center_w) ** 2 + (y - center_h) ** 2) + max_r = np.sqrt(center_h ** 2 + center_w ** 2) + + # Fine-grained radial bands (5 bands instead of 3) + bands = [(0, 0.1), (0.1, 0.25), (0.25, 0.45), (0.45, 0.7), (0.7, 1.0)] + band_energies = [] + for lo, hi in bands: + mask = (radius >= max_r * lo) & (radius < max_r * hi) + band_energies.append(float((magnitude[mask] ** 2).sum())) + + total = sum(band_energies) + 1e-10 + band_ratios = [e / total for e in band_energies] + + # Natural images follow approximate 1/f power law + # Deviation from 1/f in mid-bands is a strong AI signal + expected_ratios = np.array([0.65, 0.20, 0.10, 0.035, 0.015]) # approximate 1/f + actual_ratios = np.array(band_ratios) + deviation = actual_ratios - expected_ratios + + return { + "midband_energy_ratio": float(band_ratios[2]), # 0.25-0.45 band specifically + "midband_deviation": float(deviation[2]), # 
deviation from expected in midband + "spectral_slope_deviation": float(np.std(deviation)), # overall 1/f deviation + "high_to_mid_ratio": float(band_ratios[4] / (band_ratios[2] + 1e-10)), # high/mid balance + } + + +def _patch_consistency_features(gray: NDArray) -> dict[str, float]: + """Cross-patch consistency features (6 features). + + Per CINEMAE (2025): real images have consistent patch-to-context + relationships that AI images subtly violate. We measure variance + of per-patch statistics across the image. + """ + h, w = gray.shape + patch_size = 32 + n_patches = 0 + + patch_means = [] + patch_stds = [] + patch_edges = [] + patch_freq_centroids = [] + + for y in range(0, h - patch_size, patch_size): + for x in range(0, w - patch_size, patch_size): + patch = gray[y:y+patch_size, x:x+patch_size] + patch_means.append(float(patch.mean())) + patch_stds.append(float(patch.std())) + + # Edge density per patch + from skimage.feature import canny + edges = canny(patch) + patch_edges.append(float(edges.mean())) + + # Frequency centroid per patch + fft_p = np.fft.fft2(patch) + mag_p = np.abs(fft_p) + freqs = np.fft.fftfreq(patch_size) + freq_grid = np.sqrt(freqs[:, None] ** 2 + freqs[None, :] ** 2) + centroid = float(np.sum(mag_p * freq_grid) / (mag_p.sum() + 1e-10)) + patch_freq_centroids.append(centroid) + n_patches += 1 + + if n_patches < 4: + return {k: 0.0 for k in [ + "patch_mean_cv", "patch_std_cv", "patch_edge_cv", + "patch_freq_centroid_cv", "patch_freq_centroid_range", + "patch_coherence_score", + ]} + + # Coefficient of variation (std/mean) for each patch-level statistic + # Higher CV = more inconsistency across patches + def _cv(arr: list[float]) -> float: + a = np.array(arr) + return float(a.std() / (abs(a.mean()) + 1e-10)) + + freq_arr = np.array(patch_freq_centroids) + + return { + "patch_mean_cv": _cv(patch_means), + "patch_std_cv": _cv(patch_stds), + "patch_edge_cv": _cv(patch_edges), + "patch_freq_centroid_cv": _cv(patch_freq_centroids), + 
"patch_freq_centroid_range": float(freq_arr.max() - freq_arr.min()), + "patch_coherence_score": float(np.corrcoef(patch_means, patch_stds)[0, 1]) + if len(patch_means) > 2 else 0.0, + } + + +def _multiscale_lbp_features(gray: NDArray) -> dict[str, float]: + """Multi-scale LBP features (8 features). + + Extends existing LBP (R=1,P=8 and R=2,P=16) with R=3,P=24 for coarser + texture, and computes per-scale summary statistics. + """ + gray_uint8 = (gray * 255).astype(np.uint8) if gray.max() <= 1 else gray.astype(np.uint8) + features: dict[str, float] = {} + + scales = [ + (8, 1, "s1"), + (16, 2, "s2"), + (24, 3, "s3"), + ] + + for p, r, label in scales: + lbp = local_binary_pattern(gray_uint8, P=p, R=r, method="uniform") + n_bins = p + 2 # uniform LBP has P+2 bins + hist, _ = np.histogram(lbp, bins=n_bins, range=(0, n_bins), density=True) + + features[f"mslbp_{label}_mean"] = float(lbp.mean()) + features[f"mslbp_{label}_var"] = float(lbp.var()) + + # Only add entropy and uniformity for the new R=3 scale to avoid + # duplicating stats already captured by _texture_features and _enhanced_texture_features + if r == 3: + features[f"mslbp_{label}_entropy"] = float(entropy(hist + 1e-10)) + features[f"mslbp_{label}_uniformity"] = float(hist.max()) + + return features + + +def _gabor_features(gray: NDArray) -> dict[str, float]: + """Gabor filter bank features (18 features). + + 4 frequencies x 4 orientations = 16 mean energy values, + plus overall mean and std across all filter responses. 
+ """ + from skimage.filters import gabor + + features: dict[str, float] = {} + all_energies = [] + + freqs = [0.1, 0.2, 0.3, 0.4] + thetas = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4] + + for fi, freq in enumerate(freqs): + for ti, theta in enumerate(thetas): + filt_real, filt_imag = gabor(gray, frequency=freq, theta=theta) + energy = float(np.sqrt(filt_real ** 2 + filt_imag ** 2).mean()) + features[f"gabor_f{fi}_t{ti}_energy"] = energy + all_energies.append(energy) + + all_e = np.array(all_energies) + features["gabor_mean_energy"] = float(all_e.mean()) + features["gabor_std_energy"] = float(all_e.std()) + + return features + + +def _wavelet_packet_features(gray: NDArray) -> dict[str, float]: + """Wavelet packet statistics (12 features). + + 2-level Haar wavelet decomposition. For each detail subband + (LH, HL, HH at levels 1 and 2): mean and std of coefficients. + """ + import pywt + + coeffs = pywt.wavedec2(gray, "haar", level=2) + # coeffs: [cA2, (cH2, cV2, cD2), (cH1, cV1, cD1)] + features: dict[str, float] = {} + + subband_names = ["LH", "HL", "HH"] + for level_idx, level in enumerate([1, 2]): + # coeffs index: level 2 details are at index 1, level 1 at index 2 + detail_tuple = coeffs[len(coeffs) - level] + for sb_idx, sb_name in enumerate(subband_names): + c = detail_tuple[sb_idx] + prefix = f"wvt_L{level}_{sb_name}" + features[f"{prefix}_mean"] = float(np.abs(c).mean()) + features[f"{prefix}_std"] = float(c.std()) + + return features + + +def _color_coherence_features(rgb: NDArray) -> dict[str, float]: + """Color coherence vector features (6 features). + + For each RGB channel: ratio of coherent pixels (in large connected + regions) to incoherent (small isolated regions). Threshold tau=25. 
+ """ + from scipy.ndimage import label as ndlabel + + features: dict[str, float] = {} + tau = 25 + + rgb_uint8 = rgb.astype(np.uint8) if rgb.max() > 1 else (rgb * 255).astype(np.uint8) + + for i, name in enumerate(("red", "green", "blue")): + channel = rgb_uint8[:, :, i] + # Quantize to reduce noise: 64 bins + quantized = (channel // 4).astype(np.uint8) + + # For a representative threshold, use median intensity + median_val = np.median(quantized) + binary = quantized >= median_val + + labeled, n_components = ndlabel(binary) + if n_components == 0: + features[f"ccv_{name}_coherent_ratio"] = 0.0 + features[f"ccv_{name}_incoherent_ratio"] = 1.0 + continue + + total_pixels = float(binary.sum()) + if total_pixels < 1: + features[f"ccv_{name}_coherent_ratio"] = 0.0 + features[f"ccv_{name}_incoherent_ratio"] = 1.0 + continue + + coherent = 0.0 + for comp_id in range(1, n_components + 1): + comp_size = float((labeled == comp_id).sum()) + if comp_size >= tau: + coherent += comp_size + + incoherent = total_pixels - coherent + features[f"ccv_{name}_coherent_ratio"] = coherent / (total_pixels + 1e-10) + features[f"ccv_{name}_incoherent_ratio"] = incoherent / (total_pixels + 1e-10) + + return features + + +def _edge_cooccurrence_features(gray: NDArray) -> dict[str, float]: + """Edge co-occurrence features (8 features). + + Compute Canny edges, quantize gradient directions into bins, + build a GLCM of edge directions, and extract standard properties. 
+ """ + from skimage.feature import canny + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + edges = canny(gray_f) + + # Compute gradient directions using Sobel + from scipy.ndimage import sobel + gx = sobel(gray_f, axis=1) + gy = sobel(gray_f, axis=0) + angles = np.arctan2(gy, gx) # -pi to pi + + # Quantize angles to 8 direction bins (only at edge pixels) + n_dirs = 8 + # Map -pi..pi to 0..n_dirs + dir_map = np.zeros_like(gray_f, dtype=np.uint8) + dir_map[:] = ((angles + np.pi) / (2 * np.pi) * n_dirs).astype(np.uint8) % n_dirs + + # Mask to edge pixels only + dir_map[~edges] = 0 + + # Build edge direction co-occurrence (GLCM on direction map at edge pixels) + # Use graycomatrix on the direction map + edge_glcm = graycomatrix( + dir_map, distances=[1], angles=[0, np.pi / 2], + levels=n_dirs, symmetric=True, normed=True, + ) + + features: dict[str, float] = {} + for prop in ("contrast", "homogeneity", "energy", "correlation"): + vals = graycoprops(edge_glcm, prop) + features[f"edge_cooc_{prop}_mean"] = float(vals.mean()) + features[f"edge_cooc_{prop}_std"] = float(vals.std()) + + return features + + +def _fractal_dimension_features(gray: NDArray) -> dict[str, float]: + """Fractal dimension via box-counting (2 features). + + Estimates fractal dimension of the grayscale image (thresholded) + and the edge map. Real artwork often has different fractal + characteristics than AI-generated images. 
+ """ + from skimage.feature import canny + + def _box_counting_dim(binary: NDArray, box_sizes: list[int] | None = None) -> float: + if box_sizes is None: + box_sizes = [2, 4, 8, 16, 32, 64] + + sizes = [] + counts = [] + for box_size in box_sizes: + h, w = binary.shape + # Count boxes needed to cover all True pixels + # Reshape into grid of boxes + nh = h // box_size + nw = w // box_size + if nh < 1 or nw < 1: + continue + cropped = binary[:nh * box_size, :nw * box_size] + # Reshape and check if any pixel in each box is True + reshaped = cropped.reshape(nh, box_size, nw, box_size) + box_has_pixel = reshaped.any(axis=(1, 3)) + count = int(box_has_pixel.sum()) + if count > 0: + sizes.append(box_size) + counts.append(count) + + if len(sizes) < 2: + return 1.0 # degenerate case + + log_sizes = np.log(1.0 / np.array(sizes, dtype=np.float64)) + log_counts = np.log(np.array(counts, dtype=np.float64)) + + # Linear regression: slope = fractal dimension + coeffs = np.polyfit(log_sizes, log_counts, 1) + return float(coeffs[0]) + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + + # Threshold grayscale at median + binary_gray = gray_f > np.median(gray_f) + fd_gray = _box_counting_dim(binary_gray) + + # Edge map fractal dimension + edges = canny(gray_f) + fd_edges = _box_counting_dim(edges) + + return { + "fractal_dim_gray": fd_gray, + "fractal_dim_edges": fd_edges, + } + + +def _extended_hog_features(gray: NDArray) -> dict[str, float]: + """Extended HOG features (6 features). + + HOG at two cell sizes (8x8 fine, 32x32 coarse), plus cross-scale + energy ratio and angular histogram entropy at each scale. 
+ """ + from skimage.feature import hog + + features: dict[str, float] = {} + + # Fine scale: 8x8 cells + hog_fine = hog(gray, pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True) + fine_energy = float((hog_fine ** 2).sum()) + fine_hist = np.histogram(hog_fine, bins=50)[0] + features["hog_fine_energy"] = fine_energy + features["hog_fine_entropy"] = float(entropy(fine_hist + 1e-10)) + + # Coarse scale: 32x32 cells + hog_coarse = hog(gray, pixels_per_cell=(32, 32), cells_per_block=(2, 2), feature_vector=True) + coarse_energy = float((hog_coarse ** 2).sum()) + coarse_hist = np.histogram(hog_coarse, bins=50)[0] + features["hog_coarse_energy"] = coarse_energy + features["hog_coarse_entropy"] = float(entropy(coarse_hist + 1e-10)) + + # Cross-scale ratio + features["hog_fine_coarse_ratio"] = fine_energy / (coarse_energy + 1e-10) + + # Overall angular dispersion + features["hog_energy_ratio_to_mean"] = fine_energy / (float(hog_fine.mean()) + 1e-10) + + return features + + +def _jpeg_ghost_features(rgb: NDArray) -> dict[str, float]: + """JPEG ghost detection features (4 features). + + Resave image at different quality levels and measure RMSE between + original and resaved. AI and real images respond differently to + recompression artifacts. 
+ """ + from io import BytesIO + + arr = rgb.astype(np.uint8) if rgb.max() > 1 else (rgb * 255).astype(np.uint8) + features: dict[str, float] = {} + rmses = [] + + for q in [50, 70, 90]: + try: + buf = BytesIO() + Image.fromarray(arr).save(buf, format="JPEG", quality=q) + buf.seek(0) + resaved = np.array(Image.open(buf).convert("RGB"), dtype=np.float64) + arr_f = arr.astype(np.float64) + rmse = float(np.sqrt(((arr_f - resaved) ** 2).mean())) + except Exception: + rmse = 0.0 + features[f"jpeg_ghost_q{q}_rmse"] = rmse + rmses.append(rmse) + + # Slope of RMSE across quality levels (how much quality matters) + if len(rmses) >= 2 and rmses[0] > 0: + features["jpeg_ghost_rmse_slope"] = float(rmses[0] - rmses[-1]) + else: + features["jpeg_ghost_rmse_slope"] = 0.0 + + return features + + +def _noise_residual_autocorr_features(gray: NDArray) -> dict[str, float]: + """Autocorrelation of noise residuals (5 features). + + Canvas texture produces periodic peaks in the autocorrelation at thread + spacing intervals. Generator artifacts produce peaks at architecture-specific + frequencies. Real digital art has smooth monotonic decay. 
+ """ + from scipy.ndimage import gaussian_filter + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + # Extract noise residual + smoothed = gaussian_filter(gray_f, sigma=1.5) + residual = gray_f - smoothed + + h, w = residual.shape + # Compute 1D autocorrelation along rows (averaged) + max_lag = min(64, w // 4) + res_rows = residual[:, :w - w % 1] # trim for alignment + acf = np.zeros(max_lag) + for lag in range(max_lag): + if lag == 0: + acf[lag] = 1.0 + else: + shifted = residual[:, lag:] + original = residual[:, :w - lag] + if original.size > 0: + acf[lag] = float(np.corrcoef(original.ravel(), shifted.ravel())[0, 1]) + + # Look for secondary peaks (evidence of periodic structure) + # Skip lag 0 and first few lags (always high) + acf_tail = acf[3:] + if len(acf_tail) > 2: + # Find peaks + peaks = [] + for i in range(1, len(acf_tail) - 1): + if acf_tail[i] > acf_tail[i - 1] and acf_tail[i] > acf_tail[i + 1]: + peaks.append((i + 3, acf_tail[i])) + + n_peaks = len(peaks) + max_peak = max(p[1] for p in peaks) if peaks else 0.0 + # Decay rate: how fast ACF drops + decay_rate = float(acf[1] - acf[min(10, max_lag - 1)]) if max_lag > 10 else 0.0 + else: + n_peaks = 0 + max_peak = 0.0 + decay_rate = 0.0 + + return { + "acf_n_secondary_peaks": float(n_peaks), + "acf_max_secondary_peak": float(max_peak), + "acf_decay_rate": decay_rate, + "acf_lag2": float(acf[2]) if max_lag > 2 else 0.0, + "acf_lag8": float(acf[8]) if max_lag > 8 else 0.0, + } + + +def _stroke_edge_roughness_features(gray: NDArray) -> dict[str, float]: + """Stroke edge roughness (4 features). + + Physical brush strokes have characteristic edge roughness from bristles. + AI strokes tend to have smoother, more regular edges. + Uses fractal dimension of edge contours within high-gradient regions. 
+ """ + from scipy.ndimage import sobel, binary_dilation + from skimage.feature import canny + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + + # Detect edges + edges = canny(gray_f, sigma=1.5) + if edges.sum() < 20: + return { + "stroke_edge_roughness": 0.0, + "stroke_edge_length_var": 0.0, + "stroke_edge_curvature_mean": 0.0, + "stroke_edge_curvature_std": 0.0, + } + + # Find strong gradient regions (likely strokes) + gx = sobel(gray_f, axis=1) + gy = sobel(gray_f, axis=0) + mag = np.sqrt(gx ** 2 + gy ** 2) + stroke_mask = mag > np.percentile(mag, 80) + + # Dilate stroke mask and intersect with edges = stroke edges + stroke_dilated = binary_dilation(stroke_mask, iterations=2) + stroke_edges = edges & stroke_dilated + + # Edge roughness: ratio of edge pixels to the convex area they span + # More rough = more edge pixels per unit area + if stroke_edges.sum() > 5: + from scipy.ndimage import label + labeled, n_components = label(binary_dilation(stroke_edges, iterations=1)) + lengths = [] + for i in range(1, min(n_components + 1, 50)): # cap at 50 components + component = (labeled == i) + n_pixels = component.sum() + if n_pixels > 3: + lengths.append(n_pixels) + + roughness = float(stroke_edges.sum()) / (stroke_dilated.sum() + 1e-10) + length_var = float(np.var(lengths)) if len(lengths) > 1 else 0.0 + + # Local curvature via direction changes along edges + edge_y, edge_x = np.where(stroke_edges) + if len(edge_y) > 10: + # Sample direction changes + dirs = np.arctan2(np.diff(edge_y.astype(float)), np.diff(edge_x.astype(float))) + curvatures = np.abs(np.diff(dirs)) + curvatures = np.minimum(curvatures, 2 * np.pi - curvatures) # wrap + curv_mean = float(curvatures.mean()) + curv_std = float(curvatures.std()) + else: + curv_mean, curv_std = 0.0, 0.0 + else: + roughness, length_var, curv_mean, curv_std = 0.0, 0.0, 0.0, 0.0 + + return { + "stroke_edge_roughness": roughness, + "stroke_edge_length_var": length_var, + "stroke_edge_curvature_mean": curv_mean, + 
"stroke_edge_curvature_std": curv_std, + } + + +def _color_gradient_curvature_features(rgb: NDArray) -> dict[str, float]: + """Color gradient curvature in blended regions (4 features). + + Physical paint mixing (subtractive) curves through lower saturation/luminance. + Digital blending produces straighter paths in color space. + """ + from skimage.color import rgb2lab + from scipy.ndimage import sobel + + rgb_f = rgb / 255.0 if rgb.max() > 1 else rgb.copy() + try: + lab = rgb2lab(rgb_f) + except (MemoryError, Exception): + return { + "color_grad_curvature_mean": 0.0, + "color_grad_curvature_std": 0.0, + "blend_saturation_dip": 0.0, + "blend_lightness_dip": 0.0, + } + + # Find blended regions: moderate gradient magnitude + grad_l = np.sqrt(sobel(lab[:, :, 0], axis=0) ** 2 + sobel(lab[:, :, 0], axis=1) ** 2) + grad_a = np.sqrt(sobel(lab[:, :, 1], axis=0) ** 2 + sobel(lab[:, :, 1], axis=1) ** 2) + grad_b = np.sqrt(sobel(lab[:, :, 2], axis=0) ** 2 + sobel(lab[:, :, 2], axis=1) ** 2) + color_grad = grad_a + grad_b + + # Moderate gradient = blending (not edges, not flat) + p30 = np.percentile(color_grad, 30) + p70 = np.percentile(color_grad, 70) + blend_mask = (color_grad > p30) & (color_grad < p70) + + if blend_mask.sum() < 100: + return { + "color_grad_curvature_mean": 0.0, + "color_grad_curvature_std": 0.0, + "blend_saturation_dip": 0.0, + "blend_lightness_dip": 0.0, + } + + # Sample horizontal lines through blend regions, measure color path curvature + h, w = rgb_f.shape[:2] + curvatures = [] + sat_dips = [] + light_dips = [] + + for row in range(0, h, 8): + cols = np.where(blend_mask[row])[0] + if len(cols) < 10: + continue + # Take the Lab values along this row at blend pixels + path_lab = lab[row, cols] + if len(path_lab) < 3: + continue + # Compute curvature: deviation from straight line in Lab space + start = path_lab[0] + end = path_lab[-1] + n = len(path_lab) + t = np.linspace(0, 1, n) + straight = start[None, :] + t[:, None] * (end - start)[None, :] + 
deviations = np.linalg.norm(path_lab - straight, axis=1) + curvatures.append(float(deviations.mean())) + + # Saturation dip: min chroma along path vs endpoints + chroma = np.sqrt(path_lab[:, 1] ** 2 + path_lab[:, 2] ** 2) + endpoint_chroma = (chroma[0] + chroma[-1]) / 2 + if endpoint_chroma > 1: + sat_dips.append(float(chroma.min() / endpoint_chroma)) + + # Lightness dip + endpoint_L = (path_lab[0, 0] + path_lab[-1, 0]) / 2 + if endpoint_L > 1: + light_dips.append(float(path_lab[:, 0].min() / endpoint_L)) + + return { + "color_grad_curvature_mean": float(np.mean(curvatures)) if curvatures else 0.0, + "color_grad_curvature_std": float(np.std(curvatures)) if curvatures else 0.0, + "blend_saturation_dip": float(np.mean(sat_dips)) if sat_dips else 0.0, + "blend_lightness_dip": float(np.mean(light_dips)) if light_dips else 0.0, + } + + +def _patch_selfsimilarity_features(gray: NDArray) -> dict[str, float]: + """Patch self-similarity statistics (4 features). + + AI generators sometimes produce suspiciously similar patches in textured + regions due to attention mechanisms and tiling. Human art has more + natural variation. 
+ """ + gray_f = gray if gray.max() <= 1 else gray / 255.0 + h, w = gray_f.shape + patch_size = 16 + stride = 16 + + # Extract non-overlapping patches + patches = [] + for y in range(0, h - patch_size, stride): + for x in range(0, w - patch_size, stride): + patch = gray_f[y:y+patch_size, x:x+patch_size].ravel() + patches.append(patch) + + if len(patches) < 10: + return { + "selfsim_min_dist": 0.0, + "selfsim_mean_min_dist": 0.0, + "selfsim_near_duplicate_ratio": 0.0, + "selfsim_dist_std": 0.0, + } + + patches = np.array(patches) + n = len(patches) + + # Normalize patches + norms = np.linalg.norm(patches, axis=1, keepdims=True) + patches_norm = patches / (norms + 1e-10) + + # Compute cosine similarity matrix (sample if too many patches) + if n > 200: + idx = np.random.default_rng(42).choice(n, 200, replace=False) + patches_norm = patches_norm[idx] + n = 200 + + sim_matrix = patches_norm @ patches_norm.T + # Zero out diagonal + np.fill_diagonal(sim_matrix, -1) + + # Best match for each patch (excluding self) + max_sims = sim_matrix.max(axis=1) + + # Near-duplicate ratio: patches with similarity > 0.95 + near_dup_ratio = float((max_sims > 0.95).mean()) + + return { + "selfsim_min_dist": float(1 - max_sims.max()), # smallest distance between any two patches + "selfsim_mean_min_dist": float(1 - max_sims.mean()), + "selfsim_near_duplicate_ratio": near_dup_ratio, + "selfsim_dist_std": float(max_sims.std()), + } + + +def _cross_subband_correlation_features(gray: NDArray) -> dict[str, float]: + """Cross-subband wavelet correlation (4 features). + + Natural images have specific cross-band correlation structures. + AI-generated images often have anomalous relationships between + frequency subbands. 
+ """ + import pywt + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + + # 2-level wavelet decomposition + coeffs = pywt.wavedec2(gray_f, "haar", level=2) + + # Level 1 details: (LH1, HL1, HH1) + lh1, hl1, hh1 = coeffs[2] + # Level 2 details: (LH2, HL2, HH2) + lh2, hl2, hh2 = coeffs[1] + + # Resize level 2 to match level 1 size for correlation + from skimage.transform import resize + lh2_up = resize(lh2, lh1.shape, order=1, anti_aliasing=False) + hl2_up = resize(hl2, hl1.shape, order=1, anti_aliasing=False) + + # Cross-band correlations + def _safe_corr(a: NDArray, b: NDArray) -> float: + a_flat, b_flat = a.ravel(), b.ravel() + if a_flat.std() < 1e-10 or b_flat.std() < 1e-10: + return 0.0 + return float(np.corrcoef(a_flat, b_flat)[0, 1]) + + # Within-level: LH vs HL correlation (directional consistency) + lh_hl_corr_l1 = _safe_corr(lh1, hl1) + + # Cross-level: LH1 vs LH2 (scale consistency) + lh_cross_corr = _safe_corr(lh1, lh2_up) + + # Cross-level: HL1 vs HL2 + hl_cross_corr = _safe_corr(hl1, hl2_up) + + # HH ratio between levels (detail energy ratio) + hh1_energy = float((hh1 ** 2).mean()) + hh2_energy = float((hh2 ** 2).mean()) + hh_energy_ratio = hh1_energy / (hh2_energy + 1e-10) + + return { + "wavelet_lh_hl_corr_l1": lh_cross_corr, + "wavelet_lh_cross_level_corr": lh_cross_corr, + "wavelet_hl_cross_level_corr": hl_cross_corr, + "wavelet_hh_energy_ratio": hh_energy_ratio, + } + + +def _linework_features(gray: NDArray) -> dict[str, float]: + """Anime/illustration line work analysis (8 features). + + AI generators struggle with consistent stroke thickness and medium + coherence in line art. Per AnimeDL-2M (2025), anime images have + distinctive sharp, well-defined lines that AI mimics imperfectly. 
+ """ + from skimage.feature import canny + from scipy.ndimage import distance_transform_edt, label + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + + # Detect edges at two sensitivity levels + edges_tight = canny(gray_f, sigma=1.0, low_threshold=0.1, high_threshold=0.3) + edges_loose = canny(gray_f, sigma=1.5, low_threshold=0.05, high_threshold=0.15) + + if edges_tight.sum() < 10: + return {k: 0.0 for k in [ + "line_thickness_mean", "line_thickness_std", "line_thickness_cv", + "line_density", "line_straightness", + "edge_sharpness_mean", "edge_sharpness_std", "medium_consistency", + ]} + + # Line thickness via distance transform + # Invert edges to get distance to nearest edge, then sample at edge pixels + dist_map = distance_transform_edt(~edges_tight) + # Thickness = local width of strokes. Use loose edges as stroke regions. + stroke_regions = edges_loose + if stroke_regions.sum() > 0: + thicknesses = dist_map[stroke_regions] + thickness_mean = float(thicknesses.mean()) + thickness_std = float(thicknesses.std()) + thickness_cv = thickness_std / (thickness_mean + 1e-10) + else: + thickness_mean, thickness_std, thickness_cv = 0.0, 0.0, 0.0 + + # Line density: fraction of image that is edges + line_density = float(edges_tight.sum() / edges_tight.size) + + # Line straightness: ratio of connected component extent to perimeter + labeled_edges, n_components = label(edges_tight) + straightness_values = [] + for i in range(1, min(n_components + 1, 30)): + component = (labeled_edges == i) + n_pixels = component.sum() + if n_pixels < 5: + continue + ys, xs = np.where(component) + extent = max(ys.max() - ys.min(), xs.max() - xs.min(), 1) + straightness_values.append(n_pixels / extent) + line_straightness = float(np.mean(straightness_values)) if straightness_values else 0.0 + + # Edge sharpness: gradient magnitude at edge pixels + from scipy.ndimage import sobel as ndimage_sobel + gx = ndimage_sobel(gray_f, axis=1) + gy = ndimage_sobel(gray_f, axis=0) + grad_mag = 
np.sqrt(gx ** 2 + gy ** 2) + edge_gradients = grad_mag[edges_tight] + edge_sharpness_mean = float(edge_gradients.mean()) + edge_sharpness_std = float(edge_gradients.std()) + + # Medium consistency: how uniform is the texture in non-edge regions + # Human artists use consistent medium; AI mixes characteristics + non_edge = ~edges_loose + if non_edge.sum() > 100: + # Variance of local texture in non-edge regions (patch-based) + h, w = gray_f.shape + patch_vars = [] + for y in range(0, h - 16, 16): + for x in range(0, w - 16, 16): + patch = gray_f[y:y + 16, x:x + 16] + patch_edge = edges_tight[y:y + 16, x:x + 16] + if patch_edge.mean() < 0.1: # non-edge patch + patch_vars.append(float(patch.var())) + medium_consistency = float(np.std(patch_vars)) if len(patch_vars) > 5 else 0.0 + else: + medium_consistency = 0.0 + + return { + "line_thickness_mean": thickness_mean, + "line_thickness_std": thickness_std, + "line_thickness_cv": thickness_cv, + "line_density": line_density, + "line_straightness": line_straightness, + "edge_sharpness_mean": edge_sharpness_mean, + "edge_sharpness_std": edge_sharpness_std, + "medium_consistency": medium_consistency, + } + + class ArtworkExtract: """Extract artwork features for AI detection. - Combines the 39 features from Li & Stamp (2025) with a dedicated - frequency analysis branch (10 features) for 49 total features. - - All features are CPU-only and work on any image type (photos, - illustrations, artwork). No pretrained models required. + Combines features from multiple sources: + - 39 features from Li & Stamp (2025) + - 10 FFT/DCT spectral features + - 14 enhanced texture features (Nirob et al. 
2026)
+    - 4 mid-band frequency features (FIRE, CVPR 2025)
+    - 6 patch consistency features (CINEMAE 2025)
+    - 8 multi-scale LBP features
+    - 18 Gabor filter bank features
+    - 12 wavelet packet statistics
+    - 8 linework features (line thickness/straightness/sharpness)
+    - 8 edge co-occurrence features
+    - 2 fractal dimension features
+    - 6 extended HOG features
+    - 4 JPEG ghost detection features
+    - 5 noise residual autocorrelation features
+    - 4 stroke edge roughness features
+    - 4 color gradient curvature features
+    - 4 patch self-similarity features
+    - (color coherence and cross-subband features dropped after ablation)
+    Total: 156 features (148 core + 8 linework), all CPU-only.

     Usage:
         >>> extractor = ArtworkExtract()
         >>> features = extractor(pil_image)
-        >>> len(features)  # 49
+        >>> len(features)  # 156
     """

     def __call__(self, image: Image.Image) -> dict[str, float]:
@@ -242,6 +1140,22 @@ def __call__(self, image: Image.Image) -> dict[str, float]:
         features |= _shape_features(gray)
         features |= _noise_features(gray)
         features |= _frequency_features(gray)
+        features |= _enhanced_texture_features(gray)
+        features |= _midband_frequency_features(gray)
+        features |= _patch_consistency_features(gray)
+        features |= _multiscale_lbp_features(gray)
+        features |= _gabor_features(gray)
+        features |= _wavelet_packet_features(gray)
+        # color_coherence and cross_subband removed — ablation showed they hurt accuracy
+        features |= _edge_cooccurrence_features(gray)
+        features |= _fractal_dimension_features(gray)
+        features |= _noise_residual_autocorr_features(gray)
+        features |= _stroke_edge_roughness_features(gray)
+        features |= _color_gradient_curvature_features(rgb)
+        features |= _patch_selfsimilarity_features(gray)
+        features |= _extended_hog_features(gray)
+        features |= _jpeg_ghost_features(rgb)
+        features |= _linework_features(gray)

         return features

diff --git a/negate/extract/feature_learned.py b/negate/extract/feature_learned.py
new file mode 100644
index 0000000..7a17169
--- /dev/null
+++ b/negate/extract/feature_learned.py
@@ -0,0 +1,124 @@
+# 
SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 +# + +"""Learned feature extraction via frozen ConvNeXt-Tiny. + +Complements the 148 handcrafted features with 768 learned features from +a frozen ImageNet-pretrained ConvNeXt-Tiny model. The learned features +capture visual patterns that handcrafted features miss — particularly +artifacts from novel generator architectures. + +Key properties: + - 768-dimensional output (penultimate layer of ConvNeXt-Tiny) + - Frozen weights — no fine-tuning, no GPU training needed + - ~28 img/s on CPU (25x faster than handcrafted features) + - NOT CLIP-based — no text encoder bias + - NOT DINOv2 — ConvNeXt has different inductive biases (local + hierarchical) + +Unlike CLIP (which we proved has generator bias), ConvNeXt-Tiny is purely +visual and pretrained on ImageNet classification — it has no special +relationship with any generator architecture. +""" + +from __future__ import annotations + +import numpy as np +import torch +from numpy.typing import NDArray +from PIL import Image + + +class LearnedExtract: + """Extract 768 learned features from a frozen ConvNeXt-Tiny model. 
+ + Usage: + >>> extractor = LearnedExtract() + >>> features = extractor(pil_image) # returns dict of 768 floats + >>> len(features) # 768 + """ + + def __init__(self): + import timm + + self._model = timm.create_model("convnext_tiny.fb_in22k", pretrained=True, num_classes=0) + self._model.eval() + self._transform = timm.data.create_transform( + **timm.data.resolve_data_config(self._model.pretrained_cfg) + ) + + @torch.no_grad() + def __call__(self, image: Image.Image) -> dict[str, float]: + """Extract 768 features from a PIL image.""" + image = image.convert("RGB") + inp = self._transform(image).unsqueeze(0) + feat = self._model(inp).squeeze(0).numpy() + return {f"cnxt_{i}": float(feat[i]) for i in range(len(feat))} + + @torch.no_grad() + def batch(self, images: list[Image.Image], batch_size: int = 32) -> NDArray: + """Extract features from a batch of images. Returns (N, 768) array.""" + all_feats = [] + for i in range(0, len(images), batch_size): + batch_imgs = images[i:i + batch_size] + tensors = [] + for img in batch_imgs: + try: + tensors.append(self._transform(img.convert("RGB"))) + except Exception: + tensors.append(torch.zeros(3, 224, 224)) + batch_tensor = torch.stack(tensors) + feats = self._model(batch_tensor).numpy() + all_feats.append(feats) + return np.vstack(all_feats) if all_feats else np.empty((0, 768)) + + @torch.no_grad() + def perturb_compare(self, image: Image.Image, sigma: float = 5.0) -> dict[str, float]: + """Compare ConvNeXt features of clean vs slightly noisy image. + + Real images change more under perturbation than AI images because + AI images sit on the generator's learned manifold and are more + stable to small noise. Inspired by RIGID (DINOv2 perturbation check). + + :param image: PIL Image. + :param sigma: Gaussian noise standard deviation. + :returns: Dictionary with perturbation comparison metrics. 
+ """ + image = image.convert("RGB") + arr = np.array(image, dtype=np.float64) + + # Add small Gaussian noise + noise = np.random.RandomState(42).normal(0, sigma, arr.shape) + noisy_arr = np.clip(arr + noise, 0, 255).astype(np.uint8) + noisy_image = Image.fromarray(noisy_arr) + + # Extract features for both + clean_inp = self._transform(image).unsqueeze(0) + noisy_inp = self._transform(noisy_image).unsqueeze(0) + + clean_feat = self._model(clean_inp).squeeze(0).numpy() + noisy_feat = self._model(noisy_inp).squeeze(0).numpy() + + # Cosine distance + dot = np.dot(clean_feat, noisy_feat) + norm_clean = np.linalg.norm(clean_feat) + norm_noisy = np.linalg.norm(noisy_feat) + cosine_sim = dot / (norm_clean * norm_noisy + 1e-10) + + # L2 distance + l2_dist = float(np.linalg.norm(clean_feat - noisy_feat)) + + # Per-dimension change statistics + diff = np.abs(clean_feat - noisy_feat) + + return { + "perturb_cosine_dist": float(1.0 - cosine_sim), + "perturb_l2_dist": l2_dist, + "perturb_max_change": float(diff.max()), + "perturb_mean_change": float(diff.mean()), + } + + def feature_names(self) -> list[str]: + return [f"cnxt_{i}" for i in range(768)] + + def perturb_feature_names(self) -> list[str]: + return ["perturb_cosine_dist", "perturb_l2_dist", "perturb_max_change", "perturb_mean_change"] diff --git a/results/EXPERIMENTS.md b/results/EXPERIMENTS.md deleted file mode 100644 index 820572c..0000000 --- a/results/EXPERIMENTS.md +++ /dev/null @@ -1,197 +0,0 @@ -# Experiment Log: AI Artwork Detection Feature Analysis - -> negate project — darkshapes -> Date: March 23, 2026 -> Dataset: [Hemg/AI-Generated-vs-Real-Images-Datasets](https://huggingface.co/datasets/Hemg/AI-Generated-vs-Real-Images-Datasets) (AI Art vs Real Art, 153K images) -> Evaluation: 5-fold stratified cross-validation, 4000 samples (2000 per class) - ---- - -## Summary Table - -| # | Experiment | Features | Best Acc | Precision | Recall | AUC | Model | Code | 
-|---|-----------|----------|----------|-----------|--------|-----|-------|------| -| 0 | Baseline (existing negate) | 26 | 63.3% | — | — | 0.669 | XGBoost | `negate/train.py` | -| 1 | Artwork (Li & Stamp + FFT) | 49 | 79.4% | ~79% | ~80% | 0.886 | XGBoost | `negate/extract/feature_artwork.py` | -| 2 | Style (stroke/palette/comp) | 15 | 78.8% | ~79% | ~78% | 0.883 | XGBoost | `negate/extract/feature_style.py` | -| 3 | Artwork + Style combined | 64 | 83.5% | ~83% | ~84% | 0.923 | XGBoost | experiments 1+2 concatenated | -| 4 | CLIP ViT-B/32 embeddings | 512 | 89.3% | ~89% | ~89% | 0.963 | SVM | `tests/test_experiments.py:108-139` | -| 5 | All combined | 576 | 90.0% | ~90% | ~90% | 0.966 | SVM | experiments 1+2+4 concatenated | - ---- - -## Experiment 0: Baseline (Existing negate Pipeline) - -**What it does**: Haar wavelet decomposition + DINOv3 ViT features + Flux/SANA VAE reconstruction loss → PCA → XGBoost. - -**Code**: [`negate/train.py`](../negate/train.py), [`negate/decompose/wavelet.py`](../negate/decompose/wavelet.py), [`negate/extract/feature_vit.py`](../negate/extract/feature_vit.py), [`negate/extract/feature_vae.py`](../negate/extract/feature_vae.py) - -**Result**: 63.3% accuracy, 0.669 AUC (from existing training runs in `results/`) - -**Limitations**: -- Requires GPU + multi-GB model downloads (ViT, VAE) -- Wavelet features may not capture art-specific artifacts -- Tested on different datasets (not Hemg), so not directly comparable -- The heavy pipeline may introduce noise that dilutes useful signal - ---- - -## Experiment 1: Artwork Features (49) - -**What it does**: Implements the 39-feature extraction from [Li & Stamp, "Detecting AI-generated Artwork", arXiv:2504.07078](https://arxiv.org/abs/2504.07078), extended with 10 FFT/DCT frequency analysis features. 
- -**Feature categories**: -- Brightness (2): mean, entropy -- Color (23): RGB/HSV histogram stats (mean, var, kurtosis, skew, entropy) -- Texture (6): GLCM (contrast, correlation, energy, homogeneity) + LBP -- Shape (6): HOG statistics + Canny edge length -- Noise (2): noise entropy, SNR -- Frequency (10): FFT band energies, spectral centroid, DCT analysis, phase coherence - -**Code**: [`negate/extract/feature_artwork.py`](../negate/extract/feature_artwork.py) - -**Result**: 79.4% accuracy, 0.886 AUC (XGBoost) - -**Limitations**: -- Hand-crafted features can't adapt to new generator types -- Color/brightness features may capture dataset bias (e.g., if AI art tends to be more saturated) -- No spatial awareness — features are global statistics - ---- - -## Experiment 2: Style Features (15) - -**What it does**: Extracts features targeting properties of human artistic craft that AI generators struggle to replicate. - -**Feature categories**: -- Stroke analysis (4): gradient direction entropy, local direction variance, pressure kurtosis, stroke length variance -- Color palette (4): palette richness, hue entropy, harmony peaks, temperature variance -- Composition (4): rule-of-thirds energy ratio, bilateral symmetry, focal point strength, center edge ratio -- Micro-texture (3): patch entropy variance, grain regularity (autocorrelation), brushwork periodicity (FFT peak ratio) - -**Code**: [`negate/extract/feature_style.py`](../negate/extract/feature_style.py) - -**Result**: 78.8% accuracy, 0.883 AUC (XGBoost) - -**Limitations**: -- Only 15 features — limited capacity -- Stroke analysis assumes visible brush strokes (fails on smooth digital art) -- Composition features (rule-of-thirds, symmetry) may not differ between AI and human art -- ~2x slower than artwork features (7 img/s vs 16 img/s) due to patch-level analysis - -**Interesting finding**: Nearly identical performance to the 49 artwork features despite having 3x fewer features. 
This suggests the style features capture orthogonal signal — confirmed by experiment 3 where combining them jumps to 83.5%. - ---- - -## Experiment 3: Artwork + Style Combined (64) - -**What it does**: Concatenates all 49 artwork features + 15 style features per image. - -**Code**: Feature extraction from experiments 1+2, concatenated in [`tests/test_experiments.py:309-316`](../tests/test_experiments.py) - -**Result**: 83.5% accuracy, 0.923 AUC (XGBoost) — **+4.1pp over best individual** - -**Why it works**: The two feature sets capture different aspects: -- Artwork features capture statistical properties (histograms, frequency spectra) -- Style features capture spatial/structural properties (strokes, composition, texture regularity) -- XGBoost can learn which features matter for which types of images - -**Limitations**: -- Still hand-crafted — ceiling is limited by human feature engineering -- 64 features is small enough that XGBoost works well, but not enough to capture all relevant patterns - ---- - -## Experiment 4: CLIP ViT-B/32 Embeddings (512) - -**What it does**: Passes each image through OpenAI's CLIP vision encoder (`openai/clip-vit-base-patch32`) and uses the 512-dimensional pooled embedding as features. No fine-tuning — just the pretrained embedding. - -**Code**: [`tests/test_experiments.py:108-139`](../tests/test_experiments.py) (uses `transformers.CLIPModel`) - -**Result**: 89.3% accuracy, 0.963 AUC (SVM) — **+9.9pp over best hand-crafted** - -**Why it works**: CLIP was trained on 400M image-text pairs. Its embeddings encode rich visual semantics including texture, style, composition, and content — everything our hand-crafted features try to capture, but learned from data at massive scale. - -**Why SVM wins here**: In 512-dimensional space, SVM's RBF kernel finds better decision boundaries than XGBoost's tree splits. This is typical for high-dimensional dense features. 
- -**Limitations**: -- Requires ~300MB model download -- CLIP was not trained for forensic detection — it captures semantic similarity, not generation artifacts -- May fail on adversarial examples designed to fool CLIP -- Not fine-tuned on this task — fine-tuning would likely improve further -- Inference is slower (~32 img/batch on GPU vs 16 img/s CPU for hand-crafted) - ---- - -## Experiment 5: All Combined (576) - -**What it does**: Concatenates CLIP embeddings (512) + Artwork features (49) + Style features (15) = 576 features. - -**Code**: [`tests/test_experiments.py:342-349`](../tests/test_experiments.py) - -**Result**: 90.0% accuracy, 0.966 AUC (SVM) — **+0.7pp over CLIP alone** - -**Why the improvement is tiny**: CLIP embeddings already encode most of the information that hand-crafted features capture. The marginal gain from adding 64 hand-crafted features to 512 learned features is small because the signal is redundant. - -**Limitations**: -- Barely worth the extra computation vs CLIP alone -- Feature dimensionality (576) is high — may overfit on smaller datasets - ---- - -## Scaling Analysis - -Tested artwork features (49) at increasing sample sizes on the same Hemg dataset: - -| Samples | Best Accuracy | AUC | -|---------|--------------|-----| -| 400 | 70.0% | 0.790 | -| 1,000 | 75.8% | 0.844 | -| 2,000 | 77.8% | 0.858 | -| 4,000 | 79.5% | 0.888 | - -**Code**: [`tests/test_scale_evaluation.py`](../tests/test_scale_evaluation.py) -**PDF**: `results/scale_evaluation_20260322_235906.pdf` - -**Finding**: Accuracy climbs steadily but is flattening. Hand-crafted features likely plateau around 82-85% with more data. CLIP at 89.3% on the same 4000 samples already exceeds this ceiling. - ---- - -## Overall Conclusions - -### What worked -1. **CLIP embeddings are the clear winner** — 89.3% with zero feature engineering -2. **Combining orthogonal hand-crafted features helps** — Art+Style (83.5%) > either alone -3. 
**More data helps** — 70% → 79.5% going from 400 to 4000 samples -4. **Frequency features (FFT/DCT) add real signal** — the 10 frequency features in the artwork extractor are consistently important - -### What didn't work -1. **Hand-crafted features alone can't match learned representations** — 79.4% vs 89.3% -2. **Adding hand-crafted features to CLIP barely helps** — 90.0% vs 89.3% (+0.7pp) -3. **Style features alone aren't better than generic statistics** — 78.8% vs 79.4% - -### Remaining confounds -- The Hemg dataset labels are "AiArtData" vs "RealArt" — we don't know if the AI art was generated to look like the real art (semantic matching) -- Image resolution and format may differ between classes -- We haven't tested robustness to JPEG compression, resizing, or adversarial perturbation - -### Recommendation -**For the negate pipeline**: Replace the GPU-heavy VIT+VAE features with CLIP embeddings. This gives: -- +26pp accuracy improvement (63% → 89%) -- Simpler pipeline (one model instead of VIT + VAE + wavelets) -- Smaller download (~300MB vs multi-GB) -- Still works on CPU (slower but functional) - -**For research**: Fine-tuning CLIP on art-specific detection data, or using DINOv2 (which captures more structural features), could push accuracy further. The self-supervised camera-metadata approach from Zhong et al. (2026) is also worth exploring for robustness. 
- ---- - -## Generated PDFs - -| Report | File | What it shows | -|--------|------|--------------| -| Artwork detection benchmark | `results/artwork_detection_results.pdf` | Initial 49-feature results on wikiart | -| Proof compilation | `results/proof_compilation.pdf` | First end-to-end test (cats vs bananas — confounded) | -| Fair evaluation | `results/fair_evaluation_20260322_235151.pdf` | Hemg art-vs-art + Parveshiiii results | -| Scaling analysis | `results/scale_evaluation_20260322_235906.pdf` | 400→4000 sample scaling curves | -| Experiments comparison | `results/experiments_comparison_20260323_094054.pdf` | All 5 experiments side-by-side | diff --git a/results/adversarial_robustness_results.json b/results/adversarial_robustness_results.json new file mode 100644 index 0000000..44aad98 --- /dev/null +++ b/results/adversarial_robustness_results.json @@ -0,0 +1,144 @@ +{ + "clean_baseline": { + "acc": 0.9444444444444444, + "auc": 0.9894736842105263 + }, + "perturbation_robustness": { + "clean": { + "acc": 0.9444444444444444, + "auc": 0.9894736842105263 + }, + "jpeg_q30": { + "acc": 0.9222222222222223, + "auc": 0.9815479876160991, + "delta": -0.022222222222222143 + }, + "jpeg_q50": { + "acc": 0.9388888888888889, + "auc": 0.9879876160990712, + "delta": -0.005555555555555536 + }, + "jpeg_q70": { + "acc": 0.9333333333333333, + "auc": 0.9872445820433436, + "delta": -0.011111111111111072 + }, + "noise_s5": { + "acc": 0.9222222222222223, + "auc": 0.9796904024767803, + "delta": -0.022222222222222143 + }, + "noise_s15": { + "acc": 0.75, + "auc": 0.8918885448916408, + "delta": -0.19444444444444442 + }, + "noise_s30": { + "acc": 0.6611111111111111, + "auc": 0.7359752321981424, + "delta": -0.2833333333333333 + }, + "resize_128": { + "acc": 0.5888888888888889, + "auc": 0.8537461300309598, + "delta": -0.3555555555555555 + }, + "resize_64": { + "acc": 0.48333333333333334, + "auc": 0.7312693498452012, + "delta": -0.4611111111111111 + }, + "crop_70pct": { + "acc": 
0.8944444444444445, + "auc": 0.9699071207430341, + "delta": -0.04999999999999993 + }, + "crop_50pct": { + "acc": 0.8166666666666667, + "auc": 0.9071207430340558, + "delta": -0.12777777777777777 + }, + "blur_r1": { + "acc": 0.8888888888888888, + "auc": 0.9627244582043344, + "delta": -0.05555555555555558 + }, + "blur_r3": { + "acc": 0.5666666666666667, + "auc": 0.8222910216718265, + "delta": -0.37777777777777777 + }, + "bright_0.7": { + "acc": 0.9444444444444444, + "auc": 0.9902167182662538, + "delta": 0.0 + }, + "bright_1.3": { + "acc": 0.9, + "auc": 0.9799380804953561, + "delta": -0.0444444444444444 + }, + "social_media": { + "acc": 0.9333333333333333, + "auc": 0.9884829721362229, + "delta": -0.011111111111111072 + } + }, + "threshold_calibration": { + "0.3": { + "precision": 0.8679245283018868, + "recall": 0.92, + "f1": 0.8899009108197378, + "fp_rate": 0.14 + }, + "0.4": { + "precision": 0.873015873015873, + "recall": 0.9166666666666666, + "f1": 0.8915989159891599, + "fp_rate": 0.13333333333333333 + }, + "0.5": { + "precision": 0.8892508143322475, + "recall": 0.91, + "f1": 0.8983194934866134, + "fp_rate": 0.11333333333333333 + }, + "0.6": { + "precision": 0.8943894389438944, + "recall": 0.9033333333333333, + "f1": 0.8983307916031233, + "fp_rate": 0.10666666666666667 + }, + "0.7": { + "precision": 0.893687707641196, + "recall": 0.8966666666666666, + "f1": 0.8949997083325232, + "fp_rate": 0.10666666666666667 + }, + "0.8": { + "precision": 0.902027027027027, + "recall": 0.89, + "f1": 0.8966620738699498, + "fp_rate": 0.09666666666666666 + }, + "0.9": { + "precision": 0.9065743944636678, + "recall": 0.8733333333333333, + "f1": 0.8916302423870245, + "fp_rate": 0.09 + }, + "0.95": { + "precision": 0.9142857142857143, + "recall": 0.8533333333333334, + "f1": 0.8865406006674083, + "fp_rate": 0.08 + } + }, + "recommended_threshold": null, + "worst_case_adversarial": { + "acc": 0.65, + "auc": 0.8144891640866874 + }, + "n_real": 300, + "n_fake": 300 +} \ No newline at end of 
file diff --git a/results/artwork_detection_results.pdf b/results/artwork_detection_results.pdf deleted file mode 100644 index c3aa62f..0000000 Binary files a/results/artwork_detection_results.pdf and /dev/null differ diff --git a/results/clip_bias_defactify_results.json b/results/clip_bias_defactify_results.json new file mode 100644 index 0000000..bc29562 --- /dev/null +++ b/results/clip_bias_defactify_results.json @@ -0,0 +1,160 @@ +{ + "timestamp": "2026-03-23T16:43:55.933744", + "results": [ + { + "generator": "SD 2.1", + "uses_clip": true, + "handcrafted_xgb": { + "accuracy": 0.865, + "precision": 0.8489483747609943, + "recall": 0.888, + "roc_auc": 0.939208 + }, + "handcrafted_svm": { + "accuracy": 0.857, + "precision": 0.8577154308617234, + "recall": 0.856, + "roc_auc": 0.939508 + }, + "clip_xgb": { + "accuracy": 0.913, + "precision": 0.9089108910891089, + "recall": 0.918, + "roc_auc": 0.970512 + }, + "clip_svm": { + "accuracy": 0.961, + "precision": 0.9528487229862476, + "recall": 0.97, + "roc_auc": 0.9941260000000001 + }, + "handcrafted_best": 0.865, + "clip_best": 0.961, + "clip_advantage": 0.09599999999999997 + }, + { + "generator": "SDXL", + "uses_clip": true, + "handcrafted_xgb": { + "accuracy": 0.935, + "precision": 0.9190751445086706, + "recall": 0.954, + "roc_auc": 0.9791160000000001 + }, + "handcrafted_svm": { + "accuracy": 0.931, + "precision": 0.9284294234592445, + "recall": 0.934, + "roc_auc": 0.9827159999999999 + }, + "clip_xgb": { + "accuracy": 0.944, + "precision": 0.944, + "recall": 0.944, + "roc_auc": 0.9869559999999999 + }, + "clip_svm": { + "accuracy": 0.99, + "precision": 0.9880478087649402, + "recall": 0.992, + "roc_auc": 0.999656 + }, + "handcrafted_best": 0.935, + "clip_best": 0.99, + "clip_advantage": 0.05499999999999994 + }, + { + "generator": "SD 3", + "uses_clip": true, + "handcrafted_xgb": { + "accuracy": 0.833, + "precision": 0.8363636363636363, + "recall": 0.828, + "roc_auc": 0.9075080000000001 + }, + "handcrafted_svm": { + 
"accuracy": 0.854, + "precision": 0.8657024793388429, + "recall": 0.838, + "roc_auc": 0.913932 + }, + "clip_xgb": { + "accuracy": 0.92, + "precision": 0.9216867469879518, + "recall": 0.918, + "roc_auc": 0.9740960000000001 + }, + "clip_svm": { + "accuracy": 0.975, + "precision": 0.9702970297029703, + "recall": 0.98, + "roc_auc": 0.9947839999999999 + }, + "handcrafted_best": 0.854, + "clip_best": 0.975, + "clip_advantage": 0.121 + }, + { + "generator": "Midjourney v6", + "uses_clip": "unknown", + "handcrafted_xgb": { + "accuracy": 0.868, + "precision": 0.859375, + "recall": 0.88, + "roc_auc": 0.941412 + }, + "handcrafted_svm": { + "accuracy": 0.885, + "precision": 0.8888888888888888, + "recall": 0.88, + "roc_auc": 0.9592140000000001 + }, + "clip_xgb": { + "accuracy": 0.972, + "precision": 0.9796747967479674, + "recall": 0.964, + "roc_auc": 0.997228 + }, + "clip_svm": { + "accuracy": 0.995, + "precision": 0.9959919839679359, + "recall": 0.994, + "roc_auc": 0.999872 + }, + "handcrafted_best": 0.885, + "clip_best": 0.995, + "clip_advantage": 0.10999999999999999 + }, + { + "generator": "DALL-E 3", + "uses_clip": false, + "handcrafted_xgb": { + "accuracy": 0.987, + "precision": 0.9821782178217822, + "recall": 0.992, + "roc_auc": 0.9957779999999999 + }, + "handcrafted_svm": { + "accuracy": 0.976, + "precision": 0.9703557312252964, + "recall": 0.982, + "roc_auc": 0.995222 + }, + "clip_xgb": { + "accuracy": 0.955, + "precision": 0.9540918163672655, + "recall": 0.956, + "roc_auc": 0.990896 + }, + "clip_svm": { + "accuracy": 0.982, + "precision": 0.9781746031746031, + "recall": 0.986, + "roc_auc": 0.99888 + }, + "handcrafted_best": 0.987, + "clip_best": 0.982, + "clip_advantage": -0.0050000000000000044 + } + ] +} \ No newline at end of file diff --git a/results/experiments_comparison_20260323_094054.pdf b/results/experiments_comparison_20260323_094054.pdf deleted file mode 100644 index a2ec008..0000000 Binary files a/results/experiments_comparison_20260323_094054.pdf and 
/dev/null differ diff --git a/results/experiments_results.json b/results/experiments_results.json deleted file mode 100644 index c8feec6..0000000 --- a/results/experiments_results.json +++ /dev/null @@ -1,150 +0,0 @@ -{ - "timestamp": "2026-03-23T09:40:54.308648", - "experiments": [ - { - "name": "Artwork Features (Li & Stamp + FFT/DCT)", - "short_name": "Artwork (49)", - "n_features": 49, - "extract_time": "237s", - "results": { - "xgb": { - "accuracy": 0.79375, - "precision": 0.7936031984007996, - "recall": 0.794, - "f1": 0.7937499871093743, - "roc_auc": 0.88620775 - }, - "svm": { - "accuracy": 0.784, - "precision": 0.7862903225806451, - "recall": 0.78, - "f1": 0.7839965439447031, - "roc_auc": 0.868789625 - }, - "mlp": { - "accuracy": 0.79375, - "precision": 0.7930174563591023, - "recall": 0.795, - "f1": 0.7937496777338715, - "roc_auc": 0.88000325 - } - } - }, - { - "name": "Style Features (stroke/palette/composition/texture)", - "short_name": "Style (15)", - "n_features": 15, - "extract_time": "577s", - "results": { - "xgb": { - "accuracy": 0.7875, - "precision": 0.797004132231405, - "recall": 0.7715, - "f1": 0.787445586070034, - "roc_auc": 0.88307 - }, - "svm": { - "accuracy": 0.77175, - "precision": 0.7763091001525165, - "recall": 0.7635, - "f1": 0.7717344636769341, - "roc_auc": 0.8640375 - }, - "mlp": { - "accuracy": 0.77575, - "precision": 0.7847186370676303, - "recall": 0.76, - "f1": 0.7756943581817265, - "roc_auc": 0.87112 - } - } - }, - { - "name": "Artwork + Style Combined", - "short_name": "Art+Style (64)", - "n_features": 64, - "extract_time": "combined", - "results": { - "xgb": { - "accuracy": 0.83475, - "precision": 0.8396752917300863, - "recall": 0.8275, - "f1": 0.8347413135902956, - "roc_auc": 0.923393 - }, - "svm": { - "accuracy": 0.82225, - "precision": 0.8243583291394061, - "recall": 0.819, - "f1": 0.8222481224957938, - "roc_auc": 0.9038627499999999 - }, - "mlp": { - "accuracy": 0.828, - "precision": 0.82, - "recall": 0.8405, - "f1": 
0.8279731208001251, - "roc_auc": 0.9148270000000001 - } - } - }, - { - "name": "CLIP ViT-B/32 Embeddings", - "short_name": "CLIP (512)", - "n_features": 512, - "extract_time": "112s", - "results": { - "xgb": { - "accuracy": 0.861, - "precision": 0.8620862587763289, - "recall": 0.8595, - "f1": 0.8609996872492963, - "roc_auc": 0.9425352499999999 - }, - "svm": { - "accuracy": 0.893, - "precision": 0.8949748743718593, - "recall": 0.8905, - "f1": 0.8929993312458202, - "roc_auc": 0.9628585000000001 - }, - "mlp": { - "accuracy": 0.8855, - "precision": 0.8839641434262948, - "recall": 0.8875, - "f1": 0.8854995419981679, - "roc_auc": 0.952217 - } - } - }, - { - "name": "CLIP + Artwork + Style (Everything)", - "short_name": "All Combined", - "n_features": 576, - "extract_time": "combined", - "results": { - "xgb": { - "accuracy": 0.87975, - "precision": 0.8853373921867073, - "recall": 0.8725, - "f1": 0.8797436790271289, - "roc_auc": 0.956134 - }, - "svm": { - "accuracy": 0.9, - "precision": 0.9004004004004004, - "recall": 0.8995, - "f1": 0.8999999749999937, - "roc_auc": 0.9657885 - }, - "mlp": { - "accuracy": 0.8835, - "precision": 0.8827345309381237, - "recall": 0.8845, - "f1": 0.8834998834998835, - "roc_auc": 0.9535750000000001 - } - } - } - ] -} \ No newline at end of file diff --git a/results/fair_evaluation_20260322_235151.pdf b/results/fair_evaluation_20260322_235151.pdf deleted file mode 100644 index 2b107f0..0000000 Binary files a/results/fair_evaluation_20260322_235151.pdf and /dev/null differ diff --git a/results/fair_evaluation_results.json b/results/fair_evaluation_results.json deleted file mode 100644 index 50a3a4e..0000000 --- a/results/fair_evaluation_results.json +++ /dev/null @@ -1,220 +0,0 @@ -{ - "timestamp": "2026-03-22T23:49:04.639392", - "evaluation": "fair_semantically_similar", - "datasets": [ - { - "dataset": "AI-Art vs Real-Art (Hemg)", - "repo": "Hemg/AI-Generated-vs-Real-Images-Datasets", - "n_samples": 400, - "n_features": 49, - "xgb_folds": [ - { 
- "fold": 1, - "accuracy": 0.675, - "precision": 0.6590909090909091, - "recall": 0.725, - "f1": 0.6741854636591479, - "roc_auc": 0.7637499999999999 - }, - { - "fold": 2, - "accuracy": 0.6375, - "precision": 0.6222222222222222, - "recall": 0.7, - "f1": 0.6360784313725489, - "roc_auc": 0.718125 - }, - { - "fold": 3, - "accuracy": 0.7, - "precision": 0.6666666666666666, - "recall": 0.8, - "f1": 0.696969696969697, - "roc_auc": 0.73875 - }, - { - "fold": 4, - "accuracy": 0.7, - "precision": 0.7352941176470589, - "recall": 0.625, - "f1": 0.6983029541169076, - "roc_auc": 0.7881250000000001 - }, - { - "fold": 5, - "accuracy": 0.725, - "precision": 0.7368421052631579, - "recall": 0.7, - "f1": 0.7248280175109443, - "roc_auc": 0.7918750000000001 - } - ], - "xgb_accuracy": 0.6875, - "xgb_auc": 0.7563249999999999, - "xgb_precision": 0.6794258373205742, - "xgb_recall": 0.71, - "svm_accuracy": 0.71, - "svm_auc": 0.7851250000000001, - "mlp_accuracy": 0.71, - "mlp_auc": 0.788825, - "feature_names": [ - "mean_brightness", - "entropy_brightness", - "red_mean", - "red_variance", - "red_kurtosis", - "red_skewness", - "green_mean", - "green_variance", - "green_kurtosis", - "green_skewness", - "blue_mean", - "blue_variance", - "blue_kurtosis", - "blue_skewness", - "rgb_entropy", - "hue_variance", - "hue_kurtosis", - "hue_skewness", - "saturation_variance", - "saturation_kurtosis", - "saturation_skewness", - "value_variance", - "value_kurtosis", - "value_skewness", - "hsv_entropy", - "contrast", - "correlation", - "energy", - "homogeneity", - "lbp_entropy", - "lbp_variance", - "hog_mean", - "hog_variance", - "hog_kurtosis", - "hog_skewness", - "hog_entropy", - "edgelen", - "noise_entropy", - "snr", - "fft_low_energy_ratio", - "fft_mid_energy_ratio", - "fft_high_energy_ratio", - "fft_spectral_centroid", - "fft_log_mag_mean", - "fft_log_mag_std", - "fft_phase_std", - "dct_ac_dc_ratio", - "dct_high_freq_energy", - "dct_sparsity" - ] - }, - { - "dataset": "AI vs Real (Parveshiiii)", - "repo": 
"Parveshiiii/AI-vs-Real", - "n_samples": 400, - "n_features": 49, - "xgb_folds": [ - { - "fold": 1, - "accuracy": 0.925, - "precision": 0.925, - "recall": 0.925, - "f1": 0.925, - "roc_auc": 0.9881249999999999 - }, - { - "fold": 2, - "accuracy": 0.8375, - "precision": 0.8648648648648649, - "recall": 0.8, - "f1": 0.8372711625723674, - "roc_auc": 0.93 - }, - { - "fold": 3, - "accuracy": 0.875, - "precision": 0.875, - "recall": 0.875, - "f1": 0.875, - "roc_auc": 0.931875 - }, - { - "fold": 4, - "accuracy": 0.8625, - "precision": 0.8918918918918919, - "recall": 0.825, - "f1": 0.8623063683304647, - "roc_auc": 0.92625 - }, - { - "fold": 5, - "accuracy": 0.8625, - "precision": 0.8536585365853658, - "recall": 0.875, - "f1": 0.8624785122675418, - "roc_auc": 0.9568749999999999 - } - ], - "xgb_accuracy": 0.8725, - "xgb_auc": 0.9459750000000001, - "xgb_precision": 0.882051282051282, - "xgb_recall": 0.86, - "svm_accuracy": 0.915, - "svm_auc": 0.968625, - "mlp_accuracy": 0.9275, - "mlp_auc": 0.9697875, - "feature_names": [ - "mean_brightness", - "entropy_brightness", - "red_mean", - "red_variance", - "red_kurtosis", - "red_skewness", - "green_mean", - "green_variance", - "green_kurtosis", - "green_skewness", - "blue_mean", - "blue_variance", - "blue_kurtosis", - "blue_skewness", - "rgb_entropy", - "hue_variance", - "hue_kurtosis", - "hue_skewness", - "saturation_variance", - "saturation_kurtosis", - "saturation_skewness", - "value_variance", - "value_kurtosis", - "value_skewness", - "hsv_entropy", - "contrast", - "correlation", - "energy", - "homogeneity", - "lbp_entropy", - "lbp_variance", - "hog_mean", - "hog_variance", - "hog_kurtosis", - "hog_skewness", - "hog_entropy", - "edgelen", - "noise_entropy", - "snr", - "fft_low_energy_ratio", - "fft_mid_energy_ratio", - "fft_high_energy_ratio", - "fft_spectral_centroid", - "fft_log_mag_mean", - "fft_log_mag_std", - "fft_phase_std", - "dct_ac_dc_ratio", - "dct_high_freq_energy", - "dct_sparsity" - ] - } - ] -} \ No newline at end of 
file diff --git a/results/hybrid_modern_results.json b/results/hybrid_modern_results.json new file mode 100644 index 0000000..d90503d --- /dev/null +++ b/results/hybrid_modern_results.json @@ -0,0 +1,96 @@ +{ + "training": { + "n_per_class": 3122, + "n_features": 916, + "cv_acc": 0.9694104001149638, + "cv_auc": 0.9955921334648258 + }, + "tellif": { + "bytedance-seedream-v4-text-to-image": { + "n": 20, + "detected": 18, + "uncertain": 2, + "missed": 0, + "rate": 0.9 + }, + "flux-pro-v1-1-ultra": { + "n": 9, + "detected": 5, + "uncertain": 4, + "missed": 0, + "rate": 0.5555555555555556 + }, + "gemini-25-flash-image": { + "n": 18, + "detected": 10, + "uncertain": 8, + "missed": 0, + "rate": 0.5555555555555556 + }, + "hidream-i1-full": { + "n": 9, + "detected": 8, + "uncertain": 1, + "missed": 0, + "rate": 0.8888888888888888 + }, + "hunyuan-image-v2-1-text-to-image": { + "n": 11, + "detected": 11, + "uncertain": 0, + "missed": 0, + "rate": 1.0 + }, + "ideogram-v2": { + "n": 9, + "detected": 8, + "uncertain": 1, + "missed": 0, + "rate": 0.8888888888888888 + }, + "imagen4-preview": { + "n": 9, + "detected": 9, + "uncertain": 0, + "missed": 0, + "rate": 1.0 + }, + "qwen-image": { + "n": 12, + "detected": 10, + "uncertain": 2, + "missed": 0, + "rate": 0.8333333333333334 + }, + "real": { + "n": 122, + "fp": 0, + "fp_rate": 0.0 + }, + "recraft-v3-text-to-image": { + "n": 9, + "detected": 4, + "uncertain": 5, + "missed": 0, + "rate": 0.4444444444444444 + }, + "stable-diffusion-v35-large": { + "n": 9, + "detected": 6, + "uncertain": 3, + "missed": 0, + "rate": 0.6666666666666666 + }, + "wan-v2-2-a14b-text-to-image": { + "n": 8, + "detected": 6, + "uncertain": 2, + "missed": 0, + "rate": 0.75 + } + }, + "importance": { + "handcrafted_pct": 0.17440815103386276, + "learned_pct": 0.8255918489661372 + } +} \ No newline at end of file diff --git a/results/imaginet_paintings_results.json b/results/imaginet_paintings_results.json new file mode 100644 index 0000000..097c242 --- 
/dev/null +++ b/results/imaginet_paintings_results.json @@ -0,0 +1,85 @@ +{ + "per_generator": { + "AnimagineXL": { + "acc": 0.993, + "auc": 0.99956, + "f1": 0.9929995249581209, + "prec": 0.9959789856536674, + "rec": 0.99 + }, + "SD": { + "acc": 0.9549999999999998, + "auc": 0.99128, + "f1": 0.9549951742342294, + "prec": 0.9526184576872121, + "rec": 0.9579999999999999 + }, + "SDXL": { + "acc": 0.9720000000000001, + "auc": 0.99666, + "f1": 0.9719950476849487, + "prec": 0.9798952905599838, + "rec": 0.9639999999999999 + }, + "StyleGAN": { + "acc": 0.6439999999999999, + "auc": 0.70302, + "f1": 0.6433258254702935, + "prec": 0.6354500861457384, + "rec": 0.6780000000000002 + }, + "Midjourney": { + "acc": 0.923, + "auc": 0.97706, + "f1": 0.9229635402996428, + "prec": 0.9377923257914589, + "rec": 0.906 + }, + "DALL-E_3": { + "acc": 0.9359999999999999, + "auc": 0.9806799999999999, + "f1": 0.9359967497387254, + "prec": 0.9342257872846108, + "rec": 0.9380000000000001 + } + }, + "pooled": { + "acc": 0.8320000000000001, + "auc": 0.8915000000000001, + "f1": 0.8308584361256923, + "prec": 0.8839624127044378, + "rec": 0.764 + }, + "leave_one_out": { + "AnimagineXL": { + "acc": 0.998, + "auc": 1.0, + "f1": 0.997999991999968 + }, + "SD": { + "acc": 0.95, + "auc": 1.0, + "f1": 0.949874686716792 + }, + "SDXL": { + "acc": 0.992, + "auc": 1.0, + "f1": 0.9919994879672299 + }, + "StyleGAN": { + "acc": 0.593, + "auc": 0.794292, + "f1": 0.5121957066030963 + }, + "Midjourney": { + "acc": 0.983, + "auc": 1.0, + "f1": 0.9829950855797325 + }, + "DALL-E_3": { + "acc": 0.996, + "auc": 1.0, + "f1": 0.995999935998976 + } + } +} \ No newline at end of file diff --git a/results/multidataset_benchmark_results.json b/results/multidataset_benchmark_results.json new file mode 100644 index 0000000..f597350 --- /dev/null +++ b/results/multidataset_benchmark_results.json @@ -0,0 +1,130 @@ +{ + "feature_count": 148, + "exp1_baselines": { + "Defactify": { + "accuracy": 0.8867, + "auc": 0.8953 + }, + "Hemg": { + 
"accuracy": 0.8085, + "auc": 0.9004 + }, + "tellif": { + "accuracy": 0.8286, + "auc": 0.9201 + } + }, + "exp2_cross_dataset": { + "Defactify->Hemg": { + "accuracy": 0.501, + "auc": 0.4467 + }, + "Hemg->Defactify": { + "accuracy": 0.4208, + "auc": 0.4029 + }, + "Combined->Defactify": { + "accuracy": 1.0, + "auc": 1.0 + }, + "Combined->Hemg": { + "accuracy": 1.0, + "auc": 1.0 + }, + "Combined->tellif": { + "accuracy": 0.6653, + "auc": 0.7792 + } + }, + "exp3_generator_diversity": { + "1_gen_train": { + "accuracy": 0.5656, + "auc": 0.6508, + "train_generators": [ + "Midjourney_v6" + ], + "test_generators": [ + "SD_3", + "SDXL", + "DALL-E_3", + "SD_2.1" + ] + }, + "2_gen_train": { + "accuracy": 0.6639, + "auc": 0.6237, + "train_generators": [ + "Midjourney_v6", + "SD_3" + ], + "test_generators": [ + "SDXL", + "DALL-E_3", + "SD_2.1" + ] + }, + "3_gen_train": { + "accuracy": 0.8604, + "auc": 0.8254, + "train_generators": [ + "Midjourney_v6", + "SD_3", + "SDXL" + ], + "test_generators": [ + "DALL-E_3", + "SD_2.1" + ] + }, + "4_gen_train": { + "accuracy": 0.8792, + "auc": 0.8269, + "train_generators": [ + "Midjourney_v6", + "SD_3", + "SDXL", + "DALL-E_3" + ], + "test_generators": [ + "SD_2.1" + ] + }, + "5_gen_cv": { + "accuracy": 0.8867, + "auc": 0.8953, + "train_generators": [ + "DALL-E_3", + "Midjourney_v6", + "SDXL", + "SD_2.1", + "SD_3" + ], + "test_generators": [ + "DALL-E_3", + "Midjourney_v6", + "SDXL", + "SD_2.1", + "SD_3" + ] + } + }, + "exp4_domain_transfer": { + "Photos->Art": { + "accuracy": 0.501, + "auc": 0.4467 + }, + "Art->Photos": { + "accuracy": 0.4208, + "auc": 0.4029 + }, + "Photos_self_CV": { + "accuracy": 0.8867, + "auc": 0.8953 + }, + "Art_self_CV": { + "accuracy": 0.8085, + "auc": 0.9004 + } + }, + "elapsed_seconds": 2964.0 +} \ No newline at end of file diff --git a/results/platform_robustness_results.json b/results/platform_robustness_results.json new file mode 100644 index 0000000..d0985f0 --- /dev/null +++ 
b/results/platform_robustness_results.json @@ -0,0 +1,27 @@ +{ + "Clean (original)": { + "fp": 0.0, + "detection": 0.9705882352941176, + "missed": 0.0196078431372549 + }, + "Twitter/X (JPEG Q85, max 4096px)": { + "fp": 0.0, + "detection": 0.9607843137254902, + "missed": 0.0196078431372549 + }, + "Instagram (JPEG Q75, max 1080px, sharpen)": { + "fp": 0.0, + "detection": 0.9313725490196079, + "missed": 0.029411764705882353 + }, + "Discord (JPEG Q80, 4MP limit)": { + "fp": 0.0, + "detection": 0.9607843137254902, + "missed": 0.0196078431372549 + }, + "Tumblr (JPEG Q65, max 1280px)": { + "fp": 0.0, + "detection": 0.9411764705882353, + "missed": 0.00980392156862745 + } +} \ No newline at end of file diff --git a/results/scale_evaluation_20260322_235906.pdf b/results/scale_evaluation_20260322_235906.pdf deleted file mode 100644 index 1d8dfe1..0000000 Binary files a/results/scale_evaluation_20260322_235906.pdf and /dev/null differ diff --git a/results/scale_evaluation_results.json b/results/scale_evaluation_results.json deleted file mode 100644 index dcf7caf..0000000 --- a/results/scale_evaluation_results.json +++ /dev/null @@ -1,103 +0,0 @@ -{ - "timestamp": "2026-03-22T23:59:06.522268", - "dataset": "Hemg/AI-Generated-vs-Real-Images-Datasets", - "feature_count": 49, - "results": [ - { - "total": 400, - "xgb": { - "accuracy": 0.67, - "precision": 0.6752577319587629, - "recall": 0.655, - "f1": 0.6699257332899902, - "roc_auc": 0.75155 - }, - "svm": { - "accuracy": 0.6975, - "precision": 0.7025641025641025, - "recall": 0.685, - "f1": 0.697452726988592, - "roc_auc": 0.7725 - }, - "mlp": { - "accuracy": 0.7, - "precision": 0.6904761904761905, - "recall": 0.725, - "f1": 0.699812382739212, - "roc_auc": 0.7897 - } - }, - { - "total": 1000, - "xgb": { - "accuracy": 0.758, - "precision": 0.764344262295082, - "recall": 0.746, - "f1": 0.7579651469811652, - "roc_auc": 0.843684 - }, - "svm": { - "accuracy": 0.744, - "precision": 0.7552301255230126, - "recall": 0.722, - "f1": 
0.7438760360014247, - "roc_auc": 0.8290820000000001 - }, - "mlp": { - "accuracy": 0.754, - "precision": 0.7550200803212851, - "recall": 0.752, - "f1": 0.7539990159960639, - "roc_auc": 0.838132 - } - }, - { - "total": 2000, - "xgb": { - "accuracy": 0.7705, - "precision": 0.7757390417940877, - "recall": 0.761, - "f1": 0.7704792857555395, - "roc_auc": 0.863119 - }, - "svm": { - "accuracy": 0.7645, - "precision": 0.765295887662989, - "recall": 0.763, - "f1": 0.7644994701238078, - "roc_auc": 0.8491545 - }, - "mlp": { - "accuracy": 0.778, - "precision": 0.780241935483871, - "recall": 0.774, - "f1": 0.7779964479431671, - "roc_auc": 0.857905 - } - }, - { - "total": 4000, - "xgb": { - "accuracy": 0.79475, - "precision": 0.7984810126582278, - "recall": 0.7885, - "f1": 0.7947419821086761, - "roc_auc": 0.8877880000000001 - }, - "svm": { - "accuracy": 0.7815, - "precision": 0.7829145728643216, - "recall": 0.779, - "f1": 0.7814986343664647, - "roc_auc": 0.86796925 - }, - "mlp": { - "accuracy": 0.7885, - "precision": 0.7803692905733722, - "recall": 0.803, - "f1": 0.7884555227736632, - "roc_auc": 0.8725355000000001 - } - } - ] -} \ No newline at end of file diff --git a/tests/extract_cache.py b/tests/extract_cache.py new file mode 100644 index 0000000..163643b --- /dev/null +++ b/tests/extract_cache.py @@ -0,0 +1,287 @@ +"""Feature extraction with disk caching. Pause/resume safe. + +Saves extracted features to .npz files after each data source. +On restart, loads from cache and skips already-extracted sources. 
def get_extractors():
    """Build the handcrafted (ArtworkExtract) and learned (LearnedExtract) feature extractors."""
    return ArtworkExtract(), LearnedExtract()


def extract_one(art_ext, learned_ext, img, hc_dim=148, learned_dim=768):
    """Extract the combined handcrafted + learned feature vector for one image.

    Parameters
    ----------
    art_ext, learned_ext:
        Callables returning an ordered mapping of feature name -> value.
    img:
        Image passed through to both extractors.
    hc_dim, learned_dim:
        Zero-fill lengths used when the respective extractor raises, so
        every row keeps a constant width (148 handcrafted + 768 learned
        by default, matching the current extractor output sizes).

    Returns
    -------
    list[float]
        Handcrafted features followed by learned features.
    """
    try:
        hc = list(art_ext(img).values())
    except Exception:
        # Best-effort: a failed extractor contributes a zero vector rather
        # than aborting a long extraction run on one bad image.
        hc = [0.0] * hc_dim
    try:
        # no_grad: inference only, avoids building an autograd graph.
        with torch.no_grad():
            ln = list(learned_ext(img).values())
    except Exception:
        ln = [0.0] * learned_dim
    return hc + ln
def load_paths_as_images(path, recursive, max_n):
    """Yield PIL RGB images loaded from files under *path*.

    Scans recursively when *recursive* is true, keeps only common image
    extensions, and subsamples down to *max_n* files with a seeded RNG so
    runs are reproducible. Files that fail to open are skipped silently.
    """
    rng = np.random.RandomState(SEED)
    valid_exts = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}
    candidates = [
        f
        for f in (path.rglob("*") if recursive else path.iterdir())
        if f.suffix.lower() in valid_exts
    ]
    if len(candidates) > max_n:
        candidates = list(rng.choice(candidates, max_n, replace=False))
    for candidate in candidates:
        try:
            yield Image.open(candidate).convert("RGB")
        except Exception:
            pass


def stream_hf_images(repo, max_n, label_filter=None):
    """Yield up to *max_n* PIL RGB images from a streaming HuggingFace dataset.

    When *label_filter* is given, samples whose ``label`` field differs are
    skipped; samples that fail to decode are ignored.
    """
    from datasets import load_dataset

    stream = load_dataset(repo, split="train", streaming=True)
    yielded = 0
    for sample in stream:
        if label_filter is not None and sample.get("label") != label_filter:
            continue
        try:
            yield sample["image"].convert("RGB")
            yielded += 1
        except Exception:
            pass
        if yielded >= max_n:
            break
max_n=1500), + art_ext, learned_ext) + + from datasets import load_dataset + ds_tellif = load_dataset("tellif/ai_vs_real_image_semantically_similar", split="test") + tellif_real = [ds_tellif[i]["image"].convert("RGB") for i in range(len(ds_tellif)) if ds_tellif[i]["label"] == 8] + extract_and_cache("real_tellif", tellif_real, art_ext, learned_ext) + del tellif_real; gc.collect() + + extract_and_cache("real_hemg", + stream_hf_images("Hemg/AI-Generated-vs-Real-Images-Datasets", max_n=800, label_filter=1), + art_ext, learned_ext) + + # === FAKE: old generators === + print("\n--- FAKE (old) ---", flush=True) + for src in ["sdxl_paintings_fake", "sd_paintings_fake", "dalle3", "journeydb", "animaginexl_paintings_fake"]: + extract_and_cache(f"fake_imaginet_{src}", + load_paths_as_images(BASE / src, recursive=False, max_n=100), + art_ext, learned_ext) + + # === FAKE: modern HF datasets === + print("\n--- FAKE (modern) ---", flush=True) + for repo, n, name in [ + ("ash12321/seedream-4.5-generated-2k", 300, "seedream45"), + ("exdysa/nano-banana-pro-generated-1k-clone", 300, "nano_banana"), + ("LukasT9/Flux-1-Dev-Images-1k", 300, "flux_dev"), + ("LukasT9/Flux-1-Schnell-Images-1k", 300, "flux_schnell"), + ]: + extract_and_cache(f"fake_{name}", + stream_hf_images(repo, max_n=n), + art_ext, learned_ext) + + # === FAKE: CivitAI (all generators) === + print("\n--- FAKE (CivitAI) ---", flush=True) + for gen_dir in ["flux", "sdxl", "pony", "illustrious", "sd3", "sd35", "recraft", "gemini"]: + p = CIVITAI / gen_dir + if p.exists() and any(p.iterdir()): + extract_and_cache(f"fake_civitai_{gen_dir}", + load_paths_as_images(p, recursive=False, max_n=300), + art_ext, learned_ext) + + # === FAKE: Hemg AI === + print("\n--- FAKE (Hemg AI) ---", flush=True) + extract_and_cache("fake_hemg_ai", + stream_hf_images("Hemg/AI-Generated-vs-Real-Images-Datasets", max_n=800, label_filter=0), + art_ext, learned_ext) + + print("\n ALL EXTRACTION COMPLETE", flush=True) + print(f" Cache dir: 
{CACHE_DIR}", flush=True) + print(f" Files: {len(list(CACHE_DIR.glob('*.npz')))}", flush=True) + + +def train_and_test(): + """Load cached features, train ensemble, test on tellif.""" + import lightgbm as lgb + from sklearn.ensemble import RandomForestClassifier + from sklearn.metrics import accuracy_score, roc_auc_score + from sklearn.model_selection import StratifiedKFold + from sklearn.preprocessing import StandardScaler + from sklearn.svm import SVC + + print("=" * 60, flush=True) + print(" TRAIN + TEST (from cached features)", flush=True) + print("=" * 60, flush=True) + + # Load all cached features + real_parts, fake_parts = [], [] + for f in sorted(CACHE_DIR.glob("real_*.npz")): + data = np.load(f) + real_parts.append(data["X"]) + print(f" Real: {f.stem} ({len(data['X'])})", flush=True) + for f in sorted(CACHE_DIR.glob("fake_*.npz")): + data = np.load(f) + fake_parts.append(data["X"]) + print(f" Fake: {f.stem} ({len(data['X'])})", flush=True) + + X_real = np.vstack(real_parts) + X_fake = np.vstack(fake_parts) + n = min(len(X_real), len(X_fake)) + X_train = np.vstack([X_real[:n], X_fake[:n]]) + y_train = np.concatenate([np.zeros(n), np.ones(n)]) + print(f"\n Training: {len(X_train)} ({n}/class)", flush=True) + + # 5-fold CV + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED) + accs = [] + for fold, (tr, te) in enumerate(skf.split(X_train, y_train)): + m = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED) + m.fit(X_train[tr], y_train[tr]) + p = m.predict_proba(X_train[te])[:, 1] + acc = accuracy_score(y_train[te], (p > 0.5).astype(int)) + accs.append(acc) + print(f" Fold {fold+1}: {acc:.4f}", flush=True) + print(f" Mean: {np.mean(accs):.4f}", flush=True) + + # Train ensemble + W_LGBM, W_RF, W_SVM = 0.4, 0.1, 0.5 + HIGH_T, LOW_T = 0.80, 0.20 + + lgbm = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, 
random_state=SEED) + lgbm.fit(X_train, y_train) + rf = RandomForestClassifier(n_estimators=300, max_depth=12, n_jobs=1, random_state=SEED) + rf.fit(X_train, y_train) + scaler = StandardScaler() + svm = SVC(kernel="rbf", probability=True, random_state=SEED) + svm.fit(scaler.fit_transform(X_train), y_train) + + def predict(X): + p = (W_LGBM * lgbm.predict_proba(X)[:, 1] + + W_RF * rf.predict_proba(X)[:, 1] + + W_SVM * svm.predict_proba(scaler.transform(X))[:, 1]) + pred = np.full(len(X), -1) + pred[p >= HIGH_T] = 1 + pred[p < LOW_T] = 0 + return pred, p + + # Test tellif + print("\n" + "=" * 60, flush=True) + print(" TELLIF RESULTS", flush=True) + print("=" * 60, flush=True) + + from datasets import load_dataset + ds_tellif = load_dataset("tellif/ai_vs_real_image_semantically_similar", split="test") + label_names = ds_tellif.features["label"].names + + art_ext, learned_ext = get_extractors() + results = {} + + for i, gen_name in enumerate(label_names): + indices = [j for j in range(len(ds_tellif)) if ds_tellif[j]["label"] == i] + if not indices: + continue + images = [ds_tellif[j]["image"].convert("RGB") for j in indices] + + rows = [extract_one(art_ext, learned_ext, img) for img in images] + X_gen = np.nan_to_num(np.array(rows, dtype=np.float64), nan=0.0, posinf=0.0, neginf=0.0) + + pred, prob = predict(X_gen) + + if gen_name == "real": + fp = (pred == 1).sum() + print(f" {gen_name:45s} n={len(X_gen):3d} GENUINE={(pred==0).sum()} UNC={(pred==-1).sum()} FP={fp} FP={fp/len(X_gen):.1%}", flush=True) + results[gen_name] = {"n": len(X_gen), "fp": int(fp), "fp_rate": float(fp / len(X_gen))} + else: + det = (pred == 1).sum() + rate = det / len(X_gen) + print(f" {gen_name:45s} n={len(X_gen):3d} DET={det} UNC={(pred==-1).sum()} MISS={(pred==0).sum()} rate={rate:.1%}", flush=True) + results[gen_name] = {"n": len(X_gen), "detected": int(det), "rate": float(rate)} + + out = Path(__file__).parent.parent / "results" / "retrain_weak_gens_results.json" + with open(out, "w") as f: + 
class AnimeDataset(Dataset):
    """Minimal torch Dataset pairing pre-loaded PIL images with binary labels.

    Images are transformed lazily in ``__getitem__``; labels are returned
    as float32 tensors.
    """

    def __init__(self, images, labels, transform):
        self.images = images
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        sample = self.transform(self.images[idx])
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample, target
imgs.append(Image.open(f).convert("RGB")) + except Exception: + pass + return imgs + + +def main(): + print("=" * 60, flush=True) + print(" FINE-TUNE ConvNeXt-Tiny for Anime Detection", flush=True) + print(" Freeze stages 0-2, train stage 3 + head", flush=True) + print("=" * 60, flush=True) + + # Load model + model = timm.create_model("convnext_tiny.fb_in22k", pretrained=True, num_classes=2) + transform = timm.data.create_transform( + **timm.data.resolve_data_config(model.pretrained_cfg) + ) + + # Freeze everything except stage 3 and head + for name, param in model.named_parameters(): + if "stages.3" in name or "head" in name: + param.requires_grad = True + else: + param.requires_grad = False + + trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + total = sum(p.numel() for p in model.parameters()) + print(f" Trainable: {trainable:,} / {total:,} ({trainable/total*100:.1f}%)", flush=True) + + # Load data + print("\n--- Loading data ---", flush=True) + real_imgs = [] + + # Real anime from animesfw (1000) + ds = load_dataset("latentcat/animesfw", split="train", streaming=True) + count = 0 + for s in tqdm(ds, desc="Real anime", total=1000): + if count >= 1000: + break + tags = s.get("tags", "") + if "ai" in tags.lower(): + continue + try: + real_imgs.append(s["image"].convert("RGB")) + count += 1 + except Exception: + pass + print(f" Real anime: {len(real_imgs)}", flush=True) + + # Real WikiArt (500) + wiki_imgs = load_paths(BASE / "wikiart", recursive=True, max_n=500) + real_imgs.extend(wiki_imgs) + print(f" + WikiArt: {len(wiki_imgs)}, total real: {len(real_imgs)}", flush=True) + + # Fake anime (CivitAI pony + illustrious + AnimagineXL) + fake_imgs = [] + for gen in ["pony", "illustrious"]: + imgs = load_paths(CIVITAI / gen, recursive=False, max_n=400) + fake_imgs.extend(imgs) + animagine = load_paths(BASE / "animaginexl_paintings_fake", recursive=False, max_n=100) + fake_imgs.extend(animagine) + + # Add Flux and nano-banana for diversity + for 
repo, n in [("LukasT9/Flux-1-Dev-Images-1k", 200), ("bitmind/nano-banana", 200)]: + ds2 = load_dataset(repo, split="train", streaming=True) + c = 0 + for s in ds2: + if c >= n: + break + try: + fake_imgs.append(s["image"].convert("RGB")) + c += 1 + except Exception: + pass + print(f" Fake: {len(fake_imgs)}", flush=True) + + # Balance + n = min(len(real_imgs), len(fake_imgs)) + all_imgs = real_imgs[:n] + fake_imgs[:n] + all_labels = [0] * n + [1] * n + print(f" Total: {len(all_imgs)} ({n}/class)", flush=True) + + # Split train/val + indices = np.random.permutation(len(all_imgs)) + split = int(0.8 * len(indices)) + train_idx = indices[:split] + val_idx = indices[split:] + + train_ds = AnimeDataset( + [all_imgs[i] for i in train_idx], + [all_labels[i] for i in train_idx], + transform, + ) + val_ds = AnimeDataset( + [all_imgs[i] for i in val_idx], + [all_labels[i] for i in val_idx], + transform, + ) + + train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, num_workers=0) + val_loader = DataLoader(val_ds, batch_size=16, shuffle=False, num_workers=0) + + # Train + print(f"\n--- Training ({len(train_ds)} train, {len(val_ds)} val) ---", flush=True) + optimizer = torch.optim.AdamW( + [p for p in model.parameters() if p.requires_grad], + lr=1e-4, + weight_decay=0.01, + ) + criterion = nn.CrossEntropyLoss() + model.train() + + for epoch in range(5): + losses = [] + correct = 0 + total = 0 + for batch_imgs, batch_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"): + optimizer.zero_grad() + outputs = model(batch_imgs) + loss = criterion(outputs, batch_labels.long()) + loss.backward() + optimizer.step() + losses.append(loss.item()) + preds = outputs.argmax(dim=1) + correct += (preds == batch_labels.long()).sum().item() + total += len(batch_labels) + + # Validation + model.eval() + val_correct = 0 + val_total = 0 + val_fp = 0 + val_real = 0 + with torch.no_grad(): + for batch_imgs, batch_labels in val_loader: + outputs = model(batch_imgs) + preds = 
outputs.argmax(dim=1) + val_correct += (preds == batch_labels.long()).sum().item() + val_total += len(batch_labels) + # FP: real labeled as AI + real_mask = batch_labels == 0 + val_fp += ((preds == 1) & real_mask).sum().item() + val_real += real_mask.sum().item() + model.train() + + val_acc = val_correct / val_total + fp_rate = val_fp / val_real if val_real > 0 else 0 + print(f" Epoch {epoch+1}: loss={np.mean(losses):.4f} train_acc={correct/total:.4f} val_acc={val_acc:.4f} val_FP={fp_rate:.3f}", flush=True) + + # Test on held-out anime + print("\n--- Testing on held-out anime ---", flush=True) + model.eval() + + ds3 = load_dataset("latentcat/animesfw", split="train", streaming=True) + test_fp = 0 + test_total = 0 + skip = 0 + with torch.no_grad(): + for s in tqdm(ds3, desc="Test anime", total=1200): + tags = s.get("tags", "") + if "ai" in tags.lower(): + continue + skip += 1 + if skip <= 1000: + continue + if test_total >= 200: + break + try: + img = s["image"].convert("RGB") + tensor = transform(img).unsqueeze(0) + output = model(tensor) + pred = output.argmax(dim=1).item() + if pred == 1: # flagged as AI + test_fp += 1 + test_total += 1 + except Exception: + pass + + print(f"\nAnime FP (fine-tuned ConvNeXt): {test_fp}/{test_total} ({test_fp/test_total*100:.1f}%)", flush=True) + + # Save model + out_path = Path("D:/Projects/negate/negate/models/convnext_anime_finetuned.pt") + out_path.parent.mkdir(parents=True, exist_ok=True) + torch.save(model.state_dict(), out_path) + print(f"Model saved to {out_path}", flush=True) + print("DONE", flush=True) + + +if __name__ == "__main__": + main() diff --git a/tests/generate_fair_eval_pdf.py b/tests/generate_fair_eval_pdf.py deleted file mode 100644 index cbe4af4..0000000 --- a/tests/generate_fair_eval_pdf.py +++ /dev/null @@ -1,262 +0,0 @@ -# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 -"""Generate PDF report for fair evaluation results. 
- -Reads results/fair_evaluation_results.json and generates a timestamped PDF -with cross-validation metrics, comparison tables, and analysis. -""" - -from __future__ import annotations - -import json -import sys -from datetime import datetime -from pathlib import Path - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from matplotlib.backends.backend_pdf import PdfPages -import matplotlib.gridspec as gridspec -import numpy as np - -RESULTS_DIR = Path(__file__).parent.parent / "results" - - -def generate_pdf(results_path: Path): - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - pdf_path = RESULTS_DIR / f"fair_evaluation_{timestamp}.pdf" - - with open(results_path) as f: - data = json.load(f) - - datasets = data["datasets"] - - with PdfPages(str(pdf_path)) as pdf: - # ===== PAGE 1: Title & Summary ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - ax = fig.add_axes([0, 0, 1, 1]) - ax.axis("off") - - ax.text(0.5, 0.88, "Fair Evaluation Report:\n49-Feature Artwork Detection", - transform=ax.transAxes, fontsize=20, fontweight="bold", - ha="center", va="top", fontfamily="serif") - - ax.text(0.5, 0.74, f"negate project — darkshapes\n{datetime.now().strftime('%B %d, %Y')}", - transform=ax.transAxes, fontsize=11, ha="center", va="top", - fontfamily="serif", style="italic") - - # Why this evaluation matters - rationale = ( - "Why This Evaluation Matters\n\n" - "Previous benchmarks used datasets where AI and genuine images had different\n" - "subject matter (cats vs bananas, WikiArt paintings vs generated illustrations).\n" - "This means the classifier could achieve high accuracy by learning content\n" - "differences rather than genuine AI artifacts.\n\n" - "This evaluation uses datasets where BOTH classes contain similar content:\n" - " - Hemg: 'AiArtData' vs 'RealArt' — both are artwork/art images\n" - " - Parveshiiii: balanced binary AI vs Real images\n\n" - "If our 49 features still achieve high accuracy on 
these datasets, it provides\n" - "stronger evidence that the features detect actual AI generation artifacts\n" - "rather than subject-matter shortcuts." - ) - ax.text(0.08, 0.64, rationale, transform=ax.transAxes, fontsize=9, - ha="left", va="top", fontfamily="serif", - bbox=dict(boxstyle="round,pad=0.5", facecolor="lightyellow", edgecolor="gray")) - - # Summary table - summary = "Results Summary\n\n" - for ds in datasets: - summary += ( - f"Dataset: {ds['dataset']}\n" - f" Samples: {ds['n_samples']} ({ds['n_samples']//2} per class)\n" - f" XGBoost: {ds['xgb_accuracy']:.1%} acc, {ds['xgb_auc']:.4f} AUC, " - f"{ds['xgb_precision']:.1%} prec, {ds['xgb_recall']:.1%} rec\n" - f" SVM: {ds['svm_accuracy']:.1%} acc, {ds['svm_auc']:.4f} AUC\n" - f" MLP: {ds['mlp_accuracy']:.1%} acc, {ds['mlp_auc']:.4f} AUC\n\n" - ) - ax.text(0.08, 0.28, summary, transform=ax.transAxes, fontsize=9, - ha="left", va="top", fontfamily="serif", - bbox=dict(boxstyle="round,pad=0.4", facecolor="#E8F5E9", edgecolor="#66BB6A")) - - pdf.savefig(fig) - plt.close(fig) - - # ===== PAGE 2+: Per-dataset details ===== - for ds in datasets: - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - fig.suptitle(f"Dataset: {ds['dataset']}", fontsize=14, - fontweight="bold", fontfamily="serif", y=0.96) - - # Fold results table - ax_table = fig.add_axes([0.1, 0.68, 0.8, 0.22]) - ax_table.axis("off") - - if "xgb_folds" in ds: - table_data = [] - for r in ds["xgb_folds"]: - table_data.append([ - f"Fold {r['fold']}", f"{r['accuracy']:.2%}", - f"{r['precision']:.2%}", f"{r['recall']:.2%}", - f"{r['f1']:.2%}", f"{r['roc_auc']:.4f}" - ]) - - accs = [r["accuracy"] for r in ds["xgb_folds"]] - table_data.append([ - "Mean +/- Std", - f"{np.mean(accs):.2%} +/- {np.std(accs):.2%}", - "-", "-", "-", - f"{np.mean([r['roc_auc'] for r in ds['xgb_folds']]):.4f}" - ]) - - table = ax_table.table( - cellText=table_data, - colLabels=["Fold", "Accuracy", "Precision", "Recall", "F1", "ROC-AUC"], - loc="center", 
cellLoc="center", - ) - table.auto_set_font_size(False) - table.set_fontsize(8) - table.scale(1, 1.4) - for (row, col), cell in table.get_celld().items(): - if row == 0: - cell.set_facecolor("#4472C4") - cell.set_text_props(color="white", fontweight="bold") - elif row == len(table_data): - cell.set_facecolor("#D6E4F0") - - # Comparison bar chart: XGBoost vs SVM vs MLP - ax_bar = fig.add_axes([0.1, 0.35, 0.8, 0.25]) - models = ["XGBoost", "SVM", "MLP"] - accs = [ds["xgb_accuracy"], ds["svm_accuracy"], ds["mlp_accuracy"]] - aucs = [ds["xgb_auc"], ds["svm_auc"], ds["mlp_auc"]] - - x = np.arange(len(models)) - w = 0.35 - bars1 = ax_bar.bar(x - w/2, accs, w, label="Accuracy", color="#4472C4") - bars2 = ax_bar.bar(x + w/2, aucs, w, label="ROC-AUC", color="#ED7D31") - ax_bar.set_xticks(x) - ax_bar.set_xticklabels(models) - ax_bar.set_ylim(0, 1.1) - ax_bar.set_ylabel("Score") - ax_bar.set_title("Model Comparison", fontsize=11, fontfamily="serif") - ax_bar.legend() - ax_bar.grid(axis="y", alpha=0.3) - - for bar in bars1: - ax_bar.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, - f"{bar.get_height():.1%}", ha="center", fontsize=8) - for bar in bars2: - ax_bar.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, - f"{bar.get_height():.3f}", ha="center", fontsize=8) - - # Analysis text - ax_text = fig.add_axes([0.08, 0.05, 0.84, 0.25]) - ax_text.axis("off") - - best_acc = max(accs) - best_model = models[accs.index(best_acc)] - - analysis = ( - f"Analysis\n\n" - f"Dataset: {ds['repo']}\n" - f"Sample size: {ds['n_samples']} images, {ds['n_features']} features\n\n" - f"Best model: {best_model} at {best_acc:.1%} accuracy\n\n" - ) - if best_acc >= 0.80: - analysis += ( - "The features demonstrate strong discriminative power even when both\n" - "classes contain similar content. This suggests the 49 features capture\n" - "genuine AI generation artifacts rather than content-based shortcuts." 
- ) - elif best_acc >= 0.65: - analysis += ( - "Moderate discriminative power. The features capture some genuine AI\n" - "artifacts but performance degrades compared to content-separated datasets,\n" - "suggesting prior benchmarks partially relied on content differences." - ) - else: - analysis += ( - "Weak discriminative power on this dataset. The features struggle when\n" - "content is controlled, indicating prior high accuracy was largely driven\n" - "by subject-matter differences rather than AI detection capability." - ) - - ax_text.text(0, 1, analysis, transform=ax_text.transAxes, fontsize=9, - ha="left", va="top", fontfamily="serif") - - pdf.savefig(fig) - plt.close(fig) - - # ===== FINAL PAGE: Conclusions ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - ax = fig.add_axes([0, 0, 1, 1]) - ax.axis("off") - - ax.text(0.5, 0.92, "Conclusions", fontsize=16, fontweight="bold", - ha="center", va="top", fontfamily="serif", transform=ax.transAxes) - - all_accs = [ds["xgb_accuracy"] for ds in datasets] - mean_fair_acc = np.mean(all_accs) - - conclusions = ( - f"Mean XGBoost accuracy across fair datasets: {mean_fair_acc:.1%}\n\n" - "Comparison with previous (potentially confounded) benchmarks:\n" - " - Cats vs Bananas (unfair): ~91% accuracy\n" - " - WikiArt vs Generated (partially fair): ~92% accuracy\n" - f" - Fair evaluation (this report): {mean_fair_acc:.1%} accuracy\n\n" - ) - - if mean_fair_acc >= 0.80: - conclusions += ( - "CONCLUSION: The 49-feature pipeline holds up under fair evaluation.\n" - "The accuracy drop from unfair to fair benchmarks is modest, indicating\n" - "that the features genuinely detect AI artifacts, not just content.\n\n" - "The frequency-domain features (FFT/DCT) and texture features (GLCM/LBP)\n" - "appear to be capturing real structural differences between AI-generated\n" - "and human-created artwork." - ) - elif mean_fair_acc >= 0.65: - conclusions += ( - "CONCLUSION: Mixed results. 
The features have some genuine detection\n" - "capability but a significant portion of previous accuracy was from\n" - "content shortcuts. The pipeline needs improvement — likely deeper\n" - "learned features (self-supervised or fine-tuned ViT) rather than\n" - "hand-crafted statistics." - ) - else: - conclusions += ( - "CONCLUSION: The 49-feature pipeline does NOT generalize to fair\n" - "evaluation. Previous high accuracy was primarily from content confounds.\n" - "A fundamentally different approach is needed — likely self-supervised\n" - "learning of camera/generation-intrinsic features as described in\n" - "Zhong et al. (2026)." - ) - - conclusions += ( - "\n\nMethodological Note\n\n" - "This report uses 5-fold stratified cross-validation with 200 images per\n" - "class. While larger samples would give tighter confidence intervals, this\n" - "is sufficient to distinguish between >80% and chance-level performance.\n\n" - "Features: 49 total (39 from Li & Stamp 2025 + 10 FFT/DCT frequency features)\n" - "Classifiers: XGBoost, SVM (RBF kernel), MLP (100 hidden units)\n" - "All processing: CPU-only, no pretrained neural networks" - ) - - ax.text(0.08, 0.85, conclusions, transform=ax.transAxes, fontsize=9.5, - ha="left", va="top", fontfamily="serif") - - pdf.savefig(fig) - plt.close(fig) - - print(f"PDF saved to: {pdf_path}") - return pdf_path - - -if __name__ == "__main__": - results_path = RESULTS_DIR / "fair_evaluation_results.json" - if not results_path.exists(): - print(f"Run test_fair_evaluation.py first to generate {results_path}") - sys.exit(1) - generate_pdf(results_path) diff --git a/tests/generate_results_pdf.py b/tests/generate_results_pdf.py deleted file mode 100644 index 1ed4fdd..0000000 --- a/tests/generate_results_pdf.py +++ /dev/null @@ -1,475 +0,0 @@ -# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 -# - -"""Generate results PDF with multi-signal ensemble, calibrated thresholds, -abstention, and full 
precision/recall/F1 reporting. - -Usage: uv run python tests/generate_results_pdf.py -Output: results/artwork_detection_results.pdf -""" - -from __future__ import annotations - -import sys -from datetime import datetime -from pathlib import Path - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -import matplotlib.gridspec as gridspec -from matplotlib.backends.backend_pdf import PdfPages -from matplotlib.patches import Patch -import numpy as np -import pandas as pd -import xgboost as xgb -from datasets import load_dataset, Image as HFImage -from sklearn.calibration import CalibratedClassifierCV -from sklearn.metrics import ( - accuracy_score, precision_score, recall_score, f1_score, - roc_auc_score, roc_curve, confusion_matrix, precision_recall_curve, -) -from sklearn.model_selection import StratifiedKFold, cross_val_predict -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC -from tqdm import tqdm - -sys.path.insert(0, str(Path(__file__).parent.parent)) -from negate.extract.feature_artwork import ArtworkExtract - -HUMAN_ART_REPO = "huggan/wikiart" -SYNTHETIC_REPO = "exdysa/nano-banana-pro-generated-1k-clone" -SAMPLE_SIZE = 100 -N_FOLDS = 5 -SEED = 42 -OUTPUT_DIR = Path(__file__).parent.parent / "results" - - -def load_and_extract(): - print(f"Loading {SAMPLE_SIZE} human art + {SAMPLE_SIZE} AI images...") - human_ds = load_dataset(HUMAN_ART_REPO, split=f"train[:{SAMPLE_SIZE}]") - human_ds = human_ds.cast_column("image", HFImage(decode=True, mode="RGB")) - ai_ds = load_dataset(SYNTHETIC_REPO, split=f"train[:{SAMPLE_SIZE}]") - ai_ds = ai_ds.cast_column("image", HFImage(decode=True, mode="RGB")) - - extractor = ArtworkExtract() - features, labels = [], [] - imgs_human, imgs_ai = [], [] - - for row in tqdm(human_ds, desc="Human art"): - try: - features.append(extractor(row["image"])) - labels.append(0) - if len(imgs_human) < 4: - imgs_human.append(row["image"]) - 
except Exception: - pass - - for row in tqdm(ai_ds, desc="AI art"): - try: - features.append(extractor(row["image"])) - labels.append(1) - if len(imgs_ai) < 4: - imgs_ai.append(row["image"]) - except Exception: - pass - - df = pd.DataFrame(features).fillna(0) - X = np.where(np.isfinite(df.to_numpy(dtype=np.float64)), df.to_numpy(dtype=np.float64), 0) - y = np.array(labels) - return X, y, list(df.columns), imgs_human, imgs_ai - - -def run_ensemble_cv(X, y): - """Run calibrated ensemble with abstention.""" - scaler = StandardScaler() - X_s = scaler.fit_transform(X) - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - - # Individual models (calibrated with Platt scaling) - models = { - "SVM": CalibratedClassifierCV(SVC(C=10, gamma="scale", kernel="rbf", random_state=SEED), cv=3, method="sigmoid"), - "MLP": CalibratedClassifierCV(MLPClassifier(hidden_layer_sizes=(100,), activation="relu", max_iter=1000, random_state=SEED), cv=3, method="sigmoid"), - } - - # Collect per-model CV predictions - model_probs = {} - model_preds = {} - for name, model in models.items(): - probs = cross_val_predict(model, X_s, y, cv=skf, method="predict_proba")[:, 1] - model_probs[name] = probs - model_preds[name] = (probs > 0.5).astype(int) - - # XGBoost (already outputs calibrated probabilities) - xgb_probs = np.zeros(len(y)) - for train_idx, test_idx in skf.split(X_s, y): - params = {"objective": "binary:logistic", "max_depth": 4, "learning_rate": 0.1, - "subsample": 0.8, "colsample_bytree": 0.8, "seed": SEED, "eval_metric": "logloss"} - dtrain = xgb.DMatrix(X_s[train_idx], label=y[train_idx]) - dtest = xgb.DMatrix(X_s[test_idx]) - model = xgb.train(params, dtrain, num_boost_round=200, - evals=[(xgb.DMatrix(X_s[test_idx], label=y[test_idx]), "test")], - early_stopping_rounds=10, verbose_eval=False) - xgb_probs[test_idx] = model.predict(dtest) - - model_probs["XGBoost"] = xgb_probs - model_preds["XGBoost"] = (xgb_probs > 0.5).astype(int) - - # Ensemble: average 
calibrated probabilities - ensemble_probs = np.mean([model_probs[n] for n in model_probs], axis=0) - - # Abstention: if ensemble confidence < threshold, mark as uncertain - ABSTAIN_THRESH = 0.3 # abstain if prob between 0.3 and 0.7 - ensemble_preds = np.full(len(y), -1) # -1 = uncertain - ensemble_preds[ensemble_probs > (1 - ABSTAIN_THRESH)] = 1 # AI - ensemble_preds[ensemble_probs < ABSTAIN_THRESH] = 0 # Human - - # Per-model metrics - results = {} - for name in model_probs: - pred = model_preds[name] - results[name] = { - "accuracy": accuracy_score(y, pred), - "precision": precision_score(y, pred, zero_division=0), - "recall": recall_score(y, pred, zero_division=0), - "f1": f1_score(y, pred, average="macro"), - "roc_auc": roc_auc_score(y, model_probs[name]), - "probs": model_probs[name], - } - - # Ensemble metrics (excluding abstained samples) - confident_mask = ensemble_preds >= 0 - n_abstained = int((~confident_mask).sum()) - if confident_mask.sum() > 0: - results["Ensemble"] = { - "accuracy": accuracy_score(y[confident_mask], ensemble_preds[confident_mask]), - "precision": precision_score(y[confident_mask], ensemble_preds[confident_mask], zero_division=0), - "recall": recall_score(y[confident_mask], ensemble_preds[confident_mask], zero_division=0), - "f1": f1_score(y[confident_mask], ensemble_preds[confident_mask], average="macro"), - "roc_auc": roc_auc_score(y, ensemble_probs), - "probs": ensemble_probs, - "n_abstained": n_abstained, - "n_classified": int(confident_mask.sum()), - } - - # Feature importance (full XGBoost model) - feature_names = [f"f{i}" for i in range(X.shape[1])] - dtrain_full = xgb.DMatrix(X_s, label=y, feature_names=feature_names) - full_model = xgb.train({"objective": "binary:logistic", "max_depth": 4, "seed": SEED}, - dtrain_full, num_boost_round=100, verbose_eval=False) - - return results, ensemble_probs, ensemble_preds, full_model - - -def generate_pdf(X, y, feature_names, results, ensemble_probs, ensemble_preds, - model, imgs_human, 
imgs_ai): - OUTPUT_DIR.mkdir(exist_ok=True) - pdf_path = OUTPUT_DIR / "artwork_detection_results.pdf" - - with PdfPages(str(pdf_path)) as pdf: - # ===== PAGE 1: Title + Results Table ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - ax = fig.add_axes([0, 0, 1, 1]) - ax.axis("off") - - ax.text(0.5, 0.92, "AI-Generated Artwork Detection", fontsize=22, fontweight="bold", - ha="center", fontfamily="serif", transform=ax.transAxes) - ax.text(0.5, 0.87, "Multi-Signal Ensemble with Calibrated Thresholds", - fontsize=12, ha="center", fontfamily="serif", style="italic", transform=ax.transAxes) - ax.text(0.5, 0.83, f"negate project | {datetime.now().strftime('%B %d, %Y')}", - fontsize=10, ha="center", fontfamily="serif", transform=ax.transAxes) - - # Results table - ax_table = fig.add_axes([0.08, 0.52, 0.84, 0.26]) - ax_table.axis("off") - - table_data = [] - for name, r in results.items(): - row = [name, f"{r['accuracy']:.1%}", f"{r['precision']:.1%}", - f"{r['recall']:.1%}", f"{r['f1']:.1%}", f"{r['roc_auc']:.4f}"] - if name == "Ensemble": - row.append(f"{r['n_abstained']}") - else: - row.append("-") - table_data.append(row) - table_data.append(["Existing negate", "63.3%", "--", "--", "--", "0.669", "-"]) - - table = ax_table.table( - cellText=table_data, - colLabels=["Model", "Accuracy", "Precision", "Recall", "F1", "AUC", "Abstained"], - loc="center", cellLoc="center", - ) - table.auto_set_font_size(False) - table.set_fontsize(8.5) - table.scale(1, 1.6) - for (row, col), cell in table.get_celld().items(): - if row == 0: - cell.set_facecolor("#2E4057") - cell.set_text_props(color="white", fontweight="bold") - elif row == len(table_data): - cell.set_facecolor("#FFE0E0") - - # Summary text - ens = results.get("Ensemble", {}) - summary = ( - "Approach\n\n" - f" Features: {X.shape[1]} (39 artwork + 10 frequency analysis)\n" - f" Dataset: {np.sum(y==0)} human artworks (WikiArt) + {np.sum(y==1)} AI images\n" - f" CV: {N_FOLDS}-fold stratified 
cross-validation\n\n" - " Three calibrated classifiers (SVM, MLP, XGBoost) vote via averaged\n" - " probabilities. Images where ensemble confidence is between 30-70%\n" - f" are marked 'uncertain' ({ens.get('n_abstained', 0)} images abstained).\n\n" - " Precision = of images flagged AI, how many actually are\n" - " Recall = of actual AI images, how many were caught" - ) - ax.text(0.08, 0.48, summary, fontsize=9, ha="left", va="top", fontfamily="serif", - transform=ax.transAxes) - - # Key findings - findings = ( - "Key Findings\n\n" - f" 1. Ensemble achieves {ens.get('precision', 0):.1%} precision, " - f"{ens.get('recall', 0):.1%} recall on classified images\n" - f" 2. {ens.get('n_abstained', 0)} uncertain images abstained from " - f"(reduces false positives)\n" - f" 3. +{(ens.get('accuracy', 0) - 0.633)*100:.1f}pp improvement over " - "existing negate pipeline (63.3%)\n" - f" 4. Frequency features (FFT/DCT) add spectral artifact detection\n" - " 5. All processing is CPU-only, ~12 images/sec" - ) - ax.text(0.08, 0.24, findings, fontsize=9, ha="left", va="top", fontfamily="serif", - transform=ax.transAxes, - bbox=dict(boxstyle="round,pad=0.4", facecolor="#E8F5E9", edgecolor="#66BB6A")) - - pdf.savefig(fig) - plt.close(fig) - - # ===== PAGE 2: ROC + PR curves + Confusion Matrix ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - fig.suptitle("Detection Performance Analysis", fontsize=14, - fontweight="bold", fontfamily="serif", y=0.96) - - # ROC curves - ax_roc = fig.add_axes([0.08, 0.62, 0.4, 0.28]) - colors = {"SVM": "#4472C4", "MLP": "#ED7D31", "XGBoost": "#70AD47", "Ensemble": "#C00000"} - for name, r in results.items(): - fpr, tpr, _ = roc_curve(y, r["probs"]) - ax_roc.plot(fpr, tpr, color=colors.get(name, "gray"), linewidth=2, - label=f"{name} ({r['roc_auc']:.3f})") - ax_roc.plot([0, 1], [0, 1], "k--", alpha=0.3) - ax_roc.set_xlabel("False Positive Rate", fontsize=9) - ax_roc.set_ylabel("True Positive Rate", fontsize=9) - 
ax_roc.set_title("ROC Curves", fontsize=10, fontfamily="serif") - ax_roc.legend(fontsize=7, loc="lower right") - ax_roc.grid(True, alpha=0.2) - - # Precision-Recall curves - ax_pr = fig.add_axes([0.55, 0.62, 0.4, 0.28]) - for name, r in results.items(): - prec_curve, rec_curve, _ = precision_recall_curve(y, r["probs"]) - ax_pr.plot(rec_curve, prec_curve, color=colors.get(name, "gray"), linewidth=2, - label=name) - ax_pr.set_xlabel("Recall", fontsize=9) - ax_pr.set_ylabel("Precision", fontsize=9) - ax_pr.set_title("Precision-Recall Curves", fontsize=10, fontfamily="serif") - ax_pr.legend(fontsize=7) - ax_pr.grid(True, alpha=0.2) - - # Ensemble confusion matrix - ax_cm = fig.add_axes([0.08, 0.28, 0.35, 0.26]) - confident = ensemble_preds >= 0 - if confident.sum() > 0: - cm = confusion_matrix(y[confident], ensemble_preds[confident]) - im = ax_cm.imshow(cm, cmap="Blues") - ax_cm.set_xticks([0, 1]) - ax_cm.set_yticks([0, 1]) - ax_cm.set_xticklabels(["Human", "AI"], fontsize=9) - ax_cm.set_yticklabels(["Human", "AI"], fontsize=9) - ax_cm.set_xlabel("Predicted", fontsize=9) - ax_cm.set_ylabel("Actual", fontsize=9) - ax_cm.set_title("Ensemble (confident only)", fontsize=10, fontfamily="serif") - for i in range(2): - for j in range(2): - ax_cm.text(j, i, str(cm[i, j]), ha="center", va="center", fontsize=16, - fontweight="bold", color="white" if cm[i, j] > cm.max()/2 else "black") - - # Probability distribution - ax_hist = fig.add_axes([0.55, 0.28, 0.4, 0.26]) - human_probs = ensemble_probs[y == 0] - ai_probs = ensemble_probs[y == 1] - ax_hist.hist(human_probs, bins=20, alpha=0.6, color="#4472C4", label="Human art", density=True) - ax_hist.hist(ai_probs, bins=20, alpha=0.6, color="#ED7D31", label="AI art", density=True) - ax_hist.axvline(x=0.3, color="red", linestyle="--", alpha=0.5, label="Abstain zone") - ax_hist.axvline(x=0.7, color="red", linestyle="--", alpha=0.5) - ax_hist.axvspan(0.3, 0.7, alpha=0.1, color="red") - ax_hist.set_xlabel("Ensemble Probability (AI)", 
fontsize=9) - ax_hist.set_ylabel("Density", fontsize=9) - ax_hist.set_title("Probability Distribution", fontsize=10, fontfamily="serif") - ax_hist.legend(fontsize=7) - - # Per-model agreement analysis - ax_agree = fig.add_axes([0.08, 0.04, 0.84, 0.18]) - ax_agree.axis("off") - n_all_agree = sum(1 for i in range(len(y)) - if len(set(results[n]["probs"][i] > 0.5 for n in ["SVM", "MLP", "XGBoost"])) == 1) - n_disagree = len(y) - n_all_agree - agree_text = ( - "Model Agreement Analysis\n\n" - f" All 3 models agree: {n_all_agree}/{len(y)} ({n_all_agree/len(y):.0%})\n" - f" At least 1 disagrees: {n_disagree}/{len(y)} ({n_disagree/len(y):.0%})\n\n" - " When models disagree, the ensemble uses averaged probability with\n" - " abstention zone (0.3-0.7). This reduces false positives at the cost\n" - " of some unclassified images -- a deliberate tradeoff for precision." - ) - ax_agree.text(0, 1, agree_text, fontsize=9, ha="left", va="top", fontfamily="serif", - transform=ax_agree.transAxes) - - pdf.savefig(fig) - plt.close(fig) - - # ===== PAGE 3: Feature Analysis + Examples ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - fig.suptitle("Feature Analysis & Examples", fontsize=14, - fontweight="bold", fontfamily="serif", y=0.96) - - # Example images - n = min(4, len(imgs_human), len(imgs_ai)) - gs = gridspec.GridSpec(2, n, top=0.9, bottom=0.65, left=0.05, right=0.95, hspace=0.2, wspace=0.1) - for i in range(n): - ax = fig.add_subplot(gs[0, i]) - ax.imshow(imgs_human[i]) - ax.set_title(f"Human #{i+1}", fontsize=8) - ax.axis("off") - for i in range(n): - ax = fig.add_subplot(gs[1, i]) - ax.imshow(imgs_ai[i]) - ax.set_title(f"AI #{i+1}", fontsize=8) - ax.axis("off") - - # Feature importance - ax_imp = fig.add_axes([0.12, 0.08, 0.76, 0.5]) - importance = model.get_score(importance_type="gain") - sorted_imp = sorted(importance.items(), key=lambda x: x[1], reverse=True)[:20] - if sorted_imp: - # Map back to real feature names - disp_names = [] - for 
fname, _ in sorted_imp: - idx = int(fname[1:]) # f0 -> 0 - disp_names.append(feature_names[idx] if idx < len(feature_names) else fname) - disp_names = disp_names[::-1] - gains = [x[1] for x in sorted_imp][::-1] - - color_map = {"fft": "#C00000", "dct": "#C00000", - "hog": "#ED7D31", "edge": "#ED7D31", - "lbp": "#70AD47", "contrast": "#70AD47", "correlation": "#70AD47", - "energy": "#70AD47", "homogeneity": "#70AD47"} - bar_colors = [] - for n in disp_names: - c = "#4472C4" # default - for prefix, color in color_map.items(): - if prefix in n: - c = color - break - bar_colors.append(c) - - ax_imp.barh(range(len(disp_names)), gains, color=bar_colors) - ax_imp.set_yticks(range(len(disp_names))) - ax_imp.set_yticklabels(disp_names, fontsize=7) - ax_imp.set_xlabel("XGBoost Gain", fontsize=9) - ax_imp.set_title("Top 20 Features by Importance", fontsize=10, fontfamily="serif") - - legend_elements = [ - Patch(facecolor="#C00000", label="Frequency (FFT/DCT)"), - Patch(facecolor="#ED7D31", label="Shape (HOG/edges)"), - Patch(facecolor="#70AD47", label="Texture (GLCM/LBP)"), - Patch(facecolor="#4472C4", label="Color/Brightness/Noise"), - ] - ax_imp.legend(handles=legend_elements, fontsize=7, loc="lower right") - - pdf.savefig(fig) - plt.close(fig) - - # ===== PAGE 4: Methodology & Architecture ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - ax = fig.add_axes([0, 0, 1, 1]) - ax.axis("off") - - ax.text(0.5, 0.95, "Architecture & Methodology", fontsize=14, - fontweight="bold", ha="center", fontfamily="serif", transform=ax.transAxes) - - method_text = ( - "Multi-Signal Ensemble Architecture\n\n" - "The detection system combines three orthogonal classifiers, each seeing the\n" - "same feature space but learning different decision boundaries:\n\n" - " 1. SVM (RBF kernel) - Finds nonlinear decision boundaries in feature space.\n" - " Calibrated with Platt scaling (sigmoid) for reliable probabilities.\n\n" - " 2. 
MLP (100 hidden units) - Learns feature interactions through backpropagation.\n" - " Calibrated with Platt scaling for probability alignment.\n\n" - " 3. XGBoost (gradient boosted trees) - Captures feature thresholds and\n" - " interactions. Naturally outputs calibrated log-odds.\n\n" - "Ensemble Voting: Averaged calibrated probabilities from all three models.\n" - "This is more robust than majority voting because it accounts for confidence.\n\n" - "Calibrated Confidence & Abstention\n\n" - "Instead of a hard 0.5 threshold, the ensemble uses a deliberate 'uncertain'\n" - "zone between 0.3 and 0.7 probability. Images in this zone are marked as\n" - "'uncertain' rather than forced into a class. This dramatically improves\n" - "precision on the images that ARE classified.\n\n" - "Feature Extraction Pipeline (49 features, CPU-only)\n\n" - " Brightness (2) - Global luminance statistics\n" - " Color (23) - RGB/HSV histogram moments (mean, var, kurtosis, skew, entropy)\n" - " Texture (6) - GLCM co-occurrence + LBP local patterns\n" - " Shape (6) - HOG gradient histograms + Canny edge density\n" - " Noise (2) - Estimated noise entropy + signal-to-noise ratio\n" - " Frequency (10) - FFT radial band energies, spectral centroid, phase coherence,\n" - " DCT AC/DC ratio, high-freq energy, coefficient sparsity\n\n" - "The frequency branch is the key addition beyond Li & Stamp (2025). AI generators\n" - "leave characteristic spectral signatures from upsampling layers, attention patterns,\n" - "and latent space decoding. 
These are invisible in pixel space but clearly visible\n" - "in the frequency domain.\n\n" - "Limitations\n\n" - " - Tested on mismatched subjects (WikiArt paintings vs AI banana images)\n" - " - Not yet tested on hard negatives (polished digital art, img2img, LoRA art)\n" - " - Single generator family in AI training data (Stable Diffusion variants)\n" - " - 200 sample dataset is small for robust conclusions\n\n" - "References\n\n" - " [1] Li & Stamp, 'Detecting AI-generated Artwork', arXiv:2504.07078, 2025\n" - " [2] negate project, github.com/darkshapes/negate" - ) - ax.text(0.06, 0.9, method_text, fontsize=8.5, ha="left", va="top", fontfamily="serif", - transform=ax.transAxes) - - pdf.savefig(fig) - plt.close(fig) - - print(f"PDF saved: {pdf_path}") - return pdf_path - - -def main(): - print("=" * 55) - print(" ARTWORK DETECTION - ENSEMBLE RESULTS") - print("=" * 55) - - X, y, names, imgs_h, imgs_a = load_and_extract() - print(f"Dataset: {np.sum(y==0)} human + {np.sum(y==1)} AI, {X.shape[1]} features") - - results, ens_probs, ens_preds, model = run_ensemble_cv(X, y) - - print(f"\n{'Model':<15} {'Acc':>8} {'Prec':>8} {'Rec':>8} {'F1':>8} {'AUC':>8}") - print("-" * 55) - for name, r in results.items(): - extra = f" ({r.get('n_abstained', '-')} abstained)" if 'n_abstained' in r else "" - print(f"{name:<15} {r['accuracy']:>7.1%} {r['precision']:>7.1%} {r['recall']:>7.1%} " - f"{r['f1']:>7.1%} {r['roc_auc']:>7.4f}{extra}") - - generate_pdf(X, y, names, results, ens_probs, ens_preds, model, imgs_h, imgs_a) - print("Done.") - - -if __name__ == "__main__": - main() diff --git a/tests/test_adversarial_robustness.py b/tests/test_adversarial_robustness.py new file mode 100644 index 0000000..abe4e7d --- /dev/null +++ b/tests/test_adversarial_robustness.py @@ -0,0 +1,348 @@ +"""Adversarial robustness and false positive analysis. + +Tests whether the 148-feature detector holds up under: +1. Post-processing attacks (JPEG, noise, resize, blur, crop) +2. 
False positive stress test (digital art, ambiguous content) +3. Threshold calibration (precision-recall tradeoff) +4. Social media simulation (combined degradation) + +Uses ImagiNet paintings data (already on disk). +""" + +from __future__ import annotations + +import json +import warnings +from io import BytesIO +from pathlib import Path + +import lightgbm as lgb +import numpy as np +from PIL import Image, ImageFilter, ImageEnhance +from sklearn.metrics import ( + accuracy_score, f1_score, precision_recall_curve, precision_score, + recall_score, roc_auc_score, +) +from sklearn.model_selection import StratifiedKFold +from tqdm import tqdm + +warnings.filterwarnings("ignore") + +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) +from negate.extract.feature_artwork import ArtworkExtract + +ext = ArtworkExtract() +BASE = Path("D:/Projects/negate/negate/.datasets/imaginet/extracted") +SEED = 42 +N_IMAGES = 300 # per class for speed +rng = np.random.RandomState(SEED) + + +def load_image_paths(path: Path, recursive: bool, max_n: int) -> list[Path]: + exts = {".jpg", ".jpeg", ".png", ".webp", ".bmp"} + if recursive: + files = [f for f in path.rglob("*") if f.suffix.lower() in exts] + else: + files = [f for f in path.iterdir() if f.suffix.lower() in exts] + if len(files) > max_n: + files = list(rng.choice(files, max_n, replace=False)) + return files + + +def extract_features_from_images(images: list[Image.Image], desc: str = "") -> np.ndarray: + rows = [] + for img in tqdm(images, desc=desc, leave=False): + try: + feat = ext(img) + rows.append(list(feat.values())) + except Exception: + rows.append([0.0] * 148) + X = np.array(rows, dtype=np.float64) + return np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + + +def extract_features_from_paths(files: list[Path], desc: str = "") -> np.ndarray: + images = [] + for f in files: + try: + images.append(Image.open(f).convert("RGB")) + except Exception: + pass + return extract_features_from_images(images, desc) + + 
+# ---- Perturbation functions ---- + +def jpeg_compress(img: Image.Image, quality: int) -> Image.Image: + buf = BytesIO() + img.save(buf, format="JPEG", quality=quality) + buf.seek(0) + return Image.open(buf).convert("RGB") + + +def add_gaussian_noise(img: Image.Image, sigma: float) -> Image.Image: + arr = np.array(img, dtype=np.float64) + noise = rng.normal(0, sigma, arr.shape) + arr = np.clip(arr + noise, 0, 255).astype(np.uint8) + return Image.fromarray(arr) + + +def resize_down_up(img: Image.Image, small_size: int) -> Image.Image: + orig_size = img.size + img_small = img.resize((small_size, small_size), Image.BICUBIC) + return img_small.resize(orig_size, Image.BICUBIC) + + +def center_crop(img: Image.Image, ratio: float = 0.7) -> Image.Image: + w, h = img.size + new_w, new_h = int(w * ratio), int(h * ratio) + left = (w - new_w) // 2 + top = (h - new_h) // 2 + return img.crop((left, top, left + new_w, top + new_h)) + + +def gaussian_blur(img: Image.Image, radius: float) -> Image.Image: + return img.filter(ImageFilter.GaussianBlur(radius=radius)) + + +def adjust_brightness(img: Image.Image, factor: float) -> Image.Image: + return ImageEnhance.Brightness(img).enhance(factor) + + +def social_media_sim(img: Image.Image) -> Image.Image: + """Simulate social media pipeline: resize down, JPEG, slight blur.""" + img = img.resize((1024, 1024), Image.BICUBIC) + img = jpeg_compress(img, 75) + img = gaussian_blur(img, 0.5) + return img + + +PERTURBATIONS = { + "jpeg_q30": lambda img: jpeg_compress(img, 30), + "jpeg_q50": lambda img: jpeg_compress(img, 50), + "jpeg_q70": lambda img: jpeg_compress(img, 70), + "noise_s5": lambda img: add_gaussian_noise(img, 5), + "noise_s15": lambda img: add_gaussian_noise(img, 15), + "noise_s30": lambda img: add_gaussian_noise(img, 30), + "resize_128": lambda img: resize_down_up(img, 128), + "resize_64": lambda img: resize_down_up(img, 64), + "crop_70pct": lambda img: center_crop(img, 0.7), + "crop_50pct": lambda img: center_crop(img, 0.5), 
+ "blur_r1": lambda img: gaussian_blur(img, 1), + "blur_r3": lambda img: gaussian_blur(img, 3), + "bright_0.7": lambda img: adjust_brightness(img, 0.7), + "bright_1.3": lambda img: adjust_brightness(img, 1.3), + "social_media": social_media_sim, +} + + +def main(): + print("=" * 70) + print(" ADVERSARIAL ROBUSTNESS & FALSE POSITIVE ANALYSIS") + print(" 148 features, LightGBM, ImagiNet paintings") + print("=" * 70) + + # Load real and fake images + print("\nLoading images...") + real_files = load_image_paths(BASE / "wikiart", recursive=True, max_n=N_IMAGES) + # Mix of generators for fake + fake_files = [] + for gen_dir in ["sdxl_paintings_fake", "sd_paintings_fake", "dalle3", "journeydb"]: + fake_files.extend(load_image_paths(BASE / gen_dir, recursive=False, max_n=N_IMAGES // 4)) + rng.shuffle(fake_files) + fake_files = fake_files[:N_IMAGES] + + print(f" Real: {len(real_files)}, Fake: {len(fake_files)}") + + real_images = [Image.open(f).convert("RGB") for f in tqdm(real_files, desc="Loading real")] + fake_images = [Image.open(f).convert("RGB") for f in tqdm(fake_files, desc="Loading fake")] + + # Extract clean features and train model + print("\nExtracting clean features...") + X_real = extract_features_from_images(real_images, "Real features") + X_fake = extract_features_from_images(fake_images, "Fake features") + + X = np.vstack([X_real, X_fake]) + y = np.concatenate([np.zeros(len(X_real)), np.ones(len(X_fake))]) + + # Train on 70%, test perturbations on 30% + n = len(y) + idx = rng.permutation(n) + split = int(0.7 * n) + train_idx, test_idx = idx[:split], idx[split:] + + model = lgb.LGBMClassifier( + n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED, + ) + model.fit(X[train_idx], y[train_idx]) + + # Clean baseline + y_prob_clean = model.predict_proba(X[test_idx])[:, 1] + y_pred_clean = (y_prob_clean > 0.5).astype(int) + clean_acc = accuracy_score(y[test_idx], y_pred_clean) + clean_auc = 
roc_auc_score(y[test_idx], y_prob_clean) + print(f"\nClean baseline: acc={clean_acc:.4f} auc={clean_auc:.4f}") + + # ---- EXP 1: Perturbation robustness ---- + print("\n" + "=" * 70) + print(" EXP 1: Adversarial Robustness (perturbations on test set)") + print("=" * 70) + + # Get the test images + test_images_real = [real_images[i] for i in range(len(real_images)) if i in set(test_idx[test_idx < len(real_images)])] + test_images_fake = [fake_images[i - len(real_images)] for i in test_idx if i >= len(real_images)] + test_images = test_images_real + test_images_fake + test_labels = np.concatenate([np.zeros(len(test_images_real)), np.ones(len(test_images_fake))]) + + results_perturb = {"clean": {"acc": clean_acc, "auc": clean_auc}} + + for pert_name, pert_fn in PERTURBATIONS.items(): + print(f"\n Applying {pert_name}...") + perturbed = [] + for img in tqdm(test_images, desc=f" {pert_name}", leave=False): + try: + perturbed.append(pert_fn(img)) + except Exception: + perturbed.append(img) # fallback to original + + X_pert = extract_features_from_images(perturbed, f" {pert_name} features") + y_prob_pert = model.predict_proba(X_pert)[:, 1] + y_pred_pert = (y_prob_pert > 0.5).astype(int) + + acc = accuracy_score(test_labels, y_pred_pert) + auc = roc_auc_score(test_labels, y_prob_pert) + delta = acc - clean_acc + results_perturb[pert_name] = {"acc": float(acc), "auc": float(auc), "delta": float(delta)} + print(f" acc={acc:.4f} auc={auc:.4f} delta={delta:+.4f}") + + # Summary table + print("\n +---------------------+--------+--------+---------+") + print(" | Perturbation | Acc | AUC | Delta |") + print(" +---------------------+--------+--------+---------+") + for name, r in sorted(results_perturb.items(), key=lambda x: -x[1]["acc"]): + print(f" | {name:19s} | {r['acc']:.4f} | {r['auc']:.4f} | {r.get('delta', 0):+.4f} |") + print(" +---------------------+--------+--------+---------+") + + # ---- EXP 2: Threshold calibration ---- + print("\n" + "=" * 70) + print(" EXP 2: 
Threshold Calibration (Precision-Recall Tradeoff)") + print("=" * 70) + + # Use full 5-fold CV probabilities for calibration + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED) + all_probs, all_labels = [], [] + for tr, te in skf.split(X, y): + m = lgb.LGBMClassifier( + n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED, + ) + m.fit(X[tr], y[tr]) + all_probs.extend(m.predict_proba(X[te])[:, 1]) + all_labels.extend(y[te]) + + all_probs = np.array(all_probs) + all_labels = np.array(all_labels) + + prec_arr, rec_arr, thresholds = precision_recall_curve(all_labels, all_probs) + + # Find thresholds for different precision targets + print("\n Threshold analysis (higher threshold = fewer false positives):") + print(" +------------+--------+-----------+--------+---------+") + print(" | Threshold | Prec | Recall | F1 | FP Rate |") + print(" +------------+--------+-----------+--------+---------+") + + results_thresh = {} + for target_thresh in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]: + pred = (all_probs >= target_thresh).astype(int) + if pred.sum() == 0: + continue + prec = precision_score(all_labels, pred, zero_division=0) + rec = recall_score(all_labels, pred, zero_division=0) + f1 = f1_score(all_labels, pred, average="macro", zero_division=0) + # False positive rate: real images incorrectly flagged as AI + real_mask = all_labels == 0 + fp_rate = float(pred[real_mask].mean()) + + results_thresh[str(target_thresh)] = { + "precision": float(prec), "recall": float(rec), + "f1": float(f1), "fp_rate": float(fp_rate), + } + print(f" | {target_thresh:10.2f} | {prec:.4f} | {rec:9.4f} | {f1:.4f} | {fp_rate:7.4f} |") + print(" +------------+--------+-----------+--------+---------+") + + # Find the sweet spot: highest F1 with FP rate < 5% + best_thresh = None + best_f1 = 0 + for t_str, r in results_thresh.items(): + if r["fp_rate"] <= 0.05 and r["f1"] > best_f1: + best_f1 = r["f1"] + best_thresh = t_str + if 
best_thresh: + print(f"\n Recommended threshold: {best_thresh} (F1={results_thresh[best_thresh]['f1']:.4f}, FP rate={results_thresh[best_thresh]['fp_rate']:.4f})") + + # ---- EXP 3: Social media simulation ---- + print("\n" + "=" * 70) + print(" EXP 3: Social Media Pipeline Simulation") + print("=" * 70) + print(" (Resize to 1024px + JPEG Q75 + slight blur)") + # Already in perturbation results + sm = results_perturb.get("social_media", {}) + print(f" Social media accuracy: {sm.get('acc', 'N/A')}") + print(f" Social media AUC: {sm.get('auc', 'N/A')}") + print(f" Delta from clean: {sm.get('delta', 'N/A')}") + + # ---- EXP 4: Worst-case adversarial (combined attacks) ---- + print("\n" + "=" * 70) + print(" EXP 4: Worst-Case Adversarial (Combined Attacks)") + print("=" * 70) + + def worst_case_attack(img: Image.Image) -> Image.Image: + """JPEG Q30 + resize 128→orig + noise σ=10 + blur r=1""" + img = jpeg_compress(img, 30) + img = resize_down_up(img, 128) + img = add_gaussian_noise(img, 10) + img = gaussian_blur(img, 1) + return img + + perturbed_worst = [worst_case_attack(img) for img in tqdm(test_images, desc="Worst case")] + X_worst = extract_features_from_images(perturbed_worst, "Worst case features") + y_prob_worst = model.predict_proba(X_worst)[:, 1] + y_pred_worst = (y_prob_worst > 0.5).astype(int) + worst_acc = accuracy_score(test_labels, y_pred_worst) + worst_auc = roc_auc_score(test_labels, y_prob_worst) + print(f" Worst-case accuracy: {worst_acc:.4f}") + print(f" Worst-case AUC: {worst_auc:.4f}") + print(f" Delta from clean: {worst_acc - clean_acc:+.4f}") + + # Save all results + all_results = { + "clean_baseline": {"acc": float(clean_acc), "auc": float(clean_auc)}, + "perturbation_robustness": {k: {kk: float(vv) for kk, vv in v.items()} for k, v in results_perturb.items()}, + "threshold_calibration": results_thresh, + "recommended_threshold": best_thresh, + "worst_case_adversarial": {"acc": float(worst_acc), "auc": float(worst_auc)}, + "n_real": 
len(real_files), + "n_fake": len(fake_files), + } + + out_path = Path(__file__).parent.parent / "results" / "adversarial_robustness_results.json" + with open(out_path, "w") as f: + json.dump(all_results, f, indent=2) + + print(f"\n{'='*70}") + print(f" SUMMARY") + print(f"{'='*70}") + print(f" Clean accuracy: {clean_acc:.4f}") + print(f" Social media: {sm.get('acc', 'N/A')}") + print(f" Worst-case attack: {worst_acc:.4f}") + print(f" Recommended thresh: {best_thresh}") + if best_thresh: + r = results_thresh[best_thresh] + print(f" → Precision: {r['precision']:.4f}, Recall: {r['recall']:.4f}, FP rate: {r['fp_rate']:.4f}") + print(f"\n Results saved to {out_path}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_artwork_accuracy.py b/tests/test_artwork_accuracy.py index 75ba27a..d191d63 100644 --- a/tests/test_artwork_accuracy.py +++ b/tests/test_artwork_accuracy.py @@ -96,7 +96,7 @@ def test_feature_extraction(self, benchmark_data): print(f"Features: {benchmark_data['X'].shape[1]}") assert benchmark_data["n_human"] >= 50 assert benchmark_data["n_ai"] >= 50 - assert benchmark_data["X"].shape[1] == 49 + assert benchmark_data["X"].shape[1] == 73 def test_svm_cross_validation(self, benchmark_data): """SVM with RBF kernel — paper's best binary model (97.9% reported).""" diff --git a/tests/test_clip_bias_defactify.py b/tests/test_clip_bias_defactify.py new file mode 100644 index 0000000..658c959 --- /dev/null +++ b/tests/test_clip_bias_defactify.py @@ -0,0 +1,318 @@ +# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 +"""CLIP bias test on Defactify MS-COCOAI dataset. + +Tests whether CLIP's detection advantage comes from recognizing its own +latent fingerprint in images from CLIP-based generators. 
+ +Dataset: Rajarshi-Roy-research/Defactify_Image_Dataset (96K images) +- Label_B=0: Real (MS COCO) +- Label_B=1: SD 2.1 (uses CLIP) +- Label_B=2: SDXL (uses CLIP-L + CLIP-G) +- Label_B=3: SD 3 (uses CLIP-L + CLIP-G + T5) +- Label_B=4: Midjourney v6 (proprietary, unknown) +- Label_B=5: DALL-E 3 (uses T5, NOT CLIP) + +Key comparison: CLIP accuracy on SD 2.1/SDXL (pure CLIP) vs DALL-E 3 (no CLIP). +If CLIP's advantage is larger on CLIP-based generators, bias is confirmed. +""" + +from __future__ import annotations + +import json +import sys +import time +from datetime import datetime +from pathlib import Path + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from matplotlib.backends.backend_pdf import PdfPages +import numpy as np +import pandas as pd +import torch +import xgboost as xgb +from datasets import load_dataset, Image as HFImage +from PIL import Image +from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score +from sklearn.model_selection import StratifiedKFold +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from tqdm import tqdm + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from negate.extract.feature_artwork import ArtworkExtract +from negate.extract.feature_style import StyleExtract + +SEED = 42 +N_FOLDS = 5 +N_PER_CLASS = 500 # per generator +RESULTS_DIR = Path(__file__).parent.parent / "results" + +GENERATORS = { + 0: {"name": "Real (MS COCO)", "uses_clip": None}, + 1: {"name": "SD 2.1", "uses_clip": True}, + 2: {"name": "SDXL", "uses_clip": True}, + 3: {"name": "SD 3", "uses_clip": True}, # hybrid: CLIP + T5 + 4: {"name": "Midjourney v6", "uses_clip": "unknown"}, + 5: {"name": "DALL-E 3", "uses_clip": False}, # T5 only +} + + +def extract_handcrafted(images): + art = ArtworkExtract() + style = StyleExtract() + features = [] + for img in tqdm(images, desc=" Hand-crafted"): + try: + f = art(img) + f |= style(img) + features.append(f) + except 
Exception: + features.append(None) + valid = [i for i, f in enumerate(features) if f is not None] + df = pd.DataFrame([f for f in features if f is not None]).fillna(0) + X = df.to_numpy(dtype=np.float64) + return np.where(np.isfinite(X), X, 0), valid + + +def extract_clip(images): + from transformers import CLIPModel, CLIPProcessor + model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + model.eval() + device = "cuda" if torch.cuda.is_available() else "cpu" + model = model.to(device) + + all_emb, valid = [], [] + bs = 32 + for i in tqdm(range(0, len(images), bs), desc=" CLIP"): + batch = [img for img in images[i:i+bs] if img and isinstance(img, Image.Image)] + batch_idx = [i+j for j, img in enumerate(images[i:i+bs]) if img and isinstance(img, Image.Image)] + if not batch: + continue + with torch.no_grad(): + inputs = processor(images=batch, return_tensors="pt", padding=True).to(device) + out = model.get_image_features(**inputs) + emb = out.pooler_output.cpu().numpy() if hasattr(out, 'pooler_output') else out.cpu().numpy() + all_emb.append(emb) + valid.extend(batch_idx) + return np.vstack(all_emb), valid + + +def run_cv(X, y, model_type="svm"): + skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) + all_true, all_prob = [], [] + for train_idx, test_idx in skf.split(X, y): + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + if model_type == "svm": + scaler = StandardScaler() + clf = SVC(kernel="rbf", probability=True, random_state=SEED) + clf.fit(scaler.fit_transform(X_train), y_train) + y_prob = clf.predict_proba(scaler.transform(X_test))[:, 1] + elif model_type == "xgb": + spw = np.sum(y_train == 0) / max(np.sum(y_train == 1), 1) + dtrain = xgb.DMatrix(X_train, label=y_train) + dtest = xgb.DMatrix(X_test, label=y_test) + m = xgb.train({"objective": "binary:logistic", "max_depth": 5, + "learning_rate": 0.1, 
"scale_pos_weight": spw, "seed": SEED}, + dtrain, num_boost_round=200, evals=[(dtest, "t")], + early_stopping_rounds=10, verbose_eval=False) + y_prob = m.predict(dtest) + all_true.extend(y_test) + all_prob.extend(y_prob) + + yt, yp = np.array(all_true), np.array(all_prob) + ypr = (yp > 0.5).astype(int) + return { + "accuracy": float(accuracy_score(yt, ypr)), + "precision": float(precision_score(yt, ypr, zero_division=0)), + "recall": float(recall_score(yt, ypr, zero_division=0)), + "roc_auc": float(roc_auc_score(yt, yp)), + } + + +def generate_pdf(results): + RESULTS_DIR.mkdir(exist_ok=True) + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + pdf_path = RESULTS_DIR / f"clip_bias_defactify_{ts}.pdf" + + with PdfPages(str(pdf_path)) as pdf: + fig = plt.figure(figsize=(8.5, 11)) + fig.patch.set_facecolor("white") + ax = fig.add_axes([0, 0, 1, 1]) + ax.axis("off") + + ax.text(0.5, 0.93, "CLIP Bias Analysis:\nDefactify MS-COCOAI Dataset", + transform=ax.transAxes, fontsize=18, fontweight="bold", + ha="center", va="top", fontfamily="serif") + ax.text(0.5, 0.83, f"negate — darkshapes — {datetime.now().strftime('%B %d, %Y')}", + transform=ax.transAxes, fontsize=10, ha="center", fontfamily="serif", style="italic") + + hyp = ( + "Hypothesis: CLIP embeddings achieve high detection accuracy because many\n" + "generators use CLIP as their text encoder, so CLIP recognizes its own fingerprint.\n\n" + "Test: Compare CLIP vs hand-crafted feature accuracy PER GENERATOR.\n" + "If CLIP's advantage is larger on CLIP-based generators (SD 2.1, SDXL, SD 3)\n" + "than on non-CLIP generators (DALL-E 3), bias is confirmed." 
+ ) + ax.text(0.08, 0.74, hyp, transform=ax.transAxes, fontsize=9, + ha="left", va="top", fontfamily="serif", + bbox=dict(boxstyle="round,pad=0.4", facecolor="lightyellow", edgecolor="gray")) + + # Results table + table_data = [] + for r in results: + table_data.append([ + r["generator"], "Yes" if r["uses_clip"] is True else "No" if r["uses_clip"] is False else "?", + f"{r['handcrafted_best']:.1%}", f"{r['clip_best']:.1%}", + f"{r['clip_best'] - r['handcrafted_best']:+.1%}pp" + ]) + + ax_t = fig.add_axes([0.05, 0.38, 0.9, 0.28]) + ax_t.axis("off") + table = ax_t.table(cellText=table_data, + colLabels=["Generator", "Uses CLIP?", "Hand-crafted", "CLIP", "CLIP Advantage"], + loc="center", cellLoc="center") + table.auto_set_font_size(False) + table.set_fontsize(8.5) + table.scale(1, 1.6) + for (row, col), cell in table.get_celld().items(): + if row == 0: + cell.set_facecolor("#4472C4") + cell.set_text_props(color="white", fontweight="bold") + + # Analysis + clip_gens = [r for r in results if r["uses_clip"] is True] + non_clip = [r for r in results if r["uses_clip"] is False] + clip_avg_adv = np.mean([r["clip_best"] - r["handcrafted_best"] for r in clip_gens]) if clip_gens else 0 + non_clip_avg_adv = np.mean([r["clip_best"] - r["handcrafted_best"] for r in non_clip]) if non_clip else 0 + + if clip_gens and non_clip: + diff = clip_avg_adv - non_clip_avg_adv + if diff > 0.05: + verdict = ( + f"CLIP BIAS CONFIRMED.\n\n" + f"CLIP advantage on CLIP-based generators: {clip_avg_adv:+.1%}pp (avg)\n" + f"CLIP advantage on non-CLIP generators: {non_clip_avg_adv:+.1%}pp (avg)\n" + f"Difference: {diff:+.1%}pp\n\n" + "CLIP performs significantly better on images from generators that use\n" + "CLIP internally. This suggests CLIP partially recognizes its own latent\n" + "fingerprint rather than detecting universal generation artifacts." 
+ ) + elif diff < -0.05: + verdict = ( + f"CLIP BIAS NOT CONFIRMED (reverse pattern).\n\n" + f"CLIP advantage on CLIP-based generators: {clip_avg_adv:+.1%}pp (avg)\n" + f"CLIP advantage on non-CLIP generators: {non_clip_avg_adv:+.1%}pp (avg)\n\n" + "CLIP actually has a LARGER advantage on non-CLIP generators.\n" + "This suggests CLIP detects genuine visual artifacts, not its own fingerprint." + ) + else: + verdict = ( + f"NO SIGNIFICANT CLIP BIAS.\n\n" + f"CLIP advantage on CLIP-based generators: {clip_avg_adv:+.1%}pp (avg)\n" + f"CLIP advantage on non-CLIP generators: {non_clip_avg_adv:+.1%}pp (avg)\n" + f"Difference: {diff:+.1%}pp (not significant)\n\n" + "CLIP's advantage is consistent across generator types, suggesting\n" + "it detects genuine visual differences, not architectural fingerprints." + ) + else: + verdict = "Insufficient data." + + ax.text(0.08, 0.3, verdict, transform=ax.transAxes, fontsize=9.5, + ha="left", va="top", fontfamily="serif", + bbox=dict(boxstyle="round,pad=0.4", facecolor="#E8F5E9", edgecolor="#66BB6A")) + + pdf.savefig(fig) + plt.close(fig) + + print(f"PDF saved to: {pdf_path}") + return pdf_path + + +def main(): + print("=" * 60) + print(" CLIP BIAS ANALYSIS — Defactify MS-COCOAI") + print(" 5 generators, labeled, semantically matched") + print("=" * 60) + + ds = load_dataset("Rajarshi-Roy-research/Defactify_Image_Dataset", split="train") + ds = ds.cast_column("Image", HFImage(decode=True, mode="RGB")) + print(f"Total: {len(ds)} images") + + rng = np.random.RandomState(SEED) + real_indices = [i for i, l in enumerate(ds["Label_B"]) if l == 0] + real_sample = rng.choice(real_indices, size=N_PER_CLASS, replace=False) + real_images = [ds[int(i)]["Image"] for i in tqdm(real_sample, desc="Loading real")] + + all_results = [] + + for gen_id in [1, 2, 3, 4, 5]: + gen_info = GENERATORS[gen_id] + print(f"\n{'='*50}") + print(f" {gen_info['name']} (uses_clip={gen_info['uses_clip']}) vs Real") + print(f"{'='*50}") + + gen_indices = [i for i, l 
in enumerate(ds["Label_B"]) if l == gen_id] + gen_sample = rng.choice(gen_indices, size=N_PER_CLASS, replace=False) + gen_images = [ds[int(i)]["Image"] for i in tqdm(gen_sample, desc=f"Loading {gen_info['name']}")] + + all_images = real_images + gen_images + y = np.array([0] * len(real_images) + [1] * len(gen_images)) + + # Hand-crafted + print(" Extracting hand-crafted features...") + X_hc, hc_valid = extract_handcrafted(all_images) + y_hc = y[hc_valid] + + # CLIP + print(" Extracting CLIP features...") + X_clip, clip_valid = extract_clip(all_images) + y_clip = y[clip_valid] + + result = {"generator": gen_info["name"], "uses_clip": gen_info["uses_clip"]} + + for feat_name, X_f, y_f in [("handcrafted", X_hc, y_hc), ("clip", X_clip, y_clip)]: + for model in ["xgb", "svm"]: + key = f"{feat_name}_{model}" + r = run_cv(X_f, y_f, model) + result[key] = r + print(f" {key:25s} acc={r['accuracy']:.2%} auc={r['roc_auc']:.4f}") + + result["handcrafted_best"] = max(result["handcrafted_xgb"]["accuracy"], + result["handcrafted_svm"]["accuracy"]) + result["clip_best"] = max(result["clip_xgb"]["accuracy"], + result["clip_svm"]["accuracy"]) + result["clip_advantage"] = result["clip_best"] - result["handcrafted_best"] + all_results.append(result) + + # Save + RESULTS_DIR.mkdir(exist_ok=True) + json_path = RESULTS_DIR / "clip_bias_defactify_results.json" + with open(json_path, "w") as f: + json.dump({"timestamp": datetime.now().isoformat(), "results": all_results}, f, indent=2) + + generate_pdf(all_results) + + # Summary + print(f"\n{'='*60}") + print(" SUMMARY — CLIP advantage per generator") + print(f"{'='*60}") + for r in all_results: + clip_tag = "CLIP" if r["uses_clip"] is True else "NO-CLIP" if r["uses_clip"] is False else "???" 
+ print(f" {r['generator']:20s} [{clip_tag:7s}] hand={r['handcrafted_best']:.1%} " + f"clip={r['clip_best']:.1%} delta={r['clip_advantage']:+.1%}") + + clip_gens = [r for r in all_results if r["uses_clip"] is True] + non_clip = [r for r in all_results if r["uses_clip"] is False] + if clip_gens and non_clip: + print(f"\n Avg CLIP advantage on CLIP generators: " + f"{np.mean([r['clip_advantage'] for r in clip_gens]):+.1%}") + print(f" Avg CLIP advantage on non-CLIP generators: " + f"{np.mean([r['clip_advantage'] for r in non_clip]):+.1%}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_experiments.py b/tests/test_experiments.py deleted file mode 100644 index f035c48..0000000 --- a/tests/test_experiments.py +++ /dev/null @@ -1,514 +0,0 @@ -# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 -"""Run all feature experiments on Hemg art dataset and compare. - -Experiments: - 1. Artwork features only (49 features) — baseline - 2. Style features only (15 features) - 3. Artwork + Style combined (64 features) - 4. CLIP embeddings (768 features) - 5. CLIP + Artwork + Style (832 features) - -Each experiment: 4000 samples, 5-fold CV, XGBoost/SVM/MLP. -Generates a comparison PDF. 
-""" - -from __future__ import annotations - -import json -import sys -import time -from datetime import datetime -from pathlib import Path - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from matplotlib.backends.backend_pdf import PdfPages -import matplotlib.gridspec as gridspec -import numpy as np -import pandas as pd -import torch -import xgboost as xgb -from datasets import load_dataset, Image as HFImage -from PIL import Image -from sklearn.metrics import ( - accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, - confusion_matrix, roc_curve, -) -from sklearn.model_selection import StratifiedKFold -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC -from tqdm import tqdm - -sys.path.insert(0, str(Path(__file__).parent.parent)) -from negate.extract.feature_artwork import ArtworkExtract -from negate.extract.feature_style import StyleExtract - -SEED = 42 -N_FOLDS = 5 -N_PER_CLASS = 2000 -REPO = "Hemg/AI-Generated-vs-Real-Images-Datasets" -RESULTS_DIR = Path(__file__).parent.parent / "results" - - -def load_dataset_cached(): - """Load and return the Hemg dataset.""" - print("Loading Hemg dataset...") - ds = load_dataset(REPO, split="train") - ds = ds.cast_column("image", HFImage(decode=True, mode="RGB")) - return ds - - -def extract_artwork_features(ds, indices) -> np.ndarray: - """Extract 49 artwork features.""" - extractor = ArtworkExtract() - features = [] - for idx in tqdm(indices, desc=" Artwork features"): - try: - img = ds[int(idx)]["image"] - if img and isinstance(img, Image.Image): - features.append(extractor(img)) - else: - features.append(None) - except Exception: - features.append(None) - df = pd.DataFrame([f for f in features if f is not None]).fillna(0) - X = df.to_numpy(dtype=np.float64) - return np.where(np.isfinite(X), X, 0), list(df.columns), [i for i, f in enumerate(features) if f is not None] - - -def 
extract_style_features(ds, indices) -> np.ndarray: - """Extract 15 style features.""" - extractor = StyleExtract() - features = [] - for idx in tqdm(indices, desc=" Style features"): - try: - img = ds[int(idx)]["image"] - if img and isinstance(img, Image.Image): - features.append(extractor(img)) - else: - features.append(None) - except Exception: - features.append(None) - df = pd.DataFrame([f for f in features if f is not None]).fillna(0) - X = df.to_numpy(dtype=np.float64) - return np.where(np.isfinite(X), X, 0), list(df.columns), [i for i, f in enumerate(features) if f is not None] - - -def extract_clip_features(ds, indices) -> np.ndarray: - """Extract CLIP ViT-B/32 embeddings (512-d).""" - from transformers import CLIPProcessor, CLIPModel - - print(" Loading CLIP model...") - model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") - processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model = model.to(device) - - features = [] - valid = [] - batch_size = 32 - - for batch_start in tqdm(range(0, len(indices), batch_size), desc=" CLIP features"): - batch_indices = indices[batch_start:batch_start + batch_size] - images = [] - batch_valid = [] - for i, idx in enumerate(batch_indices): - try: - img = ds[int(idx)]["image"] - if img and isinstance(img, Image.Image): - images.append(img) - batch_valid.append(batch_start + i) - except Exception: - pass - - if not images: - continue - - with torch.no_grad(): - inputs = processor(images=images, return_tensors="pt", padding=True).to(device) - outputs = model.get_image_features(**inputs) - if isinstance(outputs, torch.Tensor): - embeddings = outputs.cpu().numpy() - else: - embeddings = outputs.pooler_output.cpu().numpy() - - features.append(embeddings) - valid.extend(batch_valid) - - X = np.vstack(features) - return X, [f"clip_{i}" for i in range(X.shape[1])], valid - - -def run_cv(X, y, model_type="xgb"): - """5-fold 
CV, return metrics dict.""" - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - all_true, all_prob = [], [] - - for train_idx, test_idx in skf.split(X, y): - X_train, X_test = X[train_idx], X[test_idx] - y_train, y_test = y[train_idx], y[test_idx] - - if model_type == "xgb": - spw = np.sum(y_train == 0) / max(np.sum(y_train == 1), 1) - params = { - "objective": "binary:logistic", "eval_metric": "logloss", - "max_depth": 5, "learning_rate": 0.1, "subsample": 0.8, - "colsample_bytree": 0.8, "scale_pos_weight": spw, "seed": SEED, - } - dtrain = xgb.DMatrix(X_train, label=y_train) - dtest = xgb.DMatrix(X_test, label=y_test) - model = xgb.train(params, dtrain, num_boost_round=300, - evals=[(dtest, "test")], early_stopping_rounds=15, - verbose_eval=False) - y_prob = model.predict(dtest) - elif model_type == "svm": - scaler = StandardScaler() - X_tr = scaler.fit_transform(X_train) - X_te = scaler.transform(X_test) - svm = SVC(kernel="rbf", probability=True, random_state=SEED) - svm.fit(X_tr, y_train) - y_prob = svm.predict_proba(X_te)[:, 1] - elif model_type == "mlp": - scaler = StandardScaler() - X_tr = scaler.fit_transform(X_train) - X_te = scaler.transform(X_test) - mlp = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=1000, - random_state=SEED, early_stopping=True) - mlp.fit(X_tr, y_train) - y_prob = mlp.predict_proba(X_te)[:, 1] - - all_true.extend(y_test) - all_prob.extend(y_prob) - - y_true = np.array(all_true) - y_prob = np.array(all_prob) - y_pred = (y_prob > 0.5).astype(int) - - return { - "accuracy": float(accuracy_score(y_true, y_pred)), - "precision": float(precision_score(y_true, y_pred, zero_division=0)), - "recall": float(recall_score(y_true, y_pred, zero_division=0)), - "f1": float(f1_score(y_true, y_pred, average="macro")), - "roc_auc": float(roc_auc_score(y_true, y_prob)), - "y_true": y_true.tolist(), - "y_prob": y_prob.tolist(), - } - - -def generate_pdf(experiments): - """Generate comparison PDF.""" - 
RESULTS_DIR.mkdir(exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - pdf_path = RESULTS_DIR / f"experiments_comparison_{timestamp}.pdf" - - with PdfPages(str(pdf_path)) as pdf: - # PAGE 1: Title + comparison chart - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - - fig.suptitle("Feature Experiment Comparison\nfor AI Artwork Detection", - fontsize=18, fontweight="bold", fontfamily="serif", y=0.96) - fig.text(0.5, 0.89, f"negate project — darkshapes — {datetime.now().strftime('%B %d, %Y')}", - fontsize=10, ha="center", fontfamily="serif", style="italic") - fig.text(0.5, 0.86, f"Dataset: Hemg AI-Art vs Real-Art | {N_PER_CLASS*2} samples | 5-fold CV", - fontsize=9, ha="center", fontfamily="serif") - - # Grouped bar chart: accuracy by experiment and model - ax = fig.add_axes([0.1, 0.45, 0.8, 0.35]) - - exp_names = [e["name"] for e in experiments] - n_exp = len(exp_names) - x = np.arange(n_exp) - w = 0.25 - - for i, (model, color) in enumerate([("xgb", "#4472C4"), ("svm", "#ED7D31"), ("mlp", "#70AD47")]): - accs = [e["results"][model]["accuracy"] for e in experiments] - bars = ax.bar(x + i * w - w, accs, w, label=model.upper(), color=color) - for bar in bars: - ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005, - f"{bar.get_height():.1%}", ha="center", fontsize=6.5, rotation=45) - - ax.set_xticks(x) - ax.set_xticklabels([e["short_name"] for e in experiments], fontsize=8, rotation=15, ha="right") - ax.set_ylabel("Accuracy", fontsize=10) - ax.set_title("Accuracy by Feature Set and Model", fontsize=12, fontfamily="serif") - ax.legend(fontsize=9) - ax.set_ylim(0.5, 1.0) - ax.grid(axis="y", alpha=0.3) - - # Summary table - ax_table = fig.add_axes([0.05, 0.08, 0.9, 0.3]) - ax_table.axis("off") - - table_data = [] - for e in experiments: - best_model = max(e["results"], key=lambda m: e["results"][m]["accuracy"]) - best = e["results"][best_model] - table_data.append([ - e["short_name"], - str(e["n_features"]), - 
f"{best['accuracy']:.2%}", - f"{best['precision']:.2%}", - f"{best['recall']:.2%}", - f"{best['roc_auc']:.4f}", - best_model.upper(), - e.get("extract_time", "?"), - ]) - - table = ax_table.table( - cellText=table_data, - colLabels=["Features", "Count", "Best Acc", "Prec", "Recall", "AUC", "Model", "Time"], - loc="center", cellLoc="center", - ) - table.auto_set_font_size(False) - table.set_fontsize(7.5) - table.scale(1, 1.5) - for (row, col), cell in table.get_celld().items(): - if row == 0: - cell.set_facecolor("#4472C4") - cell.set_text_props(color="white", fontweight="bold") - - pdf.savefig(fig) - plt.close(fig) - - # PAGE 2: ROC curves - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - fig.suptitle("ROC Curves by Experiment (Best Model)", fontsize=14, - fontweight="bold", fontfamily="serif", y=0.96) - - colors = ["#4472C4", "#ED7D31", "#70AD47", "#FFC000", "#9B59B6"] - ax = fig.add_axes([0.12, 0.5, 0.76, 0.38]) - - for i, e in enumerate(experiments): - best_model = max(e["results"], key=lambda m: e["results"][m]["roc_auc"]) - r = e["results"][best_model] - fpr, tpr, _ = roc_curve(r["y_true"], r["y_prob"]) - ax.plot(fpr, tpr, color=colors[i % len(colors)], linewidth=2, - label=f"{e['short_name']} (AUC={r['roc_auc']:.3f})") - - ax.plot([0, 1], [0, 1], "k--", alpha=0.3) - ax.set_xlabel("False Positive Rate") - ax.set_ylabel("True Positive Rate") - ax.legend(fontsize=8, loc="lower right") - ax.grid(True, alpha=0.3) - - # Analysis text - ax_text = fig.add_axes([0.08, 0.05, 0.84, 0.38]) - ax_text.axis("off") - - # Find best and worst - best_exp = max(experiments, key=lambda e: max(e["results"][m]["accuracy"] for m in e["results"])) - worst_exp = min(experiments, key=lambda e: max(e["results"][m]["accuracy"] for m in e["results"])) - best_acc = max(best_exp["results"][m]["accuracy"] for m in best_exp["results"]) - worst_acc = max(worst_exp["results"][m]["accuracy"] for m in worst_exp["results"]) - - analysis = ( - "Analysis\n\n" - f"Best 
performing: {best_exp['name']} at {best_acc:.1%}\n" - f"Worst performing: {worst_exp['name']} at {worst_acc:.1%}\n" - f"Improvement from best to worst: {(best_acc - worst_acc)*100:+.1f}pp\n\n" - ) - - # Check if CLIP exists - clip_exp = [e for e in experiments if "clip" in e["short_name"].lower()] - art_exp = [e for e in experiments if e["short_name"] == "Artwork (49)"] - - if clip_exp and art_exp: - clip_acc = max(clip_exp[0]["results"][m]["accuracy"] for m in clip_exp[0]["results"]) - art_acc = max(art_exp[0]["results"][m]["accuracy"] for m in art_exp[0]["results"]) - analysis += ( - f"CLIP vs hand-crafted: {clip_acc:.1%} vs {art_acc:.1%} " - f"({(clip_acc - art_acc)*100:+.1f}pp)\n" - ) - if clip_acc > art_acc + 0.03: - analysis += "Learned features significantly outperform hand-crafted features.\n" - elif clip_acc < art_acc - 0.03: - analysis += "Surprisingly, hand-crafted features outperform CLIP on this task.\n" - else: - analysis += "Learned and hand-crafted features perform similarly.\n" - - # Check if combined helps - combined_exp = [e for e in experiments if "+" in e["short_name"]] - if combined_exp: - comb_acc = max(combined_exp[-1]["results"][m]["accuracy"] for m in combined_exp[-1]["results"]) - analysis += ( - f"\nCombined features: {comb_acc:.1%}\n" - ) - if comb_acc > best_acc - 0.01: - analysis += "Combining features achieves the best overall performance.\n" - else: - analysis += "Combining features does not improve over the best individual set.\n" - - analysis += ( - "\nConclusions\n\n" - "This comparison tests whether:\n" - " 1. Style-specific craft features add signal beyond generic statistics\n" - " 2. Learned representations (CLIP) outperform hand-crafted features\n" - " 3. 
Combining multiple feature types improves detection\n\n" - "All experiments use the same dataset (Hemg AI Art vs Real Art),\n" - "same sample size, and same evaluation methodology.\n" - ) - - ax_text.text(0, 1, analysis, transform=ax_text.transAxes, fontsize=9, - ha="left", va="top", fontfamily="serif") - - pdf.savefig(fig) - plt.close(fig) - - print(f"PDF saved to: {pdf_path}") - return pdf_path - - -def main(): - print("=" * 60) - print(" FEATURE EXPERIMENTS COMPARISON") - print(" Dataset: Hemg AI Art vs Real Art") - print(f" Samples: {N_PER_CLASS * 2} ({N_PER_CLASS} per class)") - print("=" * 60) - - ds = load_dataset_cached() - all_labels = ds["label"] - - # Select balanced indices - rng = np.random.RandomState(SEED) - idx_0 = [i for i, l in enumerate(all_labels) if l == 0] - idx_1 = [i for i, l in enumerate(all_labels) if l == 1] - chosen_0 = rng.choice(idx_0, size=N_PER_CLASS, replace=False) - chosen_1 = rng.choice(idx_1, size=N_PER_CLASS, replace=False) - all_indices = np.concatenate([chosen_0, chosen_1]) - # Labels: 0=AI(synthetic), 1=Real(genuine) in dataset - # We want: 0=genuine, 1=synthetic - y = np.array([1] * N_PER_CLASS + [0] * N_PER_CLASS) - - experiments = [] - - # === Experiment 1: Artwork features (49) === - print("\n" + "=" * 50) - print(" Experiment 1: Artwork Features (49)") - print("=" * 50) - t0 = time.time() - X_art, art_names, art_valid = extract_artwork_features(ds, all_indices) - t_art = f"{time.time() - t0:.0f}s" - y_art = y[art_valid] - print(f" {X_art.shape[0]} images, {X_art.shape[1]} features, {t_art}") - - exp1 = {"name": "Artwork Features (Li & Stamp + FFT/DCT)", "short_name": "Artwork (49)", - "n_features": X_art.shape[1], "extract_time": t_art, "results": {}} - for model in ["xgb", "svm", "mlp"]: - print(f" {model.upper()}...") - exp1["results"][model] = run_cv(X_art, y_art, model) - print(f" acc={exp1['results'][model]['accuracy']:.2%}") - experiments.append(exp1) - - # === Experiment 2: Style features (15) === - print("\n" + 
"=" * 50) - print(" Experiment 2: Style Features (15)") - print("=" * 50) - t0 = time.time() - X_style, style_names, style_valid = extract_style_features(ds, all_indices) - t_style = f"{time.time() - t0:.0f}s" - y_style = y[style_valid] - print(f" {X_style.shape[0]} images, {X_style.shape[1]} features, {t_style}") - - exp2 = {"name": "Style Features (stroke/palette/composition/texture)", "short_name": "Style (15)", - "n_features": X_style.shape[1], "extract_time": t_style, "results": {}} - for model in ["xgb", "svm", "mlp"]: - print(f" {model.upper()}...") - exp2["results"][model] = run_cv(X_style, y_style, model) - print(f" acc={exp2['results'][model]['accuracy']:.2%}") - experiments.append(exp2) - - # === Experiment 3: Artwork + Style combined (64) === - print("\n" + "=" * 50) - print(" Experiment 3: Artwork + Style Combined (64)") - print("=" * 50) - # Align valid indices - common_valid = sorted(set(art_valid) & set(style_valid)) - art_mask = [art_valid.index(v) for v in common_valid] - style_mask = [style_valid.index(v) for v in common_valid] - X_combined = np.hstack([X_art[art_mask], X_style[style_mask]]) - y_combined = y[common_valid] - print(f" {X_combined.shape[0]} images, {X_combined.shape[1]} features") - - exp3 = {"name": "Artwork + Style Combined", "short_name": "Art+Style (64)", - "n_features": X_combined.shape[1], "extract_time": "combined", "results": {}} - for model in ["xgb", "svm", "mlp"]: - print(f" {model.upper()}...") - exp3["results"][model] = run_cv(X_combined, y_combined, model) - print(f" acc={exp3['results'][model]['accuracy']:.2%}") - experiments.append(exp3) - - # === Experiment 4: CLIP embeddings (512) === - print("\n" + "=" * 50) - print(" Experiment 4: CLIP ViT-B/32 Embeddings (512)") - print("=" * 50) - t0 = time.time() - X_clip, clip_names, clip_valid = extract_clip_features(ds, all_indices) - t_clip = f"{time.time() - t0:.0f}s" - y_clip = y[clip_valid] - print(f" {X_clip.shape[0]} images, {X_clip.shape[1]} features, {t_clip}") - - 
exp4 = {"name": "CLIP ViT-B/32 Embeddings", "short_name": "CLIP (512)", - "n_features": X_clip.shape[1], "extract_time": t_clip, "results": {}} - for model in ["xgb", "svm", "mlp"]: - print(f" {model.upper()}...") - exp4["results"][model] = run_cv(X_clip, y_clip, model) - print(f" acc={exp4['results'][model]['accuracy']:.2%}") - experiments.append(exp4) - - # === Experiment 5: CLIP + Artwork + Style (all combined) === - print("\n" + "=" * 50) - print(" Experiment 5: CLIP + Artwork + Style (all)") - print("=" * 50) - common_all = sorted(set(art_valid) & set(style_valid) & set(clip_valid)) - art_m = [art_valid.index(v) for v in common_all] - style_m = [style_valid.index(v) for v in common_all] - clip_m = [clip_valid.index(v) for v in common_all] - X_all = np.hstack([X_art[art_m], X_style[style_m], X_clip[clip_m]]) - y_all = y[common_all] - print(f" {X_all.shape[0]} images, {X_all.shape[1]} features") - - exp5 = {"name": "CLIP + Artwork + Style (Everything)", "short_name": "All Combined", - "n_features": X_all.shape[1], "extract_time": "combined", "results": {}} - for model in ["xgb", "svm", "mlp"]: - print(f" {model.upper()}...") - exp5["results"][model] = run_cv(X_all, y_all, model) - print(f" acc={exp5['results'][model]['accuracy']:.2%}") - experiments.append(exp5) - - # Save results (without y_true/y_prob arrays for JSON) - json_results = [] - for e in experiments: - je = {k: v for k, v in e.items() if k != "results"} - je["results"] = {} - for m, r in e["results"].items(): - je["results"][m] = {k: v for k, v in r.items() if k not in ("y_true", "y_prob")} - json_results.append(je) - - RESULTS_DIR.mkdir(exist_ok=True) - json_path = RESULTS_DIR / "experiments_results.json" - with open(json_path, "w") as f: - json.dump({"timestamp": datetime.now().isoformat(), "experiments": json_results}, f, indent=2) - print(f"\nJSON saved to: {json_path}") - - # Generate PDF - print("\nGenerating comparison PDF...") - generate_pdf(experiments) - - # Final summary - 
print(f"\n{'='*60}") - print(" FINAL SUMMARY") - print(f"{'='*60}") - for e in experiments: - best_model = max(e["results"], key=lambda m: e["results"][m]["accuracy"]) - best = e["results"][best_model] - print(f" {e['short_name']:20s} acc={best['accuracy']:.2%} auc={best['roc_auc']:.4f} ({best_model})") - - -if __name__ == "__main__": - main() diff --git a/tests/test_fair_evaluation.py b/tests/test_fair_evaluation.py deleted file mode 100644 index e3d1381..0000000 --- a/tests/test_fair_evaluation.py +++ /dev/null @@ -1,295 +0,0 @@ -# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 -"""Fair evaluation: test artwork features on datasets where both classes are art. - -Addresses the confound that previous benchmarks used different subject matter -(cats vs bananas, wikiart vs generated), which inflates accuracy. - -Datasets: - 1. Hemg/AI-Generated-vs-Real-Images-Datasets — 153K, "AiArtData" vs "RealArt" - 2. Parveshiiii/AI-vs-Real — 14K balanced binary - -We sample N images from each class, extract 49 features, run 5-fold CV, -and report accuracy/precision/recall/F1/AUC with confidence intervals. 
-""" - -from __future__ import annotations - -import json -import sys -from datetime import datetime -from pathlib import Path - -import numpy as np -import pandas as pd -import xgboost as xgb -from datasets import load_dataset, Image as HFImage -from PIL import Image -from sklearn.metrics import ( - accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, - confusion_matrix, roc_curve, -) -from sklearn.model_selection import StratifiedKFold -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC -from tqdm import tqdm - -sys.path.insert(0, str(Path(__file__).parent.parent)) -from negate.extract.feature_artwork import ArtworkExtract - -SEED = 42 -N_FOLDS = 5 -RESULTS_DIR = Path(__file__).parent.parent / "results" - - -def extract_all_features(dataset, label_col: str, n_samples: int = 200): - """Extract features from a dataset, balanced per class.""" - extractor = ArtworkExtract() - features, labels, errors = [], [], 0 - - # Get unique labels and sample equally - all_labels = dataset[label_col] - unique_labels = sorted(set(all_labels)) - print(f" Labels found: {unique_labels}") - - per_class = n_samples // len(unique_labels) - - for lbl in unique_labels: - indices = [i for i, l in enumerate(all_labels) if l == lbl] - rng = np.random.RandomState(SEED) - chosen = rng.choice(indices, size=min(per_class, len(indices)), replace=False) - - for idx in tqdm(chosen, desc=f" Class {lbl}"): - try: - img = dataset[int(idx)]["image"] - if img is None: - errors += 1 - continue - if not isinstance(img, Image.Image): - errors += 1 - continue - feat = extractor(img) - features.append(feat) - # Binary: 0 = real/genuine, 1 = AI/synthetic - labels.append(0 if lbl == max(unique_labels) else 1) - except Exception as e: - errors += 1 - - print(f" Extracted {len(features)} images ({errors} errors)") - - df = pd.DataFrame(features).fillna(0) - X = df.to_numpy(dtype=np.float64) - X = 
np.where(np.isfinite(X), X, 0) - y = np.array(labels) - - return X, y, list(df.columns) - - -def cross_validate_xgb(X, y): - """5-fold CV with XGBoost.""" - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - all_true, all_prob = [], [] - fold_results = [] - - for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)): - X_train, X_test = X[train_idx], X[test_idx] - y_train, y_test = y[train_idx], y[test_idx] - - spw = np.sum(y_train == 0) / max(np.sum(y_train == 1), 1) - params = { - "objective": "binary:logistic", - "eval_metric": ["logloss", "aucpr"], - "max_depth": 4, - "learning_rate": 0.1, - "subsample": 0.8, - "colsample_bytree": 0.8, - "scale_pos_weight": spw, - "seed": SEED, - } - dtrain = xgb.DMatrix(X_train, label=y_train) - dtest = xgb.DMatrix(X_test, label=y_test) - model = xgb.train(params, dtrain, num_boost_round=200, - evals=[(dtest, "test")], early_stopping_rounds=10, - verbose_eval=False) - - y_prob = model.predict(dtest) - y_pred = (y_prob > 0.5).astype(int) - - fold_results.append({ - "fold": fold + 1, - "accuracy": float(accuracy_score(y_test, y_pred)), - "precision": float(precision_score(y_test, y_pred, zero_division=0)), - "recall": float(recall_score(y_test, y_pred, zero_division=0)), - "f1": float(f1_score(y_test, y_pred, average="macro")), - "roc_auc": float(roc_auc_score(y_test, y_prob)), - }) - all_true.extend(y_test) - all_prob.extend(y_prob) - - return fold_results, np.array(all_true), np.array(all_prob) - - -def cross_validate_svm(X, y): - """5-fold CV with SVM.""" - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - all_true, all_prob = [], [] - - scaler = StandardScaler() - - for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)): - X_train = scaler.fit_transform(X[train_idx]) - X_test = scaler.transform(X[test_idx]) - y_train, y_test = y[train_idx], y[test_idx] - - svm = SVC(kernel="rbf", probability=True, random_state=SEED) - svm.fit(X_train, y_train) - y_prob = 
svm.predict_proba(X_test)[:, 1] - all_true.extend(y_test) - all_prob.extend(y_prob) - - return np.array(all_true), np.array(all_prob) - - -def cross_validate_mlp(X, y): - """5-fold CV with MLP.""" - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - all_true, all_prob = [], [] - scaler = StandardScaler() - - for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)): - X_train = scaler.fit_transform(X[train_idx]) - X_test = scaler.transform(X[test_idx]) - y_train, y_test = y[train_idx], y[test_idx] - - mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=SEED) - mlp.fit(X_train, y_train) - y_prob = mlp.predict_proba(X_test)[:, 1] - all_true.extend(y_test) - all_prob.extend(y_prob) - - return np.array(all_true), np.array(all_prob) - - -def summarize(name, fold_results, y_true, y_prob): - """Print summary for a classifier.""" - y_pred = (y_prob > 0.5).astype(int) - accs = [r["accuracy"] for r in fold_results] - print(f"\n{'='*50}") - print(f" {name}") - print(f"{'='*50}") - for r in fold_results: - print(f" Fold {r['fold']}: acc={r['accuracy']:.2%} prec={r['precision']:.2%} " - f"rec={r['recall']:.2%} auc={r['roc_auc']:.4f}") - print(f" ---") - print(f" Mean acc: {np.mean(accs):.2%} +/- {np.std(accs):.2%}") - print(f" Pooled: acc={accuracy_score(y_true, y_pred):.2%} " - f"prec={precision_score(y_true, y_pred, zero_division=0):.2%} " - f"rec={recall_score(y_true, y_pred, zero_division=0):.2%} " - f"auc={roc_auc_score(y_true, y_prob):.4f}") - cm = confusion_matrix(y_true, y_pred) - print(f" Confusion: TN={cm[0,0]} FP={cm[0,1]} FN={cm[1,0]} TP={cm[1,1]}") - - -def evaluate_dataset(name: str, repo: str, label_col: str, n_samples: int, split: str = "train"): - """Full evaluation pipeline for one dataset.""" - print(f"\n{'#'*60}") - print(f" DATASET: {name}") - print(f" Repo: {repo}") - print(f" Sampling: {n_samples} images ({n_samples//2} per class)") - print(f"{'#'*60}") - - print(f"\nLoading dataset...") - ds = 
load_dataset(repo, split=split) - ds = ds.cast_column("image", HFImage(decode=True, mode="RGB")) - print(f" Total rows: {len(ds)}") - - X, y, feature_names = extract_all_features(ds, label_col, n_samples) - print(f" Class balance: {np.sum(y==0)} real, {np.sum(y==1)} synthetic") - print(f" Features: {X.shape[1]}") - - # XGBoost - print(f"\nRunning {N_FOLDS}-fold CV (XGBoost)...") - xgb_folds, xgb_true, xgb_prob = cross_validate_xgb(X, y) - summarize(f"XGBoost on {name}", xgb_folds, xgb_true, xgb_prob) - - # SVM - print(f"\nRunning {N_FOLDS}-fold CV (SVM)...") - svm_true, svm_prob = cross_validate_svm(X, y) - svm_pred = (svm_prob > 0.5).astype(int) - print(f" SVM pooled: acc={accuracy_score(svm_true, svm_pred):.2%} " - f"auc={roc_auc_score(svm_true, svm_prob):.4f}") - - # MLP - print(f"\nRunning {N_FOLDS}-fold CV (MLP)...") - mlp_true, mlp_prob = cross_validate_mlp(X, y) - mlp_pred = (mlp_prob > 0.5).astype(int) - print(f" MLP pooled: acc={accuracy_score(mlp_true, mlp_pred):.2%} " - f"auc={roc_auc_score(mlp_true, mlp_prob):.4f}") - - return { - "dataset": name, - "repo": repo, - "n_samples": int(np.sum(y >= 0)), - "n_features": X.shape[1], - "xgb_folds": xgb_folds, - "xgb_accuracy": float(accuracy_score(xgb_true, (xgb_prob > 0.5).astype(int))), - "xgb_auc": float(roc_auc_score(xgb_true, xgb_prob)), - "xgb_precision": float(precision_score(xgb_true, (xgb_prob > 0.5).astype(int), zero_division=0)), - "xgb_recall": float(recall_score(xgb_true, (xgb_prob > 0.5).astype(int), zero_division=0)), - "svm_accuracy": float(accuracy_score(svm_true, svm_pred)), - "svm_auc": float(roc_auc_score(svm_true, svm_prob)), - "mlp_accuracy": float(accuracy_score(mlp_true, mlp_pred)), - "mlp_auc": float(roc_auc_score(mlp_true, mlp_prob)), - "feature_names": feature_names, - } - - -def main(): - print("=" * 60) - print(" FAIR EVALUATION: 49-Feature Artwork Detection") - print(" Testing on semantically-similar datasets") - print("=" * 60) - - results = [] - - # Dataset 1: Hemg — both classes 
are art - results.append(evaluate_dataset( - name="AI-Art vs Real-Art (Hemg)", - repo="Hemg/AI-Generated-vs-Real-Images-Datasets", - label_col="label", - n_samples=400, - )) - - # Dataset 2: Parveshiiii — balanced binary - results.append(evaluate_dataset( - name="AI vs Real (Parveshiiii)", - repo="Parveshiiii/AI-vs-Real", - label_col="binary_label", - n_samples=400, - )) - - # Save results - RESULTS_DIR.mkdir(exist_ok=True) - out_path = RESULTS_DIR / "fair_evaluation_results.json" - with open(out_path, "w") as f: - json.dump({ - "timestamp": datetime.now().isoformat(), - "evaluation": "fair_semantically_similar", - "datasets": results, - }, f, indent=2, default=str) - - print(f"\n{'='*60}") - print(f" RESULTS SUMMARY") - print(f"{'='*60}") - for r in results: - print(f"\n {r['dataset']}:") - print(f" XGBoost: acc={r['xgb_accuracy']:.2%} auc={r['xgb_auc']:.4f} " - f"prec={r['xgb_precision']:.2%} rec={r['xgb_recall']:.2%}") - print(f" SVM: acc={r['svm_accuracy']:.2%} auc={r['svm_auc']:.4f}") - print(f" MLP: acc={r['mlp_accuracy']:.2%} auc={r['mlp_auc']:.4f}") - - print(f"\nResults saved to: {out_path}") - - -if __name__ == "__main__": - main() diff --git a/tests/test_hybrid_modern.py b/tests/test_hybrid_modern.py new file mode 100644 index 0000000..32b09a4 --- /dev/null +++ b/tests/test_hybrid_modern.py @@ -0,0 +1,324 @@ +"""Hybrid detector: 148 handcrafted + 768 ConvNeXt, max training diversity. + +Extracts features INCREMENTALLY to avoid OOM from holding all images in memory. 
+""" + +from __future__ import annotations + +import gc +import json +import os +import warnings +from pathlib import Path + +os.environ["HF_HOME"] = "D:/Projects/negate/negate/.cache/huggingface" +os.environ["HF_DATASETS_CACHE"] = "D:/Projects/negate/negate/.cache/huggingface/datasets" +# Set HF_TOKEN env var or run: huggingface-cli login + +import lightgbm as lgb +import numpy as np +import torch +from PIL import Image +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score, roc_auc_score +from sklearn.model_selection import StratifiedKFold +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from tqdm import tqdm + +warnings.filterwarnings("ignore") + +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) +from negate.extract.feature_artwork import ArtworkExtract +from negate.extract.feature_learned import LearnedExtract + +art_ext = ArtworkExtract() +learned_ext = LearnedExtract() + +BASE = Path("D:/Projects/negate/negate/.datasets/imaginet/extracted") +CIVITAI = Path("D:/Projects/negate/negate/.datasets/civitai") +SEED = 42 +rng = np.random.RandomState(SEED) +W_LGBM, W_RF, W_SVM = 0.4, 0.1, 0.5 +HIGH_T, LOW_T = 0.80, 0.20 + + +def load_paths(path, recursive, max_n): + exts = {".jpg", ".jpeg", ".png", ".webp", ".bmp"} + files = list(path.rglob("*") if recursive else path.iterdir()) + files = [f for f in files if f.suffix.lower() in exts] + if len(files) > max_n: + files = list(rng.choice(files, max_n, replace=False)) + return files + + +def extract_one(img): + """Extract 916 features from one PIL image.""" + try: + hc = list(art_ext(img).values()) + except Exception: + hc = [0.0] * 148 + try: + with torch.no_grad(): + ln = list(learned_ext(img).values()) + except Exception: + ln = [0.0] * 768 + return hc + ln + + +def extract_from_paths(files, desc=""): + """Extract features from file paths, one at a time (no memory buildup).""" + rows = [] + for f in tqdm(files, desc=desc): + try: 
+ img = Image.open(f).convert("RGB") + rows.append(extract_one(img)) + del img + except Exception: + rows.append([0.0] * 916) + X = np.array(rows, dtype=np.float64) + return np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + + +def extract_from_stream(repo, max_n, desc=""): + """Stream from HF, extract features one at a time.""" + from datasets import load_dataset + ds = load_dataset(repo, split="train", streaming=True) + rows = [] + for i, s in enumerate(tqdm(ds, desc=desc, total=max_n)): + if i >= max_n: + break + try: + img = s["image"].convert("RGB") + rows.append(extract_one(img)) + del img + except Exception: + rows.append([0.0] * 916) + if not rows: + return np.empty((0, 916)) + X = np.array(rows, dtype=np.float64) + return np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + + +def extract_from_imgs(images, desc=""): + rows = [] + for img in tqdm(images, desc=desc): + rows.append(extract_one(img)) + X = np.array(rows, dtype=np.float64) + return np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + + +def main(): + print("=" * 70, flush=True) + print(" HYBRID DETECTOR: 916 features, max diversity training", flush=True) + print("=" * 70, flush=True) + + feature_chunks_real = [] + feature_chunks_fake = [] + + # === REAL: WikiArt === + print("\n--- REAL: WikiArt (2000) ---", flush=True) + files = load_paths(BASE / "wikiart", recursive=True, max_n=2000) + X = extract_from_paths(files, "WikiArt") + feature_chunks_real.append(X) + print(f" Got {len(X)}", flush=True) + gc.collect() + + # === REAL: tellif === + print("\n--- REAL: tellif (122) ---", flush=True) + from datasets import load_dataset + ds_tellif = load_dataset("tellif/ai_vs_real_image_semantically_similar", split="test") + tellif_real_imgs = [ds_tellif[i]["image"].convert("RGB") for i in range(len(ds_tellif)) if ds_tellif[i]["label"] == 8] + X = extract_from_imgs(tellif_real_imgs, "tellif real") + feature_chunks_real.append(X) + print(f" Got {len(X)}", flush=True) + del tellif_real_imgs + gc.collect() + + 
# === REAL: Hemg (stream) ===
+    print("\n--- REAL: Hemg (1000) ---", flush=True)
+    X = extract_from_stream("Hemg/AI-Generated-vs-Real-Images-Datasets", max_n=2500, desc="Hemg real")
+    # NOTE(review): extract_from_stream cannot filter by label, so the X above is
+    # discarded (wasted extraction); it is rebuilt below from a manual label==1 (real) stream.
+    ds_hemg = load_dataset("Hemg/AI-Generated-vs-Real-Images-Datasets", split="train", streaming=True)
+    rows = []
+    count = 0
+    for s in tqdm(ds_hemg, desc="Hemg real", total=3000):
+        if s["label"] == 1:  # real
+            try:
+                rows.append(extract_one(s["image"].convert("RGB")))
+                count += 1
+            except Exception:
+                pass
+        if count >= 1000:
+            break
+    X = np.nan_to_num(np.array(rows, dtype=np.float64), nan=0.0, posinf=0.0, neginf=0.0) if rows else np.empty((0, 916))
+    feature_chunks_real.append(X)
+    print(f" Got {len(X)}", flush=True)
+    gc.collect()
+
+    # === FAKE: ImagiNet old ===
+    print("\n--- FAKE: ImagiNet old (1000) ---", flush=True)
+    for src in ["sdxl_paintings_fake", "sd_paintings_fake", "dalle3", "journeydb", "animaginexl_paintings_fake"]:
+        files = load_paths(BASE / src, recursive=False, max_n=200)
+        X = extract_from_paths(files, f"ImagiNet {src}")
+        feature_chunks_fake.append(X)
+        print(f" {src}: {len(X)}", flush=True)
+        gc.collect()
+
+    # === FAKE: Modern HF datasets ===
+    print("\n--- FAKE: Modern generators ---", flush=True)
+    for repo, n, name in [
+        ("ash12321/seedream-4.5-generated-2k", 500, "Seedream 4.5"),
+        ("exdysa/nano-banana-pro-generated-1k-clone", 500, "Nano Banana Pro"),
+        ("LukasT9/Flux-1-Dev-Images-1k", 500, "Flux Dev"),
+        ("LukasT9/Flux-1-Schnell-Images-1k", 500, "Flux Schnell"),
+    ]:
+        X = extract_from_stream(repo, max_n=n, desc=name)
+        feature_chunks_fake.append(X)
+        print(f" {name}: {len(X)}", flush=True)
+        gc.collect()
+
+    # === FAKE: CivitAI ===
+    print("\n--- FAKE: CivitAI ---", flush=True)
+    for gen_dir in ["flux", "sdxl", "pony", "illustrious"]:
+        p = CIVITAI / gen_dir
+        if p.exists() and any(p.iterdir()):
+            
files = load_paths(p, recursive=False, max_n=500) + X = extract_from_paths(files, f"CivitAI {gen_dir}") + feature_chunks_fake.append(X) + print(f" CivitAI {gen_dir}: {len(X)}", flush=True) + gc.collect() + + # === FAKE: Hemg AI === + print("\n--- FAKE: Hemg AI (1000) ---", flush=True) + ds_hemg2 = load_dataset("Hemg/AI-Generated-vs-Real-Images-Datasets", split="train", streaming=True) + rows = [] + count = 0 + for s in tqdm(ds_hemg2, desc="Hemg AI", total=1500): + if s["label"] == 0: # AI + try: + rows.append(extract_one(s["image"].convert("RGB"))) + count += 1 + except Exception: + pass + if count >= 1000: + break + X = np.nan_to_num(np.array(rows, dtype=np.float64), nan=0.0, posinf=0.0, neginf=0.0) if rows else np.empty((0, 916)) + feature_chunks_fake.append(X) + print(f" Hemg AI: {len(X)}", flush=True) + gc.collect() + + # === Combine === + X_real = np.vstack([c for c in feature_chunks_real if len(c) > 0]) + X_fake = np.vstack([c for c in feature_chunks_fake if len(c) > 0]) + del feature_chunks_real, feature_chunks_fake + gc.collect() + + n = min(len(X_real), len(X_fake)) + X_train = np.vstack([X_real[:n], X_fake[:n]]) + y_train = np.concatenate([np.zeros(n), np.ones(n)]) + print(f"\n TOTAL: {len(X_train)} train ({n}/class), {X_train.shape[1]} features", flush=True) + print(f" Real sources: WikiArt + tellif + Hemg", flush=True) + print(f" Fake sources: ImagiNet + Seedream + Nano Banana + Flux + CivitAI + Hemg AI", flush=True) + + # === 5-fold CV === + print("\n--- 5-fold CV ---", flush=True) + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED) + accs, aucs = [], [] + for fold, (tr, te) in enumerate(skf.split(X_train, y_train)): + m = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED) + m.fit(X_train[tr], y_train[tr]) + p = m.predict_proba(X_train[te])[:, 1] + acc = accuracy_score(y_train[te], (p > 0.5).astype(int)) + auc = roc_auc_score(y_train[te], p) + 
accs.append(acc); aucs.append(auc) + print(f" Fold {fold+1}: acc={acc:.4f} auc={auc:.4f}", flush=True) + print(f" Mean: acc={np.mean(accs):.4f} auc={np.mean(aucs):.4f}", flush=True) + + # === Train ensemble === + print("\n--- Training ensemble ---", flush=True) + lgbm = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED) + lgbm.fit(X_train, y_train) + rf = RandomForestClassifier(n_estimators=300, max_depth=12, n_jobs=1, random_state=SEED) + rf.fit(X_train, y_train) + scaler = StandardScaler() + svm = SVC(kernel="rbf", probability=True, random_state=SEED) + svm.fit(scaler.fit_transform(X_train), y_train) + + def predict(X): + p = (W_LGBM * lgbm.predict_proba(X)[:, 1] + + W_RF * rf.predict_proba(X)[:, 1] + + W_SVM * svm.predict_proba(scaler.transform(X))[:, 1]) + pred = np.full(len(X), -1) + pred[p >= HIGH_T] = 1 + pred[p < LOW_T] = 0 + return pred, p + + # === TEST: tellif SOTA === + print("\n" + "=" * 70, flush=True) + print(" TEST: tellif 2025 SOTA (HYBRID 916 features)", flush=True) + print("=" * 70, flush=True) + + label_names = ds_tellif.features["label"].names + results = {} + + for i, gen_name in enumerate(label_names): + indices = [j for j in range(len(ds_tellif)) if ds_tellif[j]["label"] == i] + if not indices: + continue + images = [ds_tellif[j]["image"].convert("RGB") for j in indices] + X_gen = extract_from_imgs(images, gen_name) + pred, prob = predict(X_gen) + + if gen_name == "real": + fp = (pred == 1).sum() + correct = (pred == 0).sum() + uncertain = (pred == -1).sum() + fp_rate = fp / len(X_gen) + print(f" {gen_name:45s} n={len(X_gen):3d} GENUINE={correct} UNCERTAIN={uncertain} FP={fp} FP_rate={fp_rate:.1%}", flush=True) + results[gen_name] = {"n": len(X_gen), "fp": int(fp), "fp_rate": float(fp_rate)} + else: + detected = (pred == 1).sum() + uncertain = (pred == -1).sum() + missed = (pred == 0).sum() + rate = detected / len(X_gen) + print(f" {gen_name:45s} n={len(X_gen):3d} 
DETECTED={detected} UNCERTAIN={uncertain} MISSED={missed} rate={rate:.1%}", flush=True) + results[gen_name] = {"n": len(X_gen), "detected": int(detected), "uncertain": int(uncertain), "missed": int(missed), "rate": float(rate)} + del images + gc.collect() + + # === Feature importance === + importances = lgbm.feature_importances_ + hc_imp = importances[:148].sum() + ln_imp = importances[148:].sum() + total_imp = importances.sum() + print(f"\n Feature importance: HC={hc_imp/total_imp*100:.1f}% / Learned={ln_imp/total_imp*100:.1f}%", flush=True) + + all_names = list(art_ext(Image.new("RGB", (64, 64), "red")).keys()) + learned_ext.feature_names() + sorted_idx = np.argsort(importances)[::-1] + print(f" Top 10:", flush=True) + for r in range(10): + idx = sorted_idx[r] + src = "HC" if idx < 148 else "CNN" + name = all_names[idx] if idx < len(all_names) else f"f{idx}" + print(f" {r+1:2d}. [{src}] {name:35s} {importances[idx]}", flush=True) + + # Save + out = Path(__file__).parent.parent / "results" / "hybrid_modern_results.json" + with open(out, "w") as f: + json.dump({ + "training": {"n_per_class": int(n), "n_features": 916, + "cv_acc": float(np.mean(accs)), "cv_auc": float(np.mean(aucs))}, + "tellif": results, + "importance": {"handcrafted_pct": float(hc_imp/total_imp), "learned_pct": float(ln_imp/total_imp)}, + }, f, indent=2) + print(f"\nSaved to {out}", flush=True) + print("DONE", flush=True) + + +if __name__ == "__main__": + main() diff --git a/tests/test_imaginet_paintings.py b/tests/test_imaginet_paintings.py new file mode 100644 index 0000000..f2bcc34 --- /dev/null +++ b/tests/test_imaginet_paintings.py @@ -0,0 +1,186 @@ +"""ImagiNet Paintings Benchmark: per-generator accuracy on labeled art data. + +Tests the 148-feature extractor against 6 different AI generators using +real WikiArt paintings as the genuine class. This is the definitive test +for art-specific detection with known generators. 
+""" + +from __future__ import annotations + +import json +import warnings +from pathlib import Path + +import lightgbm as lgb +import numpy as np +from PIL import Image +from sklearn.metrics import ( + accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, +) +from sklearn.model_selection import StratifiedKFold +from tqdm import tqdm + +warnings.filterwarnings("ignore") + +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) +from negate.extract.feature_artwork import ArtworkExtract + +ext = ArtworkExtract() +BASE = Path("D:/Projects/negate/negate/.datasets/imaginet/extracted") +SEED = 42 +MAX_PER_CLASS = 500 +rng = np.random.RandomState(SEED) + +GENERATORS = { + "wikiart": {"path": BASE / "wikiart", "label": 0, "recursive": True}, + "AnimagineXL": {"path": BASE / "animaginexl_paintings_fake", "label": 1, "recursive": False}, + "SD": {"path": BASE / "sd_paintings_fake", "label": 1, "recursive": False}, + "SDXL": {"path": BASE / "sdxl_paintings_fake", "label": 1, "recursive": False}, + "StyleGAN": {"path": BASE / "wikiart_stylegan", "label": 1, "recursive": True}, + "Midjourney": {"path": BASE / "journeydb", "label": 1, "recursive": False}, + "DALL-E_3": {"path": BASE / "dalle3", "label": 1, "recursive": False}, +} + + +def load_image_paths(path: Path, recursive: bool, max_n: int) -> list[Path]: + exts = {".jpg", ".jpeg", ".png", ".webp", ".bmp"} + if recursive: + files = [f for f in path.rglob("*") if f.suffix.lower() in exts] + else: + files = [f for f in path.iterdir() if f.suffix.lower() in exts] + if len(files) > max_n: + files = list(rng.choice(files, max_n, replace=False)) + return files + + +def extract_features(files: list[Path], name: str) -> np.ndarray: + rows = [] + for f in tqdm(files, desc=name): + try: + img = Image.open(f).convert("RGB") + feat = ext(img) + rows.append(list(feat.values())) + except Exception: + rows.append([0.0] * 148) + X = np.array(rows, dtype=np.float64) + return np.nan_to_num(X, nan=0.0, posinf=0.0, 
neginf=0.0) + + +def cv_evaluate(X, y, n_splits=5): + skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED) + metrics = {"acc": [], "auc": [], "f1": [], "prec": [], "rec": []} + for tr, te in skf.split(X, y): + model = lgb.LGBMClassifier( + n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED, + ) + model.fit(X[tr], y[tr]) + yp = model.predict_proba(X[te])[:, 1] + ypred = (yp > 0.5).astype(int) + metrics["acc"].append(accuracy_score(y[te], ypred)) + metrics["auc"].append(roc_auc_score(y[te], yp)) + metrics["f1"].append(f1_score(y[te], ypred, average="macro")) + metrics["prec"].append(precision_score(y[te], ypred)) + metrics["rec"].append(recall_score(y[te], ypred)) + return {k: float(np.mean(v)) for k, v in metrics.items()} + + +def main(): + print("=" * 60) + print(" ImagiNet PAINTINGS Benchmark") + print(" 148 features, LightGBM, per-generator accuracy") + print("=" * 60) + + # Load and extract features + all_features = {} + for name, info in GENERATORS.items(): + files = load_image_paths(info["path"], info["recursive"], MAX_PER_CLASS) + print(f"\n{name}: {len(files)} images (label={info['label']})") + all_features[name] = extract_features(files, name) + print(f" Shape: {all_features[name].shape}") + + X_real = all_features["wikiart"] + + # --- EXP 1: Per-generator --- + print("\n" + "=" * 60) + print(" EXP 1: Per-Generator (Real Art vs Each Generator)") + print("=" * 60) + + results_per_gen = {} + for gen in ["AnimagineXL", "SD", "SDXL", "StyleGAN", "Midjourney", "DALL-E_3"]: + X_fake = all_features[gen] + n = min(len(X_real), len(X_fake)) + X = np.vstack([X_real[:n], X_fake[:n]]) + y = np.concatenate([np.zeros(n), np.ones(n)]) + r = cv_evaluate(X, y) + results_per_gen[gen] = r + print(f" {gen:15s} acc={r['acc']:.4f} auc={r['auc']:.4f} f1={r['f1']:.4f} prec={r['prec']:.4f} rec={r['rec']:.4f}") + + # --- EXP 2: Pooled --- + print("\n" + "=" * 60) + print(" EXP 2: Pooled (All Generators 
Mixed)")
+    print("=" * 60)
+
+    fake_list = ["AnimagineXL", "SD", "SDXL", "StyleGAN", "Midjourney", "DALL-E_3"]
+    X_all_fake = np.vstack([all_features[g] for g in fake_list])
+    n_fake = len(X_all_fake)
+    n_real = len(X_real)
+    if n_real < n_fake:
+        idx = rng.choice(n_fake, n_real, replace=False)
+        X_all_fake = X_all_fake[idx]
+    else:
+        idx = rng.choice(n_real, n_fake, replace=False)
+        X_real_pooled = X_real[idx]  # NOTE(review): dead variable — X_pooled below uses X_real[:n], not this subsample; confirm intent
+
+    n = min(len(X_real), len(X_all_fake))
+    X_pooled = np.vstack([X_real[:n], X_all_fake[:n]])
+    y_pooled = np.concatenate([np.zeros(n), np.ones(n)])
+    r = cv_evaluate(X_pooled, y_pooled)
+    print(f" Pooled: acc={r['acc']:.4f} auc={r['auc']:.4f} f1={r['f1']:.4f} prec={r['prec']:.4f} rec={r['rec']:.4f}")
+
+    # --- EXP 3: Leave-one-generator-out ---
+    print("\n" + "=" * 60)
+    print(" EXP 3: Leave-One-Generator-Out (Generalization)")
+    print("=" * 60)
+
+    results_logo = {}
+    for held_out in fake_list:
+        train_gens = [g for g in fake_list if g != held_out]
+        X_train_fake = np.vstack([all_features[g] for g in train_gens])
+        n_tf = len(X_train_fake)
+        X_train = np.vstack([X_real[:n_tf], X_train_fake])
+        y_train = np.concatenate([np.zeros(min(len(X_real), n_tf)), np.ones(n_tf)])
+
+        X_test_fake = all_features[held_out]
+        n_te = len(X_test_fake)
+        X_test = np.vstack([X_real[:n_te], X_test_fake])
+        y_test = np.concatenate([np.zeros(n_te), np.ones(n_te)])
+
+        model = lgb.LGBMClassifier(
+            n_estimators=500, learning_rate=0.05, max_depth=8,
+            num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED,
+        )
+        model.fit(X_train, y_train)
+        yp = model.predict_proba(X_test)[:, 1]
+        ypred = (yp > 0.5).astype(int)
+        acc = accuracy_score(y_test, ypred)
+        auc = roc_auc_score(y_test, yp)
+        f1 = f1_score(y_test, ypred, average="macro")
+        results_logo[held_out] = {"acc": acc, "auc": auc, "f1": f1}
+        print(f" Hold out {held_out:15s}: acc={acc:.4f} auc={auc:.4f} f1={f1:.4f}")
+
+    # Save results
+    results = {
+        "per_generator": results_per_gen,
+        "pooled": r,
+        "leave_one_out": {k: {kk: 
float(vv) for kk, vv in v.items()} for k, v in results_logo.items()}, + } + out_path = Path(__file__).parent.parent / "results" / "imaginet_paintings_results.json" + with open(out_path, "w") as f: + json.dump(results, f, indent=2) + print(f"\nResults saved to {out_path}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_multidataset_benchmark.py b/tests/test_multidataset_benchmark.py new file mode 100644 index 0000000..3e33818 --- /dev/null +++ b/tests/test_multidataset_benchmark.py @@ -0,0 +1,581 @@ +"""Multi-dataset benchmark for artwork feature extraction + LightGBM classification.""" + +from __future__ import annotations + +import os + +os.environ["HF_HOME"] = "D:/Projects/negate/negate/.cache/huggingface" +os.environ["HF_DATASETS_CACHE"] = "D:/Projects/negate/negate/.cache/huggingface/datasets" + +import json +import time +import warnings +from pathlib import Path + +import numpy as np +from numpy.typing import NDArray +from sklearn.metrics import accuracy_score, roc_auc_score +from sklearn.model_selection import StratifiedKFold +from tqdm import tqdm + +warnings.filterwarnings("ignore") + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def extract_features_batch(images: list, extractor) -> NDArray: + """Extract features from a list of PIL images, return (N, D) array.""" + rows = [] + for img in tqdm(images, desc="Extracting features", leave=False): + try: + feats = extractor(img) + rows.append(list(feats.values())) + except Exception: + rows.append([0.0] * 148) + X = np.array(rows, dtype=np.float64) + X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + return X + + +def train_and_evaluate(X_train: NDArray, y_train: NDArray, X_test: NDArray, y_test: NDArray) -> dict: + """Train LightGBM and return accuracy + AUC.""" + import lightgbm as lgb + + model = lgb.LGBMClassifier( + n_estimators=300, + learning_rate=0.05, + 
max_depth=6, + num_leaves=31, + n_jobs=1, + verbose=-1, + random_state=42, + ) + model.fit(X_train, y_train) + y_pred = model.predict(X_test) + y_prob = model.predict_proba(X_test)[:, 1] + acc = accuracy_score(y_test, y_pred) + try: + auc = roc_auc_score(y_test, y_prob) + except ValueError: + auc = float("nan") + return {"accuracy": round(acc, 4), "auc": round(auc, 4)} + + +def cross_validate(X: NDArray, y: NDArray, n_splits: int = 5) -> dict: + """5-fold stratified CV, return pooled accuracy + AUC.""" + import lightgbm as lgb + + skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) + all_y_true, all_y_pred, all_y_prob = [], [], [] + for train_idx, test_idx in skf.split(X, y): + model = lgb.LGBMClassifier( + n_estimators=300, learning_rate=0.05, max_depth=6, + num_leaves=31, n_jobs=1, verbose=-1, random_state=42, + ) + model.fit(X[train_idx], y[train_idx]) + preds = model.predict(X[test_idx]) + probs = model.predict_proba(X[test_idx])[:, 1] + all_y_true.extend(y[test_idx]) + all_y_pred.extend(preds) + all_y_prob.extend(probs) + acc = accuracy_score(all_y_true, all_y_pred) + try: + auc = roc_auc_score(all_y_true, all_y_prob) + except ValueError: + auc = float("nan") + return {"accuracy": round(acc, 4), "auc": round(auc, 4)} + + +def sample_balanced(images: list, labels: list, max_per_class: int) -> tuple[list, list]: + """Subsample up to max_per_class per unique label value.""" + from collections import defaultdict + buckets: dict[int, list[int]] = defaultdict(list) + for i, l in enumerate(labels): + buckets[l].append(i) + rng = np.random.RandomState(42) + selected: list[int] = [] + for cls, idxs in sorted(buckets.items()): + if len(idxs) > max_per_class: + idxs = rng.choice(idxs, max_per_class, replace=False).tolist() + selected.extend(idxs) + rng.shuffle(selected) + return [images[i] for i in selected], [labels[i] for i in selected] + + +# --------------------------------------------------------------------------- +# Dataset loaders +# 
# ---------------------------------------------------------------------------

def load_defactify(extractor, max_per_gen: int = 200) -> tuple[NDArray, NDArray, dict]:
    """Load Defactify, sample up to max_per_gen real+fake per generator.

    Uses Rajarshi-Roy-research/Defactify_Image_Dataset with:
    - Label_B: 0=real, 1=SD2.1, 2=SDXL, 3=SD3, 4=Midjourney v6, 5=DALL-E 3
    - Image: PIL image column

    Returns:
        (X, y, meta): feature matrix, binary labels (0=real, 1=fake), and
        meta dict with the generator names and their row-index blocks in X.
    """
    from collections import defaultdict
    from datasets import load_dataset, Image as HFImage

    GENERATOR_NAMES = {1: "SD_2.1", 2: "SDXL", 3: "SD_3", 4: "Midjourney_v6", 5: "DALL-E_3"}

    print("\n=== Loading Defactify ===")
    ds = load_dataset("Rajarshi-Roy-research/Defactify_Image_Dataset", split="train")
    ds = ds.cast_column("Image", HFImage(decode=True, mode="RGB"))
    print(f"  Total samples: {len(ds)}")

    # Group row indices by Label_B (column read avoids decoding any images).
    label_b = ds["Label_B"]
    gen_buckets: dict[str, list[int]] = defaultdict(list)
    real_indices: list[int] = []
    for i, lb in enumerate(label_b):
        if lb == 0:
            real_indices.append(i)
        elif lb in GENERATOR_NAMES:
            gen_buckets[GENERATOR_NAMES[lb]].append(i)

    generators = sorted(gen_buckets.keys())
    print(f"  Real images: {len(real_indices)}")
    print(f"  Generators: {generators}")
    for g in generators:
        print(f"    {g}: {len(gen_buckets[g])}")

    # Sample real + per-generator fake with a fixed seed for reproducibility.
    rng = np.random.RandomState(42)
    selected_indices: list[int] = []
    selected_labels: list[int] = []
    gen_indices: dict[str, list[int]] = {}

    # Sample real images; remember the positions they occupy in the output.
    real_sample = rng.choice(real_indices, size=min(max_per_gen, len(real_indices)), replace=False).tolist()
    start = len(selected_indices)
    selected_indices.extend(real_sample)
    selected_labels.extend([0] * len(real_sample))
    gen_indices["real"] = list(range(start, len(selected_indices)))

    # Sample fake images per generator, tracking each generator's block.
    for g in generators:
        idxs = gen_buckets[g]
        if len(idxs) > max_per_gen:
            idxs = rng.choice(idxs, max_per_gen, replace=False).tolist()
        start = len(selected_indices)
        selected_indices.extend(idxs)
        selected_labels.extend([1] * len(idxs))
        gen_indices[g] = list(range(start, len(selected_indices)))

    print(f"  Selected {len(selected_indices)} samples (real={selected_labels.count(0)}, fake={selected_labels.count(1)})")
    import gc

    # Extract features one at a time to avoid holding all images in memory.
    # BUG FIX: the failure path previously appended a hard-coded 148-wide zero
    # row; if the extractor's feature count ever differs, np.array() would
    # receive a ragged list.  Record placeholders instead and pad them to the
    # true width observed from successful extractions.
    rows: list = []
    n_feat = None
    for idx in tqdm(selected_indices, desc="Defactify: extract features"):
        try:
            img = ds[int(idx)]["Image"]
            feats = extractor(img)
            row = list(feats.values())
            n_feat = len(row)
            rows.append(row)
            del img
        except Exception:
            rows.append(None)  # filled with zeros below, once the width is known

    del ds
    gc.collect()

    if n_feat is None:
        # Every extraction failed — a zero matrix would silently poison results.
        raise RuntimeError("Defactify: feature extraction failed for every sample")
    rows = [r if r is not None else [0.0] * n_feat for r in rows]

    X = np.array(rows, dtype=np.float64)
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    y = np.array(selected_labels, dtype=np.int32)

    meta = {"generators": generators, "gen_indices": gen_indices}
    return X, y, meta


def load_hemg(extractor, max_per_class: int = 1000) -> tuple[NDArray, NDArray]:
    """Load Hemg AI-art dataset. label: 0=AI, 1=real."""
    from collections import defaultdict
    from datasets import load_dataset
    import gc

    print("\n=== Loading Hemg (AI art vs real art) ===")
    ds = load_dataset("Hemg/AI-Generated-vs-Real-Images-Datasets", split="train")
    print(f"  Total samples: {len(ds)}")

    # Read only the labels column (fast, avoids decoding images).
    all_labels = ds["label"]  # fetches entire label column as list
    label_buckets: dict[int, list[int]] = defaultdict(list)
    for i, lbl in enumerate(all_labels):
        label_buckets[lbl].append(i)

    for k, v in sorted(label_buckets.items()):
        print(f"  Label {k}: {len(v)} samples")

    # Sample balanced indices first, with a fixed seed for reproducibility.
    rng = np.random.RandomState(42)
    selected_indices: list[int] = []
    selected_labels: list[int] = []
    for cls in sorted(label_buckets.keys()):
        idxs = label_buckets[cls]
        if len(idxs) > max_per_class:
            idxs = rng.choice(idxs, max_per_class, replace=False).tolist()
        selected_indices.extend(idxs)
        selected_labels.extend([cls] * len(idxs))

print(f" Selected {len(selected_indices)} samples") + + # Extract features one at a time + rows = [] + final_labels = [] + for idx, lbl in tqdm(zip(selected_indices, selected_labels), total=len(selected_indices), desc="Hemg: extract features"): + try: + img = ds[idx]["image"] + feats = extractor(img) + rows.append(list(feats.values())) + final_labels.append(lbl) + del img + except Exception: + rows.append([0.0] * 148) + final_labels.append(lbl) + + del ds + gc.collect() + + X = np.array(rows, dtype=np.float64) + X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + y = np.array(final_labels, dtype=np.int32) + return X, y + + +def load_tellif(extractor, max_per_class: int = 200) -> tuple[NDArray, NDArray, list[str]]: + """Load tellif semantically similar dataset.""" + from collections import defaultdict + from datasets import load_dataset + import gc + + print("\n=== Loading tellif (semantically similar) ===") + ds = load_dataset("tellif/ai_vs_real_image_semantically_similar", split="test") + print(f" Total samples: {len(ds)}") + + label_names = ds.features["label"].names + print(f" Label names: {label_names}") + + # First pass: read labels only + all_labels_raw = [ds[i]["label"] for i in range(len(ds))] + + unique, counts = np.unique(all_labels_raw, return_counts=True) + for u, c in zip(unique, counts): + print(f" {label_names[u]} ({u}): {c}") + + # Map to binary: find which labels are "real" vs AI-generated + real_keywords = ["real", "photograph", "photo", "human", "genuine", "original"] + real_label_ids = set() + for lid, name in enumerate(label_names): + if any(kw in name.lower() for kw in real_keywords): + real_label_ids.add(lid) + print(f" Detected real labels: {[label_names[i] for i in sorted(real_label_ids)]}") + if not real_label_ids: + # If no clear "real" label, treat the first label as real + print(" WARNING: No clear 'real' label found. 
Treating label 0 as real.") + real_label_ids = {0} + + binary_labels = [0 if l in real_label_ids else 1 for l in all_labels_raw] + + # Sample indices per binary class + buckets: dict[int, list[int]] = defaultdict(list) + for i, bl in enumerate(binary_labels): + buckets[bl].append(i) + rng = np.random.RandomState(42) + selected_indices: list[int] = [] + selected_binary: list[int] = [] + for cls in sorted(buckets.keys()): + idxs = buckets[cls] + if len(idxs) > max_per_class: + idxs = rng.choice(idxs, max_per_class, replace=False).tolist() + selected_indices.extend(idxs) + selected_binary.extend([cls] * len(idxs)) + print(f" Selected {len(selected_indices)} samples (binary: 0=real, 1=AI)") + + # Extract features one at a time + rows = [] + final_labels = [] + for idx, lbl in tqdm(zip(selected_indices, selected_binary), total=len(selected_indices), desc="tellif: extract features"): + try: + img = ds[idx]["image"] + if img is not None: + feats = extractor(img) + rows.append(list(feats.values())) + final_labels.append(lbl) + del img + except Exception: + continue + + del ds + gc.collect() + X = np.array(rows, dtype=np.float64) + X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + y = np.array(final_labels, dtype=np.int32) + return X, y, label_names + + +# --------------------------------------------------------------------------- +# Experiments +# --------------------------------------------------------------------------- + +def experiment_1_baselines(X_def, y_def, X_hemg, y_hemg, X_tellif, y_tellif) -> dict: + """Exp 1: Single-dataset baselines with 5-fold CV.""" + print("\n" + "=" * 70) + print("EXPERIMENT 1: Single-dataset baselines (5-fold CV)") + print("=" * 70) + + results = {} + for name, X, y in [("Defactify", X_def, y_def), ("Hemg", X_hemg, y_hemg), ("tellif", X_tellif, y_tellif)]: + print(f"\n {name}: {X.shape[0]} samples, {X.shape[1]} features") + r = cross_validate(X, y) + results[name] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + 
+ print("\n +--------------+----------+--------+") + print(" | Dataset | Accuracy | AUC |") + print(" +--------------+----------+--------+") + for name, r in results.items(): + print(f" | {name:<12} | {r['accuracy']:.4f} | {r['auc']:.4f} |") + print(" +--------------+----------+--------+") + return results + + +def experiment_2_cross_dataset(X_def, y_def, X_hemg, y_hemg, X_tellif, y_tellif) -> dict: + """Exp 2: Cross-dataset generalization.""" + print("\n" + "=" * 70) + print("EXPERIMENT 2: Cross-dataset generalization") + print("=" * 70) + + # Normalize Hemg labels: 0=AI->1(fake), 1=real->0(real) to match Defactify convention + # Defactify: 0=real, 1=fake + # Hemg: 0=AI, 1=real -> need to flip + y_hemg_norm = 1 - y_hemg + # tellif: already 0=real, 1=AI + + results = {} + + # Train Defactify -> test Hemg + print("\n Train Defactify -> Test Hemg") + r = train_and_evaluate(X_def, y_def, X_hemg, y_hemg_norm) + results["Defactify->Hemg"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # Train Hemg -> test Defactify + print("\n Train Hemg -> Test Defactify") + r = train_and_evaluate(X_hemg, y_hemg_norm, X_def, y_def) + results["Hemg->Defactify"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # Combined train -> test each + X_combined = np.vstack([X_def, X_hemg]) + y_combined = np.concatenate([y_def, y_hemg_norm]) + + print("\n Train Combined -> Test Defactify") + r = train_and_evaluate(X_combined, y_combined, X_def, y_def) + results["Combined->Defactify"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + print("\n Train Combined -> Test Hemg") + r = train_and_evaluate(X_combined, y_combined, X_hemg, y_hemg_norm) + results["Combined->Hemg"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + print("\n Train Combined -> Test tellif") + r = train_and_evaluate(X_combined, y_combined, X_tellif, y_tellif) + results["Combined->tellif"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: 
{r['auc']:.4f}") + + print("\n +------------------------+----------+--------+") + print(" | Transfer | Accuracy | AUC |") + print(" +------------------------+----------+--------+") + for name, r in results.items(): + print(f" | {name:<22} | {r['accuracy']:.4f} | {r['auc']:.4f} |") + print(" +------------------------+----------+--------+") + return results + + +def experiment_3_generator_diversity(X_def, y_def, meta_def) -> dict: + """Exp 3: Generator diversity impact using Defactify.""" + print("\n" + "=" * 70) + print("EXPERIMENT 3: Generator diversity impact (Defactify)") + print("=" * 70) + + generators = meta_def["generators"] + gen_indices = meta_def["gen_indices"] + real_idx = np.array(gen_indices["real"]) # shared real block + results = {} + + # Build per-generator fake-only index arrays + gen_fake_arrays = {} + for g in generators: + gen_fake_arrays[g] = np.array(gen_indices[g]) + + rng = np.random.RandomState(42) + gen_order = list(generators) + rng.shuffle(gen_order) + print(f" Generator order: {gen_order}") + + # Split real indices proportionally for train/test + n_gens = len(generators) + + # Progressive: train on N generators, test on rest + for n_train in range(1, n_gens): + train_gens = gen_order[:n_train] + test_gens = gen_order[n_train:] + + # Split real images proportionally + real_shuffled = rng.permutation(real_idx) + n_real_train = int(len(real_shuffled) * n_train / n_gens) + real_train = real_shuffled[:n_real_train] + real_test = real_shuffled[n_real_train:] + + train_fake = np.concatenate([gen_fake_arrays[g] for g in train_gens]) + test_fake = np.concatenate([gen_fake_arrays[g] for g in test_gens]) + + train_idx = np.concatenate([real_train, train_fake]) + test_idx = np.concatenate([real_test, test_fake]) + + X_tr, y_tr = X_def[train_idx], y_def[train_idx] + X_te, y_te = X_def[test_idx], y_def[test_idx] + + label = f"{n_train}_gen_train" + print(f"\n Train on {n_train} gen ({', '.join(train_gens)}) -> Test on ({', '.join(test_gens)})") + 
print(f" Train: {len(X_tr)} (real={np.sum(y_tr==0)}, fake={np.sum(y_tr==1)})") + print(f" Test: {len(X_te)} (real={np.sum(y_te==0)}, fake={np.sum(y_te==1)})") + r = train_and_evaluate(X_tr, y_tr, X_te, y_te) + r["train_generators"] = train_gens + r["test_generators"] = test_gens + results[label] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # All 5 generators: 5-fold CV + print(f"\n All {n_gens} generators: 5-fold CV") + r = cross_validate(X_def, y_def) + r["train_generators"] = generators + r["test_generators"] = generators + results[f"{n_gens}_gen_cv"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + print("\n +----------------+----------+--------+") + print(" | # Generators | Accuracy | AUC |") + print(" +----------------+----------+--------+") + for name, r in results.items(): + print(f" | {name:<14} | {r['accuracy']:.4f} | {r['auc']:.4f} |") + print(" +----------------+----------+--------+") + return results + + +def experiment_4_domain_transfer(X_def, y_def, X_hemg, y_hemg) -> dict: + """Exp 4: Art vs photos domain transfer.""" + print("\n" + "=" * 70) + print("EXPERIMENT 4: Art vs Photos domain transfer") + print("=" * 70) + + # Normalize Hemg: 0=AI->1, 1=real->0 + y_hemg_norm = 1 - y_hemg + + results = {} + + # Photos -> Art + print("\n Train Defactify (photos) -> Test Hemg (art)") + r = train_and_evaluate(X_def, y_def, X_hemg, y_hemg_norm) + results["Photos->Art"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # Art -> Photos + print("\n Train Hemg (art) -> Test Defactify (photos)") + r = train_and_evaluate(X_hemg, y_hemg_norm, X_def, y_def) + results["Art->Photos"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # Same-domain baselines for comparison + print("\n Baseline: Defactify 5-fold CV (same domain)") + r = cross_validate(X_def, y_def) + results["Photos_self_CV"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + print("\n Baseline: Hemg 
5-fold CV (same domain)") + r = cross_validate(X_hemg, y_hemg_norm) + results["Art_self_CV"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # Compute domain gap + photo_self = results["Photos_self_CV"]["accuracy"] + art_self = results["Art_self_CV"]["accuracy"] + p2a = results["Photos->Art"]["accuracy"] + a2p = results["Art->Photos"]["accuracy"] + print(f"\n Domain gap (Photos->Art): {photo_self:.4f} -> {p2a:.4f} (delta: {photo_self - p2a:+.4f})") + print(f" Domain gap (Art->Photos): {art_self:.4f} -> {a2p:.4f} (delta: {art_self - a2p:+.4f})") + + print("\n +------------------+----------+--------+") + print(" | Transfer | Accuracy | AUC |") + print(" +------------------+----------+--------+") + for name, r in results.items(): + print(f" | {name:<16} | {r['accuracy']:.4f} | {r['auc']:.4f} |") + print(" +------------------+----------+--------+") + return results + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> None: + start_time = time.time() + + from negate.extract.feature_artwork import ArtworkExtract + extractor = ArtworkExtract() + + # Determine feature count + from PIL import Image + dummy = Image.new("RGB", (255, 255), color="gray") + n_features = len(extractor(dummy)) + print(f"Feature count: {n_features}") + + # Load datasets + X_def, y_def, meta_def = load_defactify(extractor, max_per_gen=200) + X_hemg, y_hemg = load_hemg(extractor, max_per_class=1000) + X_tellif, y_tellif, tellif_labels = load_tellif(extractor, max_per_class=200) + + print(f"\nDataset shapes:") + print(f" Defactify: X={X_def.shape}, y={y_def.shape} (real={np.sum(y_def==0)}, fake={np.sum(y_def==1)})") + print(f" Hemg: X={X_hemg.shape}, y={y_hemg.shape} (AI={np.sum(y_hemg==0)}, real={np.sum(y_hemg==1)})") + print(f" tellif: X={X_tellif.shape}, y={y_tellif.shape} (real={np.sum(y_tellif==0)}, AI={np.sum(y_tellif==1)})") + + # Run 
experiments + results_all: dict = {"feature_count": n_features} + + results_all["exp1_baselines"] = experiment_1_baselines(X_def, y_def, X_hemg, y_hemg, X_tellif, y_tellif) + results_all["exp2_cross_dataset"] = experiment_2_cross_dataset(X_def, y_def, X_hemg, y_hemg, X_tellif, y_tellif) + results_all["exp3_generator_diversity"] = experiment_3_generator_diversity(X_def, y_def, meta_def) + results_all["exp4_domain_transfer"] = experiment_4_domain_transfer(X_def, y_def, X_hemg, y_hemg) + + elapsed = time.time() - start_time + results_all["elapsed_seconds"] = round(elapsed, 1) + + # Save results + out_path = Path("D:/Projects/negate/negate/results/multidataset_benchmark_results.json") + out_path.parent.mkdir(parents=True, exist_ok=True) + + # Make JSON-serializable + def _make_serializable(obj): + if isinstance(obj, dict): + return {k: _make_serializable(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [_make_serializable(v) for v in obj] + if isinstance(obj, (np.integer,)): + return int(obj) + if isinstance(obj, (np.floating,)): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + return obj + + with open(out_path, "w") as f: + json.dump(_make_serializable(results_all), f, indent=2) + print(f"\nResults saved to {out_path}") + print(f"Total elapsed: {elapsed:.1f}s") + + +if __name__ == "__main__": + main() diff --git a/tests/test_scale_evaluation.py b/tests/test_scale_evaluation.py deleted file mode 100644 index df9bbfe..0000000 --- a/tests/test_scale_evaluation.py +++ /dev/null @@ -1,363 +0,0 @@ -# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 -"""Scale evaluation: test if more training data improves artwork detection. - -Runs the 49-feature pipeline on increasing sample sizes from Hemg (art vs art) -to determine if 71% accuracy is a data problem or a feature problem. 
-""" - -from __future__ import annotations - -import json -import sys -from datetime import datetime -from pathlib import Path - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from matplotlib.backends.backend_pdf import PdfPages -import numpy as np -import pandas as pd -import xgboost as xgb -from datasets import load_dataset, Image as HFImage -from PIL import Image -from sklearn.metrics import ( - accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, - confusion_matrix, -) -from sklearn.model_selection import StratifiedKFold -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC -from tqdm import tqdm - -sys.path.insert(0, str(Path(__file__).parent.parent)) -from negate.extract.feature_artwork import ArtworkExtract - -SEED = 42 -N_FOLDS = 5 -REPO = "Hemg/AI-Generated-vs-Real-Images-Datasets" -SAMPLE_SIZES = [400, 1000, 2000, 4000] # total (half per class) -RESULTS_DIR = Path(__file__).parent.parent / "results" - - -def extract_features_cached(dataset, n_per_class: int, extractor: ArtworkExtract): - """Extract features, balanced per class.""" - all_labels = dataset["label"] - features, labels, errors = [], [], 0 - - rng = np.random.RandomState(SEED) - - for lbl in [0, 1]: - indices = [i for i, l in enumerate(all_labels) if l == lbl] - chosen = rng.choice(indices, size=min(n_per_class, len(indices)), replace=False) - - for idx in tqdm(chosen, desc=f" Label {lbl} (n={n_per_class})"): - try: - img = dataset[int(idx)]["image"] - if img is None or not isinstance(img, Image.Image): - errors += 1 - continue - feat = extractor(img) - features.append(feat) - # label 0 = AI art (synthetic), label 1 = Real art (genuine) - # We want: 0 = genuine, 1 = synthetic - labels.append(1 if lbl == 0 else 0) - except Exception: - errors += 1 - - print(f" Extracted {len(features)} ({errors} errors)") - df = pd.DataFrame(features).fillna(0) - X = 
df.to_numpy(dtype=np.float64) - X = np.where(np.isfinite(X), X, 0) - y = np.array(labels) - return X, y, list(df.columns) - - -def run_cv(X, y, model_type="xgb"): - """Run 5-fold CV, return pooled y_true, y_prob.""" - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - all_true, all_prob = [], [] - - for train_idx, test_idx in skf.split(X, y): - X_train, X_test = X[train_idx], X[test_idx] - y_train, y_test = y[train_idx], y[test_idx] - - if model_type == "xgb": - spw = np.sum(y_train == 0) / max(np.sum(y_train == 1), 1) - params = { - "objective": "binary:logistic", "eval_metric": "logloss", - "max_depth": 4, "learning_rate": 0.1, "subsample": 0.8, - "colsample_bytree": 0.8, "scale_pos_weight": spw, "seed": SEED, - } - dtrain = xgb.DMatrix(X_train, label=y_train) - dtest = xgb.DMatrix(X_test, label=y_test) - model = xgb.train(params, dtrain, num_boost_round=200, - evals=[(dtest, "test")], early_stopping_rounds=10, - verbose_eval=False) - y_prob = model.predict(dtest) - elif model_type == "svm": - scaler = StandardScaler() - X_tr = scaler.fit_transform(X_train) - X_te = scaler.transform(X_test) - svm = SVC(kernel="rbf", probability=True, random_state=SEED) - svm.fit(X_tr, y_train) - y_prob = svm.predict_proba(X_te)[:, 1] - elif model_type == "mlp": - scaler = StandardScaler() - X_tr = scaler.fit_transform(X_train) - X_te = scaler.transform(X_test) - mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=1000, - random_state=SEED, early_stopping=True) - mlp.fit(X_tr, y_train) - y_prob = mlp.predict_proba(X_te)[:, 1] - - all_true.extend(y_test) - all_prob.extend(y_prob) - - y_true = np.array(all_true) - y_prob = np.array(all_prob) - y_pred = (y_prob > 0.5).astype(int) - - return { - "accuracy": float(accuracy_score(y_true, y_pred)), - "precision": float(precision_score(y_true, y_pred, zero_division=0)), - "recall": float(recall_score(y_true, y_pred, zero_division=0)), - "f1": float(f1_score(y_true, y_pred, average="macro")), - "roc_auc": 
float(roc_auc_score(y_true, y_prob)), - } - - -def generate_pdf(all_results): - """Generate scaling analysis PDF.""" - RESULTS_DIR.mkdir(exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - pdf_path = RESULTS_DIR / f"scale_evaluation_{timestamp}.pdf" - - with PdfPages(str(pdf_path)) as pdf: - # PAGE 1: Title + scaling curves - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - - fig.suptitle("Scaling Analysis: Does More Data Improve\nArtwork Detection Accuracy?", - fontsize=16, fontweight="bold", fontfamily="serif", y=0.96) - - # Subtitle - fig.text(0.5, 0.90, f"negate project — darkshapes — {datetime.now().strftime('%B %d, %Y')}", - fontsize=10, ha="center", fontfamily="serif", style="italic") - - fig.text(0.5, 0.87, "Dataset: Hemg/AI-Generated-vs-Real-Images-Datasets (AI Art vs Real Art)", - fontsize=9, ha="center", fontfamily="serif") - - # Accuracy scaling curve - ax1 = fig.add_axes([0.12, 0.52, 0.76, 0.3]) - sizes = [r["total"] for r in all_results] - - for model, color, marker in [("xgb", "#4472C4", "o"), ("svm", "#ED7D31", "s"), ("mlp", "#70AD47", "^")]: - accs = [r[model]["accuracy"] for r in all_results] - ax1.plot(sizes, accs, f"-{marker}", color=color, label=model.upper(), markersize=8, linewidth=2) - for x, y in zip(sizes, accs): - ax1.annotate(f"{y:.1%}", (x, y), textcoords="offset points", - xytext=(0, 10), ha="center", fontsize=8) - - ax1.set_xlabel("Total Training Samples", fontsize=10) - ax1.set_ylabel("5-Fold CV Accuracy", fontsize=10) - ax1.set_title("Accuracy vs Training Set Size", fontsize=12, fontfamily="serif") - ax1.legend(fontsize=9) - ax1.grid(True, alpha=0.3) - ax1.set_ylim(0.5, 1.0) - ax1.axhline(y=0.5, color="red", linestyle="--", alpha=0.3, label="Random chance") - - # AUC scaling curve - ax2 = fig.add_axes([0.12, 0.12, 0.76, 0.3]) - - for model, color, marker in [("xgb", "#4472C4", "o"), ("svm", "#ED7D31", "s"), ("mlp", "#70AD47", "^")]: - aucs = [r[model]["roc_auc"] for r in all_results] - 
ax2.plot(sizes, aucs, f"-{marker}", color=color, label=model.upper(), markersize=8, linewidth=2) - for x, y in zip(sizes, aucs): - ax2.annotate(f"{y:.3f}", (x, y), textcoords="offset points", - xytext=(0, 10), ha="center", fontsize=8) - - ax2.set_xlabel("Total Training Samples", fontsize=10) - ax2.set_ylabel("5-Fold CV ROC-AUC", fontsize=10) - ax2.set_title("ROC-AUC vs Training Set Size", fontsize=12, fontfamily="serif") - ax2.legend(fontsize=9) - ax2.grid(True, alpha=0.3) - ax2.set_ylim(0.5, 1.0) - - pdf.savefig(fig) - plt.close(fig) - - # PAGE 2: Results table + analysis - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - fig.suptitle("Detailed Results & Analysis", fontsize=14, - fontweight="bold", fontfamily="serif", y=0.96) - - # Results table - ax_table = fig.add_axes([0.05, 0.62, 0.9, 0.28]) - ax_table.axis("off") - - table_data = [] - for r in all_results: - for model in ["xgb", "svm", "mlp"]: - m = r[model] - table_data.append([ - str(r["total"]), model.upper(), - f"{m['accuracy']:.2%}", f"{m['precision']:.2%}", - f"{m['recall']:.2%}", f"{m['f1']:.2%}", f"{m['roc_auc']:.4f}" - ]) - - table = ax_table.table( - cellText=table_data, - colLabels=["Samples", "Model", "Accuracy", "Precision", "Recall", "F1", "AUC"], - loc="center", cellLoc="center", - ) - table.auto_set_font_size(False) - table.set_fontsize(7.5) - table.scale(1, 1.3) - for (row, col), cell in table.get_celld().items(): - if row == 0: - cell.set_facecolor("#4472C4") - cell.set_text_props(color="white", fontweight="bold") - - # Analysis - ax_text = fig.add_axes([0.08, 0.05, 0.84, 0.52]) - ax_text.axis("off") - - best_final = max(all_results[-1]["xgb"]["accuracy"], - all_results[-1]["svm"]["accuracy"], - all_results[-1]["mlp"]["accuracy"]) - best_initial = max(all_results[0]["xgb"]["accuracy"], - all_results[0]["svm"]["accuracy"], - all_results[0]["mlp"]["accuracy"]) - improvement = best_final - best_initial - - analysis = ( - "Analysis\n\n" - f"Sample sizes tested: {', 
'.join(str(r['total']) for r in all_results)}\n" - f"Best accuracy at smallest size ({all_results[0]['total']}): {best_initial:.1%}\n" - f"Best accuracy at largest size ({all_results[-1]['total']}): {best_final:.1%}\n" - f"Improvement from scaling: {improvement:+.1%}pp\n\n" - ) - - if improvement > 0.10: - analysis += ( - "FINDING: Significant improvement with more data.\n" - "The 49 features have capacity to learn — the initial low accuracy was\n" - "primarily a data limitation. With sufficient training data, the hand-crafted\n" - "features can achieve useful detection rates on artwork.\n\n" - "Recommendation: Scale to even larger samples (10K+) and consider\n" - "integrating these features into the negate pipeline." - ) - elif improvement > 0.03: - analysis += ( - "FINDING: Modest improvement with more data.\n" - "More data helps somewhat, but accuracy is plateauing. The features\n" - "capture some genuine signal but are limited by their expressiveness.\n\n" - "Recommendation: The hand-crafted features are hitting a ceiling.\n" - "To push past this, the pipeline needs learned features — either\n" - "fine-tuned CLIP/DINOv2 or the self-supervised approach from\n" - "Zhong et al. (2026)." - ) - else: - analysis += ( - "FINDING: Minimal improvement with more data.\n" - "The features are saturated — adding more training data does not help.\n" - "The 49 hand-crafted features simply don't capture enough discriminative\n" - "information to distinguish AI art from human art.\n\n" - "Recommendation: Fundamentally different features are needed.\n" - "Hand-crafted statistics cannot match the representational power of\n" - "learned features for this task." 
- ) - - analysis += ( - "\n\nContext\n\n" - "This evaluation uses only the Hemg dataset where BOTH classes are artwork.\n" - "This is the hardest and most honest test — no content shortcuts.\n" - "All processing is CPU-only, 49 features per image.\n" - "5-fold stratified cross-validation with fixed random seed (42).\n" - ) - - ax_text.text(0, 1, analysis, transform=ax_text.transAxes, fontsize=9, - ha="left", va="top", fontfamily="serif") - - pdf.savefig(fig) - plt.close(fig) - - print(f"PDF saved to: {pdf_path}") - return pdf_path - - -def main(): - print("=" * 60) - print(" SCALING ANALYSIS: Art Detection vs Training Data Size") - print(" Dataset: Hemg (AI Art vs Real Art)") - print("=" * 60) - - print("\nLoading dataset...") - ds = load_dataset(REPO, split="train") - ds = ds.cast_column("image", HFImage(decode=True, mode="RGB")) - print(f" Total rows: {len(ds)}") - - extractor = ArtworkExtract() - all_results = [] - - # We extract at the largest size once, then subsample - max_per_class = max(SAMPLE_SIZES) // 2 - print(f"\nExtracting features for {max_per_class} per class...") - X_full, y_full, feature_names = extract_features_cached(ds, max_per_class, extractor) - print(f" Total: {len(y_full)} images, {X_full.shape[1]} features") - print(f" Balance: {np.sum(y_full==0)} genuine, {np.sum(y_full==1)} synthetic") - - for total in SAMPLE_SIZES: - per_class = total // 2 - print(f"\n{'='*40}") - print(f" Testing with {total} samples ({per_class} per class)") - print(f"{'='*40}") - - # Subsample from the full extraction - rng = np.random.RandomState(SEED) - idx_0 = np.where(y_full == 0)[0] - idx_1 = np.where(y_full == 1)[0] - chosen_0 = rng.choice(idx_0, size=min(per_class, len(idx_0)), replace=False) - chosen_1 = rng.choice(idx_1, size=min(per_class, len(idx_1)), replace=False) - chosen = np.concatenate([chosen_0, chosen_1]) - X = X_full[chosen] - y = y_full[chosen] - - result = {"total": len(y)} - for model in ["xgb", "svm", "mlp"]: - print(f" Running 
{model.upper()}...") - result[model] = run_cv(X, y, model) - print(f" acc={result[model]['accuracy']:.2%} auc={result[model]['roc_auc']:.4f}") - - all_results.append(result) - - # Save JSON - RESULTS_DIR.mkdir(exist_ok=True) - json_path = RESULTS_DIR / "scale_evaluation_results.json" - with open(json_path, "w") as f: - json.dump({ - "timestamp": datetime.now().isoformat(), - "dataset": REPO, - "feature_count": X_full.shape[1], - "results": all_results, - }, f, indent=2) - print(f"\nJSON saved to: {json_path}") - - # Generate PDF - print("\nGenerating PDF...") - generate_pdf(all_results) - - # Print summary - print(f"\n{'='*60}") - print(" SUMMARY") - print(f"{'='*60}") - for r in all_results: - best = max(r["xgb"]["accuracy"], r["svm"]["accuracy"], r["mlp"]["accuracy"]) - print(f" n={r['total']:5d} best_acc={best:.2%}") - - -if __name__ == "__main__": - main()