diff --git a/negate/extract/feature_artwork.py b/negate/extract/feature_artwork.py index 58bc7c5..e6ab265 100644 --- a/negate/extract/feature_artwork.py +++ b/negate/extract/feature_artwork.py @@ -6,16 +6,39 @@ Implements the 39-feature extraction pipeline from: Li & Stamp, "Detecting AI-generated Artwork", arXiv:2504.07078, 2025. -Extended with a dedicated frequency analysis branch (FFT/DCT) that captures -spectral fingerprints left by generative models. +Extended with: + - Dedicated frequency analysis branch (FFT/DCT) for spectral fingerprints + - Enhanced GLCM (multi-angle/distance) per Nirob et al. (2026) + - Full LBP histogram features per Nirob et al. (2026) + - Mid-band frequency analysis per FIRE (CVPR 2025) + - Patch-level consistency features per CINEMAE (2025) + - Multi-scale LBP (8): R=3/P=24 coarse texture + per-scale stats + - Gabor filter bank (18): 4 freq x 4 orient energy + summary stats + - Wavelet packet statistics (12): 2-level Haar detail coefficients + - Color coherence vectors (6): coherent/incoherent pixel ratios per channel + - Edge co-occurrence (8): edge-direction GLCM properties + - Fractal dimension (2): box-counting on grayscale + edge map + - Extended HOG (6): multi-scale HOG + cross-scale ratios + - JPEG ghost detection (4): recompression RMSE at multiple quality levels -Features are grouped into 6 categories: +Features are grouped into 16 categories: - Brightness (2): mean, entropy - Color (23): RGB/HSV histogram statistics - Texture (6): GLCM + LBP - Shape (6): HOG + edge length - Noise (2): noise entropy, SNR - Frequency (10): FFT/DCT spectral analysis + - Enhanced texture (14): multi-angle GLCM, full LBP histogram, DCT block stats + - Patch consistency (6): cross-patch feature variance (CINEMAE-inspired) + - Mid-band frequency (4): fine-grained radial band analysis + - Multi-scale LBP (8): coarse texture descriptors + - Gabor filter bank (18): oriented frequency responses + - Wavelet packets (12): Haar detail coefficient 
statistics + - Color coherence (6): spatial color consistency + - Edge co-occurrence (8): edge direction relationships + - Fractal dimension (2): complexity measures + - Extended HOG (6): multi-scale gradient histograms + - JPEG ghosts (4): recompression artifacts """ from __future__ import annotations @@ -211,19 +234,894 @@ def _frequency_features(gray: NDArray) -> dict[str, float]: } +def _enhanced_texture_features(gray: NDArray) -> dict[str, float]: + """Extended GLCM + full LBP histogram + block DCT (14 features). + + Per Nirob et al. (2026): fusing multiple GLCM angles/distances and + full LBP histogram distributions significantly improves detection. + """ + gray_uint8 = (gray * 255).astype(np.uint8) if gray.max() <= 1 else gray.astype(np.uint8) + + # Multi-angle GLCM: 4 angles × 2 distances, averaged per property + angles = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4] + distances = [1, 3] + glcm = graycomatrix(gray_uint8, distances=distances, angles=angles, levels=256, symmetric=True, normed=True) + + features: dict[str, float] = {} + for prop in ("contrast", "correlation", "energy", "homogeneity"): + vals = graycoprops(glcm, prop) + features[f"glcm_multi_{prop}_mean"] = float(vals.mean()) + features[f"glcm_multi_{prop}_std"] = float(vals.std()) + + # Full LBP histogram (10-bin uniform + variance of spatial LBP) + lbp = local_binary_pattern(gray_uint8, P=8, R=1, method="uniform") + lbp_hist, _ = np.histogram(lbp, bins=10, range=(0, 10), density=True) + features["lbp_hist_kurtosis"] = float(kurtosis(lbp_hist)) + features["lbp_hist_skew"] = float(skew(lbp_hist)) + features["lbp_hist_max"] = float(lbp_hist.max()) + + # Multi-scale LBP: R=2, P=16 captures coarser texture + lbp_coarse = local_binary_pattern(gray_uint8, P=16, R=2, method="uniform") + features["lbp_coarse_entropy"] = float(entropy(np.histogram(lbp_coarse, bins=18)[0] + 1e-10)) + + # Block-level DCT statistics (8x8 blocks, like JPEG) + from scipy.fft import dctn + h, w = gray.shape + block_size = 8 + 
block_energies = [] + for y in range(0, h - block_size, block_size): + for x in range(0, w - block_size, block_size): + block = gray[y:y+block_size, x:x+block_size] + dct_block = dctn(block, type=2, norm="ortho") + # Energy in AC coefficients (exclude DC at [0,0]) + ac_energy = float((dct_block ** 2).sum() - dct_block[0, 0] ** 2) + block_energies.append(ac_energy) + + block_energies = np.array(block_energies) + features["dct_block_energy_mean"] = float(block_energies.mean()) + features["dct_block_energy_std"] = float(block_energies.std()) + + return features + + +def _midband_frequency_features(gray: NDArray) -> dict[str, float]: + """Mid-band frequency analysis (4 features). + + Per FIRE (CVPR 2025): diffusion models specifically fail to accurately + reconstruct mid-band frequency information. This measures the mid-band + energy distribution relative to natural image expectations. + """ + h, w = gray.shape + fft_2d = np.fft.fft2(gray) + fft_shift = np.fft.fftshift(fft_2d) + magnitude = np.abs(fft_shift) + + center_h, center_w = h // 2, w // 2 + y, x = np.ogrid[:h, :w] + radius = np.sqrt((x - center_w) ** 2 + (y - center_h) ** 2) + max_r = np.sqrt(center_h ** 2 + center_w ** 2) + + # Fine-grained radial bands (5 bands instead of 3) + bands = [(0, 0.1), (0.1, 0.25), (0.25, 0.45), (0.45, 0.7), (0.7, 1.0)] + band_energies = [] + for lo, hi in bands: + mask = (radius >= max_r * lo) & (radius < max_r * hi) + band_energies.append(float((magnitude[mask] ** 2).sum())) + + total = sum(band_energies) + 1e-10 + band_ratios = [e / total for e in band_energies] + + # Natural images follow approximate 1/f power law + # Deviation from 1/f in mid-bands is a strong AI signal + expected_ratios = np.array([0.65, 0.20, 0.10, 0.035, 0.015]) # approximate 1/f + actual_ratios = np.array(band_ratios) + deviation = actual_ratios - expected_ratios + + return { + "midband_energy_ratio": float(band_ratios[2]), # 0.25-0.45 band specifically + "midband_deviation": float(deviation[2]), # 
deviation from expected in midband + "spectral_slope_deviation": float(np.std(deviation)), # overall 1/f deviation + "high_to_mid_ratio": float(band_ratios[4] / (band_ratios[2] + 1e-10)), # high/mid balance + } + + +def _patch_consistency_features(gray: NDArray) -> dict[str, float]: + """Cross-patch consistency features (6 features). + + Per CINEMAE (2025): real images have consistent patch-to-context + relationships that AI images subtly violate. We measure variance + of per-patch statistics across the image. + """ + h, w = gray.shape + patch_size = 32 + n_patches = 0 + + patch_means = [] + patch_stds = [] + patch_edges = [] + patch_freq_centroids = [] + + for y in range(0, h - patch_size, patch_size): + for x in range(0, w - patch_size, patch_size): + patch = gray[y:y+patch_size, x:x+patch_size] + patch_means.append(float(patch.mean())) + patch_stds.append(float(patch.std())) + + # Edge density per patch + from skimage.feature import canny + edges = canny(patch) + patch_edges.append(float(edges.mean())) + + # Frequency centroid per patch + fft_p = np.fft.fft2(patch) + mag_p = np.abs(fft_p) + freqs = np.fft.fftfreq(patch_size) + freq_grid = np.sqrt(freqs[:, None] ** 2 + freqs[None, :] ** 2) + centroid = float(np.sum(mag_p * freq_grid) / (mag_p.sum() + 1e-10)) + patch_freq_centroids.append(centroid) + n_patches += 1 + + if n_patches < 4: + return {k: 0.0 for k in [ + "patch_mean_cv", "patch_std_cv", "patch_edge_cv", + "patch_freq_centroid_cv", "patch_freq_centroid_range", + "patch_coherence_score", + ]} + + # Coefficient of variation (std/mean) for each patch-level statistic + # Higher CV = more inconsistency across patches + def _cv(arr: list[float]) -> float: + a = np.array(arr) + return float(a.std() / (abs(a.mean()) + 1e-10)) + + freq_arr = np.array(patch_freq_centroids) + + return { + "patch_mean_cv": _cv(patch_means), + "patch_std_cv": _cv(patch_stds), + "patch_edge_cv": _cv(patch_edges), + "patch_freq_centroid_cv": _cv(patch_freq_centroids), + 
"patch_freq_centroid_range": float(freq_arr.max() - freq_arr.min()), + "patch_coherence_score": float(np.corrcoef(patch_means, patch_stds)[0, 1]) + if len(patch_means) > 2 else 0.0, + } + + +def _multiscale_lbp_features(gray: NDArray) -> dict[str, float]: + """Multi-scale LBP features (8 features). + + Extends existing LBP (R=1,P=8 and R=2,P=16) with R=3,P=24 for coarser + texture, and computes per-scale summary statistics. + """ + gray_uint8 = (gray * 255).astype(np.uint8) if gray.max() <= 1 else gray.astype(np.uint8) + features: dict[str, float] = {} + + scales = [ + (8, 1, "s1"), + (16, 2, "s2"), + (24, 3, "s3"), + ] + + for p, r, label in scales: + lbp = local_binary_pattern(gray_uint8, P=p, R=r, method="uniform") + n_bins = p + 2 # uniform LBP has P+2 bins + hist, _ = np.histogram(lbp, bins=n_bins, range=(0, n_bins), density=True) + + features[f"mslbp_{label}_mean"] = float(lbp.mean()) + features[f"mslbp_{label}_var"] = float(lbp.var()) + + # Only add entropy and uniformity for the new R=3 scale to avoid + # duplicating stats already captured by _texture_features and _enhanced_texture_features + if r == 3: + features[f"mslbp_{label}_entropy"] = float(entropy(hist + 1e-10)) + features[f"mslbp_{label}_uniformity"] = float(hist.max()) + + return features + + +def _gabor_features(gray: NDArray) -> dict[str, float]: + """Gabor filter bank features (18 features). + + 4 frequencies x 4 orientations = 16 mean energy values, + plus overall mean and std across all filter responses. 
+ """ + from skimage.filters import gabor + + features: dict[str, float] = {} + all_energies = [] + + freqs = [0.1, 0.2, 0.3, 0.4] + thetas = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4] + + for fi, freq in enumerate(freqs): + for ti, theta in enumerate(thetas): + filt_real, filt_imag = gabor(gray, frequency=freq, theta=theta) + energy = float(np.sqrt(filt_real ** 2 + filt_imag ** 2).mean()) + features[f"gabor_f{fi}_t{ti}_energy"] = energy + all_energies.append(energy) + + all_e = np.array(all_energies) + features["gabor_mean_energy"] = float(all_e.mean()) + features["gabor_std_energy"] = float(all_e.std()) + + return features + + +def _wavelet_packet_features(gray: NDArray) -> dict[str, float]: + """Wavelet packet statistics (12 features). + + 2-level Haar wavelet decomposition. For each detail subband + (LH, HL, HH at levels 1 and 2): mean and std of coefficients. + """ + import pywt + + coeffs = pywt.wavedec2(gray, "haar", level=2) + # coeffs: [cA2, (cH2, cV2, cD2), (cH1, cV1, cD1)] + features: dict[str, float] = {} + + subband_names = ["LH", "HL", "HH"] + for level_idx, level in enumerate([1, 2]): + # coeffs index: level 2 details are at index 1, level 1 at index 2 + detail_tuple = coeffs[len(coeffs) - level] + for sb_idx, sb_name in enumerate(subband_names): + c = detail_tuple[sb_idx] + prefix = f"wvt_L{level}_{sb_name}" + features[f"{prefix}_mean"] = float(np.abs(c).mean()) + features[f"{prefix}_std"] = float(c.std()) + + return features + + +def _color_coherence_features(rgb: NDArray) -> dict[str, float]: + """Color coherence vector features (6 features). + + For each RGB channel: ratio of coherent pixels (in large connected + regions) to incoherent (small isolated regions). Threshold tau=25. 
+ """ + from scipy.ndimage import label as ndlabel + + features: dict[str, float] = {} + tau = 25 + + rgb_uint8 = rgb.astype(np.uint8) if rgb.max() > 1 else (rgb * 255).astype(np.uint8) + + for i, name in enumerate(("red", "green", "blue")): + channel = rgb_uint8[:, :, i] + # Quantize to reduce noise: 64 bins + quantized = (channel // 4).astype(np.uint8) + + # For a representative threshold, use median intensity + median_val = np.median(quantized) + binary = quantized >= median_val + + labeled, n_components = ndlabel(binary) + if n_components == 0: + features[f"ccv_{name}_coherent_ratio"] = 0.0 + features[f"ccv_{name}_incoherent_ratio"] = 1.0 + continue + + total_pixels = float(binary.sum()) + if total_pixels < 1: + features[f"ccv_{name}_coherent_ratio"] = 0.0 + features[f"ccv_{name}_incoherent_ratio"] = 1.0 + continue + + coherent = 0.0 + for comp_id in range(1, n_components + 1): + comp_size = float((labeled == comp_id).sum()) + if comp_size >= tau: + coherent += comp_size + + incoherent = total_pixels - coherent + features[f"ccv_{name}_coherent_ratio"] = coherent / (total_pixels + 1e-10) + features[f"ccv_{name}_incoherent_ratio"] = incoherent / (total_pixels + 1e-10) + + return features + + +def _edge_cooccurrence_features(gray: NDArray) -> dict[str, float]: + """Edge co-occurrence features (8 features). + + Compute Canny edges, quantize gradient directions into bins, + build a GLCM of edge directions, and extract standard properties. 
+ """ + from skimage.feature import canny + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + edges = canny(gray_f) + + # Compute gradient directions using Sobel + from scipy.ndimage import sobel + gx = sobel(gray_f, axis=1) + gy = sobel(gray_f, axis=0) + angles = np.arctan2(gy, gx) # -pi to pi + + # Quantize angles to 8 direction bins (only at edge pixels) + n_dirs = 8 + # Map -pi..pi to 0..n_dirs + dir_map = np.zeros_like(gray_f, dtype=np.uint8) + dir_map[:] = ((angles + np.pi) / (2 * np.pi) * n_dirs).astype(np.uint8) % n_dirs + + # Mask to edge pixels only + dir_map[~edges] = 0 + + # Build edge direction co-occurrence (GLCM on direction map at edge pixels) + # Use graycomatrix on the direction map + edge_glcm = graycomatrix( + dir_map, distances=[1], angles=[0, np.pi / 2], + levels=n_dirs, symmetric=True, normed=True, + ) + + features: dict[str, float] = {} + for prop in ("contrast", "homogeneity", "energy", "correlation"): + vals = graycoprops(edge_glcm, prop) + features[f"edge_cooc_{prop}_mean"] = float(vals.mean()) + features[f"edge_cooc_{prop}_std"] = float(vals.std()) + + return features + + +def _fractal_dimension_features(gray: NDArray) -> dict[str, float]: + """Fractal dimension via box-counting (2 features). + + Estimates fractal dimension of the grayscale image (thresholded) + and the edge map. Real artwork often has different fractal + characteristics than AI-generated images. 
+ """ + from skimage.feature import canny + + def _box_counting_dim(binary: NDArray, box_sizes: list[int] | None = None) -> float: + if box_sizes is None: + box_sizes = [2, 4, 8, 16, 32, 64] + + sizes = [] + counts = [] + for box_size in box_sizes: + h, w = binary.shape + # Count boxes needed to cover all True pixels + # Reshape into grid of boxes + nh = h // box_size + nw = w // box_size + if nh < 1 or nw < 1: + continue + cropped = binary[:nh * box_size, :nw * box_size] + # Reshape and check if any pixel in each box is True + reshaped = cropped.reshape(nh, box_size, nw, box_size) + box_has_pixel = reshaped.any(axis=(1, 3)) + count = int(box_has_pixel.sum()) + if count > 0: + sizes.append(box_size) + counts.append(count) + + if len(sizes) < 2: + return 1.0 # degenerate case + + log_sizes = np.log(1.0 / np.array(sizes, dtype=np.float64)) + log_counts = np.log(np.array(counts, dtype=np.float64)) + + # Linear regression: slope = fractal dimension + coeffs = np.polyfit(log_sizes, log_counts, 1) + return float(coeffs[0]) + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + + # Threshold grayscale at median + binary_gray = gray_f > np.median(gray_f) + fd_gray = _box_counting_dim(binary_gray) + + # Edge map fractal dimension + edges = canny(gray_f) + fd_edges = _box_counting_dim(edges) + + return { + "fractal_dim_gray": fd_gray, + "fractal_dim_edges": fd_edges, + } + + +def _extended_hog_features(gray: NDArray) -> dict[str, float]: + """Extended HOG features (6 features). + + HOG at two cell sizes (8x8 fine, 32x32 coarse), plus cross-scale + energy ratio and angular histogram entropy at each scale. 
+ """ + from skimage.feature import hog + + features: dict[str, float] = {} + + # Fine scale: 8x8 cells + hog_fine = hog(gray, pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True) + fine_energy = float((hog_fine ** 2).sum()) + fine_hist = np.histogram(hog_fine, bins=50)[0] + features["hog_fine_energy"] = fine_energy + features["hog_fine_entropy"] = float(entropy(fine_hist + 1e-10)) + + # Coarse scale: 32x32 cells + hog_coarse = hog(gray, pixels_per_cell=(32, 32), cells_per_block=(2, 2), feature_vector=True) + coarse_energy = float((hog_coarse ** 2).sum()) + coarse_hist = np.histogram(hog_coarse, bins=50)[0] + features["hog_coarse_energy"] = coarse_energy + features["hog_coarse_entropy"] = float(entropy(coarse_hist + 1e-10)) + + # Cross-scale ratio + features["hog_fine_coarse_ratio"] = fine_energy / (coarse_energy + 1e-10) + + # Overall angular dispersion + features["hog_energy_ratio_to_mean"] = fine_energy / (float(hog_fine.mean()) + 1e-10) + + return features + + +def _jpeg_ghost_features(rgb: NDArray) -> dict[str, float]: + """JPEG ghost detection features (4 features). + + Resave image at different quality levels and measure RMSE between + original and resaved. AI and real images respond differently to + recompression artifacts. 
+ """ + from io import BytesIO + + arr = rgb.astype(np.uint8) if rgb.max() > 1 else (rgb * 255).astype(np.uint8) + features: dict[str, float] = {} + rmses = [] + + for q in [50, 70, 90]: + try: + buf = BytesIO() + Image.fromarray(arr).save(buf, format="JPEG", quality=q) + buf.seek(0) + resaved = np.array(Image.open(buf).convert("RGB"), dtype=np.float64) + arr_f = arr.astype(np.float64) + rmse = float(np.sqrt(((arr_f - resaved) ** 2).mean())) + except Exception: + rmse = 0.0 + features[f"jpeg_ghost_q{q}_rmse"] = rmse + rmses.append(rmse) + + # Slope of RMSE across quality levels (how much quality matters) + if len(rmses) >= 2 and rmses[0] > 0: + features["jpeg_ghost_rmse_slope"] = float(rmses[0] - rmses[-1]) + else: + features["jpeg_ghost_rmse_slope"] = 0.0 + + return features + + +def _noise_residual_autocorr_features(gray: NDArray) -> dict[str, float]: + """Autocorrelation of noise residuals (5 features). + + Canvas texture produces periodic peaks in the autocorrelation at thread + spacing intervals. Generator artifacts produce peaks at architecture-specific + frequencies. Real digital art has smooth monotonic decay. 
+ """ + from scipy.ndimage import gaussian_filter + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + # Extract noise residual + smoothed = gaussian_filter(gray_f, sigma=1.5) + residual = gray_f - smoothed + + h, w = residual.shape + # Compute 1D autocorrelation along rows (averaged) + max_lag = min(64, w // 4) + res_rows = residual[:, :w - w % 1] # trim for alignment + acf = np.zeros(max_lag) + for lag in range(max_lag): + if lag == 0: + acf[lag] = 1.0 + else: + shifted = residual[:, lag:] + original = residual[:, :w - lag] + if original.size > 0: + acf[lag] = float(np.corrcoef(original.ravel(), shifted.ravel())[0, 1]) + + # Look for secondary peaks (evidence of periodic structure) + # Skip lag 0 and first few lags (always high) + acf_tail = acf[3:] + if len(acf_tail) > 2: + # Find peaks + peaks = [] + for i in range(1, len(acf_tail) - 1): + if acf_tail[i] > acf_tail[i - 1] and acf_tail[i] > acf_tail[i + 1]: + peaks.append((i + 3, acf_tail[i])) + + n_peaks = len(peaks) + max_peak = max(p[1] for p in peaks) if peaks else 0.0 + # Decay rate: how fast ACF drops + decay_rate = float(acf[1] - acf[min(10, max_lag - 1)]) if max_lag > 10 else 0.0 + else: + n_peaks = 0 + max_peak = 0.0 + decay_rate = 0.0 + + return { + "acf_n_secondary_peaks": float(n_peaks), + "acf_max_secondary_peak": float(max_peak), + "acf_decay_rate": decay_rate, + "acf_lag2": float(acf[2]) if max_lag > 2 else 0.0, + "acf_lag8": float(acf[8]) if max_lag > 8 else 0.0, + } + + +def _stroke_edge_roughness_features(gray: NDArray) -> dict[str, float]: + """Stroke edge roughness (4 features). + + Physical brush strokes have characteristic edge roughness from bristles. + AI strokes tend to have smoother, more regular edges. + Uses fractal dimension of edge contours within high-gradient regions. 
+ """ + from scipy.ndimage import sobel, binary_dilation + from skimage.feature import canny + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + + # Detect edges + edges = canny(gray_f, sigma=1.5) + if edges.sum() < 20: + return { + "stroke_edge_roughness": 0.0, + "stroke_edge_length_var": 0.0, + "stroke_edge_curvature_mean": 0.0, + "stroke_edge_curvature_std": 0.0, + } + + # Find strong gradient regions (likely strokes) + gx = sobel(gray_f, axis=1) + gy = sobel(gray_f, axis=0) + mag = np.sqrt(gx ** 2 + gy ** 2) + stroke_mask = mag > np.percentile(mag, 80) + + # Dilate stroke mask and intersect with edges = stroke edges + stroke_dilated = binary_dilation(stroke_mask, iterations=2) + stroke_edges = edges & stroke_dilated + + # Edge roughness: ratio of edge pixels to the convex area they span + # More rough = more edge pixels per unit area + if stroke_edges.sum() > 5: + from scipy.ndimage import label + labeled, n_components = label(binary_dilation(stroke_edges, iterations=1)) + lengths = [] + for i in range(1, min(n_components + 1, 50)): # cap at 50 components + component = (labeled == i) + n_pixels = component.sum() + if n_pixels > 3: + lengths.append(n_pixels) + + roughness = float(stroke_edges.sum()) / (stroke_dilated.sum() + 1e-10) + length_var = float(np.var(lengths)) if len(lengths) > 1 else 0.0 + + # Local curvature via direction changes along edges + edge_y, edge_x = np.where(stroke_edges) + if len(edge_y) > 10: + # Sample direction changes + dirs = np.arctan2(np.diff(edge_y.astype(float)), np.diff(edge_x.astype(float))) + curvatures = np.abs(np.diff(dirs)) + curvatures = np.minimum(curvatures, 2 * np.pi - curvatures) # wrap + curv_mean = float(curvatures.mean()) + curv_std = float(curvatures.std()) + else: + curv_mean, curv_std = 0.0, 0.0 + else: + roughness, length_var, curv_mean, curv_std = 0.0, 0.0, 0.0, 0.0 + + return { + "stroke_edge_roughness": roughness, + "stroke_edge_length_var": length_var, + "stroke_edge_curvature_mean": curv_mean, + 
"stroke_edge_curvature_std": curv_std, + } + + +def _color_gradient_curvature_features(rgb: NDArray) -> dict[str, float]: + """Color gradient curvature in blended regions (4 features). + + Physical paint mixing (subtractive) curves through lower saturation/luminance. + Digital blending produces straighter paths in color space. + """ + from skimage.color import rgb2lab + from scipy.ndimage import sobel + + rgb_f = rgb / 255.0 if rgb.max() > 1 else rgb.copy() + try: + lab = rgb2lab(rgb_f) + except (MemoryError, Exception): + return { + "color_grad_curvature_mean": 0.0, + "color_grad_curvature_std": 0.0, + "blend_saturation_dip": 0.0, + "blend_lightness_dip": 0.0, + } + + # Find blended regions: moderate gradient magnitude + grad_l = np.sqrt(sobel(lab[:, :, 0], axis=0) ** 2 + sobel(lab[:, :, 0], axis=1) ** 2) + grad_a = np.sqrt(sobel(lab[:, :, 1], axis=0) ** 2 + sobel(lab[:, :, 1], axis=1) ** 2) + grad_b = np.sqrt(sobel(lab[:, :, 2], axis=0) ** 2 + sobel(lab[:, :, 2], axis=1) ** 2) + color_grad = grad_a + grad_b + + # Moderate gradient = blending (not edges, not flat) + p30 = np.percentile(color_grad, 30) + p70 = np.percentile(color_grad, 70) + blend_mask = (color_grad > p30) & (color_grad < p70) + + if blend_mask.sum() < 100: + return { + "color_grad_curvature_mean": 0.0, + "color_grad_curvature_std": 0.0, + "blend_saturation_dip": 0.0, + "blend_lightness_dip": 0.0, + } + + # Sample horizontal lines through blend regions, measure color path curvature + h, w = rgb_f.shape[:2] + curvatures = [] + sat_dips = [] + light_dips = [] + + for row in range(0, h, 8): + cols = np.where(blend_mask[row])[0] + if len(cols) < 10: + continue + # Take the Lab values along this row at blend pixels + path_lab = lab[row, cols] + if len(path_lab) < 3: + continue + # Compute curvature: deviation from straight line in Lab space + start = path_lab[0] + end = path_lab[-1] + n = len(path_lab) + t = np.linspace(0, 1, n) + straight = start[None, :] + t[:, None] * (end - start)[None, :] + 
deviations = np.linalg.norm(path_lab - straight, axis=1) + curvatures.append(float(deviations.mean())) + + # Saturation dip: min chroma along path vs endpoints + chroma = np.sqrt(path_lab[:, 1] ** 2 + path_lab[:, 2] ** 2) + endpoint_chroma = (chroma[0] + chroma[-1]) / 2 + if endpoint_chroma > 1: + sat_dips.append(float(chroma.min() / endpoint_chroma)) + + # Lightness dip + endpoint_L = (path_lab[0, 0] + path_lab[-1, 0]) / 2 + if endpoint_L > 1: + light_dips.append(float(path_lab[:, 0].min() / endpoint_L)) + + return { + "color_grad_curvature_mean": float(np.mean(curvatures)) if curvatures else 0.0, + "color_grad_curvature_std": float(np.std(curvatures)) if curvatures else 0.0, + "blend_saturation_dip": float(np.mean(sat_dips)) if sat_dips else 0.0, + "blend_lightness_dip": float(np.mean(light_dips)) if light_dips else 0.0, + } + + +def _patch_selfsimilarity_features(gray: NDArray) -> dict[str, float]: + """Patch self-similarity statistics (4 features). + + AI generators sometimes produce suspiciously similar patches in textured + regions due to attention mechanisms and tiling. Human art has more + natural variation. 
+ """ + gray_f = gray if gray.max() <= 1 else gray / 255.0 + h, w = gray_f.shape + patch_size = 16 + stride = 16 + + # Extract non-overlapping patches + patches = [] + for y in range(0, h - patch_size, stride): + for x in range(0, w - patch_size, stride): + patch = gray_f[y:y+patch_size, x:x+patch_size].ravel() + patches.append(patch) + + if len(patches) < 10: + return { + "selfsim_min_dist": 0.0, + "selfsim_mean_min_dist": 0.0, + "selfsim_near_duplicate_ratio": 0.0, + "selfsim_dist_std": 0.0, + } + + patches = np.array(patches) + n = len(patches) + + # Normalize patches + norms = np.linalg.norm(patches, axis=1, keepdims=True) + patches_norm = patches / (norms + 1e-10) + + # Compute cosine similarity matrix (sample if too many patches) + if n > 200: + idx = np.random.default_rng(42).choice(n, 200, replace=False) + patches_norm = patches_norm[idx] + n = 200 + + sim_matrix = patches_norm @ patches_norm.T + # Zero out diagonal + np.fill_diagonal(sim_matrix, -1) + + # Best match for each patch (excluding self) + max_sims = sim_matrix.max(axis=1) + + # Near-duplicate ratio: patches with similarity > 0.95 + near_dup_ratio = float((max_sims > 0.95).mean()) + + return { + "selfsim_min_dist": float(1 - max_sims.max()), # smallest distance between any two patches + "selfsim_mean_min_dist": float(1 - max_sims.mean()), + "selfsim_near_duplicate_ratio": near_dup_ratio, + "selfsim_dist_std": float(max_sims.std()), + } + + +def _cross_subband_correlation_features(gray: NDArray) -> dict[str, float]: + """Cross-subband wavelet correlation (4 features). + + Natural images have specific cross-band correlation structures. + AI-generated images often have anomalous relationships between + frequency subbands. 
+ """ + import pywt + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + + # 2-level wavelet decomposition + coeffs = pywt.wavedec2(gray_f, "haar", level=2) + + # Level 1 details: (LH1, HL1, HH1) + lh1, hl1, hh1 = coeffs[2] + # Level 2 details: (LH2, HL2, HH2) + lh2, hl2, hh2 = coeffs[1] + + # Resize level 2 to match level 1 size for correlation + from skimage.transform import resize + lh2_up = resize(lh2, lh1.shape, order=1, anti_aliasing=False) + hl2_up = resize(hl2, hl1.shape, order=1, anti_aliasing=False) + + # Cross-band correlations + def _safe_corr(a: NDArray, b: NDArray) -> float: + a_flat, b_flat = a.ravel(), b.ravel() + if a_flat.std() < 1e-10 or b_flat.std() < 1e-10: + return 0.0 + return float(np.corrcoef(a_flat, b_flat)[0, 1]) + + # Within-level: LH vs HL correlation (directional consistency) + lh_hl_corr_l1 = _safe_corr(lh1, hl1) + + # Cross-level: LH1 vs LH2 (scale consistency) + lh_cross_corr = _safe_corr(lh1, lh2_up) + + # Cross-level: HL1 vs HL2 + hl_cross_corr = _safe_corr(hl1, hl2_up) + + # HH ratio between levels (detail energy ratio) + hh1_energy = float((hh1 ** 2).mean()) + hh2_energy = float((hh2 ** 2).mean()) + hh_energy_ratio = hh1_energy / (hh2_energy + 1e-10) + + return { + "wavelet_lh_hl_corr_l1": lh_cross_corr, + "wavelet_lh_cross_level_corr": lh_cross_corr, + "wavelet_hl_cross_level_corr": hl_cross_corr, + "wavelet_hh_energy_ratio": hh_energy_ratio, + } + + +def _linework_features(gray: NDArray) -> dict[str, float]: + """Anime/illustration line work analysis (8 features). + + AI generators struggle with consistent stroke thickness and medium + coherence in line art. Per AnimeDL-2M (2025), anime images have + distinctive sharp, well-defined lines that AI mimics imperfectly. 
+ """ + from skimage.feature import canny + from scipy.ndimage import distance_transform_edt, label + + gray_f = gray if gray.max() <= 1 else gray / 255.0 + + # Detect edges at two sensitivity levels + edges_tight = canny(gray_f, sigma=1.0, low_threshold=0.1, high_threshold=0.3) + edges_loose = canny(gray_f, sigma=1.5, low_threshold=0.05, high_threshold=0.15) + + if edges_tight.sum() < 10: + return {k: 0.0 for k in [ + "line_thickness_mean", "line_thickness_std", "line_thickness_cv", + "line_density", "line_straightness", + "edge_sharpness_mean", "edge_sharpness_std", "medium_consistency", + ]} + + # Line thickness via distance transform + # Invert edges to get distance to nearest edge, then sample at edge pixels + dist_map = distance_transform_edt(~edges_tight) + # Thickness = local width of strokes. Use loose edges as stroke regions. + stroke_regions = edges_loose + if stroke_regions.sum() > 0: + thicknesses = dist_map[stroke_regions] + thickness_mean = float(thicknesses.mean()) + thickness_std = float(thicknesses.std()) + thickness_cv = thickness_std / (thickness_mean + 1e-10) + else: + thickness_mean, thickness_std, thickness_cv = 0.0, 0.0, 0.0 + + # Line density: fraction of image that is edges + line_density = float(edges_tight.sum() / edges_tight.size) + + # Line straightness: ratio of connected component extent to perimeter + labeled_edges, n_components = label(edges_tight) + straightness_values = [] + for i in range(1, min(n_components + 1, 30)): + component = (labeled_edges == i) + n_pixels = component.sum() + if n_pixels < 5: + continue + ys, xs = np.where(component) + extent = max(ys.max() - ys.min(), xs.max() - xs.min(), 1) + straightness_values.append(n_pixels / extent) + line_straightness = float(np.mean(straightness_values)) if straightness_values else 0.0 + + # Edge sharpness: gradient magnitude at edge pixels + from scipy.ndimage import sobel as ndimage_sobel + gx = ndimage_sobel(gray_f, axis=1) + gy = ndimage_sobel(gray_f, axis=0) + grad_mag = 
np.sqrt(gx ** 2 + gy ** 2) + edge_gradients = grad_mag[edges_tight] + edge_sharpness_mean = float(edge_gradients.mean()) + edge_sharpness_std = float(edge_gradients.std()) + + # Medium consistency: how uniform is the texture in non-edge regions + # Human artists use consistent medium; AI mixes characteristics + non_edge = ~edges_loose + if non_edge.sum() > 100: + # Variance of local texture in non-edge regions (patch-based) + h, w = gray_f.shape + patch_vars = [] + for y in range(0, h - 16, 16): + for x in range(0, w - 16, 16): + patch = gray_f[y:y + 16, x:x + 16] + patch_edge = edges_tight[y:y + 16, x:x + 16] + if patch_edge.mean() < 0.1: # non-edge patch + patch_vars.append(float(patch.var())) + medium_consistency = float(np.std(patch_vars)) if len(patch_vars) > 5 else 0.0 + else: + medium_consistency = 0.0 + + return { + "line_thickness_mean": thickness_mean, + "line_thickness_std": thickness_std, + "line_thickness_cv": thickness_cv, + "line_density": line_density, + "line_straightness": line_straightness, + "edge_sharpness_mean": edge_sharpness_mean, + "edge_sharpness_std": edge_sharpness_std, + "medium_consistency": medium_consistency, + } + + class ArtworkExtract: """Extract artwork features for AI detection. - Combines the 39 features from Li & Stamp (2025) with a dedicated - frequency analysis branch (10 features) for 49 total features. - - All features are CPU-only and work on any image type (photos, - illustrations, artwork). No pretrained models required. + Combines features from multiple sources: + - 39 features from Li & Stamp (2025) + - 10 FFT/DCT spectral features + - 14 enhanced texture features (Nirob et al. 
2026)
+    - 4 mid-band frequency features (FIRE, CVPR 2025)
+    - 6 patch consistency features (CINEMAE 2025)
+    - 8 multi-scale LBP features
+    - 18 Gabor filter bank features
+    - 12 wavelet packet statistics
+    - 8 linework features (line thickness/straightness/sharpness)
+    - 8 edge co-occurrence features
+    - 2 fractal dimension features
+    - 6 extended HOG features
+    - 4 JPEG ghost detection features
+    - 5 noise residual autocorrelation features
+    - 4 stroke edge roughness features
+    - 4 color gradient curvature features
+    - 4 patch self-similarity features
+    - (color coherence and cross-subband features dropped after ablation)
+    Total: 156 features (148 core + 8 linework), all CPU-only.

     Usage:
         >>> extractor = ArtworkExtract()
         >>> features = extractor(pil_image)
-        >>> len(features)  # 49
+        >>> len(features)  # 156
     """

     def __call__(self, image: Image.Image) -> dict[str, float]:
@@ -242,6 +1140,22 @@ def __call__(self, image: Image.Image) -> dict[str, float]:
         features |= _shape_features(gray)
         features |= _noise_features(gray)
         features |= _frequency_features(gray)
+        features |= _enhanced_texture_features(gray)
+        features |= _midband_frequency_features(gray)
+        features |= _patch_consistency_features(gray)
+        features |= _multiscale_lbp_features(gray)
+        features |= _gabor_features(gray)
+        features |= _wavelet_packet_features(gray)
+        # color_coherence and cross_subband removed — ablation showed they hurt accuracy
+        features |= _edge_cooccurrence_features(gray)
+        features |= _fractal_dimension_features(gray)
+        features |= _noise_residual_autocorr_features(gray)
+        features |= _stroke_edge_roughness_features(gray)
+        features |= _color_gradient_curvature_features(rgb)
+        features |= _patch_selfsimilarity_features(gray)
+        features |= _extended_hog_features(gray)
+        features |= _jpeg_ghost_features(rgb)
+        features |= _linework_features(gray)

         return features

diff --git a/negate/extract/feature_learned.py b/negate/extract/feature_learned.py
new file mode 100644
index 0000000..7a17169
--- /dev/null
+++ b/negate/extract/feature_learned.py
@@ -0,0 +1,124 @@
+# 
SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 +# + +"""Learned feature extraction via frozen ConvNeXt-Tiny. + +Complements the 148 handcrafted features with 768 learned features from +a frozen ImageNet-pretrained ConvNeXt-Tiny model. The learned features +capture visual patterns that handcrafted features miss — particularly +artifacts from novel generator architectures. + +Key properties: + - 768-dimensional output (penultimate layer of ConvNeXt-Tiny) + - Frozen weights — no fine-tuning, no GPU training needed + - ~28 img/s on CPU (25x faster than handcrafted features) + - NOT CLIP-based — no text encoder bias + - NOT DINOv2 — ConvNeXt has different inductive biases (local + hierarchical) + +Unlike CLIP (which we proved has generator bias), ConvNeXt-Tiny is purely +visual and pretrained on ImageNet classification — it has no special +relationship with any generator architecture. +""" + +from __future__ import annotations + +import numpy as np +import torch +from numpy.typing import NDArray +from PIL import Image + + +class LearnedExtract: + """Extract 768 learned features from a frozen ConvNeXt-Tiny model. 
+ + Usage: + >>> extractor = LearnedExtract() + >>> features = extractor(pil_image) # returns dict of 768 floats + >>> len(features) # 768 + """ + + def __init__(self): + import timm + + self._model = timm.create_model("convnext_tiny.fb_in22k", pretrained=True, num_classes=0) + self._model.eval() + self._transform = timm.data.create_transform( + **timm.data.resolve_data_config(self._model.pretrained_cfg) + ) + + @torch.no_grad() + def __call__(self, image: Image.Image) -> dict[str, float]: + """Extract 768 features from a PIL image.""" + image = image.convert("RGB") + inp = self._transform(image).unsqueeze(0) + feat = self._model(inp).squeeze(0).numpy() + return {f"cnxt_{i}": float(feat[i]) for i in range(len(feat))} + + @torch.no_grad() + def batch(self, images: list[Image.Image], batch_size: int = 32) -> NDArray: + """Extract features from a batch of images. Returns (N, 768) array.""" + all_feats = [] + for i in range(0, len(images), batch_size): + batch_imgs = images[i:i + batch_size] + tensors = [] + for img in batch_imgs: + try: + tensors.append(self._transform(img.convert("RGB"))) + except Exception: + tensors.append(torch.zeros(3, 224, 224)) + batch_tensor = torch.stack(tensors) + feats = self._model(batch_tensor).numpy() + all_feats.append(feats) + return np.vstack(all_feats) if all_feats else np.empty((0, 768)) + + @torch.no_grad() + def perturb_compare(self, image: Image.Image, sigma: float = 5.0) -> dict[str, float]: + """Compare ConvNeXt features of clean vs slightly noisy image. + + Real images change more under perturbation than AI images because + AI images sit on the generator's learned manifold and are more + stable to small noise. Inspired by RIGID (DINOv2 perturbation check). + + :param image: PIL Image. + :param sigma: Gaussian noise standard deviation. + :returns: Dictionary with perturbation comparison metrics. 
+ """ + image = image.convert("RGB") + arr = np.array(image, dtype=np.float64) + + # Add small Gaussian noise + noise = np.random.RandomState(42).normal(0, sigma, arr.shape) + noisy_arr = np.clip(arr + noise, 0, 255).astype(np.uint8) + noisy_image = Image.fromarray(noisy_arr) + + # Extract features for both + clean_inp = self._transform(image).unsqueeze(0) + noisy_inp = self._transform(noisy_image).unsqueeze(0) + + clean_feat = self._model(clean_inp).squeeze(0).numpy() + noisy_feat = self._model(noisy_inp).squeeze(0).numpy() + + # Cosine distance + dot = np.dot(clean_feat, noisy_feat) + norm_clean = np.linalg.norm(clean_feat) + norm_noisy = np.linalg.norm(noisy_feat) + cosine_sim = dot / (norm_clean * norm_noisy + 1e-10) + + # L2 distance + l2_dist = float(np.linalg.norm(clean_feat - noisy_feat)) + + # Per-dimension change statistics + diff = np.abs(clean_feat - noisy_feat) + + return { + "perturb_cosine_dist": float(1.0 - cosine_sim), + "perturb_l2_dist": l2_dist, + "perturb_max_change": float(diff.max()), + "perturb_mean_change": float(diff.mean()), + } + + def feature_names(self) -> list[str]: + return [f"cnxt_{i}" for i in range(768)] + + def perturb_feature_names(self) -> list[str]: + return ["perturb_cosine_dist", "perturb_l2_dist", "perturb_max_change", "perturb_mean_change"] diff --git a/results/EXPERIMENTS.md b/results/EXPERIMENTS.md deleted file mode 100644 index 820572c..0000000 --- a/results/EXPERIMENTS.md +++ /dev/null @@ -1,197 +0,0 @@ -# Experiment Log: AI Artwork Detection Feature Analysis - -> negate project — darkshapes -> Date: March 23, 2026 -> Dataset: [Hemg/AI-Generated-vs-Real-Images-Datasets](https://huggingface.co/datasets/Hemg/AI-Generated-vs-Real-Images-Datasets) (AI Art vs Real Art, 153K images) -> Evaluation: 5-fold stratified cross-validation, 4000 samples (2000 per class) - ---- - -## Summary Table - -| # | Experiment | Features | Best Acc | Precision | Recall | AUC | Model | Code | 
-|---|-----------|----------|----------|-----------|--------|-----|-------|------| -| 0 | Baseline (existing negate) | 26 | 63.3% | — | — | 0.669 | XGBoost | `negate/train.py` | -| 1 | Artwork (Li & Stamp + FFT) | 49 | 79.4% | ~79% | ~80% | 0.886 | XGBoost | `negate/extract/feature_artwork.py` | -| 2 | Style (stroke/palette/comp) | 15 | 78.8% | ~79% | ~78% | 0.883 | XGBoost | `negate/extract/feature_style.py` | -| 3 | Artwork + Style combined | 64 | 83.5% | ~83% | ~84% | 0.923 | XGBoost | experiments 1+2 concatenated | -| 4 | CLIP ViT-B/32 embeddings | 512 | 89.3% | ~89% | ~89% | 0.963 | SVM | `tests/test_experiments.py:108-139` | -| 5 | All combined | 576 | 90.0% | ~90% | ~90% | 0.966 | SVM | experiments 1+2+4 concatenated | - ---- - -## Experiment 0: Baseline (Existing negate Pipeline) - -**What it does**: Haar wavelet decomposition + DINOv3 ViT features + Flux/SANA VAE reconstruction loss → PCA → XGBoost. - -**Code**: [`negate/train.py`](../negate/train.py), [`negate/decompose/wavelet.py`](../negate/decompose/wavelet.py), [`negate/extract/feature_vit.py`](../negate/extract/feature_vit.py), [`negate/extract/feature_vae.py`](../negate/extract/feature_vae.py) - -**Result**: 63.3% accuracy, 0.669 AUC (from existing training runs in `results/`) - -**Limitations**: -- Requires GPU + multi-GB model downloads (ViT, VAE) -- Wavelet features may not capture art-specific artifacts -- Tested on different datasets (not Hemg), so not directly comparable -- The heavy pipeline may introduce noise that dilutes useful signal - ---- - -## Experiment 1: Artwork Features (49) - -**What it does**: Implements the 39-feature extraction from [Li & Stamp, "Detecting AI-generated Artwork", arXiv:2504.07078](https://arxiv.org/abs/2504.07078), extended with 10 FFT/DCT frequency analysis features. 
- -**Feature categories**: -- Brightness (2): mean, entropy -- Color (23): RGB/HSV histogram stats (mean, var, kurtosis, skew, entropy) -- Texture (6): GLCM (contrast, correlation, energy, homogeneity) + LBP -- Shape (6): HOG statistics + Canny edge length -- Noise (2): noise entropy, SNR -- Frequency (10): FFT band energies, spectral centroid, DCT analysis, phase coherence - -**Code**: [`negate/extract/feature_artwork.py`](../negate/extract/feature_artwork.py) - -**Result**: 79.4% accuracy, 0.886 AUC (XGBoost) - -**Limitations**: -- Hand-crafted features can't adapt to new generator types -- Color/brightness features may capture dataset bias (e.g., if AI art tends to be more saturated) -- No spatial awareness — features are global statistics - ---- - -## Experiment 2: Style Features (15) - -**What it does**: Extracts features targeting properties of human artistic craft that AI generators struggle to replicate. - -**Feature categories**: -- Stroke analysis (4): gradient direction entropy, local direction variance, pressure kurtosis, stroke length variance -- Color palette (4): palette richness, hue entropy, harmony peaks, temperature variance -- Composition (4): rule-of-thirds energy ratio, bilateral symmetry, focal point strength, center edge ratio -- Micro-texture (3): patch entropy variance, grain regularity (autocorrelation), brushwork periodicity (FFT peak ratio) - -**Code**: [`negate/extract/feature_style.py`](../negate/extract/feature_style.py) - -**Result**: 78.8% accuracy, 0.883 AUC (XGBoost) - -**Limitations**: -- Only 15 features — limited capacity -- Stroke analysis assumes visible brush strokes (fails on smooth digital art) -- Composition features (rule-of-thirds, symmetry) may not differ between AI and human art -- ~2x slower than artwork features (7 img/s vs 16 img/s) due to patch-level analysis - -**Interesting finding**: Nearly identical performance to the 49 artwork features despite having 3x fewer features. 
This suggests the style features capture orthogonal signal — confirmed by experiment 3 where combining them jumps to 83.5%. - ---- - -## Experiment 3: Artwork + Style Combined (64) - -**What it does**: Concatenates all 49 artwork features + 15 style features per image. - -**Code**: Feature extraction from experiments 1+2, concatenated in [`tests/test_experiments.py:309-316`](../tests/test_experiments.py) - -**Result**: 83.5% accuracy, 0.923 AUC (XGBoost) — **+4.1pp over best individual** - -**Why it works**: The two feature sets capture different aspects: -- Artwork features capture statistical properties (histograms, frequency spectra) -- Style features capture spatial/structural properties (strokes, composition, texture regularity) -- XGBoost can learn which features matter for which types of images - -**Limitations**: -- Still hand-crafted — ceiling is limited by human feature engineering -- 64 features is small enough that XGBoost works well, but not enough to capture all relevant patterns - ---- - -## Experiment 4: CLIP ViT-B/32 Embeddings (512) - -**What it does**: Passes each image through OpenAI's CLIP vision encoder (`openai/clip-vit-base-patch32`) and uses the 512-dimensional pooled embedding as features. No fine-tuning — just the pretrained embedding. - -**Code**: [`tests/test_experiments.py:108-139`](../tests/test_experiments.py) (uses `transformers.CLIPModel`) - -**Result**: 89.3% accuracy, 0.963 AUC (SVM) — **+9.9pp over best hand-crafted** - -**Why it works**: CLIP was trained on 400M image-text pairs. Its embeddings encode rich visual semantics including texture, style, composition, and content — everything our hand-crafted features try to capture, but learned from data at massive scale. - -**Why SVM wins here**: In 512-dimensional space, SVM's RBF kernel finds better decision boundaries than XGBoost's tree splits. This is typical for high-dimensional dense features. 
- -**Limitations**: -- Requires ~300MB model download -- CLIP was not trained for forensic detection — it captures semantic similarity, not generation artifacts -- May fail on adversarial examples designed to fool CLIP -- Not fine-tuned on this task — fine-tuning would likely improve further -- Inference is slower (~32 img/batch on GPU vs 16 img/s CPU for hand-crafted) - ---- - -## Experiment 5: All Combined (576) - -**What it does**: Concatenates CLIP embeddings (512) + Artwork features (49) + Style features (15) = 576 features. - -**Code**: [`tests/test_experiments.py:342-349`](../tests/test_experiments.py) - -**Result**: 90.0% accuracy, 0.966 AUC (SVM) — **+0.7pp over CLIP alone** - -**Why the improvement is tiny**: CLIP embeddings already encode most of the information that hand-crafted features capture. The marginal gain from adding 64 hand-crafted features to 512 learned features is small because the signal is redundant. - -**Limitations**: -- Barely worth the extra computation vs CLIP alone -- Feature dimensionality (576) is high — may overfit on smaller datasets - ---- - -## Scaling Analysis - -Tested artwork features (49) at increasing sample sizes on the same Hemg dataset: - -| Samples | Best Accuracy | AUC | -|---------|--------------|-----| -| 400 | 70.0% | 0.790 | -| 1,000 | 75.8% | 0.844 | -| 2,000 | 77.8% | 0.858 | -| 4,000 | 79.5% | 0.888 | - -**Code**: [`tests/test_scale_evaluation.py`](../tests/test_scale_evaluation.py) -**PDF**: `results/scale_evaluation_20260322_235906.pdf` - -**Finding**: Accuracy climbs steadily but is flattening. Hand-crafted features likely plateau around 82-85% with more data. CLIP at 89.3% on the same 4000 samples already exceeds this ceiling. - ---- - -## Overall Conclusions - -### What worked -1. **CLIP embeddings are the clear winner** — 89.3% with zero feature engineering -2. **Combining orthogonal hand-crafted features helps** — Art+Style (83.5%) > either alone -3. 
**More data helps** — 70% → 79.5% going from 400 to 4000 samples -4. **Frequency features (FFT/DCT) add real signal** — the 10 frequency features in the artwork extractor are consistently important - -### What didn't work -1. **Hand-crafted features alone can't match learned representations** — 79.4% vs 89.3% -2. **Adding hand-crafted features to CLIP barely helps** — 90.0% vs 89.3% (+0.7pp) -3. **Style features alone aren't better than generic statistics** — 78.8% vs 79.4% - -### Remaining confounds -- The Hemg dataset labels are "AiArtData" vs "RealArt" — we don't know if the AI art was generated to look like the real art (semantic matching) -- Image resolution and format may differ between classes -- We haven't tested robustness to JPEG compression, resizing, or adversarial perturbation - -### Recommendation -**For the negate pipeline**: Replace the GPU-heavy VIT+VAE features with CLIP embeddings. This gives: -- +26pp accuracy improvement (63% → 89%) -- Simpler pipeline (one model instead of VIT + VAE + wavelets) -- Smaller download (~300MB vs multi-GB) -- Still works on CPU (slower but functional) - -**For research**: Fine-tuning CLIP on art-specific detection data, or using DINOv2 (which captures more structural features), could push accuracy further. The self-supervised camera-metadata approach from Zhong et al. (2026) is also worth exploring for robustness. 
- ---- - -## Generated PDFs - -| Report | File | What it shows | -|--------|------|--------------| -| Artwork detection benchmark | `results/artwork_detection_results.pdf` | Initial 49-feature results on wikiart | -| Proof compilation | `results/proof_compilation.pdf` | First end-to-end test (cats vs bananas — confounded) | -| Fair evaluation | `results/fair_evaluation_20260322_235151.pdf` | Hemg art-vs-art + Parveshiiii results | -| Scaling analysis | `results/scale_evaluation_20260322_235906.pdf` | 400→4000 sample scaling curves | -| Experiments comparison | `results/experiments_comparison_20260323_094054.pdf` | All 5 experiments side-by-side | diff --git a/results/adversarial_robustness_results.json b/results/adversarial_robustness_results.json new file mode 100644 index 0000000..44aad98 --- /dev/null +++ b/results/adversarial_robustness_results.json @@ -0,0 +1,144 @@ +{ + "clean_baseline": { + "acc": 0.9444444444444444, + "auc": 0.9894736842105263 + }, + "perturbation_robustness": { + "clean": { + "acc": 0.9444444444444444, + "auc": 0.9894736842105263 + }, + "jpeg_q30": { + "acc": 0.9222222222222223, + "auc": 0.9815479876160991, + "delta": -0.022222222222222143 + }, + "jpeg_q50": { + "acc": 0.9388888888888889, + "auc": 0.9879876160990712, + "delta": -0.005555555555555536 + }, + "jpeg_q70": { + "acc": 0.9333333333333333, + "auc": 0.9872445820433436, + "delta": -0.011111111111111072 + }, + "noise_s5": { + "acc": 0.9222222222222223, + "auc": 0.9796904024767803, + "delta": -0.022222222222222143 + }, + "noise_s15": { + "acc": 0.75, + "auc": 0.8918885448916408, + "delta": -0.19444444444444442 + }, + "noise_s30": { + "acc": 0.6611111111111111, + "auc": 0.7359752321981424, + "delta": -0.2833333333333333 + }, + "resize_128": { + "acc": 0.5888888888888889, + "auc": 0.8537461300309598, + "delta": -0.3555555555555555 + }, + "resize_64": { + "acc": 0.48333333333333334, + "auc": 0.7312693498452012, + "delta": -0.4611111111111111 + }, + "crop_70pct": { + "acc": 
0.8944444444444445, + "auc": 0.9699071207430341, + "delta": -0.04999999999999993 + }, + "crop_50pct": { + "acc": 0.8166666666666667, + "auc": 0.9071207430340558, + "delta": -0.12777777777777777 + }, + "blur_r1": { + "acc": 0.8888888888888888, + "auc": 0.9627244582043344, + "delta": -0.05555555555555558 + }, + "blur_r3": { + "acc": 0.5666666666666667, + "auc": 0.8222910216718265, + "delta": -0.37777777777777777 + }, + "bright_0.7": { + "acc": 0.9444444444444444, + "auc": 0.9902167182662538, + "delta": 0.0 + }, + "bright_1.3": { + "acc": 0.9, + "auc": 0.9799380804953561, + "delta": -0.0444444444444444 + }, + "social_media": { + "acc": 0.9333333333333333, + "auc": 0.9884829721362229, + "delta": -0.011111111111111072 + } + }, + "threshold_calibration": { + "0.3": { + "precision": 0.8679245283018868, + "recall": 0.92, + "f1": 0.8899009108197378, + "fp_rate": 0.14 + }, + "0.4": { + "precision": 0.873015873015873, + "recall": 0.9166666666666666, + "f1": 0.8915989159891599, + "fp_rate": 0.13333333333333333 + }, + "0.5": { + "precision": 0.8892508143322475, + "recall": 0.91, + "f1": 0.8983194934866134, + "fp_rate": 0.11333333333333333 + }, + "0.6": { + "precision": 0.8943894389438944, + "recall": 0.9033333333333333, + "f1": 0.8983307916031233, + "fp_rate": 0.10666666666666667 + }, + "0.7": { + "precision": 0.893687707641196, + "recall": 0.8966666666666666, + "f1": 0.8949997083325232, + "fp_rate": 0.10666666666666667 + }, + "0.8": { + "precision": 0.902027027027027, + "recall": 0.89, + "f1": 0.8966620738699498, + "fp_rate": 0.09666666666666666 + }, + "0.9": { + "precision": 0.9065743944636678, + "recall": 0.8733333333333333, + "f1": 0.8916302423870245, + "fp_rate": 0.09 + }, + "0.95": { + "precision": 0.9142857142857143, + "recall": 0.8533333333333334, + "f1": 0.8865406006674083, + "fp_rate": 0.08 + } + }, + "recommended_threshold": null, + "worst_case_adversarial": { + "acc": 0.65, + "auc": 0.8144891640866874 + }, + "n_real": 300, + "n_fake": 300 +} \ No newline at end of 
file diff --git a/results/artwork_detection_results.pdf b/results/artwork_detection_results.pdf deleted file mode 100644 index c3aa62f..0000000 Binary files a/results/artwork_detection_results.pdf and /dev/null differ diff --git a/results/clip_bias_defactify_results.json b/results/clip_bias_defactify_results.json new file mode 100644 index 0000000..bc29562 --- /dev/null +++ b/results/clip_bias_defactify_results.json @@ -0,0 +1,160 @@ +{ + "timestamp": "2026-03-23T16:43:55.933744", + "results": [ + { + "generator": "SD 2.1", + "uses_clip": true, + "handcrafted_xgb": { + "accuracy": 0.865, + "precision": 0.8489483747609943, + "recall": 0.888, + "roc_auc": 0.939208 + }, + "handcrafted_svm": { + "accuracy": 0.857, + "precision": 0.8577154308617234, + "recall": 0.856, + "roc_auc": 0.939508 + }, + "clip_xgb": { + "accuracy": 0.913, + "precision": 0.9089108910891089, + "recall": 0.918, + "roc_auc": 0.970512 + }, + "clip_svm": { + "accuracy": 0.961, + "precision": 0.9528487229862476, + "recall": 0.97, + "roc_auc": 0.9941260000000001 + }, + "handcrafted_best": 0.865, + "clip_best": 0.961, + "clip_advantage": 0.09599999999999997 + }, + { + "generator": "SDXL", + "uses_clip": true, + "handcrafted_xgb": { + "accuracy": 0.935, + "precision": 0.9190751445086706, + "recall": 0.954, + "roc_auc": 0.9791160000000001 + }, + "handcrafted_svm": { + "accuracy": 0.931, + "precision": 0.9284294234592445, + "recall": 0.934, + "roc_auc": 0.9827159999999999 + }, + "clip_xgb": { + "accuracy": 0.944, + "precision": 0.944, + "recall": 0.944, + "roc_auc": 0.9869559999999999 + }, + "clip_svm": { + "accuracy": 0.99, + "precision": 0.9880478087649402, + "recall": 0.992, + "roc_auc": 0.999656 + }, + "handcrafted_best": 0.935, + "clip_best": 0.99, + "clip_advantage": 0.05499999999999994 + }, + { + "generator": "SD 3", + "uses_clip": true, + "handcrafted_xgb": { + "accuracy": 0.833, + "precision": 0.8363636363636363, + "recall": 0.828, + "roc_auc": 0.9075080000000001 + }, + "handcrafted_svm": { + 
"accuracy": 0.854, + "precision": 0.8657024793388429, + "recall": 0.838, + "roc_auc": 0.913932 + }, + "clip_xgb": { + "accuracy": 0.92, + "precision": 0.9216867469879518, + "recall": 0.918, + "roc_auc": 0.9740960000000001 + }, + "clip_svm": { + "accuracy": 0.975, + "precision": 0.9702970297029703, + "recall": 0.98, + "roc_auc": 0.9947839999999999 + }, + "handcrafted_best": 0.854, + "clip_best": 0.975, + "clip_advantage": 0.121 + }, + { + "generator": "Midjourney v6", + "uses_clip": "unknown", + "handcrafted_xgb": { + "accuracy": 0.868, + "precision": 0.859375, + "recall": 0.88, + "roc_auc": 0.941412 + }, + "handcrafted_svm": { + "accuracy": 0.885, + "precision": 0.8888888888888888, + "recall": 0.88, + "roc_auc": 0.9592140000000001 + }, + "clip_xgb": { + "accuracy": 0.972, + "precision": 0.9796747967479674, + "recall": 0.964, + "roc_auc": 0.997228 + }, + "clip_svm": { + "accuracy": 0.995, + "precision": 0.9959919839679359, + "recall": 0.994, + "roc_auc": 0.999872 + }, + "handcrafted_best": 0.885, + "clip_best": 0.995, + "clip_advantage": 0.10999999999999999 + }, + { + "generator": "DALL-E 3", + "uses_clip": false, + "handcrafted_xgb": { + "accuracy": 0.987, + "precision": 0.9821782178217822, + "recall": 0.992, + "roc_auc": 0.9957779999999999 + }, + "handcrafted_svm": { + "accuracy": 0.976, + "precision": 0.9703557312252964, + "recall": 0.982, + "roc_auc": 0.995222 + }, + "clip_xgb": { + "accuracy": 0.955, + "precision": 0.9540918163672655, + "recall": 0.956, + "roc_auc": 0.990896 + }, + "clip_svm": { + "accuracy": 0.982, + "precision": 0.9781746031746031, + "recall": 0.986, + "roc_auc": 0.99888 + }, + "handcrafted_best": 0.987, + "clip_best": 0.982, + "clip_advantage": -0.0050000000000000044 + } + ] +} \ No newline at end of file diff --git a/results/experiments_comparison_20260323_094054.pdf b/results/experiments_comparison_20260323_094054.pdf deleted file mode 100644 index a2ec008..0000000 Binary files a/results/experiments_comparison_20260323_094054.pdf and 
/dev/null differ diff --git a/results/experiments_results.json b/results/experiments_results.json deleted file mode 100644 index c8feec6..0000000 --- a/results/experiments_results.json +++ /dev/null @@ -1,150 +0,0 @@ -{ - "timestamp": "2026-03-23T09:40:54.308648", - "experiments": [ - { - "name": "Artwork Features (Li & Stamp + FFT/DCT)", - "short_name": "Artwork (49)", - "n_features": 49, - "extract_time": "237s", - "results": { - "xgb": { - "accuracy": 0.79375, - "precision": 0.7936031984007996, - "recall": 0.794, - "f1": 0.7937499871093743, - "roc_auc": 0.88620775 - }, - "svm": { - "accuracy": 0.784, - "precision": 0.7862903225806451, - "recall": 0.78, - "f1": 0.7839965439447031, - "roc_auc": 0.868789625 - }, - "mlp": { - "accuracy": 0.79375, - "precision": 0.7930174563591023, - "recall": 0.795, - "f1": 0.7937496777338715, - "roc_auc": 0.88000325 - } - } - }, - { - "name": "Style Features (stroke/palette/composition/texture)", - "short_name": "Style (15)", - "n_features": 15, - "extract_time": "577s", - "results": { - "xgb": { - "accuracy": 0.7875, - "precision": 0.797004132231405, - "recall": 0.7715, - "f1": 0.787445586070034, - "roc_auc": 0.88307 - }, - "svm": { - "accuracy": 0.77175, - "precision": 0.7763091001525165, - "recall": 0.7635, - "f1": 0.7717344636769341, - "roc_auc": 0.8640375 - }, - "mlp": { - "accuracy": 0.77575, - "precision": 0.7847186370676303, - "recall": 0.76, - "f1": 0.7756943581817265, - "roc_auc": 0.87112 - } - } - }, - { - "name": "Artwork + Style Combined", - "short_name": "Art+Style (64)", - "n_features": 64, - "extract_time": "combined", - "results": { - "xgb": { - "accuracy": 0.83475, - "precision": 0.8396752917300863, - "recall": 0.8275, - "f1": 0.8347413135902956, - "roc_auc": 0.923393 - }, - "svm": { - "accuracy": 0.82225, - "precision": 0.8243583291394061, - "recall": 0.819, - "f1": 0.8222481224957938, - "roc_auc": 0.9038627499999999 - }, - "mlp": { - "accuracy": 0.828, - "precision": 0.82, - "recall": 0.8405, - "f1": 
0.8279731208001251, - "roc_auc": 0.9148270000000001 - } - } - }, - { - "name": "CLIP ViT-B/32 Embeddings", - "short_name": "CLIP (512)", - "n_features": 512, - "extract_time": "112s", - "results": { - "xgb": { - "accuracy": 0.861, - "precision": 0.8620862587763289, - "recall": 0.8595, - "f1": 0.8609996872492963, - "roc_auc": 0.9425352499999999 - }, - "svm": { - "accuracy": 0.893, - "precision": 0.8949748743718593, - "recall": 0.8905, - "f1": 0.8929993312458202, - "roc_auc": 0.9628585000000001 - }, - "mlp": { - "accuracy": 0.8855, - "precision": 0.8839641434262948, - "recall": 0.8875, - "f1": 0.8854995419981679, - "roc_auc": 0.952217 - } - } - }, - { - "name": "CLIP + Artwork + Style (Everything)", - "short_name": "All Combined", - "n_features": 576, - "extract_time": "combined", - "results": { - "xgb": { - "accuracy": 0.87975, - "precision": 0.8853373921867073, - "recall": 0.8725, - "f1": 0.8797436790271289, - "roc_auc": 0.956134 - }, - "svm": { - "accuracy": 0.9, - "precision": 0.9004004004004004, - "recall": 0.8995, - "f1": 0.8999999749999937, - "roc_auc": 0.9657885 - }, - "mlp": { - "accuracy": 0.8835, - "precision": 0.8827345309381237, - "recall": 0.8845, - "f1": 0.8834998834998835, - "roc_auc": 0.9535750000000001 - } - } - } - ] -} \ No newline at end of file diff --git a/results/fair_evaluation_20260322_235151.pdf b/results/fair_evaluation_20260322_235151.pdf deleted file mode 100644 index 2b107f0..0000000 Binary files a/results/fair_evaluation_20260322_235151.pdf and /dev/null differ diff --git a/results/fair_evaluation_results.json b/results/fair_evaluation_results.json deleted file mode 100644 index 50a3a4e..0000000 --- a/results/fair_evaluation_results.json +++ /dev/null @@ -1,220 +0,0 @@ -{ - "timestamp": "2026-03-22T23:49:04.639392", - "evaluation": "fair_semantically_similar", - "datasets": [ - { - "dataset": "AI-Art vs Real-Art (Hemg)", - "repo": "Hemg/AI-Generated-vs-Real-Images-Datasets", - "n_samples": 400, - "n_features": 49, - "xgb_folds": [ - { 
- "fold": 1, - "accuracy": 0.675, - "precision": 0.6590909090909091, - "recall": 0.725, - "f1": 0.6741854636591479, - "roc_auc": 0.7637499999999999 - }, - { - "fold": 2, - "accuracy": 0.6375, - "precision": 0.6222222222222222, - "recall": 0.7, - "f1": 0.6360784313725489, - "roc_auc": 0.718125 - }, - { - "fold": 3, - "accuracy": 0.7, - "precision": 0.6666666666666666, - "recall": 0.8, - "f1": 0.696969696969697, - "roc_auc": 0.73875 - }, - { - "fold": 4, - "accuracy": 0.7, - "precision": 0.7352941176470589, - "recall": 0.625, - "f1": 0.6983029541169076, - "roc_auc": 0.7881250000000001 - }, - { - "fold": 5, - "accuracy": 0.725, - "precision": 0.7368421052631579, - "recall": 0.7, - "f1": 0.7248280175109443, - "roc_auc": 0.7918750000000001 - } - ], - "xgb_accuracy": 0.6875, - "xgb_auc": 0.7563249999999999, - "xgb_precision": 0.6794258373205742, - "xgb_recall": 0.71, - "svm_accuracy": 0.71, - "svm_auc": 0.7851250000000001, - "mlp_accuracy": 0.71, - "mlp_auc": 0.788825, - "feature_names": [ - "mean_brightness", - "entropy_brightness", - "red_mean", - "red_variance", - "red_kurtosis", - "red_skewness", - "green_mean", - "green_variance", - "green_kurtosis", - "green_skewness", - "blue_mean", - "blue_variance", - "blue_kurtosis", - "blue_skewness", - "rgb_entropy", - "hue_variance", - "hue_kurtosis", - "hue_skewness", - "saturation_variance", - "saturation_kurtosis", - "saturation_skewness", - "value_variance", - "value_kurtosis", - "value_skewness", - "hsv_entropy", - "contrast", - "correlation", - "energy", - "homogeneity", - "lbp_entropy", - "lbp_variance", - "hog_mean", - "hog_variance", - "hog_kurtosis", - "hog_skewness", - "hog_entropy", - "edgelen", - "noise_entropy", - "snr", - "fft_low_energy_ratio", - "fft_mid_energy_ratio", - "fft_high_energy_ratio", - "fft_spectral_centroid", - "fft_log_mag_mean", - "fft_log_mag_std", - "fft_phase_std", - "dct_ac_dc_ratio", - "dct_high_freq_energy", - "dct_sparsity" - ] - }, - { - "dataset": "AI vs Real (Parveshiiii)", - "repo": 
"Parveshiiii/AI-vs-Real", - "n_samples": 400, - "n_features": 49, - "xgb_folds": [ - { - "fold": 1, - "accuracy": 0.925, - "precision": 0.925, - "recall": 0.925, - "f1": 0.925, - "roc_auc": 0.9881249999999999 - }, - { - "fold": 2, - "accuracy": 0.8375, - "precision": 0.8648648648648649, - "recall": 0.8, - "f1": 0.8372711625723674, - "roc_auc": 0.93 - }, - { - "fold": 3, - "accuracy": 0.875, - "precision": 0.875, - "recall": 0.875, - "f1": 0.875, - "roc_auc": 0.931875 - }, - { - "fold": 4, - "accuracy": 0.8625, - "precision": 0.8918918918918919, - "recall": 0.825, - "f1": 0.8623063683304647, - "roc_auc": 0.92625 - }, - { - "fold": 5, - "accuracy": 0.8625, - "precision": 0.8536585365853658, - "recall": 0.875, - "f1": 0.8624785122675418, - "roc_auc": 0.9568749999999999 - } - ], - "xgb_accuracy": 0.8725, - "xgb_auc": 0.9459750000000001, - "xgb_precision": 0.882051282051282, - "xgb_recall": 0.86, - "svm_accuracy": 0.915, - "svm_auc": 0.968625, - "mlp_accuracy": 0.9275, - "mlp_auc": 0.9697875, - "feature_names": [ - "mean_brightness", - "entropy_brightness", - "red_mean", - "red_variance", - "red_kurtosis", - "red_skewness", - "green_mean", - "green_variance", - "green_kurtosis", - "green_skewness", - "blue_mean", - "blue_variance", - "blue_kurtosis", - "blue_skewness", - "rgb_entropy", - "hue_variance", - "hue_kurtosis", - "hue_skewness", - "saturation_variance", - "saturation_kurtosis", - "saturation_skewness", - "value_variance", - "value_kurtosis", - "value_skewness", - "hsv_entropy", - "contrast", - "correlation", - "energy", - "homogeneity", - "lbp_entropy", - "lbp_variance", - "hog_mean", - "hog_variance", - "hog_kurtosis", - "hog_skewness", - "hog_entropy", - "edgelen", - "noise_entropy", - "snr", - "fft_low_energy_ratio", - "fft_mid_energy_ratio", - "fft_high_energy_ratio", - "fft_spectral_centroid", - "fft_log_mag_mean", - "fft_log_mag_std", - "fft_phase_std", - "dct_ac_dc_ratio", - "dct_high_freq_energy", - "dct_sparsity" - ] - } - ] -} \ No newline at end of 
file diff --git a/results/hybrid_modern_results.json b/results/hybrid_modern_results.json new file mode 100644 index 0000000..d90503d --- /dev/null +++ b/results/hybrid_modern_results.json @@ -0,0 +1,96 @@ +{ + "training": { + "n_per_class": 3122, + "n_features": 916, + "cv_acc": 0.9694104001149638, + "cv_auc": 0.9955921334648258 + }, + "tellif": { + "bytedance-seedream-v4-text-to-image": { + "n": 20, + "detected": 18, + "uncertain": 2, + "missed": 0, + "rate": 0.9 + }, + "flux-pro-v1-1-ultra": { + "n": 9, + "detected": 5, + "uncertain": 4, + "missed": 0, + "rate": 0.5555555555555556 + }, + "gemini-25-flash-image": { + "n": 18, + "detected": 10, + "uncertain": 8, + "missed": 0, + "rate": 0.5555555555555556 + }, + "hidream-i1-full": { + "n": 9, + "detected": 8, + "uncertain": 1, + "missed": 0, + "rate": 0.8888888888888888 + }, + "hunyuan-image-v2-1-text-to-image": { + "n": 11, + "detected": 11, + "uncertain": 0, + "missed": 0, + "rate": 1.0 + }, + "ideogram-v2": { + "n": 9, + "detected": 8, + "uncertain": 1, + "missed": 0, + "rate": 0.8888888888888888 + }, + "imagen4-preview": { + "n": 9, + "detected": 9, + "uncertain": 0, + "missed": 0, + "rate": 1.0 + }, + "qwen-image": { + "n": 12, + "detected": 10, + "uncertain": 2, + "missed": 0, + "rate": 0.8333333333333334 + }, + "real": { + "n": 122, + "fp": 0, + "fp_rate": 0.0 + }, + "recraft-v3-text-to-image": { + "n": 9, + "detected": 4, + "uncertain": 5, + "missed": 0, + "rate": 0.4444444444444444 + }, + "stable-diffusion-v35-large": { + "n": 9, + "detected": 6, + "uncertain": 3, + "missed": 0, + "rate": 0.6666666666666666 + }, + "wan-v2-2-a14b-text-to-image": { + "n": 8, + "detected": 6, + "uncertain": 2, + "missed": 0, + "rate": 0.75 + } + }, + "importance": { + "handcrafted_pct": 0.17440815103386276, + "learned_pct": 0.8255918489661372 + } +} \ No newline at end of file diff --git a/results/imaginet_paintings_results.json b/results/imaginet_paintings_results.json new file mode 100644 index 0000000..097c242 --- 
/dev/null +++ b/results/imaginet_paintings_results.json @@ -0,0 +1,85 @@ +{ + "per_generator": { + "AnimagineXL": { + "acc": 0.993, + "auc": 0.99956, + "f1": 0.9929995249581209, + "prec": 0.9959789856536674, + "rec": 0.99 + }, + "SD": { + "acc": 0.9549999999999998, + "auc": 0.99128, + "f1": 0.9549951742342294, + "prec": 0.9526184576872121, + "rec": 0.9579999999999999 + }, + "SDXL": { + "acc": 0.9720000000000001, + "auc": 0.99666, + "f1": 0.9719950476849487, + "prec": 0.9798952905599838, + "rec": 0.9639999999999999 + }, + "StyleGAN": { + "acc": 0.6439999999999999, + "auc": 0.70302, + "f1": 0.6433258254702935, + "prec": 0.6354500861457384, + "rec": 0.6780000000000002 + }, + "Midjourney": { + "acc": 0.923, + "auc": 0.97706, + "f1": 0.9229635402996428, + "prec": 0.9377923257914589, + "rec": 0.906 + }, + "DALL-E_3": { + "acc": 0.9359999999999999, + "auc": 0.9806799999999999, + "f1": 0.9359967497387254, + "prec": 0.9342257872846108, + "rec": 0.9380000000000001 + } + }, + "pooled": { + "acc": 0.8320000000000001, + "auc": 0.8915000000000001, + "f1": 0.8308584361256923, + "prec": 0.8839624127044378, + "rec": 0.764 + }, + "leave_one_out": { + "AnimagineXL": { + "acc": 0.998, + "auc": 1.0, + "f1": 0.997999991999968 + }, + "SD": { + "acc": 0.95, + "auc": 1.0, + "f1": 0.949874686716792 + }, + "SDXL": { + "acc": 0.992, + "auc": 1.0, + "f1": 0.9919994879672299 + }, + "StyleGAN": { + "acc": 0.593, + "auc": 0.794292, + "f1": 0.5121957066030963 + }, + "Midjourney": { + "acc": 0.983, + "auc": 1.0, + "f1": 0.9829950855797325 + }, + "DALL-E_3": { + "acc": 0.996, + "auc": 1.0, + "f1": 0.995999935998976 + } + } +} \ No newline at end of file diff --git a/results/multidataset_benchmark_results.json b/results/multidataset_benchmark_results.json new file mode 100644 index 0000000..f597350 --- /dev/null +++ b/results/multidataset_benchmark_results.json @@ -0,0 +1,130 @@ +{ + "feature_count": 148, + "exp1_baselines": { + "Defactify": { + "accuracy": 0.8867, + "auc": 0.8953 + }, + "Hemg": { + 
"accuracy": 0.8085, + "auc": 0.9004 + }, + "tellif": { + "accuracy": 0.8286, + "auc": 0.9201 + } + }, + "exp2_cross_dataset": { + "Defactify->Hemg": { + "accuracy": 0.501, + "auc": 0.4467 + }, + "Hemg->Defactify": { + "accuracy": 0.4208, + "auc": 0.4029 + }, + "Combined->Defactify": { + "accuracy": 1.0, + "auc": 1.0 + }, + "Combined->Hemg": { + "accuracy": 1.0, + "auc": 1.0 + }, + "Combined->tellif": { + "accuracy": 0.6653, + "auc": 0.7792 + } + }, + "exp3_generator_diversity": { + "1_gen_train": { + "accuracy": 0.5656, + "auc": 0.6508, + "train_generators": [ + "Midjourney_v6" + ], + "test_generators": [ + "SD_3", + "SDXL", + "DALL-E_3", + "SD_2.1" + ] + }, + "2_gen_train": { + "accuracy": 0.6639, + "auc": 0.6237, + "train_generators": [ + "Midjourney_v6", + "SD_3" + ], + "test_generators": [ + "SDXL", + "DALL-E_3", + "SD_2.1" + ] + }, + "3_gen_train": { + "accuracy": 0.8604, + "auc": 0.8254, + "train_generators": [ + "Midjourney_v6", + "SD_3", + "SDXL" + ], + "test_generators": [ + "DALL-E_3", + "SD_2.1" + ] + }, + "4_gen_train": { + "accuracy": 0.8792, + "auc": 0.8269, + "train_generators": [ + "Midjourney_v6", + "SD_3", + "SDXL", + "DALL-E_3" + ], + "test_generators": [ + "SD_2.1" + ] + }, + "5_gen_cv": { + "accuracy": 0.8867, + "auc": 0.8953, + "train_generators": [ + "DALL-E_3", + "Midjourney_v6", + "SDXL", + "SD_2.1", + "SD_3" + ], + "test_generators": [ + "DALL-E_3", + "Midjourney_v6", + "SDXL", + "SD_2.1", + "SD_3" + ] + } + }, + "exp4_domain_transfer": { + "Photos->Art": { + "accuracy": 0.501, + "auc": 0.4467 + }, + "Art->Photos": { + "accuracy": 0.4208, + "auc": 0.4029 + }, + "Photos_self_CV": { + "accuracy": 0.8867, + "auc": 0.8953 + }, + "Art_self_CV": { + "accuracy": 0.8085, + "auc": 0.9004 + } + }, + "elapsed_seconds": 2964.0 +} \ No newline at end of file diff --git a/results/platform_robustness_results.json b/results/platform_robustness_results.json new file mode 100644 index 0000000..d0985f0 --- /dev/null +++ 
b/results/platform_robustness_results.json @@ -0,0 +1,27 @@ +{ + "Clean (original)": { + "fp": 0.0, + "detection": 0.9705882352941176, + "missed": 0.0196078431372549 + }, + "Twitter/X (JPEG Q85, max 4096px)": { + "fp": 0.0, + "detection": 0.9607843137254902, + "missed": 0.0196078431372549 + }, + "Instagram (JPEG Q75, max 1080px, sharpen)": { + "fp": 0.0, + "detection": 0.9313725490196079, + "missed": 0.029411764705882353 + }, + "Discord (JPEG Q80, 4MP limit)": { + "fp": 0.0, + "detection": 0.9607843137254902, + "missed": 0.0196078431372549 + }, + "Tumblr (JPEG Q65, max 1280px)": { + "fp": 0.0, + "detection": 0.9411764705882353, + "missed": 0.00980392156862745 + } +} \ No newline at end of file diff --git a/results/scale_evaluation_20260322_235906.pdf b/results/scale_evaluation_20260322_235906.pdf deleted file mode 100644 index 1d8dfe1..0000000 Binary files a/results/scale_evaluation_20260322_235906.pdf and /dev/null differ diff --git a/results/scale_evaluation_results.json b/results/scale_evaluation_results.json deleted file mode 100644 index dcf7caf..0000000 --- a/results/scale_evaluation_results.json +++ /dev/null @@ -1,103 +0,0 @@ -{ - "timestamp": "2026-03-22T23:59:06.522268", - "dataset": "Hemg/AI-Generated-vs-Real-Images-Datasets", - "feature_count": 49, - "results": [ - { - "total": 400, - "xgb": { - "accuracy": 0.67, - "precision": 0.6752577319587629, - "recall": 0.655, - "f1": 0.6699257332899902, - "roc_auc": 0.75155 - }, - "svm": { - "accuracy": 0.6975, - "precision": 0.7025641025641025, - "recall": 0.685, - "f1": 0.697452726988592, - "roc_auc": 0.7725 - }, - "mlp": { - "accuracy": 0.7, - "precision": 0.6904761904761905, - "recall": 0.725, - "f1": 0.699812382739212, - "roc_auc": 0.7897 - } - }, - { - "total": 1000, - "xgb": { - "accuracy": 0.758, - "precision": 0.764344262295082, - "recall": 0.746, - "f1": 0.7579651469811652, - "roc_auc": 0.843684 - }, - "svm": { - "accuracy": 0.744, - "precision": 0.7552301255230126, - "recall": 0.722, - "f1": 
0.7438760360014247, - "roc_auc": 0.8290820000000001 - }, - "mlp": { - "accuracy": 0.754, - "precision": 0.7550200803212851, - "recall": 0.752, - "f1": 0.7539990159960639, - "roc_auc": 0.838132 - } - }, - { - "total": 2000, - "xgb": { - "accuracy": 0.7705, - "precision": 0.7757390417940877, - "recall": 0.761, - "f1": 0.7704792857555395, - "roc_auc": 0.863119 - }, - "svm": { - "accuracy": 0.7645, - "precision": 0.765295887662989, - "recall": 0.763, - "f1": 0.7644994701238078, - "roc_auc": 0.8491545 - }, - "mlp": { - "accuracy": 0.778, - "precision": 0.780241935483871, - "recall": 0.774, - "f1": 0.7779964479431671, - "roc_auc": 0.857905 - } - }, - { - "total": 4000, - "xgb": { - "accuracy": 0.79475, - "precision": 0.7984810126582278, - "recall": 0.7885, - "f1": 0.7947419821086761, - "roc_auc": 0.8877880000000001 - }, - "svm": { - "accuracy": 0.7815, - "precision": 0.7829145728643216, - "recall": 0.779, - "f1": 0.7814986343664647, - "roc_auc": 0.86796925 - }, - "mlp": { - "accuracy": 0.7885, - "precision": 0.7803692905733722, - "recall": 0.803, - "f1": 0.7884555227736632, - "roc_auc": 0.8725355000000001 - } - } - ] -} \ No newline at end of file diff --git a/tests/extract_cache.py b/tests/extract_cache.py new file mode 100644 index 0000000..163643b --- /dev/null +++ b/tests/extract_cache.py @@ -0,0 +1,287 @@ +"""Feature extraction with disk caching. Pause/resume safe. + +Saves extracted features to .npz files after each data source. +On restart, loads from cache and skips already-extracted sources. 
def get_extractors():
    """Build the handcrafted (ArtworkExtract) and learned (LearnedExtract) feature extractors."""
    return ArtworkExtract(), LearnedExtract()


def extract_one(art_ext, learned_ext, img, hc_dim=148, learned_dim=768):
    """Extract the combined handcrafted + learned feature vector for one image.

    Parameters
    ----------
    art_ext, learned_ext:
        Callables returning an ordered mapping of feature name -> value.
    img:
        Image passed through to both extractors.
    hc_dim, learned_dim:
        Zero-fill lengths used when the respective extractor raises, so
        every row keeps a constant width (148 handcrafted + 768 learned
        by default, matching the current extractor output sizes).

    Returns
    -------
    list[float]
        Handcrafted features followed by learned features.
    """
    try:
        hc = list(art_ext(img).values())
    except Exception:
        # Best-effort: a failed extractor contributes a zero vector rather
        # than aborting a long extraction run on one bad image.
        hc = [0.0] * hc_dim
    try:
        # no_grad: inference only, avoids building an autograd graph.
        with torch.no_grad():
            ln = list(learned_ext(img).values())
    except Exception:
        ln = [0.0] * learned_dim
    return hc + ln
def load_paths_as_images(path, recursive, max_n):
    """Yield PIL RGB images loaded from files under *path*.

    Scans recursively when *recursive* is true, keeps only common image
    extensions, and subsamples down to *max_n* files with a seeded RNG so
    runs are reproducible. Files that fail to open are skipped silently.
    """
    rng = np.random.RandomState(SEED)
    valid_exts = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}
    candidates = [
        f
        for f in (path.rglob("*") if recursive else path.iterdir())
        if f.suffix.lower() in valid_exts
    ]
    if len(candidates) > max_n:
        candidates = list(rng.choice(candidates, max_n, replace=False))
    for candidate in candidates:
        try:
            yield Image.open(candidate).convert("RGB")
        except Exception:
            pass


def stream_hf_images(repo, max_n, label_filter=None):
    """Yield up to *max_n* PIL RGB images from a streaming HuggingFace dataset.

    When *label_filter* is given, samples whose ``label`` field differs are
    skipped; samples that fail to decode are ignored.
    """
    from datasets import load_dataset

    stream = load_dataset(repo, split="train", streaming=True)
    yielded = 0
    for sample in stream:
        if label_filter is not None and sample.get("label") != label_filter:
            continue
        try:
            yield sample["image"].convert("RGB")
            yielded += 1
        except Exception:
            pass
        if yielded >= max_n:
            break
max_n=1500), + art_ext, learned_ext) + + from datasets import load_dataset + ds_tellif = load_dataset("tellif/ai_vs_real_image_semantically_similar", split="test") + tellif_real = [ds_tellif[i]["image"].convert("RGB") for i in range(len(ds_tellif)) if ds_tellif[i]["label"] == 8] + extract_and_cache("real_tellif", tellif_real, art_ext, learned_ext) + del tellif_real; gc.collect() + + extract_and_cache("real_hemg", + stream_hf_images("Hemg/AI-Generated-vs-Real-Images-Datasets", max_n=800, label_filter=1), + art_ext, learned_ext) + + # === FAKE: old generators === + print("\n--- FAKE (old) ---", flush=True) + for src in ["sdxl_paintings_fake", "sd_paintings_fake", "dalle3", "journeydb", "animaginexl_paintings_fake"]: + extract_and_cache(f"fake_imaginet_{src}", + load_paths_as_images(BASE / src, recursive=False, max_n=100), + art_ext, learned_ext) + + # === FAKE: modern HF datasets === + print("\n--- FAKE (modern) ---", flush=True) + for repo, n, name in [ + ("ash12321/seedream-4.5-generated-2k", 300, "seedream45"), + ("exdysa/nano-banana-pro-generated-1k-clone", 300, "nano_banana"), + ("LukasT9/Flux-1-Dev-Images-1k", 300, "flux_dev"), + ("LukasT9/Flux-1-Schnell-Images-1k", 300, "flux_schnell"), + ]: + extract_and_cache(f"fake_{name}", + stream_hf_images(repo, max_n=n), + art_ext, learned_ext) + + # === FAKE: CivitAI (all generators) === + print("\n--- FAKE (CivitAI) ---", flush=True) + for gen_dir in ["flux", "sdxl", "pony", "illustrious", "sd3", "sd35", "recraft", "gemini"]: + p = CIVITAI / gen_dir + if p.exists() and any(p.iterdir()): + extract_and_cache(f"fake_civitai_{gen_dir}", + load_paths_as_images(p, recursive=False, max_n=300), + art_ext, learned_ext) + + # === FAKE: Hemg AI === + print("\n--- FAKE (Hemg AI) ---", flush=True) + extract_and_cache("fake_hemg_ai", + stream_hf_images("Hemg/AI-Generated-vs-Real-Images-Datasets", max_n=800, label_filter=0), + art_ext, learned_ext) + + print("\n ALL EXTRACTION COMPLETE", flush=True) + print(f" Cache dir: 
{CACHE_DIR}", flush=True) + print(f" Files: {len(list(CACHE_DIR.glob('*.npz')))}", flush=True) + + +def train_and_test(): + """Load cached features, train ensemble, test on tellif.""" + import lightgbm as lgb + from sklearn.ensemble import RandomForestClassifier + from sklearn.metrics import accuracy_score, roc_auc_score + from sklearn.model_selection import StratifiedKFold + from sklearn.preprocessing import StandardScaler + from sklearn.svm import SVC + + print("=" * 60, flush=True) + print(" TRAIN + TEST (from cached features)", flush=True) + print("=" * 60, flush=True) + + # Load all cached features + real_parts, fake_parts = [], [] + for f in sorted(CACHE_DIR.glob("real_*.npz")): + data = np.load(f) + real_parts.append(data["X"]) + print(f" Real: {f.stem} ({len(data['X'])})", flush=True) + for f in sorted(CACHE_DIR.glob("fake_*.npz")): + data = np.load(f) + fake_parts.append(data["X"]) + print(f" Fake: {f.stem} ({len(data['X'])})", flush=True) + + X_real = np.vstack(real_parts) + X_fake = np.vstack(fake_parts) + n = min(len(X_real), len(X_fake)) + X_train = np.vstack([X_real[:n], X_fake[:n]]) + y_train = np.concatenate([np.zeros(n), np.ones(n)]) + print(f"\n Training: {len(X_train)} ({n}/class)", flush=True) + + # 5-fold CV + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED) + accs = [] + for fold, (tr, te) in enumerate(skf.split(X_train, y_train)): + m = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED) + m.fit(X_train[tr], y_train[tr]) + p = m.predict_proba(X_train[te])[:, 1] + acc = accuracy_score(y_train[te], (p > 0.5).astype(int)) + accs.append(acc) + print(f" Fold {fold+1}: {acc:.4f}", flush=True) + print(f" Mean: {np.mean(accs):.4f}", flush=True) + + # Train ensemble + W_LGBM, W_RF, W_SVM = 0.4, 0.1, 0.5 + HIGH_T, LOW_T = 0.80, 0.20 + + lgbm = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, 
random_state=SEED) + lgbm.fit(X_train, y_train) + rf = RandomForestClassifier(n_estimators=300, max_depth=12, n_jobs=1, random_state=SEED) + rf.fit(X_train, y_train) + scaler = StandardScaler() + svm = SVC(kernel="rbf", probability=True, random_state=SEED) + svm.fit(scaler.fit_transform(X_train), y_train) + + def predict(X): + p = (W_LGBM * lgbm.predict_proba(X)[:, 1] + + W_RF * rf.predict_proba(X)[:, 1] + + W_SVM * svm.predict_proba(scaler.transform(X))[:, 1]) + pred = np.full(len(X), -1) + pred[p >= HIGH_T] = 1 + pred[p < LOW_T] = 0 + return pred, p + + # Test tellif + print("\n" + "=" * 60, flush=True) + print(" TELLIF RESULTS", flush=True) + print("=" * 60, flush=True) + + from datasets import load_dataset + ds_tellif = load_dataset("tellif/ai_vs_real_image_semantically_similar", split="test") + label_names = ds_tellif.features["label"].names + + art_ext, learned_ext = get_extractors() + results = {} + + for i, gen_name in enumerate(label_names): + indices = [j for j in range(len(ds_tellif)) if ds_tellif[j]["label"] == i] + if not indices: + continue + images = [ds_tellif[j]["image"].convert("RGB") for j in indices] + + rows = [extract_one(art_ext, learned_ext, img) for img in images] + X_gen = np.nan_to_num(np.array(rows, dtype=np.float64), nan=0.0, posinf=0.0, neginf=0.0) + + pred, prob = predict(X_gen) + + if gen_name == "real": + fp = (pred == 1).sum() + print(f" {gen_name:45s} n={len(X_gen):3d} GENUINE={(pred==0).sum()} UNC={(pred==-1).sum()} FP={fp} FP={fp/len(X_gen):.1%}", flush=True) + results[gen_name] = {"n": len(X_gen), "fp": int(fp), "fp_rate": float(fp / len(X_gen))} + else: + det = (pred == 1).sum() + rate = det / len(X_gen) + print(f" {gen_name:45s} n={len(X_gen):3d} DET={det} UNC={(pred==-1).sum()} MISS={(pred==0).sum()} rate={rate:.1%}", flush=True) + results[gen_name] = {"n": len(X_gen), "detected": int(det), "rate": float(rate)} + + out = Path(__file__).parent.parent / "results" / "retrain_weak_gens_results.json" + with open(out, "w") as f: + 
class AnimeDataset(Dataset):
    """Minimal torch Dataset pairing pre-loaded PIL images with binary labels.

    Images are transformed lazily in ``__getitem__``; labels are returned
    as float32 tensors.
    """

    def __init__(self, images, labels, transform):
        self.images = images
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        sample = self.transform(self.images[idx])
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample, target
imgs.append(Image.open(f).convert("RGB")) + except Exception: + pass + return imgs + + +def main(): + print("=" * 60, flush=True) + print(" FINE-TUNE ConvNeXt-Tiny for Anime Detection", flush=True) + print(" Freeze stages 0-2, train stage 3 + head", flush=True) + print("=" * 60, flush=True) + + # Load model + model = timm.create_model("convnext_tiny.fb_in22k", pretrained=True, num_classes=2) + transform = timm.data.create_transform( + **timm.data.resolve_data_config(model.pretrained_cfg) + ) + + # Freeze everything except stage 3 and head + for name, param in model.named_parameters(): + if "stages.3" in name or "head" in name: + param.requires_grad = True + else: + param.requires_grad = False + + trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + total = sum(p.numel() for p in model.parameters()) + print(f" Trainable: {trainable:,} / {total:,} ({trainable/total*100:.1f}%)", flush=True) + + # Load data + print("\n--- Loading data ---", flush=True) + real_imgs = [] + + # Real anime from animesfw (1000) + ds = load_dataset("latentcat/animesfw", split="train", streaming=True) + count = 0 + for s in tqdm(ds, desc="Real anime", total=1000): + if count >= 1000: + break + tags = s.get("tags", "") + if "ai" in tags.lower(): + continue + try: + real_imgs.append(s["image"].convert("RGB")) + count += 1 + except Exception: + pass + print(f" Real anime: {len(real_imgs)}", flush=True) + + # Real WikiArt (500) + wiki_imgs = load_paths(BASE / "wikiart", recursive=True, max_n=500) + real_imgs.extend(wiki_imgs) + print(f" + WikiArt: {len(wiki_imgs)}, total real: {len(real_imgs)}", flush=True) + + # Fake anime (CivitAI pony + illustrious + AnimagineXL) + fake_imgs = [] + for gen in ["pony", "illustrious"]: + imgs = load_paths(CIVITAI / gen, recursive=False, max_n=400) + fake_imgs.extend(imgs) + animagine = load_paths(BASE / "animaginexl_paintings_fake", recursive=False, max_n=100) + fake_imgs.extend(animagine) + + # Add Flux and nano-banana for diversity + for 
repo, n in [("LukasT9/Flux-1-Dev-Images-1k", 200), ("bitmind/nano-banana", 200)]: + ds2 = load_dataset(repo, split="train", streaming=True) + c = 0 + for s in ds2: + if c >= n: + break + try: + fake_imgs.append(s["image"].convert("RGB")) + c += 1 + except Exception: + pass + print(f" Fake: {len(fake_imgs)}", flush=True) + + # Balance + n = min(len(real_imgs), len(fake_imgs)) + all_imgs = real_imgs[:n] + fake_imgs[:n] + all_labels = [0] * n + [1] * n + print(f" Total: {len(all_imgs)} ({n}/class)", flush=True) + + # Split train/val + indices = np.random.permutation(len(all_imgs)) + split = int(0.8 * len(indices)) + train_idx = indices[:split] + val_idx = indices[split:] + + train_ds = AnimeDataset( + [all_imgs[i] for i in train_idx], + [all_labels[i] for i in train_idx], + transform, + ) + val_ds = AnimeDataset( + [all_imgs[i] for i in val_idx], + [all_labels[i] for i in val_idx], + transform, + ) + + train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, num_workers=0) + val_loader = DataLoader(val_ds, batch_size=16, shuffle=False, num_workers=0) + + # Train + print(f"\n--- Training ({len(train_ds)} train, {len(val_ds)} val) ---", flush=True) + optimizer = torch.optim.AdamW( + [p for p in model.parameters() if p.requires_grad], + lr=1e-4, + weight_decay=0.01, + ) + criterion = nn.CrossEntropyLoss() + model.train() + + for epoch in range(5): + losses = [] + correct = 0 + total = 0 + for batch_imgs, batch_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"): + optimizer.zero_grad() + outputs = model(batch_imgs) + loss = criterion(outputs, batch_labels.long()) + loss.backward() + optimizer.step() + losses.append(loss.item()) + preds = outputs.argmax(dim=1) + correct += (preds == batch_labels.long()).sum().item() + total += len(batch_labels) + + # Validation + model.eval() + val_correct = 0 + val_total = 0 + val_fp = 0 + val_real = 0 + with torch.no_grad(): + for batch_imgs, batch_labels in val_loader: + outputs = model(batch_imgs) + preds = 
outputs.argmax(dim=1) + val_correct += (preds == batch_labels.long()).sum().item() + val_total += len(batch_labels) + # FP: real labeled as AI + real_mask = batch_labels == 0 + val_fp += ((preds == 1) & real_mask).sum().item() + val_real += real_mask.sum().item() + model.train() + + val_acc = val_correct / val_total + fp_rate = val_fp / val_real if val_real > 0 else 0 + print(f" Epoch {epoch+1}: loss={np.mean(losses):.4f} train_acc={correct/total:.4f} val_acc={val_acc:.4f} val_FP={fp_rate:.3f}", flush=True) + + # Test on held-out anime + print("\n--- Testing on held-out anime ---", flush=True) + model.eval() + + ds3 = load_dataset("latentcat/animesfw", split="train", streaming=True) + test_fp = 0 + test_total = 0 + skip = 0 + with torch.no_grad(): + for s in tqdm(ds3, desc="Test anime", total=1200): + tags = s.get("tags", "") + if "ai" in tags.lower(): + continue + skip += 1 + if skip <= 1000: + continue + if test_total >= 200: + break + try: + img = s["image"].convert("RGB") + tensor = transform(img).unsqueeze(0) + output = model(tensor) + pred = output.argmax(dim=1).item() + if pred == 1: # flagged as AI + test_fp += 1 + test_total += 1 + except Exception: + pass + + print(f"\nAnime FP (fine-tuned ConvNeXt): {test_fp}/{test_total} ({test_fp/test_total*100:.1f}%)", flush=True) + + # Save model + out_path = Path("D:/Projects/negate/negate/models/convnext_anime_finetuned.pt") + out_path.parent.mkdir(parents=True, exist_ok=True) + torch.save(model.state_dict(), out_path) + print(f"Model saved to {out_path}", flush=True) + print("DONE", flush=True) + + +if __name__ == "__main__": + main() diff --git a/tests/generate_fair_eval_pdf.py b/tests/generate_fair_eval_pdf.py deleted file mode 100644 index cbe4af4..0000000 --- a/tests/generate_fair_eval_pdf.py +++ /dev/null @@ -1,262 +0,0 @@ -# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 -"""Generate PDF report for fair evaluation results. 
- -Reads results/fair_evaluation_results.json and generates a timestamped PDF -with cross-validation metrics, comparison tables, and analysis. -""" - -from __future__ import annotations - -import json -import sys -from datetime import datetime -from pathlib import Path - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from matplotlib.backends.backend_pdf import PdfPages -import matplotlib.gridspec as gridspec -import numpy as np - -RESULTS_DIR = Path(__file__).parent.parent / "results" - - -def generate_pdf(results_path: Path): - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - pdf_path = RESULTS_DIR / f"fair_evaluation_{timestamp}.pdf" - - with open(results_path) as f: - data = json.load(f) - - datasets = data["datasets"] - - with PdfPages(str(pdf_path)) as pdf: - # ===== PAGE 1: Title & Summary ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - ax = fig.add_axes([0, 0, 1, 1]) - ax.axis("off") - - ax.text(0.5, 0.88, "Fair Evaluation Report:\n49-Feature Artwork Detection", - transform=ax.transAxes, fontsize=20, fontweight="bold", - ha="center", va="top", fontfamily="serif") - - ax.text(0.5, 0.74, f"negate project — darkshapes\n{datetime.now().strftime('%B %d, %Y')}", - transform=ax.transAxes, fontsize=11, ha="center", va="top", - fontfamily="serif", style="italic") - - # Why this evaluation matters - rationale = ( - "Why This Evaluation Matters\n\n" - "Previous benchmarks used datasets where AI and genuine images had different\n" - "subject matter (cats vs bananas, WikiArt paintings vs generated illustrations).\n" - "This means the classifier could achieve high accuracy by learning content\n" - "differences rather than genuine AI artifacts.\n\n" - "This evaluation uses datasets where BOTH classes contain similar content:\n" - " - Hemg: 'AiArtData' vs 'RealArt' — both are artwork/art images\n" - " - Parveshiiii: balanced binary AI vs Real images\n\n" - "If our 49 features still achieve high accuracy on 
these datasets, it provides\n" - "stronger evidence that the features detect actual AI generation artifacts\n" - "rather than subject-matter shortcuts." - ) - ax.text(0.08, 0.64, rationale, transform=ax.transAxes, fontsize=9, - ha="left", va="top", fontfamily="serif", - bbox=dict(boxstyle="round,pad=0.5", facecolor="lightyellow", edgecolor="gray")) - - # Summary table - summary = "Results Summary\n\n" - for ds in datasets: - summary += ( - f"Dataset: {ds['dataset']}\n" - f" Samples: {ds['n_samples']} ({ds['n_samples']//2} per class)\n" - f" XGBoost: {ds['xgb_accuracy']:.1%} acc, {ds['xgb_auc']:.4f} AUC, " - f"{ds['xgb_precision']:.1%} prec, {ds['xgb_recall']:.1%} rec\n" - f" SVM: {ds['svm_accuracy']:.1%} acc, {ds['svm_auc']:.4f} AUC\n" - f" MLP: {ds['mlp_accuracy']:.1%} acc, {ds['mlp_auc']:.4f} AUC\n\n" - ) - ax.text(0.08, 0.28, summary, transform=ax.transAxes, fontsize=9, - ha="left", va="top", fontfamily="serif", - bbox=dict(boxstyle="round,pad=0.4", facecolor="#E8F5E9", edgecolor="#66BB6A")) - - pdf.savefig(fig) - plt.close(fig) - - # ===== PAGE 2+: Per-dataset details ===== - for ds in datasets: - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - fig.suptitle(f"Dataset: {ds['dataset']}", fontsize=14, - fontweight="bold", fontfamily="serif", y=0.96) - - # Fold results table - ax_table = fig.add_axes([0.1, 0.68, 0.8, 0.22]) - ax_table.axis("off") - - if "xgb_folds" in ds: - table_data = [] - for r in ds["xgb_folds"]: - table_data.append([ - f"Fold {r['fold']}", f"{r['accuracy']:.2%}", - f"{r['precision']:.2%}", f"{r['recall']:.2%}", - f"{r['f1']:.2%}", f"{r['roc_auc']:.4f}" - ]) - - accs = [r["accuracy"] for r in ds["xgb_folds"]] - table_data.append([ - "Mean +/- Std", - f"{np.mean(accs):.2%} +/- {np.std(accs):.2%}", - "-", "-", "-", - f"{np.mean([r['roc_auc'] for r in ds['xgb_folds']]):.4f}" - ]) - - table = ax_table.table( - cellText=table_data, - colLabels=["Fold", "Accuracy", "Precision", "Recall", "F1", "ROC-AUC"], - loc="center", 
cellLoc="center", - ) - table.auto_set_font_size(False) - table.set_fontsize(8) - table.scale(1, 1.4) - for (row, col), cell in table.get_celld().items(): - if row == 0: - cell.set_facecolor("#4472C4") - cell.set_text_props(color="white", fontweight="bold") - elif row == len(table_data): - cell.set_facecolor("#D6E4F0") - - # Comparison bar chart: XGBoost vs SVM vs MLP - ax_bar = fig.add_axes([0.1, 0.35, 0.8, 0.25]) - models = ["XGBoost", "SVM", "MLP"] - accs = [ds["xgb_accuracy"], ds["svm_accuracy"], ds["mlp_accuracy"]] - aucs = [ds["xgb_auc"], ds["svm_auc"], ds["mlp_auc"]] - - x = np.arange(len(models)) - w = 0.35 - bars1 = ax_bar.bar(x - w/2, accs, w, label="Accuracy", color="#4472C4") - bars2 = ax_bar.bar(x + w/2, aucs, w, label="ROC-AUC", color="#ED7D31") - ax_bar.set_xticks(x) - ax_bar.set_xticklabels(models) - ax_bar.set_ylim(0, 1.1) - ax_bar.set_ylabel("Score") - ax_bar.set_title("Model Comparison", fontsize=11, fontfamily="serif") - ax_bar.legend() - ax_bar.grid(axis="y", alpha=0.3) - - for bar in bars1: - ax_bar.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, - f"{bar.get_height():.1%}", ha="center", fontsize=8) - for bar in bars2: - ax_bar.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, - f"{bar.get_height():.3f}", ha="center", fontsize=8) - - # Analysis text - ax_text = fig.add_axes([0.08, 0.05, 0.84, 0.25]) - ax_text.axis("off") - - best_acc = max(accs) - best_model = models[accs.index(best_acc)] - - analysis = ( - f"Analysis\n\n" - f"Dataset: {ds['repo']}\n" - f"Sample size: {ds['n_samples']} images, {ds['n_features']} features\n\n" - f"Best model: {best_model} at {best_acc:.1%} accuracy\n\n" - ) - if best_acc >= 0.80: - analysis += ( - "The features demonstrate strong discriminative power even when both\n" - "classes contain similar content. This suggests the 49 features capture\n" - "genuine AI generation artifacts rather than content-based shortcuts." 
- ) - elif best_acc >= 0.65: - analysis += ( - "Moderate discriminative power. The features capture some genuine AI\n" - "artifacts but performance degrades compared to content-separated datasets,\n" - "suggesting prior benchmarks partially relied on content differences." - ) - else: - analysis += ( - "Weak discriminative power on this dataset. The features struggle when\n" - "content is controlled, indicating prior high accuracy was largely driven\n" - "by subject-matter differences rather than AI detection capability." - ) - - ax_text.text(0, 1, analysis, transform=ax_text.transAxes, fontsize=9, - ha="left", va="top", fontfamily="serif") - - pdf.savefig(fig) - plt.close(fig) - - # ===== FINAL PAGE: Conclusions ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - ax = fig.add_axes([0, 0, 1, 1]) - ax.axis("off") - - ax.text(0.5, 0.92, "Conclusions", fontsize=16, fontweight="bold", - ha="center", va="top", fontfamily="serif", transform=ax.transAxes) - - all_accs = [ds["xgb_accuracy"] for ds in datasets] - mean_fair_acc = np.mean(all_accs) - - conclusions = ( - f"Mean XGBoost accuracy across fair datasets: {mean_fair_acc:.1%}\n\n" - "Comparison with previous (potentially confounded) benchmarks:\n" - " - Cats vs Bananas (unfair): ~91% accuracy\n" - " - WikiArt vs Generated (partially fair): ~92% accuracy\n" - f" - Fair evaluation (this report): {mean_fair_acc:.1%} accuracy\n\n" - ) - - if mean_fair_acc >= 0.80: - conclusions += ( - "CONCLUSION: The 49-feature pipeline holds up under fair evaluation.\n" - "The accuracy drop from unfair to fair benchmarks is modest, indicating\n" - "that the features genuinely detect AI artifacts, not just content.\n\n" - "The frequency-domain features (FFT/DCT) and texture features (GLCM/LBP)\n" - "appear to be capturing real structural differences between AI-generated\n" - "and human-created artwork." - ) - elif mean_fair_acc >= 0.65: - conclusions += ( - "CONCLUSION: Mixed results. 
The features have some genuine detection\n" - "capability but a significant portion of previous accuracy was from\n" - "content shortcuts. The pipeline needs improvement — likely deeper\n" - "learned features (self-supervised or fine-tuned ViT) rather than\n" - "hand-crafted statistics." - ) - else: - conclusions += ( - "CONCLUSION: The 49-feature pipeline does NOT generalize to fair\n" - "evaluation. Previous high accuracy was primarily from content confounds.\n" - "A fundamentally different approach is needed — likely self-supervised\n" - "learning of camera/generation-intrinsic features as described in\n" - "Zhong et al. (2026)." - ) - - conclusions += ( - "\n\nMethodological Note\n\n" - "This report uses 5-fold stratified cross-validation with 200 images per\n" - "class. While larger samples would give tighter confidence intervals, this\n" - "is sufficient to distinguish between >80% and chance-level performance.\n\n" - "Features: 49 total (39 from Li & Stamp 2025 + 10 FFT/DCT frequency features)\n" - "Classifiers: XGBoost, SVM (RBF kernel), MLP (100 hidden units)\n" - "All processing: CPU-only, no pretrained neural networks" - ) - - ax.text(0.08, 0.85, conclusions, transform=ax.transAxes, fontsize=9.5, - ha="left", va="top", fontfamily="serif") - - pdf.savefig(fig) - plt.close(fig) - - print(f"PDF saved to: {pdf_path}") - return pdf_path - - -if __name__ == "__main__": - results_path = RESULTS_DIR / "fair_evaluation_results.json" - if not results_path.exists(): - print(f"Run test_fair_evaluation.py first to generate {results_path}") - sys.exit(1) - generate_pdf(results_path) diff --git a/tests/generate_results_pdf.py b/tests/generate_results_pdf.py deleted file mode 100644 index 1ed4fdd..0000000 --- a/tests/generate_results_pdf.py +++ /dev/null @@ -1,475 +0,0 @@ -# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 -# - -"""Generate results PDF with multi-signal ensemble, calibrated thresholds, -abstention, and full 
precision/recall/F1 reporting. - -Usage: uv run python tests/generate_results_pdf.py -Output: results/artwork_detection_results.pdf -""" - -from __future__ import annotations - -import sys -from datetime import datetime -from pathlib import Path - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -import matplotlib.gridspec as gridspec -from matplotlib.backends.backend_pdf import PdfPages -from matplotlib.patches import Patch -import numpy as np -import pandas as pd -import xgboost as xgb -from datasets import load_dataset, Image as HFImage -from sklearn.calibration import CalibratedClassifierCV -from sklearn.metrics import ( - accuracy_score, precision_score, recall_score, f1_score, - roc_auc_score, roc_curve, confusion_matrix, precision_recall_curve, -) -from sklearn.model_selection import StratifiedKFold, cross_val_predict -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC -from tqdm import tqdm - -sys.path.insert(0, str(Path(__file__).parent.parent)) -from negate.extract.feature_artwork import ArtworkExtract - -HUMAN_ART_REPO = "huggan/wikiart" -SYNTHETIC_REPO = "exdysa/nano-banana-pro-generated-1k-clone" -SAMPLE_SIZE = 100 -N_FOLDS = 5 -SEED = 42 -OUTPUT_DIR = Path(__file__).parent.parent / "results" - - -def load_and_extract(): - print(f"Loading {SAMPLE_SIZE} human art + {SAMPLE_SIZE} AI images...") - human_ds = load_dataset(HUMAN_ART_REPO, split=f"train[:{SAMPLE_SIZE}]") - human_ds = human_ds.cast_column("image", HFImage(decode=True, mode="RGB")) - ai_ds = load_dataset(SYNTHETIC_REPO, split=f"train[:{SAMPLE_SIZE}]") - ai_ds = ai_ds.cast_column("image", HFImage(decode=True, mode="RGB")) - - extractor = ArtworkExtract() - features, labels = [], [] - imgs_human, imgs_ai = [], [] - - for row in tqdm(human_ds, desc="Human art"): - try: - features.append(extractor(row["image"])) - labels.append(0) - if len(imgs_human) < 4: - imgs_human.append(row["image"]) - 
except Exception: - pass - - for row in tqdm(ai_ds, desc="AI art"): - try: - features.append(extractor(row["image"])) - labels.append(1) - if len(imgs_ai) < 4: - imgs_ai.append(row["image"]) - except Exception: - pass - - df = pd.DataFrame(features).fillna(0) - X = np.where(np.isfinite(df.to_numpy(dtype=np.float64)), df.to_numpy(dtype=np.float64), 0) - y = np.array(labels) - return X, y, list(df.columns), imgs_human, imgs_ai - - -def run_ensemble_cv(X, y): - """Run calibrated ensemble with abstention.""" - scaler = StandardScaler() - X_s = scaler.fit_transform(X) - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - - # Individual models (calibrated with Platt scaling) - models = { - "SVM": CalibratedClassifierCV(SVC(C=10, gamma="scale", kernel="rbf", random_state=SEED), cv=3, method="sigmoid"), - "MLP": CalibratedClassifierCV(MLPClassifier(hidden_layer_sizes=(100,), activation="relu", max_iter=1000, random_state=SEED), cv=3, method="sigmoid"), - } - - # Collect per-model CV predictions - model_probs = {} - model_preds = {} - for name, model in models.items(): - probs = cross_val_predict(model, X_s, y, cv=skf, method="predict_proba")[:, 1] - model_probs[name] = probs - model_preds[name] = (probs > 0.5).astype(int) - - # XGBoost (already outputs calibrated probabilities) - xgb_probs = np.zeros(len(y)) - for train_idx, test_idx in skf.split(X_s, y): - params = {"objective": "binary:logistic", "max_depth": 4, "learning_rate": 0.1, - "subsample": 0.8, "colsample_bytree": 0.8, "seed": SEED, "eval_metric": "logloss"} - dtrain = xgb.DMatrix(X_s[train_idx], label=y[train_idx]) - dtest = xgb.DMatrix(X_s[test_idx]) - model = xgb.train(params, dtrain, num_boost_round=200, - evals=[(xgb.DMatrix(X_s[test_idx], label=y[test_idx]), "test")], - early_stopping_rounds=10, verbose_eval=False) - xgb_probs[test_idx] = model.predict(dtest) - - model_probs["XGBoost"] = xgb_probs - model_preds["XGBoost"] = (xgb_probs > 0.5).astype(int) - - # Ensemble: average 
calibrated probabilities - ensemble_probs = np.mean([model_probs[n] for n in model_probs], axis=0) - - # Abstention: if ensemble confidence < threshold, mark as uncertain - ABSTAIN_THRESH = 0.3 # abstain if prob between 0.3 and 0.7 - ensemble_preds = np.full(len(y), -1) # -1 = uncertain - ensemble_preds[ensemble_probs > (1 - ABSTAIN_THRESH)] = 1 # AI - ensemble_preds[ensemble_probs < ABSTAIN_THRESH] = 0 # Human - - # Per-model metrics - results = {} - for name in model_probs: - pred = model_preds[name] - results[name] = { - "accuracy": accuracy_score(y, pred), - "precision": precision_score(y, pred, zero_division=0), - "recall": recall_score(y, pred, zero_division=0), - "f1": f1_score(y, pred, average="macro"), - "roc_auc": roc_auc_score(y, model_probs[name]), - "probs": model_probs[name], - } - - # Ensemble metrics (excluding abstained samples) - confident_mask = ensemble_preds >= 0 - n_abstained = int((~confident_mask).sum()) - if confident_mask.sum() > 0: - results["Ensemble"] = { - "accuracy": accuracy_score(y[confident_mask], ensemble_preds[confident_mask]), - "precision": precision_score(y[confident_mask], ensemble_preds[confident_mask], zero_division=0), - "recall": recall_score(y[confident_mask], ensemble_preds[confident_mask], zero_division=0), - "f1": f1_score(y[confident_mask], ensemble_preds[confident_mask], average="macro"), - "roc_auc": roc_auc_score(y, ensemble_probs), - "probs": ensemble_probs, - "n_abstained": n_abstained, - "n_classified": int(confident_mask.sum()), - } - - # Feature importance (full XGBoost model) - feature_names = [f"f{i}" for i in range(X.shape[1])] - dtrain_full = xgb.DMatrix(X_s, label=y, feature_names=feature_names) - full_model = xgb.train({"objective": "binary:logistic", "max_depth": 4, "seed": SEED}, - dtrain_full, num_boost_round=100, verbose_eval=False) - - return results, ensemble_probs, ensemble_preds, full_model - - -def generate_pdf(X, y, feature_names, results, ensemble_probs, ensemble_preds, - model, imgs_human, 
imgs_ai): - OUTPUT_DIR.mkdir(exist_ok=True) - pdf_path = OUTPUT_DIR / "artwork_detection_results.pdf" - - with PdfPages(str(pdf_path)) as pdf: - # ===== PAGE 1: Title + Results Table ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - ax = fig.add_axes([0, 0, 1, 1]) - ax.axis("off") - - ax.text(0.5, 0.92, "AI-Generated Artwork Detection", fontsize=22, fontweight="bold", - ha="center", fontfamily="serif", transform=ax.transAxes) - ax.text(0.5, 0.87, "Multi-Signal Ensemble with Calibrated Thresholds", - fontsize=12, ha="center", fontfamily="serif", style="italic", transform=ax.transAxes) - ax.text(0.5, 0.83, f"negate project | {datetime.now().strftime('%B %d, %Y')}", - fontsize=10, ha="center", fontfamily="serif", transform=ax.transAxes) - - # Results table - ax_table = fig.add_axes([0.08, 0.52, 0.84, 0.26]) - ax_table.axis("off") - - table_data = [] - for name, r in results.items(): - row = [name, f"{r['accuracy']:.1%}", f"{r['precision']:.1%}", - f"{r['recall']:.1%}", f"{r['f1']:.1%}", f"{r['roc_auc']:.4f}"] - if name == "Ensemble": - row.append(f"{r['n_abstained']}") - else: - row.append("-") - table_data.append(row) - table_data.append(["Existing negate", "63.3%", "--", "--", "--", "0.669", "-"]) - - table = ax_table.table( - cellText=table_data, - colLabels=["Model", "Accuracy", "Precision", "Recall", "F1", "AUC", "Abstained"], - loc="center", cellLoc="center", - ) - table.auto_set_font_size(False) - table.set_fontsize(8.5) - table.scale(1, 1.6) - for (row, col), cell in table.get_celld().items(): - if row == 0: - cell.set_facecolor("#2E4057") - cell.set_text_props(color="white", fontweight="bold") - elif row == len(table_data): - cell.set_facecolor("#FFE0E0") - - # Summary text - ens = results.get("Ensemble", {}) - summary = ( - "Approach\n\n" - f" Features: {X.shape[1]} (39 artwork + 10 frequency analysis)\n" - f" Dataset: {np.sum(y==0)} human artworks (WikiArt) + {np.sum(y==1)} AI images\n" - f" CV: {N_FOLDS}-fold stratified 
cross-validation\n\n" - " Three calibrated classifiers (SVM, MLP, XGBoost) vote via averaged\n" - " probabilities. Images where ensemble confidence is between 30-70%\n" - f" are marked 'uncertain' ({ens.get('n_abstained', 0)} images abstained).\n\n" - " Precision = of images flagged AI, how many actually are\n" - " Recall = of actual AI images, how many were caught" - ) - ax.text(0.08, 0.48, summary, fontsize=9, ha="left", va="top", fontfamily="serif", - transform=ax.transAxes) - - # Key findings - findings = ( - "Key Findings\n\n" - f" 1. Ensemble achieves {ens.get('precision', 0):.1%} precision, " - f"{ens.get('recall', 0):.1%} recall on classified images\n" - f" 2. {ens.get('n_abstained', 0)} uncertain images abstained from " - f"(reduces false positives)\n" - f" 3. +{(ens.get('accuracy', 0) - 0.633)*100:.1f}pp improvement over " - "existing negate pipeline (63.3%)\n" - f" 4. Frequency features (FFT/DCT) add spectral artifact detection\n" - " 5. All processing is CPU-only, ~12 images/sec" - ) - ax.text(0.08, 0.24, findings, fontsize=9, ha="left", va="top", fontfamily="serif", - transform=ax.transAxes, - bbox=dict(boxstyle="round,pad=0.4", facecolor="#E8F5E9", edgecolor="#66BB6A")) - - pdf.savefig(fig) - plt.close(fig) - - # ===== PAGE 2: ROC + PR curves + Confusion Matrix ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - fig.suptitle("Detection Performance Analysis", fontsize=14, - fontweight="bold", fontfamily="serif", y=0.96) - - # ROC curves - ax_roc = fig.add_axes([0.08, 0.62, 0.4, 0.28]) - colors = {"SVM": "#4472C4", "MLP": "#ED7D31", "XGBoost": "#70AD47", "Ensemble": "#C00000"} - for name, r in results.items(): - fpr, tpr, _ = roc_curve(y, r["probs"]) - ax_roc.plot(fpr, tpr, color=colors.get(name, "gray"), linewidth=2, - label=f"{name} ({r['roc_auc']:.3f})") - ax_roc.plot([0, 1], [0, 1], "k--", alpha=0.3) - ax_roc.set_xlabel("False Positive Rate", fontsize=9) - ax_roc.set_ylabel("True Positive Rate", fontsize=9) - 
ax_roc.set_title("ROC Curves", fontsize=10, fontfamily="serif") - ax_roc.legend(fontsize=7, loc="lower right") - ax_roc.grid(True, alpha=0.2) - - # Precision-Recall curves - ax_pr = fig.add_axes([0.55, 0.62, 0.4, 0.28]) - for name, r in results.items(): - prec_curve, rec_curve, _ = precision_recall_curve(y, r["probs"]) - ax_pr.plot(rec_curve, prec_curve, color=colors.get(name, "gray"), linewidth=2, - label=name) - ax_pr.set_xlabel("Recall", fontsize=9) - ax_pr.set_ylabel("Precision", fontsize=9) - ax_pr.set_title("Precision-Recall Curves", fontsize=10, fontfamily="serif") - ax_pr.legend(fontsize=7) - ax_pr.grid(True, alpha=0.2) - - # Ensemble confusion matrix - ax_cm = fig.add_axes([0.08, 0.28, 0.35, 0.26]) - confident = ensemble_preds >= 0 - if confident.sum() > 0: - cm = confusion_matrix(y[confident], ensemble_preds[confident]) - im = ax_cm.imshow(cm, cmap="Blues") - ax_cm.set_xticks([0, 1]) - ax_cm.set_yticks([0, 1]) - ax_cm.set_xticklabels(["Human", "AI"], fontsize=9) - ax_cm.set_yticklabels(["Human", "AI"], fontsize=9) - ax_cm.set_xlabel("Predicted", fontsize=9) - ax_cm.set_ylabel("Actual", fontsize=9) - ax_cm.set_title("Ensemble (confident only)", fontsize=10, fontfamily="serif") - for i in range(2): - for j in range(2): - ax_cm.text(j, i, str(cm[i, j]), ha="center", va="center", fontsize=16, - fontweight="bold", color="white" if cm[i, j] > cm.max()/2 else "black") - - # Probability distribution - ax_hist = fig.add_axes([0.55, 0.28, 0.4, 0.26]) - human_probs = ensemble_probs[y == 0] - ai_probs = ensemble_probs[y == 1] - ax_hist.hist(human_probs, bins=20, alpha=0.6, color="#4472C4", label="Human art", density=True) - ax_hist.hist(ai_probs, bins=20, alpha=0.6, color="#ED7D31", label="AI art", density=True) - ax_hist.axvline(x=0.3, color="red", linestyle="--", alpha=0.5, label="Abstain zone") - ax_hist.axvline(x=0.7, color="red", linestyle="--", alpha=0.5) - ax_hist.axvspan(0.3, 0.7, alpha=0.1, color="red") - ax_hist.set_xlabel("Ensemble Probability (AI)", 
fontsize=9) - ax_hist.set_ylabel("Density", fontsize=9) - ax_hist.set_title("Probability Distribution", fontsize=10, fontfamily="serif") - ax_hist.legend(fontsize=7) - - # Per-model agreement analysis - ax_agree = fig.add_axes([0.08, 0.04, 0.84, 0.18]) - ax_agree.axis("off") - n_all_agree = sum(1 for i in range(len(y)) - if len(set(results[n]["probs"][i] > 0.5 for n in ["SVM", "MLP", "XGBoost"])) == 1) - n_disagree = len(y) - n_all_agree - agree_text = ( - "Model Agreement Analysis\n\n" - f" All 3 models agree: {n_all_agree}/{len(y)} ({n_all_agree/len(y):.0%})\n" - f" At least 1 disagrees: {n_disagree}/{len(y)} ({n_disagree/len(y):.0%})\n\n" - " When models disagree, the ensemble uses averaged probability with\n" - " abstention zone (0.3-0.7). This reduces false positives at the cost\n" - " of some unclassified images -- a deliberate tradeoff for precision." - ) - ax_agree.text(0, 1, agree_text, fontsize=9, ha="left", va="top", fontfamily="serif", - transform=ax_agree.transAxes) - - pdf.savefig(fig) - plt.close(fig) - - # ===== PAGE 3: Feature Analysis + Examples ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - fig.suptitle("Feature Analysis & Examples", fontsize=14, - fontweight="bold", fontfamily="serif", y=0.96) - - # Example images - n = min(4, len(imgs_human), len(imgs_ai)) - gs = gridspec.GridSpec(2, n, top=0.9, bottom=0.65, left=0.05, right=0.95, hspace=0.2, wspace=0.1) - for i in range(n): - ax = fig.add_subplot(gs[0, i]) - ax.imshow(imgs_human[i]) - ax.set_title(f"Human #{i+1}", fontsize=8) - ax.axis("off") - for i in range(n): - ax = fig.add_subplot(gs[1, i]) - ax.imshow(imgs_ai[i]) - ax.set_title(f"AI #{i+1}", fontsize=8) - ax.axis("off") - - # Feature importance - ax_imp = fig.add_axes([0.12, 0.08, 0.76, 0.5]) - importance = model.get_score(importance_type="gain") - sorted_imp = sorted(importance.items(), key=lambda x: x[1], reverse=True)[:20] - if sorted_imp: - # Map back to real feature names - disp_names = [] - for 
fname, _ in sorted_imp: - idx = int(fname[1:]) # f0 -> 0 - disp_names.append(feature_names[idx] if idx < len(feature_names) else fname) - disp_names = disp_names[::-1] - gains = [x[1] for x in sorted_imp][::-1] - - color_map = {"fft": "#C00000", "dct": "#C00000", - "hog": "#ED7D31", "edge": "#ED7D31", - "lbp": "#70AD47", "contrast": "#70AD47", "correlation": "#70AD47", - "energy": "#70AD47", "homogeneity": "#70AD47"} - bar_colors = [] - for n in disp_names: - c = "#4472C4" # default - for prefix, color in color_map.items(): - if prefix in n: - c = color - break - bar_colors.append(c) - - ax_imp.barh(range(len(disp_names)), gains, color=bar_colors) - ax_imp.set_yticks(range(len(disp_names))) - ax_imp.set_yticklabels(disp_names, fontsize=7) - ax_imp.set_xlabel("XGBoost Gain", fontsize=9) - ax_imp.set_title("Top 20 Features by Importance", fontsize=10, fontfamily="serif") - - legend_elements = [ - Patch(facecolor="#C00000", label="Frequency (FFT/DCT)"), - Patch(facecolor="#ED7D31", label="Shape (HOG/edges)"), - Patch(facecolor="#70AD47", label="Texture (GLCM/LBP)"), - Patch(facecolor="#4472C4", label="Color/Brightness/Noise"), - ] - ax_imp.legend(handles=legend_elements, fontsize=7, loc="lower right") - - pdf.savefig(fig) - plt.close(fig) - - # ===== PAGE 4: Methodology & Architecture ===== - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - ax = fig.add_axes([0, 0, 1, 1]) - ax.axis("off") - - ax.text(0.5, 0.95, "Architecture & Methodology", fontsize=14, - fontweight="bold", ha="center", fontfamily="serif", transform=ax.transAxes) - - method_text = ( - "Multi-Signal Ensemble Architecture\n\n" - "The detection system combines three orthogonal classifiers, each seeing the\n" - "same feature space but learning different decision boundaries:\n\n" - " 1. SVM (RBF kernel) - Finds nonlinear decision boundaries in feature space.\n" - " Calibrated with Platt scaling (sigmoid) for reliable probabilities.\n\n" - " 2. 
MLP (100 hidden units) - Learns feature interactions through backpropagation.\n" - " Calibrated with Platt scaling for probability alignment.\n\n" - " 3. XGBoost (gradient boosted trees) - Captures feature thresholds and\n" - " interactions. Naturally outputs calibrated log-odds.\n\n" - "Ensemble Voting: Averaged calibrated probabilities from all three models.\n" - "This is more robust than majority voting because it accounts for confidence.\n\n" - "Calibrated Confidence & Abstention\n\n" - "Instead of a hard 0.5 threshold, the ensemble uses a deliberate 'uncertain'\n" - "zone between 0.3 and 0.7 probability. Images in this zone are marked as\n" - "'uncertain' rather than forced into a class. This dramatically improves\n" - "precision on the images that ARE classified.\n\n" - "Feature Extraction Pipeline (49 features, CPU-only)\n\n" - " Brightness (2) - Global luminance statistics\n" - " Color (23) - RGB/HSV histogram moments (mean, var, kurtosis, skew, entropy)\n" - " Texture (6) - GLCM co-occurrence + LBP local patterns\n" - " Shape (6) - HOG gradient histograms + Canny edge density\n" - " Noise (2) - Estimated noise entropy + signal-to-noise ratio\n" - " Frequency (10) - FFT radial band energies, spectral centroid, phase coherence,\n" - " DCT AC/DC ratio, high-freq energy, coefficient sparsity\n\n" - "The frequency branch is the key addition beyond Li & Stamp (2025). AI generators\n" - "leave characteristic spectral signatures from upsampling layers, attention patterns,\n" - "and latent space decoding. 
These are invisible in pixel space but clearly visible\n" - "in the frequency domain.\n\n" - "Limitations\n\n" - " - Tested on mismatched subjects (WikiArt paintings vs AI banana images)\n" - " - Not yet tested on hard negatives (polished digital art, img2img, LoRA art)\n" - " - Single generator family in AI training data (Stable Diffusion variants)\n" - " - 200 sample dataset is small for robust conclusions\n\n" - "References\n\n" - " [1] Li & Stamp, 'Detecting AI-generated Artwork', arXiv:2504.07078, 2025\n" - " [2] negate project, github.com/darkshapes/negate" - ) - ax.text(0.06, 0.9, method_text, fontsize=8.5, ha="left", va="top", fontfamily="serif", - transform=ax.transAxes) - - pdf.savefig(fig) - plt.close(fig) - - print(f"PDF saved: {pdf_path}") - return pdf_path - - -def main(): - print("=" * 55) - print(" ARTWORK DETECTION - ENSEMBLE RESULTS") - print("=" * 55) - - X, y, names, imgs_h, imgs_a = load_and_extract() - print(f"Dataset: {np.sum(y==0)} human + {np.sum(y==1)} AI, {X.shape[1]} features") - - results, ens_probs, ens_preds, model = run_ensemble_cv(X, y) - - print(f"\n{'Model':<15} {'Acc':>8} {'Prec':>8} {'Rec':>8} {'F1':>8} {'AUC':>8}") - print("-" * 55) - for name, r in results.items(): - extra = f" ({r.get('n_abstained', '-')} abstained)" if 'n_abstained' in r else "" - print(f"{name:<15} {r['accuracy']:>7.1%} {r['precision']:>7.1%} {r['recall']:>7.1%} " - f"{r['f1']:>7.1%} {r['roc_auc']:>7.4f}{extra}") - - generate_pdf(X, y, names, results, ens_probs, ens_preds, model, imgs_h, imgs_a) - print("Done.") - - -if __name__ == "__main__": - main() diff --git a/tests/test_adversarial_robustness.py b/tests/test_adversarial_robustness.py new file mode 100644 index 0000000..abe4e7d --- /dev/null +++ b/tests/test_adversarial_robustness.py @@ -0,0 +1,348 @@ +"""Adversarial robustness and false positive analysis. + +Tests whether the 148-feature detector holds up under: +1. Post-processing attacks (JPEG, noise, resize, blur, crop) +2. 
False positive stress test (digital art, ambiguous content) +3. Threshold calibration (precision-recall tradeoff) +4. Social media simulation (combined degradation) + +Uses ImagiNet paintings data (already on disk). +""" + +from __future__ import annotations + +import json +import warnings +from io import BytesIO +from pathlib import Path + +import lightgbm as lgb +import numpy as np +from PIL import Image, ImageFilter, ImageEnhance +from sklearn.metrics import ( + accuracy_score, f1_score, precision_recall_curve, precision_score, + recall_score, roc_auc_score, +) +from sklearn.model_selection import StratifiedKFold +from tqdm import tqdm + +warnings.filterwarnings("ignore") + +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) +from negate.extract.feature_artwork import ArtworkExtract + +ext = ArtworkExtract() +BASE = Path("D:/Projects/negate/negate/.datasets/imaginet/extracted") +SEED = 42 +N_IMAGES = 300 # per class for speed +rng = np.random.RandomState(SEED) + + +def load_image_paths(path: Path, recursive: bool, max_n: int) -> list[Path]: + exts = {".jpg", ".jpeg", ".png", ".webp", ".bmp"} + if recursive: + files = [f for f in path.rglob("*") if f.suffix.lower() in exts] + else: + files = [f for f in path.iterdir() if f.suffix.lower() in exts] + if len(files) > max_n: + files = list(rng.choice(files, max_n, replace=False)) + return files + + +def extract_features_from_images(images: list[Image.Image], desc: str = "") -> np.ndarray: + rows = [] + for img in tqdm(images, desc=desc, leave=False): + try: + feat = ext(img) + rows.append(list(feat.values())) + except Exception: + rows.append([0.0] * 148) + X = np.array(rows, dtype=np.float64) + return np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + + +def extract_features_from_paths(files: list[Path], desc: str = "") -> np.ndarray: + images = [] + for f in files: + try: + images.append(Image.open(f).convert("RGB")) + except Exception: + pass + return extract_features_from_images(images, desc) + + 
+# ---- Perturbation functions ---- + +def jpeg_compress(img: Image.Image, quality: int) -> Image.Image: + buf = BytesIO() + img.save(buf, format="JPEG", quality=quality) + buf.seek(0) + return Image.open(buf).convert("RGB") + + +def add_gaussian_noise(img: Image.Image, sigma: float) -> Image.Image: + arr = np.array(img, dtype=np.float64) + noise = rng.normal(0, sigma, arr.shape) + arr = np.clip(arr + noise, 0, 255).astype(np.uint8) + return Image.fromarray(arr) + + +def resize_down_up(img: Image.Image, small_size: int) -> Image.Image: + orig_size = img.size + img_small = img.resize((small_size, small_size), Image.BICUBIC) + return img_small.resize(orig_size, Image.BICUBIC) + + +def center_crop(img: Image.Image, ratio: float = 0.7) -> Image.Image: + w, h = img.size + new_w, new_h = int(w * ratio), int(h * ratio) + left = (w - new_w) // 2 + top = (h - new_h) // 2 + return img.crop((left, top, left + new_w, top + new_h)) + + +def gaussian_blur(img: Image.Image, radius: float) -> Image.Image: + return img.filter(ImageFilter.GaussianBlur(radius=radius)) + + +def adjust_brightness(img: Image.Image, factor: float) -> Image.Image: + return ImageEnhance.Brightness(img).enhance(factor) + + +def social_media_sim(img: Image.Image) -> Image.Image: + """Simulate social media pipeline: resize down, JPEG, slight blur.""" + img = img.resize((1024, 1024), Image.BICUBIC) + img = jpeg_compress(img, 75) + img = gaussian_blur(img, 0.5) + return img + + +PERTURBATIONS = { + "jpeg_q30": lambda img: jpeg_compress(img, 30), + "jpeg_q50": lambda img: jpeg_compress(img, 50), + "jpeg_q70": lambda img: jpeg_compress(img, 70), + "noise_s5": lambda img: add_gaussian_noise(img, 5), + "noise_s15": lambda img: add_gaussian_noise(img, 15), + "noise_s30": lambda img: add_gaussian_noise(img, 30), + "resize_128": lambda img: resize_down_up(img, 128), + "resize_64": lambda img: resize_down_up(img, 64), + "crop_70pct": lambda img: center_crop(img, 0.7), + "crop_50pct": lambda img: center_crop(img, 0.5), 
+ "blur_r1": lambda img: gaussian_blur(img, 1), + "blur_r3": lambda img: gaussian_blur(img, 3), + "bright_0.7": lambda img: adjust_brightness(img, 0.7), + "bright_1.3": lambda img: adjust_brightness(img, 1.3), + "social_media": social_media_sim, +} + + +def main(): + print("=" * 70) + print(" ADVERSARIAL ROBUSTNESS & FALSE POSITIVE ANALYSIS") + print(" 148 features, LightGBM, ImagiNet paintings") + print("=" * 70) + + # Load real and fake images + print("\nLoading images...") + real_files = load_image_paths(BASE / "wikiart", recursive=True, max_n=N_IMAGES) + # Mix of generators for fake + fake_files = [] + for gen_dir in ["sdxl_paintings_fake", "sd_paintings_fake", "dalle3", "journeydb"]: + fake_files.extend(load_image_paths(BASE / gen_dir, recursive=False, max_n=N_IMAGES // 4)) + rng.shuffle(fake_files) + fake_files = fake_files[:N_IMAGES] + + print(f" Real: {len(real_files)}, Fake: {len(fake_files)}") + + real_images = [Image.open(f).convert("RGB") for f in tqdm(real_files, desc="Loading real")] + fake_images = [Image.open(f).convert("RGB") for f in tqdm(fake_files, desc="Loading fake")] + + # Extract clean features and train model + print("\nExtracting clean features...") + X_real = extract_features_from_images(real_images, "Real features") + X_fake = extract_features_from_images(fake_images, "Fake features") + + X = np.vstack([X_real, X_fake]) + y = np.concatenate([np.zeros(len(X_real)), np.ones(len(X_fake))]) + + # Train on 70%, test perturbations on 30% + n = len(y) + idx = rng.permutation(n) + split = int(0.7 * n) + train_idx, test_idx = idx[:split], idx[split:] + + model = lgb.LGBMClassifier( + n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED, + ) + model.fit(X[train_idx], y[train_idx]) + + # Clean baseline + y_prob_clean = model.predict_proba(X[test_idx])[:, 1] + y_pred_clean = (y_prob_clean > 0.5).astype(int) + clean_acc = accuracy_score(y[test_idx], y_pred_clean) + clean_auc = 
roc_auc_score(y[test_idx], y_prob_clean) + print(f"\nClean baseline: acc={clean_acc:.4f} auc={clean_auc:.4f}") + + # ---- EXP 1: Perturbation robustness ---- + print("\n" + "=" * 70) + print(" EXP 1: Adversarial Robustness (perturbations on test set)") + print("=" * 70) + + # Get the test images + test_images_real = [real_images[i] for i in range(len(real_images)) if i in set(test_idx[test_idx < len(real_images)])] + test_images_fake = [fake_images[i - len(real_images)] for i in test_idx if i >= len(real_images)] + test_images = test_images_real + test_images_fake + test_labels = np.concatenate([np.zeros(len(test_images_real)), np.ones(len(test_images_fake))]) + + results_perturb = {"clean": {"acc": clean_acc, "auc": clean_auc}} + + for pert_name, pert_fn in PERTURBATIONS.items(): + print(f"\n Applying {pert_name}...") + perturbed = [] + for img in tqdm(test_images, desc=f" {pert_name}", leave=False): + try: + perturbed.append(pert_fn(img)) + except Exception: + perturbed.append(img) # fallback to original + + X_pert = extract_features_from_images(perturbed, f" {pert_name} features") + y_prob_pert = model.predict_proba(X_pert)[:, 1] + y_pred_pert = (y_prob_pert > 0.5).astype(int) + + acc = accuracy_score(test_labels, y_pred_pert) + auc = roc_auc_score(test_labels, y_prob_pert) + delta = acc - clean_acc + results_perturb[pert_name] = {"acc": float(acc), "auc": float(auc), "delta": float(delta)} + print(f" acc={acc:.4f} auc={auc:.4f} delta={delta:+.4f}") + + # Summary table + print("\n +---------------------+--------+--------+---------+") + print(" | Perturbation | Acc | AUC | Delta |") + print(" +---------------------+--------+--------+---------+") + for name, r in sorted(results_perturb.items(), key=lambda x: -x[1]["acc"]): + print(f" | {name:19s} | {r['acc']:.4f} | {r['auc']:.4f} | {r.get('delta', 0):+.4f} |") + print(" +---------------------+--------+--------+---------+") + + # ---- EXP 2: Threshold calibration ---- + print("\n" + "=" * 70) + print(" EXP 2: 
Threshold Calibration (Precision-Recall Tradeoff)") + print("=" * 70) + + # Use full 5-fold CV probabilities for calibration + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED) + all_probs, all_labels = [], [] + for tr, te in skf.split(X, y): + m = lgb.LGBMClassifier( + n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED, + ) + m.fit(X[tr], y[tr]) + all_probs.extend(m.predict_proba(X[te])[:, 1]) + all_labels.extend(y[te]) + + all_probs = np.array(all_probs) + all_labels = np.array(all_labels) + + prec_arr, rec_arr, thresholds = precision_recall_curve(all_labels, all_probs) + + # Find thresholds for different precision targets + print("\n Threshold analysis (higher threshold = fewer false positives):") + print(" +------------+--------+-----------+--------+---------+") + print(" | Threshold | Prec | Recall | F1 | FP Rate |") + print(" +------------+--------+-----------+--------+---------+") + + results_thresh = {} + for target_thresh in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]: + pred = (all_probs >= target_thresh).astype(int) + if pred.sum() == 0: + continue + prec = precision_score(all_labels, pred, zero_division=0) + rec = recall_score(all_labels, pred, zero_division=0) + f1 = f1_score(all_labels, pred, average="macro", zero_division=0) + # False positive rate: real images incorrectly flagged as AI + real_mask = all_labels == 0 + fp_rate = float(pred[real_mask].mean()) + + results_thresh[str(target_thresh)] = { + "precision": float(prec), "recall": float(rec), + "f1": float(f1), "fp_rate": float(fp_rate), + } + print(f" | {target_thresh:10.2f} | {prec:.4f} | {rec:9.4f} | {f1:.4f} | {fp_rate:7.4f} |") + print(" +------------+--------+-----------+--------+---------+") + + # Find the sweet spot: highest F1 with FP rate < 5% + best_thresh = None + best_f1 = 0 + for t_str, r in results_thresh.items(): + if r["fp_rate"] <= 0.05 and r["f1"] > best_f1: + best_f1 = r["f1"] + best_thresh = t_str + if 
best_thresh: + print(f"\n Recommended threshold: {best_thresh} (F1={results_thresh[best_thresh]['f1']:.4f}, FP rate={results_thresh[best_thresh]['fp_rate']:.4f})") + + # ---- EXP 3: Social media simulation ---- + print("\n" + "=" * 70) + print(" EXP 3: Social Media Pipeline Simulation") + print("=" * 70) + print(" (Resize to 1024px + JPEG Q75 + slight blur)") + # Already in perturbation results + sm = results_perturb.get("social_media", {}) + print(f" Social media accuracy: {sm.get('acc', 'N/A')}") + print(f" Social media AUC: {sm.get('auc', 'N/A')}") + print(f" Delta from clean: {sm.get('delta', 'N/A')}") + + # ---- EXP 4: Worst-case adversarial (combined attacks) ---- + print("\n" + "=" * 70) + print(" EXP 4: Worst-Case Adversarial (Combined Attacks)") + print("=" * 70) + + def worst_case_attack(img: Image.Image) -> Image.Image: + """JPEG Q30 + resize 128→orig + noise σ=10 + blur r=1""" + img = jpeg_compress(img, 30) + img = resize_down_up(img, 128) + img = add_gaussian_noise(img, 10) + img = gaussian_blur(img, 1) + return img + + perturbed_worst = [worst_case_attack(img) for img in tqdm(test_images, desc="Worst case")] + X_worst = extract_features_from_images(perturbed_worst, "Worst case features") + y_prob_worst = model.predict_proba(X_worst)[:, 1] + y_pred_worst = (y_prob_worst > 0.5).astype(int) + worst_acc = accuracy_score(test_labels, y_pred_worst) + worst_auc = roc_auc_score(test_labels, y_prob_worst) + print(f" Worst-case accuracy: {worst_acc:.4f}") + print(f" Worst-case AUC: {worst_auc:.4f}") + print(f" Delta from clean: {worst_acc - clean_acc:+.4f}") + + # Save all results + all_results = { + "clean_baseline": {"acc": float(clean_acc), "auc": float(clean_auc)}, + "perturbation_robustness": {k: {kk: float(vv) for kk, vv in v.items()} for k, v in results_perturb.items()}, + "threshold_calibration": results_thresh, + "recommended_threshold": best_thresh, + "worst_case_adversarial": {"acc": float(worst_acc), "auc": float(worst_auc)}, + "n_real": 
len(real_files), + "n_fake": len(fake_files), + } + + out_path = Path(__file__).parent.parent / "results" / "adversarial_robustness_results.json" + with open(out_path, "w") as f: + json.dump(all_results, f, indent=2) + + print(f"\n{'='*70}") + print(f" SUMMARY") + print(f"{'='*70}") + print(f" Clean accuracy: {clean_acc:.4f}") + print(f" Social media: {sm.get('acc', 'N/A')}") + print(f" Worst-case attack: {worst_acc:.4f}") + print(f" Recommended thresh: {best_thresh}") + if best_thresh: + r = results_thresh[best_thresh] + print(f" → Precision: {r['precision']:.4f}, Recall: {r['recall']:.4f}, FP rate: {r['fp_rate']:.4f}") + print(f"\n Results saved to {out_path}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_artwork_accuracy.py b/tests/test_artwork_accuracy.py index 75ba27a..d191d63 100644 --- a/tests/test_artwork_accuracy.py +++ b/tests/test_artwork_accuracy.py @@ -96,7 +96,7 @@ def test_feature_extraction(self, benchmark_data): print(f"Features: {benchmark_data['X'].shape[1]}") assert benchmark_data["n_human"] >= 50 assert benchmark_data["n_ai"] >= 50 - assert benchmark_data["X"].shape[1] == 49 + assert benchmark_data["X"].shape[1] == 73 def test_svm_cross_validation(self, benchmark_data): """SVM with RBF kernel — paper's best binary model (97.9% reported).""" diff --git a/tests/test_clip_bias_defactify.py b/tests/test_clip_bias_defactify.py new file mode 100644 index 0000000..658c959 --- /dev/null +++ b/tests/test_clip_bias_defactify.py @@ -0,0 +1,318 @@ +# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 +"""CLIP bias test on Defactify MS-COCOAI dataset. + +Tests whether CLIP's detection advantage comes from recognizing its own +latent fingerprint in images from CLIP-based generators. 
+ +Dataset: Rajarshi-Roy-research/Defactify_Image_Dataset (96K images) +- Label_B=0: Real (MS COCO) +- Label_B=1: SD 2.1 (uses CLIP) +- Label_B=2: SDXL (uses CLIP-L + CLIP-G) +- Label_B=3: SD 3 (uses CLIP-L + CLIP-G + T5) +- Label_B=4: Midjourney v6 (proprietary, unknown) +- Label_B=5: DALL-E 3 (uses T5, NOT CLIP) + +Key comparison: CLIP accuracy on SD 2.1/SDXL (pure CLIP) vs DALL-E 3 (no CLIP). +If CLIP's advantage is larger on CLIP-based generators, bias is confirmed. +""" + +from __future__ import annotations + +import json +import sys +import time +from datetime import datetime +from pathlib import Path + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from matplotlib.backends.backend_pdf import PdfPages +import numpy as np +import pandas as pd +import torch +import xgboost as xgb +from datasets import load_dataset, Image as HFImage +from PIL import Image +from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score +from sklearn.model_selection import StratifiedKFold +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from tqdm import tqdm + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from negate.extract.feature_artwork import ArtworkExtract +from negate.extract.feature_style import StyleExtract + +SEED = 42 +N_FOLDS = 5 +N_PER_CLASS = 500 # per generator +RESULTS_DIR = Path(__file__).parent.parent / "results" + +GENERATORS = { + 0: {"name": "Real (MS COCO)", "uses_clip": None}, + 1: {"name": "SD 2.1", "uses_clip": True}, + 2: {"name": "SDXL", "uses_clip": True}, + 3: {"name": "SD 3", "uses_clip": True}, # hybrid: CLIP + T5 + 4: {"name": "Midjourney v6", "uses_clip": "unknown"}, + 5: {"name": "DALL-E 3", "uses_clip": False}, # T5 only +} + + +def extract_handcrafted(images): + art = ArtworkExtract() + style = StyleExtract() + features = [] + for img in tqdm(images, desc=" Hand-crafted"): + try: + f = art(img) + f |= style(img) + features.append(f) + except 
Exception: + features.append(None) + valid = [i for i, f in enumerate(features) if f is not None] + df = pd.DataFrame([f for f in features if f is not None]).fillna(0) + X = df.to_numpy(dtype=np.float64) + return np.where(np.isfinite(X), X, 0), valid + + +def extract_clip(images): + from transformers import CLIPModel, CLIPProcessor + model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + model.eval() + device = "cuda" if torch.cuda.is_available() else "cpu" + model = model.to(device) + + all_emb, valid = [], [] + bs = 32 + for i in tqdm(range(0, len(images), bs), desc=" CLIP"): + batch = [img for img in images[i:i+bs] if img and isinstance(img, Image.Image)] + batch_idx = [i+j for j, img in enumerate(images[i:i+bs]) if img and isinstance(img, Image.Image)] + if not batch: + continue + with torch.no_grad(): + inputs = processor(images=batch, return_tensors="pt", padding=True).to(device) + out = model.get_image_features(**inputs) + emb = out.pooler_output.cpu().numpy() if hasattr(out, 'pooler_output') else out.cpu().numpy() + all_emb.append(emb) + valid.extend(batch_idx) + return np.vstack(all_emb), valid + + +def run_cv(X, y, model_type="svm"): + skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) + all_true, all_prob = [], [] + for train_idx, test_idx in skf.split(X, y): + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + if model_type == "svm": + scaler = StandardScaler() + clf = SVC(kernel="rbf", probability=True, random_state=SEED) + clf.fit(scaler.fit_transform(X_train), y_train) + y_prob = clf.predict_proba(scaler.transform(X_test))[:, 1] + elif model_type == "xgb": + spw = np.sum(y_train == 0) / max(np.sum(y_train == 1), 1) + dtrain = xgb.DMatrix(X_train, label=y_train) + dtest = xgb.DMatrix(X_test, label=y_test) + m = xgb.train({"objective": "binary:logistic", "max_depth": 5, + "learning_rate": 0.1, 
"scale_pos_weight": spw, "seed": SEED}, + dtrain, num_boost_round=200, evals=[(dtest, "t")], + early_stopping_rounds=10, verbose_eval=False) + y_prob = m.predict(dtest) + all_true.extend(y_test) + all_prob.extend(y_prob) + + yt, yp = np.array(all_true), np.array(all_prob) + ypr = (yp > 0.5).astype(int) + return { + "accuracy": float(accuracy_score(yt, ypr)), + "precision": float(precision_score(yt, ypr, zero_division=0)), + "recall": float(recall_score(yt, ypr, zero_division=0)), + "roc_auc": float(roc_auc_score(yt, yp)), + } + + +def generate_pdf(results): + RESULTS_DIR.mkdir(exist_ok=True) + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + pdf_path = RESULTS_DIR / f"clip_bias_defactify_{ts}.pdf" + + with PdfPages(str(pdf_path)) as pdf: + fig = plt.figure(figsize=(8.5, 11)) + fig.patch.set_facecolor("white") + ax = fig.add_axes([0, 0, 1, 1]) + ax.axis("off") + + ax.text(0.5, 0.93, "CLIP Bias Analysis:\nDefactify MS-COCOAI Dataset", + transform=ax.transAxes, fontsize=18, fontweight="bold", + ha="center", va="top", fontfamily="serif") + ax.text(0.5, 0.83, f"negate — darkshapes — {datetime.now().strftime('%B %d, %Y')}", + transform=ax.transAxes, fontsize=10, ha="center", fontfamily="serif", style="italic") + + hyp = ( + "Hypothesis: CLIP embeddings achieve high detection accuracy because many\n" + "generators use CLIP as their text encoder, so CLIP recognizes its own fingerprint.\n\n" + "Test: Compare CLIP vs hand-crafted feature accuracy PER GENERATOR.\n" + "If CLIP's advantage is larger on CLIP-based generators (SD 2.1, SDXL, SD 3)\n" + "than on non-CLIP generators (DALL-E 3), bias is confirmed." 
+ ) + ax.text(0.08, 0.74, hyp, transform=ax.transAxes, fontsize=9, + ha="left", va="top", fontfamily="serif", + bbox=dict(boxstyle="round,pad=0.4", facecolor="lightyellow", edgecolor="gray")) + + # Results table + table_data = [] + for r in results: + table_data.append([ + r["generator"], "Yes" if r["uses_clip"] is True else "No" if r["uses_clip"] is False else "?", + f"{r['handcrafted_best']:.1%}", f"{r['clip_best']:.1%}", + f"{r['clip_best'] - r['handcrafted_best']:+.1%}pp" + ]) + + ax_t = fig.add_axes([0.05, 0.38, 0.9, 0.28]) + ax_t.axis("off") + table = ax_t.table(cellText=table_data, + colLabels=["Generator", "Uses CLIP?", "Hand-crafted", "CLIP", "CLIP Advantage"], + loc="center", cellLoc="center") + table.auto_set_font_size(False) + table.set_fontsize(8.5) + table.scale(1, 1.6) + for (row, col), cell in table.get_celld().items(): + if row == 0: + cell.set_facecolor("#4472C4") + cell.set_text_props(color="white", fontweight="bold") + + # Analysis + clip_gens = [r for r in results if r["uses_clip"] is True] + non_clip = [r for r in results if r["uses_clip"] is False] + clip_avg_adv = np.mean([r["clip_best"] - r["handcrafted_best"] for r in clip_gens]) if clip_gens else 0 + non_clip_avg_adv = np.mean([r["clip_best"] - r["handcrafted_best"] for r in non_clip]) if non_clip else 0 + + if clip_gens and non_clip: + diff = clip_avg_adv - non_clip_avg_adv + if diff > 0.05: + verdict = ( + f"CLIP BIAS CONFIRMED.\n\n" + f"CLIP advantage on CLIP-based generators: {clip_avg_adv:+.1%}pp (avg)\n" + f"CLIP advantage on non-CLIP generators: {non_clip_avg_adv:+.1%}pp (avg)\n" + f"Difference: {diff:+.1%}pp\n\n" + "CLIP performs significantly better on images from generators that use\n" + "CLIP internally. This suggests CLIP partially recognizes its own latent\n" + "fingerprint rather than detecting universal generation artifacts." 
+ ) + elif diff < -0.05: + verdict = ( + f"CLIP BIAS NOT CONFIRMED (reverse pattern).\n\n" + f"CLIP advantage on CLIP-based generators: {clip_avg_adv:+.1%}pp (avg)\n" + f"CLIP advantage on non-CLIP generators: {non_clip_avg_adv:+.1%}pp (avg)\n\n" + "CLIP actually has a LARGER advantage on non-CLIP generators.\n" + "This suggests CLIP detects genuine visual artifacts, not its own fingerprint." + ) + else: + verdict = ( + f"NO SIGNIFICANT CLIP BIAS.\n\n" + f"CLIP advantage on CLIP-based generators: {clip_avg_adv:+.1%}pp (avg)\n" + f"CLIP advantage on non-CLIP generators: {non_clip_avg_adv:+.1%}pp (avg)\n" + f"Difference: {diff:+.1%}pp (not significant)\n\n" + "CLIP's advantage is consistent across generator types, suggesting\n" + "it detects genuine visual differences, not architectural fingerprints." + ) + else: + verdict = "Insufficient data." + + ax.text(0.08, 0.3, verdict, transform=ax.transAxes, fontsize=9.5, + ha="left", va="top", fontfamily="serif", + bbox=dict(boxstyle="round,pad=0.4", facecolor="#E8F5E9", edgecolor="#66BB6A")) + + pdf.savefig(fig) + plt.close(fig) + + print(f"PDF saved to: {pdf_path}") + return pdf_path + + +def main(): + print("=" * 60) + print(" CLIP BIAS ANALYSIS — Defactify MS-COCOAI") + print(" 5 generators, labeled, semantically matched") + print("=" * 60) + + ds = load_dataset("Rajarshi-Roy-research/Defactify_Image_Dataset", split="train") + ds = ds.cast_column("Image", HFImage(decode=True, mode="RGB")) + print(f"Total: {len(ds)} images") + + rng = np.random.RandomState(SEED) + real_indices = [i for i, l in enumerate(ds["Label_B"]) if l == 0] + real_sample = rng.choice(real_indices, size=N_PER_CLASS, replace=False) + real_images = [ds[int(i)]["Image"] for i in tqdm(real_sample, desc="Loading real")] + + all_results = [] + + for gen_id in [1, 2, 3, 4, 5]: + gen_info = GENERATORS[gen_id] + print(f"\n{'='*50}") + print(f" {gen_info['name']} (uses_clip={gen_info['uses_clip']}) vs Real") + print(f"{'='*50}") + + gen_indices = [i for i, l 
in enumerate(ds["Label_B"]) if l == gen_id] + gen_sample = rng.choice(gen_indices, size=N_PER_CLASS, replace=False) + gen_images = [ds[int(i)]["Image"] for i in tqdm(gen_sample, desc=f"Loading {gen_info['name']}")] + + all_images = real_images + gen_images + y = np.array([0] * len(real_images) + [1] * len(gen_images)) + + # Hand-crafted + print(" Extracting hand-crafted features...") + X_hc, hc_valid = extract_handcrafted(all_images) + y_hc = y[hc_valid] + + # CLIP + print(" Extracting CLIP features...") + X_clip, clip_valid = extract_clip(all_images) + y_clip = y[clip_valid] + + result = {"generator": gen_info["name"], "uses_clip": gen_info["uses_clip"]} + + for feat_name, X_f, y_f in [("handcrafted", X_hc, y_hc), ("clip", X_clip, y_clip)]: + for model in ["xgb", "svm"]: + key = f"{feat_name}_{model}" + r = run_cv(X_f, y_f, model) + result[key] = r + print(f" {key:25s} acc={r['accuracy']:.2%} auc={r['roc_auc']:.4f}") + + result["handcrafted_best"] = max(result["handcrafted_xgb"]["accuracy"], + result["handcrafted_svm"]["accuracy"]) + result["clip_best"] = max(result["clip_xgb"]["accuracy"], + result["clip_svm"]["accuracy"]) + result["clip_advantage"] = result["clip_best"] - result["handcrafted_best"] + all_results.append(result) + + # Save + RESULTS_DIR.mkdir(exist_ok=True) + json_path = RESULTS_DIR / "clip_bias_defactify_results.json" + with open(json_path, "w") as f: + json.dump({"timestamp": datetime.now().isoformat(), "results": all_results}, f, indent=2) + + generate_pdf(all_results) + + # Summary + print(f"\n{'='*60}") + print(" SUMMARY — CLIP advantage per generator") + print(f"{'='*60}") + for r in all_results: + clip_tag = "CLIP" if r["uses_clip"] is True else "NO-CLIP" if r["uses_clip"] is False else "???" 
+ print(f" {r['generator']:20s} [{clip_tag:7s}] hand={r['handcrafted_best']:.1%} " + f"clip={r['clip_best']:.1%} delta={r['clip_advantage']:+.1%}") + + clip_gens = [r for r in all_results if r["uses_clip"] is True] + non_clip = [r for r in all_results if r["uses_clip"] is False] + if clip_gens and non_clip: + print(f"\n Avg CLIP advantage on CLIP generators: " + f"{np.mean([r['clip_advantage'] for r in clip_gens]):+.1%}") + print(f" Avg CLIP advantage on non-CLIP generators: " + f"{np.mean([r['clip_advantage'] for r in non_clip]):+.1%}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_experiments.py b/tests/test_experiments.py deleted file mode 100644 index f035c48..0000000 --- a/tests/test_experiments.py +++ /dev/null @@ -1,514 +0,0 @@ -# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 -"""Run all feature experiments on Hemg art dataset and compare. - -Experiments: - 1. Artwork features only (49 features) — baseline - 2. Style features only (15 features) - 3. Artwork + Style combined (64 features) - 4. CLIP embeddings (768 features) - 5. CLIP + Artwork + Style (832 features) - -Each experiment: 4000 samples, 5-fold CV, XGBoost/SVM/MLP. -Generates a comparison PDF. 
-""" - -from __future__ import annotations - -import json -import sys -import time -from datetime import datetime -from pathlib import Path - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from matplotlib.backends.backend_pdf import PdfPages -import matplotlib.gridspec as gridspec -import numpy as np -import pandas as pd -import torch -import xgboost as xgb -from datasets import load_dataset, Image as HFImage -from PIL import Image -from sklearn.metrics import ( - accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, - confusion_matrix, roc_curve, -) -from sklearn.model_selection import StratifiedKFold -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC -from tqdm import tqdm - -sys.path.insert(0, str(Path(__file__).parent.parent)) -from negate.extract.feature_artwork import ArtworkExtract -from negate.extract.feature_style import StyleExtract - -SEED = 42 -N_FOLDS = 5 -N_PER_CLASS = 2000 -REPO = "Hemg/AI-Generated-vs-Real-Images-Datasets" -RESULTS_DIR = Path(__file__).parent.parent / "results" - - -def load_dataset_cached(): - """Load and return the Hemg dataset.""" - print("Loading Hemg dataset...") - ds = load_dataset(REPO, split="train") - ds = ds.cast_column("image", HFImage(decode=True, mode="RGB")) - return ds - - -def extract_artwork_features(ds, indices) -> np.ndarray: - """Extract 49 artwork features.""" - extractor = ArtworkExtract() - features = [] - for idx in tqdm(indices, desc=" Artwork features"): - try: - img = ds[int(idx)]["image"] - if img and isinstance(img, Image.Image): - features.append(extractor(img)) - else: - features.append(None) - except Exception: - features.append(None) - df = pd.DataFrame([f for f in features if f is not None]).fillna(0) - X = df.to_numpy(dtype=np.float64) - return np.where(np.isfinite(X), X, 0), list(df.columns), [i for i, f in enumerate(features) if f is not None] - - -def 
extract_style_features(ds, indices) -> np.ndarray: - """Extract 15 style features.""" - extractor = StyleExtract() - features = [] - for idx in tqdm(indices, desc=" Style features"): - try: - img = ds[int(idx)]["image"] - if img and isinstance(img, Image.Image): - features.append(extractor(img)) - else: - features.append(None) - except Exception: - features.append(None) - df = pd.DataFrame([f for f in features if f is not None]).fillna(0) - X = df.to_numpy(dtype=np.float64) - return np.where(np.isfinite(X), X, 0), list(df.columns), [i for i, f in enumerate(features) if f is not None] - - -def extract_clip_features(ds, indices) -> np.ndarray: - """Extract CLIP ViT-B/32 embeddings (512-d).""" - from transformers import CLIPProcessor, CLIPModel - - print(" Loading CLIP model...") - model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") - processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model = model.to(device) - - features = [] - valid = [] - batch_size = 32 - - for batch_start in tqdm(range(0, len(indices), batch_size), desc=" CLIP features"): - batch_indices = indices[batch_start:batch_start + batch_size] - images = [] - batch_valid = [] - for i, idx in enumerate(batch_indices): - try: - img = ds[int(idx)]["image"] - if img and isinstance(img, Image.Image): - images.append(img) - batch_valid.append(batch_start + i) - except Exception: - pass - - if not images: - continue - - with torch.no_grad(): - inputs = processor(images=images, return_tensors="pt", padding=True).to(device) - outputs = model.get_image_features(**inputs) - if isinstance(outputs, torch.Tensor): - embeddings = outputs.cpu().numpy() - else: - embeddings = outputs.pooler_output.cpu().numpy() - - features.append(embeddings) - valid.extend(batch_valid) - - X = np.vstack(features) - return X, [f"clip_{i}" for i in range(X.shape[1])], valid - - -def run_cv(X, y, model_type="xgb"): - """5-fold 
CV, return metrics dict.""" - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - all_true, all_prob = [], [] - - for train_idx, test_idx in skf.split(X, y): - X_train, X_test = X[train_idx], X[test_idx] - y_train, y_test = y[train_idx], y[test_idx] - - if model_type == "xgb": - spw = np.sum(y_train == 0) / max(np.sum(y_train == 1), 1) - params = { - "objective": "binary:logistic", "eval_metric": "logloss", - "max_depth": 5, "learning_rate": 0.1, "subsample": 0.8, - "colsample_bytree": 0.8, "scale_pos_weight": spw, "seed": SEED, - } - dtrain = xgb.DMatrix(X_train, label=y_train) - dtest = xgb.DMatrix(X_test, label=y_test) - model = xgb.train(params, dtrain, num_boost_round=300, - evals=[(dtest, "test")], early_stopping_rounds=15, - verbose_eval=False) - y_prob = model.predict(dtest) - elif model_type == "svm": - scaler = StandardScaler() - X_tr = scaler.fit_transform(X_train) - X_te = scaler.transform(X_test) - svm = SVC(kernel="rbf", probability=True, random_state=SEED) - svm.fit(X_tr, y_train) - y_prob = svm.predict_proba(X_te)[:, 1] - elif model_type == "mlp": - scaler = StandardScaler() - X_tr = scaler.fit_transform(X_train) - X_te = scaler.transform(X_test) - mlp = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=1000, - random_state=SEED, early_stopping=True) - mlp.fit(X_tr, y_train) - y_prob = mlp.predict_proba(X_te)[:, 1] - - all_true.extend(y_test) - all_prob.extend(y_prob) - - y_true = np.array(all_true) - y_prob = np.array(all_prob) - y_pred = (y_prob > 0.5).astype(int) - - return { - "accuracy": float(accuracy_score(y_true, y_pred)), - "precision": float(precision_score(y_true, y_pred, zero_division=0)), - "recall": float(recall_score(y_true, y_pred, zero_division=0)), - "f1": float(f1_score(y_true, y_pred, average="macro")), - "roc_auc": float(roc_auc_score(y_true, y_prob)), - "y_true": y_true.tolist(), - "y_prob": y_prob.tolist(), - } - - -def generate_pdf(experiments): - """Generate comparison PDF.""" - 
RESULTS_DIR.mkdir(exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - pdf_path = RESULTS_DIR / f"experiments_comparison_{timestamp}.pdf" - - with PdfPages(str(pdf_path)) as pdf: - # PAGE 1: Title + comparison chart - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - - fig.suptitle("Feature Experiment Comparison\nfor AI Artwork Detection", - fontsize=18, fontweight="bold", fontfamily="serif", y=0.96) - fig.text(0.5, 0.89, f"negate project — darkshapes — {datetime.now().strftime('%B %d, %Y')}", - fontsize=10, ha="center", fontfamily="serif", style="italic") - fig.text(0.5, 0.86, f"Dataset: Hemg AI-Art vs Real-Art | {N_PER_CLASS*2} samples | 5-fold CV", - fontsize=9, ha="center", fontfamily="serif") - - # Grouped bar chart: accuracy by experiment and model - ax = fig.add_axes([0.1, 0.45, 0.8, 0.35]) - - exp_names = [e["name"] for e in experiments] - n_exp = len(exp_names) - x = np.arange(n_exp) - w = 0.25 - - for i, (model, color) in enumerate([("xgb", "#4472C4"), ("svm", "#ED7D31"), ("mlp", "#70AD47")]): - accs = [e["results"][model]["accuracy"] for e in experiments] - bars = ax.bar(x + i * w - w, accs, w, label=model.upper(), color=color) - for bar in bars: - ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005, - f"{bar.get_height():.1%}", ha="center", fontsize=6.5, rotation=45) - - ax.set_xticks(x) - ax.set_xticklabels([e["short_name"] for e in experiments], fontsize=8, rotation=15, ha="right") - ax.set_ylabel("Accuracy", fontsize=10) - ax.set_title("Accuracy by Feature Set and Model", fontsize=12, fontfamily="serif") - ax.legend(fontsize=9) - ax.set_ylim(0.5, 1.0) - ax.grid(axis="y", alpha=0.3) - - # Summary table - ax_table = fig.add_axes([0.05, 0.08, 0.9, 0.3]) - ax_table.axis("off") - - table_data = [] - for e in experiments: - best_model = max(e["results"], key=lambda m: e["results"][m]["accuracy"]) - best = e["results"][best_model] - table_data.append([ - e["short_name"], - str(e["n_features"]), - 
f"{best['accuracy']:.2%}", - f"{best['precision']:.2%}", - f"{best['recall']:.2%}", - f"{best['roc_auc']:.4f}", - best_model.upper(), - e.get("extract_time", "?"), - ]) - - table = ax_table.table( - cellText=table_data, - colLabels=["Features", "Count", "Best Acc", "Prec", "Recall", "AUC", "Model", "Time"], - loc="center", cellLoc="center", - ) - table.auto_set_font_size(False) - table.set_fontsize(7.5) - table.scale(1, 1.5) - for (row, col), cell in table.get_celld().items(): - if row == 0: - cell.set_facecolor("#4472C4") - cell.set_text_props(color="white", fontweight="bold") - - pdf.savefig(fig) - plt.close(fig) - - # PAGE 2: ROC curves - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - fig.suptitle("ROC Curves by Experiment (Best Model)", fontsize=14, - fontweight="bold", fontfamily="serif", y=0.96) - - colors = ["#4472C4", "#ED7D31", "#70AD47", "#FFC000", "#9B59B6"] - ax = fig.add_axes([0.12, 0.5, 0.76, 0.38]) - - for i, e in enumerate(experiments): - best_model = max(e["results"], key=lambda m: e["results"][m]["roc_auc"]) - r = e["results"][best_model] - fpr, tpr, _ = roc_curve(r["y_true"], r["y_prob"]) - ax.plot(fpr, tpr, color=colors[i % len(colors)], linewidth=2, - label=f"{e['short_name']} (AUC={r['roc_auc']:.3f})") - - ax.plot([0, 1], [0, 1], "k--", alpha=0.3) - ax.set_xlabel("False Positive Rate") - ax.set_ylabel("True Positive Rate") - ax.legend(fontsize=8, loc="lower right") - ax.grid(True, alpha=0.3) - - # Analysis text - ax_text = fig.add_axes([0.08, 0.05, 0.84, 0.38]) - ax_text.axis("off") - - # Find best and worst - best_exp = max(experiments, key=lambda e: max(e["results"][m]["accuracy"] for m in e["results"])) - worst_exp = min(experiments, key=lambda e: max(e["results"][m]["accuracy"] for m in e["results"])) - best_acc = max(best_exp["results"][m]["accuracy"] for m in best_exp["results"]) - worst_acc = max(worst_exp["results"][m]["accuracy"] for m in worst_exp["results"]) - - analysis = ( - "Analysis\n\n" - f"Best 
performing: {best_exp['name']} at {best_acc:.1%}\n" - f"Worst performing: {worst_exp['name']} at {worst_acc:.1%}\n" - f"Improvement from best to worst: {(best_acc - worst_acc)*100:+.1f}pp\n\n" - ) - - # Check if CLIP exists - clip_exp = [e for e in experiments if "clip" in e["short_name"].lower()] - art_exp = [e for e in experiments if e["short_name"] == "Artwork (49)"] - - if clip_exp and art_exp: - clip_acc = max(clip_exp[0]["results"][m]["accuracy"] for m in clip_exp[0]["results"]) - art_acc = max(art_exp[0]["results"][m]["accuracy"] for m in art_exp[0]["results"]) - analysis += ( - f"CLIP vs hand-crafted: {clip_acc:.1%} vs {art_acc:.1%} " - f"({(clip_acc - art_acc)*100:+.1f}pp)\n" - ) - if clip_acc > art_acc + 0.03: - analysis += "Learned features significantly outperform hand-crafted features.\n" - elif clip_acc < art_acc - 0.03: - analysis += "Surprisingly, hand-crafted features outperform CLIP on this task.\n" - else: - analysis += "Learned and hand-crafted features perform similarly.\n" - - # Check if combined helps - combined_exp = [e for e in experiments if "+" in e["short_name"]] - if combined_exp: - comb_acc = max(combined_exp[-1]["results"][m]["accuracy"] for m in combined_exp[-1]["results"]) - analysis += ( - f"\nCombined features: {comb_acc:.1%}\n" - ) - if comb_acc > best_acc - 0.01: - analysis += "Combining features achieves the best overall performance.\n" - else: - analysis += "Combining features does not improve over the best individual set.\n" - - analysis += ( - "\nConclusions\n\n" - "This comparison tests whether:\n" - " 1. Style-specific craft features add signal beyond generic statistics\n" - " 2. Learned representations (CLIP) outperform hand-crafted features\n" - " 3. 
Combining multiple feature types improves detection\n\n" - "All experiments use the same dataset (Hemg AI Art vs Real Art),\n" - "same sample size, and same evaluation methodology.\n" - ) - - ax_text.text(0, 1, analysis, transform=ax_text.transAxes, fontsize=9, - ha="left", va="top", fontfamily="serif") - - pdf.savefig(fig) - plt.close(fig) - - print(f"PDF saved to: {pdf_path}") - return pdf_path - - -def main(): - print("=" * 60) - print(" FEATURE EXPERIMENTS COMPARISON") - print(" Dataset: Hemg AI Art vs Real Art") - print(f" Samples: {N_PER_CLASS * 2} ({N_PER_CLASS} per class)") - print("=" * 60) - - ds = load_dataset_cached() - all_labels = ds["label"] - - # Select balanced indices - rng = np.random.RandomState(SEED) - idx_0 = [i for i, l in enumerate(all_labels) if l == 0] - idx_1 = [i for i, l in enumerate(all_labels) if l == 1] - chosen_0 = rng.choice(idx_0, size=N_PER_CLASS, replace=False) - chosen_1 = rng.choice(idx_1, size=N_PER_CLASS, replace=False) - all_indices = np.concatenate([chosen_0, chosen_1]) - # Labels: 0=AI(synthetic), 1=Real(genuine) in dataset - # We want: 0=genuine, 1=synthetic - y = np.array([1] * N_PER_CLASS + [0] * N_PER_CLASS) - - experiments = [] - - # === Experiment 1: Artwork features (49) === - print("\n" + "=" * 50) - print(" Experiment 1: Artwork Features (49)") - print("=" * 50) - t0 = time.time() - X_art, art_names, art_valid = extract_artwork_features(ds, all_indices) - t_art = f"{time.time() - t0:.0f}s" - y_art = y[art_valid] - print(f" {X_art.shape[0]} images, {X_art.shape[1]} features, {t_art}") - - exp1 = {"name": "Artwork Features (Li & Stamp + FFT/DCT)", "short_name": "Artwork (49)", - "n_features": X_art.shape[1], "extract_time": t_art, "results": {}} - for model in ["xgb", "svm", "mlp"]: - print(f" {model.upper()}...") - exp1["results"][model] = run_cv(X_art, y_art, model) - print(f" acc={exp1['results'][model]['accuracy']:.2%}") - experiments.append(exp1) - - # === Experiment 2: Style features (15) === - print("\n" + 
"=" * 50) - print(" Experiment 2: Style Features (15)") - print("=" * 50) - t0 = time.time() - X_style, style_names, style_valid = extract_style_features(ds, all_indices) - t_style = f"{time.time() - t0:.0f}s" - y_style = y[style_valid] - print(f" {X_style.shape[0]} images, {X_style.shape[1]} features, {t_style}") - - exp2 = {"name": "Style Features (stroke/palette/composition/texture)", "short_name": "Style (15)", - "n_features": X_style.shape[1], "extract_time": t_style, "results": {}} - for model in ["xgb", "svm", "mlp"]: - print(f" {model.upper()}...") - exp2["results"][model] = run_cv(X_style, y_style, model) - print(f" acc={exp2['results'][model]['accuracy']:.2%}") - experiments.append(exp2) - - # === Experiment 3: Artwork + Style combined (64) === - print("\n" + "=" * 50) - print(" Experiment 3: Artwork + Style Combined (64)") - print("=" * 50) - # Align valid indices - common_valid = sorted(set(art_valid) & set(style_valid)) - art_mask = [art_valid.index(v) for v in common_valid] - style_mask = [style_valid.index(v) for v in common_valid] - X_combined = np.hstack([X_art[art_mask], X_style[style_mask]]) - y_combined = y[common_valid] - print(f" {X_combined.shape[0]} images, {X_combined.shape[1]} features") - - exp3 = {"name": "Artwork + Style Combined", "short_name": "Art+Style (64)", - "n_features": X_combined.shape[1], "extract_time": "combined", "results": {}} - for model in ["xgb", "svm", "mlp"]: - print(f" {model.upper()}...") - exp3["results"][model] = run_cv(X_combined, y_combined, model) - print(f" acc={exp3['results'][model]['accuracy']:.2%}") - experiments.append(exp3) - - # === Experiment 4: CLIP embeddings (512) === - print("\n" + "=" * 50) - print(" Experiment 4: CLIP ViT-B/32 Embeddings (512)") - print("=" * 50) - t0 = time.time() - X_clip, clip_names, clip_valid = extract_clip_features(ds, all_indices) - t_clip = f"{time.time() - t0:.0f}s" - y_clip = y[clip_valid] - print(f" {X_clip.shape[0]} images, {X_clip.shape[1]} features, {t_clip}") - - 
exp4 = {"name": "CLIP ViT-B/32 Embeddings", "short_name": "CLIP (512)", - "n_features": X_clip.shape[1], "extract_time": t_clip, "results": {}} - for model in ["xgb", "svm", "mlp"]: - print(f" {model.upper()}...") - exp4["results"][model] = run_cv(X_clip, y_clip, model) - print(f" acc={exp4['results'][model]['accuracy']:.2%}") - experiments.append(exp4) - - # === Experiment 5: CLIP + Artwork + Style (all combined) === - print("\n" + "=" * 50) - print(" Experiment 5: CLIP + Artwork + Style (all)") - print("=" * 50) - common_all = sorted(set(art_valid) & set(style_valid) & set(clip_valid)) - art_m = [art_valid.index(v) for v in common_all] - style_m = [style_valid.index(v) for v in common_all] - clip_m = [clip_valid.index(v) for v in common_all] - X_all = np.hstack([X_art[art_m], X_style[style_m], X_clip[clip_m]]) - y_all = y[common_all] - print(f" {X_all.shape[0]} images, {X_all.shape[1]} features") - - exp5 = {"name": "CLIP + Artwork + Style (Everything)", "short_name": "All Combined", - "n_features": X_all.shape[1], "extract_time": "combined", "results": {}} - for model in ["xgb", "svm", "mlp"]: - print(f" {model.upper()}...") - exp5["results"][model] = run_cv(X_all, y_all, model) - print(f" acc={exp5['results'][model]['accuracy']:.2%}") - experiments.append(exp5) - - # Save results (without y_true/y_prob arrays for JSON) - json_results = [] - for e in experiments: - je = {k: v for k, v in e.items() if k != "results"} - je["results"] = {} - for m, r in e["results"].items(): - je["results"][m] = {k: v for k, v in r.items() if k not in ("y_true", "y_prob")} - json_results.append(je) - - RESULTS_DIR.mkdir(exist_ok=True) - json_path = RESULTS_DIR / "experiments_results.json" - with open(json_path, "w") as f: - json.dump({"timestamp": datetime.now().isoformat(), "experiments": json_results}, f, indent=2) - print(f"\nJSON saved to: {json_path}") - - # Generate PDF - print("\nGenerating comparison PDF...") - generate_pdf(experiments) - - # Final summary - 
print(f"\n{'='*60}") - print(" FINAL SUMMARY") - print(f"{'='*60}") - for e in experiments: - best_model = max(e["results"], key=lambda m: e["results"][m]["accuracy"]) - best = e["results"][best_model] - print(f" {e['short_name']:20s} acc={best['accuracy']:.2%} auc={best['roc_auc']:.4f} ({best_model})") - - -if __name__ == "__main__": - main() diff --git a/tests/test_fair_evaluation.py b/tests/test_fair_evaluation.py deleted file mode 100644 index e3d1381..0000000 --- a/tests/test_fair_evaluation.py +++ /dev/null @@ -1,295 +0,0 @@ -# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 -"""Fair evaluation: test artwork features on datasets where both classes are art. - -Addresses the confound that previous benchmarks used different subject matter -(cats vs bananas, wikiart vs generated), which inflates accuracy. - -Datasets: - 1. Hemg/AI-Generated-vs-Real-Images-Datasets — 153K, "AiArtData" vs "RealArt" - 2. Parveshiiii/AI-vs-Real — 14K balanced binary - -We sample N images from each class, extract 49 features, run 5-fold CV, -and report accuracy/precision/recall/F1/AUC with confidence intervals. 
-""" - -from __future__ import annotations - -import json -import sys -from datetime import datetime -from pathlib import Path - -import numpy as np -import pandas as pd -import xgboost as xgb -from datasets import load_dataset, Image as HFImage -from PIL import Image -from sklearn.metrics import ( - accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, - confusion_matrix, roc_curve, -) -from sklearn.model_selection import StratifiedKFold -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC -from tqdm import tqdm - -sys.path.insert(0, str(Path(__file__).parent.parent)) -from negate.extract.feature_artwork import ArtworkExtract - -SEED = 42 -N_FOLDS = 5 -RESULTS_DIR = Path(__file__).parent.parent / "results" - - -def extract_all_features(dataset, label_col: str, n_samples: int = 200): - """Extract features from a dataset, balanced per class.""" - extractor = ArtworkExtract() - features, labels, errors = [], [], 0 - - # Get unique labels and sample equally - all_labels = dataset[label_col] - unique_labels = sorted(set(all_labels)) - print(f" Labels found: {unique_labels}") - - per_class = n_samples // len(unique_labels) - - for lbl in unique_labels: - indices = [i for i, l in enumerate(all_labels) if l == lbl] - rng = np.random.RandomState(SEED) - chosen = rng.choice(indices, size=min(per_class, len(indices)), replace=False) - - for idx in tqdm(chosen, desc=f" Class {lbl}"): - try: - img = dataset[int(idx)]["image"] - if img is None: - errors += 1 - continue - if not isinstance(img, Image.Image): - errors += 1 - continue - feat = extractor(img) - features.append(feat) - # Binary: 0 = real/genuine, 1 = AI/synthetic - labels.append(0 if lbl == max(unique_labels) else 1) - except Exception as e: - errors += 1 - - print(f" Extracted {len(features)} images ({errors} errors)") - - df = pd.DataFrame(features).fillna(0) - X = df.to_numpy(dtype=np.float64) - X = 
np.where(np.isfinite(X), X, 0) - y = np.array(labels) - - return X, y, list(df.columns) - - -def cross_validate_xgb(X, y): - """5-fold CV with XGBoost.""" - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - all_true, all_prob = [], [] - fold_results = [] - - for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)): - X_train, X_test = X[train_idx], X[test_idx] - y_train, y_test = y[train_idx], y[test_idx] - - spw = np.sum(y_train == 0) / max(np.sum(y_train == 1), 1) - params = { - "objective": "binary:logistic", - "eval_metric": ["logloss", "aucpr"], - "max_depth": 4, - "learning_rate": 0.1, - "subsample": 0.8, - "colsample_bytree": 0.8, - "scale_pos_weight": spw, - "seed": SEED, - } - dtrain = xgb.DMatrix(X_train, label=y_train) - dtest = xgb.DMatrix(X_test, label=y_test) - model = xgb.train(params, dtrain, num_boost_round=200, - evals=[(dtest, "test")], early_stopping_rounds=10, - verbose_eval=False) - - y_prob = model.predict(dtest) - y_pred = (y_prob > 0.5).astype(int) - - fold_results.append({ - "fold": fold + 1, - "accuracy": float(accuracy_score(y_test, y_pred)), - "precision": float(precision_score(y_test, y_pred, zero_division=0)), - "recall": float(recall_score(y_test, y_pred, zero_division=0)), - "f1": float(f1_score(y_test, y_pred, average="macro")), - "roc_auc": float(roc_auc_score(y_test, y_prob)), - }) - all_true.extend(y_test) - all_prob.extend(y_prob) - - return fold_results, np.array(all_true), np.array(all_prob) - - -def cross_validate_svm(X, y): - """5-fold CV with SVM.""" - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - all_true, all_prob = [], [] - - scaler = StandardScaler() - - for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)): - X_train = scaler.fit_transform(X[train_idx]) - X_test = scaler.transform(X[test_idx]) - y_train, y_test = y[train_idx], y[test_idx] - - svm = SVC(kernel="rbf", probability=True, random_state=SEED) - svm.fit(X_train, y_train) - y_prob = 
svm.predict_proba(X_test)[:, 1] - all_true.extend(y_test) - all_prob.extend(y_prob) - - return np.array(all_true), np.array(all_prob) - - -def cross_validate_mlp(X, y): - """5-fold CV with MLP.""" - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - all_true, all_prob = [], [] - scaler = StandardScaler() - - for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)): - X_train = scaler.fit_transform(X[train_idx]) - X_test = scaler.transform(X[test_idx]) - y_train, y_test = y[train_idx], y[test_idx] - - mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=SEED) - mlp.fit(X_train, y_train) - y_prob = mlp.predict_proba(X_test)[:, 1] - all_true.extend(y_test) - all_prob.extend(y_prob) - - return np.array(all_true), np.array(all_prob) - - -def summarize(name, fold_results, y_true, y_prob): - """Print summary for a classifier.""" - y_pred = (y_prob > 0.5).astype(int) - accs = [r["accuracy"] for r in fold_results] - print(f"\n{'='*50}") - print(f" {name}") - print(f"{'='*50}") - for r in fold_results: - print(f" Fold {r['fold']}: acc={r['accuracy']:.2%} prec={r['precision']:.2%} " - f"rec={r['recall']:.2%} auc={r['roc_auc']:.4f}") - print(f" ---") - print(f" Mean acc: {np.mean(accs):.2%} +/- {np.std(accs):.2%}") - print(f" Pooled: acc={accuracy_score(y_true, y_pred):.2%} " - f"prec={precision_score(y_true, y_pred, zero_division=0):.2%} " - f"rec={recall_score(y_true, y_pred, zero_division=0):.2%} " - f"auc={roc_auc_score(y_true, y_prob):.4f}") - cm = confusion_matrix(y_true, y_pred) - print(f" Confusion: TN={cm[0,0]} FP={cm[0,1]} FN={cm[1,0]} TP={cm[1,1]}") - - -def evaluate_dataset(name: str, repo: str, label_col: str, n_samples: int, split: str = "train"): - """Full evaluation pipeline for one dataset.""" - print(f"\n{'#'*60}") - print(f" DATASET: {name}") - print(f" Repo: {repo}") - print(f" Sampling: {n_samples} images ({n_samples//2} per class)") - print(f"{'#'*60}") - - print(f"\nLoading dataset...") - ds = 
load_dataset(repo, split=split) - ds = ds.cast_column("image", HFImage(decode=True, mode="RGB")) - print(f" Total rows: {len(ds)}") - - X, y, feature_names = extract_all_features(ds, label_col, n_samples) - print(f" Class balance: {np.sum(y==0)} real, {np.sum(y==1)} synthetic") - print(f" Features: {X.shape[1]}") - - # XGBoost - print(f"\nRunning {N_FOLDS}-fold CV (XGBoost)...") - xgb_folds, xgb_true, xgb_prob = cross_validate_xgb(X, y) - summarize(f"XGBoost on {name}", xgb_folds, xgb_true, xgb_prob) - - # SVM - print(f"\nRunning {N_FOLDS}-fold CV (SVM)...") - svm_true, svm_prob = cross_validate_svm(X, y) - svm_pred = (svm_prob > 0.5).astype(int) - print(f" SVM pooled: acc={accuracy_score(svm_true, svm_pred):.2%} " - f"auc={roc_auc_score(svm_true, svm_prob):.4f}") - - # MLP - print(f"\nRunning {N_FOLDS}-fold CV (MLP)...") - mlp_true, mlp_prob = cross_validate_mlp(X, y) - mlp_pred = (mlp_prob > 0.5).astype(int) - print(f" MLP pooled: acc={accuracy_score(mlp_true, mlp_pred):.2%} " - f"auc={roc_auc_score(mlp_true, mlp_prob):.4f}") - - return { - "dataset": name, - "repo": repo, - "n_samples": int(np.sum(y >= 0)), - "n_features": X.shape[1], - "xgb_folds": xgb_folds, - "xgb_accuracy": float(accuracy_score(xgb_true, (xgb_prob > 0.5).astype(int))), - "xgb_auc": float(roc_auc_score(xgb_true, xgb_prob)), - "xgb_precision": float(precision_score(xgb_true, (xgb_prob > 0.5).astype(int), zero_division=0)), - "xgb_recall": float(recall_score(xgb_true, (xgb_prob > 0.5).astype(int), zero_division=0)), - "svm_accuracy": float(accuracy_score(svm_true, svm_pred)), - "svm_auc": float(roc_auc_score(svm_true, svm_prob)), - "mlp_accuracy": float(accuracy_score(mlp_true, mlp_pred)), - "mlp_auc": float(roc_auc_score(mlp_true, mlp_prob)), - "feature_names": feature_names, - } - - -def main(): - print("=" * 60) - print(" FAIR EVALUATION: 49-Feature Artwork Detection") - print(" Testing on semantically-similar datasets") - print("=" * 60) - - results = [] - - # Dataset 1: Hemg — both classes 
are art - results.append(evaluate_dataset( - name="AI-Art vs Real-Art (Hemg)", - repo="Hemg/AI-Generated-vs-Real-Images-Datasets", - label_col="label", - n_samples=400, - )) - - # Dataset 2: Parveshiiii — balanced binary - results.append(evaluate_dataset( - name="AI vs Real (Parveshiiii)", - repo="Parveshiiii/AI-vs-Real", - label_col="binary_label", - n_samples=400, - )) - - # Save results - RESULTS_DIR.mkdir(exist_ok=True) - out_path = RESULTS_DIR / "fair_evaluation_results.json" - with open(out_path, "w") as f: - json.dump({ - "timestamp": datetime.now().isoformat(), - "evaluation": "fair_semantically_similar", - "datasets": results, - }, f, indent=2, default=str) - - print(f"\n{'='*60}") - print(f" RESULTS SUMMARY") - print(f"{'='*60}") - for r in results: - print(f"\n {r['dataset']}:") - print(f" XGBoost: acc={r['xgb_accuracy']:.2%} auc={r['xgb_auc']:.4f} " - f"prec={r['xgb_precision']:.2%} rec={r['xgb_recall']:.2%}") - print(f" SVM: acc={r['svm_accuracy']:.2%} auc={r['svm_auc']:.4f}") - print(f" MLP: acc={r['mlp_accuracy']:.2%} auc={r['mlp_auc']:.4f}") - - print(f"\nResults saved to: {out_path}") - - -if __name__ == "__main__": - main() diff --git a/tests/test_hybrid_modern.py b/tests/test_hybrid_modern.py new file mode 100644 index 0000000..32b09a4 --- /dev/null +++ b/tests/test_hybrid_modern.py @@ -0,0 +1,324 @@ +"""Hybrid detector: 148 handcrafted + 768 ConvNeXt, max training diversity. + +Extracts features INCREMENTALLY to avoid OOM from holding all images in memory. 
+""" + +from __future__ import annotations + +import gc +import json +import os +import warnings +from pathlib import Path + +os.environ["HF_HOME"] = "D:/Projects/negate/negate/.cache/huggingface" +os.environ["HF_DATASETS_CACHE"] = "D:/Projects/negate/negate/.cache/huggingface/datasets" +# Set HF_TOKEN env var or run: huggingface-cli login + +import lightgbm as lgb +import numpy as np +import torch +from PIL import Image +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score, roc_auc_score +from sklearn.model_selection import StratifiedKFold +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from tqdm import tqdm + +warnings.filterwarnings("ignore") + +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) +from negate.extract.feature_artwork import ArtworkExtract +from negate.extract.feature_learned import LearnedExtract + +art_ext = ArtworkExtract() +learned_ext = LearnedExtract() + +BASE = Path("D:/Projects/negate/negate/.datasets/imaginet/extracted") +CIVITAI = Path("D:/Projects/negate/negate/.datasets/civitai") +SEED = 42 +rng = np.random.RandomState(SEED) +W_LGBM, W_RF, W_SVM = 0.4, 0.1, 0.5 +HIGH_T, LOW_T = 0.80, 0.20 + + +def load_paths(path, recursive, max_n): + exts = {".jpg", ".jpeg", ".png", ".webp", ".bmp"} + files = list(path.rglob("*") if recursive else path.iterdir()) + files = [f for f in files if f.suffix.lower() in exts] + if len(files) > max_n: + files = list(rng.choice(files, max_n, replace=False)) + return files + + +def extract_one(img): + """Extract 916 features from one PIL image.""" + try: + hc = list(art_ext(img).values()) + except Exception: + hc = [0.0] * 148 + try: + with torch.no_grad(): + ln = list(learned_ext(img).values()) + except Exception: + ln = [0.0] * 768 + return hc + ln + + +def extract_from_paths(files, desc=""): + """Extract features from file paths, one at a time (no memory buildup).""" + rows = [] + for f in tqdm(files, desc=desc): + try: 
+ img = Image.open(f).convert("RGB") + rows.append(extract_one(img)) + del img + except Exception: + rows.append([0.0] * 916) + X = np.array(rows, dtype=np.float64) + return np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + + +def extract_from_stream(repo, max_n, desc=""): + """Stream from HF, extract features one at a time.""" + from datasets import load_dataset + ds = load_dataset(repo, split="train", streaming=True) + rows = [] + for i, s in enumerate(tqdm(ds, desc=desc, total=max_n)): + if i >= max_n: + break + try: + img = s["image"].convert("RGB") + rows.append(extract_one(img)) + del img + except Exception: + rows.append([0.0] * 916) + if not rows: + return np.empty((0, 916)) + X = np.array(rows, dtype=np.float64) + return np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + + +def extract_from_imgs(images, desc=""): + rows = [] + for img in tqdm(images, desc=desc): + rows.append(extract_one(img)) + X = np.array(rows, dtype=np.float64) + return np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + + +def main(): + print("=" * 70, flush=True) + print(" HYBRID DETECTOR: 916 features, max diversity training", flush=True) + print("=" * 70, flush=True) + + feature_chunks_real = [] + feature_chunks_fake = [] + + # === REAL: WikiArt === + print("\n--- REAL: WikiArt (2000) ---", flush=True) + files = load_paths(BASE / "wikiart", recursive=True, max_n=2000) + X = extract_from_paths(files, "WikiArt") + feature_chunks_real.append(X) + print(f" Got {len(X)}", flush=True) + gc.collect() + + # === REAL: tellif === + print("\n--- REAL: tellif (122) ---", flush=True) + from datasets import load_dataset + ds_tellif = load_dataset("tellif/ai_vs_real_image_semantically_similar", split="test") + tellif_real_imgs = [ds_tellif[i]["image"].convert("RGB") for i in range(len(ds_tellif)) if ds_tellif[i]["label"] == 8] + X = extract_from_imgs(tellif_real_imgs, "tellif real") + feature_chunks_real.append(X) + print(f" Got {len(X)}", flush=True) + del tellif_real_imgs + gc.collect() + + 
# === REAL: Hemg (stream) ===
+    print("\n--- REAL: Hemg (1000) ---", flush=True)
+    X = extract_from_stream("Hemg/AI-Generated-vs-Real-Images-Datasets", max_n=2500, desc="Hemg real")
+    # NOTE(review): extract_from_stream cannot filter by label, so the X above is
+    # discarded (wasted extraction); it is rebuilt below from a manual label==1 (real) stream.
+    ds_hemg = load_dataset("Hemg/AI-Generated-vs-Real-Images-Datasets", split="train", streaming=True)
+    rows = []
+    count = 0
+    for s in tqdm(ds_hemg, desc="Hemg real", total=3000):
+        if s["label"] == 1:  # real
+            try:
+                rows.append(extract_one(s["image"].convert("RGB")))
+                count += 1
+            except Exception:
+                pass
+        if count >= 1000:
+            break
+    X = np.nan_to_num(np.array(rows, dtype=np.float64), nan=0.0, posinf=0.0, neginf=0.0) if rows else np.empty((0, 916))
+    feature_chunks_real.append(X)
+    print(f" Got {len(X)}", flush=True)
+    gc.collect()
+
+    # === FAKE: ImagiNet old ===
+    print("\n--- FAKE: ImagiNet old (1000) ---", flush=True)
+    for src in ["sdxl_paintings_fake", "sd_paintings_fake", "dalle3", "journeydb", "animaginexl_paintings_fake"]:
+        files = load_paths(BASE / src, recursive=False, max_n=200)
+        X = extract_from_paths(files, f"ImagiNet {src}")
+        feature_chunks_fake.append(X)
+        print(f" {src}: {len(X)}", flush=True)
+        gc.collect()
+
+    # === FAKE: Modern HF datasets ===
+    print("\n--- FAKE: Modern generators ---", flush=True)
+    for repo, n, name in [
+        ("ash12321/seedream-4.5-generated-2k", 500, "Seedream 4.5"),
+        ("exdysa/nano-banana-pro-generated-1k-clone", 500, "Nano Banana Pro"),
+        ("LukasT9/Flux-1-Dev-Images-1k", 500, "Flux Dev"),
+        ("LukasT9/Flux-1-Schnell-Images-1k", 500, "Flux Schnell"),
+    ]:
+        X = extract_from_stream(repo, max_n=n, desc=name)
+        feature_chunks_fake.append(X)
+        print(f" {name}: {len(X)}", flush=True)
+        gc.collect()
+
+    # === FAKE: CivitAI ===
+    print("\n--- FAKE: CivitAI ---", flush=True)
+    for gen_dir in ["flux", "sdxl", "pony", "illustrious"]:
+        p = CIVITAI / gen_dir
+        if p.exists() and any(p.iterdir()):
+            
files = load_paths(p, recursive=False, max_n=500) + X = extract_from_paths(files, f"CivitAI {gen_dir}") + feature_chunks_fake.append(X) + print(f" CivitAI {gen_dir}: {len(X)}", flush=True) + gc.collect() + + # === FAKE: Hemg AI === + print("\n--- FAKE: Hemg AI (1000) ---", flush=True) + ds_hemg2 = load_dataset("Hemg/AI-Generated-vs-Real-Images-Datasets", split="train", streaming=True) + rows = [] + count = 0 + for s in tqdm(ds_hemg2, desc="Hemg AI", total=1500): + if s["label"] == 0: # AI + try: + rows.append(extract_one(s["image"].convert("RGB"))) + count += 1 + except Exception: + pass + if count >= 1000: + break + X = np.nan_to_num(np.array(rows, dtype=np.float64), nan=0.0, posinf=0.0, neginf=0.0) if rows else np.empty((0, 916)) + feature_chunks_fake.append(X) + print(f" Hemg AI: {len(X)}", flush=True) + gc.collect() + + # === Combine === + X_real = np.vstack([c for c in feature_chunks_real if len(c) > 0]) + X_fake = np.vstack([c for c in feature_chunks_fake if len(c) > 0]) + del feature_chunks_real, feature_chunks_fake + gc.collect() + + n = min(len(X_real), len(X_fake)) + X_train = np.vstack([X_real[:n], X_fake[:n]]) + y_train = np.concatenate([np.zeros(n), np.ones(n)]) + print(f"\n TOTAL: {len(X_train)} train ({n}/class), {X_train.shape[1]} features", flush=True) + print(f" Real sources: WikiArt + tellif + Hemg", flush=True) + print(f" Fake sources: ImagiNet + Seedream + Nano Banana + Flux + CivitAI + Hemg AI", flush=True) + + # === 5-fold CV === + print("\n--- 5-fold CV ---", flush=True) + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED) + accs, aucs = [], [] + for fold, (tr, te) in enumerate(skf.split(X_train, y_train)): + m = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED) + m.fit(X_train[tr], y_train[tr]) + p = m.predict_proba(X_train[te])[:, 1] + acc = accuracy_score(y_train[te], (p > 0.5).astype(int)) + auc = roc_auc_score(y_train[te], p) + 
accs.append(acc); aucs.append(auc) + print(f" Fold {fold+1}: acc={acc:.4f} auc={auc:.4f}", flush=True) + print(f" Mean: acc={np.mean(accs):.4f} auc={np.mean(aucs):.4f}", flush=True) + + # === Train ensemble === + print("\n--- Training ensemble ---", flush=True) + lgbm = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED) + lgbm.fit(X_train, y_train) + rf = RandomForestClassifier(n_estimators=300, max_depth=12, n_jobs=1, random_state=SEED) + rf.fit(X_train, y_train) + scaler = StandardScaler() + svm = SVC(kernel="rbf", probability=True, random_state=SEED) + svm.fit(scaler.fit_transform(X_train), y_train) + + def predict(X): + p = (W_LGBM * lgbm.predict_proba(X)[:, 1] + + W_RF * rf.predict_proba(X)[:, 1] + + W_SVM * svm.predict_proba(scaler.transform(X))[:, 1]) + pred = np.full(len(X), -1) + pred[p >= HIGH_T] = 1 + pred[p < LOW_T] = 0 + return pred, p + + # === TEST: tellif SOTA === + print("\n" + "=" * 70, flush=True) + print(" TEST: tellif 2025 SOTA (HYBRID 916 features)", flush=True) + print("=" * 70, flush=True) + + label_names = ds_tellif.features["label"].names + results = {} + + for i, gen_name in enumerate(label_names): + indices = [j for j in range(len(ds_tellif)) if ds_tellif[j]["label"] == i] + if not indices: + continue + images = [ds_tellif[j]["image"].convert("RGB") for j in indices] + X_gen = extract_from_imgs(images, gen_name) + pred, prob = predict(X_gen) + + if gen_name == "real": + fp = (pred == 1).sum() + correct = (pred == 0).sum() + uncertain = (pred == -1).sum() + fp_rate = fp / len(X_gen) + print(f" {gen_name:45s} n={len(X_gen):3d} GENUINE={correct} UNCERTAIN={uncertain} FP={fp} FP_rate={fp_rate:.1%}", flush=True) + results[gen_name] = {"n": len(X_gen), "fp": int(fp), "fp_rate": float(fp_rate)} + else: + detected = (pred == 1).sum() + uncertain = (pred == -1).sum() + missed = (pred == 0).sum() + rate = detected / len(X_gen) + print(f" {gen_name:45s} n={len(X_gen):3d} 
DETECTED={detected} UNCERTAIN={uncertain} MISSED={missed} rate={rate:.1%}", flush=True) + results[gen_name] = {"n": len(X_gen), "detected": int(detected), "uncertain": int(uncertain), "missed": int(missed), "rate": float(rate)} + del images + gc.collect() + + # === Feature importance === + importances = lgbm.feature_importances_ + hc_imp = importances[:148].sum() + ln_imp = importances[148:].sum() + total_imp = importances.sum() + print(f"\n Feature importance: HC={hc_imp/total_imp*100:.1f}% / Learned={ln_imp/total_imp*100:.1f}%", flush=True) + + all_names = list(art_ext(Image.new("RGB", (64, 64), "red")).keys()) + learned_ext.feature_names() + sorted_idx = np.argsort(importances)[::-1] + print(f" Top 10:", flush=True) + for r in range(10): + idx = sorted_idx[r] + src = "HC" if idx < 148 else "CNN" + name = all_names[idx] if idx < len(all_names) else f"f{idx}" + print(f" {r+1:2d}. [{src}] {name:35s} {importances[idx]}", flush=True) + + # Save + out = Path(__file__).parent.parent / "results" / "hybrid_modern_results.json" + with open(out, "w") as f: + json.dump({ + "training": {"n_per_class": int(n), "n_features": 916, + "cv_acc": float(np.mean(accs)), "cv_auc": float(np.mean(aucs))}, + "tellif": results, + "importance": {"handcrafted_pct": float(hc_imp/total_imp), "learned_pct": float(ln_imp/total_imp)}, + }, f, indent=2) + print(f"\nSaved to {out}", flush=True) + print("DONE", flush=True) + + +if __name__ == "__main__": + main() diff --git a/tests/test_imaginet_paintings.py b/tests/test_imaginet_paintings.py new file mode 100644 index 0000000..f2bcc34 --- /dev/null +++ b/tests/test_imaginet_paintings.py @@ -0,0 +1,186 @@ +"""ImagiNet Paintings Benchmark: per-generator accuracy on labeled art data. + +Tests the 148-feature extractor against 6 different AI generators using +real WikiArt paintings as the genuine class. This is the definitive test +for art-specific detection with known generators. 
+""" + +from __future__ import annotations + +import json +import warnings +from pathlib import Path + +import lightgbm as lgb +import numpy as np +from PIL import Image +from sklearn.metrics import ( + accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, +) +from sklearn.model_selection import StratifiedKFold +from tqdm import tqdm + +warnings.filterwarnings("ignore") + +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) +from negate.extract.feature_artwork import ArtworkExtract + +ext = ArtworkExtract() +BASE = Path("D:/Projects/negate/negate/.datasets/imaginet/extracted") +SEED = 42 +MAX_PER_CLASS = 500 +rng = np.random.RandomState(SEED) + +GENERATORS = { + "wikiart": {"path": BASE / "wikiart", "label": 0, "recursive": True}, + "AnimagineXL": {"path": BASE / "animaginexl_paintings_fake", "label": 1, "recursive": False}, + "SD": {"path": BASE / "sd_paintings_fake", "label": 1, "recursive": False}, + "SDXL": {"path": BASE / "sdxl_paintings_fake", "label": 1, "recursive": False}, + "StyleGAN": {"path": BASE / "wikiart_stylegan", "label": 1, "recursive": True}, + "Midjourney": {"path": BASE / "journeydb", "label": 1, "recursive": False}, + "DALL-E_3": {"path": BASE / "dalle3", "label": 1, "recursive": False}, +} + + +def load_image_paths(path: Path, recursive: bool, max_n: int) -> list[Path]: + exts = {".jpg", ".jpeg", ".png", ".webp", ".bmp"} + if recursive: + files = [f for f in path.rglob("*") if f.suffix.lower() in exts] + else: + files = [f for f in path.iterdir() if f.suffix.lower() in exts] + if len(files) > max_n: + files = list(rng.choice(files, max_n, replace=False)) + return files + + +def extract_features(files: list[Path], name: str) -> np.ndarray: + rows = [] + for f in tqdm(files, desc=name): + try: + img = Image.open(f).convert("RGB") + feat = ext(img) + rows.append(list(feat.values())) + except Exception: + rows.append([0.0] * 148) + X = np.array(rows, dtype=np.float64) + return np.nan_to_num(X, nan=0.0, posinf=0.0, 
neginf=0.0) + + +def cv_evaluate(X, y, n_splits=5): + skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED) + metrics = {"acc": [], "auc": [], "f1": [], "prec": [], "rec": []} + for tr, te in skf.split(X, y): + model = lgb.LGBMClassifier( + n_estimators=500, learning_rate=0.05, max_depth=8, + num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED, + ) + model.fit(X[tr], y[tr]) + yp = model.predict_proba(X[te])[:, 1] + ypred = (yp > 0.5).astype(int) + metrics["acc"].append(accuracy_score(y[te], ypred)) + metrics["auc"].append(roc_auc_score(y[te], yp)) + metrics["f1"].append(f1_score(y[te], ypred, average="macro")) + metrics["prec"].append(precision_score(y[te], ypred)) + metrics["rec"].append(recall_score(y[te], ypred)) + return {k: float(np.mean(v)) for k, v in metrics.items()} + + +def main(): + print("=" * 60) + print(" ImagiNet PAINTINGS Benchmark") + print(" 148 features, LightGBM, per-generator accuracy") + print("=" * 60) + + # Load and extract features + all_features = {} + for name, info in GENERATORS.items(): + files = load_image_paths(info["path"], info["recursive"], MAX_PER_CLASS) + print(f"\n{name}: {len(files)} images (label={info['label']})") + all_features[name] = extract_features(files, name) + print(f" Shape: {all_features[name].shape}") + + X_real = all_features["wikiart"] + + # --- EXP 1: Per-generator --- + print("\n" + "=" * 60) + print(" EXP 1: Per-Generator (Real Art vs Each Generator)") + print("=" * 60) + + results_per_gen = {} + for gen in ["AnimagineXL", "SD", "SDXL", "StyleGAN", "Midjourney", "DALL-E_3"]: + X_fake = all_features[gen] + n = min(len(X_real), len(X_fake)) + X = np.vstack([X_real[:n], X_fake[:n]]) + y = np.concatenate([np.zeros(n), np.ones(n)]) + r = cv_evaluate(X, y) + results_per_gen[gen] = r + print(f" {gen:15s} acc={r['acc']:.4f} auc={r['auc']:.4f} f1={r['f1']:.4f} prec={r['prec']:.4f} rec={r['rec']:.4f}") + + # --- EXP 2: Pooled --- + print("\n" + "=" * 60) + print(" EXP 2: Pooled (All Generators 
Mixed)")
+    print("=" * 60)
+
+    fake_list = ["AnimagineXL", "SD", "SDXL", "StyleGAN", "Midjourney", "DALL-E_3"]
+    X_all_fake = np.vstack([all_features[g] for g in fake_list])
+    n_fake = len(X_all_fake)
+    n_real = len(X_real)
+    if n_real < n_fake:
+        idx = rng.choice(n_fake, n_real, replace=False)
+        X_all_fake = X_all_fake[idx]
+    else:
+        idx = rng.choice(n_real, n_fake, replace=False)
+        X_real_pooled = X_real[idx]  # NOTE(review): dead variable — X_pooled below uses X_real[:n], not this subsample; confirm intent
+
+    n = min(len(X_real), len(X_all_fake))
+    X_pooled = np.vstack([X_real[:n], X_all_fake[:n]])
+    y_pooled = np.concatenate([np.zeros(n), np.ones(n)])
+    r = cv_evaluate(X_pooled, y_pooled)
+    print(f" Pooled: acc={r['acc']:.4f} auc={r['auc']:.4f} f1={r['f1']:.4f} prec={r['prec']:.4f} rec={r['rec']:.4f}")
+
+    # --- EXP 3: Leave-one-generator-out ---
+    print("\n" + "=" * 60)
+    print(" EXP 3: Leave-One-Generator-Out (Generalization)")
+    print("=" * 60)
+
+    results_logo = {}
+    for held_out in fake_list:
+        train_gens = [g for g in fake_list if g != held_out]
+        X_train_fake = np.vstack([all_features[g] for g in train_gens])
+        n_tf = len(X_train_fake)
+        X_train = np.vstack([X_real[:n_tf], X_train_fake])
+        y_train = np.concatenate([np.zeros(min(len(X_real), n_tf)), np.ones(n_tf)])
+
+        X_test_fake = all_features[held_out]
+        n_te = len(X_test_fake)
+        X_test = np.vstack([X_real[:n_te], X_test_fake])
+        y_test = np.concatenate([np.zeros(n_te), np.ones(n_te)])
+
+        model = lgb.LGBMClassifier(
+            n_estimators=500, learning_rate=0.05, max_depth=8,
+            num_leaves=63, n_jobs=1, verbose=-1, random_state=SEED,
+        )
+        model.fit(X_train, y_train)
+        yp = model.predict_proba(X_test)[:, 1]
+        ypred = (yp > 0.5).astype(int)
+        acc = accuracy_score(y_test, ypred)
+        auc = roc_auc_score(y_test, yp)
+        f1 = f1_score(y_test, ypred, average="macro")
+        results_logo[held_out] = {"acc": acc, "auc": auc, "f1": f1}
+        print(f" Hold out {held_out:15s}: acc={acc:.4f} auc={auc:.4f} f1={f1:.4f}")
+
+    # Save results
+    results = {
+        "per_generator": results_per_gen,
+        "pooled": r,
+        "leave_one_out": {k: {kk: 
float(vv) for kk, vv in v.items()} for k, v in results_logo.items()}, + } + out_path = Path(__file__).parent.parent / "results" / "imaginet_paintings_results.json" + with open(out_path, "w") as f: + json.dump(results, f, indent=2) + print(f"\nResults saved to {out_path}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_multidataset_benchmark.py b/tests/test_multidataset_benchmark.py new file mode 100644 index 0000000..3e33818 --- /dev/null +++ b/tests/test_multidataset_benchmark.py @@ -0,0 +1,581 @@ +"""Multi-dataset benchmark for artwork feature extraction + LightGBM classification.""" + +from __future__ import annotations + +import os + +os.environ["HF_HOME"] = "D:/Projects/negate/negate/.cache/huggingface" +os.environ["HF_DATASETS_CACHE"] = "D:/Projects/negate/negate/.cache/huggingface/datasets" + +import json +import time +import warnings +from pathlib import Path + +import numpy as np +from numpy.typing import NDArray +from sklearn.metrics import accuracy_score, roc_auc_score +from sklearn.model_selection import StratifiedKFold +from tqdm import tqdm + +warnings.filterwarnings("ignore") + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def extract_features_batch(images: list, extractor) -> NDArray: + """Extract features from a list of PIL images, return (N, D) array.""" + rows = [] + for img in tqdm(images, desc="Extracting features", leave=False): + try: + feats = extractor(img) + rows.append(list(feats.values())) + except Exception: + rows.append([0.0] * 148) + X = np.array(rows, dtype=np.float64) + X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + return X + + +def train_and_evaluate(X_train: NDArray, y_train: NDArray, X_test: NDArray, y_test: NDArray) -> dict: + """Train LightGBM and return accuracy + AUC.""" + import lightgbm as lgb + + model = lgb.LGBMClassifier( + n_estimators=300, + learning_rate=0.05, + 
max_depth=6, + num_leaves=31, + n_jobs=1, + verbose=-1, + random_state=42, + ) + model.fit(X_train, y_train) + y_pred = model.predict(X_test) + y_prob = model.predict_proba(X_test)[:, 1] + acc = accuracy_score(y_test, y_pred) + try: + auc = roc_auc_score(y_test, y_prob) + except ValueError: + auc = float("nan") + return {"accuracy": round(acc, 4), "auc": round(auc, 4)} + + +def cross_validate(X: NDArray, y: NDArray, n_splits: int = 5) -> dict: + """5-fold stratified CV, return pooled accuracy + AUC.""" + import lightgbm as lgb + + skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) + all_y_true, all_y_pred, all_y_prob = [], [], [] + for train_idx, test_idx in skf.split(X, y): + model = lgb.LGBMClassifier( + n_estimators=300, learning_rate=0.05, max_depth=6, + num_leaves=31, n_jobs=1, verbose=-1, random_state=42, + ) + model.fit(X[train_idx], y[train_idx]) + preds = model.predict(X[test_idx]) + probs = model.predict_proba(X[test_idx])[:, 1] + all_y_true.extend(y[test_idx]) + all_y_pred.extend(preds) + all_y_prob.extend(probs) + acc = accuracy_score(all_y_true, all_y_pred) + try: + auc = roc_auc_score(all_y_true, all_y_prob) + except ValueError: + auc = float("nan") + return {"accuracy": round(acc, 4), "auc": round(auc, 4)} + + +def sample_balanced(images: list, labels: list, max_per_class: int) -> tuple[list, list]: + """Subsample up to max_per_class per unique label value.""" + from collections import defaultdict + buckets: dict[int, list[int]] = defaultdict(list) + for i, l in enumerate(labels): + buckets[l].append(i) + rng = np.random.RandomState(42) + selected: list[int] = [] + for cls, idxs in sorted(buckets.items()): + if len(idxs) > max_per_class: + idxs = rng.choice(idxs, max_per_class, replace=False).tolist() + selected.extend(idxs) + rng.shuffle(selected) + return [images[i] for i in selected], [labels[i] for i in selected] + + +# --------------------------------------------------------------------------- +# Dataset loaders +# 
# ---------------------------------------------------------------------------

def load_defactify(extractor, max_per_gen: int = 200) -> tuple[NDArray, NDArray, dict]:
    """Load Defactify, sample up to max_per_gen real+fake per generator.

    Uses Rajarshi-Roy-research/Defactify_Image_Dataset with:
    - Label_B: 0=real, 1=SD2.1, 2=SDXL, 3=SD3, 4=Midjourney v6, 5=DALL-E 3
    - Image: PIL image column

    Returns:
        (X, y, meta): feature matrix, binary labels (0=real, 1=fake), and
        meta dict with the generator names and their row-index blocks in X.
    """
    from collections import defaultdict
    from datasets import load_dataset, Image as HFImage

    GENERATOR_NAMES = {1: "SD_2.1", 2: "SDXL", 3: "SD_3", 4: "Midjourney_v6", 5: "DALL-E_3"}

    print("\n=== Loading Defactify ===")
    ds = load_dataset("Rajarshi-Roy-research/Defactify_Image_Dataset", split="train")
    ds = ds.cast_column("Image", HFImage(decode=True, mode="RGB"))
    print(f"  Total samples: {len(ds)}")

    # Group row indices by Label_B (column read avoids decoding any images).
    label_b = ds["Label_B"]
    gen_buckets: dict[str, list[int]] = defaultdict(list)
    real_indices: list[int] = []
    for i, lb in enumerate(label_b):
        if lb == 0:
            real_indices.append(i)
        elif lb in GENERATOR_NAMES:
            gen_buckets[GENERATOR_NAMES[lb]].append(i)

    generators = sorted(gen_buckets.keys())
    print(f"  Real images: {len(real_indices)}")
    print(f"  Generators: {generators}")
    for g in generators:
        print(f"    {g}: {len(gen_buckets[g])}")

    # Sample real + per-generator fake with a fixed seed for reproducibility.
    rng = np.random.RandomState(42)
    selected_indices: list[int] = []
    selected_labels: list[int] = []
    gen_indices: dict[str, list[int]] = {}

    # Sample real images; remember the positions they occupy in the output.
    real_sample = rng.choice(real_indices, size=min(max_per_gen, len(real_indices)), replace=False).tolist()
    start = len(selected_indices)
    selected_indices.extend(real_sample)
    selected_labels.extend([0] * len(real_sample))
    gen_indices["real"] = list(range(start, len(selected_indices)))

    # Sample fake images per generator, tracking each generator's block.
    for g in generators:
        idxs = gen_buckets[g]
        if len(idxs) > max_per_gen:
            idxs = rng.choice(idxs, max_per_gen, replace=False).tolist()
        start = len(selected_indices)
        selected_indices.extend(idxs)
        selected_labels.extend([1] * len(idxs))
        gen_indices[g] = list(range(start, len(selected_indices)))

    print(f"  Selected {len(selected_indices)} samples (real={selected_labels.count(0)}, fake={selected_labels.count(1)})")
    import gc

    # Extract features one at a time to avoid holding all images in memory.
    # BUG FIX: the failure path previously appended a hard-coded 148-wide zero
    # row; if the extractor's feature count ever differs, np.array() would
    # receive a ragged list.  Record placeholders instead and pad them to the
    # true width observed from successful extractions.
    rows: list = []
    n_feat = None
    for idx in tqdm(selected_indices, desc="Defactify: extract features"):
        try:
            img = ds[int(idx)]["Image"]
            feats = extractor(img)
            row = list(feats.values())
            n_feat = len(row)
            rows.append(row)
            del img
        except Exception:
            rows.append(None)  # filled with zeros below, once the width is known

    del ds
    gc.collect()

    if n_feat is None:
        # Every extraction failed — a zero matrix would silently poison results.
        raise RuntimeError("Defactify: feature extraction failed for every sample")
    rows = [r if r is not None else [0.0] * n_feat for r in rows]

    X = np.array(rows, dtype=np.float64)
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    y = np.array(selected_labels, dtype=np.int32)

    meta = {"generators": generators, "gen_indices": gen_indices}
    return X, y, meta


def load_hemg(extractor, max_per_class: int = 1000) -> tuple[NDArray, NDArray]:
    """Load Hemg AI-art dataset. label: 0=AI, 1=real."""
    from collections import defaultdict
    from datasets import load_dataset
    import gc

    print("\n=== Loading Hemg (AI art vs real art) ===")
    ds = load_dataset("Hemg/AI-Generated-vs-Real-Images-Datasets", split="train")
    print(f"  Total samples: {len(ds)}")

    # Read only the labels column (fast, avoids decoding images).
    all_labels = ds["label"]  # fetches entire label column as list
    label_buckets: dict[int, list[int]] = defaultdict(list)
    for i, lbl in enumerate(all_labels):
        label_buckets[lbl].append(i)

    for k, v in sorted(label_buckets.items()):
        print(f"  Label {k}: {len(v)} samples")

    # Sample balanced indices first, with a fixed seed for reproducibility.
    rng = np.random.RandomState(42)
    selected_indices: list[int] = []
    selected_labels: list[int] = []
    for cls in sorted(label_buckets.keys()):
        idxs = label_buckets[cls]
        if len(idxs) > max_per_class:
            idxs = rng.choice(idxs, max_per_class, replace=False).tolist()
        selected_indices.extend(idxs)
        selected_labels.extend([cls] * len(idxs))

print(f" Selected {len(selected_indices)} samples") + + # Extract features one at a time + rows = [] + final_labels = [] + for idx, lbl in tqdm(zip(selected_indices, selected_labels), total=len(selected_indices), desc="Hemg: extract features"): + try: + img = ds[idx]["image"] + feats = extractor(img) + rows.append(list(feats.values())) + final_labels.append(lbl) + del img + except Exception: + rows.append([0.0] * 148) + final_labels.append(lbl) + + del ds + gc.collect() + + X = np.array(rows, dtype=np.float64) + X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + y = np.array(final_labels, dtype=np.int32) + return X, y + + +def load_tellif(extractor, max_per_class: int = 200) -> tuple[NDArray, NDArray, list[str]]: + """Load tellif semantically similar dataset.""" + from collections import defaultdict + from datasets import load_dataset + import gc + + print("\n=== Loading tellif (semantically similar) ===") + ds = load_dataset("tellif/ai_vs_real_image_semantically_similar", split="test") + print(f" Total samples: {len(ds)}") + + label_names = ds.features["label"].names + print(f" Label names: {label_names}") + + # First pass: read labels only + all_labels_raw = [ds[i]["label"] for i in range(len(ds))] + + unique, counts = np.unique(all_labels_raw, return_counts=True) + for u, c in zip(unique, counts): + print(f" {label_names[u]} ({u}): {c}") + + # Map to binary: find which labels are "real" vs AI-generated + real_keywords = ["real", "photograph", "photo", "human", "genuine", "original"] + real_label_ids = set() + for lid, name in enumerate(label_names): + if any(kw in name.lower() for kw in real_keywords): + real_label_ids.add(lid) + print(f" Detected real labels: {[label_names[i] for i in sorted(real_label_ids)]}") + if not real_label_ids: + # If no clear "real" label, treat the first label as real + print(" WARNING: No clear 'real' label found. 
Treating label 0 as real.") + real_label_ids = {0} + + binary_labels = [0 if l in real_label_ids else 1 for l in all_labels_raw] + + # Sample indices per binary class + buckets: dict[int, list[int]] = defaultdict(list) + for i, bl in enumerate(binary_labels): + buckets[bl].append(i) + rng = np.random.RandomState(42) + selected_indices: list[int] = [] + selected_binary: list[int] = [] + for cls in sorted(buckets.keys()): + idxs = buckets[cls] + if len(idxs) > max_per_class: + idxs = rng.choice(idxs, max_per_class, replace=False).tolist() + selected_indices.extend(idxs) + selected_binary.extend([cls] * len(idxs)) + print(f" Selected {len(selected_indices)} samples (binary: 0=real, 1=AI)") + + # Extract features one at a time + rows = [] + final_labels = [] + for idx, lbl in tqdm(zip(selected_indices, selected_binary), total=len(selected_indices), desc="tellif: extract features"): + try: + img = ds[idx]["image"] + if img is not None: + feats = extractor(img) + rows.append(list(feats.values())) + final_labels.append(lbl) + del img + except Exception: + continue + + del ds + gc.collect() + X = np.array(rows, dtype=np.float64) + X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + y = np.array(final_labels, dtype=np.int32) + return X, y, label_names + + +# --------------------------------------------------------------------------- +# Experiments +# --------------------------------------------------------------------------- + +def experiment_1_baselines(X_def, y_def, X_hemg, y_hemg, X_tellif, y_tellif) -> dict: + """Exp 1: Single-dataset baselines with 5-fold CV.""" + print("\n" + "=" * 70) + print("EXPERIMENT 1: Single-dataset baselines (5-fold CV)") + print("=" * 70) + + results = {} + for name, X, y in [("Defactify", X_def, y_def), ("Hemg", X_hemg, y_hemg), ("tellif", X_tellif, y_tellif)]: + print(f"\n {name}: {X.shape[0]} samples, {X.shape[1]} features") + r = cross_validate(X, y) + results[name] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + 
+ print("\n +--------------+----------+--------+") + print(" | Dataset | Accuracy | AUC |") + print(" +--------------+----------+--------+") + for name, r in results.items(): + print(f" | {name:<12} | {r['accuracy']:.4f} | {r['auc']:.4f} |") + print(" +--------------+----------+--------+") + return results + + +def experiment_2_cross_dataset(X_def, y_def, X_hemg, y_hemg, X_tellif, y_tellif) -> dict: + """Exp 2: Cross-dataset generalization.""" + print("\n" + "=" * 70) + print("EXPERIMENT 2: Cross-dataset generalization") + print("=" * 70) + + # Normalize Hemg labels: 0=AI->1(fake), 1=real->0(real) to match Defactify convention + # Defactify: 0=real, 1=fake + # Hemg: 0=AI, 1=real -> need to flip + y_hemg_norm = 1 - y_hemg + # tellif: already 0=real, 1=AI + + results = {} + + # Train Defactify -> test Hemg + print("\n Train Defactify -> Test Hemg") + r = train_and_evaluate(X_def, y_def, X_hemg, y_hemg_norm) + results["Defactify->Hemg"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # Train Hemg -> test Defactify + print("\n Train Hemg -> Test Defactify") + r = train_and_evaluate(X_hemg, y_hemg_norm, X_def, y_def) + results["Hemg->Defactify"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # Combined train -> test each + X_combined = np.vstack([X_def, X_hemg]) + y_combined = np.concatenate([y_def, y_hemg_norm]) + + print("\n Train Combined -> Test Defactify") + r = train_and_evaluate(X_combined, y_combined, X_def, y_def) + results["Combined->Defactify"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + print("\n Train Combined -> Test Hemg") + r = train_and_evaluate(X_combined, y_combined, X_hemg, y_hemg_norm) + results["Combined->Hemg"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + print("\n Train Combined -> Test tellif") + r = train_and_evaluate(X_combined, y_combined, X_tellif, y_tellif) + results["Combined->tellif"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: 
{r['auc']:.4f}") + + print("\n +------------------------+----------+--------+") + print(" | Transfer | Accuracy | AUC |") + print(" +------------------------+----------+--------+") + for name, r in results.items(): + print(f" | {name:<22} | {r['accuracy']:.4f} | {r['auc']:.4f} |") + print(" +------------------------+----------+--------+") + return results + + +def experiment_3_generator_diversity(X_def, y_def, meta_def) -> dict: + """Exp 3: Generator diversity impact using Defactify.""" + print("\n" + "=" * 70) + print("EXPERIMENT 3: Generator diversity impact (Defactify)") + print("=" * 70) + + generators = meta_def["generators"] + gen_indices = meta_def["gen_indices"] + real_idx = np.array(gen_indices["real"]) # shared real block + results = {} + + # Build per-generator fake-only index arrays + gen_fake_arrays = {} + for g in generators: + gen_fake_arrays[g] = np.array(gen_indices[g]) + + rng = np.random.RandomState(42) + gen_order = list(generators) + rng.shuffle(gen_order) + print(f" Generator order: {gen_order}") + + # Split real indices proportionally for train/test + n_gens = len(generators) + + # Progressive: train on N generators, test on rest + for n_train in range(1, n_gens): + train_gens = gen_order[:n_train] + test_gens = gen_order[n_train:] + + # Split real images proportionally + real_shuffled = rng.permutation(real_idx) + n_real_train = int(len(real_shuffled) * n_train / n_gens) + real_train = real_shuffled[:n_real_train] + real_test = real_shuffled[n_real_train:] + + train_fake = np.concatenate([gen_fake_arrays[g] for g in train_gens]) + test_fake = np.concatenate([gen_fake_arrays[g] for g in test_gens]) + + train_idx = np.concatenate([real_train, train_fake]) + test_idx = np.concatenate([real_test, test_fake]) + + X_tr, y_tr = X_def[train_idx], y_def[train_idx] + X_te, y_te = X_def[test_idx], y_def[test_idx] + + label = f"{n_train}_gen_train" + print(f"\n Train on {n_train} gen ({', '.join(train_gens)}) -> Test on ({', '.join(test_gens)})") + 
print(f" Train: {len(X_tr)} (real={np.sum(y_tr==0)}, fake={np.sum(y_tr==1)})") + print(f" Test: {len(X_te)} (real={np.sum(y_te==0)}, fake={np.sum(y_te==1)})") + r = train_and_evaluate(X_tr, y_tr, X_te, y_te) + r["train_generators"] = train_gens + r["test_generators"] = test_gens + results[label] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # All 5 generators: 5-fold CV + print(f"\n All {n_gens} generators: 5-fold CV") + r = cross_validate(X_def, y_def) + r["train_generators"] = generators + r["test_generators"] = generators + results[f"{n_gens}_gen_cv"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + print("\n +----------------+----------+--------+") + print(" | # Generators | Accuracy | AUC |") + print(" +----------------+----------+--------+") + for name, r in results.items(): + print(f" | {name:<14} | {r['accuracy']:.4f} | {r['auc']:.4f} |") + print(" +----------------+----------+--------+") + return results + + +def experiment_4_domain_transfer(X_def, y_def, X_hemg, y_hemg) -> dict: + """Exp 4: Art vs photos domain transfer.""" + print("\n" + "=" * 70) + print("EXPERIMENT 4: Art vs Photos domain transfer") + print("=" * 70) + + # Normalize Hemg: 0=AI->1, 1=real->0 + y_hemg_norm = 1 - y_hemg + + results = {} + + # Photos -> Art + print("\n Train Defactify (photos) -> Test Hemg (art)") + r = train_and_evaluate(X_def, y_def, X_hemg, y_hemg_norm) + results["Photos->Art"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # Art -> Photos + print("\n Train Hemg (art) -> Test Defactify (photos)") + r = train_and_evaluate(X_hemg, y_hemg_norm, X_def, y_def) + results["Art->Photos"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # Same-domain baselines for comparison + print("\n Baseline: Defactify 5-fold CV (same domain)") + r = cross_validate(X_def, y_def) + results["Photos_self_CV"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + print("\n Baseline: Hemg 
5-fold CV (same domain)") + r = cross_validate(X_hemg, y_hemg_norm) + results["Art_self_CV"] = r + print(f" Accuracy: {r['accuracy']:.4f} AUC: {r['auc']:.4f}") + + # Compute domain gap + photo_self = results["Photos_self_CV"]["accuracy"] + art_self = results["Art_self_CV"]["accuracy"] + p2a = results["Photos->Art"]["accuracy"] + a2p = results["Art->Photos"]["accuracy"] + print(f"\n Domain gap (Photos->Art): {photo_self:.4f} -> {p2a:.4f} (delta: {photo_self - p2a:+.4f})") + print(f" Domain gap (Art->Photos): {art_self:.4f} -> {a2p:.4f} (delta: {art_self - a2p:+.4f})") + + print("\n +------------------+----------+--------+") + print(" | Transfer | Accuracy | AUC |") + print(" +------------------+----------+--------+") + for name, r in results.items(): + print(f" | {name:<16} | {r['accuracy']:.4f} | {r['auc']:.4f} |") + print(" +------------------+----------+--------+") + return results + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> None: + start_time = time.time() + + from negate.extract.feature_artwork import ArtworkExtract + extractor = ArtworkExtract() + + # Determine feature count + from PIL import Image + dummy = Image.new("RGB", (255, 255), color="gray") + n_features = len(extractor(dummy)) + print(f"Feature count: {n_features}") + + # Load datasets + X_def, y_def, meta_def = load_defactify(extractor, max_per_gen=200) + X_hemg, y_hemg = load_hemg(extractor, max_per_class=1000) + X_tellif, y_tellif, tellif_labels = load_tellif(extractor, max_per_class=200) + + print(f"\nDataset shapes:") + print(f" Defactify: X={X_def.shape}, y={y_def.shape} (real={np.sum(y_def==0)}, fake={np.sum(y_def==1)})") + print(f" Hemg: X={X_hemg.shape}, y={y_hemg.shape} (AI={np.sum(y_hemg==0)}, real={np.sum(y_hemg==1)})") + print(f" tellif: X={X_tellif.shape}, y={y_tellif.shape} (real={np.sum(y_tellif==0)}, AI={np.sum(y_tellif==1)})") + + # Run 
experiments + results_all: dict = {"feature_count": n_features} + + results_all["exp1_baselines"] = experiment_1_baselines(X_def, y_def, X_hemg, y_hemg, X_tellif, y_tellif) + results_all["exp2_cross_dataset"] = experiment_2_cross_dataset(X_def, y_def, X_hemg, y_hemg, X_tellif, y_tellif) + results_all["exp3_generator_diversity"] = experiment_3_generator_diversity(X_def, y_def, meta_def) + results_all["exp4_domain_transfer"] = experiment_4_domain_transfer(X_def, y_def, X_hemg, y_hemg) + + elapsed = time.time() - start_time + results_all["elapsed_seconds"] = round(elapsed, 1) + + # Save results + out_path = Path("D:/Projects/negate/negate/results/multidataset_benchmark_results.json") + out_path.parent.mkdir(parents=True, exist_ok=True) + + # Make JSON-serializable + def _make_serializable(obj): + if isinstance(obj, dict): + return {k: _make_serializable(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [_make_serializable(v) for v in obj] + if isinstance(obj, (np.integer,)): + return int(obj) + if isinstance(obj, (np.floating,)): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + return obj + + with open(out_path, "w") as f: + json.dump(_make_serializable(results_all), f, indent=2) + print(f"\nResults saved to {out_path}") + print(f"Total elapsed: {elapsed:.1f}s") + + +if __name__ == "__main__": + main() diff --git a/tests/test_scale_evaluation.py b/tests/test_scale_evaluation.py deleted file mode 100644 index df9bbfe..0000000 --- a/tests/test_scale_evaluation.py +++ /dev/null @@ -1,363 +0,0 @@ -# SPDX-License-Identifier: MPL-2.0 AND LicenseRef-Commons-Clause-License-Condition-1.0 -"""Scale evaluation: test if more training data improves artwork detection. - -Runs the 49-feature pipeline on increasing sample sizes from Hemg (art vs art) -to determine if 71% accuracy is a data problem or a feature problem. 
-""" - -from __future__ import annotations - -import json -import sys -from datetime import datetime -from pathlib import Path - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from matplotlib.backends.backend_pdf import PdfPages -import numpy as np -import pandas as pd -import xgboost as xgb -from datasets import load_dataset, Image as HFImage -from PIL import Image -from sklearn.metrics import ( - accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, - confusion_matrix, -) -from sklearn.model_selection import StratifiedKFold -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC -from tqdm import tqdm - -sys.path.insert(0, str(Path(__file__).parent.parent)) -from negate.extract.feature_artwork import ArtworkExtract - -SEED = 42 -N_FOLDS = 5 -REPO = "Hemg/AI-Generated-vs-Real-Images-Datasets" -SAMPLE_SIZES = [400, 1000, 2000, 4000] # total (half per class) -RESULTS_DIR = Path(__file__).parent.parent / "results" - - -def extract_features_cached(dataset, n_per_class: int, extractor: ArtworkExtract): - """Extract features, balanced per class.""" - all_labels = dataset["label"] - features, labels, errors = [], [], 0 - - rng = np.random.RandomState(SEED) - - for lbl in [0, 1]: - indices = [i for i, l in enumerate(all_labels) if l == lbl] - chosen = rng.choice(indices, size=min(n_per_class, len(indices)), replace=False) - - for idx in tqdm(chosen, desc=f" Label {lbl} (n={n_per_class})"): - try: - img = dataset[int(idx)]["image"] - if img is None or not isinstance(img, Image.Image): - errors += 1 - continue - feat = extractor(img) - features.append(feat) - # label 0 = AI art (synthetic), label 1 = Real art (genuine) - # We want: 0 = genuine, 1 = synthetic - labels.append(1 if lbl == 0 else 0) - except Exception: - errors += 1 - - print(f" Extracted {len(features)} ({errors} errors)") - df = pd.DataFrame(features).fillna(0) - X = 
df.to_numpy(dtype=np.float64) - X = np.where(np.isfinite(X), X, 0) - y = np.array(labels) - return X, y, list(df.columns) - - -def run_cv(X, y, model_type="xgb"): - """Run 5-fold CV, return pooled y_true, y_prob.""" - skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) - all_true, all_prob = [], [] - - for train_idx, test_idx in skf.split(X, y): - X_train, X_test = X[train_idx], X[test_idx] - y_train, y_test = y[train_idx], y[test_idx] - - if model_type == "xgb": - spw = np.sum(y_train == 0) / max(np.sum(y_train == 1), 1) - params = { - "objective": "binary:logistic", "eval_metric": "logloss", - "max_depth": 4, "learning_rate": 0.1, "subsample": 0.8, - "colsample_bytree": 0.8, "scale_pos_weight": spw, "seed": SEED, - } - dtrain = xgb.DMatrix(X_train, label=y_train) - dtest = xgb.DMatrix(X_test, label=y_test) - model = xgb.train(params, dtrain, num_boost_round=200, - evals=[(dtest, "test")], early_stopping_rounds=10, - verbose_eval=False) - y_prob = model.predict(dtest) - elif model_type == "svm": - scaler = StandardScaler() - X_tr = scaler.fit_transform(X_train) - X_te = scaler.transform(X_test) - svm = SVC(kernel="rbf", probability=True, random_state=SEED) - svm.fit(X_tr, y_train) - y_prob = svm.predict_proba(X_te)[:, 1] - elif model_type == "mlp": - scaler = StandardScaler() - X_tr = scaler.fit_transform(X_train) - X_te = scaler.transform(X_test) - mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=1000, - random_state=SEED, early_stopping=True) - mlp.fit(X_tr, y_train) - y_prob = mlp.predict_proba(X_te)[:, 1] - - all_true.extend(y_test) - all_prob.extend(y_prob) - - y_true = np.array(all_true) - y_prob = np.array(all_prob) - y_pred = (y_prob > 0.5).astype(int) - - return { - "accuracy": float(accuracy_score(y_true, y_pred)), - "precision": float(precision_score(y_true, y_pred, zero_division=0)), - "recall": float(recall_score(y_true, y_pred, zero_division=0)), - "f1": float(f1_score(y_true, y_pred, average="macro")), - "roc_auc": 
float(roc_auc_score(y_true, y_prob)), - } - - -def generate_pdf(all_results): - """Generate scaling analysis PDF.""" - RESULTS_DIR.mkdir(exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - pdf_path = RESULTS_DIR / f"scale_evaluation_{timestamp}.pdf" - - with PdfPages(str(pdf_path)) as pdf: - # PAGE 1: Title + scaling curves - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - - fig.suptitle("Scaling Analysis: Does More Data Improve\nArtwork Detection Accuracy?", - fontsize=16, fontweight="bold", fontfamily="serif", y=0.96) - - # Subtitle - fig.text(0.5, 0.90, f"negate project — darkshapes — {datetime.now().strftime('%B %d, %Y')}", - fontsize=10, ha="center", fontfamily="serif", style="italic") - - fig.text(0.5, 0.87, "Dataset: Hemg/AI-Generated-vs-Real-Images-Datasets (AI Art vs Real Art)", - fontsize=9, ha="center", fontfamily="serif") - - # Accuracy scaling curve - ax1 = fig.add_axes([0.12, 0.52, 0.76, 0.3]) - sizes = [r["total"] for r in all_results] - - for model, color, marker in [("xgb", "#4472C4", "o"), ("svm", "#ED7D31", "s"), ("mlp", "#70AD47", "^")]: - accs = [r[model]["accuracy"] for r in all_results] - ax1.plot(sizes, accs, f"-{marker}", color=color, label=model.upper(), markersize=8, linewidth=2) - for x, y in zip(sizes, accs): - ax1.annotate(f"{y:.1%}", (x, y), textcoords="offset points", - xytext=(0, 10), ha="center", fontsize=8) - - ax1.set_xlabel("Total Training Samples", fontsize=10) - ax1.set_ylabel("5-Fold CV Accuracy", fontsize=10) - ax1.set_title("Accuracy vs Training Set Size", fontsize=12, fontfamily="serif") - ax1.legend(fontsize=9) - ax1.grid(True, alpha=0.3) - ax1.set_ylim(0.5, 1.0) - ax1.axhline(y=0.5, color="red", linestyle="--", alpha=0.3, label="Random chance") - - # AUC scaling curve - ax2 = fig.add_axes([0.12, 0.12, 0.76, 0.3]) - - for model, color, marker in [("xgb", "#4472C4", "o"), ("svm", "#ED7D31", "s"), ("mlp", "#70AD47", "^")]: - aucs = [r[model]["roc_auc"] for r in all_results] - 
ax2.plot(sizes, aucs, f"-{marker}", color=color, label=model.upper(), markersize=8, linewidth=2) - for x, y in zip(sizes, aucs): - ax2.annotate(f"{y:.3f}", (x, y), textcoords="offset points", - xytext=(0, 10), ha="center", fontsize=8) - - ax2.set_xlabel("Total Training Samples", fontsize=10) - ax2.set_ylabel("5-Fold CV ROC-AUC", fontsize=10) - ax2.set_title("ROC-AUC vs Training Set Size", fontsize=12, fontfamily="serif") - ax2.legend(fontsize=9) - ax2.grid(True, alpha=0.3) - ax2.set_ylim(0.5, 1.0) - - pdf.savefig(fig) - plt.close(fig) - - # PAGE 2: Results table + analysis - fig = plt.figure(figsize=(8.5, 11)) - fig.patch.set_facecolor("white") - fig.suptitle("Detailed Results & Analysis", fontsize=14, - fontweight="bold", fontfamily="serif", y=0.96) - - # Results table - ax_table = fig.add_axes([0.05, 0.62, 0.9, 0.28]) - ax_table.axis("off") - - table_data = [] - for r in all_results: - for model in ["xgb", "svm", "mlp"]: - m = r[model] - table_data.append([ - str(r["total"]), model.upper(), - f"{m['accuracy']:.2%}", f"{m['precision']:.2%}", - f"{m['recall']:.2%}", f"{m['f1']:.2%}", f"{m['roc_auc']:.4f}" - ]) - - table = ax_table.table( - cellText=table_data, - colLabels=["Samples", "Model", "Accuracy", "Precision", "Recall", "F1", "AUC"], - loc="center", cellLoc="center", - ) - table.auto_set_font_size(False) - table.set_fontsize(7.5) - table.scale(1, 1.3) - for (row, col), cell in table.get_celld().items(): - if row == 0: - cell.set_facecolor("#4472C4") - cell.set_text_props(color="white", fontweight="bold") - - # Analysis - ax_text = fig.add_axes([0.08, 0.05, 0.84, 0.52]) - ax_text.axis("off") - - best_final = max(all_results[-1]["xgb"]["accuracy"], - all_results[-1]["svm"]["accuracy"], - all_results[-1]["mlp"]["accuracy"]) - best_initial = max(all_results[0]["xgb"]["accuracy"], - all_results[0]["svm"]["accuracy"], - all_results[0]["mlp"]["accuracy"]) - improvement = best_final - best_initial - - analysis = ( - "Analysis\n\n" - f"Sample sizes tested: {', 
'.join(str(r['total']) for r in all_results)}\n" - f"Best accuracy at smallest size ({all_results[0]['total']}): {best_initial:.1%}\n" - f"Best accuracy at largest size ({all_results[-1]['total']}): {best_final:.1%}\n" - f"Improvement from scaling: {improvement:+.1%}pp\n\n" - ) - - if improvement > 0.10: - analysis += ( - "FINDING: Significant improvement with more data.\n" - "The 49 features have capacity to learn — the initial low accuracy was\n" - "primarily a data limitation. With sufficient training data, the hand-crafted\n" - "features can achieve useful detection rates on artwork.\n\n" - "Recommendation: Scale to even larger samples (10K+) and consider\n" - "integrating these features into the negate pipeline." - ) - elif improvement > 0.03: - analysis += ( - "FINDING: Modest improvement with more data.\n" - "More data helps somewhat, but accuracy is plateauing. The features\n" - "capture some genuine signal but are limited by their expressiveness.\n\n" - "Recommendation: The hand-crafted features are hitting a ceiling.\n" - "To push past this, the pipeline needs learned features — either\n" - "fine-tuned CLIP/DINOv2 or the self-supervised approach from\n" - "Zhong et al. (2026)." - ) - else: - analysis += ( - "FINDING: Minimal improvement with more data.\n" - "The features are saturated — adding more training data does not help.\n" - "The 49 hand-crafted features simply don't capture enough discriminative\n" - "information to distinguish AI art from human art.\n\n" - "Recommendation: Fundamentally different features are needed.\n" - "Hand-crafted statistics cannot match the representational power of\n" - "learned features for this task." 
- ) - - analysis += ( - "\n\nContext\n\n" - "This evaluation uses only the Hemg dataset where BOTH classes are artwork.\n" - "This is the hardest and most honest test — no content shortcuts.\n" - "All processing is CPU-only, 49 features per image.\n" - "5-fold stratified cross-validation with fixed random seed (42).\n" - ) - - ax_text.text(0, 1, analysis, transform=ax_text.transAxes, fontsize=9, - ha="left", va="top", fontfamily="serif") - - pdf.savefig(fig) - plt.close(fig) - - print(f"PDF saved to: {pdf_path}") - return pdf_path - - -def main(): - print("=" * 60) - print(" SCALING ANALYSIS: Art Detection vs Training Data Size") - print(" Dataset: Hemg (AI Art vs Real Art)") - print("=" * 60) - - print("\nLoading dataset...") - ds = load_dataset(REPO, split="train") - ds = ds.cast_column("image", HFImage(decode=True, mode="RGB")) - print(f" Total rows: {len(ds)}") - - extractor = ArtworkExtract() - all_results = [] - - # We extract at the largest size once, then subsample - max_per_class = max(SAMPLE_SIZES) // 2 - print(f"\nExtracting features for {max_per_class} per class...") - X_full, y_full, feature_names = extract_features_cached(ds, max_per_class, extractor) - print(f" Total: {len(y_full)} images, {X_full.shape[1]} features") - print(f" Balance: {np.sum(y_full==0)} genuine, {np.sum(y_full==1)} synthetic") - - for total in SAMPLE_SIZES: - per_class = total // 2 - print(f"\n{'='*40}") - print(f" Testing with {total} samples ({per_class} per class)") - print(f"{'='*40}") - - # Subsample from the full extraction - rng = np.random.RandomState(SEED) - idx_0 = np.where(y_full == 0)[0] - idx_1 = np.where(y_full == 1)[0] - chosen_0 = rng.choice(idx_0, size=min(per_class, len(idx_0)), replace=False) - chosen_1 = rng.choice(idx_1, size=min(per_class, len(idx_1)), replace=False) - chosen = np.concatenate([chosen_0, chosen_1]) - X = X_full[chosen] - y = y_full[chosen] - - result = {"total": len(y)} - for model in ["xgb", "svm", "mlp"]: - print(f" Running 
{model.upper()}...") - result[model] = run_cv(X, y, model) - print(f" acc={result[model]['accuracy']:.2%} auc={result[model]['roc_auc']:.4f}") - - all_results.append(result) - - # Save JSON - RESULTS_DIR.mkdir(exist_ok=True) - json_path = RESULTS_DIR / "scale_evaluation_results.json" - with open(json_path, "w") as f: - json.dump({ - "timestamp": datetime.now().isoformat(), - "dataset": REPO, - "feature_count": X_full.shape[1], - "results": all_results, - }, f, indent=2) - print(f"\nJSON saved to: {json_path}") - - # Generate PDF - print("\nGenerating PDF...") - generate_pdf(all_results) - - # Print summary - print(f"\n{'='*60}") - print(" SUMMARY") - print(f"{'='*60}") - for r in all_results: - best = max(r["xgb"]["accuracy"], r["svm"]["accuracy"], r["mlp"]["accuracy"]) - print(f" n={r['total']:5d} best_acc={best:.2%}") - - -if __name__ == "__main__": - main()