From 4c8a38785a28056d9bb1a4729a4f7ed68eb4dba4 Mon Sep 17 00:00:00 2001 From: Edward Jarvis <146198405+sankdoy@users.noreply.github.com> Date: Thu, 15 Jan 2026 14:11:55 +0000 Subject: [PATCH] feat/encoder-ui-fixes switch to linear STFT for encoding/decoding, auto-fit text height, add image normalisation/inversion + preview, add noise/flutter controls, fix UI scroll and prevent text clipping, improve spectrogram colour rendering --- txt2spec.py | 425 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 341 insertions(+), 84 deletions(-) diff --git a/txt2spec.py b/txt2spec.py index 4d28e61..e0ddc9a 100644 --- a/txt2spec.py +++ b/txt2spec.py @@ -1,160 +1,417 @@ import logging +import os import tempfile import gradio as gr import librosa -import librosa.display -import matplotlib.pyplot as plt +import matplotlib.cm as cm import numpy as np import soundfile as sf from PIL import Image, ImageDraw, ImageFont -DEFAULT_FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf" +FONT_PATHS = [ + os.path.expanduser("~/Library/Fonts/Druk Wide.otf"), + os.path.expanduser("~/Library/Fonts/Druk Wide.ttf"), + os.path.expanduser("~/Library/Fonts/DrukWide.otf"), + os.path.expanduser("~/Library/Fonts/DrukWide.ttf"), + "/Library/Fonts/Druk Wide.otf", + "/Library/Fonts/Druk Wide.ttf", + "/Library/Fonts/DrukWide.otf", + "/Library/Fonts/DrukWide.ttf", + "/System/Library/Fonts/Supplemental/Arial Bold.ttf", + "/System/Library/Fonts/Supplemental/Arial.ttf", + "/System/Library/Fonts/Supplemental/Helvetica Bold.ttf", + "/System/Library/Fonts/Supplemental/Helvetica.ttf", + "/Library/Fonts/Arial.ttf", + "/Library/Fonts/Helvetica.ttf", + "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", +] + +def find_font_path(): + for path in FONT_PATHS: + if os.path.exists(path): + return path + return None + +DEFAULT_FONT_PATH = find_font_path() DEFAULT_SAMPLE_RATE = 22050 +DEFAULT_N_FFT = 2048 +DEFAULT_HOP_LENGTH = DEFAULT_N_FFT // 4 +DEFAULT_SPEC_HEIGHT = DEFAULT_N_FFT // 2 + 1 +DEFAULT_BASE_WIDTH = 1024 +DEFAULT_MARGIN = 10 +DEFAULT_LETTER_SPACING = 5 +DEFAULT_FONT_SIZE_CAP = DEFAULT_SPEC_HEIGHT +DEFAULT_FONT_SIZE = max(10, DEFAULT_SPEC_HEIGHT - DEFAULT_MARGIN * 2) +DEFAULT_FLUTTER_PX = 6 +DEFAULT_NOISE_AMOUNT = 0.03 +DEFAULT_NOISE_STRENGTH = 0.2 +DEFAULT_DISPLAY_CMAP = "turbo" +DEFAULT_DB_RANGE = 80.0 logging.basicConfig(level=logging.INFO) -def text_to_spectrogram_image(text, base_width=512, height=256, max_font_size=80, margin=10, letter_spacing=5): - try: - font = ImageFont.truetype(DEFAULT_FONT_PATH, max_font_size) - except IOError: - logging.warning(f"Font not found at {DEFAULT_FONT_PATH}. Using default font.") +def load_font(font_size): + if DEFAULT_FONT_PATH: + try: + return ImageFont.truetype(DEFAULT_FONT_PATH, font_size) + except Exception as e: + logging.warning(f"An error occurred while loading the font: {e}") + return ImageFont.load_default() + +def measure_text(text, font, letter_spacing): + if not text: + text = " " + draw = ImageDraw.Draw(Image.new("L", (1, 1))) + + text_widths = [] + for char in text: + bbox = draw.textbbox((0, 0), char, font=font) + text_widths.append(bbox[2] - bbox[0]) + + text_width = sum(text_widths) + letter_spacing * max(0, len(text) - 1) + bbox = draw.textbbox((0, 0), text, font=font) + text_height = bbox[3] - bbox[1] + return text_width, text_height, text_widths + +def fit_font_size(text, target_height, max_font_size, letter_spacing): + max_font_size = max(1, max_font_size) + if DEFAULT_FONT_PATH is None: font = ImageFont.load_default() - except Exception as e: - logging.error(f"An error occurred while loading the font: {e}") - raise + text_width, text_height, text_widths = measure_text(text, font, letter_spacing) + return font, text_width, text_height, text_widths + + low, high = 1, max_font_size + best = None + while low <= high: + mid = (low + high) // 2 + font = load_font(mid) + text_width, text_height, text_widths = measure_text(text, font, letter_spacing) + if text_height <= target_height: + best = (font, text_width, text_height, text_widths) + low = mid + 1 + else: + high = mid - 1 + + if best is None: + font = load_font(1) + text_width, text_height, text_widths = measure_text(text, font, letter_spacing) + return font, text_width, text_height, text_widths + + font, text_width, text_height, text_widths = best + return font, text_width, text_height, text_widths + +def apply_flutter(image, flutter_px): + if flutter_px <= 0: + return image + height, width = image.shape + offsets = np.random.randint(-flutter_px, flutter_px + 1, size=width) + if width >= 9: + kernel = np.ones(9, dtype=np.float32) / 9.0 + offsets = np.rint(np.convolve(offsets, kernel, mode="same")).astype(int) + + warped = np.zeros_like(image) + for x in range(width): + offset = offsets[x] + if offset == 0: + warped[:, x] = image[:, x] + elif offset > 0: + if offset < height: + warped[offset:, x] = image[: height - offset, x] + else: + shift = -offset + if shift < height: + warped[: height - shift, x] = image[shift:, x] + return warped + +def add_text_noise(image, noise_amount, noise_strength): + if noise_amount <= 0 and noise_strength <= 0: + return image + image = image.astype(np.float32) + mask = image > 0 + if noise_strength > 0: + noise = np.random.normal(0, noise_strength * 255, image.shape).astype(np.float32) + image = image + noise * mask + if noise_amount > 0: + drop = (np.random.rand(*image.shape) < noise_amount) & mask + image[drop] = 0 + return np.clip(image, 0, 255).astype(np.uint8) + +def spectrogram_to_rgb(S_dB, cmap_name=DEFAULT_DISPLAY_CMAP, db_range=DEFAULT_DB_RANGE): + if S_dB.size == 0: + return np.zeros((1, 1, 3), dtype=np.uint8) + + vmax = np.nanmax(S_dB) + if not np.isfinite(vmax): + vmax = 0.0 + vmin = vmax - db_range + if vmax <= vmin: + vmin = vmax - 1.0 + + norm = (S_dB - vmin) / (vmax - vmin) + norm = np.clip(norm, 0.0, 1.0) + norm = np.flipud(norm) + cmap = cm.get_cmap(cmap_name) + rgb = (cmap(norm)[:, :, :3] * 255).astype(np.uint8) + return rgb + +def auto_invert_image(image): + if image.mean() > 127: + return 255 - image + return image - draw = ImageDraw.Draw(Image.new("L", (1, 1))) +def normalize_image(image): + image = image.astype(np.float32) + if image.max() > image.min(): + lo, hi = np.percentile(image, [1, 99]) + if hi > lo: + image = (image - lo) / (hi - lo) + else: + image = image / 255.0 + else: + image = image / 255.0 + return np.clip(image, 0, 1) * 255 + +def prepare_image_spectrogram(image, target_height, min_width): + if image.mode != "L": + image = image.convert("L") + + width, height = image.size + if height <= 0: + raise ValueError("Image height must be greater than 0.") + + scale = target_height / height + new_width = max(1, int(round(width * scale))) + image = image.resize((new_width, target_height), Image.LANCZOS) + image = np.array(image) + image = auto_invert_image(image) + image = normalize_image(image).astype(np.uint8) - text_widths = [ - draw.textbbox((0, 0), char, font=font)[2] - draw.textbbox((0, 0), char, font=font)[0] - for char in text - ] - text_width = sum(text_widths) + letter_spacing * (len(text) - 1) - text_height = ( - draw.textbbox((0, 0), text[0], font=font)[3] - - draw.textbbox((0, 0), text[0], font=font)[1] + if new_width < min_width: + pad_left = (min_width - new_width) // 2 + pad_right = min_width - new_width - pad_left + image = np.pad(image, ((0, 0), (pad_left, pad_right)), mode="constant", constant_values=0) + return image + +def text_to_spectrogram_image( + text, + base_width=DEFAULT_BASE_WIDTH, + height=DEFAULT_SPEC_HEIGHT, + max_font_size=DEFAULT_FONT_SIZE, + margin=DEFAULT_MARGIN, + letter_spacing=DEFAULT_LETTER_SPACING, + flutter_px=DEFAULT_FLUTTER_PX, + noise_amount=DEFAULT_NOISE_AMOUNT, + noise_strength=DEFAULT_NOISE_STRENGTH, +): + safe_margin = max(0, margin + flutter_px) + target_height = max(1, height - safe_margin * 2) + font, text_width, text_height, text_widths = fit_font_size( + text, + target_height, + max_font_size, + letter_spacing, ) - width = max(base_width, text_width + margin * 2) - height = max(height, text_height + margin * 2) + width = max(base_width, text_width + safe_margin * 2) + height = max(height, text_height + safe_margin * 2) image = Image.new("L", (width, height), "black") draw = ImageDraw.Draw(image) text_start_x = (width - text_width) // 2 text_start_y = (height - text_height) // 2 + min_x = safe_margin + max_x = max(min_x, width - text_width - safe_margin) + min_y = safe_margin + max_y = max(min_y, height - text_height - safe_margin) + text_start_x = int(np.clip(text_start_x, min_x, max_x)) + text_start_y = int(np.clip(text_start_y, min_y, max_y)) current_x = text_start_x for char, char_width in zip(text, text_widths): - draw.text((current_x, text_start_y), char, font=font, fill="white") + offset_x = np.random.randint(-flutter_px, flutter_px + 1) if flutter_px > 0 else 0 + offset_y = np.random.randint(-flutter_px, flutter_px + 1) if flutter_px > 0 else 0 + max_x_char = max(min_x, width - char_width - safe_margin) + char_x = int(np.clip(current_x + offset_x, min_x, max_x_char)) + char_y = int(np.clip(text_start_y + offset_y, min_y, max_y)) + draw.text((char_x, char_y), char, font=font, fill="white") current_x += char_width + letter_spacing image = np.array(image) - image = np.where(image > 0, 255, image) + image = apply_flutter(image, flutter_px) + image = add_text_noise(image, noise_amount, noise_strength) return image -def spectrogram_image_to_audio(image, sr=DEFAULT_SAMPLE_RATE): +def spectrogram_image_to_audio(image, sr=DEFAULT_SAMPLE_RATE, hop_length=DEFAULT_HOP_LENGTH): flipped_image = np.flipud(image) - S = flipped_image.astype(np.float32) / 255.0 * 100.0 - y = librosa.griffinlim(S) - return y + S = flipped_image.astype(np.float32) / 255.0 + n_fft = max(2, (S.shape[0] - 1) * 2) + y = librosa.griffinlim(S, n_iter=64, hop_length=hop_length, win_length=n_fft) + return y, n_fft + +def save_linear_spectrogram(y, sr, n_fft, hop_length): + S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=n_fft)) + S_dB = librosa.amplitude_to_db(S, ref=np.max) + rgb = spectrogram_to_rgb(S_dB) + + with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_spectrogram: + spectrogram_path = temp_spectrogram.name + Image.fromarray(rgb).save(spectrogram_path) + return spectrogram_path -def create_audio_with_spectrogram(text, base_width, height, max_font_size, margin, letter_spacing): - spec_image = text_to_spectrogram_image(text, base_width, height, max_font_size, margin, letter_spacing) - y = spectrogram_image_to_audio(spec_image) +def create_audio_with_spectrogram( + text, + base_width, + height, + max_font_size, + margin, + letter_spacing, + flutter_px, + noise_amount, + noise_strength, +): + spec_image = text_to_spectrogram_image( + text, + base_width, + height, + max_font_size, + margin, + letter_spacing, + flutter_px, + noise_amount, + noise_strength, + ) + y, n_fft = spectrogram_image_to_audio(spec_image) with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio: audio_path = temp_audio.name sf.write(audio_path, y, DEFAULT_SAMPLE_RATE) - S = librosa.feature.melspectrogram(y=y, sr=DEFAULT_SAMPLE_RATE) - S_dB = librosa.power_to_db(S, ref=np.max) - plt.figure(figsize=(10, 4)) - librosa.display.specshow(S_dB, sr=DEFAULT_SAMPLE_RATE, x_axis="time", y_axis="mel") - plt.axis("off") - plt.tight_layout(pad=0) - - with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_spectrogram: - spectrogram_path = temp_spectrogram.name - plt.savefig(spectrogram_path, bbox_inches="tight", pad_inches=0, transparent=True) - plt.close() + spectrogram_path = save_linear_spectrogram(y, DEFAULT_SAMPLE_RATE, n_fft, DEFAULT_HOP_LENGTH) return audio_path, spectrogram_path def display_audio_spectrogram(audio_path): y, sr = librosa.load(audio_path, sr=None) - S = librosa.feature.melspectrogram(y=y, sr=sr) - S_dB = librosa.power_to_db(S, ref=np.max) + return save_linear_spectrogram(y, sr, DEFAULT_N_FFT, DEFAULT_HOP_LENGTH) - plt.figure(figsize=(10, 4)) - librosa.display.specshow(S_dB, sr=sr, x_axis="time", y_axis="mel") - plt.axis("off") - plt.tight_layout(pad=0) +def create_audio_from_image(image_path, sr=DEFAULT_SAMPLE_RATE): + if not image_path: + raise gr.Error("Please upload an image to encode.") - with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_spectrogram: - spectrogram_path = temp_spectrogram.name - plt.savefig(spectrogram_path, bbox_inches="tight", pad_inches=0, transparent=True) - plt.close() - return spectrogram_path - -def image_to_spectrogram_audio(image_path, sr=DEFAULT_SAMPLE_RATE): - image = Image.open(image_path).convert("L") - image = np.array(image) - y = spectrogram_image_to_audio(image, sr) + image = Image.open(image_path) + spec_image = prepare_image_spectrogram(image, DEFAULT_SPEC_HEIGHT, DEFAULT_BASE_WIDTH) + y, n_fft = spectrogram_image_to_audio(spec_image, sr) with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio: img2audio_path = temp_audio.name sf.write(img2audio_path, y, sr) - return img2audio_path -def gradio_interface_fn(text, base_width, height, max_font_size, margin, letter_spacing): - audio_path, spectrogram_path = create_audio_with_spectrogram(text, base_width, height, max_font_size, margin, letter_spacing) + spectrogram_path = save_linear_spectrogram(y, sr, n_fft, DEFAULT_HOP_LENGTH) + return img2audio_path, spectrogram_path + +def gradio_interface_fn( + text, + base_width, + height, + max_font_size, + margin, + letter_spacing, + flutter_px, + noise_amount, + noise_strength, +): + audio_path, spectrogram_path = create_audio_with_spectrogram( + text, + base_width, + height, + max_font_size, + margin, + letter_spacing, + flutter_px, + noise_amount, + noise_strength, + ) return audio_path, spectrogram_path def gradio_image_to_audio_fn(upload_image): - return image_to_spectrogram_audio(upload_image) + return create_audio_from_image(upload_image) def gradio_decode_fn(upload_audio): return display_audio_spectrogram(upload_audio) -with gr.Blocks(title="Audio Steganography", css="footer{display:none !important}", theme=gr.themes.Soft(primary_hue="green", secondary_hue="green", spacing_size="sm", radius_size="lg")) as txt2spec: - with gr.Tab("Text to Spectrogram"): +with gr.Blocks( + title="Audio Steganography", + css=( + "footer{display:none !important}" + "html,body,#root{height:100%}" + "body{overflow-y:auto !important}" + "main{overflow:visible !important}" + ".gradio-container{min-height:100vh;height:auto;overflow:visible;padding-bottom:48px}" + "#encoded-spectrogram img,#image-encoded-spectrogram img,#decoded-spectrogram img{height:auto !important;max-height:none !important;width:100%;object-fit:contain}" + "#encoded-spectrogram .image-container,#image-encoded-spectrogram .image-container,#decoded-spectrogram .image-container{height:auto !important;max-height:none !important;overflow:visible !important}" + ), + theme=gr.themes.Soft(primary_hue="green", secondary_hue="green", spacing_size="sm", radius_size="lg"), +) as txt2spec: + with gr.Tab("Text -> Audio (Encoder)"): with gr.Group(): - text = gr.Textbox(lines=2, placeholder="Enter your text:", label="Text", info="Enter the text you want to convert to audio.") + text = gr.Textbox(lines=2, placeholder="Enter your text:", label="Text", info="Encodes text into a linear spectrogram.") with gr.Row(variant="panel"): - base_width = gr.Slider(value=512, label="Image Width", visible=False) - height = gr.Slider(value=256, label="Image Height", visible=False) - max_font_size = gr.Slider(minimum=10, maximum=130, step=5, value=80, label="Font size") - margin = gr.Slider(minimum=0, maximum=50, step=1, value=10, label="Indent") - letter_spacing = gr.Slider(minimum=0, maximum=50, step=1, value=5, label="Letter spacing") - generate_button = gr.Button("Generate", variant="primary", size="lg") + base_width = gr.Slider(minimum=1, maximum=4096, step=1, value=DEFAULT_BASE_WIDTH, label="Image Width", visible=False) + height = gr.Slider(minimum=1, maximum=4097, step=1, value=DEFAULT_SPEC_HEIGHT, label="Image Height", visible=False) + max_font_size = gr.Slider(minimum=10, maximum=DEFAULT_FONT_SIZE_CAP, step=1, value=DEFAULT_FONT_SIZE, label="Font size (max)") + margin = gr.Slider(minimum=0, maximum=200, step=1, value=DEFAULT_MARGIN, label="Indent") + letter_spacing = gr.Slider(minimum=0, maximum=200, step=1, value=DEFAULT_LETTER_SPACING, label="Letter spacing") + flutter_px = gr.Slider(minimum=0, maximum=20, step=1, value=DEFAULT_FLUTTER_PX, label="Text flutter (px)") + noise_amount = gr.Slider(minimum=0.0, maximum=0.2, step=0.01, value=DEFAULT_NOISE_AMOUNT, label="Text noise") + noise_strength = gr.Slider(minimum=0.0, maximum=0.6, step=0.05, value=DEFAULT_NOISE_STRENGTH, label="Noise strength") + generate_button = gr.Button("Encode text", variant="primary", size="lg") with gr.Column(variant="panel"): with gr.Group(): - output_audio = gr.Audio(type="filepath", label="Generated audio") - output_spectrogram = gr.Image(type="filepath", label="Spectrogram") - - generate_button.click(gradio_interface_fn, inputs=[text, base_width, height, max_font_size, margin, letter_spacing], outputs=[output_audio, output_spectrogram]) - - with gr.Tab("Image to Spectrogram"): + output_audio = gr.Audio(type="filepath", label="Encoded audio") + output_spectrogram = gr.Image(type="filepath", label="Encoded spectrogram (preview)", elem_id="encoded-spectrogram") + + generate_button.click( + gradio_interface_fn, + inputs=[ + text, + base_width, + height, + max_font_size, + margin, + letter_spacing, + flutter_px, + noise_amount, + noise_strength, + ], + outputs=[output_audio, output_spectrogram], + ) + + with gr.Tab("Image -> Audio (Encoder)"): with gr.Group(): with gr.Column(): - upload_image = gr.Image(type="filepath", label="Upload image") - convert_button = gr.Button("Convert to audio", variant="primary", size="lg") + upload_image = gr.Image(type="filepath", label="Upload image (auto-normalize + auto-invert)") + convert_button = gr.Button("Encode image", variant="primary", size="lg") with gr.Column(variant="panel"): - output_audio_from_image = gr.Audio(type="filepath", label="Generated audio") + output_audio_from_image = gr.Audio(type="filepath", label="Encoded audio") + output_image_spectrogram = gr.Image(type="filepath", label="Encoded spectrogram (preview)", elem_id="image-encoded-spectrogram") - convert_button.click(gradio_image_to_audio_fn, inputs=[upload_image], outputs=[output_audio_from_image]) + convert_button.click( + gradio_image_to_audio_fn, + inputs=[upload_image], + outputs=[output_audio_from_image, output_image_spectrogram], + ) - with gr.Tab("Audio Spectrogram"): + with gr.Tab("Audio -> Spectrogram (Decoder)"): with gr.Group(): with gr.Column(): upload_audio = gr.Audio(type="filepath", label="Upload audio", scale=3) - decode_button = gr.Button("Show spectrogram", variant="primary", size="lg") + decode_button = gr.Button("Decode spectrogram", variant="primary", size="lg") with gr.Column(variant="panel"): - decoded_image = gr.Image(type="filepath", label="Audio Spectrogram") + decoded_image = gr.Image(type="filepath", label="Decoded spectrogram", elem_id="decoded-spectrogram") decode_button.click(gradio_decode_fn, inputs=[upload_audio], outputs=[decoded_image])