diff --git a/.gitignore b/.gitignore index c780f7d..d7b3df0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ __pycache__/ *.pyc -.env \ No newline at end of file +.env +venv +.venv +podcast_script.txt +final_podcast.wav +tmp*/ diff --git a/generate_audio.py b/generate_audio.py index 11c9ddc..8e1d4c7 100644 --- a/generate_audio.py +++ b/generate_audio.py @@ -6,11 +6,13 @@ from audio_processor import AudioGenerator from dotenv import load_dotenv from pydub import AudioSegment +import re load_dotenv() VOICE_A = os.getenv('VOICE_A', 'Puck') VOICE_B = os.getenv('VOICE_B', 'Kore') +VOICE_C = os.getenv('VOICE_C', 'Charon') def parse_conversation(file_path): with open(file_path, 'r', encoding='utf-8') as file: @@ -19,15 +21,18 @@ def parse_conversation(file_path): lines = content.strip().split('\n') speaker_a_lines = [] speaker_b_lines = [] + speaker_c_lines = [] - for line in lines: + for index, line in enumerate(lines, start=0): if line.strip(): if line.startswith("Speaker A:"): - speaker_a_lines.append(line.replace("Speaker A:", "").strip()) + speaker_a_lines.append(line.replace("Speaker A:", f"{index}|").strip()) elif line.startswith("Speaker B:"): - speaker_b_lines.append(line.replace("Speaker B:", "").strip()) + speaker_b_lines.append(line.replace("Speaker B:", f"{index}|").strip()) + elif line.startswith("Speaker C:"): + speaker_c_lines.append(line.replace("Speaker C:", f"{index}|").strip()) - return speaker_a_lines, speaker_b_lines + return speaker_a_lines, speaker_b_lines, speaker_c_lines def read_file_content(file_path): with open(file_path, 'r', encoding='utf-8') as file: @@ -40,19 +45,26 @@ async def setup_environment(): def read_and_parse_inputs(): system_instructions = read_file_content('system_instructions_audio.txt') full_script = read_file_content('podcast_script.txt') - speaker_a_lines, speaker_b_lines = parse_conversation('podcast_script.txt') - return system_instructions, full_script, speaker_a_lines, speaker_b_lines + speaker_a_lines, 
speaker_b_lines, speaker_c_lines = parse_conversation('podcast_script.txt') + return system_instructions, full_script, speaker_a_lines, speaker_b_lines, speaker_c_lines def prepare_speaker_dialogues(system_instructions, full_script, speaker_lines, voice, temp_dir): dialogues = [system_instructions + "\n\n" + full_script] output_files = [os.path.join(temp_dir, f"speaker_{voice}_initial.wav")] for i, line in enumerate(speaker_lines): - dialogues.append(line) - output_files.append(os.path.join(temp_dir, f"speaker_{voice}_{i}.wav")) + line_num, line_dialog = get_line_number(line) + dialogues.append(line_dialog) + output_files.append(os.path.join(temp_dir, f"{line_num}_speaker_{voice}.wav")) return dialogues, output_files +def get_line_number(line): + match = re.match(r"(\d+)\|(.*)", line) + if match: + return int(match.group(1)), match.group(2).strip() + return None, line + async def process_speaker(voice, dialogues, output_files): # Create a single generator for all dialogues generator = AudioGenerator(voice) @@ -64,20 +76,17 @@ async def process_speaker(voice, dialogues, output_files): if generator.ws: await generator.ws.close() -def interleave_output_files(speaker_a_files, speaker_b_files): - """Interleaves the audio files from both speakers to maintain conversation order""" - all_output_files = [] - min_length = min(len(speaker_a_files), len(speaker_b_files)) - - # Interleave files from both speakers - for i in range(min_length): - all_output_files.extend([speaker_a_files[i], speaker_b_files[i]]) - - # Add any remaining files from either speaker - all_output_files.extend(speaker_a_files[min_length:]) - all_output_files.extend(speaker_b_files[min_length:]) +def extract_line_num(filename): + match = re.search(r"(\d+)_speaker_.*\.wav", filename) + if match: + return int(match.group(1)) + return float('inf') # Return a large number if no match is found - return all_output_files +def interleave_output_files(speaker_a_files, speaker_b_files, speaker_c_files): + 
"""Interleaves the audio files from all speakers to maintain conversation order""" + all_files = speaker_a_files + speaker_b_files + speaker_c_files + all_files.sort(key=extract_line_num) + return all_files def combine_audio_files(file_list, output_file, silence_duration_ms=50): combined = AudioSegment.empty() @@ -95,13 +104,15 @@ async def main(): script_dir = await setup_environment() with tempfile.TemporaryDirectory(dir=script_dir) as temp_dir: - system_instructions, full_script, speaker_a_lines, speaker_b_lines = read_and_parse_inputs() + system_instructions, full_script, speaker_a_lines, speaker_b_lines, speaker_c_lines = read_and_parse_inputs() # Prepare dialogues for both speakers dialogues_a, output_files_a = prepare_speaker_dialogues( system_instructions, full_script, speaker_a_lines, VOICE_A, temp_dir) dialogues_b, output_files_b = prepare_speaker_dialogues( system_instructions, full_script, speaker_b_lines, VOICE_B, temp_dir) + dialogues_c, output_files_c = prepare_speaker_dialogues( + system_instructions, full_script, speaker_c_lines, VOICE_C, temp_dir) # Process Speaker A first print("Processing Speaker A...") @@ -111,8 +122,13 @@ async def main(): print("Processing Speaker B...") await process_speaker(VOICE_B, dialogues_b, output_files_b) + # Then process Speaker C + print("Processing Speaker C...") + await process_speaker(VOICE_C, dialogues_c, output_files_c) + + # Interleave and combine audio as before - all_output_files = interleave_output_files(output_files_a[1:], output_files_b[1:]) + all_output_files = interleave_output_files(output_files_a[1:], output_files_b[1:], output_files_c[1:]) final_output = "final_podcast.wav" combine_audio_files(all_output_files, final_output, silence_duration_ms=50) print(f"\nFinal podcast audio created: {final_output}") diff --git a/generate_script.py b/generate_script.py index 2a623be..2ab3ee8 100644 --- a/generate_script.py +++ b/generate_script.py @@ -132,7 +132,7 @@ def create_podcast_script(content): def 
clean_podcast_script(script): # Define a regex pattern to identify the start of the podcast text - podcast_start_pattern = r"^(Speaker A:|Speaker B:)" + podcast_start_pattern = r"^(Speaker A:|Speaker B:|Speaker C:)" # Split the script into lines lines = script.splitlines() diff --git a/system_instructions_script.txt b/system_instructions_script.txt index aadac23..b8928eb 100644 --- a/system_instructions_script.txt +++ b/system_instructions_script.txt @@ -4,6 +4,7 @@ - Output must ONLY contain dialogue in the following format: Speaker A: [dialogue text] Speaker B: [dialogue text] +Speaker C: [dialogue text] - No other formatting, headers, or content should be included - No blank lines between speaker turns @@ -13,7 +14,8 @@ Speaker B: [dialogue text] -- Speaker labels must be exactly "Speaker A:" and "Speaker B:" +- Speaker labels must be exactly "Speaker A:", "Speaker B:", or "Speaker C:" -- Create fun conversations between two hosts (Speaker A and B) with distinct personalities +- Create fun conversations between three hosts (mostly Speaker A and B with C adding colorful commentary from time to time) with distinct personalities +- Speaker C does not have to speak every turn - Include casual banter and appropriate humor to keep tone light - Use storytelling techniques with examples and real-world scenarios - Add occasional playful disagreements or friendly debates @@ -31,18 +33,44 @@ Speaker B: [dialogue text] Speaker A: [dialogue] Speaker B: [dialogue] +Speaker C: [dialogue] -Speaker A: -- Inquisitive and curious -- Asks clarifying questions -- Drives conversation forward +Speaker A (Host): +- **Inquisitive and Curious**: Constantly seeks to understand the latest advancements in technology and innovation, often expressing excitement about emerging trends. +- **Asks Clarifying Questions**: Engages actively in conversations by asking thoughtful and probing questions that encourage deeper discussion and ensure clarity. 
+- **Drives Conversation Forward**: Acts as a catalyst in discussions, steering the conversation towards new topics and encouraging others to share their insights. +- **Enthusiastic and Passionate**: Exhibits a genuine passion for technology and science, which is infectious and encourages others to engage in the conversation. +- **Open-Minded**: Welcomes diverse perspectives and is willing to explore ideas that may differ from their own, fostering a collaborative atmosphere. +- **Adaptable**: Easily adjusts to the flow of the conversation, ready to pivot to new subjects or angles as discussions evolve. +- **Empathetic Listener**: Demonstrates active listening skills, showing understanding and consideration for others' viewpoints, which builds rapport. +- **Analytical Thinker**: Approaches topics with a critical eye, breaking down information to understand underlying principles and implications. +- **Engaging Storyteller**: Uses anecdotes and personal experiences to illustrate points, making the conversation relatable and memorable. +- **Motivational**: Encourages others to share their knowledge and experiences, creating a positive environment that promotes learning and growth. -Speaker B: -- Explanatory and insightful -- Builds on discussion points -- Provides detailed responses -- Uses analogies to simplify and clarify complex concepts +Speaker B (Host): +- **Explanatory and Insightful**: Provides comprehensive and well-thought-out explanations, drawing from a rich background in technology and its implications. +- **Builds on Discussion Points**: Connects ideas and themes introduced by others, enhancing the depth of the conversation and creating a cohesive narrative. +- **Provides Detailed Responses**: Delivers in-depth answers that address various aspects of a question, ensuring that no critical information is overlooked. 
+- **Uses Analogies to Simplify and Clarify Complex Concepts**: Breaks down intricate ideas into relatable examples, making complex subjects more approachable. +- **Patient and Thorough**: Takes the time to ensure that explanations are clear and comprehensive, accommodating different levels of understanding among listeners. +- **Knowledgeable**: Possesses a deep understanding of technology, science, and media, which informs their contributions and enriches the discussion. +- **Critical Thinker**: Analyzes information thoughtfully, weighing pros and cons to provide balanced viewpoints on issues. +- **Communicative**: Clearly articulates thoughts and ideas in a way that is engaging and easy to follow, facilitating understanding. +- **Supportive**: Encourages questions and discussions, making others feel comfortable sharing their thoughts and concerns without judgment. +- **Visionary**: Offers forward-thinking perspectives on technology and its potential impact on society, inspiring others to think about the future of innovation. + +Speaker C (Comedian): +- **Witty and Observational**: Known for sharp humor and keen observations about everyday life, making relatable comments that resonate with audiences. +- **Playful and Light-hearted**: Approaches conversations with a sense of fun, infusing humor into discussions to keep things engaging and enjoyable. +- **Master of Timing**: Understands when to deliver punchlines or comedic insights, effectively using pauses and pacing to enhance humor. +- **Self-deprecating**: Often uses their own experiences and flaws as comedic material, creating a relatable persona that audiences can connect with. +- **Clever and Quick-thinking**: Responds with humor in real-time, able to think on their feet and turn any topic into a comedic opportunity. +- **Storyteller**: Crafts narratives around everyday situations, using humor to highlight the absurdities of life and make points more memorable. 
+- **Observant and Detail-oriented**: Notices small details in conversations and daily life that others might overlook, using them as fodder for jokes. +- **Engaging Performer**: Uses body language, facial expressions, and vocal variety to enhance their comedic delivery, making performances dynamic. +- **Non-confrontational**: Keeps conversations light and avoids heavy topics unless they can be presented in a humorous way, ensuring a positive atmosphere. +- **Relatable**: Builds rapport with audiences by discussing common experiences and shared frustrations, making them feel understood and included in the humor. 1. Strategic Pause Points: @@ -74,12 +102,12 @@ Speaker B: - Output format must be plain text only -- Each line must start with either "Speaker A:" or "Speaker B:" +- Each line must start with either "Speaker A:" or "Speaker B:" or "Speaker C:" - No empty lines or additional formatting - No music references - Accept content from various formats (PDF, URL, text, Markdown) - Introduce technical terms naturally -- Maintain 30-40 exchanges +- Maintain 40-50 exchanges - Keep responses to 2-3 sentences per turn