-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathphase2.py
More file actions
72 lines (57 loc) · 2.87 KB
/
phase2.py
File metadata and controls
72 lines (57 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import asyncio
import edge_tts
import random
import json
# Candidate voices for the "F" voice profile; one is picked at random per run.
WOMAN_VOICE_LIST = [
    "en-US-JennyNeural",
    "en-US-MichelleNeural",
    "en-US-AriaNeural",
]

# --- THE SYNC DIAL ---
# Shift, in seconds, added to every subtitle timestamp to compensate for the
# small delay that MP3 compression introduces in the audio file.
#   * Text shows up too EARLY -> raise this value (e.g., 0.2).
#   * Text shows up too LATE  -> lower it, negative if needed (e.g., -0.1).
SYNC_OFFSET = -0.3
def _shifted_window(chunk):
    """Return a boundary chunk's ``(start, end)`` in seconds, SYNC_OFFSET applied.

    edge-tts reports ``offset`` and ``duration`` in 100-nanosecond ticks,
    hence the division by 10,000,000 to obtain seconds.
    """
    start = (chunk["offset"] / 10000000.0) + SYNC_OFFSET
    end = ((chunk["offset"] + chunk["duration"]) / 10000000.0) + SYNC_OFFSET
    return start, end


def _timestamp(seconds):
    """Floor a shifted time at zero and round it to millisecond precision.

    A negative SYNC_OFFSET would otherwise push boundaries near the start of
    the audio below 0.0, which is not a valid position on the audio timeline.
    """
    return round(max(0.0, seconds), 3)


async def generate_audio_and_subs(text: str, output_audio_path: str, output_json_path: str, gender: str = "M"):
    """Synthesize ``text`` to an audio file and save per-word timings as JSON.

    The edge-tts stream is consumed once: audio chunks are written to
    ``output_audio_path`` while boundary events are collected as timing
    entries of the form ``{"word": str, "start": float, "end": float}``
    (seconds, shifted by ``SYNC_OFFSET``, never negative, rounded to 3
    decimals).

    Args:
        text: The text to speak.
        output_audio_path: Destination of the audio bytes (opened in "wb" mode).
        output_json_path: Destination of the JSON list of timing entries.
        gender: "F" picks a random voice from ``WOMAN_VOICE_LIST``; any other
            value selects "en-US-ChristopherNeural".

    Returns:
        None. The results are the two files written to disk.
    """
    print(f"COMMENCING AUDIO GENERATION (Voice Profile: {gender})...")
    voice_model = random.choice(WOMAN_VOICE_LIST) if gender == "F" else "en-US-ChristopherNeural"
    communicate = edge_tts.Communicate(text, voice_model)

    word_boundaries = []
    sentence_fallback = []

    with open(output_audio_path, "wb") as audio_file:
        async for chunk in communicate.stream():
            chunk_type = chunk["type"]

            if chunk_type == "audio":
                audio_file.write(chunk["data"])

            # 1. THE PRIMARY ENGINE (With Sync Offset)
            elif chunk_type == "WordBoundary":
                start_time, end_time = _shifted_window(chunk)
                word_boundaries.append({
                    "word": chunk["text"],
                    "start": _timestamp(start_time),
                    "end": _timestamp(end_time),
                })

            # 2. THE HEURISTIC SHIELD (With Sync Offset)
            elif chunk_type == "SentenceBoundary":
                start_time, end_time = _shifted_window(chunk)
                words = chunk["text"].split()
                if not words:
                    continue
                # No per-word timing is available here, so the sentence span
                # is divided evenly between its whitespace-separated words.
                # Clamping happens per word (after interpolation) so words
                # past t=0 keep exactly the timing they had before clamping.
                per_word = (end_time - start_time) / len(words)
                for i, word in enumerate(words):
                    sentence_fallback.append({
                        "word": word,
                        "start": _timestamp(start_time + (i * per_word)),
                        "end": _timestamp(start_time + ((i + 1) * per_word)),
                    })

    # 3. THE LOGIC GATE
    # Word-level events are preferred; the sentence-derived estimate is used
    # only when the stream delivered no WordBoundary events at all.
    if not word_boundaries:
        print("WARNING: Upstream API dropped WordBoundary packets. Deploying heuristic fallback.")
        final_telemetry = sentence_fallback
    else:
        final_telemetry = sorted(word_boundaries, key=lambda x: x["start"])

    # 4. COMMIT TO DISK
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(final_telemetry, json_file, indent=4)

    print(f"SUCCESS: Saved {output_audio_path} and precise telemetry to {output_json_path}")