Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion TranscribeHttp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def _cfg() -> SpeechConfig:
return SpeechConfig(
key=os.environ["SPEECH_KEY"],
endpoint=endpoint.rstrip("/"),
api_version=os.environ.get("SPEECH_API_VERSION", "2025-10-15"),
api_version=os.environ.get("SPEECH_API_VERSION", "2024-11-15"),
)

def main(req: func.HttpRequest) -> func.HttpResponse:
Expand Down
132 changes: 132 additions & 0 deletions add_url_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""
Add URL tracking fields to Azure Search index
"""

import os
import requests
import sys

# Load configuration from the Streamlit UI's .env file (ui/.env, resolved
# relative to this script) instead of the process environment.
env_path = os.path.join(os.path.dirname(__file__), "ui", ".env")
env_vars = {}

if os.path.exists(env_path):
    with open(env_path) as f:
        for line in f:
            # Parse only KEY=VALUE lines; skip comment lines.
            # NOTE(review): a line with leading whitespace before '#' would
            # still be parsed, and surrounding quotes are kept as part of the
            # value — confirm ui/.env is written without either.
            if '=' in line and not line.startswith('#'):
                key, value = line.strip().split('=', 1)
                env_vars[key] = value

SEARCH_ENDPOINT = env_vars.get("SEARCH_ENDPOINT")
SEARCH_KEY = env_vars.get("SEARCH_KEY")
SEARCH_INDEX_NAME = env_vars.get("SEARCH_INDEX_NAME", "segments")

print(f"Endpoint: {SEARCH_ENDPOINT}")
print(f"Index: {SEARCH_INDEX_NAME}")
# Mask the admin key: show asterisks plus only its last 4 characters.
print(f"Key: {'*' * 10}{SEARCH_KEY[-4:] if SEARCH_KEY else 'NOT FOUND'}")
print()

# Both values are mandatory; abort early with a clear message if absent.
if not SEARCH_ENDPOINT or not SEARCH_KEY:
    print("ERROR: Missing SEARCH_ENDPOINT or SEARCH_KEY in .env")
    sys.exit(1)

# Azure AI Search REST API version used for all requests below.
API_VERSION = "2024-07-01"

def get_index():
    """Fetch the current index definition from Azure AI Search.

    Returns:
        The index definition as a dict on HTTP 200, or None on any other
        status (the status code and response body are printed for diagnosis).
    """
    url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version={API_VERSION}"
    headers = {"api-key": SEARCH_KEY}

    print(f"Fetching index: {url}")
    # timeout added: without one, requests can block forever on a hung
    # connection (the Box script in this repo already passes timeouts).
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code == 200:
        return response.json()

    print(f"Failed to get index: {response.status_code}")
    print(response.text)
    return None

def update_index(index_def):
    """PUT the (modified) index definition back to Azure AI Search.

    Args:
        index_def: Full index definition dict, as returned by get_index(),
            with any new fields already appended.

    Returns:
        True if the service accepted the update (HTTP 200 or 201),
        False otherwise (status and body are printed for diagnosis).
    """
    url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version={API_VERSION}"
    headers = {
        "Content-Type": "application/json",
        "api-key": SEARCH_KEY
    }

    # timeout added: without one, requests can block forever on a hung
    # connection (the Box script in this repo already passes timeouts).
    response = requests.put(url, headers=headers, json=index_def, timeout=30)

    if response.status_code in (200, 201):
        print("✅ Index updated successfully!")
        return True

    print(f"❌ Failed to update: {response.status_code}")
    print(response.text)
    return False

def main():
    """Add URL-tracking fields (source_url, source_type, processed_at) to the
    Azure Search index, skipping any that already exist."""
    print("Fetching current index...")
    index = get_index()
    if not index:
        sys.exit(1)

    present = {field_def["name"] for field_def in index.get("fields", [])}
    print(f"Existing fields: {present}")
    print()

    def _field(name, edm_type, *, sortable=False, facetable=False):
        # Every new field shares the same base shape: filterable and
        # retrievable, but never searchable and never the key.
        return {
            "name": name,
            "type": edm_type,
            "searchable": False,
            "filterable": True,
            "retrievable": True,
            "sortable": sortable,
            "facetable": facetable,
            "key": False,
        }

    candidates = [
        _field("source_url", "Edm.String"),
        _field("source_type", "Edm.String", facetable=True),
        _field("processed_at", "Edm.DateTimeOffset", sortable=True),
    ]

    appended = 0
    for candidate in candidates:
        if candidate["name"] in present:
            print(f"⚠️ Already exists: {candidate['name']}")
            continue
        print(f"➕ Adding: {candidate['name']}")
        index["fields"].append(candidate)
        appended += 1

    if not appended:
        print("\n✅ All fields already present!")
        return

    print(f"\n💾 Saving with {appended} new fields...")
    if update_index(index):
        print("\n🎉 SUCCESS! URL tracking fields added.")
        print("\nNext steps:")
        print("1. Restart your Streamlit app")
        print("2. Go to 'System Diagnostics' page")
        print("3. Click 'Check Index Schema' to verify")

# Run only when executed directly, not when imported as a module.
if __name__ == "__main__":
    main()
138 changes: 82 additions & 56 deletions scripts/box_shared_folder_manifest.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,17 @@
"""
scripts/box_shared_folder_manifest.py - Generate Video Manifest from Box

This script enumerates .m4a video files from a Box shared folder and generates
a manifest file (videos.jsonl) that lists all videos with their IDs and media URLs.
It creates open shared links for each file so Azure Speech Service can access them.

Architecture Role:
- Pre-processing step before video ingestion
- Generates videos.jsonl input file for import_videos.py
- Handles Box API authentication and folder traversal
- Creates publicly accessible download URLs for Speech Service

Usage:
python scripts/box_shared_folder_manifest.py

Output:
- videos.jsonl: One JSON object per line with video_id and media_url

Configuration (via .env):
- BOX_SHARED_FOLDER_URL: Box shared folder link
- BOX_TOKEN or BOX_ACCESS_TOKEN/BOX_REFRESH_TOKEN: Box authentication
- OUT_PATH: Output file path (default: videos.jsonl)
- RECURSIVE: Whether to traverse subfolders (default: 1)
"""

import json
import os
import requests
import time
from typing import Dict, Any, List, Optional

from box_auth import get_access_token

BOX_API = "https://api.box.com/2.0"

# Defect fixed: the pasted diff interleaved the removed and added versions of
# the SHARED_FOLDER_URL assignment, leaving a duplicate line; keep only the
# post-diff version.
# Box shared-folder link of the form https://...box.com/s/<id>; required.
SHARED_FOLDER_URL = os.environ["BOX_SHARED_FOLDER_URL"]
print("BOX_SHARED_FOLDER_URL =", SHARED_FOLDER_URL)
# Manifest output path; one JSON object per line.
OUT_PATH = os.environ.get("OUT_PATH", "videos.jsonl")
# Traverse subfolders when RECURSIVE=1 (the default).
RECURSIVE = os.environ.get("RECURSIVE", "1") == "1"
Expand Down Expand Up @@ -63,7 +41,6 @@ def resolve_shared_folder(token: str) -> Dict[str, Any]:
return r.json()



def list_folder_items(token: str, folder_id: str, limit: int = 1000, offset: int = 0) -> Dict[str, Any]:
url = f"{BOX_API}/folders/{folder_id}/items"
params = {"limit": limit, "offset": offset}
Expand All @@ -72,31 +49,48 @@ def list_folder_items(token: str, folder_id: str, limit: int = 1000, offset: int
return r.json()


def ensure_open_shared_link_for_file(token: str, file_id: str, max_retries: int = 3) -> Optional[str]:
    """Ensure the Box file has an open shared link and return a usable URL.

    Retries on timeouts with exponential backoff; skips (returns None) on
    404s and on persistent timeouts so a single bad file cannot abort the
    whole manifest run.

    Args:
        token: Box API access token.
        file_id: Box file ID to create/fetch the shared link for.
        max_retries: Total attempts before giving up on a timeout.

    Returns:
        The direct download URL if Box provides one, else the shared-link
        page URL (may require cookies), or None when the file is skipped.

    Raises:
        RuntimeError: Box returned a 2xx response without a shared_link.
        requests.exceptions.HTTPError: non-404 HTTP errors are re-raised.
    """
    url = f"{BOX_API}/files/{file_id}"
    payload = {"shared_link": {"access": "open"}}
    # IMPORTANT: ask Box to include shared_link in the response body.
    params = {"fields": "shared_link"}

    for attempt in range(max_retries):
        try:
            # 60s timeout: large media files can make this call slow.
            r = requests.put(url, headers=auth_headers(token), params=params, json=payload, timeout=60)

            if r.status_code == 404:
                print(f"⚠️ Skipping file {file_id} (not found)")
                return None

            r.raise_for_status()
            data = r.json()
            sl = data.get("shared_link") or {}

            # Prefer the direct static download URL (fetchable without auth).
            dl = sl.get("download_url")
            if dl:
                return dl
            # Fallback: the shared-link page URL (may require cookies).
            if sl.get("url"):
                return sl["url"]

            raise RuntimeError(f"No shared_link returned for file {file_id}: {data}")

        except requests.exceptions.Timeout:
            if attempt < max_retries - 1:
                # Exponential backoff: with max_retries=3 this waits 1s, then
                # 2s (2**attempt) between attempts.
                wait_time = 2 ** attempt
                print(f"⏱️ Timeout on file {file_id}, retrying in {wait_time}s ({attempt + 1}/{max_retries})...")
                time.sleep(wait_time)
            else:
                print(f"⚠️ Skipping file {file_id} (timeout after {max_retries} attempts)")
                return None
        except requests.exceptions.HTTPError as e:
            # raise_for_status() raised; treat a 404 as skippable, re-raise
            # everything else for the caller to see.
            if e.response.status_code == 404:
                print(f"⚠️ Skipping file {file_id} (not found)")
                return None
            raise


def walk(token: str, folder_id: str) -> List[Dict[str, Any]]:
Expand All @@ -114,6 +108,22 @@ def walk(token: str, folder_id: str) -> List[Dict[str, Any]]:
return items


def load_existing_entries() -> set:
    """Return the set of Box file IDs already recorded in OUT_PATH.

    Used to resume an interrupted run: entries whose video_id carries the
    "vid_" prefix contribute their bare file ID; malformed lines are ignored.
    """
    seen: set = set()
    if not os.path.exists(OUT_PATH):
        return seen

    with open(OUT_PATH, "r", encoding="utf-8") as manifest:
        for raw in manifest:
            try:
                record = json.loads(raw.strip())
            except json.JSONDecodeError:
                # Tolerate partially written / corrupt lines.
                continue
            vid = record.get("video_id", "")
            if vid.startswith("vid_"):
                seen.add(vid[len("vid_"):])
    return seen


def main():
token = get_access_token()

Expand All @@ -122,10 +132,16 @@ def main():
raise RuntimeError(f"Shared link did not resolve to a folder: {shared_folder.get('type')}")
root_id = shared_folder["id"]

# Load already processed files to resume
processed_ids = load_existing_entries()
print(f"Resuming: {len(processed_ids)} files already processed")

queue = [root_id]
out_count = 0
out_count = len(processed_ids)
skipped = 0
new_files = 0

with open(OUT_PATH, "w", encoding="utf-8") as f:
with open(OUT_PATH, "a", encoding="utf-8") as f:
while queue:
fid = queue.pop(0)
entries = walk(token, fid)
Expand All @@ -141,26 +157,36 @@ def main():

if et != "file":
continue

if not lname.endswith(".m4a"):
continue

file_id = e["id"]

# Skip if already processed
if file_id in processed_ids:
continue

video_id = f"vid_{file_id}"

# Make a per-file open shared link (Speech can fetch without auth).
# If your org disallows open links, this will fail — then you’ll need Blob staging.
file_link = ensure_open_shared_link_for_file(token, file_id)

if file_link is None:
skipped += 1
continue

# Encourage direct download behavior
media_url = file_link # + ("?download=1" if "?" not in file_link else "&download=1")

media_url = file_link
f.write(json.dumps({"video_id": video_id, "media_url": media_url}) + "\n")
f.flush() # Ensure write is saved immediately
out_count += 1
new_files += 1

if out_count % 10 == 0:
print(f"Wrote {out_count} entries...")
print(f"Wrote {out_count} entries total...")

# Small delay to avoid rate limiting
time.sleep(0.2)

print(f"Done. Wrote {out_count} m4a entries to {OUT_PATH}")
print(f"Done. Total: {out_count} entries (new: {new_files}, skipped: {skipped})")


if __name__ == "__main__":
Expand Down
Loading