Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion TranscribeHttp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def _cfg() -> SpeechConfig:
return SpeechConfig(
key=os.environ["SPEECH_KEY"],
endpoint=endpoint.rstrip("/"),
api_version=os.environ.get("SPEECH_API_VERSION", "2025-10-15"),
api_version=os.environ.get("SPEECH_API_VERSION", "2024-11-15"),
)

def main(req: func.HttpRequest) -> func.HttpResponse:
Expand Down
132 changes: 132 additions & 0 deletions add_url_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""
Add URL tracking fields to Azure Search index
"""

import os
import requests
import sys

# Load configuration from the Streamlit UI's .env file (ui/.env, resolved
# relative to this script) instead of the process environment.
env_path = os.path.join(os.path.dirname(__file__), "ui", ".env")
env_vars = {}

if os.path.exists(env_path):
    with open(env_path) as f:
        for line in f:
            # Parse only KEY=VALUE lines; skip comment lines.
            # NOTE(review): a line with leading whitespace before '#' would
            # still be parsed, and surrounding quotes are kept as part of the
            # value — confirm ui/.env is written without either.
            if '=' in line and not line.startswith('#'):
                key, value = line.strip().split('=', 1)
                env_vars[key] = value

SEARCH_ENDPOINT = env_vars.get("SEARCH_ENDPOINT")
SEARCH_KEY = env_vars.get("SEARCH_KEY")
SEARCH_INDEX_NAME = env_vars.get("SEARCH_INDEX_NAME", "segments")

print(f"Endpoint: {SEARCH_ENDPOINT}")
print(f"Index: {SEARCH_INDEX_NAME}")
# Mask the admin key: show asterisks plus only its last 4 characters.
print(f"Key: {'*' * 10}{SEARCH_KEY[-4:] if SEARCH_KEY else 'NOT FOUND'}")
print()

# Both values are mandatory; abort early with a clear message if absent.
if not SEARCH_ENDPOINT or not SEARCH_KEY:
    print("ERROR: Missing SEARCH_ENDPOINT or SEARCH_KEY in .env")
    sys.exit(1)

# Azure AI Search REST API version used for all requests below.
API_VERSION = "2024-07-01"

def get_index():
    """Fetch the current index definition from Azure AI Search.

    Returns:
        The index definition as a dict on HTTP 200, or None on any other
        status (the status code and response body are printed for diagnosis).
    """
    url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version={API_VERSION}"
    headers = {"api-key": SEARCH_KEY}

    print(f"Fetching index: {url}")
    # timeout added: without one, requests can block forever on a hung
    # connection (the Box script in this repo already passes timeouts).
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code == 200:
        return response.json()

    print(f"Failed to get index: {response.status_code}")
    print(response.text)
    return None

def update_index(index_def):
    """PUT the (modified) index definition back to Azure AI Search.

    Args:
        index_def: Full index definition dict, as returned by get_index(),
            with any new fields already appended.

    Returns:
        True if the service accepted the update (HTTP 200 or 201),
        False otherwise (status and body are printed for diagnosis).
    """
    url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version={API_VERSION}"
    headers = {
        "Content-Type": "application/json",
        "api-key": SEARCH_KEY
    }

    # timeout added: without one, requests can block forever on a hung
    # connection (the Box script in this repo already passes timeouts).
    response = requests.put(url, headers=headers, json=index_def, timeout=30)

    if response.status_code in (200, 201):
        print("✅ Index updated successfully!")
        return True

    print(f"❌ Failed to update: {response.status_code}")
    print(response.text)
    return False

def main():
    """Add URL-tracking fields (source_url, source_type, processed_at) to the
    Azure Search index, skipping any that already exist."""
    print("Fetching current index...")
    index = get_index()
    if not index:
        sys.exit(1)

    present = {field_def["name"] for field_def in index.get("fields", [])}
    print(f"Existing fields: {present}")
    print()

    def _field(name, edm_type, *, sortable=False, facetable=False):
        # Every new field shares the same base shape: filterable and
        # retrievable, but never searchable and never the key.
        return {
            "name": name,
            "type": edm_type,
            "searchable": False,
            "filterable": True,
            "retrievable": True,
            "sortable": sortable,
            "facetable": facetable,
            "key": False,
        }

    candidates = [
        _field("source_url", "Edm.String"),
        _field("source_type", "Edm.String", facetable=True),
        _field("processed_at", "Edm.DateTimeOffset", sortable=True),
    ]

    appended = 0
    for candidate in candidates:
        if candidate["name"] in present:
            print(f"⚠️ Already exists: {candidate['name']}")
            continue
        print(f"➕ Adding: {candidate['name']}")
        index["fields"].append(candidate)
        appended += 1

    if not appended:
        print("\n✅ All fields already present!")
        return

    print(f"\n💾 Saving with {appended} new fields...")
    if update_index(index):
        print("\n🎉 SUCCESS! URL tracking fields added.")
        print("\nNext steps:")
        print("1. Restart your Streamlit app")
        print("2. Go to 'System Diagnostics' page")
        print("3. Click 'Check Index Schema' to verify")

# Run only when executed directly, not when imported as a module.
if __name__ == "__main__":
    main()
138 changes: 82 additions & 56 deletions scripts/box_shared_folder_manifest.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,17 @@
"""
scripts/box_shared_folder_manifest.py - Generate Video Manifest from Box

This script enumerates .m4a video files from a Box shared folder and generates
a manifest file (videos.jsonl) that lists all videos with their IDs and media URLs.
It creates open shared links for each file so Azure Speech Service can access them.

Architecture Role:
- Pre-processing step before video ingestion
- Generates videos.jsonl input file for import_videos.py
- Handles Box API authentication and folder traversal
- Creates publicly accessible download URLs for Speech Service

Usage:
python scripts/box_shared_folder_manifest.py

Output:
- videos.jsonl: One JSON object per line with video_id and media_url

Configuration (via .env):
- BOX_SHARED_FOLDER_URL: Box shared folder link
- BOX_TOKEN or BOX_ACCESS_TOKEN/BOX_REFRESH_TOKEN: Box authentication
- OUT_PATH: Output file path (default: videos.jsonl)
- RECURSIVE: Whether to traverse subfolders (default: 1)
"""

import json
import os
import requests
import time
from typing import Dict, Any, List, Optional

from box_auth import get_access_token

BOX_API = "https://api.box.com/2.0"

# Defect fixed: the pasted diff interleaved the removed and added versions of
# the SHARED_FOLDER_URL assignment, leaving a duplicate line; keep only the
# post-diff version.
# Box shared-folder link of the form https://...box.com/s/<id>; required.
SHARED_FOLDER_URL = os.environ["BOX_SHARED_FOLDER_URL"]
print("BOX_SHARED_FOLDER_URL =", SHARED_FOLDER_URL)
# Manifest output path; one JSON object per line.
OUT_PATH = os.environ.get("OUT_PATH", "videos.jsonl")
# Traverse subfolders when RECURSIVE=1 (the default).
RECURSIVE = os.environ.get("RECURSIVE", "1") == "1"
Expand Down Expand Up @@ -63,7 +41,6 @@ def resolve_shared_folder(token: str) -> Dict[str, Any]:
return r.json()



def list_folder_items(token: str, folder_id: str, limit: int = 1000, offset: int = 0) -> Dict[str, Any]:
url = f"{BOX_API}/folders/{folder_id}/items"
params = {"limit": limit, "offset": offset}
Expand All @@ -72,31 +49,48 @@ def list_folder_items(token: str, folder_id: str, limit: int = 1000, offset: int
return r.json()


def ensure_open_shared_link_for_file(token: str, file_id: str, max_retries: int = 3) -> Optional[str]:
    """Ensure the Box file has an open shared link and return a usable URL.

    Retries on timeouts with exponential backoff; skips (returns None) on
    404s and on persistent timeouts so a single bad file cannot abort the
    whole manifest run.

    Args:
        token: Box API access token.
        file_id: Box file ID to create/fetch the shared link for.
        max_retries: Total attempts before giving up on a timeout.

    Returns:
        The direct download URL if Box provides one, else the shared-link
        page URL (may require cookies), or None when the file is skipped.

    Raises:
        RuntimeError: Box returned a 2xx response without a shared_link.
        requests.exceptions.HTTPError: non-404 HTTP errors are re-raised.
    """
    url = f"{BOX_API}/files/{file_id}"
    payload = {"shared_link": {"access": "open"}}
    # IMPORTANT: ask Box to include shared_link in the response body.
    params = {"fields": "shared_link"}

    for attempt in range(max_retries):
        try:
            # 60s timeout: large media files can make this call slow.
            r = requests.put(url, headers=auth_headers(token), params=params, json=payload, timeout=60)

            if r.status_code == 404:
                print(f"⚠️ Skipping file {file_id} (not found)")
                return None

            r.raise_for_status()
            data = r.json()
            sl = data.get("shared_link") or {}

            # Prefer the direct static download URL (fetchable without auth).
            dl = sl.get("download_url")
            if dl:
                return dl
            # Fallback: the shared-link page URL (may require cookies).
            if sl.get("url"):
                return sl["url"]

            raise RuntimeError(f"No shared_link returned for file {file_id}: {data}")

        except requests.exceptions.Timeout:
            if attempt < max_retries - 1:
                # Exponential backoff: with max_retries=3 this waits 1s, then
                # 2s (2**attempt) between attempts.
                wait_time = 2 ** attempt
                print(f"⏱️ Timeout on file {file_id}, retrying in {wait_time}s ({attempt + 1}/{max_retries})...")
                time.sleep(wait_time)
            else:
                print(f"⚠️ Skipping file {file_id} (timeout after {max_retries} attempts)")
                return None
        except requests.exceptions.HTTPError as e:
            # raise_for_status() raised; treat a 404 as skippable, re-raise
            # everything else for the caller to see.
            if e.response.status_code == 404:
                print(f"⚠️ Skipping file {file_id} (not found)")
                return None
            raise


def walk(token: str, folder_id: str) -> List[Dict[str, Any]]:
Expand All @@ -114,6 +108,22 @@ def walk(token: str, folder_id: str) -> List[Dict[str, Any]]:
return items


def load_existing_entries() -> set:
    """Return the set of Box file IDs already recorded in OUT_PATH.

    Used to resume an interrupted run: entries whose video_id carries the
    "vid_" prefix contribute their bare file ID; malformed lines are ignored.
    """
    seen: set = set()
    if not os.path.exists(OUT_PATH):
        return seen

    with open(OUT_PATH, "r", encoding="utf-8") as manifest:
        for raw in manifest:
            try:
                record = json.loads(raw.strip())
            except json.JSONDecodeError:
                # Tolerate partially written / corrupt lines.
                continue
            vid = record.get("video_id", "")
            if vid.startswith("vid_"):
                seen.add(vid[len("vid_"):])
    return seen


def main():
token = get_access_token()

Expand All @@ -122,10 +132,16 @@ def main():
raise RuntimeError(f"Shared link did not resolve to a folder: {shared_folder.get('type')}")
root_id = shared_folder["id"]

# Load already processed files to resume
processed_ids = load_existing_entries()
print(f"Resuming: {len(processed_ids)} files already processed")

queue = [root_id]
out_count = 0
out_count = len(processed_ids)
skipped = 0
new_files = 0

with open(OUT_PATH, "w", encoding="utf-8") as f:
with open(OUT_PATH, "a", encoding="utf-8") as f:
while queue:
fid = queue.pop(0)
entries = walk(token, fid)
Expand All @@ -141,26 +157,36 @@ def main():

if et != "file":
continue

if not lname.endswith(".m4a"):
continue

file_id = e["id"]

# Skip if already processed
if file_id in processed_ids:
continue

video_id = f"vid_{file_id}"

# Make a per-file open shared link (Speech can fetch without auth).
# If your org disallows open links, this will fail — then you’ll need Blob staging.
file_link = ensure_open_shared_link_for_file(token, file_id)

if file_link is None:
skipped += 1
continue

# Encourage direct download behavior
media_url = file_link # + ("?download=1" if "?" not in file_link else "&download=1")

media_url = file_link
f.write(json.dumps({"video_id": video_id, "media_url": media_url}) + "\n")
f.flush() # Ensure write is saved immediately
out_count += 1
new_files += 1

if out_count % 10 == 0:
print(f"Wrote {out_count} entries...")
print(f"Wrote {out_count} entries total...")

# Small delay to avoid rate limiting
time.sleep(0.2)

print(f"Done. Wrote {out_count} m4a entries to {OUT_PATH}")
print(f"Done. Total: {out_count} entries (new: {new_files}, skipped: {skipped})")


if __name__ == "__main__":
Expand Down
Loading