From 35e1604760c1db4c3073c8f198aad37f711f3911 Mon Sep 17 00:00:00 2001
From: Martin <martintony4all@yahoo.com>
Date: Sun, 15 Feb 2026 08:17:13 -0600
Subject: [PATCH 1/8] Add multiple video upload options for processing multiple
 videos from CSV file

---
 TranscribeHttp/__init__.py                    |    2 +-
 scripts/box_shared_folder_manifest.py         |  138 +-
 .../__pycache__/speech_batch.cpython-311.pyc  |  Bin 21537 -> 22598 bytes
 ui/Dockerfile                                 |    3 +
 ui/ui_search.py                               | 1547 ++++++++++++++++-
 5 files changed, 1542 insertions(+), 148 deletions(-)
diff --git a/TranscribeHttp/__init__.py b/TranscribeHttp/__init__.py
index 6fecd2b..3af666b 100644
--- a/TranscribeHttp/__init__.py
+++ b/TranscribeHttp/__init__.py
@@ -42,7 +42,7 @@ def _cfg() -> SpeechConfig:
     return SpeechConfig(
         key=os.environ["SPEECH_KEY"],
         endpoint=endpoint.rstrip("/"),
-        api_version=os.environ.get("SPEECH_API_VERSION", "2025-10-15"),
+        api_version=os.environ.get("SPEECH_API_VERSION", "2024-11-15"),
     )
 
 def main(req: func.HttpRequest) -> func.HttpResponse:
diff --git a/scripts/box_shared_folder_manifest.py b/scripts/box_shared_folder_manifest.py
index 310df6e..d44eab0 100644
--- a/scripts/box_shared_folder_manifest.py
+++ b/scripts/box_shared_folder_manifest.py
@@ -1,39 +1,17 @@
 """
 scripts/box_shared_folder_manifest.py - Generate Video Manifest from Box
-
-This script enumerates .m4a video files from a Box shared folder and generates
-a manifest file (videos.jsonl) that lists all videos with their IDs and media URLs.
-It creates open shared links for each file so Azure Speech Service can access them.
-
-Architecture Role:
-- Pre-processing step before video ingestion
-- Generates videos.jsonl input file for import_videos.py
-- Handles Box API authentication and folder traversal
-- Creates publicly accessible download URLs for Speech Service
-
-Usage:
-  python scripts/box_shared_folder_manifest.py
-
-Output:
-  - videos.jsonl: One JSON object per line with video_id and media_url
-
-Configuration (via .env):
-  - BOX_SHARED_FOLDER_URL: Box shared folder link
-  - BOX_TOKEN or BOX_ACCESS_TOKEN/BOX_REFRESH_TOKEN: Box authentication
-  - OUT_PATH: Output file path (default: videos.jsonl)
-  - RECURSIVE: Whether to traverse subfolders (default: 1)
 """
 
 import json
 import os
 import requests
+import time
 from typing import Dict, Any, List, Optional
-
 from box_auth import get_access_token
 
 BOX_API = "https://api.box.com/2.0"
 
-SHARED_FOLDER_URL = os.environ["BOX_SHARED_FOLDER_URL"]  # https://...box.com/s/<id>
+SHARED_FOLDER_URL = os.environ["BOX_SHARED_FOLDER_URL"]
 print("BOX_SHARED_FOLDER_URL =", SHARED_FOLDER_URL)
 OUT_PATH = os.environ.get("OUT_PATH", "videos.jsonl")
 RECURSIVE = os.environ.get("RECURSIVE", "1") == "1"
@@ -63,7 +41,6 @@ def resolve_shared_folder(token: str) -> Dict[str, Any]:
     return r.json()
 
 
-
 def list_folder_items(token: str, folder_id: str, limit: int = 1000, offset: int = 0) -> Dict[str, Any]:
     url = f"{BOX_API}/folders/{folder_id}/items"
     params = {"limit": limit, "offset": offset}
@@ -72,31 +49,48 @@ def list_folder_items(token: str, folder_id: str, limit: int = 1000, offset: int
     return r.json()
 
 
-def ensure_open_shared_link_for_file(token: str, file_id: str) -> str:
+def ensure_open_shared_link_for_file(token: str, file_id: str, max_retries: int = 3) -> Optional[str]:
     """
-    Ensure file has an open shared link and return a direct-download URL.
+    Return an open shared-link URL for the file (download_url preferred), retrying on timeouts; None if the file is missing (404) or every attempt times out.
     """
     url = f"{BOX_API}/files/{file_id}"
     payload = {"shared_link": {"access": "open"}}
-    params = {"fields": "shared_link"}  # IMPORTANT: ask for shared_link back
-
-    r = requests.put(url, headers=auth_headers(token), params=params, json=payload, timeout=30)
-    r.raise_for_status()
-    data = r.json()
-
-    sl = data.get("shared_link") or {}
-    # print("shared_link =", json.dumps(sl, indent=2))
-    # Prefer direct static download URL
-    dl = sl.get("download_url")
-    if dl:
-        return dl
-
-    # Fallback: at least return the shared link (may require cookies)
-    if sl.get("url"):
-        return sl["url"]
-
-    raise RuntimeError(f"No shared_link returned for file {file_id}: {data}")
-
+    params = {"fields": "shared_link"}
+    
+    for attempt in range(max_retries):
+        try:
+            # Increased timeout to 60 seconds
+            r = requests.put(url, headers=auth_headers(token), params=params, json=payload, timeout=60)
+            
+            if r.status_code == 404:
+                print(f"⚠️  Skipping file {file_id} (not found)")
+                return None
+            
+            r.raise_for_status()
+            data = r.json()
+            sl = data.get("shared_link") or {}
+            
+            dl = sl.get("download_url")
+            if dl:
+                return dl
+            if sl.get("url"):
+                return sl["url"]
+            
+            raise RuntimeError(f"No shared_link returned for file {file_id}: {data}")
+            
+        except requests.exceptions.Timeout:
+            if attempt < max_retries - 1:
+                wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, ... (doubles per retry; the final attempt does not wait)
+                print(f"⏱️  Timeout on file {file_id}, retrying in {wait_time}s ({attempt + 1}/{max_retries})...")
+                time.sleep(wait_time)
+            else:
+                print(f"⚠️  Skipping file {file_id} (timeout after {max_retries} attempts)")
+                return None
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 404:
+                print(f"⚠️  Skipping file {file_id} (not found)")
+                return None
+            raise
 
 
 def walk(token: str, folder_id: str) -> List[Dict[str, Any]]:
@@ -114,6 +108,22 @@ def walk(token: str, folder_id: str) -> List[Dict[str, Any]]:
     return items
 
 
+def load_existing_entries() -> set:
+    """Return the file IDs already recorded in OUT_PATH (each video_id minus its "vid_" prefix) so a rerun can skip them."""
+    processed = set()
+    if os.path.exists(OUT_PATH):
+        with open(OUT_PATH, "r", encoding="utf-8") as f:
+            for line in f:
+                try:
+                    entry = json.loads(line.strip())
+                    video_id = entry.get("video_id", "")
+                    if video_id.startswith("vid_"):
+                        processed.add(video_id[4:])
+                except json.JSONDecodeError:
+                    continue
+    return processed
+
+
 def main():
     token = get_access_token()
 
@@ -122,10 +132,16 @@ def main():
         raise RuntimeError(f"Shared link did not resolve to a folder: {shared_folder.get('type')}")
     root_id = shared_folder["id"]
 
+    # Load already processed files to resume
+    processed_ids = load_existing_entries()
+    print(f"Resuming: {len(processed_ids)} files already processed")
+
     queue = [root_id]
-    out_count = 0
+    out_count = len(processed_ids)
+    skipped = 0
+    new_files = 0
 
-    with open(OUT_PATH, "w", encoding="utf-8") as f:
+    with open(OUT_PATH, "a", encoding="utf-8") as f:
         while queue:
             fid = queue.pop(0)
             entries = walk(token, fid)
@@ -141,26 +157,36 @@ def main():
 
                 if et != "file":
                     continue
-
                 if not lname.endswith(".m4a"):
                     continue
 
                 file_id = e["id"]
+                
+                # Skip if already processed
+                if file_id in processed_ids:
+                    continue
+                
                 video_id = f"vid_{file_id}"
 
-                # Make a per-file open shared link (Speech can fetch without auth).
-                # If your org disallows open links, this will fail — then you’ll need Blob staging.
                 file_link = ensure_open_shared_link_for_file(token, file_id)
+                
+                if file_link is None:
+                    skipped += 1
+                    continue
 
-                # Encourage direct download behavior
-                media_url = file_link # + ("?download=1" if "?" not in file_link else "&download=1")
-
+                media_url = file_link
                 f.write(json.dumps({"video_id": video_id, "media_url": media_url}) + "\n")
+                f.flush()  # Ensure write is saved immediately
                 out_count += 1
+                new_files += 1
+                
                 if out_count % 10 == 0:
-                    print(f"Wrote {out_count} entries...")
+                    print(f"Wrote {out_count} entries total...")
+                
+                # Small delay to avoid rate limiting
+                time.sleep(0.2)
 
-    print(f"Done. Wrote {out_count} m4a entries to {OUT_PATH}")
+    print(f"Done. Total: {out_count} entries (new: {new_files}, skipped: {skipped})")
 
 
 if __name__ == "__main__":
diff --git a/shared/__pycache__/speech_batch.cpython-311.pyc b/shared/__pycache__/speech_batch.cpython-311.pyc
index 525fbbc638d0ded8eda6304af50b337f3dd74394..58ca2888792699e4a1a7d3ce5b2dab4fcc63daf0 100644
GIT binary patch
delta 2294
zcmaJ@OK%%h6rMYF?8LE?rnU2EoA{b0t<%PN^pUo1DWqv0eWY!g7NontjD2q6A@i_i
z#tA7BrYKSuAP{)LrVA=0il{`9<`1xe1p*-fDyX2sf<-p0iUb=FXRhPeE)|x?Gxt2d
z^PSI@Z+}F-_>{EWi9|vYzSbVSrf!^lr>(p6nNO15lR#n;5Rf&(h`7s47Iz=>iMyZq
z@$S=_^Z*MGFV?RG^&kuC&8%5(VJ*Veq=ocW)~bhDSdXxX-p1O5J)pJg+t@a}gLUZJ
z*>-&g+ac^hZ7!<ESWFz6wW!|7I`uBrCC)8)?q=P359`5mi0y<{whO{+H$+%3w6Q)O
zkfj+3+Seu6_Mt3EcjfYpuFfcnZ#p{AvAyOfF$(8l2TwqhxA8=e$Afr5j3*WY(7E2k
zp2Fw&q6}T@ezpgnDL!}Ob3gQ8-cGz?JchBm&^7?Oc?^0z8^~zuL)+dhTO93l(-(pS
z`bzs+q!tM-h*DnK9=zg@LFwm70VfOt2xk!XdL?+6ClI1sgS_a6eOPioBq0KWw`ADQ
z<8XjS8payJ+!S8J`~ZybWJwfmgVFT>8{#|pE}YFv8^e%;*!GZU?X_xcXcdCP2qgta
z1e64}9>><hTe-2uw#dX$^p9-D*Xorv>Ba)bFkn;!9N#P&fHd01>WbD9t9b!9fiVZ+
zqyU=4I#1*I5S$YHG=LVP`r#S$9|nRfJ$p+o!9*icIE~~dYox*?5>+9)q1a+GoI%cx
zz|<2P{THYorm^uj%z*6i470cVaF$OcU<jv8J+|@xqdpAhu<=ol7o@>CrPnV>q4GaX
zoh6#06W5Ctrwawnb9uVJtra!L>1oBm(^X3`?3|?*9Mv@FHAmG{N9A@XbTzNqR5zih
zaayp<6&1KmbEaOfc%B=!y25F(8ci2Xi>{j1vT7{RRn^JYb!@G*43&TFik2oWpB<)W
z9hz4R(72rrrRX^`OYLG-S8dxvfeu<QH4Uqwvxbm@xoT*p0wTgNEnU&n>xzdQ<4$W{
z700xcB~Det;Y*fq(d5F!0=;(S0$t(;w@Sh|L>E;YtXS~M*@KuqiFgdId3J|e-XI<f
z;D9k|uG-|Ps&iz)SmBm{nj9M)HEimrxn=w`sjebyn`4AmH8f%6@~Xpgj>rEMQ{(AS
z30s+Xvh#|?fof{jQmi$>prUEiY1oHZ;FTe4M0m}{`0UyiAz41_I0XT7fiLOYaO$Y4
ztbDp?<UD%3^vjhcmTX#O4$IZrc?fjHp~<3y^D9OU*DY4*;K;V4=mkO3Y-MXg2^xR7
zDk@wmwYXitR`@$8D%k=V2i!(Zu{}wh=WBGjRE3ldQIw`wyOXKmK(v{c%xs!2c!F?f
zR>-u$FA|ryQ;D0*^W1XXAfAG-07^k|iuP7)wE`_TnWClHX*$E5y0nTQP$uv8M@u5B
zs<sS{oY~?mP0u_@g}km*rRi2i-Ee8#y{8?gp0{%gmo9qRLBIwm7SP4CAW(qNm1Ul>
z72Ct?K}H2D4t|#_lD36zvYzsl9B#}Gxf{OwWNc;x#bVhbx?(x1VXP{k7ME6wBUR}3
zi1&RqQz&^8X)>+y-%SZ})Qtzm2|49H7raTxwEM96OA@$<zo_<j`Choo{XFy=A<w%X
zhldCO_lHQ45Y3%ykIJOzW;;&#$m{OyC@18O8|=I<lLzI8-G7pM?)@I-CqKGleeV<U
zhx={(rzY~3`^msJGTH6Enw*nKzx&mGCX++%z7!#3(j6S0_K{h)F!p<p-1y*y<Al6X
z&ZNI4Z{p{Gd*#GoLSA%lo_rJ}-<4lE^EUau+&c9$x$ACBrJBhvZs)~$5<2!En8|>d
q%VgaArMF1zAdw_|OLi~*huQFL$Q#nec)5K(UhbQ3c8BL5$^Qbmo12vY

delta 1238
zcmaJ>OH30{6rDSrewcos&_Y{iseFV=Y5C)?;1>lo@e?-BRrNj;(`m63G;X>fQ5R}R
zqb@X2<3g9lx6&9BHM%e{?p&yG<-!<03lmLTdFMfZqH)qU@7;6nJ@>pbNnhN9TX(^8
z$L-b?dak8rCZFw`_XLWMEQ)efK?NbBu!S8M$XI2n+%3#PyPDAqE3<-GW69VI8?za9
zW;YznAu%meVdzXZoXlysn9Fc8x5TX(k5S1gjVe}Uc$wFzX4Mk6Woir`+wPNrodQ4e
z8?~%f#tw?>Se+4I0g5YF5Oo$pC#y#nYd|*(bE~TCQqVJ}VCDRmusgB{R1x5v{}L(U
z7*_Kr)^InE2F)wW2Yoy`VMG6%m93$3WJ1N-IgK^ZxslFwbZ*7~^#y79xQ}W>gtcHj
z_hEyHArKZOtaS;C5Z<_$(=kf9wG~CBLcu+f%4}<*6<H;vnYT$fE^Hw=?bvEka3_zF
zM0Hv6gobO0GKOvF!uIPb#&`tRaaY;3I;b~JtCO$8gvnbZ5)UTlY^;L^d5G?2wslcS
zH>G>zXmgZaHcH1{k`l*0DJ4py`)PFNQm?OkEIBbi`NU%VO1`4Bn+OJ}ASnyhFOu4D
z17Te&NLMy4^x1GD)pX+~DYT7vHdEY#Dd|s{Xz8JbLzM4Dg<KjYkG2%cmEu-1ziru6
z+(t$%m|aFT;lS<WSs(8B=b*oZYPgdI_u~kH=`-%SuHkOFXsUx=@l{{^f7ZKjlm-tV
z98zL?g~#y*hR2iR(>y<&JUWxf6$<I(j(l$FNbYpBeLtbOY&sK5P9<k2k>@(o*=(+m
zF68pbLOz|HKAN9A!b^UA4TQy@J^;`nvd*&raq-jj5}@z)6ORfh@w6%qFebixX8;Zg
zUgJ~YusGu%vcPfiAix1Gh{n(x6@Hg~HvEFi;#-($@K}sC-2`|mJ~zL!!Uypp_F4s1
zT#4^fK^GqrOogBr?goIE*xEB<frL2M|J4r1ZqKFwPL@s$y@EWwTg8dtK7bK%W6KXa
zJSdfRUxJ6F=F#^se}Amg0nbI^-~oU^asJRn@OFcu&`+5T(*M;b%O_35$3vDsG_D~I

diff --git a/ui/Dockerfile b/ui/Dockerfile
index e3a4bd9..8fd9d6c 100644
--- a/ui/Dockerfile
+++ b/ui/Dockerfile
@@ -4,6 +4,9 @@ WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
+RUN apt-get update && apt-get install -y ffmpeg
+RUN pip install azure-cognitiveservices-speech
+
 COPY . .
 
 EXPOSE 8501
diff --git a/ui/ui_search.py b/ui/ui_search.py
index 194fdec..26a2ecd 100644
--- a/ui/ui_search.py
+++ b/ui/ui_search.py
@@ -1,63 +1,89 @@
 """
-ui_search.py - Streamlit Web Interface for Video Segment Search
-
-This Streamlit application provides a user-friendly web interface for searching
-indexed video segments. Users can:
-- Enter text queries to search across all indexed segments
-- Choose search mode (keyword, vector, or hybrid)
-- Filter results by video_id and adjust result count
-- View segment text with timestamps and relevance scores
-
-Architecture Role:
-- Frontend user interface for the video annotation system
-- Deployed as Azure Container App (video-annotator-ui)
-- Calls SearchSegments Azure Function for all search operations
-- Displays results with formatted timestamps and metadata
-
-Deployment:
-  - Local: python -m streamlit run ui_search.py
-  - Azure: Deployed as Container App (see ui/README.md)
-
-Configuration (via .env or Container App env vars):
-  - SEARCH_FN_URL: SearchSegments function endpoint
-  - DEFAULT_MODE: Default search mode (hybrid/keyword/vector)
-  - DEFAULT_TOP: Default number of results
-  - DEFAULT_K: Default vector recall depth
+ui_search.py - Streamlit Web Interface for Video Segment Search & Upload
+
+This version calls Azure Speech API DIRECTLY, bypassing the Azure Function
+that has the wrong API version hardcoded.
 """
 
 import os
 import requests
 import streamlit as st
+import json
+import time
+import re
+import subprocess
+import hashlib
+import base64
+import hmac
+import uuid
+import urllib.parse
+import pandas as pd
+import io
+from datetime import datetime, timezone, timedelta
+from typing import Optional, Dict, Any, Tuple, List
+from pathlib import Path
 from dotenv import load_dotenv
 
-# Load .env locally (Container Apps/App Service will use real env vars)
+# Load environment variables
 load_dotenv()
 
-SEARCH_FN_URL = os.environ["SEARCH_FN_URL"]
+# Azure Function URLs (only Search uses these now)
+SEARCH_FN_URL = os.environ.get("SEARCH_FN_URL", "")
+
+# Azure Speech Service Configuration (DIRECT)
+SPEECH_KEY = os.environ.get("SPEECH_KEY")
+SPEECH_REGION = os.environ.get("SPEECH_REGION", "eastus")
+SPEECH_API_VERSION = os.environ.get("SPEECH_API_VERSION", "2024-11-15")
+
+# Azure OpenAI & Search for indexing
+AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
+AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_KEY")
+AZURE_OPENAI_DEPLOYMENT = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "text-embedding-3-small")
+
+SEARCH_ENDPOINT = os.environ.get("SEARCH_ENDPOINT")
+SEARCH_KEY = os.environ.get("SEARCH_KEY")
+SEARCH_INDEX_NAME = os.environ.get("SEARCH_INDEX_NAME", "segments")
+
+# Azure Storage Configuration
+AZURE_STORAGE_ACCOUNT = os.environ.get("AZURE_STORAGE_ACCOUNT", "storagevideoannotator")
+AZURE_STORAGE_KEY = os.environ.get("AZURE_STORAGE_KEY", "")
+INPUT_CONTAINER = os.environ.get("INPUT_CONTAINER", "speech-input")
+SEGMENTS_CONTAINER = os.environ.get("SEGMENTS_CONTAINER", "segments")
+
+# Default settings
 DEFAULT_MODE = os.environ.get("DEFAULT_MODE", "hybrid")
 DEFAULT_TOP = int(os.environ.get("DEFAULT_TOP", "10"))
 DEFAULT_K = int(os.environ.get("DEFAULT_K", "40"))
+POLL_SECONDS = int(os.environ.get("POLL_SECONDS", "15"))
+BATCH_MAX_WORKERS = int(os.environ.get("BATCH_MAX_WORKERS", "3"))  # Concurrent processing limit
+
+st.set_page_config(page_title="Video Annotation Platform", layout="wide")
+st.title(" Video Annotation Platform")
 
-st.set_page_config(page_title="Video Segment Search", layout="wide")
-st.title("🔎 Search indexed video segments")
+# Initialize session state
+if 'yt_url_value' not in st.session_state:
+    st.session_state.yt_url_value = ""
+if 'batch_results' not in st.session_state:
+    st.session_state.batch_results = []
+if 'batch_processing' not in st.session_state:
+    st.session_state.batch_processing = False
 
+# Sidebar
 with st.sidebar:
-    st.header("Search settings")
-    mode = st.selectbox(
-        "Mode",
-        ["keyword", "hybrid", "vector"],
-        index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE)
-        if DEFAULT_MODE in ("keyword", "hybrid", "vector")
-        else 1,
-    )
-    top = st.slider("Top", 1, 50, DEFAULT_TOP)
-    k = st.slider("Vector k (hybrid/vector)", 5, 200, DEFAULT_K)
-    video_id_filter = st.text_input("Filter by video_id (optional)", value="")
-    st.caption("Tip: keep k ~ 4×top for hybrid.")
+    st.header("Navigation")
+    page = st.radio("Select Page", ["🔎 Search Segments", "⬆️ Upload & Transcribe"])
+    
+    if page == "🔎 Search Segments":
+        st.header("Settings")
+        mode = st.selectbox("Search Mode", ["keyword", "hybrid", "vector"], 
+                          index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE) if DEFAULT_MODE in ("keyword", "hybrid", "vector") else 1)
+        top = st.slider("Results", 1, 50, DEFAULT_TOP)
+        k = st.slider("Vector k", 5, 200, DEFAULT_K)
 
-q = st.text_input("Query", value="", placeholder="e.g., measles misinformation")
-go = st.button("Search", type="primary", disabled=(not q.strip()))
 
+# =============================================================================
+# HELPER FUNCTIONS
+# =============================================================================
 
 def ms_to_ts(ms: int) -> str:
     s = max(0, int(ms // 1000))
@@ -66,60 +92,1399 @@ def ms_to_ts(ms: int) -> str:
     return f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}"
 
 
-def call_search_api(payload: dict) -> dict:
-    r = requests.post(
-        SEARCH_FN_URL,
-        json=payload,
-        timeout=60,
-        headers={"Content-Type": "application/json"},
-    )
+def call_api(url: str, payload: dict, timeout: int = 60) -> dict:
+    r = requests.post(url, json=payload, timeout=timeout, headers={"Content-Type": "application/json"})
     if r.status_code >= 400:
         raise RuntimeError(f"HTTP {r.status_code}: {r.text}")
     return r.json() if r.text else {}
 
 
-if go:
-    payload = {"q": q.strip(), "mode": mode, "top": top}
-    if mode in ("hybrid", "vector"):
-        payload["k"] = k
-    if video_id_filter.strip():
-        payload["video_id"] = video_id_filter.strip()
+# =============================================================================
+# DIRECT AZURE SPEECH API FUNCTIONS (BYPASS AZURE FUNCTION)
+# =============================================================================
+
+def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any]:
+    """
+    Submit transcription directly to Azure Speech API.
+    Bypasses the Azure Function with wrong API version.
+    """
+    if not SPEECH_KEY:
+        raise RuntimeError("SPEECH_KEY not configured in environment")
+    
+    endpoint = f"https://{SPEECH_REGION}.api.cognitive.microsoft.com/speechtotext/transcriptions:submit?api-version={SPEECH_API_VERSION}"
+    
+    headers = {
+        "Ocp-Apim-Subscription-Key": SPEECH_KEY,
+        "Content-Type": "application/json"
+    }
+    
+    payload = {
+        "contentUrls": [media_url],
+        "locale": "en-US",
+        "displayName": f"transcription_{video_id}",
+        "properties": {
+            "diarizationEnabled": False,
+            "wordLevelTimestampsEnabled": False,
+            "punctuationMode": "DictatedAndAutomatic",
+            "profanityFilterMode": "Masked",
+            "timeToLiveHours": 24
+        }
+    }
+    
+    try:
+        r = requests.post(endpoint, headers=headers, json=payload, timeout=60)
+        r.raise_for_status()
+        
+        # Get operation URL from Location header (this is the operation status URL)
+        operation_url = r.headers.get("Location")
+        if not operation_url:
+            result = r.json()
+            operation_url = result.get("self") or result.get("links", {}).get("self")
+        
+        if not operation_url:
+            raise RuntimeError("No operation URL returned from Speech API")
+        
+        return {"operation_url": operation_url, "video_id": video_id}
+        
+    except requests.exceptions.HTTPError as e:
+        if r.status_code == 401:
+            raise RuntimeError("Azure Speech API authentication failed. Check SPEECH_KEY.")
+        elif r.status_code == 400:
+            raise RuntimeError(f"Bad request: {r.text}")
+        else:
+            raise RuntimeError(f"Speech API error {r.status_code}: {r.text}")
+
+
+def poll_transcription_operation(operation_url: str) -> Dict[str, Any]:
+    """Poll transcription operation status directly from Azure Speech API."""
+    if not SPEECH_KEY:
+        raise RuntimeError("SPEECH_KEY not configured")
+    
+    headers = {
+        "Ocp-Apim-Subscription-Key": SPEECH_KEY
+    }
+    
+    try:
+        # CRITICAL FIX: Azure returns operation URL with :submit but we need to poll
+        # using the /transcriptions/{id} endpoint, not /transcriptions:submit/{id}
+        # The operation_url looks like: .../transcriptions:submit/{id}?api-version=...
+        # We need: .../transcriptions/{id}?api-version=...
+        
+        poll_url = operation_url.replace("/transcriptions:submit/", "/transcriptions/")
+        
+        # Debug info
+        st.session_state['debug_poll_url'] = poll_url
+        
+        r = requests.get(poll_url, headers=headers, timeout=30)
+        r.raise_for_status()
+        return r.json()
+        
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Failed to poll transcription: {str(e)}")
+
+
+def get_transcription_from_result(result_data: Dict) -> Dict[str, Any]:
+    """Get the actual transcription JSON from the result files."""
+    if not SPEECH_KEY:
+        raise RuntimeError("SPEECH_KEY not configured")
+    
+    headers = {
+        "Ocp-Apim-Subscription-Key": SPEECH_KEY
+    }
+    
+    try:
+        # Get the result files URL from the completed operation
+        links = result_data.get("links", {})
+        files_url = links.get("files")
+        
+        if not files_url:
+            # Try to construct from result data or get content directly
+            if "combinedRecognizedPhrases" in result_data:
+                # Result might be embedded directly
+                return result_data
+            
+            raise RuntimeError("No files URL in result")
+        
+        # Get list of files
+        r = requests.get(files_url, headers=headers, timeout=30)
+        r.raise_for_status()
+        files_data = r.json()
+        
+        # Find the transcription JSON file
+        for file in files_data.get("values", []):
+            if file.get("kind") == "Transcription":
+                content_url = file.get("links", {}).get("contentUrl")
+                if content_url:
+                    # Download the actual transcription content
+                    content_r = requests.get(content_url, timeout=60)
+                    content_r.raise_for_status()
+                    return content_r.json()
+        
+        raise RuntimeError("No transcription file found in results")
+        
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Failed to get transcription result: {str(e)}")
+
+
+# =============================================================================
+# DIRECT EMBEDDING AND INDEXING (BYPASS AZURE FUNCTION)
+# =============================================================================
+
+def get_embeddings(texts: list) -> list:
+    """Get embeddings directly from Azure OpenAI."""
+    if not AZURE_OPENAI_ENDPOINT or not AZURE_OPENAI_KEY:
+        raise RuntimeError("Azure OpenAI not configured")
+    
+    url = f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{AZURE_OPENAI_DEPLOYMENT}/embeddings?api-version=2024-02-01"
+    
+    headers = {
+        "api-key": AZURE_OPENAI_KEY,
+        "Content-Type": "application/json"
+    }
+    
+    payload = {
+        "input": texts,
+        "model": "text-embedding-3-small"
+    }
+    
+    try:
+        r = requests.post(url, headers=headers, json=payload, timeout=60)
+        r.raise_for_status()
+        result = r.json()
+        return [item["embedding"] for item in result["data"]]
+    except Exception as e:
+        raise RuntimeError(f"Embedding failed: {str(e)}")
+
+
+def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]:
+    """
+    Index segments directly to Azure Cognitive Search.
+    Bypasses the EmbedAndIndex Azure Function.
+    """
+    if not SEARCH_ENDPOINT or not SEARCH_KEY:
+        raise RuntimeError("Azure Search not configured")
+    
+    # Generate embeddings for all segments
+    texts = [seg.get("text", "") for seg in segments]
+    try:
+        embeddings = get_embeddings(texts)
+    except Exception as e:
+        st.warning(f"Embedding failed, indexing without vectors: {e}")
+        embeddings = [None] * len(segments)
+    
+    # Prepare search documents
+    documents = []
+    for i, (seg, embedding) in enumerate(zip(segments, embeddings)):
+        doc = {
+            "id": f"{video_id}_{i}",
+            "video_id": video_id,
+            "segment_id": seg.get("segment_id", i),
+            "text": seg.get("text", ""),
+            "start_ms": seg.get("start_ms", 0),
+            "end_ms": seg.get("end_ms", 0),
+            "pred_labels": seg.get("pred_labels", [])
+        }
+        if embedding:
+            doc["embedding"] = embedding
+        
+        documents.append(doc)
+    
+    # Upload to Azure Search
+    url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/index?api-version=2024-07-01"
+    
+    headers = {
+        "api-key": SEARCH_KEY,
+        "Content-Type": "application/json"
+    }
+    
+    payload = {
+        "value": documents
+    }
+    
+    try:
+        r = requests.post(url, headers=headers, json=payload, timeout=60)
+        r.raise_for_status()
+        return {"indexed": len(documents), "video_id": video_id}
+    except Exception as e:
+        raise RuntimeError(f"Indexing failed: {str(e)}")
+
+
+def process_transcription_to_segments(transcription_data: Dict, video_id: str) -> list:
+    """
+    Convert Azure Speech transcription JSON to segments format.
+    """
+    segments = []
+    
+    # Parse phrases/segments from transcription
+    phrases = transcription_data.get("recognizedPhrases", [])
+    
+    for i, phrase in enumerate(phrases):
+        # Extract timing
+        offset = phrase.get("offsetInTicks", 0) // 10000  # 100-ns ticks -> ms (10,000 ticks per ms)
+        duration = phrase.get("durationInTicks", 0) // 10000
+        
+        # Extract text
+        nbest = phrase.get("nBest", [])
+        if nbest:
+            text = nbest[0].get("display", "")
+        else:
+            text = ""
+        
+        # Create segment
+        segment = {
+            "segment_id": i,
+            "video_id": video_id,
+            "text": text,
+            "start_ms": offset,
+            "end_ms": offset + duration,
+            "pred_labels": []  # Could add label prediction here
+        }
+        
+        segments.append(segment)
+    
+    return segments
+
+
+# =============================================================================
+# STORAGE FUNCTIONS - FIXED UPLOAD
+# =============================================================================
+
+def generate_video_id(filename: str) -> str:
+    clean_name = Path(filename).stem
+    clean_name = re.sub(r'[^\w\s-]', '', clean_name)
+    clean_name = re.sub(r'[-\s]+', '_', clean_name)
+    hash_suffix = hashlib.md5(clean_name.encode()).hexdigest()[:8]
+    return f"vid_{clean_name[:50]}_{hash_suffix}"
+
+
+def test_sas_url(sas_url: str) -> Tuple[bool, str]:
+    """Test if SAS URL is accessible before sending to Speech API."""
+    try:
+        r = requests.head(sas_url, timeout=10, allow_redirects=True)
+        if r.status_code == 200:
+            return True, "SAS URL is accessible"
+        else:
+            return False, f"SAS URL returned HTTP {r.status_code}"
+    except Exception as e:
+        return False, f"SAS URL test failed: {str(e)}"
+
+
+def upload_to_azure_blob_fixed(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Upload file_bytes as a block blob to INPUT_CONTAINER via the Blob REST API (Shared Key auth).
+    Returns (sas_url, None) on success, or (None, error_message) on failure.
+    """
+    if not AZURE_STORAGE_KEY:
+        return None, "Azure Storage key not configured"
+    
+    try:
+        # Upload URL
+        url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}"
+        
+        # Create date header in the exact format Azure expects
+        date_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
+        content_length = len(file_bytes)
+        
+        # ====================================================================
+        # CRITICAL FIX: Correct string-to-sign format for Azure Blob Storage
+        # Format: VERB\nContent-Encoding\nContent-Language\nContent-Length\n
+        #         Content-MD5\nContent-Type\nDate\nIf-Modified-Since\nIf-Match\n
+        #         If-None-Match\nIf-Unmodified-Since\nRange\n
+        #         CanonicalizedHeaders\nCanonicalizedResource
+        # ====================================================================
+        string_to_sign = (
+            f"PUT\n"                       # HTTP method
+            f"\n"                          # Content-Encoding (empty)
+            f"\n"                          # Content-Language (empty)
+            f"{content_length}\n"          # Content-Length (REQUIRED - must be exact)
+            f"\n"                          # Content-MD5 (empty)
+            f"application/octet-stream\n"  # Content-Type (REQUIRED for PUT)
+            f"\n"                          # Date (empty, using x-ms-date instead)
+            f"\n"                          # If-Modified-Since (empty)
+            f"\n"                          # If-Match (empty)
+            f"\n"                          # If-None-Match (empty)
+            f"\n"                          # If-Unmodified-Since (empty)
+            f"\n"                          # Range (empty)
+            f"x-ms-blob-type:BlockBlob\n"  # CanonicalizedHeaders (sorted alphabetically)
+            f"x-ms-date:{date_str}\n"
+            f"x-ms-version:2020-12-06\n"
+            f"/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}"  # CanonicalizedResource
+        )
+        
+        # Sign with HMAC-SHA256
+        account_key = base64.b64decode(AZURE_STORAGE_KEY)
+        signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest()
+        signature = base64.b64encode(signed_hmac).decode('utf-8')
+        
+        # Build authorization header
+        auth_header = f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}"
+        
+        # Set headers - MUST match what was signed
+        headers = {
+            "x-ms-date": date_str,
+            "x-ms-version": "2020-12-06",
+            "x-ms-blob-type": "BlockBlob",
+            "Content-Type": "application/octet-stream",
+            "Content-Length": str(content_length),
+            "Authorization": auth_header
+        }
+        
+        # Upload
+        r = requests.put(url, data=file_bytes, headers=headers, timeout=300)
+        
+        if r.status_code not in [201, 200]:
+            return None, f"Upload failed: HTTP {r.status_code} - {r.text}"
+        
+        # Generate SAS token for reading
+        sas_token = generate_sas_token_fixed(blob_name)
+        if not sas_token:
+            return None, "Failed to generate SAS token"
+        
+        sas_url = f"{url}?{sas_token}"
+        
+        # Test the SAS URL
+        is_valid, test_msg = test_sas_url(sas_url)
+        if not is_valid:
+            return None, f"SAS URL validation failed: {test_msg}"
+        
+        return sas_url, None
+        
+    except Exception as e:
+        import traceback
+        return None, f"Upload error: {str(e)}\n{traceback.format_exc()}"
+
+
+def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> str:
+    """Build a read-only, HTTPS-only service SAS query string for one blob in INPUT_CONTAINER.
+    The token expires expiry_hours from now (UTC). Despite the `-> str` annotation, None is
+    returned when AZURE_STORAGE_KEY is unset or signing raises (reported through st.error)."""
+    if not AZURE_STORAGE_KEY:
+        return None
+    
+    try:
+        # Set expiry in UTC
+        expiry = datetime.now(timezone.utc) + timedelta(hours=expiry_hours)
+        expiry_str = expiry.strftime('%Y-%m-%dT%H:%M:%SZ')
+        
+        # Decode account key
+        account_key = base64.b64decode(AZURE_STORAGE_KEY)
+        
+        # ====================================================================
+        # CRITICAL FIX: Service SAS string-to-sign format
+        # Reference: https://docs.microsoft.com/en-us/rest/api/storageservices/create-service-sas
+        # Format for Blob service SAS:
+        # StringToSign = signedPermissions + "\n" +
+        #                signedStart + "\n" +
+        #                signedExpiry + "\n" +
+        #                canonicalizedResource + "\n" +
+        #                signedIdentifier + "\n" +
+        #                signedIP + "\n" +
+        #                signedProtocol + "\n" +
+        #                signedVersion + "\n" +
+        #                signedResource + "\n" +
+        #                signedSnapshotTime + "\n" +
+        #                signedEncryptionScope + "\n" +
+        #                signedCacheControl + "\n" +
+        #                signedContentDisposition + "\n" +
+        #                signedContentEncoding + "\n" +
+        #                signedContentLanguage + "\n" +
+        #                signedContentType
+        # ====================================================================
+        
+        # Canonicalized resource for service SAS: /blob/{account}/{container}/{blob}
+        canonicalized_resource = f"/blob/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}"
+        
+        # Build string to sign for Service SAS (field order must match the list above)
+        string_to_sign = (
+            f"r\n"                           # signed permissions (read)
+            f"\n"                            # signed start (empty)
+            f"{expiry_str}\n"                # signed expiry
+            f"{canonicalized_resource}\n"    # canonicalized resource
+            f"\n"                            # signed identifier (empty)
+            f"\n"                            # signed IP (empty)
+            f"https\n"                       # signed protocol
+            f"2020-12-06\n"                  # signed version
+            f"b\n"                           # signed resource (b = blob)
+            f"\n"                            # signed snapshot time (empty)
+            f"\n"                            # signed encryption scope (empty)
+            f"\n"                            # signed cache control (empty)
+            f"\n"                            # signed content disposition (empty)
+            f"\n"                            # signed content encoding (empty)
+            f"\n"                            # signed content language (empty)
+            f""                              # signed content type (empty, no newline at end)
+        )
+        
+        # Sign with HMAC-SHA256
+        signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest()
+        signature = base64.b64encode(signed_hmac).decode('utf-8')
+        
+        # Build query parameters; sv/sr/sp/se/spr must repeat the values that were signed above
+        sas_params = {
+            'sv': '2020-12-06',             # signed version
+            'sr': 'b',                      # signed resource (blob)
+            'sp': 'r',                      # signed permissions (read)
+            'se': expiry_str,               # signed expiry
+            'spr': 'https',                 # signed protocol
+            'sig': signature                # signature
+        }
+        
+        # URL encode the signature and other values (safe='' also escapes '/' and '+')
+        sas_token = '&'.join([f"{k}={urllib.parse.quote(v, safe='')}" for k, v in sas_params.items()])
+        return sas_token
+        
+    except Exception as e:
+        st.error(f"SAS generation error: {e}")
+        import traceback
+        st.error(traceback.format_exc())
+        return None
+
 
+def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]:
+    """Upload file_bytes to INPUT_CONTAINER with azure-storage-blob and return (sas_url, error).
+    On success error is None; on any failure sas_url is None and error holds the message.
+    The SAS is read-only, HTTPS-only and expires 24 hours after it is generated."""
     try:
-        with st.spinner("Searching..."):
-            data = call_search_api(payload)
+        from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
+        
+        connection_string = (
+            f"DefaultEndpointsProtocol=https;"
+            f"AccountName={AZURE_STORAGE_ACCOUNT};"
+            f"AccountKey={AZURE_STORAGE_KEY};"
+            f"EndpointSuffix=core.windows.net"
+        )
+        
+        blob_service = BlobServiceClient.from_connection_string(connection_string)
+        container_client = blob_service.get_container_client(INPUT_CONTAINER)
+        
+        # Ensure container exists
+        try:
+            container_client.create_container()
+        except Exception:  # best-effort: every failure is ignored (presumably "already exists" - TODO confirm)
+            pass
+        
+        # Upload blob (overwrite=True replaces an existing blob of the same name)
+        blob_client = container_client.get_blob_client(blob_name)
+        blob_client.upload_blob(file_bytes, overwrite=True)
+        
+        # Generate SAS token
+        sas_token = generate_blob_sas(
+            account_name=AZURE_STORAGE_ACCOUNT,
+            container_name=INPUT_CONTAINER,
+            blob_name=blob_name,
+            account_key=AZURE_STORAGE_KEY,
+            permission=BlobSasPermissions(read=True),
+            expiry=datetime.now(timezone.utc) + timedelta(hours=24),
+            protocol="https"
+        )
+        
+        sas_url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}?{sas_token}"
+        
+        # Test the SAS URL
+        is_valid, test_msg = test_sas_url(sas_url)
+        if not is_valid:
+            return None, f"SAS URL validation failed: {test_msg}"
+        
+        return sas_url, None
+        
+    except ImportError:
+        return None, "azure-storage-blob not installed"
     except Exception as e:
-        st.error(f"Search failed: {e}")
-        st.stop()
-
-    hits = data.get("hits", [])
-    st.caption(f"Count: {data.get('count')} | Returned: {len(hits)}")
-
-    for i, h in enumerate(hits, start=1):
-        start_ms = h.get("start_ms", 0)
-        end_ms = h.get("end_ms", 0)
-        vid = h.get("video_id", "")
-        seg = h.get("segment_id", "")
-        score = h.get("score", None)
-
-        header = f"{i}. {vid}  |  {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}"
-        if seg:
-            header += f"  |  seg={seg}"
-        if score is not None:
-            header += f"  |  score={score:.3f}" if isinstance(score, (int, float)) else f"  |  score={score}"
-
-        with st.expander(header, expanded=(i <= 3)):
-            st.write(h.get("text", ""))
-
-            labels = h.get("pred_labels") or []
-            conf = h.get("pred_confidence")
-            rationale = h.get("pred_rationale")
-
-            if labels or conf is not None or rationale:
-                st.subheader("Annotations")
-                if labels:
-                    st.write("**Labels:**", ", ".join(labels))
-                if conf is not None:
-                    st.write("**Confidence:**", conf)
-                if rationale:
-                    st.write("**Rationale:**", rationale)
+        import traceback
+        return None, f"SDK upload failed: {str(e)}\n{traceback.format_exc()}"
+
+
+def save_segments_to_blob(video_id: str, segments: list) -> str:
+    """Upload *segments* as ``{video_id}_segments.json``; return the blob name.
+
+    Raises RuntimeError if AZURE_STORAGE_KEY is unset; HTTPError if the PUT fails.
+    """
+    if not AZURE_STORAGE_KEY:
+        raise RuntimeError("Azure Storage key not configured")
+
+    blob_name = f"{video_id}_segments.json"
+    url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{SEGMENTS_CONTAINER}/{blob_name}"
+    json_bytes = json.dumps(segments, indent=2).encode('utf-8')
+    content_length = len(json_bytes)
+
+    # Aware UTC clock (datetime.utcnow() is deprecated); the formatted text is unchanged.
+    date_str = datetime.now(timezone.utc).strftime('%a, %d %b %Y %H:%M:%S GMT')
+    string_to_sign = (
+        f"PUT\n"                       # HTTP verb
+        f"\n"                          # Content-Encoding (empty)
+        f"\n"                          # Content-Language (empty)
+        f"{content_length}\n"          # Content-Length (must equal the body size)
+        f"\n"                          # Content-MD5 (empty)
+        f"application/json\n"          # Content-Type (must equal the header sent)
+        f"\n"                          # Date (empty, x-ms-date is used instead)
+        f"\n"                          # If-Modified-Since (empty)
+        f"\n"                          # If-Match (empty)
+        f"\n"                          # If-None-Match (empty)
+        f"\n"                          # If-Unmodified-Since (empty)
+        f"\n"                          # Range (empty)
+        f"x-ms-blob-type:BlockBlob\n"  # CanonicalizedHeaders (sorted alphabetically)
+        f"x-ms-date:{date_str}\n"
+        f"x-ms-version:2020-12-06\n"
+        f"/{AZURE_STORAGE_ACCOUNT}/{SEGMENTS_CONTAINER}/{blob_name}"  # CanonicalizedResource
+    )
+
+    account_key = base64.b64decode(AZURE_STORAGE_KEY)
+    signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest()
+    signature = base64.b64encode(signed_hmac).decode('utf-8')
+    auth_header = f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}"
+    headers = {  # every value here must match what was signed above
+        "x-ms-date": date_str,
+        "x-ms-version": "2020-12-06",
+        "x-ms-blob-type": "BlockBlob",
+        "Content-Type": "application/json",
+        "Content-Length": str(content_length),
+        "Authorization": auth_header
+    }
+    r = requests.put(url, data=json_bytes, headers=headers, timeout=60)
+    r.raise_for_status()
+    return blob_name
+
+
+def check_yt_dlp() -> bool:
+    """Return True when a ``yt-dlp`` executable can be found on PATH."""
+    # shutil.which does the PATH lookup in-process, so this no longer depends
+    # on the Unix-only `which` binary and needs no blanket except.
+    import shutil  # local import: the file-level import block is outside this hunk
+    return shutil.which("yt-dlp") is not None
+
+
+def download_youtube_audio(youtube_url: str, output_path: str, progress_callback=None) -> Tuple[Optional[str], Optional[str]]:
+    """Download the audio of *youtube_url* with yt-dlp; return (path, error).
+
+    Exactly one of the pair is None. The returned path may carry a different
+    extension than *output_path* when the file was written under another name.
+    """
+    if not check_yt_dlp():
+        return None, "yt-dlp not installed. Run: pip install yt-dlp"
+
+    if not youtube_url or not youtube_url.strip():
+        return None, "YouTube URL is empty"
+
+    try:
+        cmd = [
+            "yt-dlp",
+            "-f", "bestaudio[ext=m4a]/bestaudio",
+            "--extract-audio",
+            "--audio-format", "m4a",
+            "--audio-quality", "0",
+            "--no-check-certificate",  # NOTE(review): disables TLS certificate checks; added for compatibility
+            "--no-warnings",           # Reduce noise
+            "-o", output_path,
+        ]
+
+        # No Node.js on PATH: request the web player client to avoid "No supported JavaScript runtime"
+        import shutil  # local import: the file-level import block is outside this hunk
+        if shutil.which("node") is None:
+            cmd.extend(["--extractor-args", "youtube:player_client=web"])
+        # "--" ends option parsing, so a URL from a user CSV that starts with "-" is never read as a flag
+        cmd.extend(["--", youtube_url.strip()])
+
+        if progress_callback:
+            progress_callback(15, "Downloading from YouTube...")
+
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
+
+        if result.returncode != 0:
+            error_msg = result.stderr[:500]
+            # Provide helpful error message for JS runtime issues
+            if "JavaScript runtime" in error_msg:
+                error_msg += "\n\n💡 Tip: Install Node.js or run: pip install yt-dlp --upgrade"
+            return None, f"yt-dlp failed: {error_msg}"
+
+        # Find the actual file
+        if os.path.exists(output_path):
+            return output_path, None
+
+        # Try alternative extensions
+        base = output_path.rsplit('.', 1)[0]
+        for ext in ['.m4a', '.mp3', '.webm', '.opus']:
+            alt_path = base + ext
+            if os.path.exists(alt_path):
+                return alt_path, None
+
+        return None, "Download completed but file not found"
+
+    except subprocess.TimeoutExpired:
+        return None, "Download timed out after 10 minutes"
+    except Exception as e:
+        return None, f"Error: {str(e)}"
+
+
+def detect_url_type(url: str) -> str:
+    """Classify *url* as ``"youtube"``, ``"direct"`` or ``"unknown"``.
+
+    Matching is case-insensitive and ignores surrounding whitespace.
+    Falsy input (``None``, ``""``) yields ``"unknown"``.
+    """
+    if not url:
+        return "unknown"
+
+    url_lower = str(url).lower().strip()
+
+    # Any mention of a YouTube host counts. The four regexes this replaces
+    # all reduced to "contains youtube.com or youtu.be", because the first
+    # one was unanchored and had only optional prefixes.
+    if any(host in url_lower for host in ('youtube.com', 'youtu.be')):
+        return "youtube"
+
+    # Test the extension on the path only: signed links carry a query string
+    # or fragment (clip.mp4?sig=...) that defeated endswith() on the full URL.
+    media_extensions = ('.mp4', '.m4a', '.mp3', '.wav', '.mov', '.avi', '.mkv', '.webm')
+    path_only = url_lower.split('#', 1)[0].split('?', 1)[0]
+    if path_only.endswith(media_extensions):
+        return "direct"
+
+    # Box.com, Google Drive, Dropbox, etc. - treat as direct
+    cloud_patterns = ('box.com', 'drive.google.com', 'dropbox.com', 'onedrive')
+    if any(pattern in url_lower for pattern in cloud_patterns):
+        return "direct"
+
+    return "unknown"
+
+
+def process_single_video(url: str, custom_id: Optional[str] = None, 
+                        progress_bar=None, status_text=None, 
+                        overall_progress: Tuple[int, int] = (0, 1)) -> Dict[str, Any]:
+    """
+    Fetch/upload (YouTube) or pass through (direct) *url*, transcribe it, then save and index the segments.
+    Exceptions are caught and reported in result["error"]; result["status"] ends as "success" or "failed".
+    """
+    result = {
+        "url": url,
+        "video_id": None,
+        "status": "pending",
+        "segments_count": 0,
+        "error": None,
+        "index_status": None
+    }
+    
+    try:
+        # Detect URL type
+        url_type = detect_url_type(url)
+        
+        if url_type == "unknown":
+            result["status"] = "failed"
+            result["error"] = "Unknown URL type. Must be YouTube or direct media URL."
+            return result
+        
+        # Generate video ID
+        if custom_id:
+            video_id = custom_id.strip()
+        else:
+            video_id = generate_video_id(f"batch_{url}")
+        
+        result["video_id"] = video_id
+        
+        # Update progress
+        current, total = overall_progress
+        base_progress = int((current / total) * 100) if progress_bar else 0  # current/total as a percentage
+        
+        if status_text:
+            status_text.text(f"[{current}/{total}] Processing: {video_id}")
+        
+        media_url = None
+        
+        # Handle YouTube
+        if url_type == "youtube":
+            if not check_yt_dlp():
+                result["status"] = "failed"
+                result["error"] = "yt-dlp not installed"
+                return result
+            
+            import tempfile
+            with tempfile.TemporaryDirectory() as tmpdir:
+                if status_text:
+                    status_text.text(f"[{current}/{total}] Downloading from YouTube...")
+                
+                output_path = f"{tmpdir}/youtube_{video_id}.m4a"
+                downloaded_path, error = download_youtube_audio(url.strip(), output_path)
+                
+                if error:
+                    result["status"] = "failed"
+                    result["error"] = f"Download failed: {error}"
+                    return result
+                
+                # Read and upload (the whole file is held in memory)
+                with open(downloaded_path, 'rb') as f:
+                    file_bytes = f.read()
+                
+                blob_name = f"batch_youtube_{video_id}_{int(time.time())}.m4a"
+                
+                if status_text:
+                    status_text.text(f"[{current}/{total}] Uploading to Azure...")
+                
+                # Try SDK first
+                sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name)
+                if error and ("not installed" in error or "SDK" in error):  # retry via REST only for SDK-missing / SDK-failure errors
+                    sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name)
+                
+                if error:
+                    result["status"] = "failed"
+                    result["error"] = f"Upload failed: {error}"
+                    return result
+                
+                media_url = sas_url
+        
+        # Handle Direct URL
+        elif url_type == "direct":
+            media_url = url.strip()
+            if status_text:
+                status_text.text(f"[{current}/{total}] Using direct URL...")
+        
+        if not media_url:
+            result["status"] = "failed"
+            result["error"] = "No media URL available"
+            return result
+        
+        # Submit transcription
+        if status_text:
+            status_text.text(f"[{current}/{total}] Submitting to Speech API...")
+        
+        submit_result = submit_transcription_direct(video_id, media_url)
+        operation_url = submit_result.get("operation_url")
+        
+        if not operation_url:
+            result["status"] = "failed"
+            result["error"] = "No operation URL returned"
+            return result
+        
+        # Poll for completion
+        max_polls = 120  # upper bound: 120 polls, each preceded by a POLL_SECONDS sleep
+        transcription_data = None
+        
+        for i in range(max_polls):
+            time.sleep(POLL_SECONDS)
+            poll_result = poll_transcription_operation(operation_url)
+            status = poll_result.get("status", "unknown")
+            
+            # Update progress during polling
+            if progress_bar:
+                poll_progress = min(int((i / max_polls) * 20), 20)  # 20% of progress for polling
+                overall = base_progress + int((1 / total) * 80) + int((1 / total) * poll_progress)
+                progress_bar.progress(min(overall, 99))
+            
+            if status.lower() == "succeeded":
+                transcription_data = get_transcription_from_result(poll_result)
+                break
+            elif status.lower() == "failed":
+                error_msg = poll_result.get("properties", {}).get("error", {}).get("message", "Unknown error")
+                result["status"] = "failed"
+                result["error"] = f"Transcription failed: {error_msg}"
+                return result
+        
+        if not transcription_data:
+            result["status"] = "failed"
+            result["error"] = "Transcription timed out"
+            return result
+        
+        # Process segments
+        if status_text:
+            status_text.text(f"[{current}/{total}] Processing segments...")
+        
+        segments = process_transcription_to_segments(transcription_data, video_id)
+        result["segments_count"] = len(segments)
+        
+        # Save to blob
+        save_segments_to_blob(video_id, segments)
+        
+        # Index to search
+        try:
+            index_result = index_segments_direct(video_id, segments)
+            result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents"
+        except Exception as e:  # an indexing failure is recorded but does not fail the video
+            result["index_status"] = f"Indexing skipped: {str(e)}"
+        
+        result["status"] = "success"
+        
+    except Exception as e:
+        result["status"] = "failed"
+        result["error"] = str(e)
+        import traceback
+        result["error"] += f"\n{traceback.format_exc()}"
+    
+    return result
+
+
+# =============================================================================
+# PAGE 1: SEARCH
+# =============================================================================
+
+if page == "🔎 Search Segments":
+    st.header("Search Indexed Video Segments")
+    
+    if not SEARCH_FN_URL:
+        st.error("SEARCH_FN_URL not configured. Cannot search.")
+    else:
+        col1, col2 = st.columns([3, 1])
+        with col1:
+            q = st.text_input("Query", placeholder="e.g., measles vaccine side effects")
+        with col2:
+            video_id_filter = st.text_input("Filter by video_id (optional)")
+        
+        if st.button("Search", type="primary", disabled=(not q.strip())):
+            try:
+                payload = {"q": q.strip(), "mode": mode, "top": top}
+                if mode in ("hybrid", "vector"):
+                    payload["k"] = k
+                if video_id_filter.strip():
+                    payload["video_id"] = video_id_filter.strip()
+                
+                with st.spinner("Searching..."):
+                    data = call_api(SEARCH_FN_URL, payload)
+                
+                hits = data.get("hits", [])
+                st.caption(f"Found {data.get('count', 0)} total | Showing {len(hits)}")
+                
+                for i, h in enumerate(hits, start=1):
+                    start_ms, end_ms = h.get("start_ms", 0), h.get("end_ms", 0)
+                    vid, seg, score = h.get("video_id", ""), h.get("segment_id", ""), h.get("score")
+                    
+                    header = f"{i}. {vid} | {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}"
+                    if seg:
+                        header += f" | seg={seg}"
+                    if score is not None:
+                        header += f" | score={score:.3f}" if isinstance(score, (int, float)) else f" | score={score}"
+                    
+                    with st.expander(header, expanded=(i <= 3)):
+                        st.write(h.get("text", ""))
+                        if h.get("pred_labels"):
+                            st.caption(f"Labels: {', '.join(h['pred_labels'])}")
+            except Exception as e:
+                st.error(f"Search failed: {e}")
+
+
+# =============================================================================
+# PAGE 2: UPLOAD (DIRECT API VERSION)
+# =============================================================================
+
+elif page == "⬆️ Upload & Transcribe":
+    st.header("Upload Video for Transcription")
+    st.info(" Using direct Azure Speech API (bypassing Azure Function)")
+    
+    # Check Azure config
+    azure_configured = bool(AZURE_STORAGE_KEY) and bool(SPEECH_KEY)
+    if not azure_configured:
+        st.error("⚠️ Azure Storage and Speech keys required. Check .env file.")
+    
+    source_type = st.radio("Select Source", 
+                          ["File Upload", "Direct URL", "YouTube", "📁 Batch CSV Upload"],
+                          horizontal=True)
+    
+    media_url = None
+    video_id = None
+    file_bytes = None
+    yt_url = None  # Initialize to None
+    csv_df = None
+    
+    # -------------------------------------------------------------------------
+    # FILE UPLOAD
+    # -------------------------------------------------------------------------
+    if source_type == "File Upload":
+        if not azure_configured:
+            st.info("Please configure Azure Storage to enable file upload")
+        else:
+            uploaded_file = st.file_uploader(
+                "Choose video/audio file",
+                type=["mp4", "avi", "mov", "mkv", "m4a", "mp3", "wav"],
+                accept_multiple_files=False
+            )
+            
+            if uploaded_file:
+                st.success(f"📁 {uploaded_file.name} ({uploaded_file.size / 1024 / 1024:.1f} MB)")
+                file_bytes = uploaded_file.getvalue()
+                video_id = generate_video_id(uploaded_file.name)
+                st.info("File ready for upload to Azure")
+    
+    # -------------------------------------------------------------------------
+    # DIRECT URL
+    # -------------------------------------------------------------------------
+    elif source_type == "Direct URL":
+        url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/    ...")
+        
+        if url_input.strip():
+            media_url = url_input.strip()
+            video_id = generate_video_id(url_input)
+            st.success("✅ URL validated")
+    
+    # -------------------------------------------------------------------------
+    # YOUTUBE - FIXED with session state
+    # -------------------------------------------------------------------------
+    elif source_type == "YouTube":
+        # Use session state to persist the URL
+        yt_url = st.text_input(
+            "YouTube URL", 
+            placeholder="https://youtube.com/watch?v=  ...",
+            value=st.session_state.yt_url_value,
+            key="yt_url_input"
+        )
+        
+        # Update session state when URL changes - FIXED: removed experimental_rerun
+        if yt_url != st.session_state.yt_url_value:
+            st.session_state.yt_url_value = yt_url
+            # Use st.rerun() instead of st.experimental_rerun() for newer Streamlit versions
+            try:
+                st.rerun()
+            except AttributeError:
+                # Fallback for older versions
+                try:
+                    st.experimental_rerun()
+                except AttributeError:
+                    pass  # If neither exists, just continue without rerun
+        
+        if not check_yt_dlp():
+            st.warning("yt-dlp not installed")
+            if st.button("Install yt-dlp"):
+                with st.spinner("Installing..."):
+                    subprocess.run(["pip", "install", "-q", "yt-dlp"])
+                # FIXED: Use st.rerun() instead of experimental_rerun
+                try:
+                    st.rerun()
+                except AttributeError:
+                    try:
+                        st.experimental_rerun()
+                    except AttributeError:
+                        st.info("Please refresh the page manually")
+        elif yt_url and yt_url.strip():
+            video_id = generate_video_id(f"yt_{yt_url.strip()}")
+            st.success("YouTube URL ready")
+    
+    # -------------------------------------------------------------------------
+    # BATCH CSV UPLOAD - NEW FEATURE
+    # -------------------------------------------------------------------------
+    elif source_type == "📁 Batch CSV Upload":
+        st.subheader("📁 Batch Process Videos from CSV")
+        
+        csv_file = st.file_uploader(
+            "Upload CSV file",
+            type=["csv"],
+            help="CSV must contain a column with video URLs (YouTube or direct links)"
+        )
+        
+        if csv_file:
+            try:
+                # Read CSV - handle various formats
+                # Try to detect if URLs are in header or rows
+                content = csv_file.read().decode('utf-8')
+                csv_file.seek(0)  # Reset pointer
+                
+                # First attempt: standard read
+                try:
+                    csv_df = pd.read_csv(csv_file)
+                except Exception:
+                    # Second attempt: maybe single column with no header
+                    csv_file.seek(0)
+                    csv_df = pd.read_csv(csv_file, header=None)
+                    csv_df.columns = [f"column_{i}" for i in range(len(csv_df.columns))]
+                
+                # Check if column names look like URLs (common issue)
+                url_like_columns = []
+                for col in csv_df.columns:
+                    col_str = str(col).strip()
+                    if detect_url_type(col_str) != "unknown" or col_str.startswith('http'):
+                        url_like_columns.append(col)
+                
+                # If column names look like URLs, treat them as data
+                if url_like_columns and len(csv_df.columns) == 1:
+                    # The column name is actually a URL, convert to data
+                    url_col_name = csv_df.columns[0]
+                    new_row = {url_col_name: url_col_name}
+                    csv_df = pd.concat([pd.DataFrame([new_row]), csv_df], ignore_index=True)
+                
+                st.success(f"✅ Loaded CSV with {len(csv_df)} rows and {len(csv_df.columns)} columns")
+                
+                # Show available columns
+                st.write("**Available columns:**", list(csv_df.columns))
+                
+                # Let user select the URL column
+                url_column = st.selectbox(
+                    "Select column containing video URLs",
+                    options=csv_df.columns.tolist(),
+                    help="Choose the column that contains YouTube or direct media URLs"
+                )
+                
+                # Optional: Select custom ID column
+                id_column_options = ["Auto-generate"] + [c for c in csv_df.columns if c != url_column]
+                id_column = st.selectbox(
+                    "Select column for custom Video ID (optional)",
+                    options=id_column_options,
+                    index=0,
+                    help="Optional: Choose a column to use as custom video ID (e.g., title, ID field)"
+                )
+                
+                # Extract and validate URLs
+                urls_raw = csv_df[url_column].dropna().astype(str).tolist()
+                
+                # Clean URLs (remove whitespace)
+                urls_to_process = [u.strip() for u in urls_raw if u.strip()]
+                
+                # Preview
+                with st.expander(f"Preview URLs to process ({len(urls_to_process)} found)"):
+                    for i, url in enumerate(urls_to_process[:10], 1):
+                        url_type = detect_url_type(url)
+                        icon = "🎬" if url_type == "youtube" else "📄" if url_type == "direct" else "❓"
+                        st.text(f"{i}. {icon} {url[:80]}...")
+                    if len(urls_to_process) > 10:
+                        st.caption(f"... and {len(urls_to_process) - 10} more")
+                
+                # Validate URLs
+                valid_urls = []
+                invalid_urls = []
+                
+                for url in urls_to_process:
+                    url_type = detect_url_type(str(url))
+                    if url_type in ["youtube", "direct"]:
+                        valid_urls.append(url)
+                    else:
+                        invalid_urls.append(url)
+                
+                col1, col2, col3 = st.columns(3)
+                col1.metric("Total URLs", len(urls_to_process))
+                col2.metric("✅ Valid", len(valid_urls), f"{len(valid_urls)/len(urls_to_process)*100:.1f}%" if urls_to_process else "0%")
+                col3.metric("❌ Invalid", len(invalid_urls))
+                
+                if invalid_urls:
+                    with st.expander(f"Show {len(invalid_urls)} invalid URLs"):
+                        for url in invalid_urls[:10]:
+                            st.text(f"❌ {url[:100]}...")
+                
+                # Store in session state for processing
+                st.session_state['batch_urls'] = valid_urls
+                st.session_state['batch_df'] = csv_df
+                st.session_state['batch_url_column'] = url_column
+                st.session_state['batch_id_column'] = id_column
+                
+            except Exception as e:
+                st.error(f"Error reading CSV: {e}")
+                import traceback
+                st.error(traceback.format_exc())
+    
+    # Custom ID (for single uploads)
+    custom_id = st.text_input("Custom Video ID (optional)")
+    if custom_id.strip() and source_type != "📁 Batch CSV Upload":
+        video_id = custom_id.strip()
+    
+    # Determine if we can process
+    can_process = False
+    if source_type == "File Upload":
+        can_process = file_bytes is not None and azure_configured
+    elif source_type == "Direct URL":
+        can_process = media_url is not None and len(str(media_url).strip()) > 0
+    elif source_type == "YouTube":
+        yt_url_to_check = st.session_state.get('yt_url_value', '') or (yt_url if yt_url else '')
+        can_process = len(str(yt_url_to_check).strip()) > 0 and check_yt_dlp()
+    elif source_type == "📁 Batch CSV Upload":
+        can_process = (st.session_state.get('batch_urls') and 
+                      len(st.session_state.get('batch_urls', [])) > 0 and 
+                      azure_configured and
+                      not st.session_state.get('batch_processing', False))
+    
+    # Process button
+    button_text = " Start Transcription"
+    if source_type == "📁 Batch CSV Upload":
+        count = len(st.session_state.get('batch_urls', []))
+        button_text = f" Process {count} Videos from CSV"
+    
+    if st.button(button_text, type="primary", disabled=not can_process):
+        
+        # ---------------------------------------------------------------------
+        # BATCH PROCESSING
+        # ---------------------------------------------------------------------
+        if source_type == "📁 Batch CSV Upload":
+            st.session_state.batch_processing = True
+            st.session_state.batch_results = []
+            
+            urls = st.session_state.get('batch_urls', [])
+            csv_df = st.session_state.get('batch_df')
+            url_column = st.session_state.get('batch_url_column')
+            id_column = st.session_state.get('batch_id_column')
+            
+            total = len(urls)
+            
+            st.info(f"Starting batch processing of {total} videos...")
+            
+            # Create progress containers
+            overall_progress = st.progress(0)
+            status_text = st.empty()
+            results_container = st.container()
+            
+            # Process each URL
+            results = []
+            for idx, url in enumerate(urls, 1):
+                # Custom ID from the CSV; None is what the "Auto-generate" path passes on
+                custom_vid_id = None
+                if id_column != "Auto-generate":
+                    # The first row whose URL cell equals this URL supplies the ID
+                    row = csv_df[csv_df[url_column] == url]
+                    if not row.empty and pd.notna(row[id_column].iloc[0]):  # blank cell: stay None, not "nan"
+                        custom_vid_id = str(row[id_column].iloc[0])
+                        # Sanitize; an ID left with no usable characters becomes None instead of ""
+                        custom_vid_id = re.sub(r'[^\w\s-]', '', custom_vid_id).strip().replace(' ', '_')[:50] or None
+                
+                # Process video
+                result = process_single_video(
+                    url=url,
+                    custom_id=custom_vid_id,
+                    progress_bar=overall_progress,
+                    status_text=status_text,
+                    overall_progress=(idx, total)
+                )
+                
+                results.append(result)
+                st.session_state.batch_results = results
+                
+                # Update progress
+                progress_pct = int((idx / total) * 100)
+                overall_progress.progress(progress_pct)
+                
+                # Show result in container
+                with results_container:
+                    if result['status'] == 'success':
+                        st.success(f"✅ [{idx}/{total}] {result['video_id']}: {result['segments_count']} segments")
+                    else:
+                        error_msg = result.get('error', 'Unknown error')
+                        # Truncate long error messages
+                        if len(error_msg) > 200:
+                            error_msg = error_msg[:200] + "..."
+                        st.error(f"❌ [{idx}/{total}] Failed: {error_msg}")
+                
+                # Small delay to prevent rate limiting
+                time.sleep(1)
+            
+            # Final summary
+            overall_progress.progress(100)
+            status_text.text("Batch processing complete!")
+            
+            successful = [r for r in results if r['status'] == 'success']
+            failed = [r for r in results if r['status'] == 'failed']
+            
+            st.markdown("---")
+            st.subheader("📊 Batch Processing Summary")
+            
+            col1, col2, col3 = st.columns(3)
+            col1.metric("Total", total)
+            col2.metric("Successful", len(successful), f"{len(successful)/total*100:.1f}%" if total > 0 else "0%")
+            col3.metric("Failed", len(failed), f"{len(failed)/total*100:.1f}%" if total > 0 else "0%")
+            
+            # Detailed results table
+            with st.expander("View Detailed Results"):
+                results_df = pd.DataFrame([
+                    {
+                        'Video ID': r['video_id'],
+                        'URL': r['url'][:50] + "..." if len(r['url']) > 50 else r['url'],
+                        'Status': r['status'],
+                        'Segments': r.get('segments_count', 0),
+                        'Indexing': r.get('index_status', 'N/A'),
+                        'Error': (r.get('error', '')[:100] + '...') if r.get('error') else ''
+                    }
+                    for r in results
+                ])
+                st.dataframe(results_df)
+                
+                # Download results as CSV
+                csv_buffer = io.StringIO()
+                results_df.to_csv(csv_buffer, index=False)
+                st.download_button(
+                    "Download Results CSV",
+                    csv_buffer.getvalue(),
+                    "batch_processing_results.csv",
+                    "text/csv"
+                )
+            
+            # Search hint
+            if successful:
+                st.info("💡 **Search processed videos using:**")
+                video_ids = [r['video_id'] for r in successful[:5]]
+                st.code(f"video_id:({' OR '.join(video_ids)})")
+            
+            st.session_state.batch_processing = False
+            
+        else:
+            # -----------------------------------------------------------------
+            # SINGLE VIDEO PROCESSING (Original logic)
+            # -----------------------------------------------------------------
+            progress_bar = st.progress(0)
+            status = st.empty()
+            
+            try:
+                # -------------------------------------------------------------
+                # HANDLE FILE UPLOAD (Direct to Azure)
+                # -------------------------------------------------------------
+                if source_type == "File Upload" and file_bytes:
+                    progress_bar.progress(10)
+                    status.text("Uploading to Azure Blob...")
+                    
+                    blob_name = f"upload_{video_id}_{int(time.time())}.m4a"
+                    
+                    # Try SDK method first, fallback to fixed REST method
+                    sas_url = None
+                    error = None
+                    
+                    try:
+                        sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name)
+                    except Exception as e:
+                        error = str(e)
+                    
+                    if error and ("not installed" in error or "SDK" in error):
+                        st.info("Using REST API for upload...")
+                        sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name)
+                    
+                    if error:
+                        raise Exception(error)
+                    
+                    if not sas_url:
+                        raise Exception("Failed to generate SAS URL")
+                    
+                    media_url = sas_url
+                    progress_bar.progress(50)
+                    status.text("Upload complete, starting transcription...")
+                
+                # -------------------------------------------------------------
+                # HANDLE YOUTUBE (Download then Upload)
+                # -------------------------------------------------------------
+                elif source_type == "YouTube":
+                    # Get URL from session state
+                    yt_url = st.session_state.get('yt_url_value', '')
+                    
+                    if not yt_url or not yt_url.strip():
+                        raise Exception("YouTube URL is empty. Please enter a valid YouTube URL.")
+                    
+                    import tempfile
+                    with tempfile.TemporaryDirectory() as tmpdir:
+                        progress_bar.progress(10)
+                        status.text("Downloading from YouTube...")
+                        
+                        output_path = f"{tmpdir}/youtube_{video_id}.m4a"
+                        downloaded_path, error = download_youtube_audio(
+                            yt_url.strip(), 
+                            output_path,
+                            lambda p, m: (progress_bar.progress(p), status.text(m))
+                        )
+                        
+                        if error:
+                            raise Exception(error)
+                        
+                        progress_bar.progress(50)
+                        status.text("Uploading to Azure Blob...")
+                        
+                        # Read file and upload
+                        with open(downloaded_path, 'rb') as f:
+                            file_bytes = f.read()
+                        
+                        blob_name = f"youtube_{video_id}_{int(time.time())}.m4a"
+                        
+                        # Try SDK first, fallback to fixed REST
+                        sas_url = None
+                        error = None
+                        
+                        try:
+                            sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name)
+                        except Exception as e:
+                            error = str(e)
+                        
+                        if error and ("not installed" in error or "SDK" in error):
+                            st.info("Using REST API for upload...")
+                            sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name)
+                        
+                        if error:
+                            raise Exception(error)
+                        
+                        if not sas_url:
+                            raise Exception("Failed to generate SAS URL")
+                        
+                        media_url = sas_url
+                        progress_bar.progress(75)
+                        status.text("Processing with Azure Speech...")
+                
+                # -------------------------------------------------------------
+                # TRANSCRIBE (All paths lead here)
+                # -------------------------------------------------------------
+                if not media_url:
+                    raise Exception("No media URL available")
+                
+                # Submit directly to Azure Speech API
+                status.text("Submitting to Azure Speech-to-Text...")
+                result = submit_transcription_direct(video_id, media_url)
+                operation_url = result.get("operation_url")
+                
+                if not operation_url:
+                    raise Exception("No operation URL returned")
+                
+                # Debug info (plain literal: the message has no placeholders to format)
+                st.info("Debug: Operation URL received")
+                
+                # Poll
+                max_polls = 120
+                transcription_data = None
+                
+                for i in range(max_polls):
+                    time.sleep(POLL_SECONDS)
+                    poll_result = poll_transcription_operation(operation_url)
+                    status_text = poll_result.get("status", "unknown")
+                    
+                    progress = min(75 + int((i / max_polls) * 20), 95)
+                    progress_bar.progress(progress)
+                    status.text(f"Transcribing... ({i * POLL_SECONDS // 60} min) - Status: {status_text}")
+                    
+                    if status_text.lower() == "succeeded":
+                        status.text("Transcription complete, retrieving results...")
+                        transcription_data = get_transcription_from_result(poll_result)
+                        break
+                        
+                    elif status_text.lower() == "failed":
+                        error_msg = poll_result.get("properties", {}).get("error", {}).get("message", "Unknown error")
+                        raise Exception(f"Transcription failed: {error_msg}")
+                
+                if not transcription_data:
+                    raise Exception("Transcription timed out")
+                
+                # -------------------------------------------------------------
+                # PROCESS & INDEX (DIRECT)
+                # -------------------------------------------------------------
+                progress_bar.progress(98)
+                status.text("Processing segments and indexing...")
+                
+                # Convert to segments
+                segments = process_transcription_to_segments(transcription_data, video_id)
+                
+                # Save to blob
+                blob_name = save_segments_to_blob(video_id, segments)
+                
+                # Index to search
+                try:
+                    index_result = index_segments_direct(video_id, segments)
+                    index_msg = f"Indexed: {index_result.get('indexed', 0)} documents"
+                except Exception as e:
+                    index_msg = f"Indexing skipped: {e}"
+                
+                progress_bar.progress(100)
+                status.text("Complete!")
+                
+                st.success(f"""
+                ✅ **Transcription Complete!**
+                - Video ID: {video_id}
+                - Segments: {len(segments)}
+                - {index_msg}
+                """)
+                st.code(f'Search: video_id:{video_id}')
+                
+                # Show sample segments
+                with st.expander("View first 5 segments"):
+                    for seg in segments[:5]:
+                        st.write(f"**{ms_to_ts(seg['start_ms'])} - {ms_to_ts(seg['end_ms'])}:** {seg['text'][:100]}...")
+                    
+            except Exception as e:
+                st.error(f"❌ Error: {str(e)}")
+                st.exception(e)
+                
+                # Debug info
+                if 'debug_poll_url' in st.session_state:
+                    st.error(f"Debug - Poll URL used: {st.session_state['debug_poll_url']}")
+
+
+# Footer
+st.sidebar.markdown("---")
+st.sidebar.caption("Video Annotation Platform v1.0 - Direct API Mode")
\ No newline at end of file

From 5826913d7d9aeb25a5c743b5e34c47f8ce43e556 Mon Sep 17 00:00:00 2001
From: Martin Nwadiugwu <YOUR_GITHUB_EMAIL@example.com>
Date: Sun, 15 Feb 2026 09:43:08 -0600
Subject: [PATCH 2/8] Add batch CSV upload feature + UI updates (multiple
 upload)

---
 ..._folder_manifest.py.backup.20260205_170952 | 167 ++++++++
 ui/host.json                                  |   8 +
 ui/ui_search1.py                              | 125 ++++++
 ui/ui_search2.py                              | 372 ++++++++++++++++++
 4 files changed, 672 insertions(+)
 create mode 100644 scripts/box_shared_folder_manifest.py.backup.20260205_170952
 create mode 100644 ui/host.json
 create mode 100644 ui/ui_search1.py
 create mode 100644 ui/ui_search2.py

diff --git a/scripts/box_shared_folder_manifest.py.backup.20260205_170952 b/scripts/box_shared_folder_manifest.py.backup.20260205_170952
new file mode 100644
index 0000000..310df6e
--- /dev/null
+++ b/scripts/box_shared_folder_manifest.py.backup.20260205_170952
@@ -0,0 +1,167 @@
+"""
+scripts/box_shared_folder_manifest.py - Generate Video Manifest from Box
+
+This script enumerates .m4a video files from a Box shared folder and generates
+a manifest file (videos.jsonl) that lists all videos with their IDs and media URLs.
+It creates open shared links for each file so Azure Speech Service can access them.
+
+Architecture Role:
+- Pre-processing step before video ingestion
+- Generates videos.jsonl input file for import_videos.py
+- Handles Box API authentication and folder traversal
+- Creates publicly accessible download URLs for Speech Service
+
+Usage:
+  python scripts/box_shared_folder_manifest.py
+
+Output:
+  - videos.jsonl: One JSON object per line with video_id and media_url
+
+Configuration (via .env):
+  - BOX_SHARED_FOLDER_URL: Box shared folder link
+  - BOX_TOKEN or BOX_ACCESS_TOKEN/BOX_REFRESH_TOKEN: Box authentication
+  - OUT_PATH: Output file path (default: videos.jsonl)
+  - RECURSIVE: Whether to traverse subfolders (default: 1)
+"""
+
+import json
+import os
+import requests
+from typing import Dict, Any, List, Optional
+
+from box_auth import get_access_token
+
+BOX_API = "https://api.box.com/2.0"  # base URL for every Box request below
+
+SHARED_FOLDER_URL = os.environ["BOX_SHARED_FOLDER_URL"]  # https://...box.com/s/<id>; required (KeyError if unset)
+print("BOX_SHARED_FOLDER_URL =", SHARED_FOLDER_URL)  # executes at import time
+OUT_PATH = os.environ.get("OUT_PATH", "videos.jsonl")
+RECURSIVE = os.environ.get("RECURSIVE", "1") == "1"  # only the exact string "1" enables recursion
+
+
+def shared_headers(token: str) -> Dict[str, str]:
+    # Bearer auth plus the BoxApi header that names the shared folder link.
+    headers = {"Authorization": f"Bearer {token}"}
+    headers["BoxApi"] = f"shared_link={SHARED_FOLDER_URL}"
+    headers["Content-Type"] = "application/json"
+    return headers
+
+
+def auth_headers(token: str) -> Dict[str, str]:
+    # Bearer-token JSON headers without the shared-link BoxApi entry.
+    headers = {"Authorization": f"Bearer {token}"}
+    headers["Content-Type"] = "application/json"
+    return headers
+
+
+def resolve_shared_folder(token: str) -> Dict[str, Any]:
+    # GET /shared_items with the shared-link headers; any non-200 reply raises RuntimeError.
+    headers = shared_headers(token)
+    response = requests.get(f"{BOX_API}/shared_items", headers=headers, timeout=30)
+    if response.status_code == 200:
+        return response.json()
+    raise RuntimeError(f"{response.status_code} {response.text} (headers sent: {headers.get('BoxApi')})")
+
+
+
+def list_folder_items(token: str, folder_id: str, limit: int = 1000, offset: int = 0) -> Dict[str, Any]:
+    # Fetch one page of a folder's items; HTTP errors propagate from raise_for_status().
+    paging = {"limit": limit, "offset": offset}
+    response = requests.get(f"{BOX_API}/folders/{folder_id}/items", headers=shared_headers(token), params=paging, timeout=30)
+    response.raise_for_status()
+    return response.json()
+
+
+def ensure_open_shared_link_for_file(token: str, file_id: str) -> str:
+    """
+    Set an open shared link on the file and return a URL for it.
+
+    Returns the link's ``download_url`` when the response has one, otherwise
+    the plain shared-link ``url`` (which may require cookies). Raises
+    RuntimeError when the response carries neither; HTTP errors propagate
+    from ``raise_for_status()``.
+    """
+    response = requests.put(
+        f"{BOX_API}/files/{file_id}",
+        headers=auth_headers(token),
+        params={"fields": "shared_link"},  # IMPORTANT: ask for shared_link back
+        json={"shared_link": {"access": "open"}},
+        timeout=30,
+    )
+    response.raise_for_status()
+    data = response.json()
+
+    shared_link = data.get("shared_link") or {}
+    link = shared_link.get("download_url") or shared_link.get("url")
+    if not link:
+        raise RuntimeError(f"No shared_link returned for file {file_id}: {data}")
+    return link
+
+
+
+def walk(token: str, folder_id: str) -> List[Dict[str, Any]]:
+    # Collect every entry of one folder by paging through list_folder_items.
+    page_size = 1000
+    collected: List[Dict[str, Any]] = []
+    while True:
+        # The next offset is simply the number of entries gathered so far.
+        page = list_folder_items(token, folder_id, limit=page_size, offset=len(collected))
+        batch = page.get("entries", [])
+        collected += batch
+        # Stop once the reported total_count is reached or a page comes back empty.
+        if len(collected) >= page.get("total_count", 0) or not batch:
+            break
+    return collected
+
+
+def main():  # walk the shared folder breadth-first and write one JSONL row per .m4a file
+    token = get_access_token()
+
+    shared_folder = resolve_shared_folder(token)
+    if shared_folder.get("type") != "folder":
+        raise RuntimeError(f"Shared link did not resolve to a folder: {shared_folder.get('type')}")
+    root_id = shared_folder["id"]
+
+    queue = [root_id]  # folder ids still to be listed, consumed first-in first-out
+    out_count = 0
+
+    with open(OUT_PATH, "w", encoding="utf-8") as f:
+        while queue:
+            fid = queue.pop(0)
+            entries = walk(token, fid)
+
+            for e in entries:
+                et = e.get("type")
+                name = (e.get("name") or "")
+                lname = name.lower()
+
+                if et == "folder" and RECURSIVE:  # subfolders are queued only when RECURSIVE is set
+                    queue.append(e["id"])
+                    continue
+
+                if et != "file":
+                    continue
+
+                if not lname.endswith(".m4a"):  # case-insensitive extension filter
+                    continue
+
+                file_id = e["id"]
+                video_id = f"vid_{file_id}"
+
+                # Make a per-file open shared link (Speech can fetch without auth).
+                # If your org disallows open links, this will fail — then you’ll need Blob staging.
+                file_link = ensure_open_shared_link_for_file(token, file_id)
+
+                # The link is written unchanged; the download=1 suffix on the next line is disabled.
+                media_url = file_link # + ("?download=1" if "?" not in file_link else "&download=1")
+
+                f.write(json.dumps({"video_id": video_id, "media_url": media_url}) + "\n")
+                out_count += 1
+                if out_count % 10 == 0:  # progress message every 10 entries
+                    print(f"Wrote {out_count} entries...")
+
+    print(f"Done. Wrote {out_count} m4a entries to {OUT_PATH}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ui/host.json b/ui/host.json
new file mode 100644
index 0000000..3369578
--- /dev/null
+++ b/ui/host.json
@@ -0,0 +1,8 @@
+{
+  "version": "2.0",
+  "isDefaultHostConfig": true,
+  "extensionBundle": {
+    "id": "Microsoft.Azure.Functions.ExtensionBundle",
+    "version": "[4.*, 5.0.0)"
+  }
+}
\ No newline at end of file
diff --git a/ui/ui_search1.py b/ui/ui_search1.py
new file mode 100644
index 0000000..194fdec
--- /dev/null
+++ b/ui/ui_search1.py
@@ -0,0 +1,125 @@
+"""
+ui_search.py - Streamlit Web Interface for Video Segment Search
+
+This Streamlit application provides a user-friendly web interface for searching
+indexed video segments. Users can:
+- Enter text queries to search across all indexed segments
+- Choose search mode (keyword, vector, or hybrid)
+- Filter results by video_id and adjust result count
+- View segment text with timestamps and relevance scores
+
+Architecture Role:
+- Frontend user interface for the video annotation system
+- Deployed as Azure Container App (video-annotator-ui)
+- Calls SearchSegments Azure Function for all search operations
+- Displays results with formatted timestamps and metadata
+
+Deployment:
+  - Local: python -m streamlit run ui_search.py
+  - Azure: Deployed as Container App (see ui/README.md)
+
+Configuration (via .env or Container App env vars):
+  - SEARCH_FN_URL: SearchSegments function endpoint
+  - DEFAULT_MODE: Default search mode (hybrid/keyword/vector)
+  - DEFAULT_TOP: Default number of results
+  - DEFAULT_K: Default vector recall depth
+"""
+
+import os
+import requests
+import streamlit as st
+from dotenv import load_dotenv
+
+# Load .env locally (Container Apps/App Service will use real env vars)
+load_dotenv()
+
+SEARCH_FN_URL = os.environ["SEARCH_FN_URL"]  # required; KeyError at startup if unset
+DEFAULT_MODE = os.environ.get("DEFAULT_MODE", "hybrid")
+DEFAULT_TOP = int(os.environ.get("DEFAULT_TOP", "10"))  # int() raises ValueError on a non-numeric value
+DEFAULT_K = int(os.environ.get("DEFAULT_K", "40"))
+
+st.set_page_config(page_title="Video Segment Search", layout="wide")
+st.title("🔎 Search indexed video segments")
+
+with st.sidebar:
+    st.header("Search settings")
+    mode = st.selectbox(
+        "Mode",
+        ["keyword", "hybrid", "vector"],
+        index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE)
+        if DEFAULT_MODE in ("keyword", "hybrid", "vector")
+        else 1,  # unrecognized DEFAULT_MODE preselects "hybrid" (position 1)
+    )
+    top = st.slider("Top", 1, 50, DEFAULT_TOP)
+    k = st.slider("Vector k (hybrid/vector)", 5, 200, DEFAULT_K)
+    video_id_filter = st.text_input("Filter by video_id (optional)", value="")
+    st.caption("Tip: keep k ~ 4×top for hybrid.")
+
+q = st.text_input("Query", value="", placeholder="e.g., measles misinformation")
+go = st.button("Search", type="primary", disabled=(not q.strip()))  # disabled while the query is blank or whitespace
+
+
+def ms_to_ts(ms: int) -> str:
+    total_seconds = max(0, int(ms // 1000))  # whole seconds; negatives clamp to zero
+    minutes, seconds = divmod(total_seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+    return f"{hours}:{minutes:02d}:{seconds:02d}" if hours else f"{minutes}:{seconds:02d}"
+
+
+def call_search_api(payload: dict) -> dict:
+    # POST the payload as JSON; HTTP >= 400 raises RuntimeError, an empty body yields {}.
+    json_headers = {"Content-Type": "application/json"}
+    response = requests.post(SEARCH_FN_URL, json=payload, timeout=60, headers=json_headers)
+    if response.status_code >= 400:
+        raise RuntimeError(f"HTTP {response.status_code}: {response.text}")
+    # An empty body is mapped to an empty dict instead of being parsed as JSON.
+    if not response.text:
+        return {}
+    return response.json()
+
+
+if go:
+    payload = {"q": q.strip(), "mode": mode, "top": top}
+    if mode in ("hybrid", "vector"):  # k is sent only for these two modes
+        payload["k"] = k
+    if video_id_filter.strip():
+        payload["video_id"] = video_id_filter.strip()
+
+    try:
+        with st.spinner("Searching..."):
+            data = call_search_api(payload)
+    except Exception as e:
+        st.error(f"Search failed: {e}")
+        st.stop()  # per Streamlit docs this halts the run, so `data` is bound whenever the code below executes
+
+    hits = data.get("hits", [])
+    st.caption(f"Count: {data.get('count')} | Returned: {len(hits)}")
+
+    for i, h in enumerate(hits, start=1):
+        start_ms = h.get("start_ms", 0)
+        end_ms = h.get("end_ms", 0)
+        vid = h.get("video_id", "")
+        seg = h.get("segment_id", "")
+        score = h.get("score", None)
+
+        header = f"{i}. {vid}  |  {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}"
+        if seg:
+            header += f"  |  seg={seg}"
+        if score is not None:
+            header += f"  |  score={score:.3f}" if isinstance(score, (int, float)) else f"  |  score={score}"
+
+        with st.expander(header, expanded=(i <= 3)):  # the first three hits start expanded
+            st.write(h.get("text", ""))
+
+            labels = h.get("pred_labels") or []
+            conf = h.get("pred_confidence")
+            rationale = h.get("pred_rationale")
+
+            if labels or conf is not None or rationale:  # heading only when some annotation is present
+                st.subheader("Annotations")
+                if labels:
+                    st.write("**Labels:**", ", ".join(labels))
+                if conf is not None:
+                    st.write("**Confidence:**", conf)
+                if rationale:
+                    st.write("**Rationale:**", rationale)
diff --git a/ui/ui_search2.py b/ui/ui_search2.py
new file mode 100644
index 0000000..c81dda1
--- /dev/null
+++ b/ui/ui_search2.py
@@ -0,0 +1,372 @@
+"""
+ui_search2.py - Streamlit Web Interface for Video Segment Search & Upload
+
+This Streamlit application provides:
+- Search indexed video segments (keyword, vector, hybrid)
+- Upload new videos for transcription and indexing
+- View processing status and results
+
+Architecture:
+- Frontend for video annotation system
+- Calls SearchSegments, TranscribeHttp, and EmbedAndIndex Azure Functions
+- Supports both search and ingest workflows
+"""
+
+import os
+import requests
+import streamlit as st
+import tempfile
+import json
+import time
+from typing import Optional, Dict, Any
+from pathlib import Path
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Azure Function URLs (all optional at import time; call_api() raises "API URL not configured" when one is empty)
+SEARCH_FN_URL = os.environ.get("SEARCH_FN_URL", "")
+TRANSCRIBE_URL = os.environ.get("TRANSCRIBE_URL", "")
+EMBED_INDEX_URL = os.environ.get("EMBED_INDEX_URL", "")
+
+# Default settings (each can be overridden by the environment variable of the same name)
+DEFAULT_MODE = os.environ.get("DEFAULT_MODE", "hybrid")
+DEFAULT_TOP = int(os.environ.get("DEFAULT_TOP", "10"))
+DEFAULT_K = int(os.environ.get("DEFAULT_K", "40"))
+POLL_SECONDS = int(os.environ.get("POLL_SECONDS", "15"))
+
+st.set_page_config(page_title="Video Annotation Platform", layout="wide")
+st.title("🎬 Video Annotation Platform")
+
+# Sidebar: page navigation, plus search settings (mode/top/k are only defined on the Search page)
+with st.sidebar:
+    st.header("Navigation")
+    page = st.radio("Select Page", ["🔎 Search Segments", "⬆️ Upload & Transcribe"])
+
+    st.header("Settings")
+    if page == "🔎 Search Segments":
+        mode = st.selectbox(
+            "Search Mode",
+            ["keyword", "hybrid", "vector"],
+            index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE)
+            if DEFAULT_MODE in ("keyword", "hybrid", "vector")
+            else 1,
+        )
+        top = st.slider("Results", 1, 50, min(max(DEFAULT_TOP, 1), 50))  # clamp env default into the slider range
+        k = st.slider("Vector k", 5, 200, min(max(DEFAULT_K, 5), 200))  # clamp env default into the slider range
+        st.caption("Tip: keep k ~ 4×top for hybrid")
+
+
+# =============================================================================
+# HELPER FUNCTIONS
+# =============================================================================
+
+def ms_to_ts(ms: int) -> str:
+    """Format a millisecond offset as ``H:MM:SS``, or ``M:SS`` when under one hour."""
+    total_seconds = max(0, int(ms // 1000))  # negative offsets clamp to zero
+    minutes, seconds = divmod(total_seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+    return f"{hours}:{minutes:02d}:{seconds:02d}" if hours else f"{minutes}:{seconds:02d}"
+
+
+def call_api(url: str, payload: dict, timeout: int = 60) -> dict:
+    """POST ``payload`` as JSON to ``url``; return the decoded reply, or ``{}`` if the body is empty.
+
+    Raises RuntimeError when ``url`` is empty or the response status is 400 or above.
+    """
+    if not url:
+        raise RuntimeError("API URL not configured")
+    json_headers = {"Content-Type": "application/json"}
+    response = requests.post(url, json=payload, timeout=timeout, headers=json_headers)
+    if response.status_code >= 400:
+        raise RuntimeError(f"HTTP {response.status_code}: {response.text}")
+    if not response.text:
+        return {}
+    return response.json()
+
+
+def submit_transcription(video_id: str, media_url: str) -> Dict[str, Any]:
+    """Ask the TRANSCRIBE_URL function to transcribe ``media_url`` as en-US audio.
+
+    Returns the function's JSON reply. Raises RuntimeError on an HTTP error
+    status or when TRANSCRIBE_URL is empty (both raised by call_api).
+    """
+    request_body = dict(video_id=video_id, media_url=media_url, language="en-US")
+    return call_api(TRANSCRIBE_URL, request_body, timeout=60)
+
+
+def poll_transcription(job_url: str) -> Dict[str, Any]:
+    """GET ``job_url`` (30 s timeout) and return the job status JSON; HTTP error statuses raise."""
+    status_response = requests.get(job_url, timeout=30)
+    status_response.raise_for_status()
+    return status_response.json()
+
+
+def embed_and_index(video_id: str, transcript_data: Dict[str, Any]) -> Dict[str, Any]:
+    """POST the transcript for ``video_id`` to EMBED_INDEX_URL and return the JSON reply.
+
+    Raises RuntimeError on an HTTP error status or when EMBED_INDEX_URL is empty.
+    """
+    request_body = dict(video_id=video_id, transcript=transcript_data)
+    return call_api(EMBED_INDEX_URL, request_body, timeout=60)
+
+
+def process_video_pipeline(video_id: str, media_url: str, progress_bar=None, status_text=None):
+    """
+    Complete pipeline: transcribe -> poll -> embed/index.
+
+    Polls up to 120 times, POLL_SECONDS apart (30 minutes at the default 15 s).
+    Returns a dict whose "status" is "completed", "failed", "timeout" or
+    "error"; exceptions are caught and reported as status "error".
+    """
+    # Step 1: Submit transcription
+    if status_text:
+        status_text.text("Submitting to Azure Speech-to-Text...")
+    if progress_bar:
+        progress_bar.progress(10)
+
+    try:
+        result = submit_transcription(video_id, media_url)
+        job_url = result.get("job_url")
+        if not job_url:
+            return {"status": "failed", "error": "No job URL returned"}
+
+        # Step 2: Poll for completion
+        if status_text:
+            status_text.text("Transcribing audio (this may take several minutes)...")
+
+        max_polls = 120  # 30 minutes at the default POLL_SECONDS of 15
+        for i in range(max_polls):
+            time.sleep(POLL_SECONDS)
+            poll_result = poll_transcription(job_url)
+            status = str(poll_result.get("status") or "unknown").lower()  # null-safe: keep polling on a missing status
+
+            if progress_bar:
+                progress = min(10 + int((i / max_polls) * 70), 80)
+                progress_bar.progress(progress)
+
+            if status == "succeeded":
+                if status_text:
+                    status_text.text("Transcription complete! Indexing segments...")
+                if progress_bar:
+                    progress_bar.progress(85)
+                # Step 3: Embed and index
+                transcript_data = poll_result.get("transcript", {})
+                index_result = embed_and_index(video_id, transcript_data)
+
+                if progress_bar:
+                    progress_bar.progress(100)
+                if status_text:
+                    status_text.text("✅ Complete! Video is now searchable.")
+
+                return {
+                    "status": "completed",
+                    "video_id": video_id,
+                    "segments_indexed": index_result.get("indexed", 0),
+                    "job_url": job_url
+                }
+
+            elif status == "failed":
+                error = poll_result.get("error", "Unknown error")
+                return {"status": "failed", "error": error}
+
+            # Still running: refresh the status line on every 4th poll
+            if status_text and i % 4 == 0:  # about once a minute at the default 15 s interval
+                status_text.text(f"Transcribing... ({(i + 1) * POLL_SECONDS // 60} minutes elapsed)")
+
+        # Timeout: report the real budget, which depends on POLL_SECONDS
+        return {"status": "timeout", "error": f"Transcription timed out after {max_polls * POLL_SECONDS // 60} minutes"}
+
+    except Exception as e:
+        return {"status": "error", "error": str(e)}
+
+
+def generate_video_id(filename: str) -> str:
+    """Build a ``vid_<stem>_<hash>`` ID: the stem capped at 50 chars plus 8 hex chars of the stem's MD5."""
+    import hashlib
+    stem = Path(filename).stem
+    digest = hashlib.md5(stem.encode()).hexdigest()
+    return f"vid_{stem[:50]}_{digest[:8]}"
+
+
+# =============================================================================
+# PAGE 1: SEARCH SEGMENTS
+# =============================================================================
+
+if page == "🔎 Search Segments":
+    st.header("Search Indexed Video Segments")
+
+    col1, col2 = st.columns([3, 1])
+    with col1:
+        q = st.text_input("Query", value="", placeholder="e.g., measles vaccine side effects")
+    with col2:
+        video_id_filter = st.text_input("Filter by video_id (optional)", value="")
+
+    go = st.button("Search", type="primary", disabled=(not q.strip()))
+
+    if go:
+        payload = {"q": q.strip(), "mode": mode, "top": top}
+        if mode in ("hybrid", "vector"):  # k is only sent for the vector-based modes
+            payload["k"] = k
+        if video_id_filter.strip():
+            payload["video_id"] = video_id_filter.strip()
+
+        try:
+            with st.spinner("Searching..."):
+                data = call_api(SEARCH_FN_URL, payload)
+        except Exception as e:
+            st.error(f"Search failed: {e}")
+            st.stop()  # end this script run; nothing below is rendered
+
+        hits = data.get("hits") or []  # tolerate a null "hits" value
+        total_count = data.get("count", 0)
+
+        st.caption(f"Found {total_count} total segments | Showing top {len(hits)}")
+
+        if not hits:
+            st.info("No results found. Try a different query or upload videos first.")
+
+        for i, h in enumerate(hits, start=1):
+            start_ms = h.get("start_ms") or 0  # null-safe: ms_to_ts() cannot format None
+            end_ms = h.get("end_ms") or 0
+            vid = h.get("video_id", "")
+            seg = h.get("segment_id", "")
+            score = h.get("score", None)
+
+            header = f"{i}. {vid}  |  {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}"
+            if seg:
+                header += f"  |  seg={seg}"
+            if score is not None:
+                header += f"  |  score={score:.3f}" if isinstance(score, (float, int)) else f"  |  score={score}"
+
+            with st.expander(header, expanded=(i <= 3)):
+                st.write(h.get("text", ""))
+
+                # Show annotations if present (the section is omitted when all three are missing)
+                labels = h.get("pred_labels") or []
+                conf = h.get("pred_confidence")
+                rationale = h.get("pred_rationale")
+
+                if labels or conf is not None or rationale:
+                    st.subheader("Annotations")
+                    cols = st.columns(3)
+                    if labels:
+                        cols[0].metric("Labels", ", ".join(map(str, labels)))
+                    if conf is not None:
+                        cols[1].metric("Confidence", f"{conf:.2f}" if isinstance(conf, float) else conf)
+                    if rationale:
+                        cols[2].metric("Rationale", str(rationale)[:100] + "..." if len(str(rationale)) > 100 else rationale)
+
+
+# =============================================================================
+# PAGE 2: UPLOAD & TRANSCRIBE
+# =============================================================================
+
+elif page == "⬆️ Upload & Transcribe":
+    st.header("Upload Video for Transcription")
+    
+    st.markdown("""
+    Upload a video file to:
+    1. Extract audio and transcribe using Azure Speech-to-Text
+    2. Segment the transcript into searchable chunks
+    3. Create vector embeddings and index for semantic search
+    
+    Supported formats: MP4, AVI, MOV, MKV, M4A, MP3, WAV
+    """)
+    
+    # File uploader (single file; uploads are not processed yet, see the handler below)
+    uploaded_file = st.file_uploader(
+        "Choose a video or audio file",
+        type=["mp4", "avi", "mov", "mkv", "m4a", "mp3", "wav"],
+        accept_multiple_files=False
+    )
+    
+    # Or provide URL
+    st.markdown("**OR** provide a media URL:")
+    media_url_input = st.text_input(
+        "Media URL (e.g., Box shared link, Azure Blob URL)",
+        placeholder="https://..."
+    )
+    
+    # Video ID input (optional)
+    custom_video_id = st.text_input(
+        "Custom Video ID (optional)",
+        placeholder="my_video_001",
+        help="Leave blank to auto-generate from filename"
+    )
+    
+    # Process button (enabled once a file or a non-blank URL is supplied)
+    process_clicked = st.button(
+        "🚀 Start Transcription",
+        type="primary",
+        disabled=(not uploaded_file and not media_url_input.strip())
+    )
+    
+    if process_clicked:
+        # Determine video ID and media URL (an uploaded file takes precedence over the URL)
+        if uploaded_file:
+            # Buffer the upload in a temp file; it is removed when the with-block exits
+            video_id = custom_video_id or generate_video_id(uploaded_file.name)
+            
+            with tempfile.NamedTemporaryFile(suffix=Path(uploaded_file.name).suffix) as tmp:
+                tmp.write(uploaded_file.getvalue())
+                tmp_path = tmp.name
+            
+            st.info(f"📁 File saved temporarily: {tmp_path}")
+            st.warning("⚠️ Direct file upload requires Azure Blob storage integration. Please use 'Media URL' option with a publicly accessible URL (Box, Azure Blob, etc.) for now.")
+            
+            # For now, instruct user to use URL option
+            st.error("Please use the 'Media URL' option instead. Upload your file to Box or Azure Blob first, then paste the direct download URL.")
+            
+        elif media_url_input.strip():
+            media_url = media_url_input.strip()
+            video_id = custom_video_id.strip() or generate_video_id(media_url)  # derive the ID from the stripped URL
+            
+            st.success(f"🎬 Processing: {video_id}")
+            st.info(f"URL: {media_url[:80]}...")
+            
+            # Progress tracking
+            progress_bar = st.progress(0)
+            status_text = st.empty()
+            
+            # Run pipeline (blocks this script run until the job finishes, fails or times out)
+            result = process_video_pipeline(video_id, media_url, progress_bar, status_text)
+            
+            # Display results
+            if result["status"] == "completed":
+                st.success(f"""
+                ✅ **Transcription Complete!**
+                
+                - **Video ID**: {result['video_id']}
+                - **Segments Indexed**: {result['segments_indexed']}
+                - **Status**: Ready for search
+                
+                Go to the **Search** page to query this video's content.
+                """)
+                
+                # Show sample query
+                st.code(f'Query: "video_id:{video_id}" to see all segments from this video', language="text")
+                
+            elif result["status"] == "failed":
+                st.error(f"❌ **Processing Failed**\n\nError: {result.get('error', 'Unknown error')}")
+                
+            elif result["status"] == "timeout":
+                st.warning(f"⏱️ **Processing Timeout**\n\nThe transcription is taking longer than expected. Check pipeline_state.json for status.")
+                
+            else:
+                st.error(f"⚠️ **Unexpected Error**: {result.get('error', 'Unknown')}")
+
+
+# =============================================================================
+# FOOTER (sidebar; runs after either page branch unless st.stop() ended the run)
+# =============================================================================
+
+st.sidebar.markdown("---")
+st.sidebar.caption("""
+**Video Annotation Platform v1.0**
+
+- Search: Query indexed segments
+- Upload: Add new videos via URL
+- Azure Speech-to-Text powered
+""")
\ No newline at end of file

From b4e9a9a357289dc5fbd8a13c2b27db2a73c3ab80 Mon Sep 17 00:00:00 2001
From: Martin Nwadiugwu <YOUR_GITHUB_EMAIL@example.com>
Date: Mon, 16 Feb 2026 19:14:39 -0600
Subject: [PATCH 3/8] Update  upload feature + UI updates (multiple uploads)

---
 ui/ui_search.py | 211 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 192 insertions(+), 19 deletions(-)

diff --git a/ui/ui_search.py b/ui/ui_search.py
index 26a2ecd..0af5cc7 100644
--- a/ui/ui_search.py
+++ b/ui/ui_search.py
@@ -67,6 +67,8 @@
     st.session_state.batch_results = []
 if 'batch_processing' not in st.session_state:
     st.session_state.batch_processing = False
+if 'index_schema_cache' not in st.session_state:
+    st.session_state.index_schema_cache = None
 
 # Sidebar
 with st.sidebar:
@@ -79,6 +81,22 @@
                           index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE) if DEFAULT_MODE in ("keyword", "hybrid", "vector") else 1)
         top = st.slider("Results", 1, 50, DEFAULT_TOP)
         k = st.slider("Vector k", 5, 200, DEFAULT_K)
+    
+    # Debug section. NOTE(review): debug_check_index_schema() is defined further down the script than this sidebar block; because the script runs top to bottom, clicking this button presumably raises NameError - confirm, and move the helper definitions above the sidebar.
+    st.markdown("---")
+    if st.button("🔍 Debug Index Schema"):
+        with st.spinner("Fetching index schema..."):
+            schema_info = debug_check_index_schema()
+            if isinstance(schema_info, dict):
+                st.success(f"Index: {schema_info['index_name']}")
+                st.write(f"**Key Field:** `{schema_info['key_field']}`")
+                st.write("**Fields:**")
+                for field in schema_info['fields']:
+                    key_badge = "🔑 " if field['key'] else ""
+                    st.caption(f"{key_badge}`{field['name']}` ({field['type']})")
+                st.session_state.index_schema_cache = schema_info
+            else:
+                st.error(schema_info)
 
 
 # =============================================================================
@@ -99,6 +117,94 @@ def call_api(url: str, payload: dict, timeout: int = 60) -> dict:
     return r.json() if r.text else {}
 
 
+def sanitize_id(id_string: str) -> str:
+    """
+    Sanitize ID to be valid for Azure Search (ASCII letters, digits, hyphens, underscores only).
+    Document key rules: Cannot start with underscore, max 1024 chars.
+
+    Empty input becomes "unknown"; every other character is replaced by "_".
+    A leading "_" or "-" gets an "id" prefix, and results longer than 1024
+    chars are cut to 1000 chars plus "_" and a 16-hex-char MD5 of the full value.
+    """
+    if not id_string:
+        id_string = "unknown"
+
+    # re.ASCII keeps \w to [A-Za-z0-9_]; by default it also matches non-ASCII letters such as "é"
+    sanitized = re.sub(r'[^\w\-]', '_', str(id_string), flags=re.ASCII)
+
+    # Ensure it doesn't start with underscore (invalid for Azure Search keys) or dash (also problematic)
+    if sanitized.startswith(('_', '-')):
+        sanitized = 'id' + sanitized
+
+    # Limit length to 1024 characters (Azure Search limit)
+    if len(sanitized) > 1024:
+        # Hash the full value so distinct long IDs stay distinct after truncation
+        hash_suffix = hashlib.md5(sanitized.encode()).hexdigest()[:16]
+        sanitized = sanitized[:1000] + "_" + hash_suffix
+
+    return sanitized
+
+
+# =============================================================================
+# AZURE SEARCH SCHEMA FUNCTIONS
+# =============================================================================
+
+def debug_check_index_schema():
+    """Fetch the index definition from Azure Search and summarise its fields.
+
+    Returns a dict with "index_name", "key_field" and "fields" (one summary
+    dict per field) on success. On any failure (missing configuration,
+    non-200 response, or an exception) it returns an error string instead,
+    so callers tell the two apart with isinstance(result, dict).
+    """
+    if not (SEARCH_ENDPOINT and SEARCH_KEY and SEARCH_INDEX_NAME):
+        return "Search not configured - check SEARCH_ENDPOINT, SEARCH_KEY, and SEARCH_INDEX_NAME"
+
+    url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version=2024-07-01"
+    headers = {"api-key": SEARCH_KEY}
+    # Boolean field attributes copied into each summary; a missing one counts as False
+    flag_names = ("key", "searchable", "filterable", "sortable", "facetable", "retrievable")
+
+    try:
+        r = requests.get(url, headers=headers, timeout=30)
+        if r.status_code != 200:
+            return f"Index check failed: HTTP {r.status_code} - {r.text[:500]}"
+
+        schema = r.json()
+        key_field = None
+        fields_info = []
+        for field in schema.get("fields", []):
+            summary = {"name": field.get("name"), "type": field.get("type")}
+            for flag in flag_names:
+                summary[flag] = field.get(flag, False)
+            fields_info.append(summary)
+
+            # If several fields were flagged as key, the last one wins
+            if summary["key"]:
+                key_field = summary["name"]
+
+        return {
+            "index_name": schema.get("name"),
+            "key_field": key_field,
+            "fields": fields_info,
+        }
+    except Exception as e:
+        return f"Error checking index: {str(e)}"
+
+
+def get_index_schema():
+    """Return the cached schema summary, else fetch and cache it; RuntimeError if the fetch fails."""
+    cached = st.session_state.index_schema_cache
+    if cached:
+        return cached
+
+    fetched = debug_check_index_schema()
+    if not isinstance(fetched, dict):  # an error string, not a schema
+        raise RuntimeError(f"Cannot fetch index schema: {fetched}")
+    st.session_state.index_schema_cache = fetched
+    return fetched
+
+
 # =============================================================================
 # DIRECT AZURE SPEECH API FUNCTIONS (BYPASS AZURE FUNCTION)
 # =============================================================================
@@ -259,11 +365,23 @@ def get_embeddings(texts: list) -> list:
 def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]:
     """
     Index segments directly to Azure Cognitive Search.
-    Bypasses the EmbedAndIndex Azure Function.
+    
+    CRITICAL: Automatically detects the key field from index schema instead of assuming 'id'
     """
     if not SEARCH_ENDPOINT or not SEARCH_KEY:
         raise RuntimeError("Azure Search not configured")
     
+    # Get the key field name from the index schema
+    schema_info = get_index_schema()
+    key_field = schema_info.get("key_field")
+    
+    if not key_field:
+        available = [f.get("name") for f in schema_info.get("fields", [])]
+        raise RuntimeError(f"No key field found in index. Available fields: {available}")
+    
+    # Get list of available fields to ensure we only send existing fields
+    available_fields = {f.get("name") for f in schema_info.get("fields", [])}
+    
     # Generate embeddings for all segments
     texts = [seg.get("text", "") for seg in segments]
     try:
@@ -275,17 +393,44 @@ def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]:
     # Prepare search documents
     documents = []
     for i, (seg, embedding) in enumerate(zip(segments, embeddings)):
+        safe_video_id = sanitize_id(video_id)
+        doc_id = f"{safe_video_id}_{i}"
+        
+        # Build document dynamically based on what fields actually exist in the index
         doc = {
-            "id": f"{video_id}_{i}",
-            "video_id": video_id,
-            "segment_id": seg.get("segment_id", i),
-            "text": seg.get("text", ""),
-            "start_ms": seg.get("start_ms", 0),
-            "end_ms": seg.get("end_ms", 0),
-            "pred_labels": seg.get("pred_labels", [])
+            "@search.action": "upload"
+        }
+        
+        # Add the key field (whatever it's actually named in your index)
+        doc[key_field] = doc_id
+        
+        # Map of our field names to potential index field names
+        field_mappings = {
+            "video_id": safe_video_id,
+            "segment_id": str(seg.get("segment_id", i)),
+            "text": str(seg.get("text", "")),
+            "start_ms": int(seg.get("start_ms", 0)),
+            "end_ms": int(seg.get("end_ms", 0)),
+            "pred_labels": seg.get("pred_labels", []) if seg.get("pred_labels") else []
         }
-        if embedding:
-            doc["embedding"] = embedding
+        
+        # Only add fields that exist in the index schema
+        for field_name, value in field_mappings.items():
+            if field_name in available_fields:
+                doc[field_name] = value
+        
+        # Handle embedding field - check for common naming variations
+        embedding_field = None
+        for possible_name in ["embedding", "embeddings", "vector", "vectors"]:
+            if possible_name in available_fields:
+                embedding_field = possible_name
+                break
+        
+        if embedding and isinstance(embedding, list) and len(embedding) > 0 and embedding_field:
+            try:
+                doc[embedding_field] = [float(x) for x in embedding]
+            except (ValueError, TypeError):
+                st.warning(f"Skipping embedding for segment {i} due to conversion error")
         
         documents.append(doc)
     
@@ -303,8 +448,36 @@ def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]:
     
     try:
         r = requests.post(url, headers=headers, json=payload, timeout=60)
-        r.raise_for_status()
-        return {"indexed": len(documents), "video_id": video_id}
+        
+        if r.status_code >= 400:
+            error_detail = ""
+            try:
+                error_json = r.json()
+                error_detail = json.dumps(error_json, indent=2)
+            except:
+                error_detail = r.text
+            
+            raise RuntimeError(f"Indexing failed: HTTP {r.status_code}\nDetails: {error_detail}")
+        
+        result = r.json()
+        
+        # Check for partial failures (207 Multi-Status)
+        if r.status_code == 207:
+            failed_docs = [item for item in result.get("value", []) if not item.get("status", False)]
+            if failed_docs:
+                st.warning(f"Partial indexing failure: {len(failed_docs)} documents failed")
+                for fail in failed_docs[:3]:
+                    st.error(f"Doc {fail.get('key', 'unknown')}: {fail.get('errorMessage', 'Unknown error')}")
+        
+        return {
+            "indexed": len(documents), 
+            "video_id": video_id, 
+            "key_field_used": key_field,
+            "api_response": result
+        }
+        
+    except requests.exceptions.HTTPError as e:
+        raise RuntimeError(f"HTTP Error: {str(e)}")
     except Exception as e:
         raise RuntimeError(f"Indexing failed: {str(e)}")
 
@@ -471,7 +644,7 @@ def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> str:
         
         # ====================================================================
         # CRITICAL FIX: Service SAS string-to-sign format
-        # Reference: https://docs.microsoft.com/en-us/rest/api/storageservices/create-service-sas
+        # Reference: https://docs.microsoft.com/en-us/rest/api/storageservices/create-service-sas  
         # Format for Blob service SAS:
         # StringToSign = signedPermissions + "\n" +
         #                signedStart + "\n" +
@@ -894,9 +1067,9 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
         # Index to search
         try:
             index_result = index_segments_direct(video_id, segments)
-            result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents"
+            result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents (key: {index_result.get('key_field_used', 'unknown')})"
         except Exception as e:
-            result["index_status"] = f"Indexing skipped: {str(e)}"
+            result["index_status"] = f"Indexing failed: {str(e)}"
         
         result["status"] = "success"
         
@@ -1003,7 +1176,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
     # DIRECT URL
     # -------------------------------------------------------------------------
     elif source_type == "Direct URL":
-        url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/    ...")
+        url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/...")
         
         if url_input.strip():
             media_url = url_input.strip()
@@ -1017,7 +1190,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
         # Use session state to persist the URL
         yt_url = st.text_input(
             "YouTube URL", 
-            placeholder="https://youtube.com/watch?v=  ...",
+            placeholder="https://youtube.com/watch?v=...",
             value=st.session_state.yt_url_value,
             key="yt_url_input"
         )
@@ -1456,9 +1629,9 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                 # Index to search
                 try:
                     index_result = index_segments_direct(video_id, segments)
-                    index_msg = f"Indexed: {index_result.get('indexed', 0)} documents"
+                    index_msg = f"Indexed: {index_result.get('indexed', 0)} documents (key field: {index_result.get('key_field_used', 'unknown')})"
                 except Exception as e:
-                    index_msg = f"Indexing skipped: {e}"
+                    index_msg = f"Indexing failed: {str(e)}"
                 
                 progress_bar.progress(100)
                 status.text("Complete!")

From 26fea22370cf80506dfffd9a78639e62b426c4d5 Mon Sep 17 00:00:00 2001
From: Martin Nwadiugwu <YOUR_GITHUB_EMAIL@example.com>
Date: Mon, 23 Feb 2026 19:42:13 -0600
Subject: [PATCH 4/8] Update  UI to store videos and display source URL

---
 ui/ui_search.py | 1396 +++++++++++++++++++++++++++--------------------
 1 file changed, 807 insertions(+), 589 deletions(-)

diff --git a/ui/ui_search.py b/ui/ui_search.py
index 0af5cc7..15ef358 100644
--- a/ui/ui_search.py
+++ b/ui/ui_search.py
@@ -1,8 +1,12 @@
 """
 ui_search.py - Streamlit Web Interface for Video Segment Search & Upload
 
-This version calls Azure Speech API DIRECTLY, bypassing the Azure Function
-that has the wrong API version hardcoded.
+Features:
+- Direct Azure Speech API integration (bypasses Azure Function)
+- URL tracking for all processed videos (source_url, source_type, processed_at)
+- Handles existing videos without URL data gracefully
+- Batch processing with CSV upload
+- Video management interface with filtering and deletion
 """
 
 import os
@@ -27,130 +31,178 @@
 # Load environment variables
 load_dotenv()
 
-# Azure Function URLs (only Search uses these now)
+# =============================================================================
+# CONFIGURATION
+# =============================================================================
+
+# Azure Function URLs
 SEARCH_FN_URL = os.environ.get("SEARCH_FN_URL", "")
 
-# Azure Speech Service Configuration (DIRECT)
+# Azure Speech Service (DIRECT)
 SPEECH_KEY = os.environ.get("SPEECH_KEY")
 SPEECH_REGION = os.environ.get("SPEECH_REGION", "eastus")
 SPEECH_API_VERSION = os.environ.get("SPEECH_API_VERSION", "2024-11-15")
 
-# Azure OpenAI & Search for indexing
+# Azure OpenAI
 AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
 AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_KEY")
 AZURE_OPENAI_DEPLOYMENT = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "text-embedding-3-small")
 
+# Azure Cognitive Search
 SEARCH_ENDPOINT = os.environ.get("SEARCH_ENDPOINT")
 SEARCH_KEY = os.environ.get("SEARCH_KEY")
 SEARCH_INDEX_NAME = os.environ.get("SEARCH_INDEX_NAME", "segments")
 
-# Azure Storage Configuration
+# Azure Storage
 AZURE_STORAGE_ACCOUNT = os.environ.get("AZURE_STORAGE_ACCOUNT", "storagevideoannotator")
 AZURE_STORAGE_KEY = os.environ.get("AZURE_STORAGE_KEY", "")
 INPUT_CONTAINER = os.environ.get("INPUT_CONTAINER", "speech-input")
 SEGMENTS_CONTAINER = os.environ.get("SEGMENTS_CONTAINER", "segments")
 
-# Default settings
+# Settings
 DEFAULT_MODE = os.environ.get("DEFAULT_MODE", "hybrid")
 DEFAULT_TOP = int(os.environ.get("DEFAULT_TOP", "10"))
 DEFAULT_K = int(os.environ.get("DEFAULT_K", "40"))
 POLL_SECONDS = int(os.environ.get("POLL_SECONDS", "15"))
-BATCH_MAX_WORKERS = int(os.environ.get("BATCH_MAX_WORKERS", "3"))  # Concurrent processing limit
+
+# =============================================================================
+# STREAMLIT SETUP
+# =============================================================================
 
 st.set_page_config(page_title="Video Annotation Platform", layout="wide")
 st.title(" Video Annotation Platform")
 
 # Initialize session state
-if 'yt_url_value' not in st.session_state:
-    st.session_state.yt_url_value = ""
-if 'batch_results' not in st.session_state:
-    st.session_state.batch_results = []
-if 'batch_processing' not in st.session_state:
-    st.session_state.batch_processing = False
-if 'index_schema_cache' not in st.session_state:
-    st.session_state.index_schema_cache = None
-
-# Sidebar
+session_state_defaults = {
+    'yt_url_value': "",
+    'batch_results': [],
+    'batch_processing': False,
+    'index_schema_cache': None,
+    'stored_videos_cache': None,
+    'url_fields_status': None,
+    'debug_info': {}
+}
+
+for key, value in session_state_defaults.items():
+    if key not in st.session_state:
+        st.session_state[key] = value
+
+# =============================================================================
+# SIDEBAR NAVIGATION
+# =============================================================================
+
 with st.sidebar:
     st.header("Navigation")
-    page = st.radio("Select Page", ["🔎 Search Segments", "⬆️ Upload & Transcribe"])
-    
+    page = st.radio("Select Page", [
+        "🔎 Search Segments", 
+        "⬆️ Upload & Transcribe",
+        "📚 Manage Videos",
+        "⚙️ System Diagnostics"
+    ])
+    
+    # Settings for search page
     if page == "🔎 Search Segments":
-        st.header("Settings")
+        st.header("Search Settings")
         mode = st.selectbox("Search Mode", ["keyword", "hybrid", "vector"], 
                           index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE) if DEFAULT_MODE in ("keyword", "hybrid", "vector") else 1)
         top = st.slider("Results", 1, 50, DEFAULT_TOP)
         k = st.slider("Vector k", 5, 200, DEFAULT_K)
     
-    # Debug section
+    # Quick actions
     st.markdown("---")
-    if st.button("🔍 Debug Index Schema"):
-        with st.spinner("Fetching index schema..."):
-            schema_info = debug_check_index_schema()
-            if isinstance(schema_info, dict):
-                st.success(f"Index: {schema_info['index_name']}")
-                st.write(f"**Key Field:** `{schema_info['key_field']}`")
-                st.write("**Fields:**")
-                for field in schema_info['fields']:
-                    key_badge = "🔑 " if field['key'] else ""
-                    st.caption(f"{key_badge}`{field['name']}` ({field['type']})")
-                st.session_state.index_schema_cache = schema_info
-            else:
-                st.error(schema_info)
+    if st.button("🔄 Refresh Schema Cache"):
+        st.session_state.index_schema_cache = None
+        st.session_state.url_fields_status = None
+        st.success("Cache cleared! Navigate to System Diagnostics to refresh.")
+    
+    st.markdown("---")
+    st.caption("Video Annotation Platform v2.1")
+    st.caption("With URL Tracking")
 
 
 # =============================================================================
-# HELPER FUNCTIONS
+# UTILITY FUNCTIONS
 # =============================================================================
 
 def ms_to_ts(ms: int) -> str:
+    """Convert milliseconds to timestamp string."""
     s = max(0, int(ms // 1000))
     m, s = divmod(s, 60)
     h, m = divmod(m, 60)
     return f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}"
 
 
-def call_api(url: str, payload: dict, timeout: int = 60) -> dict:
-    r = requests.post(url, json=payload, timeout=timeout, headers={"Content-Type": "application/json"})
-    if r.status_code >= 400:
-        raise RuntimeError(f"HTTP {r.status_code}: {r.text}")
-    return r.json() if r.text else {}
-
-
 def sanitize_id(id_string: str) -> str:
-    """
-    Sanitize ID to be valid for Azure Search (alphanumeric, hyphens, underscores only).
-    Document key rules: Cannot start with underscore, max 1024 chars.
-    """
+    """Sanitize ID for Azure Search (alphanumeric, hyphens, underscores only)."""
     if not id_string:
         id_string = "unknown"
     
-    # Replace invalid characters with underscore
     sanitized = re.sub(r'[^\w\-]', '_', str(id_string))
     
-    # Ensure it doesn't start with underscore (invalid for Azure Search keys)
-    if sanitized.startswith('_'):
+    if sanitized.startswith('_') or sanitized.startswith('-'):
         sanitized = 'id' + sanitized
     
-    # Ensure it doesn't start with dash (also problematic)
-    if sanitized.startswith('-'):
-        sanitized = 'id' + sanitized
-    
-    # Limit length to 1024 characters (Azure Search limit)
     if len(sanitized) > 1024:
-        # Use hash to ensure uniqueness while truncating
         hash_suffix = hashlib.md5(sanitized.encode()).hexdigest()[:16]
         sanitized = sanitized[:1000] + "_" + hash_suffix
     
     return sanitized
 
 
+def detect_url_type(url: str) -> str:
+    """Classify a URL as "youtube", "direct" (media file or cloud share) or "unknown".
+
+    Matching is case-insensitive. The YouTube check is anchored to the host so
+    look-alike domains, or a YouTube link embedded in another site's query, do
+    not match; the extension check ignores any ?query/#fragment (e.g. a SAS
+    token appended to ``video.mp4``).
+    """
+    if not url:
+        return "unknown"
+    
+    url_lower = str(url).lower().strip()
+    
+    youtube_host = r'^(?:https?://)?(?:[\w-]+\.)*(?:youtube\.com|youtu\.be)(?:[/:?#]|$)'
+    if re.match(youtube_host, url_lower):
+        return "youtube"
+    
+    media_extensions = ('.mp4', '.m4a', '.mp3', '.wav', '.mov', '.avi', '.mkv', '.webm')
+    path_only = re.split(r'[?#]', url_lower, maxsplit=1)[0]
+    if path_only.endswith(media_extensions):
+        return "direct"
+    
+    cloud_patterns = ['box.com', 'drive.google.com', 'dropbox.com', 'onedrive']
+    if any(pattern in url_lower for pattern in cloud_patterns):
+        return "direct"
+    
+    return "unknown"
+
+
+def check_yt_dlp() -> bool:
+    """Return True when a ``yt-dlp`` executable is found on PATH."""
+    # shutil.which replaces spawning the external `which` binary (absent on
+    # Windows) and the bare `except:` that also swallowed KeyboardInterrupt.
+    import shutil
+    
+    return shutil.which("yt-dlp") is not None
+
+
+def call_api(url: str, payload: dict, timeout: int = 30) -> dict:
+    """POST *payload* as JSON to *url*; return the decoded body ({} if empty)."""
+    try:
+        r = requests.post(url, json=payload, timeout=timeout)
+        r.raise_for_status()
+        return r.json() if r.text else {}
+    except (requests.exceptions.RequestException, ValueError) as e:
+        raise RuntimeError(f"API call failed: {str(e)}") from e
+
+
 # =============================================================================
 # AZURE SEARCH SCHEMA FUNCTIONS
 # =============================================================================
 
 def debug_check_index_schema():
-    """Check if your index exists and verify the key field"""
+    """Check index schema and verify URL tracking fields."""
     if not SEARCH_ENDPOINT or not SEARCH_KEY or not SEARCH_INDEX_NAME:
         return "Search not configured - check SEARCH_ENDPOINT, SEARCH_KEY, and SEARCH_INDEX_NAME"
     
@@ -164,28 +216,35 @@ def debug_check_index_schema():
             key_field = None
             fields_info = []
             
+            url_fields = ['source_url', 'source_type', 'processed_at']
+            found_url_fields = []
+            
             for field in schema.get("fields", []):
                 field_info = {
                     "name": field.get("name"),
                     "type": field.get("type"),
                     "key": field.get("key", False),
-                    "searchable": field.get("searchable", False),
+                    "retrievable": field.get("retrievable", False),
                     "filterable": field.get("filterable", False),
                     "sortable": field.get("sortable", False),
-                    "facetable": field.get("facetable", False),
-                    "retrievable": field.get("retrievable", False)
+                    "facetable": field.get("facetable", False)
                 }
                 fields_info.append(field_info)
                 
                 if field.get("key", False):
                     key_field = field.get("name")
+                
+                if field.get("name") in url_fields:
+                    found_url_fields.append(field.get("name"))
             
-            result = {
+            return {
                 "index_name": schema.get("name"),
                 "key_field": key_field,
-                "fields": fields_info
+                "fields": fields_info,
+                "found_url_fields": found_url_fields,
+                "missing_url_fields": list(set(url_fields) - set(found_url_fields)),
+                "has_all_url_fields": len(found_url_fields) == len(url_fields)
             }
-            return result
         else:
             return f"Index check failed: HTTP {r.status_code} - {r.text[:500]}"
     except Exception as e:
@@ -193,7 +252,7 @@ def debug_check_index_schema():
 
 
 def get_index_schema():
-    """Get cached schema or fetch new one"""
+    """Get cached schema or fetch new one."""
     if st.session_state.index_schema_cache:
         return st.session_state.index_schema_cache
     
@@ -205,17 +264,41 @@ def get_index_schema():
         raise RuntimeError(f"Cannot fetch index schema: {schema_info}")
 
 
+def check_url_fields_status():
+    """Return cached URL-field availability for the index; all-missing defaults on failure."""
+    if st.session_state.url_fields_status:
+        return st.session_state.url_fields_status
+    
+    try:
+        schema = get_index_schema()
+        if isinstance(schema, dict):
+            result = {
+                'fields_exist': schema.get('has_all_url_fields', False),
+                'found_fields': schema.get('found_url_fields', []),
+                'missing_fields': schema.get('missing_url_fields', []),
+                'key_field': schema.get('key_field')
+            }
+            st.session_state.url_fields_status = result
+            return result
+    except Exception:
+        pass  # best-effort probe: schema unavailable -> fall through to the defaults below
+    
+    return {
+        'fields_exist': False,
+        'found_fields': [],
+        'missing_fields': ['source_url', 'source_type', 'processed_at'],
+        'key_field': None
+    }
+
+
 # =============================================================================
-# DIRECT AZURE SPEECH API FUNCTIONS (BYPASS AZURE FUNCTION)
+# AZURE SPEECH API FUNCTIONS
 # =============================================================================
 
 def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any]:
-    """
-    Submit transcription directly to Azure Speech API.
-    Bypasses the Azure Function with wrong API version.
-    """
+    """Submit transcription directly to Azure Speech API."""
     if not SPEECH_KEY:
-        raise RuntimeError("SPEECH_KEY not configured in environment")
+        raise RuntimeError("SPEECH_KEY not configured")
     
     endpoint = f"https://{SPEECH_REGION}.api.cognitive.microsoft.com/speechtotext/transcriptions:submit?api-version={SPEECH_API_VERSION}"
     
@@ -241,7 +324,6 @@ def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any]
         r = requests.post(endpoint, headers=headers, json=payload, timeout=60)
         r.raise_for_status()
         
-        # Get operation URL from Location header (this is the operation status URL)
         operation_url = r.headers.get("Location")
         if not operation_url:
             result = r.json()
@@ -253,32 +335,21 @@ def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any]
         return {"operation_url": operation_url, "video_id": video_id}
         
     except requests.exceptions.HTTPError as e:
+        error_msg = f"Speech API error {r.status_code}: {r.text}"
         if r.status_code == 401:
-            raise RuntimeError("Azure Speech API authentication failed. Check SPEECH_KEY.")
-        elif r.status_code == 400:
-            raise RuntimeError(f"Bad request: {r.text}")
-        else:
-            raise RuntimeError(f"Speech API error {r.status_code}: {r.text}")
+            error_msg = "Azure Speech API authentication failed. Check SPEECH_KEY."
+        raise RuntimeError(error_msg)
 
 
 def poll_transcription_operation(operation_url: str) -> Dict[str, Any]:
-    """Poll transcription operation status directly from Azure Speech API."""
+    """Poll transcription operation status."""
     if not SPEECH_KEY:
         raise RuntimeError("SPEECH_KEY not configured")
     
-    headers = {
-        "Ocp-Apim-Subscription-Key": SPEECH_KEY
-    }
+    headers = {"Ocp-Apim-Subscription-Key": SPEECH_KEY}
     
     try:
-        # CRITICAL FIX: Azure returns operation URL with :submit but we need to poll
-        # using the /transcriptions/{id} endpoint, not /transcriptions:submit/{id}
-        # The operation_url looks like: .../transcriptions:submit/{id}?api-version=...
-        # We need: .../transcriptions/{id}?api-version=...
-        
         poll_url = operation_url.replace("/transcriptions:submit/", "/transcriptions/")
-        
-        # Debug info
         st.session_state['debug_poll_url'] = poll_url
         
         r = requests.get(poll_url, headers=headers, timeout=30)
@@ -290,38 +361,29 @@ def poll_transcription_operation(operation_url: str) -> Dict[str, Any]:
 
 
 def get_transcription_from_result(result_data: Dict) -> Dict[str, Any]:
-    """Get the actual transcription JSON from the result files."""
+    """Get transcription JSON from result files."""
     if not SPEECH_KEY:
         raise RuntimeError("SPEECH_KEY not configured")
     
-    headers = {
-        "Ocp-Apim-Subscription-Key": SPEECH_KEY
-    }
+    headers = {"Ocp-Apim-Subscription-Key": SPEECH_KEY}
     
     try:
-        # Get the result files URL from the completed operation
         links = result_data.get("links", {})
         files_url = links.get("files")
         
         if not files_url:
-            # Try to construct from result data or get content directly
             if "combinedRecognizedPhrases" in result_data:
-                # Result might be embedded directly
                 return result_data
-            
             raise RuntimeError("No files URL in result")
         
-        # Get list of files
         r = requests.get(files_url, headers=headers, timeout=30)
         r.raise_for_status()
         files_data = r.json()
         
-        # Find the transcription JSON file
         for file in files_data.get("values", []):
             if file.get("kind") == "Transcription":
                 content_url = file.get("links", {}).get("contentUrl")
                 if content_url:
-                    # Download the actual transcription content
                     content_r = requests.get(content_url, timeout=60)
                     content_r.raise_for_status()
                     return content_r.json()
@@ -333,11 +395,11 @@ def get_transcription_from_result(result_data: Dict) -> Dict[str, Any]:
 
 
 # =============================================================================
-# DIRECT EMBEDDING AND INDEXING (BYPASS AZURE FUNCTION)
+# EMBEDDING AND INDEXING WITH URL TRACKING
 # =============================================================================
 
 def get_embeddings(texts: list) -> list:
-    """Get embeddings directly from Azure OpenAI."""
+    """Get embeddings from Azure OpenAI."""
     if not AZURE_OPENAI_ENDPOINT or not AZURE_OPENAI_KEY:
         raise RuntimeError("Azure OpenAI not configured")
     
@@ -362,27 +424,28 @@ def get_embeddings(texts: list) -> list:
         raise RuntimeError(f"Embedding failed: {str(e)}")
 
 
-def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]:
+def index_segments_direct(video_id: str, segments: list, source_url: str = None, source_type: str = None) -> Dict[str, Any]:
     """
-    Index segments directly to Azure Cognitive Search.
-    
-    CRITICAL: Automatically detects the key field from index schema instead of assuming 'id'
+    Index segments to Azure Cognitive Search with URL tracking.
     """
     if not SEARCH_ENDPOINT or not SEARCH_KEY:
         raise RuntimeError("Azure Search not configured")
     
-    # Get the key field name from the index schema
     schema_info = get_index_schema()
     key_field = schema_info.get("key_field")
+    available_fields = {f.get("name") for f in schema_info.get("fields", [])}
     
     if not key_field:
-        available = [f.get("name") for f in schema_info.get("fields", [])]
-        raise RuntimeError(f"No key field found in index. Available fields: {available}")
+        raise RuntimeError("No key field found in index")
     
-    # Get list of available fields to ensure we only send existing fields
-    available_fields = {f.get("name") for f in schema_info.get("fields", [])}
+    # Check URL field availability
+    url_fields_available = {
+        'source_url': 'source_url' in available_fields,
+        'source_type': 'source_type' in available_fields,
+        'processed_at': 'processed_at' in available_fields
+    }
     
-    # Generate embeddings for all segments
+    # Generate embeddings
     texts = [seg.get("text", "") for seg in segments]
     try:
         embeddings = get_embeddings(texts)
@@ -390,21 +453,17 @@ def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]:
         st.warning(f"Embedding failed, indexing without vectors: {e}")
         embeddings = [None] * len(segments)
     
-    # Prepare search documents
+    # Prepare documents
     documents = []
+    processed_timestamp = datetime.utcnow().isoformat() + "Z"
+    
     for i, (seg, embedding) in enumerate(zip(segments, embeddings)):
         safe_video_id = sanitize_id(video_id)
         doc_id = f"{safe_video_id}_{i}"
         
-        # Build document dynamically based on what fields actually exist in the index
-        doc = {
-            "@search.action": "upload"
-        }
+        doc = {"@search.action": "upload", key_field: doc_id}
         
-        # Add the key field (whatever it's actually named in your index)
-        doc[key_field] = doc_id
-        
-        # Map of our field names to potential index field names
+        # Core fields
         field_mappings = {
             "video_id": safe_video_id,
             "segment_id": str(seg.get("segment_id", i)),
@@ -414,115 +473,216 @@ def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]:
             "pred_labels": seg.get("pred_labels", []) if seg.get("pred_labels") else []
         }
         
-        # Only add fields that exist in the index schema
+        # URL tracking fields
+        if url_fields_available['source_url']:
+            field_mappings["source_url"] = str(source_url) if source_url else ""
+        if url_fields_available['source_type']:
+            field_mappings["source_type"] = str(source_type) if source_type else "unknown"
+        if url_fields_available['processed_at']:
+            field_mappings["processed_at"] = processed_timestamp
+        
+        # Only add existing fields
         for field_name, value in field_mappings.items():
             if field_name in available_fields:
                 doc[field_name] = value
         
-        # Handle embedding field - check for common naming variations
-        embedding_field = None
-        for possible_name in ["embedding", "embeddings", "vector", "vectors"]:
-            if possible_name in available_fields:
-                embedding_field = possible_name
-                break
-        
-        if embedding and isinstance(embedding, list) and len(embedding) > 0 and embedding_field:
+        # Handle embedding
+        embedding_field = next((f for f in ["embedding", "embeddings", "vector", "vectors"] if f in available_fields), None)
+        if embedding and embedding_field:
             try:
                 doc[embedding_field] = [float(x) for x in embedding]
             except (ValueError, TypeError):
-                st.warning(f"Skipping embedding for segment {i} due to conversion error")
+                pass
         
         documents.append(doc)
     
-    # Upload to Azure Search
+    # Upload to search index
     url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/index?api-version=2024-07-01"
-    
-    headers = {
-        "api-key": SEARCH_KEY,
-        "Content-Type": "application/json"
-    }
-    
-    payload = {
-        "value": documents
-    }
+    headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"}
+    payload = {"value": documents}
     
     try:
         r = requests.post(url, headers=headers, json=payload, timeout=60)
         
         if r.status_code >= 400:
-            error_detail = ""
+            error_detail = r.text
             try:
-                error_json = r.json()
-                error_detail = json.dumps(error_json, indent=2)
+                error_detail = json.dumps(r.json(), indent=2)
             except:
-                error_detail = r.text
-            
-            raise RuntimeError(f"Indexing failed: HTTP {r.status_code}\nDetails: {error_detail}")
+                pass
+            raise RuntimeError(f"Indexing failed: HTTP {r.status_code}\n{error_detail}")
         
         result = r.json()
         
-        # Check for partial failures (207 Multi-Status)
+        # Check for partial failures
         if r.status_code == 207:
             failed_docs = [item for item in result.get("value", []) if not item.get("status", False)]
             if failed_docs:
                 st.warning(f"Partial indexing failure: {len(failed_docs)} documents failed")
-                for fail in failed_docs[:3]:
-                    st.error(f"Doc {fail.get('key', 'unknown')}: {fail.get('errorMessage', 'Unknown error')}")
         
         return {
-            "indexed": len(documents), 
-            "video_id": video_id, 
+            "indexed": len(documents),
+            "video_id": video_id,
             "key_field_used": key_field,
-            "api_response": result
+            "source_url_stored": bool(source_url and url_fields_available['source_url']),
+            "source_type_stored": bool(source_type and url_fields_available['source_type']),
+            "url_fields_available": url_fields_available
         }
         
-    except requests.exceptions.HTTPError as e:
-        raise RuntimeError(f"HTTP Error: {str(e)}")
     except Exception as e:
         raise RuntimeError(f"Indexing failed: {str(e)}")
 
 
 def process_transcription_to_segments(transcription_data: Dict, video_id: str) -> list:
-    """
-    Convert Azure Speech transcription JSON to segments format.
-    """
+    """Convert Azure Speech transcription to segments."""
     segments = []
     
-    # Parse phrases/segments from transcription
-    phrases = transcription_data.get("recognizedPhrases", [])
-    
-    for i, phrase in enumerate(phrases):
-        # Extract timing
-        offset = phrase.get("offsetInTicks", 0) // 10000  # Convert to ms
+    for i, phrase in enumerate(transcription_data.get("recognizedPhrases", [])):
+        offset = phrase.get("offsetInTicks", 0) // 10000
         duration = phrase.get("durationInTicks", 0) // 10000
         
-        # Extract text
         nbest = phrase.get("nBest", [])
-        if nbest:
-            text = nbest[0].get("display", "")
-        else:
-            text = ""
+        text = nbest[0].get("display", "") if nbest else ""
         
-        # Create segment
-        segment = {
+        segments.append({
             "segment_id": i,
             "video_id": video_id,
             "text": text,
             "start_ms": offset,
             "end_ms": offset + duration,
-            "pred_labels": []  # Could add label prediction here
-        }
-        
-        segments.append(segment)
+            "pred_labels": []
+        })
     
     return segments
 
 
 # =============================================================================
-# STORAGE FUNCTIONS - FIXED UPLOAD
+# VIDEO RETRIEVAL AND MANAGEMENT
+# =============================================================================
+
+def get_stored_videos(video_id: str = None, source_type: str = None, 
+                     include_missing: bool = True, limit: int = 1000) -> List[Dict]:
+    """
+    Retrieve one record per video_id from the search index, with URL data.
+    NOTE(review): include_missing is accepted but never consulted here.
+    """
+    if not SEARCH_ENDPOINT or not SEARCH_KEY:
+        return []
+    
+    url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/search?api-version=2024-07-01"
+    headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"}
+    
+    # Build filter (OData string literals escape a single quote by doubling it)
+    filters = []
+    if video_id:
+        filters.append("video_id eq '{}'".format(str(video_id).replace("'", "''")))
+    if source_type:
+        filters.append("source_type eq '{}'".format(str(source_type).replace("'", "''")))
+    filter_query = " and ".join(filters) if filters else None
+    
+    try:
+        # Schema lookup can raise; inside the try it is reported like any other failure
+        schema = get_index_schema()
+        available_fields = {f['name'] for f in schema.get('fields', [])}
+        
+        # Build select
+        select_fields = ["video_id"]
+        for field in ["source_url", "source_type", "processed_at"]:
+            if field in available_fields:
+                select_fields.append(field)
+        
+        payload = {
+            "search": "*",
+            "select": ",".join(select_fields),
+            "top": limit
+        }
+        
+        if "processed_at" in available_fields:
+            payload["orderby"] = "processed_at desc"
+        if filter_query:
+            payload["filter"] = filter_query
+        
+        r = requests.post(url, headers=headers, json=payload, timeout=30)
+        r.raise_for_status()
+        docs = r.json().get("value", [])
+        
+        # Deduplicate and normalize
+        seen = set()
+        unique_docs = []
+        for doc in docs:
+            vid = doc.get('video_id')
+            if vid and vid not in seen:
+                seen.add(vid)
+                # Normalize missing values
+                doc['source_type'] = doc.get('source_type') or 'unknown'
+                doc['source_url'] = doc.get('source_url') or ''
+                doc['processed_at'] = doc.get('processed_at') or 'unknown'
+                unique_docs.append(doc)
+        
+        return unique_docs
+        
+    except Exception as e:
+        st.error(f"Failed to retrieve videos: {e}")
+        return []
+
+
+def delete_video_by_id(video_id: str) -> bool:
+    """Delete the segments indexed for video_id.
+
+    Only the first 1000 matching documents are handled per call. Returns True
+    when the delete request succeeded at HTTP level, False when nothing matched,
+    search is not configured, or a request failed (shown via st.error).
+    """
+    if not SEARCH_ENDPOINT or not SEARCH_KEY:
+        return False
+    
+    # Find all documents
+    search_url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/search?api-version=2024-07-01"
+    headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"}
+    
+    try:
+        # The key field must be in "select", otherwise the hits carry no
+        # document key and nothing can be deleted.
+        schema = get_index_schema()
+        key_field = schema.get('key_field') or 'id'
+        payload = {
+            "search": "*",
+            # OData string literals escape a single quote by doubling it
+            "filter": "video_id eq '{}'".format(str(video_id).replace("'", "''")),
+            "select": key_field,
+            "top": 1000
+        }
+        
+        r = requests.post(search_url, headers=headers, json=payload, timeout=30)
+        r.raise_for_status()
+        docs = r.json().get("value", [])
+        
+        # Delete documents
+        delete_docs = [
+            {"@search.action": "delete", key_field: doc[key_field]}
+            for doc in docs
+            if doc.get(key_field)
+        ]
+        if not delete_docs:
+            return False
+        
+        delete_url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/index?api-version=2024-07-01"
+        r = requests.post(delete_url, headers=headers, json={"value": delete_docs}, timeout=60)
+        r.raise_for_status()
+        
+        return True
+        
+    except Exception as e:
+        st.error(f"Delete failed: {e}")
+        return False
+
+
+# =============================================================================
+# AZURE STORAGE FUNCTIONS
 # =============================================================================
 
 def generate_video_id(filename: str) -> str:
+    """Generate safe video ID from filename."""
     clean_name = Path(filename).stem
     clean_name = re.sub(r'[^\w\s-]', '', clean_name)
     clean_name = re.sub(r'[-\s]+', '_', clean_name)
@@ -531,91 +691,90 @@ def generate_video_id(filename: str) -> str:
 
 
 def test_sas_url(sas_url: str) -> Tuple[bool, str]:
-    """Test if SAS URL is accessible before sending to Speech API."""
+    """Send a HEAD request to the SAS URL; return (True only on HTTP 200, status or error text)."""
     try:
         r = requests.head(sas_url, timeout=10, allow_redirects=True)
-        if r.status_code == 200:
-            return True, "SAS URL is accessible"
-        else:
-            return False, f"SAS URL returned HTTP {r.status_code}"
+        return (r.status_code == 200, f"HTTP {r.status_code}")
     except Exception as e:
-        return False, f"SAS URL test failed: {str(e)}"
+        return (False, str(e))
+
+
+def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> Optional[str]:
+    """Build a read-only, HTTPS-only service SAS query string for a blob in INPUT_CONTAINER (None if no key or on error)."""
+    if not AZURE_STORAGE_KEY:
+        return None
+    
+    try:
+        expiry = datetime.now(timezone.utc) + timedelta(hours=expiry_hours)
+        expiry_str = expiry.strftime('%Y-%m-%dT%H:%M:%SZ')
+        # The storage key is base64 text; decode it to raw bytes for use as the HMAC key
+        account_key = base64.b64decode(AZURE_STORAGE_KEY)
+        canonicalized_resource = f"/blob/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}"
+        # Newline-separated fields: permissions, start, expiry, resource, identifier, IP, protocol, version, resource type, then 7 empty optional fields
+        string_to_sign = (
+            f"r\n\n{expiry_str}\n{canonicalized_resource}\n\n\nhttps\n2020-12-06\nb\n\n\n\n\n\n\n"
+        )
+        # Signature = base64(HMAC-SHA256(account key, string-to-sign))
+        signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest()
+        signature = base64.b64encode(signed_hmac).decode('utf-8')
+        # Query parameters mirror the signed fields (sv, sr, sp, se, spr) plus the signature
+        sas_params = {
+            'sv': '2020-12-06',
+            'sr': 'b',
+            'sp': 'r',
+            'se': expiry_str,
+            'spr': 'https',
+            'sig': signature
+        }
+        # Percent-encode every value (safe='') so base64 '+', '/' and '=' survive inside a URL
+        return '&'.join([f"{k}={urllib.parse.quote(v, safe='')}" for k, v in sas_params.items()])
+        
+    except Exception as e:
+        st.error(f"SAS generation error: {e}")
+        return None
 
 
 def upload_to_azure_blob_fixed(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]:
-    """
-    FIXED upload to Azure Blob using REST API.
-    Corrected string-to-sign format.
-    """
+    """Upload to Azure Blob using REST API."""
     if not AZURE_STORAGE_KEY:
         return None, "Azure Storage key not configured"
     
     try:
-        # Upload URL
         url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}"
         
-        # Create date header in the exact format Azure expects
         date_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
         content_length = len(file_bytes)
         
-        # ====================================================================
-        # CRITICAL FIX: Correct string-to-sign format for Azure Blob Storage
-        # Format: VERB\nContent-Encoding\nContent-Language\nContent-Length\n
-        #         Content-MD5\nContent-Type\nDate\nIf-Modified-Since\nIf-Match\n
-        #         If-None-Match\nIf-Unmodified-Since\nRange\n
-        #         CanonicalizedHeaders\nCanonicalizedResource
-        # ====================================================================
         string_to_sign = (
-            f"PUT\n"                       # HTTP method
-            f"\n"                          # Content-Encoding (empty)
-            f"\n"                          # Content-Language (empty)
-            f"{content_length}\n"          # Content-Length (REQUIRED - must be exact)
-            f"\n"                          # Content-MD5 (empty)
-            f"application/octet-stream\n"  # Content-Type (REQUIRED for PUT)
-            f"\n"                          # Date (empty, using x-ms-date instead)
-            f"\n"                          # If-Modified-Since (empty)
-            f"\n"                          # If-Match (empty)
-            f"\n"                          # If-None-Match (empty)
-            f"\n"                          # If-Unmodified-Since (empty)
-            f"\n"                          # Range (empty)
-            f"x-ms-blob-type:BlockBlob\n"  # CanonicalizedHeaders (sorted alphabetically)
-            f"x-ms-date:{date_str}\n"
-            f"x-ms-version:2020-12-06\n"
-            f"/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}"  # CanonicalizedResource
+            f"PUT\n\n\n{content_length}\n\napplication/octet-stream\n\n\n\n\n\n\n"
+            f"x-ms-blob-type:BlockBlob\nx-ms-date:{date_str}\nx-ms-version:2020-12-06\n"
+            f"/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}"
         )
         
-        # Sign with HMAC-SHA256
         account_key = base64.b64decode(AZURE_STORAGE_KEY)
         signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest()
         signature = base64.b64encode(signed_hmac).decode('utf-8')
         
-        # Build authorization header
-        auth_header = f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}"
-        
-        # Set headers - MUST match what was signed
         headers = {
             "x-ms-date": date_str,
             "x-ms-version": "2020-12-06",
             "x-ms-blob-type": "BlockBlob",
             "Content-Type": "application/octet-stream",
             "Content-Length": str(content_length),
-            "Authorization": auth_header
+            "Authorization": f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}"
         }
         
-        # Upload
         r = requests.put(url, data=file_bytes, headers=headers, timeout=300)
         
         if r.status_code not in [201, 200]:
-            return None, f"Upload failed: HTTP {r.status_code} - {r.text}"
+            return None, f"Upload failed: HTTP {r.status_code}"
         
-        # Generate SAS token for reading
         sas_token = generate_sas_token_fixed(blob_name)
         if not sas_token:
             return None, "Failed to generate SAS token"
         
         sas_url = f"{url}?{sas_token}"
         
-        # Test the SAS URL
         is_valid, test_msg = test_sas_url(sas_url)
         if not is_valid:
             return None, f"SAS URL validation failed: {test_msg}"
@@ -624,98 +783,11 @@ def upload_to_azure_blob_fixed(file_bytes: bytes, blob_name: str) -> Tuple[Optio
         
     except Exception as e:
         import traceback
-        return None, f"Upload error: {str(e)}\n{traceback.format_exc()}"
-
-
-def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> str:
-    """
-    FIXED SAS token generation for Azure Blob - Service SAS format.
-    """
-    if not AZURE_STORAGE_KEY:
-        return None
-    
-    try:
-        # Set expiry in UTC
-        expiry = datetime.now(timezone.utc) + timedelta(hours=expiry_hours)
-        expiry_str = expiry.strftime('%Y-%m-%dT%H:%M:%SZ')
-        
-        # Decode account key
-        account_key = base64.b64decode(AZURE_STORAGE_KEY)
-        
-        # ====================================================================
-        # CRITICAL FIX: Service SAS string-to-sign format
-        # Reference: https://docs.microsoft.com/en-us/rest/api/storageservices/create-service-sas  
-        # Format for Blob service SAS:
-        # StringToSign = signedPermissions + "\n" +
-        #                signedStart + "\n" +
-        #                signedExpiry + "\n" +
-        #                canonicalizedResource + "\n" +
-        #                signedIdentifier + "\n" +
-        #                signedIP + "\n" +
-        #                signedProtocol + "\n" +
-        #                signedVersion + "\n" +
-        #                signedResource + "\n" +
-        #                signedSnapshotTime + "\n" +
-        #                signedEncryptionScope + "\n" +
-        #                signedCacheControl + "\n" +
-        #                signedContentDisposition + "\n" +
-        #                signedContentEncoding + "\n" +
-        #                signedContentLanguage + "\n" +
-        #                signedContentType
-        # ====================================================================
-        
-        # Canonicalized resource for service SAS: /blob/{account}/{container}/{blob}
-        canonicalized_resource = f"/blob/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}"
-        
-        # Build string to sign for Service SAS
-        string_to_sign = (
-            f"r\n"                           # signed permissions (read)
-            f"\n"                            # signed start (empty)
-            f"{expiry_str}\n"                # signed expiry
-            f"{canonicalized_resource}\n"    # canonicalized resource
-            f"\n"                            # signed identifier (empty)
-            f"\n"                            # signed IP (empty)
-            f"https\n"                       # signed protocol
-            f"2020-12-06\n"                  # signed version
-            f"b\n"                           # signed resource (b = blob)
-            f"\n"                            # signed snapshot time (empty)
-            f"\n"                            # signed encryption scope (empty)
-            f"\n"                            # signed cache control (empty)
-            f"\n"                            # signed content disposition (empty)
-            f"\n"                            # signed content encoding (empty)
-            f"\n"                            # signed content language (empty)
-            f""                              # signed content type (empty, no newline at end)
-        )
-        
-        # Sign with HMAC-SHA256
-        signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest()
-        signature = base64.b64encode(signed_hmac).decode('utf-8')
-        
-        # Build query parameters - Order matters for some clients
-        sas_params = {
-            'sv': '2020-12-06',             # signed version
-            'sr': 'b',                      # signed resource (blob)
-            'sp': 'r',                      # signed permissions (read)
-            'se': expiry_str,               # signed expiry
-            'spr': 'https',                 # signed protocol
-            'sig': signature                # signature
-        }
-        
-        # URL encode the signature and other values
-        sas_token = '&'.join([f"{k}={urllib.parse.quote(v, safe='')}" for k, v in sas_params.items()])
-        return sas_token
-        
-    except Exception as e:
-        st.error(f"SAS generation error: {e}")
-        import traceback
-        st.error(traceback.format_exc())
-        return None
+        return None, f"Upload error: {str(e)}"
 
 
 def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]:
-    """
-    Upload using Azure SDK (more reliable, requires azure-storage-blob package).
-    """
+    """Upload using Azure SDK (preferred method)."""
     try:
         from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
         
@@ -729,17 +801,14 @@ def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optiona
         blob_service = BlobServiceClient.from_connection_string(connection_string)
         container_client = blob_service.get_container_client(INPUT_CONTAINER)
         
-        # Ensure container exists
         try:
             container_client.create_container()
         except Exception:
             pass
         
-        # Upload blob
         blob_client = container_client.get_blob_client(blob_name)
         blob_client.upload_blob(file_bytes, overwrite=True)
         
-        # Generate SAS token
         sas_token = generate_blob_sas(
             account_name=AZURE_STORAGE_ACCOUNT,
             container_name=INPUT_CONTAINER,
@@ -752,7 +821,6 @@ def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optiona
         
         sas_url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}?{sas_token}"
         
-        # Test the SAS URL
         is_valid, test_msg = test_sas_url(sas_url)
         if not is_valid:
             return None, f"SAS URL validation failed: {test_msg}"
@@ -763,7 +831,7 @@ def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optiona
         return None, "azure-storage-blob not installed"
     except Exception as e:
         import traceback
-        return None, f"SDK upload failed: {str(e)}\n{traceback.format_exc()}"
+        return None, f"SDK upload failed: {str(e)}"
 
 
 def save_segments_to_blob(video_id: str, segments: list) -> str:
@@ -779,28 +847,14 @@ def save_segments_to_blob(video_id: str, segments: list) -> str:
     
     date_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
     string_to_sign = (
-        f"PUT\n"
-        f"\n"
-        f"\n"
-        f"{content_length}\n"
-        f"\n"
-        f"application/json\n"
-        f"\n"
-        f"\n"
-        f"\n"
-        f"\n"
-        f"\n"
-        f"\n"
-        f"x-ms-blob-type:BlockBlob\n"
-        f"x-ms-date:{date_str}\n"
-        f"x-ms-version:2020-12-06\n"
+        f"PUT\n\n\n{content_length}\n\napplication/json\n\n\n\n\n\n\n"
+        f"x-ms-blob-type:BlockBlob\nx-ms-date:{date_str}\nx-ms-version:2020-12-06\n"
         f"/{AZURE_STORAGE_ACCOUNT}/{SEGMENTS_CONTAINER}/{blob_name}"
     )
     
     account_key = base64.b64decode(AZURE_STORAGE_KEY)
     signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest()
     signature = base64.b64encode(signed_hmac).decode('utf-8')
-    auth_header = f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}"
     
     headers = {
         "x-ms-date": date_str,
@@ -808,7 +862,7 @@ def save_segments_to_blob(video_id: str, segments: list) -> str:
         "x-ms-blob-type": "BlockBlob",
         "Content-Type": "application/json",
         "Content-Length": str(content_length),
-        "Authorization": auth_header
+        "Authorization": f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}"
     }
     
     r = requests.put(url, data=json_bytes, headers=headers, timeout=60)
@@ -817,16 +871,9 @@ def save_segments_to_blob(video_id: str, segments: list) -> str:
     return blob_name
 
 
-def check_yt_dlp() -> bool:
-    try:
-        result = subprocess.run(["which", "yt-dlp"], capture_output=True, text=True)
-        return result.returncode == 0
-    except:
-        return False
-
-
-def download_youtube_audio(youtube_url: str, output_path: str, progress_callback=None) -> Tuple[Optional[str], Optional[str]]:
-    """Download YouTube audio to specific path."""
+def download_youtube_audio(youtube_url: str, output_path: str, 
+                          progress_callback=None) -> Tuple[Optional[str], Optional[str]]:
+    """Download audio from YouTube."""
     if not check_yt_dlp():
         return None, "yt-dlp not installed. Run: pip install yt-dlp"
     
@@ -840,18 +887,16 @@ def download_youtube_audio(youtube_url: str, output_path: str, progress_callback
             "--extract-audio",
             "--audio-format", "m4a",
             "--audio-quality", "0",
-            "--no-check-certificate",  # Added for compatibility
-            "--no-warnings",           # Reduce noise
+            "--no-check-certificate",
+            "--no-warnings",
             "-o", output_path,
             youtube_url.strip()
         ]
         
-        # Try to use Node.js runtime if available, otherwise let yt-dlp handle it
-        # This fixes the "No supported JavaScript runtime" error
+        # Handle missing Node.js
         try:
             node_check = subprocess.run(["which", "node"], capture_output=True, text=True)
             if node_check.returncode != 0:
-                # No node.js, try to use legacy format that doesn't require JS
                 cmd.extend(["--extractor-args", "youtube:player_client=web"])
         except:
             pass
@@ -863,16 +908,14 @@ def download_youtube_audio(youtube_url: str, output_path: str, progress_callback
         
         if result.returncode != 0:
             error_msg = result.stderr[:500]
-            # Provide helpful error message for JS runtime issues
             if "JavaScript runtime" in error_msg:
                 error_msg += "\n\n💡 Tip: Install Node.js or run: pip install yt-dlp --upgrade"
             return None, f"yt-dlp failed: {error_msg}"
         
-        # Find the actual file
+        # Find downloaded file
         if os.path.exists(output_path):
             return output_path, None
         
-        # Try alternative extensions
         base = output_path.rsplit('.', 1)[0]
         for ext in ['.m4a', '.mp3', '.webm', '.opus']:
             alt_path = base + ext
@@ -887,44 +930,16 @@ def download_youtube_audio(youtube_url: str, output_path: str, progress_callback
         return None, f"Error: {str(e)}"
 
 
-def detect_url_type(url: str) -> str:
-    """Detect if URL is YouTube, direct media, or unknown."""
-    if not url:
-        return "unknown"
-    
-    url_lower = str(url).lower().strip()
-    
-    # YouTube patterns
-    youtube_patterns = [
-        r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)',
-        r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=',
-        r'(?:https?:\/\/)?(?:www\.)?youtu\.be\/',
-        r'youtube\.com\/shorts\/'
-    ]
-    
-    for pattern in youtube_patterns:
-        if re.search(pattern, url_lower):
-            return "youtube"
-    
-    # Direct media patterns
-    media_extensions = ['.mp4', '.m4a', '.mp3', '.wav', '.mov', '.avi', '.mkv', '.webm']
-    if any(url_lower.endswith(ext) for ext in media_extensions):
-        return "direct"
-    
-    # Box.com, Google Drive, Dropbox, etc. - treat as direct
-    cloud_patterns = ['box.com', 'drive.google.com', 'dropbox.com', 'onedrive']
-    if any(pattern in url_lower for pattern in cloud_patterns):
-        return "direct"
-    
-    return "unknown"
-
+# =============================================================================
+# MAIN VIDEO PROCESSING
+# =============================================================================
 
-def process_single_video(url: str, custom_id: Optional[str] = None, 
-                        progress_bar=None, status_text=None, 
+def process_single_video(url: str, custom_id: Optional[str] = None,
+                        source_type: str = "unknown",
+                        progress_bar=None, status_text=None,
                         overall_progress: Tuple[int, int] = (0, 1)) -> Dict[str, Any]:
     """
-    Process a single video URL (YouTube or Direct).
-    Returns result dict with status and metadata.
+    Process a single video: download (if needed), transcribe, segment, index.
     """
     result = {
         "url": url,
@@ -932,27 +947,24 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
         "status": "pending",
         "segments_count": 0,
         "error": None,
-        "index_status": None
+        "index_status": None,
+        "source_url": url,
+        "source_type": source_type,
+        "url_stored": False
     }
     
     try:
-        # Detect URL type
+        # Validate URL
         url_type = detect_url_type(url)
-        
         if url_type == "unknown":
             result["status"] = "failed"
             result["error"] = "Unknown URL type. Must be YouTube or direct media URL."
             return result
         
         # Generate video ID
-        if custom_id:
-            video_id = custom_id.strip()
-        else:
-            video_id = generate_video_id(f"batch_{url}")
-        
+        video_id = custom_id.strip() if custom_id else generate_video_id(f"batch_{url}")
         result["video_id"] = video_id
         
-        # Update progress
         current, total = overall_progress
         base_progress = int((current / total) * 100) if progress_bar else 0
         
@@ -981,7 +993,6 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                     result["error"] = f"Download failed: {error}"
                     return result
                 
-                # Read and upload
                 with open(downloaded_path, 'rb') as f:
                     file_bytes = f.read()
                 
@@ -990,7 +1001,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                 if status_text:
                     status_text.text(f"[{current}/{total}] Uploading to Azure...")
                 
-                # Try SDK first
+                # Try SDK first, fallback to REST
                 sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name)
                 if error and ("not installed" in error or "SDK" in error):
                     sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name)
@@ -1013,7 +1024,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
             result["error"] = "No media URL available"
             return result
         
-        # Submit transcription
+        # Submit to Speech API
         if status_text:
             status_text.text(f"[{current}/{total}] Submitting to Speech API...")
         
@@ -1034,9 +1045,8 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
             poll_result = poll_transcription_operation(operation_url)
             status = poll_result.get("status", "unknown")
             
-            # Update progress during polling
             if progress_bar:
-                poll_progress = min(int((i / max_polls) * 20), 20)  # 20% of progress for polling
+                poll_progress = min(int((i / max_polls) * 20), 20)
                 overall = base_progress + int((1 / total) * 80) + int((1 / total) * poll_progress)
                 progress_bar.progress(min(overall, 99))
             
@@ -1054,7 +1064,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
             result["error"] = "Transcription timed out"
             return result
         
-        # Process segments
+        # Process and index
         if status_text:
             status_text.text(f"[{current}/{total}] Processing segments...")
         
@@ -1064,10 +1074,25 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
         # Save to blob
         save_segments_to_blob(video_id, segments)
         
-        # Index to search
+        # Index with URL tracking
         try:
-            index_result = index_segments_direct(video_id, segments)
-            result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents (key: {index_result.get('key_field_used', 'unknown')})"
+            index_result = index_segments_direct(
+                video_id,
+                segments,
+                source_url=url,
+                source_type=source_type
+            )
+            
+            result["url_stored"] = index_result.get('source_url_stored', False)
+            result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents"
+            
+            # Debug info (setdefault: a missing 'debug_info' key must not surface as "Indexing failed")
+            st.session_state.setdefault('debug_info', {})[video_id] = {
+                'url_fields_available': index_result.get('url_fields_available', {}),
+                'source_url_stored': index_result.get('source_url_stored', False),
+                'source_type_stored': index_result.get('source_type_stored', False)
+            }
+            
         except Exception as e:
             result["index_status"] = f"Indexing failed: {str(e)}"
         
@@ -1083,7 +1108,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
 
 
 # =============================================================================
-# PAGE 1: SEARCH
+# PAGE 1: SEARCH SEGMENTS
 # =============================================================================
 
 if page == "🔎 Search Segments":
@@ -1091,6 +1116,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
     
     if not SEARCH_FN_URL:
         st.error("SEARCH_FN_URL not configured. Cannot search.")
+        st.info("Please set SEARCH_FN_URL environment variable.")
     else:
         col1, col2 = st.columns([3, 1])
         with col1:
@@ -1116,7 +1142,17 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                     start_ms, end_ms = h.get("start_ms", 0), h.get("end_ms", 0)
                     vid, seg, score = h.get("video_id", ""), h.get("segment_id", ""), h.get("score")
                     
-                    header = f"{i}. {vid} | {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}"
+                    # Show URL info if available (ellipsis only when the URL is actually truncated)
+                    source_url = h.get('source_url', '')
+                    source_type = h.get('source_type', '')
+                    url_indicator = ""
+                    if source_url:
+                        short_url = source_url[:40] + ("..." if len(source_url) > 40 else "")
+                        url_indicator = f" | 🔗 {source_type}: {short_url}"
+                    elif source_type and source_type != 'unknown':
+                        url_indicator = f" | 📁 {source_type}"
+                    
+                    header = f"{i}. {vid} | {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}{url_indicator}"
                     if seg:
                         header += f" | seg={seg}"
                     if score is not None:
@@ -1126,23 +1162,40 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                         st.write(h.get("text", ""))
                         if h.get("pred_labels"):
                             st.caption(f"Labels: {', '.join(h['pred_labels'])}")
+                        if source_url:
+                            st.caption(f"**Source:** [{source_url}]({source_url})")
+                            st.caption(f"**Type:** {source_type}")
+            
             except Exception as e:
                 st.error(f"Search failed: {e}")
 
 
 # =============================================================================
-# PAGE 2: UPLOAD (DIRECT API VERSION)
+# PAGE 2: UPLOAD & TRANSCRIBE
 # =============================================================================
 
 elif page == "⬆️ Upload & Transcribe":
     st.header("Upload Video for Transcription")
-    st.info(" Using direct Azure Speech API (bypassing Azure Function)")
     
-    # Check Azure config
+    # Check URL fields status
+    url_status = check_url_fields_status()
+    
+    if url_status['fields_exist']:
+        st.success("✅ URL Tracking Enabled - Original source URLs will be stored")
+    else:
+        st.warning(f"""
+        ⚠️ **Partial URL Tracking** - Missing fields: {', '.join(url_status['missing_fields'])}
+        
+        Videos will still be processed, but URL information will be limited.
+        Add missing fields to your Azure Search index for full functionality.
+        """)
+    
+    # Check Azure configuration
     azure_configured = bool(AZURE_STORAGE_KEY) and bool(SPEECH_KEY)
     if not azure_configured:
         st.error("⚠️ Azure Storage and Speech keys required. Check .env file.")
     
+    # Source selection
     source_type = st.radio("Select Source", 
                           ["File Upload", "Direct URL", "YouTube", "📁 Batch CSV Upload"],
                           horizontal=True)
@@ -1150,12 +1203,11 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
     media_url = None
     video_id = None
     file_bytes = None
-    yt_url = None  # Initialize to None
+    yt_url = None
     csv_df = None
+    detected_source_type = "unknown"
     
-    # -------------------------------------------------------------------------
-    # FILE UPLOAD
-    # -------------------------------------------------------------------------
+    # --- File Upload ---
     if source_type == "File Upload":
         if not azure_configured:
             st.info("Please configure Azure Storage to enable file upload")
@@ -1170,143 +1222,105 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                 st.success(f"📁 {uploaded_file.name} ({uploaded_file.size / 1024 / 1024:.1f} MB)")
                 file_bytes = uploaded_file.getvalue()
                 video_id = generate_video_id(uploaded_file.name)
-                st.info("File ready for upload to Azure")
+                detected_source_type = "upload"
+                st.info("File ready for upload")
     
-    # -------------------------------------------------------------------------
-    # DIRECT URL
-    # -------------------------------------------------------------------------
+    # --- Direct URL ---
     elif source_type == "Direct URL":
-        url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/...")
+        url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/ ...")
         
         if url_input.strip():
             media_url = url_input.strip()
             video_id = generate_video_id(url_input)
+            detected_source_type = "direct"
             st.success("✅ URL validated")
     
-    # -------------------------------------------------------------------------
-    # YOUTUBE - FIXED with session state
-    # -------------------------------------------------------------------------
+    # --- YouTube ---
     elif source_type == "YouTube":
-        # Use session state to persist the URL
         yt_url = st.text_input(
-            "YouTube URL", 
-            placeholder="https://youtube.com/watch?v=...",
+            "YouTube URL",
+            placeholder="https://youtube.com/watch?v= ...",
             value=st.session_state.yt_url_value,
             key="yt_url_input"
         )
         
-        # Update session state when URL changes - FIXED: removed experimental_rerun
+        # Update session state
         if yt_url != st.session_state.yt_url_value:
             st.session_state.yt_url_value = yt_url
-            # Use st.rerun() instead of st.experimental_rerun() for newer Streamlit versions
             try:
                 st.rerun()
-            except AttributeError:
-                # Fallback for older versions
-                try:
-                    st.experimental_rerun()
-                except AttributeError:
-                    pass  # If neither exists, just continue without rerun
+            except AttributeError:  # not bare: st.rerun() signals via an exception that must propagate
+                pass  # st.rerun is absent on older Streamlit; continue without a rerun
         
+        # Check yt-dlp
         if not check_yt_dlp():
             st.warning("yt-dlp not installed")
             if st.button("Install yt-dlp"):
                 with st.spinner("Installing..."):
                     subprocess.run(["pip", "install", "-q", "yt-dlp"])
-                # FIXED: Use st.rerun() instead of experimental_rerun
                 try:
                     st.rerun()
-                except AttributeError:
-                    try:
-                        st.experimental_rerun()
-                    except AttributeError:
-                        st.info("Please refresh the page manually")
+                except AttributeError:  # not bare: st.rerun() signals via an exception that must propagate
+                    st.info("Please refresh the page")
         elif yt_url and yt_url.strip():
             video_id = generate_video_id(f"yt_{yt_url.strip()}")
+            detected_source_type = "youtube"
             st.success("YouTube URL ready")
     
-    # -------------------------------------------------------------------------
-    # BATCH CSV UPLOAD - NEW FEATURE
-    # -------------------------------------------------------------------------
+    # --- Batch CSV Upload ---
     elif source_type == "📁 Batch CSV Upload":
         st.subheader("📁 Batch Process Videos from CSV")
         
         csv_file = st.file_uploader(
             "Upload CSV file",
             type=["csv"],
-            help="CSV must contain a column with video URLs (YouTube or direct links)"
+            help="CSV must contain a column with video URLs"
         )
         
         if csv_file:
             try:
-                # Read CSV - handle various formats
-                # Try to detect if URLs are in header or rows
-                content = csv_file.read().decode('utf-8')
-                csv_file.seek(0)  # Reset pointer
-                
-                # First attempt: standard read
+                # Read CSV with flexible parsing
                 try:
                     csv_df = pd.read_csv(csv_file)
                 except Exception:
-                    # Second attempt: maybe single column with no header
                     csv_file.seek(0)
                     csv_df = pd.read_csv(csv_file, header=None)
                     csv_df.columns = [f"column_{i}" for i in range(len(csv_df.columns))]
                 
-                # Check if column names look like URLs (common issue)
+                # Handle case where column names are URLs
                 url_like_columns = []
                 for col in csv_df.columns:
                     col_str = str(col).strip()
-                    if detect_url_type(col_str) != "unknown" or col_str.startswith('http'):
+                    if detect_url_type(col_str) != "unknown":
                         url_like_columns.append(col)
                 
-                # If column names look like URLs, treat them as data
                 if url_like_columns and len(csv_df.columns) == 1:
-                    # The column name is actually a URL, convert to data
                     url_col_name = csv_df.columns[0]
                     new_row = {url_col_name: url_col_name}
                     csv_df = pd.concat([pd.DataFrame([new_row]), csv_df], ignore_index=True)
                 
-                st.success(f"✅ Loaded CSV with {len(csv_df)} rows and {len(csv_df.columns)} columns")
-                
-                # Show available columns
-                st.write("**Available columns:**", list(csv_df.columns))
+                st.success(f"✅ Loaded CSV with {len(csv_df)} rows")
                 
-                # Let user select the URL column
-                url_column = st.selectbox(
-                    "Select column containing video URLs",
-                    options=csv_df.columns.tolist(),
-                    help="Choose the column that contains YouTube or direct media URLs"
-                )
+                # Column selection
+                url_column = st.selectbox("Select column containing video URLs", options=csv_df.columns.tolist())
                 
-                # Optional: Select custom ID column
                 id_column_options = ["Auto-generate"] + [c for c in csv_df.columns if c != url_column]
-                id_column = st.selectbox(
-                    "Select column for custom Video ID (optional)",
-                    options=id_column_options,
-                    index=0,
-                    help="Optional: Choose a column to use as custom video ID (e.g., title, ID field)"
-                )
+                id_column = st.selectbox("Select column for custom Video ID (optional)", options=id_column_options, index=0)
                 
                 # Extract and validate URLs
                 urls_raw = csv_df[url_column].dropna().astype(str).tolist()
-                
-                # Clean URLs (remove whitespace)
                 urls_to_process = [u.strip() for u in urls_raw if u.strip()]
                 
                 # Preview
-                with st.expander(f"Preview URLs to process ({len(urls_to_process)} found)"):
+                with st.expander(f"Preview URLs ({len(urls_to_process)} found)"):
                     for i, url in enumerate(urls_to_process[:10], 1):
                         url_type = detect_url_type(url)
                         icon = "🎬" if url_type == "youtube" else "📄" if url_type == "direct" else "❓"
                         st.text(f"{i}. {icon} {url[:80]}...")
-                    if len(urls_to_process) > 10:
-                        st.caption(f"... and {len(urls_to_process) - 10} more")
                 
-                # Validate URLs
+                # Validate
                 valid_urls = []
                 invalid_urls = []
-                
                 for url in urls_to_process:
                     url_type = detect_url_type(str(url))
                     if url_type in ["youtube", "direct"]:
@@ -1315,16 +1329,11 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                         invalid_urls.append(url)
                 
                 col1, col2, col3 = st.columns(3)
-                col1.metric("Total URLs", len(urls_to_process))
-                col2.metric("✅ Valid", len(valid_urls), f"{len(valid_urls)/len(urls_to_process)*100:.1f}%" if urls_to_process else "0%")
+                col1.metric("Total", len(urls_to_process))
+                col2.metric("✅ Valid", len(valid_urls))
                 col3.metric("❌ Invalid", len(invalid_urls))
                 
-                if invalid_urls:
-                    with st.expander(f"Show {len(invalid_urls)} invalid URLs"):
-                        for url in invalid_urls[:10]:
-                            st.text(f"❌ {url[:100]}...")
-                
-                # Store in session state for processing
+                # Store in session state
                 st.session_state['batch_urls'] = valid_urls
                 st.session_state['batch_df'] = csv_df
                 st.session_state['batch_url_column'] = url_column
@@ -1335,7 +1344,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                 import traceback
                 st.error(traceback.format_exc())
     
-    # Custom ID (for single uploads)
+    # Custom ID input
     custom_id = st.text_input("Custom Video ID (optional)")
     if custom_id.strip() and source_type != "📁 Batch CSV Upload":
         video_id = custom_id.strip()
@@ -1347,7 +1356,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
     elif source_type == "Direct URL":
         can_process = media_url is not None and len(str(media_url).strip()) > 0
     elif source_type == "YouTube":
-        yt_url_to_check = st.session_state.get('yt_url_value', '') or (yt_url if yt_url else '')
+        yt_url_to_check = st.session_state.get('yt_url_value', '')
         can_process = len(str(yt_url_to_check).strip()) > 0 and check_yt_dlp()
     elif source_type == "📁 Batch CSV Upload":
         can_process = (st.session_state.get('batch_urls') and 
@@ -1356,16 +1365,14 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                       not st.session_state.get('batch_processing', False))
     
     # Process button
-    button_text = " Start Transcription"
+    button_text = "🚀 Start Transcription"
     if source_type == "📁 Batch CSV Upload":
         count = len(st.session_state.get('batch_urls', []))
-        button_text = f" Process {count} Videos from CSV"
+        button_text = f"🚀 Process {count} Videos from CSV"
     
     if st.button(button_text, type="primary", disabled=not can_process):
         
-        # ---------------------------------------------------------------------
-        # BATCH PROCESSING
-        # ---------------------------------------------------------------------
+        # --- BATCH PROCESSING ---
         if source_type == "📁 Batch CSV Upload":
             st.session_state.batch_processing = True
             st.session_state.batch_results = []
@@ -1376,31 +1383,32 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
             id_column = st.session_state.get('batch_id_column')
             
             total = len(urls)
-            
             st.info(f"Starting batch processing of {total} videos...")
             
-            # Create progress containers
+            # Progress UI
             overall_progress = st.progress(0)
             status_text = st.empty()
             results_container = st.container()
             
-            # Process each URL
             results = []
             for idx, url in enumerate(urls, 1):
                 # Get custom ID if specified
                 custom_vid_id = None
                 if id_column != "Auto-generate":
-                    # Find the row with this URL and get the ID
                     row = csv_df[csv_df[url_column] == url]
                     if not row.empty:
                         custom_vid_id = str(row[id_column].iloc[0])
-                        # Sanitize ID
                         custom_vid_id = re.sub(r'[^\w\s-]', '', custom_vid_id).strip().replace(' ', '_')[:50]
                 
-                # Process video
+                # Detect source type
+                url_type = detect_url_type(url)
+                src_type = "youtube" if url_type == "youtube" else "direct"
+                
+                # Process
                 result = process_single_video(
                     url=url,
                     custom_id=custom_vid_id,
+                    source_type=src_type,
                     progress_bar=overall_progress,
                     status_text=status_text,
                     overall_progress=(idx, total)
@@ -1413,19 +1421,16 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                 progress_pct = int((idx / total) * 100)
                 overall_progress.progress(progress_pct)
                 
-                # Show result in container
+                # Show result
                 with results_container:
                     if result['status'] == 'success':
-                        st.success(f"✅ [{idx}/{total}] {result['video_id']}: {result['segments_count']} segments")
+                        url_stored = "✅ URL saved" if result.get('url_stored') else "⚠️ URL not stored"
+                        st.success(f"✅ [{idx}/{total}] {result['video_id']}: {result['segments_count']} segments ({url_stored})")
                     else:
-                        error_msg = result.get('error', 'Unknown error')
-                        # Truncate long error messages
-                        if len(error_msg) > 200:
-                            error_msg = error_msg[:200] + "..."
-                        st.error(f"❌ [{idx}/{total}] Failed: {error_msg}")
+                        error_msg = str(result.get('error') or 'Unknown error')  # tolerate a None/empty error value
+                        st.error(f"❌ [{idx}/{total}] Failed: {error_msg[:200]}{'...' if len(error_msg) > 200 else ''}")
                 
-                # Small delay to prevent rate limiting
-                time.sleep(1)
+                time.sleep(1)  # Rate limiting
             
             # Final summary
             overall_progress.progress(100)
@@ -1442,14 +1447,16 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
             col2.metric("Successful", len(successful), f"{len(successful)/total*100:.1f}%" if total > 0 else "0%")
             col3.metric("Failed", len(failed), f"{len(failed)/total*100:.1f}%" if total > 0 else "0%")
             
-            # Detailed results table
+            # Detailed results
             with st.expander("View Detailed Results"):
                 results_df = pd.DataFrame([
                     {
                         'Video ID': r['video_id'],
                         'URL': r['url'][:50] + "..." if len(r['url']) > 50 else r['url'],
+                        'Source Type': r.get('source_type', 'unknown'),
                         'Status': r['status'],
                         'Segments': r.get('segments_count', 0),
+                        'URL Stored': r.get('url_stored', False),
                         'Indexing': r.get('index_status', 'N/A'),
                         'Error': (r.get('error', '')[:100] + '...') if r.get('error') else ''
                     }
@@ -1457,73 +1464,48 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                 ])
                 st.dataframe(results_df)
                 
-                # Download results as CSV
+                # Download results
                 csv_buffer = io.StringIO()
                 results_df.to_csv(csv_buffer, index=False)
-                st.download_button(
-                    "Download Results CSV",
-                    csv_buffer.getvalue(),
-                    "batch_processing_results.csv",
-                    "text/csv"
-                )
+                st.download_button("Download Results CSV", csv_buffer.getvalue(), "batch_results.csv", "text/csv")
             
             # Search hint
             if successful:
-                st.info("💡 **Search processed videos using:**")
+                st.info("💡 **Search processed videos:**")
                 video_ids = [r['video_id'] for r in successful[:5]]
                 st.code(f"video_id:({' OR '.join(video_ids)})")
             
             st.session_state.batch_processing = False
-            
+        
+        # --- SINGLE VIDEO PROCESSING ---
         else:
-            # -----------------------------------------------------------------
-            # SINGLE VIDEO PROCESSING (Original logic)
-            # -----------------------------------------------------------------
             progress_bar = st.progress(0)
             status = st.empty()
             
             try:
-                # -------------------------------------------------------------
-                # HANDLE FILE UPLOAD (Direct to Azure)
-                # -------------------------------------------------------------
+                # Upload file if needed
                 if source_type == "File Upload" and file_bytes:
                     progress_bar.progress(10)
                     status.text("Uploading to Azure Blob...")
                     
                     blob_name = f"upload_{video_id}_{int(time.time())}.m4a"
                     
-                    # Try SDK method first, fallback to fixed REST method
-                    sas_url = None
-                    error = None
-                    
-                    try:
-                        sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name)
-                    except Exception as e:
-                        error = str(e)
-                    
+                    sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name)
                     if error and ("not installed" in error or "SDK" in error):
-                        st.info("Using REST API for upload...")
                         sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name)
                     
                     if error:
                         raise Exception(error)
                     
-                    if not sas_url:
-                        raise Exception("Failed to generate SAS URL")
-                    
                     media_url = sas_url
                     progress_bar.progress(50)
-                    status.text("Upload complete, starting transcription...")
                 
-                # -------------------------------------------------------------
-                # HANDLE YOUTUBE (Download then Upload)
-                # -------------------------------------------------------------
+                # Download YouTube if needed
                 elif source_type == "YouTube":
-                    # Get URL from session state
                     yt_url = st.session_state.get('yt_url_value', '')
                     
                     if not yt_url or not yt_url.strip():
-                        raise Exception("YouTube URL is empty. Please enter a valid YouTube URL.")
+                        raise Exception("YouTube URL is empty")
                     
                     import tempfile
                     with tempfile.TemporaryDirectory() as tmpdir:
@@ -1531,11 +1513,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                         status.text("Downloading from YouTube...")
                         
                         output_path = f"{tmpdir}/youtube_{video_id}.m4a"
-                        downloaded_path, error = download_youtube_audio(
-                            yt_url.strip(), 
-                            output_path,
-                            lambda p, m: (progress_bar.progress(p), status.text(m))
-                        )
+                        downloaded_path, error = download_youtube_audio(yt_url.strip(), output_path)
                         
                         if error:
                             raise Exception(error)
@@ -1543,42 +1521,25 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                         progress_bar.progress(50)
                         status.text("Uploading to Azure Blob...")
                         
-                        # Read file and upload
                         with open(downloaded_path, 'rb') as f:
                             file_bytes = f.read()
                         
                         blob_name = f"youtube_{video_id}_{int(time.time())}.m4a"
                         
-                        # Try SDK first, fallback to fixed REST
-                        sas_url = None
-                        error = None
-                        
-                        try:
-                            sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name)
-                        except Exception as e:
-                            error = str(e)
-                        
-                        if error and ("not installed" in error or "SDK" in error):
-                            st.info("Using REST API for upload...")
+                        sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name)
+                        if error and ("not installed" in error or "SDK" in error):  # same fallback rule as the File Upload path
                             sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name)
                         
                         if error:
                             raise Exception(error)
                         
-                        if not sas_url:
-                            raise Exception("Failed to generate SAS URL")
-                        
                         media_url = sas_url
                         progress_bar.progress(75)
-                        status.text("Processing with Azure Speech...")
                 
-                # -------------------------------------------------------------
-                # TRANSCRIBE (All paths lead here)
-                # -------------------------------------------------------------
                 if not media_url:
                     raise Exception("No media URL available")
                 
-                # Submit directly to Azure Speech API
+                # Transcribe
                 status.text("Submitting to Azure Speech-to-Text...")
                 result = submit_transcription_direct(video_id, media_url)
                 operation_url = result.get("operation_url")
@@ -1586,9 +1547,6 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                 if not operation_url:
                     raise Exception("No operation URL returned")
                 
-                # Debug info
-                st.info(f"Debug: Operation URL received")
-                
                 # Poll
                 max_polls = 120
                 transcription_data = None
@@ -1603,35 +1561,40 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                     status.text(f"Transcribing... ({i * POLL_SECONDS // 60} min) - Status: {status_text}")
                     
                     if status_text.lower() == "succeeded":
-                        status.text("Transcription complete, retrieving results...")
                         transcription_data = get_transcription_from_result(poll_result)
                         break
-                        
                     elif status_text.lower() == "failed":
-                        error_msg = poll_result.get("properties", {}).get("error", {}).get("message", "Unknown error")
-                        raise Exception(f"Transcription failed: {error_msg}")
+                        raise Exception(f"Transcription failed: {poll_result.get('properties', {}).get('error', {}).get('message', 'Unknown error')}")
                 
                 if not transcription_data:
                     raise Exception("Transcription timed out")
                 
-                # -------------------------------------------------------------
-                # PROCESS & INDEX (DIRECT)
-                # -------------------------------------------------------------
+                # Process and index
                 progress_bar.progress(98)
                 status.text("Processing segments and indexing...")
                 
-                # Convert to segments
                 segments = process_transcription_to_segments(transcription_data, video_id)
                 
                 # Save to blob
-                blob_name = save_segments_to_blob(video_id, segments)
+                save_segments_to_blob(video_id, segments)
                 
-                # Index to search
-                try:
-                    index_result = index_segments_direct(video_id, segments)
-                    index_msg = f"Indexed: {index_result.get('indexed', 0)} documents (key field: {index_result.get('key_field_used', 'unknown')})"
-                except Exception as e:
-                    index_msg = f"Indexing failed: {str(e)}"
+                # Index with URL tracking
+                original_url = None
+                if source_type == "YouTube":
+                    original_url = st.session_state.get('yt_url_value', '')
+                elif source_type == "Direct URL":
+                    original_url = media_url
+                elif source_type == "File Upload":
+                    original_url = f"uploaded_file://{video_id}"
+                
+                index_result = index_segments_direct(
+                    video_id,
+                    segments,
+                    source_url=original_url,
+                    source_type=detected_source_type
+                )
+                
+                url_stored_msg = "✅ Source URL stored" if index_result.get('source_url_stored') else "⚠️ URL storage not available"
                 
                 progress_bar.progress(100)
                 status.text("Complete!")
@@ -1640,24 +1603,279 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                 ✅ **Transcription Complete!**
                 - Video ID: {video_id}
                 - Segments: {len(segments)}
-                - {index_msg}
+                - Source Type: {detected_source_type}
+                - Indexed: {index_result.get('indexed', 0)} documents
+                - {url_stored_msg}
                 """)
+                
+                if original_url:
+                    st.info(f"**Original Source:** [{original_url}]({original_url})")
+                
                 st.code(f'Search: video_id:{video_id}')
                 
-                # Show sample segments
                 with st.expander("View first 5 segments"):
                     for seg in segments[:5]:
                         st.write(f"**{ms_to_ts(seg['start_ms'])} - {ms_to_ts(seg['end_ms'])}:** {seg['text'][:100]}...")
-                    
+                
             except Exception as e:
                 st.error(f"❌ Error: {str(e)}")
                 st.exception(e)
+
+
+# =============================================================================
+# PAGE 3: MANAGE VIDEOS
+# =============================================================================
+
+elif page == "📚 Manage Videos":
+    st.header("📚 Manage Stored Videos")
+    st.info("View, search, and manage all processed videos and their source URLs")
+    
+    if not SEARCH_ENDPOINT or not SEARCH_KEY:
+        st.error("Azure Search not configured. Cannot retrieve video list.")
+    else:
+        # Check URL fields status
+        url_status = check_url_fields_status()
+        
+        if url_status['fields_exist']:
+            st.success("✅ URL tracking fields are configured")
+        else:
+            st.warning(f"⚠️ Missing URL fields: {', '.join(url_status['missing_fields'])}")
+        
+        # URL coverage analysis
+        if st.button("📊 Analyze URL Data Coverage"):
+            with st.spinner("Analyzing..."):
+                all_videos = get_stored_videos(include_missing=True)
+                
+                with_urls = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']]
+                without_urls = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']]
+                
+                col1, col2, col3 = st.columns(3)
+                col1.metric("Total Videos", len(all_videos))
+                col2.metric("✅ With URL Data", len(with_urls), f"{len(with_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%")
+                col3.metric("⚠️ Missing URL Data", len(without_urls), f"{len(without_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%")
+                
+                # By type breakdown
+                st.subheader("Breakdown by Source Type")
+                type_counts = {}
+                for v in all_videos:
+                    t = v.get('source_type') or 'unknown'
+                    type_counts[t] = type_counts.get(t, 0) + 1
+                
+                cols = st.columns(len(type_counts) if type_counts else 1)
+                for i, (stype, count) in enumerate(sorted(type_counts.items())):
+                    icon = "🎬" if stype == "youtube" else "📄" if stype == "direct" else "📁" if stype == "upload" else "❓"
+                    cols[i % len(cols)].metric(f"{icon} {stype}", count)
                 
-                # Debug info
-                if 'debug_poll_url' in st.session_state:
-                    st.error(f"Debug - Poll URL used: {st.session_state['debug_poll_url']}")
+                if without_urls:
+                    with st.expander(f"Videos without URL data ({len(without_urls)})"):
+                        st.info("These were likely processed before URL tracking was enabled")
+                        for v in without_urls[:20]:
+                            st.text(f"• {v.get('video_id')}")
+        
+        st.markdown("---")
+        
+        # Filters
+        st.subheader("Filter Videos")
+        col1, col2 = st.columns(2)
+        
+        with col1:
+            filter_video_id = st.text_input("Filter by Video ID (optional)")
+        with col2:
+            filter_options = ["All", "With URL Data Only", "Missing URL Data Only", "youtube", "direct", "upload", "unknown"]
+            filter_source_type = st.selectbox("Filter by Source Type", options=filter_options, index=0)
+        
+        # Load videos
+        if st.button("🔍 Load Videos", type="primary"):
+            with st.spinner("Retrieving videos..."):
+                
+                # Handle special filters
+                if filter_source_type == "Missing URL Data Only":
+                    all_videos = get_stored_videos(include_missing=True)
+                    videos = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']]
+                    if filter_video_id.strip():
+                        videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()]
+                elif filter_source_type == "With URL Data Only":
+                    all_videos = get_stored_videos(include_missing=True)
+                    videos = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']]
+                    if filter_video_id.strip():
+                        videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()]
+                else:
+                    source_type = None if filter_source_type == "All" else filter_source_type
+                    videos = get_stored_videos(
+                        video_id=filter_video_id if filter_video_id.strip() else None,
+                        source_type=source_type,
+                        include_missing=True,
+                        limit=1000
+                    )
+                
+                st.session_state.stored_videos_cache = videos
+                st.success(f"Found {len(videos)} videos")
+        
+        # Display videos
+        if st.session_state.stored_videos_cache:
+            videos = st.session_state.stored_videos_cache
+            
+            # Metrics
+            st.markdown("---")
+            cols = st.columns(4)
+            
+            type_counts = {}
+            for v in videos:
+                t = v.get('source_type') or 'unknown'
+                type_counts[t] = type_counts.get(t, 0) + 1
+            
+            cols[0].metric("Total", len(videos))
+            cols[1].metric("YouTube", type_counts.get('youtube', 0))
+            cols[2].metric("Direct", type_counts.get('direct', 0))
+            cols[3].metric("Upload", type_counts.get('upload', 0))
+            
+            # Group by type
+            st.markdown("---")
+            st.subheader("Video List")
+            
+            videos_by_type = {}
+            for v in videos:
+                stype = v.get('source_type') or 'unknown'
+                if stype not in videos_by_type:
+                    videos_by_type[stype] = []
+                videos_by_type[stype].append(v)
+            
+            # Display by category
+            for source_type in ['youtube', 'direct', 'upload', 'unknown']:
+                if source_type not in videos_by_type:
+                    continue
+                
+                type_videos = videos_by_type[source_type]
+                icon = "🎬" if source_type == "youtube" else "📄" if source_type == "direct" else "📁" if source_type == "upload" else "❓"
+                
+                with st.expander(f"{icon} {source_type.upper()} ({len(type_videos)} videos)", expanded=(source_type == 'youtube')):
+                    for i, video in enumerate(type_videos, 1):
+                        vid = video.get('video_id', 'unknown')
+                        src_url = video.get('source_url', '')
+                        processed = video.get('processed_at', 'unknown')
+                        
+                        has_url = bool(src_url)
+                        status_icon = "✅" if has_url else "⚠️"
+                        
+                        with st.container():
+                            cols = st.columns([4, 1])
+                            
+                            with cols[0]:
+                                st.write(f"**{status_icon} {i}. {vid}**")
+                                st.caption(f"Processed: {processed}")
+                                
+                                if src_url:
+                                    display_url = src_url[:80] + "..." if len(str(src_url)) > 80 else src_url
+                                    st.code(display_url)
+                                    if str(src_url).startswith('http'):
+                                        st.markdown(f"[Open Source ↗]({src_url})")
+                                else:
+                                    st.warning("No source URL stored")
+                            
+                            with cols[1]:
+                                if st.button(f"🗑️ Delete", key=f"del_{vid}_{i}_{source_type}"):
+                                    if delete_video_by_id(vid):
+                                        st.success(f"Deleted {vid}")
+                                        st.session_state.stored_videos_cache = [
+                                            v for v in videos if v.get('video_id') != vid
+                                        ]
+                                        try:  # st.rerun() works by raising; a bare except would swallow the rerun
+                                            st.rerun()
+                                        except AttributeError:  # Streamlit too old to provide st.rerun
+                                            st.experimental_rerun()
+                            
+                            st.markdown("---")
+            
+            # Export
+            st.markdown("---")
+            if st.button("📥 Export to CSV"):
+                export_df = pd.DataFrame([
+                    {
+                        'video_id': v.get('video_id'),
+                        'source_type': v.get('source_type') or 'unknown',
+                        'source_url': v.get('source_url', ''),
+                        'has_url_data': bool(v.get('source_url')),
+                        'processed_at': v.get('processed_at', 'unknown')
+                    }
+                    for v in videos
+                ])
+                
+                csv_buffer = io.StringIO()
+                export_df.to_csv(csv_buffer, index=False)
+                st.download_button("Download CSV", csv_buffer.getvalue(), "video_list.csv", "text/csv")
+
+
+# =============================================================================
+# PAGE 4: SYSTEM DIAGNOSTICS
+# =============================================================================
+
+elif page == "⚙️ System Diagnostics":
+    st.header("⚙️ System Diagnostics")
+    st.info("Check system configuration and troubleshoot issues")
+    
+    # Configuration status
+    st.subheader("Configuration Status")
+    
+    config_checks = {
+        "Azure Speech (SPEECH_KEY)": bool(SPEECH_KEY),
+        "Azure OpenAI (AZURE_OPENAI_KEY)": bool(AZURE_OPENAI_KEY),
+        "Azure Search (SEARCH_KEY)": bool(SEARCH_KEY),
+        "Azure Storage (AZURE_STORAGE_KEY)": bool(AZURE_STORAGE_KEY),
+        "Search Function (SEARCH_FN_URL)": bool(SEARCH_FN_URL),
+        "yt-dlp installed": check_yt_dlp()
+    }
+    
+    cols = st.columns(2)
+    for i, (name, status) in enumerate(config_checks.items()):
+        icon = "✅" if status else "❌"
+        cols[i % 2].write(f"{icon} {name}: {'OK' if status else 'Not configured'}")
+    
+    # Index schema check
+    st.markdown("---")
+    st.subheader("Index Schema Check")
+    
+    if st.button("🔍 Check Index Schema"):
+        with st.spinner("Fetching schema..."):
+            schema = debug_check_index_schema()
+            
+            if isinstance(schema, dict):
+                st.success(f"Index: {schema['index_name']}")
+                st.write(f"Key Field: `{schema['key_field']}`")
+                
+                # URL fields status
+                if schema.get('has_all_url_fields'):
+                    st.success("✅ All URL tracking fields present")
+                else:
+                    st.warning(f"⚠️ Missing fields: {', '.join(schema.get('missing_url_fields', []))}")
+                
+                # Show all fields
+                with st.expander("View all fields"):
+                    for field in schema['fields']:
+                        key = "🔑" if field['key'] else ""
+                        url = "🔗" if 'url' in field['name'].lower() else ""
+                        st.caption(f"{key}{url} `{field['name']}` ({field['type']})")
+                
+                st.session_state.index_schema_cache = schema
+            else:
+                st.error(f"Schema check failed: {schema}")
+    
+    # Debug info
+    st.markdown("---")
+    st.subheader("Debug Information")
+    
+    with st.expander("Session State"):
+        st.json({
+            k: str(v)[:100] + "..." if len(str(v)) > 100 else v 
+            for k, v in st.session_state.items()
+        })
+    
+    with st.expander("Recent Processing Debug"):
+        if st.session_state.get('debug_info'):
+            st.json(st.session_state['debug_info'])
+        else:
+            st.info("No debug info yet. Process a video first.")
 
 
 # Footer
 st.sidebar.markdown("---")
-st.sidebar.caption("Video Annotation Platform v1.0 - Direct API Mode")
\ No newline at end of file
+st.sidebar.caption("Video Annotation Platform v2.1")
\ No newline at end of file

From 7ee4987fd789f7b47d20d4dc66e54bdaedf5a91d Mon Sep 17 00:00:00 2001
From: Martin Nwadiugwu <YOUR_GITHUB_EMAIL@example.com>
Date: Mon, 23 Feb 2026 20:22:47 -0600
Subject: [PATCH 5/8] =?UTF-8?q?scripts=20to=20verify/enable=20URL=E2=80=91?=
 =?UTF-8?q?tracking?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 add_url_fields.py | 132 ++++++++++++++++++++++++++++++++++++++++++++++
 verify_fields.py  |  67 +++++++++++++++++++++++
 2 files changed, 199 insertions(+)
 create mode 100644 add_url_fields.py
 create mode 100644 verify_fields.py

diff --git a/add_url_fields.py b/add_url_fields.py
new file mode 100644
index 0000000..a9a7ed8
--- /dev/null
+++ b/add_url_fields.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+Add URL tracking fields to Azure Search index
+"""
+
+import os
+import requests
+import sys
+
+# Load settings from ui/.env (KEY=VALUE lines; lines starting with '#' are comments)
+env_path = os.path.join(os.path.dirname(__file__), "ui", ".env")
+env_vars = {}
+
+if os.path.exists(env_path):
+    with open(env_path, encoding="utf-8") as f:
+        for line in f:
+            if '=' in line and not line.lstrip().startswith('#'):
+                key, value = line.strip().split('=', 1)
+                env_vars[key.strip()] = value.strip().strip('"\'')  # drop optional quotes
+
+SEARCH_ENDPOINT = (env_vars.get("SEARCH_ENDPOINT") or "").rstrip("/")  # avoid '//indexes'
+SEARCH_KEY = env_vars.get("SEARCH_KEY")
+SEARCH_INDEX_NAME = env_vars.get("SEARCH_INDEX_NAME", "segments")
+
+print(f"Endpoint: {SEARCH_ENDPOINT}")
+print(f"Index: {SEARCH_INDEX_NAME}")
+print(f"Key: {'*' * 10 + SEARCH_KEY[-4:] if SEARCH_KEY else 'NOT FOUND'}")
+print()
+
+if not SEARCH_ENDPOINT or not SEARCH_KEY:
+    print("ERROR: Missing SEARCH_ENDPOINT or SEARCH_KEY in .env")
+    sys.exit(1)
+
+API_VERSION = "2024-07-01"
+
+def get_index():
+    """Fetch the index definition; return the parsed JSON, or None if the GET fails."""
+    url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version={API_VERSION}"
+    headers = {"api-key": SEARCH_KEY}
+    print(f"Fetching index: {url}")
+    # requests has no default timeout; bound the wait so the script cannot hang.
+    response = requests.get(url, headers=headers, timeout=30)
+    if response.status_code == 200:
+        return response.json()
+    else:
+        print(f"Failed to get index: {response.status_code}")
+        print(response.text)
+        return None
+
+def update_index(index_def):
+    """PUT index_def back to the search service; return True on HTTP 200/201, else False."""
+    url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version={API_VERSION}"
+    headers = {
+        "Content-Type": "application/json",
+        "api-key": SEARCH_KEY
+    }
+    response = requests.put(url, headers=headers, json=index_def, timeout=30)  # no default timeout
+    
+    if response.status_code in [200, 201]:
+        print("✅ Index updated successfully!")
+        return True
+    else:
+        print(f"❌ Failed to update: {response.status_code}")
+        print(response.text)
+        return False
+
+def main():
+    """Append any missing URL-tracking fields to the index; exit with status 1 on failure."""
+    print("Fetching current index...")
+    index = get_index()
+    if not index:
+        sys.exit(1)
+    
+    existing_fields = {f["name"] for f in index.get("fields", [])}
+    print(f"Existing fields: {existing_fields}")
+    print()
+    new_fields = [
+        {
+            "name": "source_url",
+            "type": "Edm.String",
+            "searchable": False,
+            "filterable": True,
+            "retrievable": True,
+            "sortable": False,
+            "facetable": False,
+            "key": False
+        },
+        {
+            "name": "source_type",
+            "type": "Edm.String",
+            "searchable": False,
+            "filterable": True,
+            "retrievable": True,
+            "sortable": False,
+            "facetable": True,
+            "key": False
+        },
+        {
+            "name": "processed_at",
+            "type": "Edm.DateTimeOffset",
+            "searchable": False,
+            "filterable": True,
+            "retrievable": True,
+            "sortable": True,
+            "facetable": False,
+            "key": False
+        }
+    ]
+    
+    added = 0
+    for field in new_fields:
+        if field["name"] in existing_fields:
+            print(f"⚠️  Already exists: {field['name']}")
+        else:
+            print(f"➕ Adding: {field['name']}")
+            index.setdefault("fields", []).append(field)  # tolerate a definition without "fields"
+            added += 1
+    if added == 0:
+        print("\n✅ All fields already present!")
+        return
+    
+    print(f"\n💾 Saving with {added} new fields...")
+    if not update_index(index):
+        sys.exit(1)  # update_index already printed the error; report failure to the shell
+    print("\n🎉 SUCCESS! URL tracking fields added.")
+    print("\nNext steps:")
+    print("1. Restart your Streamlit app")
+    print("2. Go to 'System Diagnostics' page")
+    print("3. Click 'Check Index Schema' to verify")
+
+if __name__ == "__main__":
+    main()
diff --git a/verify_fields.py b/verify_fields.py
new file mode 100644
index 0000000..be7742b
--- /dev/null
+++ b/verify_fields.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""Verify URL fields were added to the index"""
+
+import os
+import requests
+
+# Load settings from ui/.env (KEY=VALUE lines; lines starting with '#' are comments)
+env_path = os.path.join(os.path.dirname(__file__), "ui", ".env")
+env_vars = {}
+
+if os.path.exists(env_path):
+    with open(env_path, encoding="utf-8") as f:
+        for line in f:
+            if '=' in line and not line.lstrip().startswith('#'):
+                key, value = line.strip().split('=', 1)
+                env_vars[key.strip()] = value.strip().strip('"\'')  # drop optional quotes
+
+SEARCH_ENDPOINT = (env_vars.get("SEARCH_ENDPOINT") or "").rstrip("/")  # avoid '//indexes'
+SEARCH_KEY = env_vars.get("SEARCH_KEY")
+SEARCH_INDEX_NAME = env_vars.get("SEARCH_INDEX_NAME", "segments")
+
+API_VERSION = "2024-07-01"
+
+def check_index():
+    """Print a report on the URL tracking fields; return True only if all exist with the expected type."""
+    if not SEARCH_ENDPOINT or not SEARCH_KEY:
+        print("❌ Missing SEARCH_ENDPOINT or SEARCH_KEY in .env")
+        return False
+    url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version={API_VERSION}"
+    headers = {"api-key": SEARCH_KEY}
+    response = requests.get(url, headers=headers, timeout=30)  # requests has no default timeout
+    if response.status_code != 200:
+        print(f"❌ Failed to get index: {response.status_code}")
+        print(response.text)
+        return False
+    
+    index = response.json()
+    fields = {f["name"]: f["type"] for f in index.get("fields", [])}
+    print("✅ Successfully connected to index!")
+    print(f"\nTotal fields: {len(fields)}")
+    print(f"\nChecking URL tracking fields:")
+    
+    url_fields = {
+        "source_url": "Edm.String",
+        "source_type": "Edm.String", 
+        "processed_at": "Edm.DateTimeOffset"
+    }
+    
+    all_present = True
+    for field, expected_type in url_fields.items():
+        if fields.get(field) == expected_type:
+            print(f"  ✅ {field}: {fields[field]}")
+        else:  # absent, or present with a different type
+            print(f"  ❌ {field}: {fields.get(field, 'MISSING')} (expected {expected_type})")
+            all_present = False
+    
+    if all_present:
+        print("\n🎉 SUCCESS! All URL tracking fields are present!")
+        print("\nYou can now:")
+        print("1. Restart your Streamlit app")
+        print("2. Process new videos - URLs will be stored automatically")
+    else:
+        print("\n⚠️  Some fields are missing or have the wrong type. Run the add script again.")
+    return all_present
+
+if __name__ == "__main__":
+    raise SystemExit(0 if check_index() else 1)  # non-zero exit when verification fails

From 2f1555a2f47295773e9905b19604d86eae625ec9 Mon Sep 17 00:00:00 2001
From: Martin Nwadiugwu <YOUR_GITHUB_EMAIL@example.com>
Date: Mon, 2 Mar 2026 18:46:08 -0600
Subject: [PATCH 6/8] Add .gitignore for macOS, JSONL, Python caches, and local
 venv

---
 .gitignore                                    | 181 +-----------------
 .../__pycache__/speech_batch.cpython-311.pyc  | Bin 22598 -> 0 bytes
 2 files changed, 9 insertions(+), 172 deletions(-)
 delete mode 100644 shared/__pycache__/speech_batch.cpython-311.pyc

diff --git a/.gitignore b/.gitignore
index 6e659d7..e29811b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,175 +1,12 @@
-local.settings.json
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
+# macOS
+.DS_Store
 
-# UV
-#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#uv.lock
+# JSON lines
+*.jsonl
-
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
-.pdm.toml
-.pdm-python
-.pdm-build/
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
 .env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-
-# Ruff stuff:
-.ruff_cache/
+# Python bytecode
+*.pyc
+__pycache__/
 
-# PyPI configuration file
-.pypirc
+# Local virtual env
+func_venv/
diff --git a/shared/__pycache__/speech_batch.cpython-311.pyc b/shared/__pycache__/speech_batch.cpython-311.pyc
deleted file mode 100644
index 58ca2888792699e4a1a7d3ce5b2dab4fcc63daf0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 22598
zcmdUXYm6LMc3xHWyQk;Lhn^IRA|+01In9SwT7|3C?vNab)KWv5Gn7}oR@yyXHDr_h
z(ybZ}*{%VkwXpHVguUyR42ug)){)jS=@<zD+b|5nS{VKl$Bvyv5>$RT4>%bnOC%1H
zKxPmm-#NG5-OW&j10*%mw{P9|<J@z;d+xdY-SP3Fg6G=Zx0_q16y@L2i~me|p1Z%7
zQk3@;L#Zl;YG^IBs`9s1)%ZJAP4RcSn#Omkm1$?IS(VbJTe)_wnrr8)`F5dN;IK@q
z*dD8nwM*4fd%QZ{o~TZ6c(yg!o~ll@r>oQLnd(e?TXh?U=UUs_JE}X_pND^Eb!Ypb
z>O=fq!2841hue=-AHjRE`lvBheat9TA2-IUyNrqI6J}OZUQvw6cNJsmLrqa`;v=6A
z@paRGQL4MWl+#E#bKjJ@@nvJ1xyRUUPMCWh2}&?_n0r>U#?E&$)hFTKy{Z`xy_>G?
zg<psNVfgnMk09Tp`0OxuAnjuad&+p+++pmZFhfJw69{`c2-}Tt-HR(4dl0w3pcEA2
zG?zrV_TsZU(UN`UEVrg)JcZVjji;#uvt;f;3$?Jtt7&6DN_@tcHO7td4b6DQ+--cu
z9FO(t0P@b^Q!zhd9Q4YaFb=(&tsXERH6KIIDA!@6ID*(`IR(`;7OSafd=@R4GoIs?
z>_N@XqvlGGcSpQdjKk*;f6z-mSYFzA0r7`8{%?3?W{sl=I~*x<FkjaAJklI7UgUPp
zq8(qr`?JPNJUX#9^9ZJmFCzZ4hKe!z5=Lk~-cI8fMt(LNQR6s9Nse@^tkt}60weaE
zu`slr!FHyNmr?KY#w&(KqimeKkv2}Dp9k=nOU(BD#=T^`ih4h1XiG}@wfe5Kq7<!*
zbzvF@t)6K%F4k7+cH?5DcTJzu=lg5I)R#D1Uoyp&X2aBvQM|q^>K&^gnmxPO?da$2
zW~*s8O{-X3zSy+%cGp;InR-ujuQUzQ(i`1&PnZ|Yj@7(k>T7;#eYGp}^{%+w>|D^-
zoA$+_H5R!=x*e-={_LrWe#+J_);mVaw2l_%^jEtpy0x~_Zdw+#2@!g)+iIa~M279v
zjqZA<)vX(xqSF=adaK#5Q$LXISc^)ryP|%<)SDgKydXGApIw??($AketzR%ZrVKlv
zuQt)kwH}pbm67{6n$s~`6mFY>dPS`;Oap0}4Zn`%X4}M&bgr0!TRMCA(4mf{+s(#h
zxLv*7M5`?mDX7%qyl7l(+GfM1$vD?-nbg_iUA)MN(_76IQ5V;EH0!NaVx+n~uelbo
zowrOwU%3_x$BKE<wtL*lCG$et?AVcklGFe4TBkvSL)p*z6Y38eN(<+O8qluW`s|vG
z-q$+~%skJkjSgFOz1`zMIw_}&iK1m*_2~;UsteQVp(e0LF3IACnFgk!QMZV?zHDC8
zg9aTf>L5~{a=R8b18uEc>aHBsmxvf_eF*t*#tWt`(;dHPHZDi{LDYi+Kn8Vt%?i?n
zHPqHb%Q~vRV%m{hk{CcCmcA;wZLfPaMhInbAwJ>NFaWh+1dr+`h7qJ0DO$E6$Pwx5
zQ4j*rksTeySC`H#5*?xa9iuJ`Z#20L?fPZ2X4Ng~0`-7t8Eae;H=2$Wxk788-oW%(
z#oHv!7R#zzFzR-_(W+ZkIpyZ@V$!U-1#%6uW!K$2dHdatSx&pD`OY;reWKa0-Sp|E
zWxM$^%qi+EMCNJuupn%=2;W*q`t!ZIu*}+9mRmrm*=ckQ^CQ)rrWNV&p5v`1D737(
z)7~O9Yt&Zik2}NZ>eg9PcpSh&!ks}vtR}nOM0Yv$Ll`#sx)jTDo1o9_IdV$mJPfBf
zwO>&h@B|OA4?lO0;OjlbRDgbzs&-y6Qzr1viwI?(Mm{4|&G1{A-!kC4S+^vaDq6AH
zyx^uTo7ddD*)e)uu$+GJ$e|<8%^f~8clbGXtln$ZJVvpIjQLuvQ*WEKnp>>ZB%6i5
zRI9zUR&RMRDa#h_WDR82>4Jo5?bd2y7uxcXau=?0@0EjK1j0Jlt_!=_>8#g{=Gukz
zwSzt-S_g@5cjvqky5it)86h4;Y3zIs9_s*{t>|Bx-fk;fTJ+oVBS!4=vkxEMFCqHV
z4sSKfgPrBU&S9|g?)Z}A=iX@NK-%54jx8QT@%Ktn?9DYhy*1mN<gZ#lEceE}=<b?L
z@l*0UOt4tai6^KRyUEeX*+b5g<UB(T4T{)H4)^vc@`0eF^AbGP3veEI*mx+DLrCSh
zX_|gFXPB$?wU+H>g$aq#X$0^VX&~(tBHp_lIPWPoT>rVG`o7I*4nI(F6i1DN;0?{u
ztQk8OBmum4l=qcyYagPln?ZSBQyl^uTK`F6n8JK(tqHiHo1H67(e04T(XZ4+v%b<Y
zEB*bik@$w-yl=XrI@pNTwcMOnikkvAcZ;vrTWjWm5M5DD30lRX0>{l<p>T_OuIsv>
zDZoM^{*Ub9dAz=ik4L_D(mzoClky*UMs6!Vpl^)%oyfTRVLYKR%FUw~9^c=A2jd%z
zfuh(H?mdPYAE8)WV)Ta3hLK8?koC(}luI<r{A^}?U-a8Z8=3e_-pYQ6*4+$Rh}p{Z
zpFB+%Ns_+~Zo%LW3j+*mP2!)5TU<K3uyFij?aK>qxRaiLVe!P-Gp81p@2Lmf87~a7
zz4rRTxusKQ7W?Td&sL6LdT8~D$H^hiRL)9XLX!c~1qW;P#9@jh9xI+DXD>OyA|fGR
z*JS%>J)&LW3rMkuk442`($X|-G>QJVoLCu>nC6_E?7>Mbpe!1iyWfZN*E6YhFh!WI
zVAwY!p{UAxjFgd%PuHi)oiVcU+_!QcqMn;UUBn{$kIBjQaBpIkcg$DMpIfLcEuT3z
z|H?vb{`m1T=NFgV>3A^Hz^TQv=a*~8&nzy_pITfvC(7uAJ9Xy#ay<Am2rj3^EWRWi
z1e~&*k<5`M@c%IXGd%yY$vy!bN46iv$0GJWi~#Ov@o7)KZxteB5H>KzMQK$rK@e+?
z6NKb|hW720quxrz2Oa~R?mu>1n7~dFV+E^<aZTl3{vyD*b@brD{>};kn@R&Hzp~!!
z06AKfj!DpG%#&UO;M`&ZoD>YSS#Jqa&6KsU$riD3pWAIiLR9O)CApZzNhC1vu?XeR
zU`aX6qDb4yL<YhwNt`7!x`jYK#>Rz6ipRx)=RHV_O?(eNkRvhqu%Q(cN860>$}mq{
zB|{3HC1tM?T>$$Okye#AIA&eBntDT7S0QVj^2Rx6soq@Gu^BMBrq$VR>*iIU(#j&R
z>~<ofAX%-!bKy>T0~SQMg@vmPlR+$)7xZF^cohy3`bg5t8W%HD*z;oF7FHn6B3BC^
zk4J5#L!QK<Mx%E*_B0cp1d>NepwuUyd}5W5|Gn*V%C8kapZaa}^C`E$^TPn3K%3H9
zuWny#1Zy10NkH)A405qS(XP_u4V@4N!u(be2&C<Zpm9=$7AO4Yl#Ps&-lW<G9w%d@
zZYB5|ra41@@lq(1CjYQvrPnn%(KPWY{wSuL?LT>*x8i!M+o<y<9Gj@Fx6~w`s9=Tg
zw54glTurPj)83{FguO1X$zB9kq6u;*i7I$dJ%UJMAKh`s<Yu2^AhH575I=-W;vD51
zi{?^SEgE>B4m=WgZ1alXX|lUql=u-OTEoY>00&EPf~MXh0wxGvlwOFWh=4=};<r+f
z+$hKY1LgQ$`nIM~*Ka*Mzbp0euJrt#+{gP=xVNdjA(x?@N#<gAO<0Y{94kbCOii<(
z66k>$_!*j`0{*2pGEv|Y0?X*K4W(Y38Q3Iky~jy|g~nJ#A2AzQ2iVRY^HV7dO`VjZ
z+2diDk-n9Q=XJ7xIq2L?>L5QGIVbbJV&t3@+`N-R+d)URuj57<S{W#p2FiN?p1i+X
zg%2?zH-k|@`NepMSH3Jc1qvPO?@#Wy!wok!-T^DAhy~Qwe`vTAI9OIi4e>;}Y4CM}
zHk7_tU>-0Ll*GG5Nu7TaS==JDLEiq@&6-zxO>s>eMZ`Vzix9naN|ND~;HII2vfWhO
zFx;$UhDfl21foF>l_p4f2op}ZAc)zxsZMuYFa@R7?1rGh5j1*YhMaBWY==|MOF2bL
zQI^E(hdSlvSD@-4>Ba@SDQxa7!qvQlf>l(CDl$YD@n6Ho`Z+kD?Wri8V>v<S-vJ7w
zC3A<D1NpEoRkpB0Wx1z8_GyuYrEQIS3oKC<m3${lsr~eRU7TLq8uRk;687U5Rk}qF
ztzuC%qTH2M60aua>!3Plm$e(TX-f6?y@E~CMNBgG>r5<5QXMPRWp80yV_TK}RJ|8y
z7!O{ux*dqXN6s{QbMw7sdk%`QKvz2l6<WV^96}4YVQ%>v=wUCa2TAp40I69Xqnh-=
zwO6|ki2E)jqGqCJGzuB&A@^<>;A97sxkjyJUNKv>Q0WIHk=?2>69Fknl}Cvv6fIy?
znww>gZMr3+31v|Inxt9_)d(9-Ajk;P(0-E#>q}@Wp^QrikPntI7*WV@u%J`KE*vl>
zL%X0lX%aU#0H|-IH*!&-6|y5I8)HGrrC@<4*h$XFfT4iuLnytGck-74_G0ahvJ$YF
z{3W7eewg9pF9pnw*k-(Z?3(kNML&J?`$N2h$5a~yqu>;-sA48oThS??%*0+2Wg>NW
zIKeFG&}cO&X)Y`JIo2astP+KKv(i809TnJJ+q`NY4E3U<X*w$Pv0sGVYR=PJeL>K?
zk=n2!_2ZWNC=tqOqLbx7H@wgx6@cMR66Gv+PvfZIBncjD^~XHAScHZ{tfMe@%%h+4
zP$hz?3DR=Y0mN-m#-OU4N3?F5R)d?PGNHTa@AQ_BY^A<h2Us$W>aME0Y0GS__MblA
z0b7K&9V&`I9X-n|6IzM=lxBY!`iHS_@9O4BSwRcYP4~Lky3#`7eGPOJm)+7iD4S3{
z>m}SVsZ|CbG)y;511N}dm5YMpzqmpU^-x?TCs+dD9jwh}*q`gwuhE$URV)?eZVFRn
z5xMwm(uW%ld--CsM0#)WO{D&He0-+4o%I@6&i*$qaE(|bv%4I=3$)zt>_iG?mXf7%
zthaC4nxfp2k^1GxZ;vf9-1YpPP9;FN0pgKukW4d&wsNmabedE1&P61ZU`A{0W@pVd
zt$;_MM<Ljq0OiGZXd$<VuOq-#wU4#V5F=fP>1zN2luJaz{KR1?z)nO17V2J{OdO(w
zd?CKh8UBKwh1!>mY;+|jcS~v<pk(8v0k~&311{sYn7-q^<#Yk}8J!MXG75)s1GVKw
zC@m7;i#Vf^y`l)sflf9C4};3%Ep*bCdHVvswj)SFV2vQ=0AGuh7(hcOkxo^v{{YI4
zV`o7~k>eHryoZ))%Rje4V95Gbn;mQ@K#1O%P~|r17na4_a6i(-JNObm4#&;P)eTWt
zSX#pg3U=p!PcJi<>F-*O9br;{-5WDJf{0z1H~J6)qgvsy&)&l)oEb*hCGVIhCX
z^l-`uwX~??z_5nHo0@C@^AxwVEV-WKJQ=HHnmr&jW;EZT(td`VpQVQ94L}T>m6~pz
zml@%&pQ6~GCg(jk7LiN9UF2S7C#M}`E#E=Xe?fVE8xGjPgvTi0zwO~JpG8PkKnM-#
z@QJ4MS&5$}z9sURNVs37GNYm48?i0?A)eH`G7YJCldOkggIv2M14>~mVsI@$AgkzI
zoVz$me1&YNQQfYihFk=GLNWZn&$Lc2fPU&~V)Z~yYm2S|{7s}S2!4Um8W&AaOBQM&
zpeINk?D(;}{bh>G;4ICyVj7?lC<8iGoD-RV{J)A+{}mtWX*hBi!a=4X@IN$cGKL2u
zrtirjFZJQ>_uwYRA)({hjKW=Hkxx<&PA3XLMp~5HGKNcy;IXN{GLKC|Arax*ev5|i
z$@GKpQz$qx55@_5J4dEW@baE|0h?WXa!&Sy^5#zv4;@iQf-m>frm_K4I3jgEPNTi;
zBU2XQDL0Bv(NeF!JhC0dc>11djBP%Mlo@lzj1r{74ktGp^E!!=!O6ej<gCBTBlCCu
zKNy+uzv9T0oYH0>&q68%pO}c3D0Uf>4#v<*8&mZCsmAg<B$dn4{X?|bWD|z&YD7S?
zX~QtHh6?Kr`r8hYGRul%{<qOu@e6R=tUP{mGn~rJK+IeQXb%y5;0LF5)363I#0SXu
z5-}HdJSJY;ZLwG2Upni2Xh<Mc#%F;2OojFU;vZ0H(*kxi7dp+pY4k1%GAb#Tm>UX`
z=pzNK_%_87P)GB->!xtC*m2{;o{#M<64>SOP_RN@PD%_Pxw1-@O)glbx@oed5WhyH
zrO{<ee3eT7d-U$_YghsD_bHf;F)h+;1+teso-UEJlwsd_GcA4>X+NP%1d2%DNe-S+
zGb1KM`cmu-6Te$gb@YJWmknFqiK!=ischOcWT_7Pc^b+{Y$9Z!ZBVYn!nk}dzqf0M
zr}%^`8yYpN39-T{*L^FDrIuX^-B?3Ep?*tA8I^b>-C0upDLipW8EPv??Q<)kI^R$o
zb@&PJhbSr30;n(nMM|n;;EAW?^OML90p}-=V^0y0g`vg+xP)XpKXGo;(B4p%l;FH2
zwJ3fbsXkH}1;Zrb>v)q`HCQ;}vk0Tr16w)qlkj_Bg_lGOs*DGQQoL@zhqP#-Vx5ME
zunJ)l`t#|5rpNb3=`}byky(7)$C>#Ug*215w#_Tb$I9OMr&Aw4rNU(%5W_hE-W!~~
z0lcTh@SYapy&-d(GEz<onwzYnLN|psVnNZFpgn2waAYGL(@$xU+0w$<a<ogqj6&<B
zg_@@!*fP^!f^C?tFYr;79?{(Bu;ql|rbRXy&5mwgG}#1_F(kG^Y~eU-bamJn>AsDl
zS*~ztq@A9=j2#C~IO?6tH7Nx8WYFX*hL9p-bS?8OY)Tl}5}R`~5~5DP&rnwo0gLaF
z^D7jeUxjfdnT-Gs|1yP={ebw0qBOD|>zL~j<yxe=qmD^#Dc+}a<m@AA`U*Utro2x_
zv=xw)<<U~Wy?kmTP8!mkpFTv&<db>tgO&dX)y6phYZn4;k<cz^BeaxSj0#3-DVgfP
z6(ZUN(A;LQs?nI>p<#}46r>AiC`bfPJRK{kzr!>B%EB^}NGOo}oi#`Zn!JWtDRRPj
z?2LQ1rJE3&rXVv;GUW86rH->~>_xBnry*nv6hju3iX^Zw2Z>+op<|RgyKd?@yTF-m
znak(BI^z)>J}@OP*?b)xN^Gh_S&zfOaIZ|HM#n8y&mwkCZI(hfM?x5l7k`N0XNk0q
z!2{CblLMdYjNai!Syt4~MC4}JEI79R45Y~+)RUkk#nuq$J$?A&d>gZ+_V*q4Y)0>6
zcZ`ElH$|$2((<X}UtX%6T{u^JZ3(9Y{}^RQPYK4jeKj0pwk`1|h&)asFa?i<9(S-L
zB8$S!O4t?a5Uo=ZU3XpN9c?&}+%94(rEDVG;DZxrlynmx`S4Z>qEs|+5ReKL;Ji`N
zj5M}auW#U(VAIbK`*sqikYK^oTJLev(R0oC@)ggFI`@t*BC+$q(ls;{V)yzF8yT;~
zBl`dUV2iULy<C5h*Wy`Wu9`ykl=?a;!KKY~=NPo;Up@3@#f$np6!fjuN}bHJyLy+n
z1@<Cl%djL1a5J34%`8$zdCZW938Xwla&sp<PpWBm+|;S%MK=eFb6DcH+}vr;Q)t(%
z%cen*h1Yy9&i4Nq=2!f4atN%r>19&YyY&7q$@u^d_yg(V++q_Zi)3xxFxmRk&9JTo
z@`(@w8o7ejxoLLQ%dUDQA(`Cq8Wmn|0L+u;&ynjSEu()34<t@L4nX5?h?f91)Z2S<
zDvppzUeRKdGA@dlB7W2+ku+5sL$(i|-ABM5Y3>BT17^DBo%^_1z6xPNc^_y(<(}o=
zqoZOAPQVw4jb-s+zVNrf^8S|ko&se^3ZJy2oKU{0zIkkvr2&rl2BCBos__VOMA_aG
zUyBT-vkwBLH*%dajFBD`OlL!Q7UxbIxi^$f+6U9$P~UkAYLI|;`K_dHSOUiQ+D1O6
zp3mFFcfBXUiC%H?Vj-T|$b_8Gp7HZRWMGuSTXEtUBMaqyBrTK+y1gw-?<CYu8@ZoQ
zS5+hTE)-TL5^%xE8Egk1Rq7AqF93QF9vpa}s2V;lNtG5<XYrB3fk*mNmYGm~kqk+e
z`Azlu3AB|IvICD%d{?<Omdt@x3{PDenpzXTV?P!a9jDZdBG$r~lY%TbxdBCyF&@q*
z-X>yili}NHYGd3PcZ!%t_4*S|ag(6-z~dxTRv5vt&B$R;n87I}dwKnoGX{B@7H{6C
z^%H3Qq(idz<i?aU6>I%eto2jI)IjSeyw*=VaO)>Ww0>$x>nAtTun0;u)s1QN0@iEJ
zv{9h#nKMm_vifr%Ll$NJjftqXjPKugcYxT);k;<gTIB=SCdtcYxHHB!?!kLRhcjC~
zzy;IAF&uW|>XX!QLE>?EoTEzYOz8QLKI>Wj<8+#~UAo6~5d+gl|AnQew0@*pMfA++
zi@3DoWup6I{wckzP_pc>qGhp$#KLUDXbuHIXoXupT<!C3ejTI}eq7*MMS$!GSI2q+
z@p~A_#i%astklJ^1b1^@0|Hq<q?W;lc+TA(oS$b|;3#h%VFd|71|K7_THkpD3X;KO
z2uY^=hIX0d#IGN6)Ng6mpC!;Net?AdB6$z<0<Ch$ReOTOmS^wzHg5**6<;K?DZ20Y
zQvaDij}g1@Re6!EU96Yj@zcHoda|hO&_ZvhM#|q{^7Vui3x26?!d0qZ=wZP{V;>tn
z+B5L*$vp$<n!+W8x5y=6yhR}LDjBY&7u`JYjlgNU1+y_+Ujx`DSj4tNAE~Z-HL@qT
zmw&NUZ?72jm-<i+$jg@TJ8#h;kSTwX2H;_oNE>W_BQBweba^)f1ru?@zt)}!ItLT~
z4UU_papNwD|Bm-amjo$PAG|l$AE-BB&2lXN2kIzKjntiqj}b3Z{u%i$!htH8?$~V~
z#!1M&4<=Ry3|qPamVA~8g$-r=O*rKlahv=B@Ny@x^lK3!D$qL<lGL*NBdSibFTrTG
z=N5wsalCDrW~a7BH;<qLf=6AG#xF8r-H%vz)pfd+L>haG;D-LA#D1cW+)(cvPBda1
zqxz#XvVQ@G7Rr+ZQt<)eBxAk@*9=C?sz>Msx1blg9{?sSjTAuBA*Pn^<Puo+kT$iC
z{<N)>kG*dzcZy-WMDuY>@8=yIhyG?|ER`Nl9|?gD>yGBfax47Sc>93p5|iwNQ^lZv
zd>p(|Fy{aFN~RugYk2hJ1wuAg{*CSPW7@}Kd*=^lK0c(vb!Wn@1hh>x(iq3s1EF)?
zLmUMQ<Q)VZtKx`x6A^>YfQ|vFd9U&Nz%!}M!3+pVY8$XW8I>}rNdU5VM{_c7tJjZ2
zbqZ0XpOe7+akK*S#_Y(P2{VFtPBwIc?emB}CGxTc1$bWEBSW50bnr3r@KH(Z3ZShi
zjHqh>y&HuG+JC{waYTuU<6>$(aOV3!%VQ&QBs50A)1y)*=Wu*v$^`n~m~cw(e8m}m
z=XGa7ERUSE1e%V|8W8j3$eanp9?!}1Jp~;XL4kqCnRKR{38Qo)(K{Tp-{_P_Qb}@-
z)d!xrq98mrye(NL=M;_c!M0$(^#{mBSa9GmCa{B=jQ1IP7Hvf8sbQ&&X-8$uuL1LG
z!2HDk^Uw6(V*tAtV0h1xG{%2)#{SToc74DcO4OtJSAWb0Q3r7Sl|a-1R(@@~=|OQ8
zx5Lv>)W)0oEU<k23hc?qR@T#O|3{34JKkMgg@ixVS*D+na3>A#lBO51D87$WjIX6;
zi#1svP}qMVhp|U0f=C7e2=2u=D`8}YlqAbZYa4}>OR$Sul*R3(@`vK7rvnrb*#He<
z!+%OLpKM!MEU@5WG${TOVV|Z+VjD`j559ZA_I!U-d;S9Lf##5KJgdURACvE2kwchE
z{4Y2FUifh%MNnEf^nVJUo0ZcCJ3MdRB2)3NDe1o<hvvX75GmA}oz<?JU$1%jGV3+u
zbhGQML3DHLWC#e<hj37uC{dFcb4B46`x$eI|4GG@bCXj3DjW=U$&<qID{6@0s}uLb
z2m$KB6<kj5AA`Vdn2{4<68S+L9+L5J3e`;c@nLiluP0$?Kp4o&H&_an6hl_=vJ!8o
z)BC%FR!UUml|8K0UT!~+O!>*y<kn2A%BblADQ1Ue`%8Zes5HOM0enZ<>CYP-o(K4O
zG4wNLXYTwGRMP7-l~>968FGFWjyuI$&KitOn{|0N5TX1KFAXr#f5w`QA1Ea>PMN6e
zNyKGis3jXQZ_I|~jd9%~%!|jocHj$KKdK`|TS6@)OcatSbAHZvnMfJYf`(TtMi`bQ
zv;@$oKIE61Y>|`4aX%zhc;N$eg-{5l*Reil{c;c<$T=r(KkA3W_&`nE7%SLxD(O8=
z#woxs05{NjX&6Q3$?%zOgX3l!%8uM)E84sLJV2va7|SKcZciAGQj53}lTO|PXKX|k
z5d)8rz-LC`GM|6JH2bMAu~Br!hMH#Y57SWH@wSoP5<emlEoaOr4lSn~DW_zVU^qV>
z|Jj0#abtqyw=@2M3S+pUQ;O0P3`?hY3{%GR4QTd0P=UuLpxl_LZwI?$3B2qbORSjQ
z9a3H}@|S%`At!%QBh=$XfBi_FNO3Vswux)p>^)r3y-TTNkCwcS%1Mq?{9h`HLkS!y
zbNsUvoekEL=II>y*~+M&5ArtJoQfUq9?wzz$K-9R0}*ZA0s0v(!}{U@T^`hLa1L1L
z{1676FB|qJVy&B77No6#wBr-<E0io`JQls$aQbw6Lx+u9>?l3pOKey!evuOH^v7QQ
zqL_a%r@wcx8`Z6{2_f#mtzkCkw~A0al#w8JMf^K*s149au(bgjC*rd9w<v5XY<^AJ
zAr|Ftz0h^t@(%v#2yvW>I7!YdIizdi>%fs;ayd+4zXC^IRHW;ce0!JgAi6tGEWA8_
z{`7JUKd4kYdFK4NrP{I6XO8h*8*ziOeuo@J2;U$d8SAlPqdX~gfea&q7>ttZ{RoZs
zBM0t;_xohtyS#-=6Xh0UmoVP4(=a^5#RWD_U~Nx^YG5otiu?o=kjj@?f1Fm#xX!Ld
z8ts~tBWznqSAYKuAL|7;&;~p$<(97%z$HnR-@fE0od|{asbX(g|5q8}v6RLO!DU0B
zf#8GEyo;81W*`rxMC2MM5B@03yTU0AF~V{<PV;~1VQV5Pu7_R<U`rsGpK6jliM37E
z=}T@(pMq8@<K3VNEY`SXbdQQY%GQi0yUXe#4E)8Ln3M7gZmw1{x()nd6O-C65rKS-
zoUg;dO*sE&xJ*kB7v_7n@}eyX95-uU>%nA3kaQFz*hPUHg3oRVgn=K6tMD&vg?9-A
znOzAw1afEU^pk%TkHGl)u^?$W>=LiL-6!_5XHK84EiK?@KTa&U5BZ_Xr(RpYFMrft
zJGFRzd0|QH1(~>y`LW>w34bl*pHNLC9_4R<>>=Ot<a~jgaB-fc*B>E=0d|*s0uGMY
zyWN&tY`o6snuA<o-y`4q<oqEye?rcmlS4{HiGGQm<>p`?MUY5>CR_f-)=T0DV)3t7
zyD;jQU81$;c@`xelQbvkz^6n1I~pfb#M4E?Mx~{2ro%*?e%@$Z6b20=w~@mAPl<h;
zgQ)p-idGa*evmg38o9nKb%b8M@ypeq;Nb7kz!!!MxjYT3hc47x9XaS(3ux<@m-k7;
zkw2b-_=rqZ+8Xy#zN$v6i!||ZWS-<WSqNpOg($0{0wxeYwKeVK|B8wXbgwa9C)LOO
zKz6@%FL+t}{*jJ2_=_~*<fyg{8;&Q!BBU=;ke-P1n&fSaf~rA?vI{)b$e$uo!M8uW
qoD%PTM_vzRJW?5D@c%kAD9l1RNU}02Ej|?<kszZSan8=$k^c>lMnV7p


From 7a013521fc492a24cd88f89c8124380b66e708f1 Mon Sep 17 00:00:00 2001
From: Martin Nwadiugwu <YOUR_GITHUB_EMAIL@example.com>
Date: Mon, 2 Mar 2026 19:06:16 -0600
Subject: [PATCH 7/8] Update ui_search, remove obsolete ui_search2, add
 .gitignore

---
 ui/manage_videos.py      |  213 ++++++++
 ui/system_diagnostics.py |   75 +++
 ui/ui_search.py          | 1124 +++++---------------------------------
 ui/upload_transcribe.py  |  461 ++++++++++++++++
 4 files changed, 872 insertions(+), 1001 deletions(-)
 create mode 100644 ui/manage_videos.py
 create mode 100644 ui/system_diagnostics.py
 create mode 100644 ui/upload_transcribe.py

diff --git a/ui/manage_videos.py b/ui/manage_videos.py
new file mode 100644
index 0000000..b5cce69
--- /dev/null
+++ b/ui/manage_videos.py
@@ -0,0 +1,213 @@
+"""
+manage_videos.py - Manage Videos page for the Video Annotation Platform
+"""
+
+import streamlit as st
+import pandas as pd
+import io
+import time
+
+import ui_search
+
+def show_manage_videos_page():
+    """Render the Manage Videos page: URL-coverage analysis, filtered listing, per-video deletion and CSV export."""
+    st.header("📚 Manage Stored Videos")
+    st.info("View, search, and manage all processed videos and their source URLs")
+    # Every control below queries Azure Search, so only an error is shown when it is not configured.
+    if not ui_search.SEARCH_ENDPOINT or not ui_search.SEARCH_KEY:
+        st.error("Azure Search not configured. Cannot retrieve video list.")
+    else:
+        # Report whether the index exposes the URL tracking fields
+        url_status = ui_search.check_url_fields_status()
+        
+        if url_status['fields_exist']:
+            st.success("✅ URL tracking fields are configured")
+        else:
+            st.warning(f"⚠️ Missing URL fields: {', '.join(url_status['missing_fields'])}")
+        
+        # URL coverage analysis (on demand: it fetches every stored video)
+        if st.button("📊 Analyze URL Data Coverage"):
+            with st.spinner("Analyzing..."):
+                all_videos = ui_search.get_stored_videos(include_missing=True)
+                
+                with_urls = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']]
+                without_urls = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']]
+                
+                col1, col2, col3 = st.columns(3)
+                col1.metric("Total Videos", len(all_videos))
+                col2.metric("✅ With URL Data", len(with_urls), f"{len(with_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%")
+                col3.metric("⚠️ Missing URL Data", len(without_urls), f"{len(without_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%")
+                
+                # By type breakdown (a missing/empty source_type is counted as 'unknown')
+                st.subheader("Breakdown by Source Type")
+                type_counts = {}
+                for v in all_videos:
+                    t = v.get('source_type') or 'unknown'
+                    type_counts[t] = type_counts.get(t, 0) + 1
+                
+                cols = st.columns(len(type_counts) if type_counts else 1)
+                for i, (stype, count) in enumerate(sorted(type_counts.items())):
+                    icon = "🎬" if stype == "youtube" else "📄" if stype == "direct" else "📁" if stype == "upload" else "❓"
+                    cols[i % len(cols)].metric(f"{icon} {stype}", count)
+                
+                if without_urls:
+                    with st.expander(f"Videos without URL data ({len(without_urls)})"):
+                        st.info("These were likely processed before URL tracking was enabled")
+                        for v in without_urls[:20]:  # only the first 20 are listed
+                            st.text(f"• {v.get('video_id')}")
+        
+        st.markdown("---")
+        
+        # Filters
+        st.subheader("Filter Videos")
+        col1, col2 = st.columns(2)
+        
+        with col1:
+            filter_video_id = st.text_input("Filter by Video ID (optional)")
+        with col2:
+            filter_options = ["All", "With URL Data Only", "Missing URL Data Only", "youtube", "direct", "upload", "unknown"]
+            filter_source_type = st.selectbox("Filter by Source Type", options=filter_options, index=0)
+        
+        # Load videos button
+        load_clicked = st.button("🔍 Load Videos", type="primary")
+        
+        # Handle a deletion requested through a Delete button's on_click callback (stored in session state)
+        if st.session_state.get('video_to_delete'):
+            vid_to_delete = st.session_state.video_to_delete
+            
+            with st.spinner(f"Deleting {vid_to_delete}..."):
+                success = ui_search.delete_video_by_id(vid_to_delete)
+            
+            if success:
+                # Remove from cache immediately so the list below no longer shows the deleted video
+                if st.session_state.get('stored_videos_cache'):
+                    st.session_state.stored_videos_cache = [
+                        v for v in st.session_state.stored_videos_cache 
+                        if v.get('video_id') != vid_to_delete
+                    ]
+                st.success(f"✅ Deleted {vid_to_delete}")
+                st.session_state.delete_success = True
+            else:
+                st.error(f"❌ Failed to delete {vid_to_delete}")
+            
+            # Clear the trigger so the rerun does not attempt the deletion again
+            st.session_state.video_to_delete = None
+            time.sleep(0.5)  # brief pause before rerunning (presumably so the status message is visible)
+            st.rerun()
+        
+        # Load videos if button clicked
+        if load_clicked:
+            with st.spinner("Retrieving videos..."):
+                
+                # The two "URL data" filters are applied client-side (substring match on the ID)
+                if filter_source_type == "Missing URL Data Only":
+                    all_videos = ui_search.get_stored_videos(include_missing=True)
+                    videos = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']]
+                    if filter_video_id.strip():
+                        videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()]
+                elif filter_source_type == "With URL Data Only":
+                    all_videos = ui_search.get_stored_videos(include_missing=True)
+                    videos = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']]
+                    if filter_video_id.strip():
+                        videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()]
+                else:
+                    source_type = None if filter_source_type == "All" else filter_source_type
+                    videos = ui_search.get_stored_videos(
+                        video_id=filter_video_id.strip() or None,  # stripped: the ID is passed to get_stored_videos as a filter value
+                        source_type=source_type,
+                        include_missing=True,
+                        limit=1000
+                    )
+                
+                st.session_state.stored_videos_cache = videos
+                st.session_state.videos_loaded = True
+                st.success(f"Found {len(videos)} videos")
+        
+        # Display videos from the session cache so the list survives reruns
+        if st.session_state.get('stored_videos_cache'):
+            videos = st.session_state.stored_videos_cache
+            
+            # Metrics
+            st.markdown("---")
+            cols = st.columns(4)
+            
+            type_counts = {}
+            for v in videos:
+                t = v.get('source_type') or 'unknown'
+                type_counts[t] = type_counts.get(t, 0) + 1
+            
+            cols[0].metric("Total", len(videos))
+            cols[1].metric("YouTube", type_counts.get('youtube', 0))
+            cols[2].metric("Direct", type_counts.get('direct', 0))
+            cols[3].metric("Upload", type_counts.get('upload', 0))
+            
+            # Group by type
+            st.markdown("---")
+            st.subheader("Video List")
+            
+            videos_by_type = {}
+            for v in videos:
+                stype = v.get('source_type') or 'unknown'
+                if stype not in videos_by_type:
+                    videos_by_type[stype] = []
+                videos_by_type[stype].append(v)
+            
+            # Display by category: known types first, then any other stored type so no video is hidden
+            known_types = ['youtube', 'direct', 'upload', 'unknown']
+            extra_types = sorted(t for t in videos_by_type if t not in known_types)
+            for source_type in [t for t in known_types + extra_types if t in videos_by_type]:
+                
+                type_videos = videos_by_type[source_type]
+                icon = "🎬" if source_type == "youtube" else "📄" if source_type == "direct" else "📁" if source_type == "upload" else "❓"
+                
+                with st.expander(f"{icon} {source_type.upper()} ({len(type_videos)} videos)", expanded=(source_type == 'youtube')):
+                    for i, video in enumerate(type_videos, 1):
+                        vid = video.get('video_id', 'unknown')
+                        src_url = video.get('source_url', '')
+                        processed = video.get('processed_at', 'unknown')
+                        
+                        has_url = bool(src_url)
+                        status_icon = "✅" if has_url else "⚠️"
+                        
+                        with st.container():
+                            cols = st.columns([4, 1])
+                            
+                            with cols[0]:
+                                st.write(f"**{status_icon} {i}. {vid}**")
+                                st.caption(f"Processed: {processed}")
+                                
+                                if src_url:
+                                    display_url = src_url[:80] + "..." if len(str(src_url)) > 80 else src_url
+                                    st.code(display_url)
+                                    if str(src_url).startswith('http'):
+                                        st.markdown(f"[Open Source ↗]({src_url})")
+                                else:
+                                    st.warning("No source URL stored")
+                            
+                            with cols[1]:
+                                # Bind vid as a lambda default so each button requests deletion of its own video
+                                st.button(
+                                    "🗑️ Delete", 
+                                    key=f"del_{vid}_{i}_{source_type}",
+                                    on_click=lambda v=vid: setattr(st.session_state, 'video_to_delete', v)
+                                )
+                            
+                            st.markdown("---")
+            
+            # Export the currently loaded list as CSV
+            st.markdown("---")
+            if st.button("📥 Export to CSV"):
+                export_df = pd.DataFrame([
+                    {
+                        'video_id': v.get('video_id'),
+                        'source_type': v.get('source_type') or 'unknown',
+                        'source_url': v.get('source_url', ''),
+                        'has_url_data': bool(v.get('source_url')),
+                        'processed_at': v.get('processed_at', 'unknown')
+                    }
+                    for v in videos
+                ])
+                
+                csv_buffer = io.StringIO()
+                export_df.to_csv(csv_buffer, index=False)
+                st.download_button("Download CSV", csv_buffer.getvalue(), "video_list.csv", "text/csv")
\ No newline at end of file
diff --git a/ui/system_diagnostics.py b/ui/system_diagnostics.py
new file mode 100644
index 0000000..790b7ec
--- /dev/null
+++ b/ui/system_diagnostics.py
@@ -0,0 +1,75 @@
+"""
+system_diagnostics.py - System Diagnostics page for the Video Annotation Platform
+"""
+
+import streamlit as st
+
+import ui_search
+
+def show_system_diagnostics_page():
+    """Render the System Diagnostics page: configuration checks, an on-demand index schema check, and debug dumps."""
+    st.header("⚙️ System Diagnostics")
+    st.info("Check system configuration and troubleshoot issues")
+    
+    # Configuration status: each entry maps a display label to a truthiness check
+    st.subheader("Configuration Status")
+    
+    config_checks = {
+        "Azure Speech (SPEECH_KEY)": bool(ui_search.SPEECH_KEY),
+        "Azure OpenAI (AZURE_OPENAI_KEY)": bool(ui_search.AZURE_OPENAI_KEY),
+        "Azure Search (SEARCH_KEY)": bool(ui_search.SEARCH_KEY),
+        "Azure Storage (AZURE_STORAGE_KEY)": bool(ui_search.AZURE_STORAGE_KEY),
+        "Search Function (SEARCH_FN_URL)": bool(ui_search.SEARCH_FN_URL),
+        "yt-dlp installed": ui_search.check_yt_dlp()
+    }
+    
+    cols = st.columns(2)
+    for i, (name, status) in enumerate(config_checks.items()):  # alternate entries between the two columns
+        icon = "✅" if status else "❌"
+        cols[i % 2].write(f"{icon} {name}: {'OK' if status else 'Not configured'}")
+    
+    # Index schema check (runs only when the button is clicked)
+    st.markdown("---")
+    st.subheader("Index Schema Check")
+    
+    if st.button("🔍 Check Index Schema"):
+        with st.spinner("Fetching schema..."):
+            schema = ui_search.debug_check_index_schema()
+            
+            if isinstance(schema, dict):  # any non-dict result is reported as a failure in the else branch
+                st.success(f"Index: {schema['index_name']}")
+                st.write(f"Key Field: `{schema['key_field']}`")
+                
+                # URL fields status
+                if schema.get('has_all_url_fields'):
+                    st.success("✅ All URL tracking fields present")
+                else:
+                    st.warning(f"⚠️ Missing fields: {', '.join(schema.get('missing_url_fields', []))}")
+                
+                # Show all fields, marking the key field, URL-named fields and facetable fields
+                with st.expander("View all fields"):
+                    for field in schema['fields']:
+                        key = "🔑" if field['key'] else ""
+                        url = "🔗" if 'url' in field['name'].lower() else ""
+                        facet = "📊" if field.get('facetable') else ""
+                        st.caption(f"{key}{url}{facet} `{field['name']}` ({field['type']}) - facetable: {field.get('facetable', False)}")
+                
+                st.session_state.index_schema_cache = schema  # store the fresh schema in the session cache
+            else:
+                st.error(f"Schema check failed: {schema}")
+    
+    # Debug info
+    st.markdown("---")
+    st.subheader("Debug Information")
+    
+    with st.expander("Session State"):
+        st.json({
+            k: str(v)[:100] + "..." if len(str(v)) > 100 else v  # long values are shown as a 100-character preview
+            for k, v in st.session_state.items()
+        })
+    
+    with st.expander("Recent Processing Debug"):
+        if st.session_state.get('debug_info'):
+            st.json(st.session_state['debug_info'])
+        else:
+            st.info("No debug info yet. Process a video first.")
\ No newline at end of file
diff --git a/ui/ui_search.py b/ui/ui_search.py
index 15ef358..eec7833 100644
--- a/ui/ui_search.py
+++ b/ui/ui_search.py
@@ -1,12 +1,11 @@
 """
-ui_search.py - Streamlit Web Interface for Video Segment Search & Upload
-
-Features:
-- Direct Azure Speech API integration (bypasses Azure Function)
-- URL tracking for all processed videos (source_url, source_type, processed_at)
-- Handles existing videos without URL data gracefully
-- Batch processing with CSV upload
-- Video management interface with filtering and deletion
+ui_search.py - Main Streamlit entry point
+Contains:
+- Environment configuration
+- Shared utility functions
+- Sidebar navigation
+- Search Segments page (default)
+- Imports and calls the other three page modules
 """
 
 import os
@@ -80,46 +79,16 @@
     'index_schema_cache': None,
     'stored_videos_cache': None,
     'url_fields_status': None,
-    'debug_info': {}
+    'debug_info': {},
+    'video_to_delete': None,
+    'delete_success': False,
+    'videos_loaded': False
 }
 
 for key, value in session_state_defaults.items():
     if key not in st.session_state:
         st.session_state[key] = value
 
-# =============================================================================
-# SIDEBAR NAVIGATION
-# =============================================================================
-
-with st.sidebar:
-    st.header("Navigation")
-    page = st.radio("Select Page", [
-        "🔎 Search Segments", 
-        "⬆️ Upload & Transcribe",
-        "📚 Manage Videos",
-        "⚙️ System Diagnostics"
-    ])
-    
-    # Settings for search page
-    if page == "🔎 Search Segments":
-        st.header("Search Settings")
-        mode = st.selectbox("Search Mode", ["keyword", "hybrid", "vector"], 
-                          index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE) if DEFAULT_MODE in ("keyword", "hybrid", "vector") else 1)
-        top = st.slider("Results", 1, 50, DEFAULT_TOP)
-        k = st.slider("Vector k", 5, 200, DEFAULT_K)
-    
-    # Quick actions
-    st.markdown("---")
-    if st.button("🔄 Refresh Schema Cache"):
-        st.session_state.index_schema_cache = None
-        st.session_state.url_fields_status = None
-        st.success("Cache cleared! Navigate to System Diagnostics to refresh.")
-    
-    st.markdown("---")
-    st.caption("Video Annotation Platform v2.1")
-    st.caption("With URL Tracking")
-
-
 # =============================================================================
 # UTILITY FUNCTIONS
 # =============================================================================
@@ -131,53 +100,44 @@ def ms_to_ts(ms: int) -> str:
     h, m = divmod(m, 60)
     return f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}"
 
-
 def sanitize_id(id_string: str) -> str:
     """Sanitize ID for Azure Search (alphanumeric, hyphens, underscores only)."""
     if not id_string:
         id_string = "unknown"
-    
     sanitized = re.sub(r'[^\w\-]', '_', str(id_string))
-    
+    sanitized = re.sub(r'_+', '_', sanitized)
+    sanitized = sanitized.strip('_')
+    if not sanitized:
+        sanitized = "unknown"
     if sanitized.startswith('_') or sanitized.startswith('-'):
         sanitized = 'id' + sanitized
-    
     if len(sanitized) > 1024:
         hash_suffix = hashlib.md5(sanitized.encode()).hexdigest()[:16]
         sanitized = sanitized[:1000] + "_" + hash_suffix
-    
     return sanitized
 
-
 def detect_url_type(url: str) -> str:
     """Detect if URL is YouTube, direct media, or unknown."""
     if not url:
         return "unknown"
-    
     url_lower = str(url).lower().strip()
-    
     youtube_patterns = [
         r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)',
         r'youtube\.com\/watch\?v=',
         r'youtu\.be\/',
         r'youtube\.com\/shorts\/'
     ]
-    
     for pattern in youtube_patterns:
         if re.search(pattern, url_lower):
             return "youtube"
-    
     media_extensions = ['.mp4', '.m4a', '.mp3', '.wav', '.mov', '.avi', '.mkv', '.webm']
     if any(url_lower.endswith(ext) for ext in media_extensions):
         return "direct"
-    
     cloud_patterns = ['box.com', 'drive.google.com', 'dropbox.com', 'onedrive']
     if any(pattern in url_lower for pattern in cloud_patterns):
         return "direct"
-    
     return "unknown"
 
-
 def check_yt_dlp() -> bool:
     """Check if yt-dlp is installed."""
     try:
@@ -186,7 +146,6 @@ def check_yt_dlp() -> bool:
     except:
         return False
 
-
 def call_api(url: str, payload: dict) -> dict:
     """Make API call to search function."""
     try:
@@ -196,7 +155,6 @@ def call_api(url: str, payload: dict) -> dict:
     except requests.exceptions.RequestException as e:
         raise RuntimeError(f"API call failed: {str(e)}")
 
-
 # =============================================================================
 # AZURE SEARCH SCHEMA FUNCTIONS
 # =============================================================================
@@ -205,20 +163,16 @@ def debug_check_index_schema():
     """Check index schema and verify URL tracking fields."""
     if not SEARCH_ENDPOINT or not SEARCH_KEY or not SEARCH_INDEX_NAME:
         return "Search not configured - check SEARCH_ENDPOINT, SEARCH_KEY, and SEARCH_INDEX_NAME"
-    
     url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version=2024-07-01"
     headers = {"api-key": SEARCH_KEY}
-    
     try:
         r = requests.get(url, headers=headers, timeout=30)
         if r.status_code == 200:
             schema = r.json()
             key_field = None
             fields_info = []
-            
             url_fields = ['source_url', 'source_type', 'processed_at']
             found_url_fields = []
-            
             for field in schema.get("fields", []):
                 field_info = {
                     "name": field.get("name"),
@@ -230,13 +184,10 @@ def debug_check_index_schema():
                     "facetable": field.get("facetable", False)
                 }
                 fields_info.append(field_info)
-                
                 if field.get("key", False):
                     key_field = field.get("name")
-                
                 if field.get("name") in url_fields:
                     found_url_fields.append(field.get("name"))
-            
             return {
                 "index_name": schema.get("name"),
                 "key_field": key_field,
@@ -250,12 +201,10 @@ def debug_check_index_schema():
     except Exception as e:
         return f"Error checking index: {str(e)}"
 
-
 def get_index_schema():
     """Get cached schema or fetch new one."""
     if st.session_state.index_schema_cache:
         return st.session_state.index_schema_cache
-    
     schema_info = debug_check_index_schema()
     if isinstance(schema_info, dict):
         st.session_state.index_schema_cache = schema_info
@@ -263,12 +212,10 @@ def get_index_schema():
     else:
         raise RuntimeError(f"Cannot fetch index schema: {schema_info}")
 
-
 def check_url_fields_status():
     """Check URL fields status with caching."""
     if st.session_state.url_fields_status:
         return st.session_state.url_fields_status
-    
     try:
         schema = get_index_schema()
         if isinstance(schema, dict):
@@ -282,7 +229,6 @@ def check_url_fields_status():
             return result
     except:
         pass
-    
     return {
         'fields_exist': False,
         'found_fields': [],
@@ -290,7 +236,6 @@ def check_url_fields_status():
         'key_field': None
     }
 
-
 # =============================================================================
 # AZURE SPEECH API FUNCTIONS
 # =============================================================================
@@ -299,14 +244,11 @@ def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any]
     """Submit transcription directly to Azure Speech API."""
     if not SPEECH_KEY:
         raise RuntimeError("SPEECH_KEY not configured")
-    
     endpoint = f"https://{SPEECH_REGION}.api.cognitive.microsoft.com/speechtotext/transcriptions:submit?api-version={SPEECH_API_VERSION}"
-    
     headers = {
         "Ocp-Apim-Subscription-Key": SPEECH_KEY,
         "Content-Type": "application/json"
     }
-    
     payload = {
         "contentUrls": [media_url],
         "locale": "en-US",
@@ -319,67 +261,51 @@ def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any]
             "timeToLiveHours": 24
         }
     }
-    
     try:
         r = requests.post(endpoint, headers=headers, json=payload, timeout=60)
         r.raise_for_status()
-        
         operation_url = r.headers.get("Location")
         if not operation_url:
             result = r.json()
             operation_url = result.get("self") or result.get("links", {}).get("self")
-        
         if not operation_url:
             raise RuntimeError("No operation URL returned from Speech API")
-        
         return {"operation_url": operation_url, "video_id": video_id}
-        
     except requests.exceptions.HTTPError as e:
         error_msg = f"Speech API error {r.status_code}: {r.text}"
         if r.status_code == 401:
             error_msg = "Azure Speech API authentication failed. Check SPEECH_KEY."
         raise RuntimeError(error_msg)
 
-
 def poll_transcription_operation(operation_url: str) -> Dict[str, Any]:
     """Poll transcription operation status."""
     if not SPEECH_KEY:
         raise RuntimeError("SPEECH_KEY not configured")
-    
     headers = {"Ocp-Apim-Subscription-Key": SPEECH_KEY}
-    
     try:
         poll_url = operation_url.replace("/transcriptions:submit/", "/transcriptions/")
         st.session_state['debug_poll_url'] = poll_url
-        
         r = requests.get(poll_url, headers=headers, timeout=30)
         r.raise_for_status()
         return r.json()
-        
     except requests.exceptions.RequestException as e:
         raise RuntimeError(f"Failed to poll transcription: {str(e)}")
 
-
 def get_transcription_from_result(result_data: Dict) -> Dict[str, Any]:
     """Get transcription JSON from result files."""
     if not SPEECH_KEY:
         raise RuntimeError("SPEECH_KEY not configured")
-    
     headers = {"Ocp-Apim-Subscription-Key": SPEECH_KEY}
-    
     try:
         links = result_data.get("links", {})
         files_url = links.get("files")
-        
         if not files_url:
             if "combinedRecognizedPhrases" in result_data:
                 return result_data
             raise RuntimeError("No files URL in result")
-        
         r = requests.get(files_url, headers=headers, timeout=30)
         r.raise_for_status()
         files_data = r.json()
-        
         for file in files_data.get("values", []):
             if file.get("kind") == "Transcription":
                 content_url = file.get("links", {}).get("contentUrl")
@@ -387,13 +313,10 @@ def get_transcription_from_result(result_data: Dict) -> Dict[str, Any]:
                     content_r = requests.get(content_url, timeout=60)
                     content_r.raise_for_status()
                     return content_r.json()
-        
         raise RuntimeError("No transcription file found in results")
-        
     except requests.exceptions.RequestException as e:
         raise RuntimeError(f"Failed to get transcription result: {str(e)}")
 
-
 # =============================================================================
 # EMBEDDING AND INDEXING WITH URL TRACKING
 # =============================================================================
@@ -402,19 +325,15 @@ def get_embeddings(texts: list) -> list:
     """Get embeddings from Azure OpenAI."""
     if not AZURE_OPENAI_ENDPOINT or not AZURE_OPENAI_KEY:
         raise RuntimeError("Azure OpenAI not configured")
-    
     url = f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{AZURE_OPENAI_DEPLOYMENT}/embeddings?api-version=2024-02-01"
-    
     headers = {
         "api-key": AZURE_OPENAI_KEY,
         "Content-Type": "application/json"
     }
-    
     payload = {
         "input": texts,
         "model": "text-embedding-3-small"
     }
-    
     try:
         r = requests.post(url, headers=headers, json=payload, timeout=60)
         r.raise_for_status()
@@ -423,47 +342,34 @@ def get_embeddings(texts: list) -> list:
     except Exception as e:
         raise RuntimeError(f"Embedding failed: {str(e)}")
 
-
 def index_segments_direct(video_id: str, segments: list, source_url: str = None, source_type: str = None) -> Dict[str, Any]:
     """
     Index segments to Azure Cognitive Search with URL tracking.
     """
     if not SEARCH_ENDPOINT or not SEARCH_KEY:
         raise RuntimeError("Azure Search not configured")
-    
     schema_info = get_index_schema()
     key_field = schema_info.get("key_field")
     available_fields = {f.get("name") for f in schema_info.get("fields", [])}
-    
     if not key_field:
         raise RuntimeError("No key field found in index")
-    
-    # Check URL field availability
     url_fields_available = {
         'source_url': 'source_url' in available_fields,
         'source_type': 'source_type' in available_fields,
         'processed_at': 'processed_at' in available_fields
     }
-    
-    # Generate embeddings
     texts = [seg.get("text", "") for seg in segments]
     try:
         embeddings = get_embeddings(texts)
     except Exception as e:
         st.warning(f"Embedding failed, indexing without vectors: {e}")
         embeddings = [None] * len(segments)
-    
-    # Prepare documents
     documents = []
     processed_timestamp = datetime.utcnow().isoformat() + "Z"
-    
     for i, (seg, embedding) in enumerate(zip(segments, embeddings)):
         safe_video_id = sanitize_id(video_id)
         doc_id = f"{safe_video_id}_{i}"
-        
         doc = {"@search.action": "upload", key_field: doc_id}
-        
-        # Core fields
         field_mappings = {
             "video_id": safe_video_id,
             "segment_id": str(seg.get("segment_id", i)),
@@ -472,38 +378,27 @@ def index_segments_direct(video_id: str, segments: list, source_url: str = None,
             "end_ms": int(seg.get("end_ms", 0)),
             "pred_labels": seg.get("pred_labels", []) if seg.get("pred_labels") else []
         }
-        
-        # URL tracking fields
         if url_fields_available['source_url']:
             field_mappings["source_url"] = str(source_url) if source_url else ""
         if url_fields_available['source_type']:
             field_mappings["source_type"] = str(source_type) if source_type else "unknown"
         if url_fields_available['processed_at']:
             field_mappings["processed_at"] = processed_timestamp
-        
-        # Only add existing fields
         for field_name, value in field_mappings.items():
             if field_name in available_fields:
                 doc[field_name] = value
-        
-        # Handle embedding
         embedding_field = next((f for f in ["embedding", "embeddings", "vector", "vectors"] if f in available_fields), None)
         if embedding and embedding_field:
             try:
                 doc[embedding_field] = [float(x) for x in embedding]
             except (ValueError, TypeError):
                 pass
-        
         documents.append(doc)
-    
-    # Upload to search index
     url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/index?api-version=2024-07-01"
     headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"}
     payload = {"value": documents}
-    
     try:
         r = requests.post(url, headers=headers, json=payload, timeout=60)
-        
         if r.status_code >= 400:
             error_detail = r.text
             try:
@@ -511,15 +406,11 @@ def index_segments_direct(video_id: str, segments: list, source_url: str = None,
             except:
                 pass
             raise RuntimeError(f"Indexing failed: HTTP {r.status_code}\n{error_detail}")
-        
         result = r.json()
-        
-        # Check for partial failures
         if r.status_code == 207:
             failed_docs = [item for item in result.get("value", []) if not item.get("status", False)]
             if failed_docs:
                 st.warning(f"Partial indexing failure: {len(failed_docs)} documents failed")
-        
         return {
             "indexed": len(documents),
             "video_id": video_id,
@@ -528,22 +419,17 @@ def index_segments_direct(video_id: str, segments: list, source_url: str = None,
             "source_type_stored": bool(source_type and url_fields_available['source_type']),
             "url_fields_available": url_fields_available
         }
-        
     except Exception as e:
         raise RuntimeError(f"Indexing failed: {str(e)}")
 
-
 def process_transcription_to_segments(transcription_data: Dict, video_id: str) -> list:
     """Convert Azure Speech transcription to segments."""
     segments = []
-    
     for i, phrase in enumerate(transcription_data.get("recognizedPhrases", [])):
         offset = phrase.get("offsetInTicks", 0) // 10000
         duration = phrase.get("durationInTicks", 0) // 10000
-        
         nbest = phrase.get("nBest", [])
         text = nbest[0].get("display", "") if nbest else ""
-        
         segments.append({
             "segment_id": i,
             "video_id": video_id,
@@ -552,108 +438,107 @@ def process_transcription_to_segments(transcription_data: Dict, video_id: str) -
             "end_ms": offset + duration,
             "pred_labels": []
         })
-    
     return segments
 
-
 # =============================================================================
-# VIDEO RETRIEVAL AND MANAGEMENT
+# VIDEO RETRIEVAL AND MANAGEMENT (get_stored_videos, delete_video_by_id)
 # =============================================================================
 
 def get_stored_videos(video_id: str = None, source_type: str = None, 
                       include_missing: bool = True, limit: int = 1000) -> List[Dict]:
     """
     Retrieve videos from search index with URL data.
+    FALLBACK METHOD: Does not use faceting (requires facetable field).
+    Instead pages through matching documents and keeps one normalized entry per video_id.
     """
     if not SEARCH_ENDPOINT or not SEARCH_KEY:
         return []
-    
     url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/search?api-version=2024-07-01"
     headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"}
-    
-    # Build filter
+    try:
+        schema = get_index_schema()
+        available_fields = {f['name'] for f in schema.get('fields', [])}
+    except Exception:  # schema lookup is best-effort; do not swallow KeyboardInterrupt/SystemExit
+        available_fields = set()
     filters = []
     if video_id:
-        filters.append(f"video_id eq '{video_id}'")
-    if source_type:
-        filters.append(f"source_type eq '{source_type}'")
-    
+        escaped_id = video_id.replace("'", "''")
+        filters.append(f"video_id eq '{escaped_id}'")
+    if source_type and source_type != "All":
+        escaped_type = source_type.replace("'", "''")
+        filters.append(f"source_type eq '{escaped_type}'")
     filter_query = " and ".join(filters) if filters else None
-    
-    # Get available fields
-    schema = get_index_schema()
-    available_fields = {f['name'] for f in schema.get('fields', [])}
-    
-    # Build select
     select_fields = ["video_id"]
-    for field in ["source_url", "source_type", "processed_at"]:
-        if field in available_fields:
+    optional_fields = ["source_url", "source_type", "processed_at"]
+    for field in optional_fields:
+        if not available_fields or field in available_fields:
             select_fields.append(field)
-    
-    payload = {
-        "search": "*",
-        "select": ",".join(select_fields),
-        "top": limit
-    }
-    
-    if "processed_at" in available_fields:
-        payload["orderby"] = "processed_at desc"
-    if filter_query:
-        payload["filter"] = filter_query
-    
+    all_videos = {}
+    skip = 0
+    batch_size = 1000
+    max_iterations = 100
     try:
-        r = requests.post(url, headers=headers, json=payload, timeout=30)
-        r.raise_for_status()
-        docs = r.json().get("value", [])
-        
-        # Deduplicate and normalize
-        seen = set()
-        unique_docs = []
-        for doc in docs:
-            vid = doc.get('video_id')
-            if vid and vid not in seen:
-                seen.add(vid)
-                # Normalize missing values
-                doc['source_type'] = doc.get('source_type') or 'unknown'
-                doc['source_url'] = doc.get('source_url') or ''
-                doc['processed_at'] = doc.get('processed_at') or 'unknown'
-                unique_docs.append(doc)
-        
-        return unique_docs
-        
+        for _ in range(max_iterations):
+            payload = {
+                "search": "*",
+                "select": ",".join(select_fields),
+                "top": batch_size,
+                "skip": skip,
+                "count": True
+            }
+            if filter_query:
+                payload["filter"] = filter_query
+            if "processed_at" in available_fields:
+                payload["orderby"] = "processed_at desc"
+            r = requests.post(url, headers=headers, json=payload, timeout=30)
+            r.raise_for_status()
+            data = r.json()
+            docs = data.get("value", [])
+            total_count = data.get("@odata.count", 0)
+            if not docs:
+                break
+            for doc in docs:
+                vid = doc.get('video_id')
+                if vid and vid not in all_videos:
+                    all_videos[vid] = {
+                        'video_id': vid,
+                        'source_type': doc.get('source_type') or 'unknown',
+                        'source_url': doc.get('source_url') or '',  # key may be present with a null value
+                        'processed_at': doc.get('processed_at') or 'unknown'  # same null handling as source_type
+                    }
+            skip += len(docs)
+            if skip >= total_count or len(docs) < batch_size or len(all_videos) >= limit:  # stop once limit is met
+                break
+        videos = list(all_videos.values())[:limit]
+        return videos
     except Exception as e:
         st.error(f"Failed to retrieve videos: {e}")
+        import traceback
+        st.error(traceback.format_exc())
         return []
 
-
 def delete_video_by_id(video_id: str) -> bool:
     """Delete all segments for a video_id from the index."""
     if not SEARCH_ENDPOINT or not SEARCH_KEY:
         return False
-    
-    # Find all documents
     search_url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/search?api-version=2024-07-01"
     headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"}
-    
+    escaped_id = video_id.replace("'", "''")
     payload = {
         "search": "*",
-        "filter": f"video_id eq '{video_id}'",
+        "filter": f"video_id eq '{escaped_id}'",
         "select": "video_id",
         "top": 1000
     }
-    
     try:
         r = requests.post(search_url, headers=headers, json=payload, timeout=30)
         r.raise_for_status()
         docs = r.json().get("value", [])
-        
         if not docs:
+            st.warning(f"No documents found for video_id: {video_id}")
             return False
-        
-        # Delete documents
         schema = get_index_schema()
         key_field = schema.get('key_field', 'id')
-        
         delete_docs = []
         for doc in docs:
             doc_key = doc.get(key_field) or doc.get('id')
@@ -662,21 +547,19 @@ def delete_video_by_id(video_id: str) -> bool:
                     "@search.action": "delete",
                     key_field: doc_key
                 })
-        
         if not delete_docs:
+            st.warning("No valid documents to delete")
             return False
-        
         delete_url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/index?api-version=2024-07-01"
         r = requests.post(delete_url, headers=headers, json={"value": delete_docs}, timeout=60)
         r.raise_for_status()
-        
         return True
-        
     except Exception as e:
         st.error(f"Delete failed: {e}")
+        import traceback
+        st.error(traceback.format_exc())
         return False
 
-
 # =============================================================================
 # AZURE STORAGE FUNCTIONS
 # =============================================================================
@@ -689,7 +572,6 @@ def generate_video_id(filename: str) -> str:
     hash_suffix = hashlib.md5(clean_name.encode()).hexdigest()[:8]
     return f"vid_{clean_name[:50]}_{hash_suffix}"
 
-
 def test_sas_url(sas_url: str) -> Tuple[bool, str]:
     """Test if SAS URL is accessible."""
     try:
@@ -698,26 +580,20 @@ def test_sas_url(sas_url: str) -> Tuple[bool, str]:
     except Exception as e:
         return (False, str(e))
 
-
 def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> Optional[str]:
     """Generate SAS token for blob access."""
     if not AZURE_STORAGE_KEY:
         return None
-    
     try:
         expiry = datetime.now(timezone.utc) + timedelta(hours=expiry_hours)
         expiry_str = expiry.strftime('%Y-%m-%dT%H:%M:%SZ')
-        
         account_key = base64.b64decode(AZURE_STORAGE_KEY)
         canonicalized_resource = f"/blob/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}"
-        
         string_to_sign = (
             f"r\n\n{expiry_str}\n{canonicalized_resource}\n\n\nhttps\n2020-12-06\nb\n\n\n\n\n\n\n"
         )
-        
         signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest()
         signature = base64.b64encode(signed_hmac).decode('utf-8')
-        
         sas_params = {
             'sv': '2020-12-06',
             'sr': 'b',
@@ -726,35 +602,27 @@ def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> Optional
             'spr': 'https',
             'sig': signature
         }
-        
         return '&'.join([f"{k}={urllib.parse.quote(v, safe='')}" for k, v in sas_params.items()])
-        
     except Exception as e:
         st.error(f"SAS generation error: {e}")
         return None
 
-
 def upload_to_azure_blob_fixed(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]:
     """Upload to Azure Blob using REST API."""
     if not AZURE_STORAGE_KEY:
         return None, "Azure Storage key not configured"
-    
     try:
         url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}"
-        
         date_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
         content_length = len(file_bytes)
-        
         string_to_sign = (
             f"PUT\n\n\n{content_length}\n\napplication/octet-stream\n\n\n\n\n\n\n"
             f"x-ms-blob-type:BlockBlob\nx-ms-date:{date_str}\nx-ms-version:2020-12-06\n"
             f"/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}"
         )
-        
         account_key = base64.b64decode(AZURE_STORAGE_KEY)
         signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest()
         signature = base64.b64encode(signed_hmac).decode('utf-8')
-        
         headers = {
             "x-ms-date": date_str,
             "x-ms-version": "2020-12-06",
@@ -763,52 +631,39 @@ def upload_to_azure_blob_fixed(file_bytes: bytes, blob_name: str) -> Tuple[Optio
             "Content-Length": str(content_length),
             "Authorization": f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}"
         }
-        
         r = requests.put(url, data=file_bytes, headers=headers, timeout=300)
-        
         if r.status_code not in [201, 200]:
             return None, f"Upload failed: HTTP {r.status_code}"
-        
         sas_token = generate_sas_token_fixed(blob_name)
         if not sas_token:
             return None, "Failed to generate SAS token"
-        
         sas_url = f"{url}?{sas_token}"
-        
         is_valid, test_msg = test_sas_url(sas_url)
         if not is_valid:
             return None, f"SAS URL validation failed: {test_msg}"
-        
         return sas_url, None
-        
     except Exception as e:
         import traceback
         return None, f"Upload error: {str(e)}"
 
-
 def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]:
     """Upload using Azure SDK (preferred method)."""
     try:
         from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
-        
         connection_string = (
             f"DefaultEndpointsProtocol=https;"
             f"AccountName={AZURE_STORAGE_ACCOUNT};"
             f"AccountKey={AZURE_STORAGE_KEY};"
             f"EndpointSuffix=core.windows.net"
         )
-        
         blob_service = BlobServiceClient.from_connection_string(connection_string)
         container_client = blob_service.get_container_client(INPUT_CONTAINER)
-        
         try:
             container_client.create_container()
         except Exception:
             pass
-        
         blob_client = container_client.get_blob_client(blob_name)
         blob_client.upload_blob(file_bytes, overwrite=True)
-        
         sas_token = generate_blob_sas(
             account_name=AZURE_STORAGE_ACCOUNT,
             container_name=INPUT_CONTAINER,
@@ -818,44 +673,34 @@ def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optiona
             expiry=datetime.now(timezone.utc) + timedelta(hours=24),
             protocol="https"
         )
-        
         sas_url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}?{sas_token}"
-        
         is_valid, test_msg = test_sas_url(sas_url)
         if not is_valid:
             return None, f"SAS URL validation failed: {test_msg}"
-        
         return sas_url, None
-        
     except ImportError:
         return None, "azure-storage-blob not installed"
     except Exception as e:
         import traceback
         return None, f"SDK upload failed: {str(e)}"
 
-
 def save_segments_to_blob(video_id: str, segments: list) -> str:
     """Save segments JSON to blob storage."""
     if not AZURE_STORAGE_KEY:
         raise RuntimeError("Azure Storage key not configured")
-    
     blob_name = f"{video_id}_segments.json"
     url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{SEGMENTS_CONTAINER}/{blob_name}"
-    
     json_bytes = json.dumps(segments, indent=2).encode('utf-8')
     content_length = len(json_bytes)
-    
     date_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
     string_to_sign = (
         f"PUT\n\n\n{content_length}\n\napplication/json\n\n\n\n\n\n\n"
         f"x-ms-blob-type:BlockBlob\nx-ms-date:{date_str}\nx-ms-version:2020-12-06\n"
         f"/{AZURE_STORAGE_ACCOUNT}/{SEGMENTS_CONTAINER}/{blob_name}"
     )
-    
     account_key = base64.b64decode(AZURE_STORAGE_KEY)
     signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest()
     signature = base64.b64encode(signed_hmac).decode('utf-8')
-    
     headers = {
         "x-ms-date": date_str,
         "x-ms-version": "2020-12-06",
@@ -864,22 +709,17 @@ def save_segments_to_blob(video_id: str, segments: list) -> str:
         "Content-Length": str(content_length),
         "Authorization": f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}"
     }
-    
     r = requests.put(url, data=json_bytes, headers=headers, timeout=60)
     r.raise_for_status()
-    
     return blob_name
 
-
 def download_youtube_audio(youtube_url: str, output_path: str, 
                           progress_callback=None) -> Tuple[Optional[str], Optional[str]]:
     """Download audio from YouTube."""
     if not check_yt_dlp():
         return None, "yt-dlp not installed. Run: pip install yt-dlp"
-    
     if not youtube_url or not youtube_url.strip():
         return None, "YouTube URL is empty"
-    
     try:
         cmd = [
             "yt-dlp",
@@ -892,46 +732,35 @@ def download_youtube_audio(youtube_url: str, output_path: str,
             "-o", output_path,
             youtube_url.strip()
         ]
-        
-        # Handle missing Node.js
         try:
             node_check = subprocess.run(["which", "node"], capture_output=True, text=True)
             if node_check.returncode != 0:
                 cmd.extend(["--extractor-args", "youtube:player_client=web"])
         except:
             pass
-        
         if progress_callback:
             progress_callback(15, "Downloading from YouTube...")
-        
         result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
-        
         if result.returncode != 0:
             error_msg = result.stderr[:500]
             if "JavaScript runtime" in error_msg:
                 error_msg += "\n\n💡 Tip: Install Node.js or run: pip install yt-dlp --upgrade"
             return None, f"yt-dlp failed: {error_msg}"
-        
-        # Find downloaded file
         if os.path.exists(output_path):
             return output_path, None
-        
         base = output_path.rsplit('.', 1)[0]
         for ext in ['.m4a', '.mp3', '.webm', '.opus']:
             alt_path = base + ext
             if os.path.exists(alt_path):
                 return alt_path, None
-        
         return None, "Download completed but file not found"
-        
     except subprocess.TimeoutExpired:
         return None, "Download timed out after 10 minutes"
     except Exception as e:
         return None, f"Error: {str(e)}"
 
-
 # =============================================================================
-# MAIN VIDEO PROCESSING
+# MAIN VIDEO PROCESSING (process_single_video)
 # =============================================================================
 
 def process_single_video(url: str, custom_id: Optional[str] = None,
@@ -952,104 +781,73 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
         "source_type": source_type,
         "url_stored": False
     }
-    
     try:
-        # Validate URL
         url_type = detect_url_type(url)
         if url_type == "unknown":
             result["status"] = "failed"
             result["error"] = "Unknown URL type. Must be YouTube or direct media URL."
             return result
-        
-        # Generate video ID
         video_id = custom_id.strip() if custom_id else generate_video_id(f"batch_{url}")
         result["video_id"] = video_id
-        
         current, total = overall_progress
         base_progress = int((current / total) * 100) if progress_bar else 0
-        
         if status_text:
             status_text.text(f"[{current}/{total}] Processing: {video_id}")
-        
         media_url = None
-        
-        # Handle YouTube
         if url_type == "youtube":
             if not check_yt_dlp():
                 result["status"] = "failed"
                 result["error"] = "yt-dlp not installed"
                 return result
-            
             import tempfile
             with tempfile.TemporaryDirectory() as tmpdir:
                 if status_text:
                     status_text.text(f"[{current}/{total}] Downloading from YouTube...")
-                
                 output_path = f"{tmpdir}/youtube_{video_id}.m4a"
                 downloaded_path, error = download_youtube_audio(url.strip(), output_path)
-                
                 if error:
                     result["status"] = "failed"
                     result["error"] = f"Download failed: {error}"
                     return result
-                
                 with open(downloaded_path, 'rb') as f:
                     file_bytes = f.read()
-                
                 blob_name = f"batch_youtube_{video_id}_{int(time.time())}.m4a"
-                
                 if status_text:
                     status_text.text(f"[{current}/{total}] Uploading to Azure...")
-                
-                # Try SDK first, fallback to REST
                 sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name)
                 if error and ("not installed" in error or "SDK" in error):
                     sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name)
-                
                 if error:
                     result["status"] = "failed"
                     result["error"] = f"Upload failed: {error}"
                     return result
-                
                 media_url = sas_url
-        
-        # Handle Direct URL
         elif url_type == "direct":
             media_url = url.strip()
             if status_text:
                 status_text.text(f"[{current}/{total}] Using direct URL...")
-        
         if not media_url:
             result["status"] = "failed"
             result["error"] = "No media URL available"
             return result
-        
-        # Submit to Speech API
         if status_text:
             status_text.text(f"[{current}/{total}] Submitting to Speech API...")
-        
         submit_result = submit_transcription_direct(video_id, media_url)
         operation_url = submit_result.get("operation_url")
-        
         if not operation_url:
             result["status"] = "failed"
             result["error"] = "No operation URL returned"
             return result
-        
-        # Poll for completion
         max_polls = 120
         transcription_data = None
-        
         for i in range(max_polls):
             time.sleep(POLL_SECONDS)
             poll_result = poll_transcription_operation(operation_url)
             status = poll_result.get("status", "unknown")
-            
             if progress_bar:
                 poll_progress = min(int((i / max_polls) * 20), 20)
                 overall = base_progress + int((1 / total) * 80) + int((1 / total) * poll_progress)
                 progress_bar.progress(min(overall, 99))
-            
             if status.lower() == "succeeded":
                 transcription_data = get_transcription_from_result(poll_result)
                 break
@@ -1058,23 +856,15 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                 result["status"] = "failed"
                 result["error"] = f"Transcription failed: {error_msg}"
                 return result
-        
         if not transcription_data:
             result["status"] = "failed"
             result["error"] = "Transcription timed out"
             return result
-        
-        # Process and index
         if status_text:
             status_text.text(f"[{current}/{total}] Processing segments...")
-        
         segments = process_transcription_to_segments(transcription_data, video_id)
         result["segments_count"] = len(segments)
-        
-        # Save to blob
         save_segments_to_blob(video_id, segments)
-        
-        # Index with URL tracking
         try:
             index_result = index_segments_direct(
                 video_id,
@@ -1082,36 +872,71 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                 source_url=url,
                 source_type=source_type
             )
-            
             result["url_stored"] = index_result.get('source_url_stored', False)
             result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents"
-            
-            # Debug info
             st.session_state['debug_info'][video_id] = {
                 'url_fields_available': index_result.get('url_fields_available', {}),
                 'source_url_stored': index_result.get('source_url_stored', False),
                 'source_type_stored': index_result.get('source_type_stored', False)
             }
-            
         except Exception as e:
             result["index_status"] = f"Indexing failed: {str(e)}"
-        
         result["status"] = "success"
-        
     except Exception as e:
         result["status"] = "failed"
         result["error"] = str(e)
         import traceback
         result["error"] += f"\n{traceback.format_exc()}"
-    
     return result
 
+# =============================================================================
+# IMPORT PAGE MODULES (after utilities are defined)
+# =============================================================================
+
+import upload_transcribe
+import manage_videos
+import system_diagnostics
 
 # =============================================================================
-# PAGE 1: SEARCH SEGMENTS
+# SIDEBAR NAVIGATION
+# =============================================================================
+
+with st.sidebar:
+    st.header("Navigation")
+    page = st.radio("Select Page", [
+        "🔎 Search Segments", 
+        "⬆️ Upload & Transcribe",
+        "📚 Manage Videos",
+        "⚙️ System Diagnostics"
+    ])
+    
+    # mode/top/k are bound only while the Search page is selected; an unrecognized DEFAULT_MODE falls back to index 1 ("hybrid")
+    if page == "🔎 Search Segments":
+        st.header("Search Settings")
+        mode = st.selectbox("Search Mode", ["keyword", "hybrid", "vector"], 
+                          index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE) if DEFAULT_MODE in ("keyword", "hybrid", "vector") else 1)
+        top = st.slider("Results", 1, 50, DEFAULT_TOP)
+        k = st.slider("Vector k", 5, 200, DEFAULT_K)
+    
+    # Quick action: reset the cached index schema and URL-field status in session state (both set to None)
+    st.markdown("---")
+    if st.button("🔄 Refresh Schema Cache"):
+        st.session_state.index_schema_cache = None
+        st.session_state.url_fields_status = None
+        st.success("Cache cleared! Navigate to System Diagnostics to refresh.")
+    
+    st.markdown("---")
+    st.caption("Video Annotation Platform v2.1")
+    st.caption("With URL Tracking")
+
+# =============================================================================
+# PAGE ROUTING
 # =============================================================================
 
 if page == "🔎 Search Segments":
+    # -------------------------------------------------------------------------
+    # SEARCH SEGMENTS PAGE (embedded)
+    # -------------------------------------------------------------------------
     st.header("Search Indexed Video Segments")
     
     if not SEARCH_FN_URL:
@@ -1142,7 +967,6 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
                     start_ms, end_ms = h.get("start_ms", 0), h.get("end_ms", 0)
                     vid, seg, score = h.get("video_id", ""), h.get("segment_id", ""), h.get("score")
                     
-                    # Show URL info if available
                     source_url = h.get('source_url', '')
                     source_type = h.get('source_type', '')
                     url_indicator = ""
@@ -1169,713 +993,11 @@ def process_single_video(url: str, custom_id: Optional[str] = None,
             except Exception as e:
                 st.error(f"Search failed: {e}")
 
-
-# =============================================================================
-# PAGE 2: UPLOAD & TRANSCRIBE
-# =============================================================================
-
 elif page == "⬆️ Upload & Transcribe":
-    st.header("Upload Video for Transcription")
-    
-    # Check URL fields status
-    url_status = check_url_fields_status()
-    
-    if url_status['fields_exist']:
-        st.success("✅ URL Tracking Enabled - Original source URLs will be stored")
-    else:
-        st.warning(f"""
-        ⚠️ **Partial URL Tracking** - Missing fields: {', '.join(url_status['missing_fields'])}
-        
-        Videos will still be processed, but URL information will be limited.
-        Add missing fields to your Azure Search index for full functionality.
-        """)
-    
-    # Check Azure configuration
-    azure_configured = bool(AZURE_STORAGE_KEY) and bool(SPEECH_KEY)
-    if not azure_configured:
-        st.error("⚠️ Azure Storage and Speech keys required. Check .env file.")
-    
-    # Source selection
-    source_type = st.radio("Select Source", 
-                          ["File Upload", "Direct URL", "YouTube", "📁 Batch CSV Upload"],
-                          horizontal=True)
-    
-    media_url = None
-    video_id = None
-    file_bytes = None
-    yt_url = None
-    csv_df = None
-    detected_source_type = "unknown"
-    
-    # --- File Upload ---
-    if source_type == "File Upload":
-        if not azure_configured:
-            st.info("Please configure Azure Storage to enable file upload")
-        else:
-            uploaded_file = st.file_uploader(
-                "Choose video/audio file",
-                type=["mp4", "avi", "mov", "mkv", "m4a", "mp3", "wav"],
-                accept_multiple_files=False
-            )
-            
-            if uploaded_file:
-                st.success(f"📁 {uploaded_file.name} ({uploaded_file.size / 1024 / 1024:.1f} MB)")
-                file_bytes = uploaded_file.getvalue()
-                video_id = generate_video_id(uploaded_file.name)
-                detected_source_type = "upload"
-                st.info("File ready for upload")
-    
-    # --- Direct URL ---
-    elif source_type == "Direct URL":
-        url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/ ...")
-        
-        if url_input.strip():
-            media_url = url_input.strip()
-            video_id = generate_video_id(url_input)
-            detected_source_type = "direct"
-            st.success("✅ URL validated")
-    
-    # --- YouTube ---
-    elif source_type == "YouTube":
-        yt_url = st.text_input(
-            "YouTube URL",
-            placeholder="https://youtube.com/watch?v= ...",
-            value=st.session_state.yt_url_value,
-            key="yt_url_input"
-        )
-        
-        # Update session state
-        if yt_url != st.session_state.yt_url_value:
-            st.session_state.yt_url_value = yt_url
-            try:
-                st.rerun()
-            except:
-                pass
-        
-        # Check yt-dlp
-        if not check_yt_dlp():
-            st.warning("yt-dlp not installed")
-            if st.button("Install yt-dlp"):
-                with st.spinner("Installing..."):
-                    subprocess.run(["pip", "install", "-q", "yt-dlp"])
-                try:
-                    st.rerun()
-                except:
-                    st.info("Please refresh the page")
-        elif yt_url and yt_url.strip():
-            video_id = generate_video_id(f"yt_{yt_url.strip()}")
-            detected_source_type = "youtube"
-            st.success("YouTube URL ready")
-    
-    # --- Batch CSV Upload ---
-    elif source_type == "📁 Batch CSV Upload":
-        st.subheader("📁 Batch Process Videos from CSV")
-        
-        csv_file = st.file_uploader(
-            "Upload CSV file",
-            type=["csv"],
-            help="CSV must contain a column with video URLs"
-        )
-        
-        if csv_file:
-            try:
-                # Read CSV with flexible parsing
-                try:
-                    csv_df = pd.read_csv(csv_file)
-                except Exception:
-                    csv_file.seek(0)
-                    csv_df = pd.read_csv(csv_file, header=None)
-                    csv_df.columns = [f"column_{i}" for i in range(len(csv_df.columns))]
-                
-                # Handle case where column names are URLs
-                url_like_columns = []
-                for col in csv_df.columns:
-                    col_str = str(col).strip()
-                    if detect_url_type(col_str) != "unknown":
-                        url_like_columns.append(col)
-                
-                if url_like_columns and len(csv_df.columns) == 1:
-                    url_col_name = csv_df.columns[0]
-                    new_row = {url_col_name: url_col_name}
-                    csv_df = pd.concat([pd.DataFrame([new_row]), csv_df], ignore_index=True)
-                
-                st.success(f"✅ Loaded CSV with {len(csv_df)} rows")
-                
-                # Column selection
-                url_column = st.selectbox("Select column containing video URLs", options=csv_df.columns.tolist())
-                
-                id_column_options = ["Auto-generate"] + [c for c in csv_df.columns if c != url_column]
-                id_column = st.selectbox("Select column for custom Video ID (optional)", options=id_column_options, index=0)
-                
-                # Extract and validate URLs
-                urls_raw = csv_df[url_column].dropna().astype(str).tolist()
-                urls_to_process = [u.strip() for u in urls_raw if u.strip()]
-                
-                # Preview
-                with st.expander(f"Preview URLs ({len(urls_to_process)} found)"):
-                    for i, url in enumerate(urls_to_process[:10], 1):
-                        url_type = detect_url_type(url)
-                        icon = "🎬" if url_type == "youtube" else "📄" if url_type == "direct" else "❓"
-                        st.text(f"{i}. {icon} {url[:80]}...")
-                
-                # Validate
-                valid_urls = []
-                invalid_urls = []
-                for url in urls_to_process:
-                    url_type = detect_url_type(str(url))
-                    if url_type in ["youtube", "direct"]:
-                        valid_urls.append(url)
-                    else:
-                        invalid_urls.append(url)
-                
-                col1, col2, col3 = st.columns(3)
-                col1.metric("Total", len(urls_to_process))
-                col2.metric("✅ Valid", len(valid_urls))
-                col3.metric("❌ Invalid", len(invalid_urls))
-                
-                # Store in session state
-                st.session_state['batch_urls'] = valid_urls
-                st.session_state['batch_df'] = csv_df
-                st.session_state['batch_url_column'] = url_column
-                st.session_state['batch_id_column'] = id_column
-                
-            except Exception as e:
-                st.error(f"Error reading CSV: {e}")
-                import traceback
-                st.error(traceback.format_exc())
-    
-    # Custom ID input
-    custom_id = st.text_input("Custom Video ID (optional)")
-    if custom_id.strip() and source_type != "📁 Batch CSV Upload":
-        video_id = custom_id.strip()
-    
-    # Determine if we can process
-    can_process = False
-    if source_type == "File Upload":
-        can_process = file_bytes is not None and azure_configured
-    elif source_type == "Direct URL":
-        can_process = media_url is not None and len(str(media_url).strip()) > 0
-    elif source_type == "YouTube":
-        yt_url_to_check = st.session_state.get('yt_url_value', '')
-        can_process = len(str(yt_url_to_check).strip()) > 0 and check_yt_dlp()
-    elif source_type == "📁 Batch CSV Upload":
-        can_process = (st.session_state.get('batch_urls') and 
-                      len(st.session_state.get('batch_urls', [])) > 0 and 
-                      azure_configured and
-                      not st.session_state.get('batch_processing', False))
-    
-    # Process button
-    button_text = "🚀 Start Transcription"
-    if source_type == "📁 Batch CSV Upload":
-        count = len(st.session_state.get('batch_urls', []))
-        button_text = f"🚀 Process {count} Videos from CSV"
-    
-    if st.button(button_text, type="primary", disabled=not can_process):
-        
-        # --- BATCH PROCESSING ---
-        if source_type == "📁 Batch CSV Upload":
-            st.session_state.batch_processing = True
-            st.session_state.batch_results = []
-            
-            urls = st.session_state.get('batch_urls', [])
-            csv_df = st.session_state.get('batch_df')
-            url_column = st.session_state.get('batch_url_column')
-            id_column = st.session_state.get('batch_id_column')
-            
-            total = len(urls)
-            st.info(f"Starting batch processing of {total} videos...")
-            
-            # Progress UI
-            overall_progress = st.progress(0)
-            status_text = st.empty()
-            results_container = st.container()
-            
-            results = []
-            for idx, url in enumerate(urls, 1):
-                # Get custom ID if specified
-                custom_vid_id = None
-                if id_column != "Auto-generate":
-                    row = csv_df[csv_df[url_column] == url]
-                    if not row.empty:
-                        custom_vid_id = str(row[id_column].iloc[0])
-                        custom_vid_id = re.sub(r'[^\w\s-]', '', custom_vid_id).strip().replace(' ', '_')[:50]
-                
-                # Detect source type
-                url_type = detect_url_type(url)
-                src_type = "youtube" if url_type == "youtube" else "direct"
-                
-                # Process
-                result = process_single_video(
-                    url=url,
-                    custom_id=custom_vid_id,
-                    source_type=src_type,
-                    progress_bar=overall_progress,
-                    status_text=status_text,
-                    overall_progress=(idx, total)
-                )
-                
-                results.append(result)
-                st.session_state.batch_results = results
-                
-                # Update progress
-                progress_pct = int((idx / total) * 100)
-                overall_progress.progress(progress_pct)
-                
-                # Show result
-                with results_container:
-                    if result['status'] == 'success':
-                        url_stored = "✅ URL saved" if result.get('url_stored') else "⚠️ URL not stored"
-                        st.success(f"✅ [{idx}/{total}] {result['video_id']}: {result['segments_count']} segments ({url_stored})")
-                    else:
-                        error_msg = result.get('error', 'Unknown error')[:200]
-                        st.error(f"❌ [{idx}/{total}] Failed: {error_msg}...")
-                
-                time.sleep(1)  # Rate limiting
-            
-            # Final summary
-            overall_progress.progress(100)
-            status_text.text("Batch processing complete!")
-            
-            successful = [r for r in results if r['status'] == 'success']
-            failed = [r for r in results if r['status'] == 'failed']
-            
-            st.markdown("---")
-            st.subheader("📊 Batch Processing Summary")
-            
-            col1, col2, col3 = st.columns(3)
-            col1.metric("Total", total)
-            col2.metric("Successful", len(successful), f"{len(successful)/total*100:.1f}%" if total > 0 else "0%")
-            col3.metric("Failed", len(failed), f"{len(failed)/total*100:.1f}%" if total > 0 else "0%")
-            
-            # Detailed results
-            with st.expander("View Detailed Results"):
-                results_df = pd.DataFrame([
-                    {
-                        'Video ID': r['video_id'],
-                        'URL': r['url'][:50] + "..." if len(r['url']) > 50 else r['url'],
-                        'Source Type': r.get('source_type', 'unknown'),
-                        'Status': r['status'],
-                        'Segments': r.get('segments_count', 0),
-                        'URL Stored': r.get('url_stored', False),
-                        'Indexing': r.get('index_status', 'N/A'),
-                        'Error': (r.get('error', '')[:100] + '...') if r.get('error') else ''
-                    }
-                    for r in results
-                ])
-                st.dataframe(results_df)
-                
-                # Download results
-                csv_buffer = io.StringIO()
-                results_df.to_csv(csv_buffer, index=False)
-                st.download_button("Download Results CSV", csv_buffer.getvalue(), "batch_results.csv", "text/csv")
-            
-            # Search hint
-            if successful:
-                st.info("💡 **Search processed videos:**")
-                video_ids = [r['video_id'] for r in successful[:5]]
-                st.code(f"video_id:({' OR '.join(video_ids)})")
-            
-            st.session_state.batch_processing = False
-        
-        # --- SINGLE VIDEO PROCESSING ---
-        else:
-            progress_bar = st.progress(0)
-            status = st.empty()
-            
-            try:
-                # Upload file if needed
-                if source_type == "File Upload" and file_bytes:
-                    progress_bar.progress(10)
-                    status.text("Uploading to Azure Blob...")
-                    
-                    blob_name = f"upload_{video_id}_{int(time.time())}.m4a"
-                    
-                    sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name)
-                    if error and ("not installed" in error or "SDK" in error):
-                        sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name)
-                    
-                    if error:
-                        raise Exception(error)
-                    
-                    media_url = sas_url
-                    progress_bar.progress(50)
-                
-                # Download YouTube if needed
-                elif source_type == "YouTube":
-                    yt_url = st.session_state.get('yt_url_value', '')
-                    
-                    if not yt_url or not yt_url.strip():
-                        raise Exception("YouTube URL is empty")
-                    
-                    import tempfile
-                    with tempfile.TemporaryDirectory() as tmpdir:
-                        progress_bar.progress(10)
-                        status.text("Downloading from YouTube...")
-                        
-                        output_path = f"{tmpdir}/youtube_{video_id}.m4a"
-                        downloaded_path, error = download_youtube_audio(yt_url.strip(), output_path)
-                        
-                        if error:
-                            raise Exception(error)
-                        
-                        progress_bar.progress(50)
-                        status.text("Uploading to Azure Blob...")
-                        
-                        with open(downloaded_path, 'rb') as f:
-                            file_bytes = f.read()
-                        
-                        blob_name = f"youtube_{video_id}_{int(time.time())}.m4a"
-                        
-                        sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name)
-                        if error and ("not installed" in error):
-                            sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name)
-                        
-                        if error:
-                            raise Exception(error)
-                        
-                        media_url = sas_url
-                        progress_bar.progress(75)
-                
-                if not media_url:
-                    raise Exception("No media URL available")
-                
-                # Transcribe
-                status.text("Submitting to Azure Speech-to-Text...")
-                result = submit_transcription_direct(video_id, media_url)
-                operation_url = result.get("operation_url")
-                
-                if not operation_url:
-                    raise Exception("No operation URL returned")
-                
-                # Poll
-                max_polls = 120
-                transcription_data = None
-                
-                for i in range(max_polls):
-                    time.sleep(POLL_SECONDS)
-                    poll_result = poll_transcription_operation(operation_url)
-                    status_text = poll_result.get("status", "unknown")
-                    
-                    progress = min(75 + int((i / max_polls) * 20), 95)
-                    progress_bar.progress(progress)
-                    status.text(f"Transcribing... ({i * POLL_SECONDS // 60} min) - Status: {status_text}")
-                    
-                    if status_text.lower() == "succeeded":
-                        transcription_data = get_transcription_from_result(poll_result)
-                        break
-                    elif status_text.lower() == "failed":
-                        raise Exception(f"Transcription failed: {poll_result.get('properties', {}).get('error', {}).get('message', 'Unknown error')}")
-                
-                if not transcription_data:
-                    raise Exception("Transcription timed out")
-                
-                # Process and index
-                progress_bar.progress(98)
-                status.text("Processing segments and indexing...")
-                
-                segments = process_transcription_to_segments(transcription_data, video_id)
-                
-                # Save to blob
-                save_segments_to_blob(video_id, segments)
-                
-                # Index with URL tracking
-                original_url = None
-                if source_type == "YouTube":
-                    original_url = st.session_state.get('yt_url_value', '')
-                elif source_type == "Direct URL":
-                    original_url = media_url
-                elif source_type == "File Upload":
-                    original_url = f"uploaded_file://{video_id}"
-                
-                index_result = index_segments_direct(
-                    video_id,
-                    segments,
-                    source_url=original_url,
-                    source_type=detected_source_type
-                )
-                
-                url_stored_msg = "✅ Source URL stored" if index_result.get('source_url_stored') else "⚠️ URL storage not available"
-                
-                progress_bar.progress(100)
-                status.text("Complete!")
-                
-                st.success(f"""
-                ✅ **Transcription Complete!**
-                - Video ID: {video_id}
-                - Segments: {len(segments)}
-                - Source Type: {detected_source_type}
-                - Indexed: {index_result.get('indexed', 0)} documents
-                - {url_stored_msg}
-                """)
-                
-                if original_url:
-                    st.info(f"**Original Source:** [{original_url}]({original_url})")
-                
-                st.code(f'Search: video_id:{video_id}')
-                
-                with st.expander("View first 5 segments"):
-                    for seg in segments[:5]:
-                        st.write(f"**{ms_to_ts(seg['start_ms'])} - {ms_to_ts(seg['end_ms'])}:** {seg['text'][:100]}...")
-                
-            except Exception as e:
-                st.error(f"❌ Error: {str(e)}")
-                st.exception(e)
-
-
-# =============================================================================
-# PAGE 3: MANAGE VIDEOS
-# =============================================================================
+    upload_transcribe.show_upload_transcribe_page()
 
 elif page == "📚 Manage Videos":
-    st.header("📚 Manage Stored Videos")
-    st.info("View, search, and manage all processed videos and their source URLs")
-    
-    if not SEARCH_ENDPOINT or not SEARCH_KEY:
-        st.error("Azure Search not configured. Cannot retrieve video list.")
-    else:
-        # Check URL fields status
-        url_status = check_url_fields_status()
-        
-        if url_status['fields_exist']:
-            st.success("✅ URL tracking fields are configured")
-        else:
-            st.warning(f"⚠️ Missing URL fields: {', '.join(url_status['missing_fields'])}")
-        
-        # URL coverage analysis
-        if st.button("📊 Analyze URL Data Coverage"):
-            with st.spinner("Analyzing..."):
-                all_videos = get_stored_videos(include_missing=True)
-                
-                with_urls = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']]
-                without_urls = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']]
-                
-                col1, col2, col3 = st.columns(3)
-                col1.metric("Total Videos", len(all_videos))
-                col2.metric("✅ With URL Data", len(with_urls), f"{len(with_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%")
-                col3.metric("⚠️ Missing URL Data", len(without_urls), f"{len(without_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%")
-                
-                # By type breakdown
-                st.subheader("Breakdown by Source Type")
-                type_counts = {}
-                for v in all_videos:
-                    t = v.get('source_type') or 'unknown'
-                    type_counts[t] = type_counts.get(t, 0) + 1
-                
-                cols = st.columns(len(type_counts) if type_counts else 1)
-                for i, (stype, count) in enumerate(sorted(type_counts.items())):
-                    icon = "🎬" if stype == "youtube" else "📄" if stype == "direct" else "📁" if stype == "upload" else "❓"
-                    cols[i % len(cols)].metric(f"{icon} {stype}", count)
-                
-                if without_urls:
-                    with st.expander(f"Videos without URL data ({len(without_urls)})"):
-                        st.info("These were likely processed before URL tracking was enabled")
-                        for v in without_urls[:20]:
-                            st.text(f"• {v.get('video_id')}")
-        
-        st.markdown("---")
-        
-        # Filters
-        st.subheader("Filter Videos")
-        col1, col2 = st.columns(2)
-        
-        with col1:
-            filter_video_id = st.text_input("Filter by Video ID (optional)")
-        with col2:
-            filter_options = ["All", "With URL Data Only", "Missing URL Data Only", "youtube", "direct", "upload", "unknown"]
-            filter_source_type = st.selectbox("Filter by Source Type", options=filter_options, index=0)
-        
-        # Load videos
-        if st.button("🔍 Load Videos", type="primary"):
-            with st.spinner("Retrieving videos..."):
-                
-                # Handle special filters
-                if filter_source_type == "Missing URL Data Only":
-                    all_videos = get_stored_videos(include_missing=True)
-                    videos = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']]
-                    if filter_video_id.strip():
-                        videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()]
-                elif filter_source_type == "With URL Data Only":
-                    all_videos = get_stored_videos(include_missing=True)
-                    videos = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']]
-                    if filter_video_id.strip():
-                        videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()]
-                else:
-                    source_type = None if filter_source_type == "All" else filter_source_type
-                    videos = get_stored_videos(
-                        video_id=filter_video_id if filter_video_id.strip() else None,
-                        source_type=source_type,
-                        include_missing=True,
-                        limit=1000
-                    )
-                
-                st.session_state.stored_videos_cache = videos
-                st.success(f"Found {len(videos)} videos")
-        
-        # Display videos
-        if st.session_state.stored_videos_cache:
-            videos = st.session_state.stored_videos_cache
-            
-            # Metrics
-            st.markdown("---")
-            cols = st.columns(4)
-            
-            type_counts = {}
-            for v in videos:
-                t = v.get('source_type') or 'unknown'
-                type_counts[t] = type_counts.get(t, 0) + 1
-            
-            cols[0].metric("Total", len(videos))
-            cols[1].metric("YouTube", type_counts.get('youtube', 0))
-            cols[2].metric("Direct", type_counts.get('direct', 0))
-            cols[3].metric("Upload", type_counts.get('upload', 0))
-            
-            # Group by type
-            st.markdown("---")
-            st.subheader("Video List")
-            
-            videos_by_type = {}
-            for v in videos:
-                stype = v.get('source_type') or 'unknown'
-                if stype not in videos_by_type:
-                    videos_by_type[stype] = []
-                videos_by_type[stype].append(v)
-            
-            # Display by category
-            for source_type in ['youtube', 'direct', 'upload', 'unknown']:
-                if source_type not in videos_by_type:
-                    continue
-                
-                type_videos = videos_by_type[source_type]
-                icon = "🎬" if source_type == "youtube" else "📄" if source_type == "direct" else "📁" if source_type == "upload" else "❓"
-                
-                with st.expander(f"{icon} {source_type.upper()} ({len(type_videos)} videos)", expanded=(source_type == 'youtube')):
-                    for i, video in enumerate(type_videos, 1):
-                        vid = video.get('video_id', 'unknown')
-                        src_url = video.get('source_url', '')
-                        processed = video.get('processed_at', 'unknown')
-                        
-                        has_url = bool(src_url)
-                        status_icon = "✅" if has_url else "⚠️"
-                        
-                        with st.container():
-                            cols = st.columns([4, 1])
-                            
-                            with cols[0]:
-                                st.write(f"**{status_icon} {i}. {vid}**")
-                                st.caption(f"Processed: {processed}")
-                                
-                                if src_url:
-                                    display_url = src_url[:80] + "..." if len(str(src_url)) > 80 else src_url
-                                    st.code(display_url)
-                                    if str(src_url).startswith('http'):
-                                        st.markdown(f"[Open Source ↗]({src_url})")
-                                else:
-                                    st.warning("No source URL stored")
-                            
-                            with cols[1]:
-                                if st.button(f"🗑️ Delete", key=f"del_{vid}_{i}_{source_type}"):
-                                    if delete_video_by_id(vid):
-                                        st.success(f"Deleted {vid}")
-                                        st.session_state.stored_videos_cache = [
-                                            v for v in videos if v.get('video_id') != vid
-                                        ]
-                                        try:
-                                            st.rerun()
-                                        except:
-                                            pass
-                            
-                            st.markdown("---")
-            
-            # Export
-            st.markdown("---")
-            if st.button("📥 Export to CSV"):
-                export_df = pd.DataFrame([
-                    {
-                        'video_id': v.get('video_id'),
-                        'source_type': v.get('source_type') or 'unknown',
-                        'source_url': v.get('source_url', ''),
-                        'has_url_data': bool(v.get('source_url')),
-                        'processed_at': v.get('processed_at', 'unknown')
-                    }
-                    for v in videos
-                ])
-                
-                csv_buffer = io.StringIO()
-                export_df.to_csv(csv_buffer, index=False)
-                st.download_button("Download CSV", csv_buffer.getvalue(), "video_list.csv", "text/csv")
-
-
-# =============================================================================
-# PAGE 4: SYSTEM DIAGNOSTICS
-# =============================================================================
+    manage_videos.show_manage_videos_page()
 
 elif page == "⚙️ System Diagnostics":
-    st.header("⚙️ System Diagnostics")
-    st.info("Check system configuration and troubleshoot issues")
-    
-    # Configuration status
-    st.subheader("Configuration Status")
-    
-    config_checks = {
-        "Azure Speech (SPEECH_KEY)": bool(SPEECH_KEY),
-        "Azure OpenAI (AZURE_OPENAI_KEY)": bool(AZURE_OPENAI_KEY),
-        "Azure Search (SEARCH_KEY)": bool(SEARCH_KEY),
-        "Azure Storage (AZURE_STORAGE_KEY)": bool(AZURE_STORAGE_KEY),
-        "Search Function (SEARCH_FN_URL)": bool(SEARCH_FN_URL),
-        "yt-dlp installed": check_yt_dlp()
-    }
-    
-    cols = st.columns(2)
-    for i, (name, status) in enumerate(config_checks.items()):
-        icon = "✅" if status else "❌"
-        cols[i % 2].write(f"{icon} {name}: {'OK' if status else 'Not configured'}")
-    
-    # Index schema check
-    st.markdown("---")
-    st.subheader("Index Schema Check")
-    
-    if st.button("🔍 Check Index Schema"):
-        with st.spinner("Fetching schema..."):
-            schema = debug_check_index_schema()
-            
-            if isinstance(schema, dict):
-                st.success(f"Index: {schema['index_name']}")
-                st.write(f"Key Field: `{schema['key_field']}`")
-                
-                # URL fields status
-                if schema.get('has_all_url_fields'):
-                    st.success("✅ All URL tracking fields present")
-                else:
-                    st.warning(f"⚠️ Missing fields: {', '.join(schema.get('missing_url_fields', []))}")
-                
-                # Show all fields
-                with st.expander("View all fields"):
-                    for field in schema['fields']:
-                        key = "🔑" if field['key'] else ""
-                        url = "🔗" if 'url' in field['name'].lower() else ""
-                        st.caption(f"{key}{url} `{field['name']}` ({field['type']})")
-                
-                st.session_state.index_schema_cache = schema
-            else:
-                st.error(f"Schema check failed: {schema}")
-    
-    # Debug info
-    st.markdown("---")
-    st.subheader("Debug Information")
-    
-    with st.expander("Session State"):
-        st.json({
-            k: str(v)[:100] + "..." if len(str(v)) > 100 else v 
-            for k, v in st.session_state.items()
-        })
-    
-    with st.expander("Recent Processing Debug"):
-        if st.session_state.get('debug_info'):
-            st.json(st.session_state['debug_info'])
-        else:
-            st.info("No debug info yet. Process a video first.")
-
-
-# Footer
-st.sidebar.markdown("---")
-st.sidebar.caption("Video Annotation Platform v2.1")
\ No newline at end of file
+    system_diagnostics.show_system_diagnostics_page()
\ No newline at end of file
diff --git a/ui/upload_transcribe.py b/ui/upload_transcribe.py
new file mode 100644
index 0000000..96414dc
--- /dev/null
+++ b/ui/upload_transcribe.py
@@ -0,0 +1,513 @@
+"""
+upload_transcribe.py - Upload & Transcribe page for the Video Annotation Platform
+
+Renders the Streamlit page that accepts one media source (file upload, direct
+URL, YouTube) or a CSV of URLs and drives the shared helpers in ui_search to
+transcribe the media and index the resulting segments.
+"""
+
+import io
+import re
+import subprocess
+import sys
+import tempfile
+import time
+from typing import Tuple, Optional
+
+import pandas as pd
+import streamlit as st
+
+# Import shared utilities from ui_search (must be in same directory)
+import ui_search
+
+# Radio label of the batch option; reused for every source_type comparison.
+_BATCH_SOURCE = "📁 Batch CSV Upload"
+
+
+def _sanitize_video_id(raw_id):
+    """Return raw_id with punctuation dropped, spaces as '_', capped at 50 chars."""
+    return re.sub(r'[^\w\s-]', '', str(raw_id)).strip().replace(' ', '_')[:50]
+
+
+def _custom_ids_by_url(csv_df, url_column, id_column):
+    """Map each stripped URL in the CSV to its sanitized custom video ID.
+
+    Rows whose URL or ID is missing, or whose ID sanitizes to an empty string,
+    are skipped so no custom ID is passed for them. When a URL is repeated,
+    the first row with a usable ID wins.
+    """
+    custom_ids = {}
+    if csv_df is None or id_column in (None, "Auto-generate"):
+        return custom_ids
+    for raw_url, raw_id in zip(csv_df[url_column], csv_df[id_column]):
+        if pd.isna(raw_url) or pd.isna(raw_id):
+            continue
+        clean_id = _sanitize_video_id(raw_id)
+        if clean_id:
+            custom_ids.setdefault(str(raw_url).strip(), clean_id)
+    return custom_ids
+
+
+def _upload_audio(file_bytes, blob_name):
+    """Upload file_bytes to Azure Blob as blob_name and return the SAS URL.
+
+    Tries the SDK uploader first and retries with the alternative uploader
+    when the error text mentions "not installed" or "SDK". Raises Exception
+    carrying the uploader's error message if the upload still fails.
+    """
+    sas_url, error = ui_search.upload_to_azure_blob_sdk(file_bytes, blob_name)
+    if error and ("not installed" in error or "SDK" in error):
+        sas_url, error = ui_search.upload_to_azure_blob_fixed(file_bytes, blob_name)
+    if error:
+        raise Exception(error)
+    return sas_url
+
+
+def show_upload_transcribe_page():
+    """Display the Upload & Transcribe page.
+
+    Lets the user pick a source (file upload, direct URL, YouTube or a CSV of
+    URLs). When the process button is pressed, the media is uploaded or
+    downloaded as needed, submitted for transcription, polled until it
+    finishes, and the resulting segments are saved and indexed. Batch runs
+    process every valid CSV URL in turn and end with a summary table.
+    """
+    st.header("Upload Video for Transcription")
+
+    # Check URL fields status
+    url_status = ui_search.check_url_fields_status()
+
+    if url_status['fields_exist']:
+        st.success("✅ URL Tracking Enabled - Original source URLs will be stored")
+    else:
+        st.warning(f"""
+        ⚠️ **Partial URL Tracking** - Missing fields: {', '.join(url_status['missing_fields'])}
+        
+        Videos will still be processed, but URL information will be limited.
+        Add missing fields to your Azure Search index for full functionality.
+        """)
+
+    # Check Azure configuration
+    azure_configured = bool(ui_search.AZURE_STORAGE_KEY) and bool(ui_search.SPEECH_KEY)
+    if not azure_configured:
+        st.error("⚠️ Azure Storage and Speech keys required. Check .env file.")
+
+    # Source selection
+    source_type = st.radio("Select Source",
+                          ["File Upload", "Direct URL", "YouTube", _BATCH_SOURCE],
+                          horizontal=True)
+
+    media_url = None
+    video_id = None
+    file_bytes = None
+    yt_url = None
+    csv_df = None
+    detected_source_type = "unknown"
+
+    # --- File Upload ---
+    if source_type == "File Upload":
+        if not azure_configured:
+            st.info("Please configure Azure Storage to enable file upload")
+        else:
+            uploaded_file = st.file_uploader(
+                "Choose video/audio file",
+                type=["mp4", "avi", "mov", "mkv", "m4a", "mp3", "wav"],
+                accept_multiple_files=False
+            )
+
+            if uploaded_file:
+                st.success(f"📁 {uploaded_file.name} ({uploaded_file.size / 1024 / 1024:.1f} MB)")
+                file_bytes = uploaded_file.getvalue()
+                video_id = ui_search.generate_video_id(uploaded_file.name)
+                detected_source_type = "upload"
+                st.info("File ready for upload")
+
+    # --- Direct URL ---
+    elif source_type == "Direct URL":
+        url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/    ...")
+
+        if url_input.strip():
+            media_url = url_input.strip()
+            # Derive the ID from the stripped URL so stray whitespace cannot change it.
+            video_id = ui_search.generate_video_id(media_url)
+            detected_source_type = "direct"
+            st.success("✅ URL validated")
+
+    # --- YouTube ---
+    elif source_type == "YouTube":
+        yt_url = st.text_input(
+            "YouTube URL",
+            placeholder="https://youtube.com/watch?v=    ...",
+            value=st.session_state.get('yt_url_value', ''),
+            key="yt_url_input"
+        )
+
+        # Remember the URL for the processing step. No rerun is requested here:
+        # the widget has already returned the current value for this run.
+        st.session_state['yt_url_value'] = yt_url
+
+        # Check yt-dlp
+        if not ui_search.check_yt_dlp():
+            st.warning("yt-dlp not installed")
+            if st.button("Install yt-dlp"):
+                with st.spinner("Installing..."):
+                    # Use the running interpreter's pip so the package lands in this environment.
+                    install = subprocess.run(
+                        [sys.executable, "-m", "pip", "install", "-q", "yt-dlp"],
+                        check=False,
+                    )
+                if install.returncode != 0:
+                    st.error("yt-dlp installation failed. Install it manually and refresh the page.")
+                else:
+                    try:
+                        st.rerun()
+                    except AttributeError:
+                        # Streamlit builds without st.rerun(); a bare except here would
+                        # also swallow the control-flow exception st.rerun() relies on.
+                        st.info("Please refresh the page")
+        elif yt_url and yt_url.strip():
+            video_id = ui_search.generate_video_id(f"yt_{yt_url.strip()}")
+            detected_source_type = "youtube"
+            st.success("YouTube URL ready")
+
+    # --- Batch CSV Upload ---
+    elif source_type == _BATCH_SOURCE:
+        st.subheader("📁 Batch Process Videos from CSV")
+
+        csv_file = st.file_uploader(
+            "Upload CSV file",
+            type=["csv"],
+            help="CSV must contain a column with video URLs"
+        )
+
+        if csv_file:
+            try:
+                # Read CSV with flexible parsing
+                try:
+                    csv_df = pd.read_csv(csv_file)
+                except Exception:
+                    csv_file.seek(0)
+                    csv_df = pd.read_csv(csv_file, header=None)
+                    csv_df.columns = [f"column_{i}" for i in range(len(csv_df.columns))]
+
+                # Handle case where column names are URLs
+                url_like_columns = []
+                for col in csv_df.columns:
+                    col_str = str(col).strip()
+                    if ui_search.detect_url_type(col_str) != "unknown":
+                        url_like_columns.append(col)
+
+                if url_like_columns and len(csv_df.columns) == 1:
+                    url_col_name = csv_df.columns[0]
+                    new_row = {url_col_name: url_col_name}
+                    csv_df = pd.concat([pd.DataFrame([new_row]), csv_df], ignore_index=True)
+
+                st.success(f"✅ Loaded CSV with {len(csv_df)} rows")
+
+                # Column selection
+                url_column = st.selectbox("Select column containing video URLs", options=csv_df.columns.tolist())
+
+                id_column_options = ["Auto-generate"] + [c for c in csv_df.columns if c != url_column]
+                id_column = st.selectbox("Select column for custom Video ID (optional)", options=id_column_options, index=0)
+
+                # Extract and validate URLs
+                urls_raw = csv_df[url_column].dropna().astype(str).tolist()
+                urls_to_process = [u.strip() for u in urls_raw if u.strip()]
+
+                # Preview
+                with st.expander(f"Preview URLs ({len(urls_to_process)} found)"):
+                    for i, url in enumerate(urls_to_process[:10], 1):
+                        url_type = ui_search.detect_url_type(url)
+                        icon = "🎬" if url_type == "youtube" else "📄" if url_type == "direct" else "❓"
+                        st.text(f"{i}. {icon} {url[:80]}...")
+
+                # Validate
+                valid_urls = []
+                invalid_urls = []
+                for url in urls_to_process:
+                    url_type = ui_search.detect_url_type(str(url))
+                    if url_type in ["youtube", "direct"]:
+                        valid_urls.append(url)
+                    else:
+                        invalid_urls.append(url)
+
+                col1, col2, col3 = st.columns(3)
+                col1.metric("Total", len(urls_to_process))
+                col2.metric("✅ Valid", len(valid_urls))
+                col3.metric("❌ Invalid", len(invalid_urls))
+
+                # Store in session state
+                st.session_state['batch_urls'] = valid_urls
+                st.session_state['batch_df'] = csv_df
+                st.session_state['batch_url_column'] = url_column
+                st.session_state['batch_id_column'] = id_column
+
+            except Exception as e:
+                # Forget URLs from an earlier CSV so a broken file cannot trigger a stale batch.
+                st.session_state['batch_urls'] = []
+                st.error(f"Error reading CSV: {e}")
+                st.exception(e)
+        else:
+            # No file selected: drop URLs left over from a previously loaded CSV.
+            st.session_state.pop('batch_urls', None)
+
+    # Custom ID input
+    custom_id = st.text_input("Custom Video ID (optional)")
+    if custom_id.strip() and source_type != _BATCH_SOURCE:
+        video_id = custom_id.strip()
+
+    # Determine if we can process
+    can_process = False
+    if source_type == "File Upload":
+        can_process = file_bytes is not None and azure_configured
+    elif source_type == "Direct URL":
+        can_process = media_url is not None and len(str(media_url).strip()) > 0
+    elif source_type == "YouTube":
+        yt_url_to_check = st.session_state.get('yt_url_value', '')
+        can_process = len(str(yt_url_to_check).strip()) > 0 and ui_search.check_yt_dlp()
+    elif source_type == _BATCH_SOURCE:
+        can_process = (bool(st.session_state.get('batch_urls')) and
+                      azure_configured and
+                      not st.session_state.get('batch_processing', False))
+
+    # Process button
+    button_text = "🚀 Start Transcription"
+    if source_type == _BATCH_SOURCE:
+        count = len(st.session_state.get('batch_urls', []))
+        button_text = f"🚀 Process {count} Videos from CSV"
+
+    if st.button(button_text, type="primary", disabled=not can_process):
+
+        # --- BATCH PROCESSING ---
+        if source_type == _BATCH_SOURCE:
+            st.session_state.batch_processing = True
+            st.session_state.batch_results = []
+
+            try:
+                urls = st.session_state.get('batch_urls', [])
+                csv_df = st.session_state.get('batch_df')
+                url_column = st.session_state.get('batch_url_column')
+                id_column = st.session_state.get('batch_id_column')
+
+                total = len(urls)
+                st.info(f"Starting batch processing of {total} videos...")
+
+                # Progress UI
+                overall_progress = st.progress(0)
+                status_text = st.empty()
+                results_container = st.container()
+
+                # Resolve custom IDs once up front instead of filtering the frame per URL.
+                custom_ids = _custom_ids_by_url(csv_df, url_column, id_column)
+
+                results = []
+                for idx, url in enumerate(urls, 1):
+                    # Custom ID from the CSV, or None when no custom ID applies
+                    custom_vid_id = custom_ids.get(url)
+
+                    # Detect source type
+                    url_type = ui_search.detect_url_type(url)
+                    src_type = "youtube" if url_type == "youtube" else "direct"
+
+                    # Process
+                    result = ui_search.process_single_video(
+                        url=url,
+                        custom_id=custom_vid_id,
+                        source_type=src_type,
+                        progress_bar=overall_progress,
+                        status_text=status_text,
+                        overall_progress=(idx, total)
+                    )
+
+                    results.append(result)
+                    st.session_state.batch_results = results
+
+                    # Update progress
+                    progress_pct = int((idx / total) * 100)
+                    overall_progress.progress(progress_pct)
+
+                    # Show result
+                    with results_container:
+                        if result['status'] == 'success':
+                            url_stored = "✅ URL saved" if result.get('url_stored') else "⚠️ URL not stored"
+                            st.success(f"✅ [{idx}/{total}] {result['video_id']}: {result['segments_count']} segments ({url_stored})")
+                        else:
+                            # str() and "or" guard against a None or non-string error value
+                            error_msg = str(result.get('error') or 'Unknown error')[:200]
+                            st.error(f"❌ [{idx}/{total}] Failed: {error_msg}...")
+
+                    time.sleep(1)  # Rate limiting
+
+                # Final summary
+                overall_progress.progress(100)
+                status_text.text("Batch processing complete!")
+
+                successful = [r for r in results if r['status'] == 'success']
+                failed = [r for r in results if r['status'] == 'failed']
+
+                st.markdown("---")
+                st.subheader("📊 Batch Processing Summary")
+
+                col1, col2, col3 = st.columns(3)
+                col1.metric("Total", total)
+                col2.metric("Successful", len(successful), f"{len(successful)/total*100:.1f}%" if total > 0 else "0%")
+                col3.metric("Failed", len(failed), f"{len(failed)/total*100:.1f}%" if total > 0 else "0%")
+
+                # Detailed results
+                with st.expander("View Detailed Results"):
+                    results_df = pd.DataFrame([
+                        {
+                            'Video ID': r['video_id'],
+                            'URL': r['url'][:50] + "..." if len(r['url']) > 50 else r['url'],
+                            'Source Type': r.get('source_type', 'unknown'),
+                            'Status': r['status'],
+                            'Segments': r.get('segments_count', 0),
+                            'URL Stored': r.get('url_stored', False),
+                            'Indexing': r.get('index_status', 'N/A'),
+                            'Error': (str(r['error'])[:100] + '...') if r.get('error') else ''
+                        }
+                        for r in results
+                    ])
+                    st.dataframe(results_df)
+
+                    # Download results
+                    csv_buffer = io.StringIO()
+                    results_df.to_csv(csv_buffer, index=False)
+                    st.download_button("Download Results CSV", csv_buffer.getvalue(), "batch_results.csv", "text/csv")
+
+                # Search hint
+                if successful:
+                    st.info("💡 **Search processed videos:**")
+                    video_ids = [r['video_id'] for r in successful[:5]]
+                    st.code(f"video_id:({' OR '.join(video_ids)})")
+            finally:
+                # Release the flag even when the run errors out or is interrupted;
+                # otherwise the process button stays disabled for the whole session.
+                st.session_state.batch_processing = False
+
+        # --- SINGLE VIDEO PROCESSING ---
+        else:
+            progress_bar = st.progress(0)
+            status = st.empty()
+
+            try:
+                # Upload file if needed
+                if source_type == "File Upload" and file_bytes:
+                    progress_bar.progress(10)
+                    status.text("Uploading to Azure Blob...")
+
+                    blob_name = f"upload_{video_id}_{int(time.time())}.m4a"
+                    media_url = _upload_audio(file_bytes, blob_name)
+                    progress_bar.progress(50)
+
+                # Download YouTube if needed
+                elif source_type == "YouTube":
+                    yt_url = st.session_state.get('yt_url_value', '')
+
+                    if not yt_url or not yt_url.strip():
+                        raise Exception("YouTube URL is empty")
+
+                    with tempfile.TemporaryDirectory() as tmpdir:
+                        progress_bar.progress(10)
+                        status.text("Downloading from YouTube...")
+
+                        output_path = f"{tmpdir}/youtube_{video_id}.m4a"
+                        downloaded_path, error = ui_search.download_youtube_audio(yt_url.strip(), output_path)
+
+                        if error:
+                            raise Exception(error)
+
+                        progress_bar.progress(50)
+                        status.text("Uploading to Azure Blob...")
+
+                        with open(downloaded_path, 'rb') as f:
+                            file_bytes = f.read()
+
+                        blob_name = f"youtube_{video_id}_{int(time.time())}.m4a"
+                        media_url = _upload_audio(file_bytes, blob_name)
+                        progress_bar.progress(75)
+
+                if not media_url:
+                    raise Exception("No media URL available")
+
+                # Transcribe
+                status.text("Submitting to Azure Speech-to-Text...")
+                result = ui_search.submit_transcription_direct(video_id, media_url)
+                operation_url = result.get("operation_url")
+
+                if not operation_url:
+                    raise Exception("No operation URL returned")
+
+                # Poll
+                max_polls = 120
+                transcription_data = None
+
+                for i in range(max_polls):
+                    time.sleep(ui_search.POLL_SECONDS)
+                    poll_result = ui_search.poll_transcription_operation(operation_url)
+                    poll_status = str(poll_result.get("status") or "unknown")
+
+                    progress = min(75 + int((i / max_polls) * 20), 95)
+                    progress_bar.progress(progress)
+                    status.text(f"Transcribing... ({i * ui_search.POLL_SECONDS // 60} min) - Status: {poll_status}")
+
+                    if poll_status.lower() == "succeeded":
+                        transcription_data = ui_search.get_transcription_from_result(poll_result)
+                        break
+                    elif poll_status.lower() == "failed":
+                        raise Exception(f"Transcription failed: {poll_result.get('properties', {}).get('error', {}).get('message', 'Unknown error')}")
+
+                if not transcription_data:
+                    raise Exception("Transcription timed out")
+
+                # Process and index
+                progress_bar.progress(98)
+                status.text("Processing segments and indexing...")
+
+                segments = ui_search.process_transcription_to_segments(transcription_data, video_id)
+
+                # Save to blob
+                ui_search.save_segments_to_blob(video_id, segments)
+
+                # Index with URL tracking
+                original_url = None
+                if source_type == "YouTube":
+                    original_url = st.session_state.get('yt_url_value', '')
+                elif source_type == "Direct URL":
+                    original_url = media_url
+                elif source_type == "File Upload":
+                    original_url = f"uploaded_file://{video_id}"
+
+                index_result = ui_search.index_segments_direct(
+                    video_id,
+                    segments,
+                    source_url=original_url,
+                    source_type=detected_source_type
+                )
+
+                url_stored_msg = "✅ Source URL stored" if index_result.get('source_url_stored') else "⚠️ URL storage not available"
+
+                progress_bar.progress(100)
+                status.text("Complete!")
+
+                st.success(f"""
+                ✅ **Transcription Complete!**
+                - Video ID: {video_id}
+                - Segments: {len(segments)}
+                - Source Type: {detected_source_type}
+                - Indexed: {index_result.get('indexed', 0)} documents
+                - {url_stored_msg}
+                """)
+
+                if original_url:
+                    st.info(f"**Original Source:** [{original_url}]({original_url})")
+
+                st.code(f'Search: video_id:{video_id}')
+
+                with st.expander("View first 5 segments"):
+                    for seg in segments[:5]:
+                        st.write(f"**{ui_search.ms_to_ts(seg['start_ms'])} - {ui_search.ms_to_ts(seg['end_ms'])}:** {seg['text'][:100]}...")
+
+            except Exception as e:
+                st.error(f"❌ Error: {str(e)}")
+                st.exception(e)
\ No newline at end of file

From 892ef5f4b3ca57fd085e2d2ed67ff655ddfb5b51 Mon Sep 17 00:00:00 2001
From: Martin Nwadiugwu <YOUR_GITHUB_EMAIL@example.com>
Date: Mon, 2 Mar 2026 21:04:32 -0600
Subject: [PATCH 8/8] Revert .gitignore to upstream version

---
 .gitignore | 181 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 172 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index e29811b..6e659d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,175 @@
-# macOS
-.DS_Store
+local.settings.json
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
 
-# JSON lines
-*.jsonl
+# C extensions
+*.so
 
-# Python bytecode
-*.pyc
-__pycache__/
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
 
-# Local virtual env
-func_venv/
+# PyPI configuration file
+.pypirc