From 35e1604760c1db4c3073c8f198aad37f711f3911 Mon Sep 17 00:00:00 2001 From: Martin Date: Sun, 15 Feb 2026 08:17:13 -0600 Subject: [PATCH 1/8] Add multiple video upload options for processing multiple videos from CSV file --- TranscribeHttp/__init__.py | 2 +- scripts/box_shared_folder_manifest.py | 138 +- .../__pycache__/speech_batch.cpython-311.pyc | Bin 21537 -> 22598 bytes ui/Dockerfile | 3 + ui/ui_search.py | 1547 ++++++++++++++++- 5 files changed, 1542 insertions(+), 148 deletions(-) diff --git a/TranscribeHttp/__init__.py b/TranscribeHttp/__init__.py index 6fecd2b..3af666b 100644 --- a/TranscribeHttp/__init__.py +++ b/TranscribeHttp/__init__.py @@ -42,7 +42,7 @@ def _cfg() -> SpeechConfig: return SpeechConfig( key=os.environ["SPEECH_KEY"], endpoint=endpoint.rstrip("/"), - api_version=os.environ.get("SPEECH_API_VERSION", "2025-10-15"), + api_version=os.environ.get("SPEECH_API_VERSION", "2024-11-15"), ) def main(req: func.HttpRequest) -> func.HttpResponse: diff --git a/scripts/box_shared_folder_manifest.py b/scripts/box_shared_folder_manifest.py index 310df6e..d44eab0 100644 --- a/scripts/box_shared_folder_manifest.py +++ b/scripts/box_shared_folder_manifest.py @@ -1,39 +1,17 @@ """ scripts/box_shared_folder_manifest.py - Generate Video Manifest from Box - -This script enumerates .m4a video files from a Box shared folder and generates -a manifest file (videos.jsonl) that lists all videos with their IDs and media URLs. -It creates open shared links for each file so Azure Speech Service can access them. - -Architecture Role: -- Pre-processing step before video ingestion -- Generates videos.jsonl input file for import_videos.py -- Handles Box API authentication and folder traversal -- Creates publicly accessible download URLs for Speech Service - -Usage: - python scripts/box_shared_folder_manifest.py - -Output: - - videos.jsonl: One JSON object per line with video_id and media_url - -Configuration (via .env): - - BOX_SHARED_FOLDER_URL: Box shared folder link - - BOX_TOKEN or BOX_ACCESS_TOKEN/BOX_REFRESH_TOKEN: Box authentication - - OUT_PATH: Output file path (default: videos.jsonl) - - RECURSIVE: Whether to traverse subfolders (default: 1) """ import json import os import requests +import time from typing import Dict, Any, List, Optional - from box_auth import get_access_token BOX_API = "https://api.box.com/2.0" -SHARED_FOLDER_URL = os.environ["BOX_SHARED_FOLDER_URL"] # https://...box.com/s/ +SHARED_FOLDER_URL = os.environ["BOX_SHARED_FOLDER_URL"] print("BOX_SHARED_FOLDER_URL =", SHARED_FOLDER_URL) OUT_PATH = os.environ.get("OUT_PATH", "videos.jsonl") RECURSIVE = os.environ.get("RECURSIVE", "1") == "1" @@ -63,7 +41,6 @@ def resolve_shared_folder(token: str) -> Dict[str, Any]: return r.json() - def list_folder_items(token: str, folder_id: str, limit: int = 1000, offset: int = 0) -> Dict[str, Any]: url = f"{BOX_API}/folders/{folder_id}/items" params = {"limit": limit, "offset": offset} @@ -72,31 +49,48 @@ def list_folder_items(token: str, folder_id: str, limit: int = 1000, offset: int return r.json() -def ensure_open_shared_link_for_file(token: str, file_id: str) -> str: +def ensure_open_shared_link_for_file(token: str, file_id: str, max_retries: int = 3) -> Optional[str]: """ - Ensure file has an open shared link and return a direct-download URL. + Ensure file has an open shared link with retry logic for timeouts. """ url = f"{BOX_API}/files/{file_id}" payload = {"shared_link": {"access": "open"}} - params = {"fields": "shared_link"} # IMPORTANT: ask for shared_link back - - r = requests.put(url, headers=auth_headers(token), params=params, json=payload, timeout=30) - r.raise_for_status() - data = r.json() - - sl = data.get("shared_link") or {} - # print("shared_link =", json.dumps(sl, indent=2)) - # Prefer direct static download URL - dl = sl.get("download_url") - if dl: - return dl - - # Fallback: at least return the shared link (may require cookies) - if sl.get("url"): - return sl["url"] - - raise RuntimeError(f"No shared_link returned for file {file_id}: {data}") - + params = {"fields": "shared_link"} + + for attempt in range(max_retries): + try: + # Increased timeout to 60 seconds + r = requests.put(url, headers=auth_headers(token), params=params, json=payload, timeout=60) + + if r.status_code == 404: + print(f"⚠️ Skipping file {file_id} (not found)") + return None + + r.raise_for_status() + data = r.json() + sl = data.get("shared_link") or {} + + dl = sl.get("download_url") + if dl: + return dl + if sl.get("url"): + return sl["url"] + + raise RuntimeError(f"No shared_link returned for file {file_id}: {data}") + + except requests.exceptions.Timeout: + if attempt < max_retries - 1: + wait_time = 2 ** attempt # Exponential backoff: 1, 2, 4 seconds + print(f"⏱️ Timeout on file {file_id}, retrying in {wait_time}s ({attempt + 1}/{max_retries})...") + time.sleep(wait_time) + else: + print(f"⚠️ Skipping file {file_id} (timeout after {max_retries} attempts)") + return None + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + print(f"⚠️ Skipping file {file_id} (not found)") + return None + raise def walk(token: str, folder_id: str) -> List[Dict[str, Any]]: @@ -114,6 +108,22 @@ def walk(token: str, folder_id: str) -> List[Dict[str, Any]]: return items +def load_existing_entries() -> set: + """Load already processed file IDs to avoid duplicates.""" + processed = set() + if os.path.exists(OUT_PATH): + with open(OUT_PATH, "r", encoding="utf-8") as f: + for line in f: + try: + entry = json.loads(line.strip()) + video_id = entry.get("video_id", "") + if video_id.startswith("vid_"): + processed.add(video_id[4:]) + except json.JSONDecodeError: + continue + return processed + + def main(): token = get_access_token() @@ -122,10 +132,16 @@ def main(): raise RuntimeError(f"Shared link did not resolve to a folder: {shared_folder.get('type')}") root_id = shared_folder["id"] + # Load already processed files to resume + processed_ids = load_existing_entries() + print(f"Resuming: {len(processed_ids)} files already processed") + queue = [root_id] - out_count = 0 + out_count = len(processed_ids) + skipped = 0 + new_files = 0 - with open(OUT_PATH, "w", encoding="utf-8") as f: + with open(OUT_PATH, "a", encoding="utf-8") as f: while queue: fid = queue.pop(0) entries = walk(token, fid) @@ -141,26 +157,36 @@ def main(): if et != "file": continue - if not lname.endswith(".m4a"): continue file_id = e["id"] + + # Skip if already processed + if file_id in processed_ids: + continue + video_id = f"vid_{file_id}" - # Make a per-file open shared link (Speech can fetch without auth). - # If your org disallows open links, this will fail — then you’ll need Blob staging. file_link = ensure_open_shared_link_for_file(token, file_id) + + if file_link is None: + skipped += 1 + continue - # Encourage direct download behavior - media_url = file_link # + ("?download=1" if "?" not in file_link else "&download=1") - + media_url = file_link f.write(json.dumps({"video_id": video_id, "media_url": media_url}) + "\n") + f.flush() # Ensure write is saved immediately out_count += 1 + new_files += 1 + if out_count % 10 == 0: - print(f"Wrote {out_count} entries...") + print(f"Wrote {out_count} entries total...") + + # Small delay to avoid rate limiting + time.sleep(0.2) - print(f"Done. Wrote {out_count} m4a entries to {OUT_PATH}") + print(f"Done. Total: {out_count} entries (new: {new_files}, skipped: {skipped})") if __name__ == "__main__": diff --git a/shared/__pycache__/speech_batch.cpython-311.pyc b/shared/__pycache__/speech_batch.cpython-311.pyc index 525fbbc638d0ded8eda6304af50b337f3dd74394..58ca2888792699e4a1a7d3ce5b2dab4fcc63daf0 100644 GIT binary patch delta 2294 zcmaJ@OK%%h6rMYF?8LE?rnU2EoA{b0t<%PN^pUo1DWqv0eWY!g7NontjD2q6A@i_i z#tA7BrYKSuAP{)LrVA=0il{`9<`1xe1p*-fDyX2sf<-p0iUb=FXRhPeE)|x?Gxt2d z^PSI@Z+}F-_>{EWi9|vYzSbVSrf!^lr>(p6nNO15lR#n;5Rf&(h`7s47Iz=>iMyZq z@$S=_^Z*MGFV?RG^&kuC&8%5(VJ*Veq=ocW)~bhDSdXxX-p1O5J)pJg+t@a}gLUZJ z*>-&g+ac^hZ7!8{#|pE}YFv8^e%;*!GZU?X_xcXcdCP2qgta z1e64}9>>Ba)bFkn;!9N#P&fHd01>WbD9t9b!9fiVZ+ zqyU=4I#1*I5S$YHG=LVP`r#S$9|nRfJ$p+o!9*icIE~~dYox*?5>+9)q1a+GoI%cx zz|<2P{THYorm^uj%z*6i470cVaF$OcUCrPnV>q4GaX zoh6#06W5Ctrwawnb9uVJtra!L>1oBm(^X3`?3|?*9Mv@FHAmG{N9A@XbTzNqR5zih zaayp<6&1KmbEaOfc%B=!y25F(8ci2Xi>{j1vT7{RRn^JYb!@G*43&TFik2oWpB<)W z9hz4R(72rrrRX^`OYLG-S8dxvfeuxzdQ<4$W{ z700xcB~Det;Y*fq(d5F!0=;(S0$t(;w@Sh|L>E;YtXS~M*@KuqiFgdId3J|e-XIrEMQ{(AS z30s+Xvh#|?fof{jQmi$>prUEiY1oHZ;FTe4M0m}{`0UyiAz41_I0XT7fiLOYaO$Y4 ztbDp?-u$FA|ryQ;D0*^W1XXAfAG-07^k|iuP7)wE`_TnWClHX*$E5y0nTQP$uv8M@u5B zs z72Ct?K}H2D4t|#_lD36zvYzsl9B#}Gxf{OwWNc;x#bVhbx?(x1VXP{k7ME6wBUR}3 zi1&RqQz&^8X)>+y-%SZ})Qtzm2|49H7raTxwEM96OA@$d q%VgaArMF1zAdw_|OLi~*huQFL$Q#nec)5K(UhbQ3c8BL5$^Qbmo12vY delta 1238 zcmaJ>OH30{6rDSrewcos&_Y{iseFV=Y5C)?;1>lo@e?-BRrNj;(`m63G;X>fQ5R}R zqb@X2<3g9lx6&9BHM%e{?p&yG<-!<03lmLTdFMfZqH)qU@7;6nJ@>pbNnhN9TX(^8 z$L-b?dak8rCZFw`_XLWMEQ)efK?NbBu!S8M$XI2n+%3#PyPDAqE3<-GW69VI8?za9 zW;YznAu%meVdzXZoXlysn9Fc8x5TX(k5S1gjVe}Uc$wFzX4Mk6Woir`+wPNrodQ4e z8?~%f#tw?>Se+4I0g5YF5Oo$pC#y#nYd|*(bE~TCQqVJ}VCDRmusgB{R1x5v{}L(U z7*_Kr)^InE2F)wW2Yoy`VMG6%m93$3WJ1N-IgK^ZxslFwbZ*7~^#y79xQ}W>gtcHj z_hEyHArKZOtaS;C5Z<_$(=kf9wG~CBLcu+f%4}<*6G>zXmgZaHcH1{k`l*0DJ4py`)PFNQm?OkEIBbi`NU%VO1`4Bn+OJ}ASnyhFOu4D z17Te&NLMy4^x1GD)pX+~DYT7vHdEY#Dd|s{Xz8JbLzM4Dgy<&JUWxf6$@(o*=(+m zF68pbLOz|HKAN9A!b^UA4TQy@J^;`nvd*&raq-jj5}@z)6ORfh@w6%qFebixX8;Zg zUgJ~YusGu%vcPfiAix1Gh{n(x6@Hg~HvEFi;#-($@K}sC-2`|mJ~zL!!Uypp_F4s1 zT#4^fK^GqrOogBr?goIE*xEB str: s = max(0, int(ms // 1000)) @@ -66,60 +92,1399 @@ def ms_to_ts(ms: int) -> str: return f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}" -def call_search_api(payload: dict) -> dict: - r = requests.post( - SEARCH_FN_URL, - json=payload, - timeout=60, - headers={"Content-Type": "application/json"}, - ) +def call_api(url: str, payload: dict, timeout: int = 60) -> dict: + r = requests.post(url, json=payload, timeout=timeout, headers={"Content-Type": "application/json"}) if r.status_code >= 400: raise RuntimeError(f"HTTP {r.status_code}: {r.text}") return r.json() if r.text else {} -if go: - payload = {"q": q.strip(), "mode": mode, "top": top} - if mode in ("hybrid", "vector"): - payload["k"] = k - if video_id_filter.strip(): - payload["video_id"] = video_id_filter.strip() +# ============================================================================= +# DIRECT AZURE SPEECH API FUNCTIONS (BYPASS AZURE FUNCTION) +# ============================================================================= + +def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any]: + """ + Submit transcription directly to Azure Speech API. + Bypasses the Azure Function with wrong API version. + """ + if not SPEECH_KEY: + raise RuntimeError("SPEECH_KEY not configured in environment") + + endpoint = f"https://{SPEECH_REGION}.api.cognitive.microsoft.com/speechtotext/transcriptions:submit?api-version={SPEECH_API_VERSION}" + + headers = { + "Ocp-Apim-Subscription-Key": SPEECH_KEY, + "Content-Type": "application/json" + } + + payload = { + "contentUrls": [media_url], + "locale": "en-US", + "displayName": f"transcription_{video_id}", + "properties": { + "diarizationEnabled": False, + "wordLevelTimestampsEnabled": False, + "punctuationMode": "DictatedAndAutomatic", + "profanityFilterMode": "Masked", + "timeToLiveHours": 24 + } + } + + try: + r = requests.post(endpoint, headers=headers, json=payload, timeout=60) + r.raise_for_status() + + # Get operation URL from Location header (this is the operation status URL) + operation_url = r.headers.get("Location") + if not operation_url: + result = r.json() + operation_url = result.get("self") or result.get("links", {}).get("self") + + if not operation_url: + raise RuntimeError("No operation URL returned from Speech API") + + return {"operation_url": operation_url, "video_id": video_id} + + except requests.exceptions.HTTPError as e: + if r.status_code == 401: + raise RuntimeError("Azure Speech API authentication failed. Check SPEECH_KEY.") + elif r.status_code == 400: + raise RuntimeError(f"Bad request: {r.text}") + else: + raise RuntimeError(f"Speech API error {r.status_code}: {r.text}") + + +def poll_transcription_operation(operation_url: str) -> Dict[str, Any]: + """Poll transcription operation status directly from Azure Speech API.""" + if not SPEECH_KEY: + raise RuntimeError("SPEECH_KEY not configured") + + headers = { + "Ocp-Apim-Subscription-Key": SPEECH_KEY + } + + try: + # CRITICAL FIX: Azure returns operation URL with :submit but we need to poll + # using the /transcriptions/{id} endpoint, not /transcriptions:submit/{id} + # The operation_url looks like: .../transcriptions:submit/{id}?api-version=... + # We need: .../transcriptions/{id}?api-version=... + + poll_url = operation_url.replace("/transcriptions:submit/", "/transcriptions/") + + # Debug info + st.session_state['debug_poll_url'] = poll_url + + r = requests.get(poll_url, headers=headers, timeout=30) + r.raise_for_status() + return r.json() + + except requests.exceptions.RequestException as e: + raise RuntimeError(f"Failed to poll transcription: {str(e)}") + + +def get_transcription_from_result(result_data: Dict) -> Dict[str, Any]: + """Get the actual transcription JSON from the result files.""" + if not SPEECH_KEY: + raise RuntimeError("SPEECH_KEY not configured") + + headers = { + "Ocp-Apim-Subscription-Key": SPEECH_KEY + } + + try: + # Get the result files URL from the completed operation + links = result_data.get("links", {}) + files_url = links.get("files") + + if not files_url: + # Try to construct from result data or get content directly + if "combinedRecognizedPhrases" in result_data: + # Result might be embedded directly + return result_data + + raise RuntimeError("No files URL in result") + + # Get list of files + r = requests.get(files_url, headers=headers, timeout=30) + r.raise_for_status() + files_data = r.json() + + # Find the transcription JSON file + for file in files_data.get("values", []): + if file.get("kind") == "Transcription": + content_url = file.get("links", {}).get("contentUrl") + if content_url: + # Download the actual transcription content + content_r = requests.get(content_url, timeout=60) + content_r.raise_for_status() + return content_r.json() + + raise RuntimeError("No transcription file found in results") + + except requests.exceptions.RequestException as e: + raise RuntimeError(f"Failed to get transcription result: {str(e)}") + + +# ============================================================================= +# DIRECT EMBEDDING AND INDEXING (BYPASS AZURE FUNCTION) +# ============================================================================= + +def get_embeddings(texts: list) -> list: + """Get embeddings directly from Azure OpenAI.""" + if not AZURE_OPENAI_ENDPOINT or not AZURE_OPENAI_KEY: + raise RuntimeError("Azure OpenAI not configured") + + url = f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{AZURE_OPENAI_DEPLOYMENT}/embeddings?api-version=2024-02-01" + + headers = { + "api-key": AZURE_OPENAI_KEY, + "Content-Type": "application/json" + } + + payload = { + "input": texts, + "model": "text-embedding-3-small" + } + + try: + r = requests.post(url, headers=headers, json=payload, timeout=60) + r.raise_for_status() + result = r.json() + return [item["embedding"] for item in result["data"]] + except Exception as e: + raise RuntimeError(f"Embedding failed: {str(e)}") + + +def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]: + """ + Index segments directly to Azure Cognitive Search. + Bypasses the EmbedAndIndex Azure Function. + """ + if not SEARCH_ENDPOINT or not SEARCH_KEY: + raise RuntimeError("Azure Search not configured") + + # Generate embeddings for all segments + texts = [seg.get("text", "") for seg in segments] + try: + embeddings = get_embeddings(texts) + except Exception as e: + st.warning(f"Embedding failed, indexing without vectors: {e}") + embeddings = [None] * len(segments) + + # Prepare search documents + documents = [] + for i, (seg, embedding) in enumerate(zip(segments, embeddings)): + doc = { + "id": f"{video_id}_{i}", + "video_id": video_id, + "segment_id": seg.get("segment_id", i), + "text": seg.get("text", ""), + "start_ms": seg.get("start_ms", 0), + "end_ms": seg.get("end_ms", 0), + "pred_labels": seg.get("pred_labels", []) + } + if embedding: + doc["embedding"] = embedding + + documents.append(doc) + + # Upload to Azure Search + url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/index?api-version=2024-07-01" + + headers = { + "api-key": SEARCH_KEY, + "Content-Type": "application/json" + } + + payload = { + "value": documents + } + + try: + r = requests.post(url, headers=headers, json=payload, timeout=60) + r.raise_for_status() + return {"indexed": len(documents), "video_id": video_id} + except Exception as e: + raise RuntimeError(f"Indexing failed: {str(e)}") + + +def process_transcription_to_segments(transcription_data: Dict, video_id: str) -> list: + """ + Convert Azure Speech transcription JSON to segments format. + """ + segments = [] + + # Parse phrases/segments from transcription + phrases = transcription_data.get("recognizedPhrases", []) + + for i, phrase in enumerate(phrases): + # Extract timing + offset = phrase.get("offsetInTicks", 0) // 10000 # Convert to ms + duration = phrase.get("durationInTicks", 0) // 10000 + + # Extract text + nbest = phrase.get("nBest", []) + if nbest: + text = nbest[0].get("display", "") + else: + text = "" + + # Create segment + segment = { + "segment_id": i, + "video_id": video_id, + "text": text, + "start_ms": offset, + "end_ms": offset + duration, + "pred_labels": [] # Could add label prediction here + } + + segments.append(segment) + + return segments + + +# ============================================================================= +# STORAGE FUNCTIONS - FIXED UPLOAD +# ============================================================================= + +def generate_video_id(filename: str) -> str: + clean_name = Path(filename).stem + clean_name = re.sub(r'[^\w\s-]', '', clean_name) + clean_name = re.sub(r'[-\s]+', '_', clean_name) + hash_suffix = hashlib.md5(clean_name.encode()).hexdigest()[:8] + return f"vid_{clean_name[:50]}_{hash_suffix}" + + +def test_sas_url(sas_url: str) -> Tuple[bool, str]: + """Test if SAS URL is accessible before sending to Speech API.""" + try: + r = requests.head(sas_url, timeout=10, allow_redirects=True) + if r.status_code == 200: + return True, "SAS URL is accessible" + else: + return False, f"SAS URL returned HTTP {r.status_code}" + except Exception as e: + return False, f"SAS URL test failed: {str(e)}" + + +def upload_to_azure_blob_fixed(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]: + """ + FIXED upload to Azure Blob using REST API. + Corrected string-to-sign format. + """ + if not AZURE_STORAGE_KEY: + return None, "Azure Storage key not configured" + + try: + # Upload URL + url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}" + + # Create date header in the exact format Azure expects + date_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT') + content_length = len(file_bytes) + + # ==================================================================== + # CRITICAL FIX: Correct string-to-sign format for Azure Blob Storage + # Format: VERB\nContent-Encoding\nContent-Language\nContent-Length\n + # Content-MD5\nContent-Type\nDate\nIf-Modified-Since\nIf-Match\n + # If-None-Match\nIf-Unmodified-Since\nRange\n + # CanonicalizedHeaders\nCanonicalizedResource + # ==================================================================== + string_to_sign = ( + f"PUT\n" # HTTP method + f"\n" # Content-Encoding (empty) + f"\n" # Content-Language (empty) + f"{content_length}\n" # Content-Length (REQUIRED - must be exact) + f"\n" # Content-MD5 (empty) + f"application/octet-stream\n" # Content-Type (REQUIRED for PUT) + f"\n" # Date (empty, using x-ms-date instead) + f"\n" # If-Modified-Since (empty) + f"\n" # If-Match (empty) + f"\n" # If-None-Match (empty) + f"\n" # If-Unmodified-Since (empty) + f"\n" # Range (empty) + f"x-ms-blob-type:BlockBlob\n" # CanonicalizedHeaders (sorted alphabetically) + f"x-ms-date:{date_str}\n" + f"x-ms-version:2020-12-06\n" + f"/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}" # CanonicalizedResource + ) + + # Sign with HMAC-SHA256 + account_key = base64.b64decode(AZURE_STORAGE_KEY) + signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest() + signature = base64.b64encode(signed_hmac).decode('utf-8') + + # Build authorization header + auth_header = f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}" + + # Set headers - MUST match what was signed + headers = { + "x-ms-date": date_str, + "x-ms-version": "2020-12-06", + "x-ms-blob-type": "BlockBlob", + "Content-Type": "application/octet-stream", + "Content-Length": str(content_length), + "Authorization": auth_header + } + + # Upload + r = requests.put(url, data=file_bytes, headers=headers, timeout=300) + + if r.status_code not in [201, 200]: + return None, f"Upload failed: HTTP {r.status_code} - {r.text}" + + # Generate SAS token for reading + sas_token = generate_sas_token_fixed(blob_name) + if not sas_token: + return None, "Failed to generate SAS token" + + sas_url = f"{url}?{sas_token}" + + # Test the SAS URL + is_valid, test_msg = test_sas_url(sas_url) + if not is_valid: + return None, f"SAS URL validation failed: {test_msg}" + + return sas_url, None + + except Exception as e: + import traceback + return None, f"Upload error: {str(e)}\n{traceback.format_exc()}" + + +def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> str: + """ + FIXED SAS token generation for Azure Blob - Service SAS format. + """ + if not AZURE_STORAGE_KEY: + return None + + try: + # Set expiry in UTC + expiry = datetime.now(timezone.utc) + timedelta(hours=expiry_hours) + expiry_str = expiry.strftime('%Y-%m-%dT%H:%M:%SZ') + + # Decode account key + account_key = base64.b64decode(AZURE_STORAGE_KEY) + + # ==================================================================== + # CRITICAL FIX: Service SAS string-to-sign format + # Reference: https://docs.microsoft.com/en-us/rest/api/storageservices/create-service-sas + # Format for Blob service SAS: + # StringToSign = signedPermissions + "\n" + + # signedStart + "\n" + + # signedExpiry + "\n" + + # canonicalizedResource + "\n" + + # signedIdentifier + "\n" + + # signedIP + "\n" + + # signedProtocol + "\n" + + # signedVersion + "\n" + + # signedResource + "\n" + + # signedSnapshotTime + "\n" + + # signedEncryptionScope + "\n" + + # signedCacheControl + "\n" + + # signedContentDisposition + "\n" + + # signedContentEncoding + "\n" + + # signedContentLanguage + "\n" + + # signedContentType + # ==================================================================== + + # Canonicalized resource for service SAS: /blob/{account}/{container}/{blob} + canonicalized_resource = f"/blob/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}" + + # Build string to sign for Service SAS + string_to_sign = ( + f"r\n" # signed permissions (read) + f"\n" # signed start (empty) + f"{expiry_str}\n" # signed expiry + f"{canonicalized_resource}\n" # canonicalized resource + f"\n" # signed identifier (empty) + f"\n" # signed IP (empty) + f"https\n" # signed protocol + f"2020-12-06\n" # signed version + f"b\n" # signed resource (b = blob) + f"\n" # signed snapshot time (empty) + f"\n" # signed encryption scope (empty) + f"\n" # signed cache control (empty) + f"\n" # signed content disposition (empty) + f"\n" # signed content encoding (empty) + f"\n" # signed content language (empty) + f"" # signed content type (empty, no newline at end) + ) + + # Sign with HMAC-SHA256 + signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest() + signature = base64.b64encode(signed_hmac).decode('utf-8') + + # Build query parameters - Order matters for some clients + sas_params = { + 'sv': '2020-12-06', # signed version + 'sr': 'b', # signed resource (blob) + 'sp': 'r', # signed permissions (read) + 'se': expiry_str, # signed expiry + 'spr': 'https', # signed protocol + 'sig': signature # signature + } + + # URL encode the signature and other values + sas_token = '&'.join([f"{k}={urllib.parse.quote(v, safe='')}" for k, v in sas_params.items()]) + return sas_token + + except Exception as e: + st.error(f"SAS generation error: {e}") + import traceback + st.error(traceback.format_exc()) + return None + +def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]: + """ + Upload using Azure SDK (more reliable, requires azure-storage-blob package). + """ try: - with st.spinner("Searching..."): - data = call_search_api(payload) + from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions + + connection_string = ( + f"DefaultEndpointsProtocol=https;" + f"AccountName={AZURE_STORAGE_ACCOUNT};" + f"AccountKey={AZURE_STORAGE_KEY};" + f"EndpointSuffix=core.windows.net" + ) + + blob_service = BlobServiceClient.from_connection_string(connection_string) + container_client = blob_service.get_container_client(INPUT_CONTAINER) + + # Ensure container exists + try: + container_client.create_container() + except Exception: + pass + + # Upload blob + blob_client = container_client.get_blob_client(blob_name) + blob_client.upload_blob(file_bytes, overwrite=True) + + # Generate SAS token + sas_token = generate_blob_sas( + account_name=AZURE_STORAGE_ACCOUNT, + container_name=INPUT_CONTAINER, + blob_name=blob_name, + account_key=AZURE_STORAGE_KEY, + permission=BlobSasPermissions(read=True), + expiry=datetime.now(timezone.utc) + timedelta(hours=24), + protocol="https" + ) + + sas_url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}?{sas_token}" + + # Test the SAS URL + is_valid, test_msg = test_sas_url(sas_url) + if not is_valid: + return None, f"SAS URL validation failed: {test_msg}" + + return sas_url, None + + except ImportError: + return None, "azure-storage-blob not installed" except Exception as e: - st.error(f"Search failed: {e}") - st.stop() - - hits = data.get("hits", []) - st.caption(f"Count: {data.get('count')} | Returned: {len(hits)}") - - for i, h in enumerate(hits, start=1): - start_ms = h.get("start_ms", 0) - end_ms = h.get("end_ms", 0) - vid = h.get("video_id", "") - seg = h.get("segment_id", "") - score = h.get("score", None) - - header = f"{i}. {vid} | {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}" - if seg: - header += f" | seg={seg}" - if score is not None: - header += f" | score={score:.3f}" if isinstance(score, (int, float)) else f" | score={score}" - - with st.expander(header, expanded=(i <= 3)): - st.write(h.get("text", "")) - - labels = h.get("pred_labels") or [] - conf = h.get("pred_confidence") - rationale = h.get("pred_rationale") - - if labels or conf is not None or rationale: - st.subheader("Annotations") - if labels: - st.write("**Labels:**", ", ".join(labels)) - if conf is not None: - st.write("**Confidence:**", conf) - if rationale: - st.write("**Rationale:**", rationale) + import traceback + return None, f"SDK upload failed: {str(e)}\n{traceback.format_exc()}" + + +def save_segments_to_blob(video_id: str, segments: list) -> str: + """Save segments JSON to blob storage.""" + if not AZURE_STORAGE_KEY: + raise RuntimeError("Azure Storage key not configured") + + blob_name = f"{video_id}_segments.json" + url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{SEGMENTS_CONTAINER}/{blob_name}" + + json_bytes = json.dumps(segments, indent=2).encode('utf-8') + content_length = len(json_bytes) + + date_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT') + string_to_sign = ( + f"PUT\n" + f"\n" + f"\n" + f"{content_length}\n" + f"\n" + f"application/json\n" + f"\n" + f"\n" + f"\n" + f"\n" + f"\n" + f"\n" + f"x-ms-blob-type:BlockBlob\n" + f"x-ms-date:{date_str}\n" + f"x-ms-version:2020-12-06\n" + f"/{AZURE_STORAGE_ACCOUNT}/{SEGMENTS_CONTAINER}/{blob_name}" + ) + + account_key = base64.b64decode(AZURE_STORAGE_KEY) + signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest() + signature = base64.b64encode(signed_hmac).decode('utf-8') + auth_header = f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}" + + headers = { + "x-ms-date": date_str, + "x-ms-version": "2020-12-06", + "x-ms-blob-type": "BlockBlob", + "Content-Type": "application/json", + "Content-Length": str(content_length), + "Authorization": auth_header + } + + r = requests.put(url, data=json_bytes, headers=headers, timeout=60) + r.raise_for_status() + + return blob_name + + +def check_yt_dlp() -> bool: + try: + result = subprocess.run(["which", "yt-dlp"], capture_output=True, text=True) + return result.returncode == 0 + except: + return False + + +def download_youtube_audio(youtube_url: str, output_path: str, progress_callback=None) -> Tuple[Optional[str], Optional[str]]: + """Download YouTube audio to specific path.""" + if not check_yt_dlp(): + return None, "yt-dlp not installed. Run: pip install yt-dlp" + + if not youtube_url or not youtube_url.strip(): + return None, "YouTube URL is empty" + + try: + cmd = [ + "yt-dlp", + "-f", "bestaudio[ext=m4a]/bestaudio", + "--extract-audio", + "--audio-format", "m4a", + "--audio-quality", "0", + "--no-check-certificate", # Added for compatibility + "--no-warnings", # Reduce noise + "-o", output_path, + youtube_url.strip() + ] + + # Try to use Node.js runtime if available, otherwise let yt-dlp handle it + # This fixes the "No supported JavaScript runtime" error + try: + node_check = subprocess.run(["which", "node"], capture_output=True, text=True) + if node_check.returncode != 0: + # No node.js, try to use legacy format that doesn't require JS + cmd.extend(["--extractor-args", "youtube:player_client=web"]) + except: + pass + + if progress_callback: + progress_callback(15, "Downloading from YouTube...") + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=600) + + if result.returncode != 0: + error_msg = result.stderr[:500] + # Provide helpful error message for JS runtime issues + if "JavaScript runtime" in error_msg: + error_msg += "\n\n💡 Tip: Install Node.js or run: pip install yt-dlp --upgrade" + return None, f"yt-dlp failed: {error_msg}" + + # Find the actual file + if os.path.exists(output_path): + return output_path, None + + # Try alternative extensions + base = output_path.rsplit('.', 1)[0] + for ext in ['.m4a', '.mp3', '.webm', '.opus']: + alt_path = base + ext + if os.path.exists(alt_path): + return alt_path, None + + return None, "Download completed but file not found" + + except subprocess.TimeoutExpired: + return None, "Download timed out after 10 minutes" + except Exception as e: + return None, f"Error: {str(e)}" + + +def detect_url_type(url: str) -> str: + """Detect if URL is YouTube, direct media, or unknown.""" + if not url: + return "unknown" + + url_lower = str(url).lower().strip() + + # YouTube patterns + youtube_patterns = [ + r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)', + r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=', + r'(?:https?:\/\/)?(?:www\.)?youtu\.be\/', + r'youtube\.com\/shorts\/' + ] + + for pattern in youtube_patterns: + if re.search(pattern, url_lower): + return "youtube" + + # Direct media patterns + media_extensions = ['.mp4', '.m4a', '.mp3', '.wav', '.mov', '.avi', '.mkv', '.webm'] + if any(url_lower.endswith(ext) for ext in media_extensions): + return "direct" + + # Box.com, Google Drive, Dropbox, etc. - treat as direct + cloud_patterns = ['box.com', 'drive.google.com', 'dropbox.com', 'onedrive'] + if any(pattern in url_lower for pattern in cloud_patterns): + return "direct" + + return "unknown" + + +def process_single_video(url: str, custom_id: Optional[str] = None, + progress_bar=None, status_text=None, + overall_progress: Tuple[int, int] = (0, 1)) -> Dict[str, Any]: + """ + Process a single video URL (YouTube or Direct). + Returns result dict with status and metadata. + """ + result = { + "url": url, + "video_id": None, + "status": "pending", + "segments_count": 0, + "error": None, + "index_status": None + } + + try: + # Detect URL type + url_type = detect_url_type(url) + + if url_type == "unknown": + result["status"] = "failed" + result["error"] = "Unknown URL type. Must be YouTube or direct media URL." + return result + + # Generate video ID + if custom_id: + video_id = custom_id.strip() + else: + video_id = generate_video_id(f"batch_{url}") + + result["video_id"] = video_id + + # Update progress + current, total = overall_progress + base_progress = int((current / total) * 100) if progress_bar else 0 + + if status_text: + status_text.text(f"[{current}/{total}] Processing: {video_id}") + + media_url = None + + # Handle YouTube + if url_type == "youtube": + if not check_yt_dlp(): + result["status"] = "failed" + result["error"] = "yt-dlp not installed" + return result + + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + if status_text: + status_text.text(f"[{current}/{total}] Downloading from YouTube...") + + output_path = f"{tmpdir}/youtube_{video_id}.m4a" + downloaded_path, error = download_youtube_audio(url.strip(), output_path) + + if error: + result["status"] = "failed" + result["error"] = f"Download failed: {error}" + return result + + # Read and upload + with open(downloaded_path, 'rb') as f: + file_bytes = f.read() + + blob_name = f"batch_youtube_{video_id}_{int(time.time())}.m4a" + + if status_text: + status_text.text(f"[{current}/{total}] Uploading to Azure...") + + # Try SDK first + sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name) + if error and ("not installed" in error or "SDK" in error): + sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name) + + if error: + result["status"] = "failed" + result["error"] = f"Upload failed: {error}" + return result + + media_url = sas_url + + # Handle Direct URL + elif url_type == "direct": + media_url = url.strip() + if status_text: + status_text.text(f"[{current}/{total}] Using direct URL...") + + if not media_url: + result["status"] = "failed" + result["error"] = "No media URL available" + return result + + # Submit transcription + if status_text: + status_text.text(f"[{current}/{total}] Submitting to Speech API...") + + submit_result = submit_transcription_direct(video_id, media_url) + operation_url = submit_result.get("operation_url") + + if not operation_url: + result["status"] = "failed" + result["error"] = "No operation URL returned" + return result + + # Poll for completion + max_polls = 120 + transcription_data = None + + for i in range(max_polls): + time.sleep(POLL_SECONDS) + poll_result = poll_transcription_operation(operation_url) + status = poll_result.get("status", "unknown") + + # Update progress during polling + if progress_bar: + poll_progress = min(int((i / max_polls) * 20), 20) # 20% of progress for polling + overall = base_progress + int((1 / total) * 80) + int((1 / total) * poll_progress) + progress_bar.progress(min(overall, 99)) + + if status.lower() == "succeeded": + transcription_data = get_transcription_from_result(poll_result) + break + elif status.lower() == "failed": + error_msg = poll_result.get("properties", {}).get("error", {}).get("message", "Unknown error") + result["status"] = "failed" + result["error"] = f"Transcription failed: {error_msg}" + return result + + if not transcription_data: + result["status"] = "failed" + result["error"] = "Transcription timed out" + return result + + # Process segments + if status_text: + status_text.text(f"[{current}/{total}] Processing segments...") + + segments = process_transcription_to_segments(transcription_data, video_id) + result["segments_count"] = len(segments) + + # Save to blob + save_segments_to_blob(video_id, segments) + + # Index to search + try: + index_result = index_segments_direct(video_id, segments) + result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents" + except Exception as e: + result["index_status"] = f"Indexing skipped: {str(e)}" + + result["status"] = "success" + + except Exception as e: + result["status"] = "failed" + result["error"] = str(e) + import traceback + result["error"] += f"\n{traceback.format_exc()}" + + return result + + +# ============================================================================= +# PAGE 1: SEARCH +# ============================================================================= + +if page == "🔎 Search Segments": + st.header("Search Indexed Video Segments") + + if not SEARCH_FN_URL: + st.error("SEARCH_FN_URL not configured. Cannot search.") + else: + col1, col2 = st.columns([3, 1]) + with col1: + q = st.text_input("Query", placeholder="e.g., measles vaccine side effects") + with col2: + video_id_filter = st.text_input("Filter by video_id (optional)") + + if st.button("Search", type="primary", disabled=(not q.strip())): + try: + payload = {"q": q.strip(), "mode": mode, "top": top} + if mode in ("hybrid", "vector"): + payload["k"] = k + if video_id_filter.strip(): + payload["video_id"] = video_id_filter.strip() + + with st.spinner("Searching..."): + data = call_api(SEARCH_FN_URL, payload) + + hits = data.get("hits", []) + st.caption(f"Found {data.get('count', 0)} total | Showing {len(hits)}") + + for i, h in enumerate(hits, start=1): + start_ms, end_ms = h.get("start_ms", 0), h.get("end_ms", 0) + vid, seg, score = h.get("video_id", ""), h.get("segment_id", ""), h.get("score") + + header = f"{i}. {vid} | {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}" + if seg: + header += f" | seg={seg}" + if score is not None: + header += f" | score={score:.3f}" if isinstance(score, (int, float)) else f" | score={score}" + + with st.expander(header, expanded=(i <= 3)): + st.write(h.get("text", "")) + if h.get("pred_labels"): + st.caption(f"Labels: {', '.join(h['pred_labels'])}") + except Exception as e: + st.error(f"Search failed: {e}") + + +# ============================================================================= +# PAGE 2: UPLOAD (DIRECT API VERSION) +# ============================================================================= + +elif page == "⬆️ Upload & Transcribe": + st.header("Upload Video for Transcription") + st.info(" Using direct Azure Speech API (bypassing Azure Function)") + + # Check Azure config + azure_configured = bool(AZURE_STORAGE_KEY) and bool(SPEECH_KEY) + if not azure_configured: + st.error("⚠️ Azure Storage and Speech keys required. Check .env file.") + + source_type = st.radio("Select Source", + ["File Upload", "Direct URL", "YouTube", "📁 Batch CSV Upload"], + horizontal=True) + + media_url = None + video_id = None + file_bytes = None + yt_url = None # Initialize to None + csv_df = None + + # ------------------------------------------------------------------------- + # FILE UPLOAD + # ------------------------------------------------------------------------- + if source_type == "File Upload": + if not azure_configured: + st.info("Please configure Azure Storage to enable file upload") + else: + uploaded_file = st.file_uploader( + "Choose video/audio file", + type=["mp4", "avi", "mov", "mkv", "m4a", "mp3", "wav"], + accept_multiple_files=False + ) + + if uploaded_file: + st.success(f"📁 {uploaded_file.name} ({uploaded_file.size / 1024 / 1024:.1f} MB)") + file_bytes = uploaded_file.getvalue() + video_id = generate_video_id(uploaded_file.name) + st.info("File ready for upload to Azure") + + # ------------------------------------------------------------------------- + # DIRECT URL + # ------------------------------------------------------------------------- + elif source_type == "Direct URL": + url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/ ...") + + if url_input.strip(): + media_url = url_input.strip() + video_id = generate_video_id(url_input) + st.success("✅ URL validated") + + # ------------------------------------------------------------------------- + # YOUTUBE - FIXED with session state + # ------------------------------------------------------------------------- + elif source_type == "YouTube": + # Use session state to persist the URL + yt_url = st.text_input( + "YouTube URL", + placeholder="https://youtube.com/watch?v= ...", + value=st.session_state.yt_url_value, + key="yt_url_input" + ) + + # Update session state when URL changes - FIXED: removed experimental_rerun + if yt_url != st.session_state.yt_url_value: + st.session_state.yt_url_value = yt_url + # Use st.rerun() instead of st.experimental_rerun() for newer Streamlit versions + try: + st.rerun() + except AttributeError: + # Fallback for older versions + try: + st.experimental_rerun() + except AttributeError: + pass # If neither exists, just continue without rerun + + if not check_yt_dlp(): + st.warning("yt-dlp not installed") + if st.button("Install yt-dlp"): + with st.spinner("Installing..."): + subprocess.run(["pip", "install", "-q", "yt-dlp"]) + # FIXED: Use st.rerun() instead of experimental_rerun + try: + st.rerun() + except AttributeError: + try: + st.experimental_rerun() + except AttributeError: + st.info("Please refresh the page manually") + elif yt_url and yt_url.strip(): + video_id = generate_video_id(f"yt_{yt_url.strip()}") + st.success("YouTube URL ready") + + # ------------------------------------------------------------------------- + # BATCH CSV UPLOAD - NEW FEATURE + # ------------------------------------------------------------------------- + elif source_type == "📁 Batch CSV Upload": + st.subheader("📁 Batch Process Videos from CSV") + + csv_file = st.file_uploader( + "Upload CSV file", + type=["csv"], + help="CSV must contain a column with video URLs (YouTube or direct links)" + ) + + if csv_file: + try: + # Read CSV - handle various formats + # Try to detect if URLs are in header or rows + content = csv_file.read().decode('utf-8') + csv_file.seek(0) # Reset pointer + + # First attempt: standard read + try: + csv_df = pd.read_csv(csv_file) + except Exception: + # Second attempt: maybe single column with no header + csv_file.seek(0) + csv_df = pd.read_csv(csv_file, header=None) + csv_df.columns = [f"column_{i}" for i in range(len(csv_df.columns))] + + # Check if column names look like URLs (common issue) + url_like_columns = [] + for col in csv_df.columns: + col_str = str(col).strip() + if detect_url_type(col_str) != "unknown" or col_str.startswith('http'): + url_like_columns.append(col) + + # If column names look like URLs, treat them as data + if url_like_columns and len(csv_df.columns) == 1: + # The column name is actually a URL, convert to data + url_col_name = csv_df.columns[0] + new_row = {url_col_name: url_col_name} + csv_df = pd.concat([pd.DataFrame([new_row]), csv_df], ignore_index=True) + + st.success(f"✅ Loaded CSV with {len(csv_df)} rows and {len(csv_df.columns)} columns") + + # Show available columns + st.write("**Available columns:**", list(csv_df.columns)) + + # Let user select the URL column + url_column = st.selectbox( + "Select column containing video URLs", + options=csv_df.columns.tolist(), + help="Choose the column that contains YouTube or direct media URLs" + ) + + # Optional: Select custom ID column + id_column_options = ["Auto-generate"] + [c for c in csv_df.columns if c != url_column] + id_column = st.selectbox( + "Select column for custom Video ID (optional)", + options=id_column_options, + index=0, + help="Optional: Choose a column to use as custom video ID (e.g., title, ID field)" + ) + + # Extract and validate URLs + urls_raw = csv_df[url_column].dropna().astype(str).tolist() + + # Clean URLs (remove whitespace) + urls_to_process = [u.strip() for u in urls_raw if u.strip()] + + # Preview + with st.expander(f"Preview URLs to process ({len(urls_to_process)} found)"): + for i, url in enumerate(urls_to_process[:10], 1): + url_type = detect_url_type(url) + icon = "🎬" if url_type == "youtube" else "📄" if url_type == "direct" else "❓" + st.text(f"{i}. {icon} {url[:80]}...") + if len(urls_to_process) > 10: + st.caption(f"... and {len(urls_to_process) - 10} more") + + # Validate URLs + valid_urls = [] + invalid_urls = [] + + for url in urls_to_process: + url_type = detect_url_type(str(url)) + if url_type in ["youtube", "direct"]: + valid_urls.append(url) + else: + invalid_urls.append(url) + + col1, col2, col3 = st.columns(3) + col1.metric("Total URLs", len(urls_to_process)) + col2.metric("✅ Valid", len(valid_urls), f"{len(valid_urls)/len(urls_to_process)*100:.1f}%" if urls_to_process else "0%") + col3.metric("❌ Invalid", len(invalid_urls)) + + if invalid_urls: + with st.expander(f"Show {len(invalid_urls)} invalid URLs"): + for url in invalid_urls[:10]: + st.text(f"❌ {url[:100]}...") + + # Store in session state for processing + st.session_state['batch_urls'] = valid_urls + st.session_state['batch_df'] = csv_df + st.session_state['batch_url_column'] = url_column + st.session_state['batch_id_column'] = id_column + + except Exception as e: + st.error(f"Error reading CSV: {e}") + import traceback + st.error(traceback.format_exc()) + + # Custom ID (for single uploads) + custom_id = st.text_input("Custom Video ID (optional)") + if custom_id.strip() and source_type != "📁 Batch CSV Upload": + video_id = custom_id.strip() + + # Determine if we can process + can_process = False + if source_type == "File Upload": + can_process = file_bytes is not None and azure_configured + elif source_type == "Direct URL": + can_process = media_url is not None and len(str(media_url).strip()) > 0 + elif source_type == "YouTube": + yt_url_to_check = st.session_state.get('yt_url_value', '') or (yt_url if yt_url else '') + can_process = len(str(yt_url_to_check).strip()) > 0 and check_yt_dlp() + elif source_type == "📁 Batch CSV Upload": + can_process = (st.session_state.get('batch_urls') and + len(st.session_state.get('batch_urls', [])) > 0 and + azure_configured and + not st.session_state.get('batch_processing', False)) + + # Process button + button_text = " Start Transcription" + if source_type == "📁 Batch CSV Upload": + count = len(st.session_state.get('batch_urls', [])) + button_text = f" Process {count} Videos from CSV" + + if st.button(button_text, type="primary", disabled=not can_process): + + # --------------------------------------------------------------------- + # BATCH PROCESSING + # --------------------------------------------------------------------- + if source_type == "📁 Batch CSV Upload": + st.session_state.batch_processing = True + st.session_state.batch_results = [] + + urls = st.session_state.get('batch_urls', []) + csv_df = st.session_state.get('batch_df') + url_column = st.session_state.get('batch_url_column') + id_column = st.session_state.get('batch_id_column') + + total = len(urls) + + st.info(f"Starting batch processing of {total} videos...") + + # Create progress containers + overall_progress = st.progress(0) + status_text = st.empty() + results_container = st.container() + + # Process each URL + results = [] + for idx, url in enumerate(urls, 1): + # Get custom ID if specified + custom_vid_id = None + if id_column != "Auto-generate": + # Find the row with this URL and get the ID + row = csv_df[csv_df[url_column] == url] + if not row.empty: + custom_vid_id = str(row[id_column].iloc[0]) + # Sanitize ID + custom_vid_id = re.sub(r'[^\w\s-]', '', custom_vid_id).strip().replace(' ', '_')[:50] + + # Process video + result = process_single_video( + url=url, + custom_id=custom_vid_id, + progress_bar=overall_progress, + status_text=status_text, + overall_progress=(idx, total) + ) + + results.append(result) + st.session_state.batch_results = results + + # Update progress + progress_pct = int((idx / total) * 100) + overall_progress.progress(progress_pct) + + # Show result in container + with results_container: + if result['status'] == 'success': + st.success(f"✅ [{idx}/{total}] {result['video_id']}: {result['segments_count']} segments") + else: + error_msg = result.get('error', 'Unknown error') + # Truncate long error messages + if len(error_msg) > 200: + error_msg = error_msg[:200] + "..." + st.error(f"❌ [{idx}/{total}] Failed: {error_msg}") + + # Small delay to prevent rate limiting + time.sleep(1) + + # Final summary + overall_progress.progress(100) + status_text.text("Batch processing complete!") + + successful = [r for r in results if r['status'] == 'success'] + failed = [r for r in results if r['status'] == 'failed'] + + st.markdown("---") + st.subheader("📊 Batch Processing Summary") + + col1, col2, col3 = st.columns(3) + col1.metric("Total", total) + col2.metric("Successful", len(successful), f"{len(successful)/total*100:.1f}%" if total > 0 else "0%") + col3.metric("Failed", len(failed), f"{len(failed)/total*100:.1f}%" if total > 0 else "0%") + + # Detailed results table + with st.expander("View Detailed Results"): + results_df = pd.DataFrame([ + { + 'Video ID': r['video_id'], + 'URL': r['url'][:50] + "..." if len(r['url']) > 50 else r['url'], + 'Status': r['status'], + 'Segments': r.get('segments_count', 0), + 'Indexing': r.get('index_status', 'N/A'), + 'Error': (r.get('error', '')[:100] + '...') if r.get('error') else '' + } + for r in results + ]) + st.dataframe(results_df) + + # Download results as CSV + csv_buffer = io.StringIO() + results_df.to_csv(csv_buffer, index=False) + st.download_button( + "Download Results CSV", + csv_buffer.getvalue(), + "batch_processing_results.csv", + "text/csv" + ) + + # Search hint + if successful: + st.info("💡 **Search processed videos using:**") + video_ids = [r['video_id'] for r in successful[:5]] + st.code(f"video_id:({' OR '.join(video_ids)})") + + st.session_state.batch_processing = False + + else: + # ----------------------------------------------------------------- + # SINGLE VIDEO PROCESSING (Original logic) + # ----------------------------------------------------------------- + progress_bar = st.progress(0) + status = st.empty() + + try: + # ------------------------------------------------------------- + # HANDLE FILE UPLOAD (Direct to Azure) + # ------------------------------------------------------------- + if source_type == "File Upload" and file_bytes: + progress_bar.progress(10) + status.text("Uploading to Azure Blob...") + + blob_name = f"upload_{video_id}_{int(time.time())}.m4a" + + # Try SDK method first, fallback to fixed REST method + sas_url = None + error = None + + try: + sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name) + except Exception as e: + error = str(e) + + if error and ("not installed" in error or "SDK" in error): + st.info("Using REST API for upload...") + sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name) + + if error: + raise Exception(error) + + if not sas_url: + raise Exception("Failed to generate SAS URL") + + media_url = sas_url + progress_bar.progress(50) + status.text("Upload complete, starting transcription...") + + # ------------------------------------------------------------- + # HANDLE YOUTUBE (Download then Upload) + # ------------------------------------------------------------- + elif source_type == "YouTube": + # Get URL from session state + yt_url = st.session_state.get('yt_url_value', '') + + if not yt_url or not yt_url.strip(): + raise Exception("YouTube URL is empty. Please enter a valid YouTube URL.") + + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + progress_bar.progress(10) + status.text("Downloading from YouTube...") + + output_path = f"{tmpdir}/youtube_{video_id}.m4a" + downloaded_path, error = download_youtube_audio( + yt_url.strip(), + output_path, + lambda p, m: (progress_bar.progress(p), status.text(m)) + ) + + if error: + raise Exception(error) + + progress_bar.progress(50) + status.text("Uploading to Azure Blob...") + + # Read file and upload + with open(downloaded_path, 'rb') as f: + file_bytes = f.read() + + blob_name = f"youtube_{video_id}_{int(time.time())}.m4a" + + # Try SDK first, fallback to fixed REST + sas_url = None + error = None + + try: + sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name) + except Exception as e: + error = str(e) + + if error and ("not installed" in error or "SDK" in error): + st.info("Using REST API for upload...") + sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name) + + if error: + raise Exception(error) + + if not sas_url: + raise Exception("Failed to generate SAS URL") + + media_url = sas_url + progress_bar.progress(75) + status.text("Processing with Azure Speech...") + + # ------------------------------------------------------------- + # TRANSCRIBE (All paths lead here) + # ------------------------------------------------------------- + if not media_url: + raise Exception("No media URL available") + + # Submit directly to Azure Speech API + status.text("Submitting to Azure Speech-to-Text...") + result = submit_transcription_direct(video_id, media_url) + operation_url = result.get("operation_url") + + if not operation_url: + raise Exception("No operation URL returned") + + # Debug info + st.info(f"Debug: Operation URL received") + + # Poll + max_polls = 120 + transcription_data = None + + for i in range(max_polls): + time.sleep(POLL_SECONDS) + poll_result = poll_transcription_operation(operation_url) + status_text = poll_result.get("status", "unknown") + + progress = min(75 + int((i / max_polls) * 20), 95) + progress_bar.progress(progress) + status.text(f"Transcribing... ({i * POLL_SECONDS // 60} min) - Status: {status_text}") + + if status_text.lower() == "succeeded": + status.text("Transcription complete, retrieving results...") + transcription_data = get_transcription_from_result(poll_result) + break + + elif status_text.lower() == "failed": + error_msg = poll_result.get("properties", {}).get("error", {}).get("message", "Unknown error") + raise Exception(f"Transcription failed: {error_msg}") + + if not transcription_data: + raise Exception("Transcription timed out") + + # ------------------------------------------------------------- + # PROCESS & INDEX (DIRECT) + # ------------------------------------------------------------- + progress_bar.progress(98) + status.text("Processing segments and indexing...") + + # Convert to segments + segments = process_transcription_to_segments(transcription_data, video_id) + + # Save to blob + blob_name = save_segments_to_blob(video_id, segments) + + # Index to search + try: + index_result = index_segments_direct(video_id, segments) + index_msg = f"Indexed: {index_result.get('indexed', 0)} documents" + except Exception as e: + index_msg = f"Indexing skipped: {e}" + + progress_bar.progress(100) + status.text("Complete!") + + st.success(f""" + ✅ **Transcription Complete!** + - Video ID: {video_id} + - Segments: {len(segments)} + - {index_msg} + """) + st.code(f'Search: video_id:{video_id}') + + # Show sample segments + with st.expander("View first 5 segments"): + for seg in segments[:5]: + st.write(f"**{ms_to_ts(seg['start_ms'])} - {ms_to_ts(seg['end_ms'])}:** {seg['text'][:100]}...") + + except Exception as e: + st.error(f"❌ Error: {str(e)}") + st.exception(e) + + # Debug info + if 'debug_poll_url' in st.session_state: + st.error(f"Debug - Poll URL used: {st.session_state['debug_poll_url']}") + + +# Footer +st.sidebar.markdown("---") +st.sidebar.caption("Video Annotation Platform v1.0 - Direct API Mode") \ No newline at end of file From 5826913d7d9aeb25a5c743b5e34c47f8ce43e556 Mon Sep 17 00:00:00 2001 From: Martin Nwadiugwu Date: Sun, 15 Feb 2026 09:43:08 -0600 Subject: [PATCH 2/8] Add batch CSV upload feature + UI updates (multiple upload) --- ..._folder_manifest.py.backup.20260205_170952 | 167 ++++++++ ui/host.json | 8 + ui/ui_search1.py | 125 ++++++ ui/ui_search2.py | 372 ++++++++++++++++++ 4 files changed, 672 insertions(+) create mode 100644 scripts/box_shared_folder_manifest.py.backup.20260205_170952 create mode 100644 ui/host.json create mode 100644 ui/ui_search1.py create mode 100644 ui/ui_search2.py diff --git a/scripts/box_shared_folder_manifest.py.backup.20260205_170952 b/scripts/box_shared_folder_manifest.py.backup.20260205_170952 new file mode 100644 index 0000000..310df6e --- /dev/null +++ b/scripts/box_shared_folder_manifest.py.backup.20260205_170952 @@ -0,0 +1,167 @@ +""" +scripts/box_shared_folder_manifest.py - Generate Video Manifest from Box + +This script enumerates .m4a video files from a Box shared folder and generates +a manifest file (videos.jsonl) that lists all videos with their IDs and media URLs. +It creates open shared links for each file so Azure Speech Service can access them. + +Architecture Role: +- Pre-processing step before video ingestion +- Generates videos.jsonl input file for import_videos.py +- Handles Box API authentication and folder traversal +- Creates publicly accessible download URLs for Speech Service + +Usage: + python scripts/box_shared_folder_manifest.py + +Output: + - videos.jsonl: One JSON object per line with video_id and media_url + +Configuration (via .env): + - BOX_SHARED_FOLDER_URL: Box shared folder link + - BOX_TOKEN or BOX_ACCESS_TOKEN/BOX_REFRESH_TOKEN: Box authentication + - OUT_PATH: Output file path (default: videos.jsonl) + - RECURSIVE: Whether to traverse subfolders (default: 1) +""" + +import json +import os +import requests +from typing import Dict, Any, List, Optional + +from box_auth import get_access_token + +BOX_API = "https://api.box.com/2.0" + +SHARED_FOLDER_URL = os.environ["BOX_SHARED_FOLDER_URL"] # https://...box.com/s/ +print("BOX_SHARED_FOLDER_URL =", SHARED_FOLDER_URL) +OUT_PATH = os.environ.get("OUT_PATH", "videos.jsonl") +RECURSIVE = os.environ.get("RECURSIVE", "1") == "1" + + +def shared_headers(token: str) -> Dict[str, str]: + return { + "Authorization": f"Bearer {token}", + "BoxApi": f"shared_link={SHARED_FOLDER_URL}", + "Content-Type": "application/json", + } + + +def auth_headers(token: str) -> Dict[str, str]: + return { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + } + + +def resolve_shared_folder(token: str) -> Dict[str, Any]: + url = f"{BOX_API}/shared_items" + h = shared_headers(token) + r = requests.get(url, headers=h, timeout=30) + if r.status_code != 200: + raise RuntimeError(f"{r.status_code} {r.text} (headers sent: {h.get('BoxApi')})") + return r.json() + + + +def list_folder_items(token: str, folder_id: str, limit: int = 1000, offset: int = 0) -> Dict[str, Any]: + url = f"{BOX_API}/folders/{folder_id}/items" + params = {"limit": limit, "offset": offset} + r = requests.get(url, headers=shared_headers(token), params=params, timeout=30) + r.raise_for_status() + return r.json() + + +def ensure_open_shared_link_for_file(token: str, file_id: str) -> str: + """ + Ensure file has an open shared link and return a direct-download URL. + """ + url = f"{BOX_API}/files/{file_id}" + payload = {"shared_link": {"access": "open"}} + params = {"fields": "shared_link"} # IMPORTANT: ask for shared_link back + + r = requests.put(url, headers=auth_headers(token), params=params, json=payload, timeout=30) + r.raise_for_status() + data = r.json() + + sl = data.get("shared_link") or {} + # print("shared_link =", json.dumps(sl, indent=2)) + # Prefer direct static download URL + dl = sl.get("download_url") + if dl: + return dl + + # Fallback: at least return the shared link (may require cookies) + if sl.get("url"): + return sl["url"] + + raise RuntimeError(f"No shared_link returned for file {file_id}: {data}") + + + +def walk(token: str, folder_id: str) -> List[Dict[str, Any]]: + items: List[Dict[str, Any]] = [] + offset = 0 + limit = 1000 + while True: + page = list_folder_items(token, folder_id, limit=limit, offset=offset) + entries = page.get("entries", []) + items.extend(entries) + total = page.get("total_count", 0) + offset += len(entries) + if offset >= total or not entries: + break + return items + + +def main(): + token = get_access_token() + + shared_folder = resolve_shared_folder(token) + if shared_folder.get("type") != "folder": + raise RuntimeError(f"Shared link did not resolve to a folder: {shared_folder.get('type')}") + root_id = shared_folder["id"] + + queue = [root_id] + out_count = 0 + + with open(OUT_PATH, "w", encoding="utf-8") as f: + while queue: + fid = queue.pop(0) + entries = walk(token, fid) + + for e in entries: + et = e.get("type") + name = (e.get("name") or "") + lname = name.lower() + + if et == "folder" and RECURSIVE: + queue.append(e["id"]) + continue + + if et != "file": + continue + + if not lname.endswith(".m4a"): + continue + + file_id = e["id"] + video_id = f"vid_{file_id}" + + # Make a per-file open shared link (Speech can fetch without auth). + # If your org disallows open links, this will fail — then you’ll need Blob staging. + file_link = ensure_open_shared_link_for_file(token, file_id) + + # Encourage direct download behavior + media_url = file_link # + ("?download=1" if "?" not in file_link else "&download=1") + + f.write(json.dumps({"video_id": video_id, "media_url": media_url}) + "\n") + out_count += 1 + if out_count % 10 == 0: + print(f"Wrote {out_count} entries...") + + print(f"Done. Wrote {out_count} m4a entries to {OUT_PATH}") + + +if __name__ == "__main__": + main() diff --git a/ui/host.json b/ui/host.json new file mode 100644 index 0000000..3369578 --- /dev/null +++ b/ui/host.json @@ -0,0 +1,8 @@ +{ + "version": "2.0", + "isDefaultHostConfig": true, + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + } +} \ No newline at end of file diff --git a/ui/ui_search1.py b/ui/ui_search1.py new file mode 100644 index 0000000..194fdec --- /dev/null +++ b/ui/ui_search1.py @@ -0,0 +1,125 @@ +""" +ui_search.py - Streamlit Web Interface for Video Segment Search + +This Streamlit application provides a user-friendly web interface for searching +indexed video segments. Users can: +- Enter text queries to search across all indexed segments +- Choose search mode (keyword, vector, or hybrid) +- Filter results by video_id and adjust result count +- View segment text with timestamps and relevance scores + +Architecture Role: +- Frontend user interface for the video annotation system +- Deployed as Azure Container App (video-annotator-ui) +- Calls SearchSegments Azure Function for all search operations +- Displays results with formatted timestamps and metadata + +Deployment: + - Local: python -m streamlit run ui_search.py + - Azure: Deployed as Container App (see ui/README.md) + +Configuration (via .env or Container App env vars): + - SEARCH_FN_URL: SearchSegments function endpoint + - DEFAULT_MODE: Default search mode (hybrid/keyword/vector) + - DEFAULT_TOP: Default number of results + - DEFAULT_K: Default vector recall depth +""" + +import os +import requests +import streamlit as st +from dotenv import load_dotenv + +# Load .env locally (Container Apps/App Service will use real env vars) +load_dotenv() + +SEARCH_FN_URL = os.environ["SEARCH_FN_URL"] +DEFAULT_MODE = os.environ.get("DEFAULT_MODE", "hybrid") +DEFAULT_TOP = int(os.environ.get("DEFAULT_TOP", "10")) +DEFAULT_K = int(os.environ.get("DEFAULT_K", "40")) + +st.set_page_config(page_title="Video Segment Search", layout="wide") +st.title("🔎 Search indexed video segments") + +with st.sidebar: + st.header("Search settings") + mode = st.selectbox( + "Mode", + ["keyword", "hybrid", "vector"], + index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE) + if DEFAULT_MODE in ("keyword", "hybrid", "vector") + else 1, + ) + top = st.slider("Top", 1, 50, DEFAULT_TOP) + k = st.slider("Vector k (hybrid/vector)", 5, 200, DEFAULT_K) + video_id_filter = st.text_input("Filter by video_id (optional)", value="") + st.caption("Tip: keep k ~ 4×top for hybrid.") + +q = st.text_input("Query", value="", placeholder="e.g., measles misinformation") +go = st.button("Search", type="primary", disabled=(not q.strip())) + + +def ms_to_ts(ms: int) -> str: + s = max(0, int(ms // 1000)) + m, s = divmod(s, 60) + h, m = divmod(m, 60) + return f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}" + + +def call_search_api(payload: dict) -> dict: + r = requests.post( + SEARCH_FN_URL, + json=payload, + timeout=60, + headers={"Content-Type": "application/json"}, + ) + if r.status_code >= 400: + raise RuntimeError(f"HTTP {r.status_code}: {r.text}") + return r.json() if r.text else {} + + +if go: + payload = {"q": q.strip(), "mode": mode, "top": top} + if mode in ("hybrid", "vector"): + payload["k"] = k + if video_id_filter.strip(): + payload["video_id"] = video_id_filter.strip() + + try: + with st.spinner("Searching..."): + data = call_search_api(payload) + except Exception as e: + st.error(f"Search failed: {e}") + st.stop() + + hits = data.get("hits", []) + st.caption(f"Count: {data.get('count')} | Returned: {len(hits)}") + + for i, h in enumerate(hits, start=1): + start_ms = h.get("start_ms", 0) + end_ms = h.get("end_ms", 0) + vid = h.get("video_id", "") + seg = h.get("segment_id", "") + score = h.get("score", None) + + header = f"{i}. {vid} | {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}" + if seg: + header += f" | seg={seg}" + if score is not None: + header += f" | score={score:.3f}" if isinstance(score, (int, float)) else f" | score={score}" + + with st.expander(header, expanded=(i <= 3)): + st.write(h.get("text", "")) + + labels = h.get("pred_labels") or [] + conf = h.get("pred_confidence") + rationale = h.get("pred_rationale") + + if labels or conf is not None or rationale: + st.subheader("Annotations") + if labels: + st.write("**Labels:**", ", ".join(labels)) + if conf is not None: + st.write("**Confidence:**", conf) + if rationale: + st.write("**Rationale:**", rationale) diff --git a/ui/ui_search2.py b/ui/ui_search2.py new file mode 100644 index 0000000..c81dda1 --- /dev/null +++ b/ui/ui_search2.py @@ -0,0 +1,372 @@ +""" +ui_search.py - Streamlit Web Interface for Video Segment Search & Upload + +This Streamlit application provides: +- Search indexed video segments (keyword, vector, hybrid) +- Upload new videos for transcription and indexing +- View processing status and results + +Architecture: +- Frontend for video annotation system +- Calls SearchSegments, TranscribeHttp, and EmbedAndIndex Azure Functions +- Supports both search and ingest workflows +""" + +import os +import requests +import streamlit as st +import tempfile +import json +import time +from typing import Optional, Dict, Any +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Azure Function URLs +SEARCH_FN_URL = os.environ["SEARCH_FN_URL"] +TRANSCRIBE_URL = os.environ.get("TRANSCRIBE_URL", "") +EMBED_INDEX_URL = os.environ.get("EMBED_INDEX_URL", "") + +# Default settings +DEFAULT_MODE = os.environ.get("DEFAULT_MODE", "hybrid") +DEFAULT_TOP = int(os.environ.get("DEFAULT_TOP", "10")) +DEFAULT_K = int(os.environ.get("DEFAULT_K", "40")) +POLL_SECONDS = int(os.environ.get("POLL_SECONDS", "15")) + +st.set_page_config(page_title="Video Annotation Platform", layout="wide") +st.title("🎬 Video Annotation Platform") + +# Sidebar navigation +with st.sidebar: + st.header("Navigation") + page = st.radio("Select Page", ["🔎 Search Segments", "⬆️ Upload & Transcribe"]) + + st.header("Settings") + if page == "🔎 Search Segments": + mode = st.selectbox( + "Search Mode", + ["keyword", "hybrid", "vector"], + index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE) + if DEFAULT_MODE in ("keyword", "hybrid", "vector") + else 1, + ) + top = st.slider("Results", 1, 50, DEFAULT_TOP) + k = st.slider("Vector k", 5, 200, DEFAULT_K) + st.caption("Tip: keep k ~ 4×top for hybrid") + + +# ============================================================================= +# HELPER FUNCTIONS +# ============================================================================= + +def ms_to_ts(ms: int) -> str: + """Convert milliseconds to timestamp.""" + s = max(0, int(ms // 1000)) + m, s = divmod(s, 60) + h, m = divmod(m, 60) + return f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}" + + +def call_api(url: str, payload: dict, timeout: int = 60) -> dict: + """Generic API call with error handling.""" + if not url: + raise RuntimeError("API URL not configured") + + r = requests.post( + url, + json=payload, + timeout=timeout, + headers={"Content-Type": "application/json"}, + ) + if r.status_code >= 400: + raise RuntimeError(f"HTTP {r.status_code}: {r.text}") + return r.json() if r.text else {} + + +def submit_transcription(video_id: str, media_url: str) -> Dict[str, Any]: + """Submit video for transcription.""" + payload = { + "video_id": video_id, + "media_url": media_url, + "language": "en-US" + } + return call_api(TRANSCRIBE_URL, payload, timeout=60) + + +def poll_transcription(job_url: str) -> Dict[str, Any]: + """Poll transcription job status.""" + r = requests.get(job_url, timeout=30) + r.raise_for_status() + return r.json() + + +def embed_and_index(video_id: str, transcript_data: Dict[str, Any]) -> Dict[str, Any]: + """Send transcript for embedding and indexing.""" + payload = { + "video_id": video_id, + "transcript": transcript_data + } + return call_api(EMBED_INDEX_URL, payload, timeout=60) + + +def process_video_pipeline(video_id: str, media_url: str, progress_bar=None, status_text=None): + """ + Complete pipeline: transcribe -> poll -> embed/index + Returns final status + """ + # Step 1: Submit transcription + if status_text: + status_text.text("Submitting to Azure Speech-to-Text...") + if progress_bar: + progress_bar.progress(10) + + try: + result = submit_transcription(video_id, media_url) + job_url = result.get("job_url") + + if not job_url: + return {"status": "failed", "error": "No job URL returned"} + + # Step 2: Poll for completion + if status_text: + status_text.text("Transcribing audio (this may take several minutes)...") + + max_polls = 120 # 30 minutes max + for i in range(max_polls): + time.sleep(POLL_SECONDS) + + poll_result = poll_transcription(job_url) + status = poll_result.get("status", "unknown").lower() + + if progress_bar: + progress = min(10 + int((i / max_polls) * 70), 80) + progress_bar.progress(progress) + + if status == "succeeded": + if status_text: + status_text.text("Transcription complete! Indexing segments...") + if progress_bar: + progress_bar.progress(85) + + # Step 3: Embed and index + transcript_data = poll_result.get("transcript", {}) + index_result = embed_and_index(video_id, transcript_data) + + if progress_bar: + progress_bar.progress(100) + if status_text: + status_text.text("✅ Complete! Video is now searchable.") + + return { + "status": "completed", + "video_id": video_id, + "segments_indexed": index_result.get("indexed", 0), + "job_url": job_url + } + + elif status == "failed": + error = poll_result.get("error", "Unknown error") + return {"status": "failed", "error": error} + + # Still running, continue polling + if status_text and i % 4 == 0: # Update every minute + status_text.text(f"Transcribing... ({i * POLL_SECONDS // 60} minutes elapsed)") + + # Timeout + return {"status": "timeout", "error": "Transcription timed out after 30 minutes"} + + except Exception as e: + return {"status": "error", "error": str(e)} + + +def generate_video_id(filename: str) -> str: + """Generate unique video ID from filename.""" + import hashlib + clean_name = Path(filename).stem + hash_suffix = hashlib.md5(clean_name.encode()).hexdigest()[:8] + return f"vid_{clean_name[:50]}_{hash_suffix}" + + +# ============================================================================= +# PAGE 1: SEARCH SEGMENTS +# ============================================================================= + +if page == "🔎 Search Segments": + st.header("Search Indexed Video Segments") + + col1, col2 = st.columns([3, 1]) + with col1: + q = st.text_input("Query", value="", placeholder="e.g., measles vaccine side effects") + with col2: + video_id_filter = st.text_input("Filter by video_id (optional)", value="") + + go = st.button("Search", type="primary", disabled=(not q.strip())) + + if go: + payload = {"q": q.strip(), "mode": mode, "top": top} + if mode in ("hybrid", "vector"): + payload["k"] = k + if video_id_filter.strip(): + payload["video_id"] = video_id_filter.strip() + + try: + with st.spinner("Searching..."): + data = call_api(SEARCH_FN_URL, payload) + except Exception as e: + st.error(f"Search failed: {e}") + st.stop() + + hits = data.get("hits", []) + total_count = data.get("count", 0) + + st.caption(f"Found {total_count} total segments | Showing top {len(hits)}") + + if not hits: + st.info("No results found. Try a different query or upload videos first.") + + for i, h in enumerate(hits, start=1): + start_ms = h.get("start_ms", 0) + end_ms = h.get("end_ms", 0) + vid = h.get("video_id", "") + seg = h.get("segment_id", "") + score = h.get("score", None) + + header = f"{i}. {vid} | {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}" + if seg: + header += f" | seg={seg}" + if score is not None: + header += f" | score={score:.3f}" if isinstance(score, (float, int)) else f" | score={score}" + + with st.expander(header, expanded=(i <= 3)): + st.write(h.get("text", "")) + + # Show annotations if present + labels = h.get("pred_labels") or [] + conf = h.get("pred_confidence") + rationale = h.get("pred_rationale") + + if labels or conf is not None or rationale: + st.subheader("Annotations") + cols = st.columns(3) + if labels: + cols[0].metric("Labels", ", ".join(labels)) + if conf is not None: + cols[1].metric("Confidence", f"{conf:.2f}" if isinstance(conf, float) else conf) + if rationale: + cols[2].metric("Rationale", rationale[:100] + "..." if len(str(rationale)) > 100 else rationale) + + +# ============================================================================= +# PAGE 2: UPLOAD & TRANSCRIBE +# ============================================================================= + +elif page == "⬆️ Upload & Transcribe": + st.header("Upload Video for Transcription") + + st.markdown(""" + Upload a video file to: + 1. Extract audio and transcribe using Azure Speech-to-Text + 2. Segment the transcript into searchable chunks + 3. Create vector embeddings and index for semantic search + + Supported formats: MP4, AVI, MOV, MKV, M4A, MP3, WAV + """) + + # File uploader + uploaded_file = st.file_uploader( + "Choose a video or audio file", + type=["mp4", "avi", "mov", "mkv", "m4a", "mp3", "wav"], + accept_multiple_files=False + ) + + # Or provide URL + st.markdown("**OR** provide a media URL:") + media_url_input = st.text_input( + "Media URL (e.g., Box shared link, Azure Blob URL)", + placeholder="https://..." + ) + + # Video ID input (optional) + custom_video_id = st.text_input( + "Custom Video ID (optional)", + placeholder="my_video_001", + help="Leave blank to auto-generate from filename" + ) + + # Process button + process_clicked = st.button( + "🚀 Start Transcription", + type="primary", + disabled=(not uploaded_file and not media_url_input.strip()) + ) + + if process_clicked: + # Determine video ID and media URL + if uploaded_file: + # Save uploaded file temporarily + video_id = custom_video_id or generate_video_id(uploaded_file.name) + + with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp: + tmp.write(uploaded_file.getvalue()) + tmp_path = tmp.name + + st.info(f"📁 File saved temporarily: {tmp_path}") + st.warning("⚠️ Direct file upload requires Azure Blob storage integration. Please use 'Media URL' option with a publicly accessible URL (Box, Azure Blob, etc.) for now.") + + # For now, instruct user to use URL option + st.error("Please use the 'Media URL' option instead. Upload your file to Box or Azure Blob first, then paste the direct download URL.") + + elif media_url_input.strip(): + video_id = custom_video_id or generate_video_id(media_url_input) + media_url = media_url_input.strip() + + st.success(f"🎬 Processing: {video_id}") + st.info(f"URL: {media_url[:80]}...") + + # Progress tracking + progress_bar = st.progress(0) + status_text = st.empty() + + # Run pipeline + result = process_video_pipeline(video_id, media_url, progress_bar, status_text) + + # Display results + if result["status"] == "completed": + st.success(f""" + ✅ **Transcription Complete!** + + - **Video ID**: {result['video_id']} + - **Segments Indexed**: {result['segments_indexed']} + - **Status**: Ready for search + + Go to the **Search** page to query this video's content. + """) + + # Show sample query + st.code(f'Query: "video_id:{video_id}" to see all segments from this video', language="text") + + elif result["status"] == "failed": + st.error(f"❌ **Processing Failed**\n\nError: {result.get('error', 'Unknown error')}") + + elif result["status"] == "timeout": + st.warning(f"⏱️ **Processing Timeout**\n\nThe transcription is taking longer than expected. Check pipeline_state.json for status.") + + else: + st.error(f"⚠️ **Unexpected Error**: {result.get('error', 'Unknown')}") + + +# ============================================================================= +# FOOTER +# ============================================================================= + +st.sidebar.markdown("---") +st.sidebar.caption(""" +**Video Annotation Platform v1.0** + +- Search: Query indexed segments +- Upload: Add new videos via URL +- Azure Speech-to-Text powered +""") \ No newline at end of file From b4e9a9a357289dc5fbd8a13c2b27db2a73c3ab80 Mon Sep 17 00:00:00 2001 From: Martin Nwadiugwu Date: Mon, 16 Feb 2026 19:14:39 -0600 Subject: [PATCH 3/8] Update upload feature + UI updates (multiple uploads) --- ui/ui_search.py | 211 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 192 insertions(+), 19 deletions(-) diff --git a/ui/ui_search.py b/ui/ui_search.py index 26a2ecd..0af5cc7 100644 --- a/ui/ui_search.py +++ b/ui/ui_search.py @@ -67,6 +67,8 @@ st.session_state.batch_results = [] if 'batch_processing' not in st.session_state: st.session_state.batch_processing = False +if 'index_schema_cache' not in st.session_state: + st.session_state.index_schema_cache = None # Sidebar with st.sidebar: @@ -79,6 +81,22 @@ index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE) if DEFAULT_MODE in ("keyword", "hybrid", "vector") else 1) top = st.slider("Results", 1, 50, DEFAULT_TOP) k = st.slider("Vector k", 5, 200, DEFAULT_K) + + # Debug section + st.markdown("---") + if st.button("🔍 Debug Index Schema"): + with st.spinner("Fetching index schema..."): + schema_info = debug_check_index_schema() + if isinstance(schema_info, dict): + st.success(f"Index: {schema_info['index_name']}") + st.write(f"**Key Field:** `{schema_info['key_field']}`") + st.write("**Fields:**") + for field in schema_info['fields']: + key_badge = "🔑 " if field['key'] else "" + st.caption(f"{key_badge}`{field['name']}` ({field['type']})") + st.session_state.index_schema_cache = schema_info + else: + st.error(schema_info) # ============================================================================= @@ -99,6 +117,94 @@ def call_api(url: str, payload: dict, timeout: int = 60) -> dict: return r.json() if r.text else {} +def sanitize_id(id_string: str) -> str: + """ + Sanitize ID to be valid for Azure Search (alphanumeric, hyphens, underscores only). + Document key rules: Cannot start with underscore, max 1024 chars. + """ + if not id_string: + id_string = "unknown" + + # Replace invalid characters with underscore + sanitized = re.sub(r'[^\w\-]', '_', str(id_string)) + + # Ensure it doesn't start with underscore (invalid for Azure Search keys) + if sanitized.startswith('_'): + sanitized = 'id' + sanitized + + # Ensure it doesn't start with dash (also problematic) + if sanitized.startswith('-'): + sanitized = 'id' + sanitized + + # Limit length to 1024 characters (Azure Search limit) + if len(sanitized) > 1024: + # Use hash to ensure uniqueness while truncating + hash_suffix = hashlib.md5(sanitized.encode()).hexdigest()[:16] + sanitized = sanitized[:1000] + "_" + hash_suffix + + return sanitized + + +# ============================================================================= +# AZURE SEARCH SCHEMA FUNCTIONS +# ============================================================================= + +def debug_check_index_schema(): + """Check if your index exists and verify the key field""" + if not SEARCH_ENDPOINT or not SEARCH_KEY or not SEARCH_INDEX_NAME: + return "Search not configured - check SEARCH_ENDPOINT, SEARCH_KEY, and SEARCH_INDEX_NAME" + + url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version=2024-07-01" + headers = {"api-key": SEARCH_KEY} + + try: + r = requests.get(url, headers=headers, timeout=30) + if r.status_code == 200: + schema = r.json() + key_field = None + fields_info = [] + + for field in schema.get("fields", []): + field_info = { + "name": field.get("name"), + "type": field.get("type"), + "key": field.get("key", False), + "searchable": field.get("searchable", False), + "filterable": field.get("filterable", False), + "sortable": field.get("sortable", False), + "facetable": field.get("facetable", False), + "retrievable": field.get("retrievable", False) + } + fields_info.append(field_info) + + if field.get("key", False): + key_field = field.get("name") + + result = { + "index_name": schema.get("name"), + "key_field": key_field, + "fields": fields_info + } + return result + else: + return f"Index check failed: HTTP {r.status_code} - {r.text[:500]}" + except Exception as e: + return f"Error checking index: {str(e)}" + + +def get_index_schema(): + """Get cached schema or fetch new one""" + if st.session_state.index_schema_cache: + return st.session_state.index_schema_cache + + schema_info = debug_check_index_schema() + if isinstance(schema_info, dict): + st.session_state.index_schema_cache = schema_info + return schema_info + else: + raise RuntimeError(f"Cannot fetch index schema: {schema_info}") + + # ============================================================================= # DIRECT AZURE SPEECH API FUNCTIONS (BYPASS AZURE FUNCTION) # ============================================================================= @@ -259,11 +365,23 @@ def get_embeddings(texts: list) -> list: def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]: """ Index segments directly to Azure Cognitive Search. - Bypasses the EmbedAndIndex Azure Function. + + CRITICAL: Automatically detects the key field from index schema instead of assuming 'id' """ if not SEARCH_ENDPOINT or not SEARCH_KEY: raise RuntimeError("Azure Search not configured") + # Get the key field name from the index schema + schema_info = get_index_schema() + key_field = schema_info.get("key_field") + + if not key_field: + available = [f.get("name") for f in schema_info.get("fields", [])] + raise RuntimeError(f"No key field found in index. Available fields: {available}") + + # Get list of available fields to ensure we only send existing fields + available_fields = {f.get("name") for f in schema_info.get("fields", [])} + # Generate embeddings for all segments texts = [seg.get("text", "") for seg in segments] try: @@ -275,17 +393,44 @@ def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]: # Prepare search documents documents = [] for i, (seg, embedding) in enumerate(zip(segments, embeddings)): + safe_video_id = sanitize_id(video_id) + doc_id = f"{safe_video_id}_{i}" + + # Build document dynamically based on what fields actually exist in the index doc = { - "id": f"{video_id}_{i}", - "video_id": video_id, - "segment_id": seg.get("segment_id", i), - "text": seg.get("text", ""), - "start_ms": seg.get("start_ms", 0), - "end_ms": seg.get("end_ms", 0), - "pred_labels": seg.get("pred_labels", []) + "@search.action": "upload" + } + + # Add the key field (whatever it's actually named in your index) + doc[key_field] = doc_id + + # Map of our field names to potential index field names + field_mappings = { + "video_id": safe_video_id, + "segment_id": str(seg.get("segment_id", i)), + "text": str(seg.get("text", "")), + "start_ms": int(seg.get("start_ms", 0)), + "end_ms": int(seg.get("end_ms", 0)), + "pred_labels": seg.get("pred_labels", []) if seg.get("pred_labels") else [] } - if embedding: - doc["embedding"] = embedding + + # Only add fields that exist in the index schema + for field_name, value in field_mappings.items(): + if field_name in available_fields: + doc[field_name] = value + + # Handle embedding field - check for common naming variations + embedding_field = None + for possible_name in ["embedding", "embeddings", "vector", "vectors"]: + if possible_name in available_fields: + embedding_field = possible_name + break + + if embedding and isinstance(embedding, list) and len(embedding) > 0 and embedding_field: + try: + doc[embedding_field] = [float(x) for x in embedding] + except (ValueError, TypeError): + st.warning(f"Skipping embedding for segment {i} due to conversion error") documents.append(doc) @@ -303,8 +448,36 @@ def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]: try: r = requests.post(url, headers=headers, json=payload, timeout=60) - r.raise_for_status() - return {"indexed": len(documents), "video_id": video_id} + + if r.status_code >= 400: + error_detail = "" + try: + error_json = r.json() + error_detail = json.dumps(error_json, indent=2) + except: + error_detail = r.text + + raise RuntimeError(f"Indexing failed: HTTP {r.status_code}\nDetails: {error_detail}") + + result = r.json() + + # Check for partial failures (207 Multi-Status) + if r.status_code == 207: + failed_docs = [item for item in result.get("value", []) if not item.get("status", False)] + if failed_docs: + st.warning(f"Partial indexing failure: {len(failed_docs)} documents failed") + for fail in failed_docs[:3]: + st.error(f"Doc {fail.get('key', 'unknown')}: {fail.get('errorMessage', 'Unknown error')}") + + return { + "indexed": len(documents), + "video_id": video_id, + "key_field_used": key_field, + "api_response": result + } + + except requests.exceptions.HTTPError as e: + raise RuntimeError(f"HTTP Error: {str(e)}") except Exception as e: raise RuntimeError(f"Indexing failed: {str(e)}") @@ -471,7 +644,7 @@ def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> str: # ==================================================================== # CRITICAL FIX: Service SAS string-to-sign format - # Reference: https://docs.microsoft.com/en-us/rest/api/storageservices/create-service-sas + # Reference: https://docs.microsoft.com/en-us/rest/api/storageservices/create-service-sas # Format for Blob service SAS: # StringToSign = signedPermissions + "\n" + # signedStart + "\n" + @@ -894,9 +1067,9 @@ def process_single_video(url: str, custom_id: Optional[str] = None, # Index to search try: index_result = index_segments_direct(video_id, segments) - result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents" + result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents (key: {index_result.get('key_field_used', 'unknown')})" except Exception as e: - result["index_status"] = f"Indexing skipped: {str(e)}" + result["index_status"] = f"Indexing failed: {str(e)}" result["status"] = "success" @@ -1003,7 +1176,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None, # DIRECT URL # ------------------------------------------------------------------------- elif source_type == "Direct URL": - url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/ ...") + url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/...") if url_input.strip(): media_url = url_input.strip() @@ -1017,7 +1190,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None, # Use session state to persist the URL yt_url = st.text_input( "YouTube URL", - placeholder="https://youtube.com/watch?v= ...", + placeholder="https://youtube.com/watch?v=...", value=st.session_state.yt_url_value, key="yt_url_input" ) @@ -1456,9 +1629,9 @@ def process_single_video(url: str, custom_id: Optional[str] = None, # Index to search try: index_result = index_segments_direct(video_id, segments) - index_msg = f"Indexed: {index_result.get('indexed', 0)} documents" + index_msg = f"Indexed: {index_result.get('indexed', 0)} documents (key field: {index_result.get('key_field_used', 'unknown')})" except Exception as e: - index_msg = f"Indexing skipped: {e}" + index_msg = f"Indexing failed: {str(e)}" progress_bar.progress(100) status.text("Complete!") From 26fea22370cf80506dfffd9a78639e62b426c4d5 Mon Sep 17 00:00:00 2001 From: Martin Nwadiugwu Date: Mon, 23 Feb 2026 19:42:13 -0600 Subject: [PATCH 4/8] Update UI to store videos and display source URL --- ui/ui_search.py | 1396 +++++++++++++++++++++++++++-------------------- 1 file changed, 807 insertions(+), 589 deletions(-) diff --git a/ui/ui_search.py b/ui/ui_search.py index 0af5cc7..15ef358 100644 --- a/ui/ui_search.py +++ b/ui/ui_search.py @@ -1,8 +1,12 @@ """ ui_search.py - Streamlit Web Interface for Video Segment Search & Upload -This version calls Azure Speech API DIRECTLY, bypassing the Azure Function -that has the wrong API version hardcoded. +Features: +- Direct Azure Speech API integration (bypasses Azure Function) +- URL tracking for all processed videos (source_url, source_type, processed_at) +- Handles existing videos without URL data gracefully +- Batch processing with CSV upload +- Video management interface with filtering and deletion """ import os @@ -27,130 +31,178 @@ # Load environment variables load_dotenv() -# Azure Function URLs (only Search uses these now) +# ============================================================================= +# CONFIGURATION +# ============================================================================= + +# Azure Function URLs SEARCH_FN_URL = os.environ.get("SEARCH_FN_URL", "") -# Azure Speech Service Configuration (DIRECT) +# Azure Speech Service (DIRECT) SPEECH_KEY = os.environ.get("SPEECH_KEY") SPEECH_REGION = os.environ.get("SPEECH_REGION", "eastus") SPEECH_API_VERSION = os.environ.get("SPEECH_API_VERSION", "2024-11-15") -# Azure OpenAI & Search for indexing +# Azure OpenAI AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT") AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_KEY") AZURE_OPENAI_DEPLOYMENT = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "text-embedding-3-small") +# Azure Cognitive Search SEARCH_ENDPOINT = os.environ.get("SEARCH_ENDPOINT") SEARCH_KEY = os.environ.get("SEARCH_KEY") SEARCH_INDEX_NAME = os.environ.get("SEARCH_INDEX_NAME", "segments") -# Azure Storage Configuration +# Azure Storage AZURE_STORAGE_ACCOUNT = os.environ.get("AZURE_STORAGE_ACCOUNT", "storagevideoannotator") AZURE_STORAGE_KEY = os.environ.get("AZURE_STORAGE_KEY", "") INPUT_CONTAINER = os.environ.get("INPUT_CONTAINER", "speech-input") SEGMENTS_CONTAINER = os.environ.get("SEGMENTS_CONTAINER", "segments") -# Default settings +# Settings DEFAULT_MODE = os.environ.get("DEFAULT_MODE", "hybrid") DEFAULT_TOP = int(os.environ.get("DEFAULT_TOP", "10")) DEFAULT_K = int(os.environ.get("DEFAULT_K", "40")) POLL_SECONDS = int(os.environ.get("POLL_SECONDS", "15")) -BATCH_MAX_WORKERS = int(os.environ.get("BATCH_MAX_WORKERS", "3")) # Concurrent processing limit + +# ============================================================================= +# STREAMLIT SETUP +# ============================================================================= st.set_page_config(page_title="Video Annotation Platform", layout="wide") st.title(" Video Annotation Platform") # Initialize session state -if 'yt_url_value' not in st.session_state: - st.session_state.yt_url_value = "" -if 'batch_results' not in st.session_state: - st.session_state.batch_results = [] -if 'batch_processing' not in st.session_state: - st.session_state.batch_processing = False -if 'index_schema_cache' not in st.session_state: - st.session_state.index_schema_cache = None - -# Sidebar +session_state_defaults = { + 'yt_url_value': "", + 'batch_results': [], + 'batch_processing': False, + 'index_schema_cache': None, + 'stored_videos_cache': None, + 'url_fields_status': None, + 'debug_info': {} +} + +for key, value in session_state_defaults.items(): + if key not in st.session_state: + st.session_state[key] = value + +# ============================================================================= +# SIDEBAR NAVIGATION +# ============================================================================= + with st.sidebar: st.header("Navigation") - page = st.radio("Select Page", ["🔎 Search Segments", "⬆️ Upload & Transcribe"]) - + page = st.radio("Select Page", [ + "🔎 Search Segments", + "⬆️ Upload & Transcribe", + "📚 Manage Videos", + "⚙️ System Diagnostics" + ]) + + # Settings for search page if page == "🔎 Search Segments": - st.header("Settings") + st.header("Search Settings") mode = st.selectbox("Search Mode", ["keyword", "hybrid", "vector"], index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE) if DEFAULT_MODE in ("keyword", "hybrid", "vector") else 1) top = st.slider("Results", 1, 50, DEFAULT_TOP) k = st.slider("Vector k", 5, 200, DEFAULT_K) - # Debug section + # Quick actions st.markdown("---") - if st.button("🔍 Debug Index Schema"): - with st.spinner("Fetching index schema..."): - schema_info = debug_check_index_schema() - if isinstance(schema_info, dict): - st.success(f"Index: {schema_info['index_name']}") - st.write(f"**Key Field:** `{schema_info['key_field']}`") - st.write("**Fields:**") - for field in schema_info['fields']: - key_badge = "🔑 " if field['key'] else "" - st.caption(f"{key_badge}`{field['name']}` ({field['type']})") - st.session_state.index_schema_cache = schema_info - else: - st.error(schema_info) + if st.button("🔄 Refresh Schema Cache"): + st.session_state.index_schema_cache = None + st.session_state.url_fields_status = None + st.success("Cache cleared! Navigate to System Diagnostics to refresh.") + + st.markdown("---") + st.caption("Video Annotation Platform v2.1") + st.caption("With URL Tracking") # ============================================================================= -# HELPER FUNCTIONS +# UTILITY FUNCTIONS # ============================================================================= def ms_to_ts(ms: int) -> str: + """Convert milliseconds to timestamp string.""" s = max(0, int(ms // 1000)) m, s = divmod(s, 60) h, m = divmod(m, 60) return f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}" -def call_api(url: str, payload: dict, timeout: int = 60) -> dict: - r = requests.post(url, json=payload, timeout=timeout, headers={"Content-Type": "application/json"}) - if r.status_code >= 400: - raise RuntimeError(f"HTTP {r.status_code}: {r.text}") - return r.json() if r.text else {} - - def sanitize_id(id_string: str) -> str: - """ - Sanitize ID to be valid for Azure Search (alphanumeric, hyphens, underscores only). - Document key rules: Cannot start with underscore, max 1024 chars. - """ + """Sanitize ID for Azure Search (alphanumeric, hyphens, underscores only).""" if not id_string: id_string = "unknown" - # Replace invalid characters with underscore sanitized = re.sub(r'[^\w\-]', '_', str(id_string)) - # Ensure it doesn't start with underscore (invalid for Azure Search keys) - if sanitized.startswith('_'): + if sanitized.startswith('_') or sanitized.startswith('-'): sanitized = 'id' + sanitized - # Ensure it doesn't start with dash (also problematic) - if sanitized.startswith('-'): - sanitized = 'id' + sanitized - - # Limit length to 1024 characters (Azure Search limit) if len(sanitized) > 1024: - # Use hash to ensure uniqueness while truncating hash_suffix = hashlib.md5(sanitized.encode()).hexdigest()[:16] sanitized = sanitized[:1000] + "_" + hash_suffix return sanitized +def detect_url_type(url: str) -> str: + """Detect if URL is YouTube, direct media, or unknown.""" + if not url: + return "unknown" + + url_lower = str(url).lower().strip() + + youtube_patterns = [ + r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)', + r'youtube\.com\/watch\?v=', + r'youtu\.be\/', + r'youtube\.com\/shorts\/' + ] + + for pattern in youtube_patterns: + if re.search(pattern, url_lower): + return "youtube" + + media_extensions = ['.mp4', '.m4a', '.mp3', '.wav', '.mov', '.avi', '.mkv', '.webm'] + if any(url_lower.endswith(ext) for ext in media_extensions): + return "direct" + + cloud_patterns = ['box.com', 'drive.google.com', 'dropbox.com', 'onedrive'] + if any(pattern in url_lower for pattern in cloud_patterns): + return "direct" + + return "unknown" + + +def check_yt_dlp() -> bool: + """Check if yt-dlp is installed.""" + try: + result = subprocess.run(["which", "yt-dlp"], capture_output=True, text=True) + return result.returncode == 0 + except: + return False + + +def call_api(url: str, payload: dict) -> dict: + """Make API call to search function.""" + try: + r = requests.post(url, json=payload, timeout=30) + r.raise_for_status() + return r.json() + except requests.exceptions.RequestException as e: + raise RuntimeError(f"API call failed: {str(e)}") + + # ============================================================================= # AZURE SEARCH SCHEMA FUNCTIONS # ============================================================================= def debug_check_index_schema(): - """Check if your index exists and verify the key field""" + """Check index schema and verify URL tracking fields.""" if not SEARCH_ENDPOINT or not SEARCH_KEY or not SEARCH_INDEX_NAME: return "Search not configured - check SEARCH_ENDPOINT, SEARCH_KEY, and SEARCH_INDEX_NAME" @@ -164,28 +216,35 @@ def debug_check_index_schema(): key_field = None fields_info = [] + url_fields = ['source_url', 'source_type', 'processed_at'] + found_url_fields = [] + for field in schema.get("fields", []): field_info = { "name": field.get("name"), "type": field.get("type"), "key": field.get("key", False), - "searchable": field.get("searchable", False), + "retrievable": field.get("retrievable", False), "filterable": field.get("filterable", False), "sortable": field.get("sortable", False), - "facetable": field.get("facetable", False), - "retrievable": field.get("retrievable", False) + "facetable": field.get("facetable", False) } fields_info.append(field_info) if field.get("key", False): key_field = field.get("name") + + if field.get("name") in url_fields: + found_url_fields.append(field.get("name")) - result = { + return { "index_name": schema.get("name"), "key_field": key_field, - "fields": fields_info + "fields": fields_info, + "found_url_fields": found_url_fields, + "missing_url_fields": list(set(url_fields) - set(found_url_fields)), + "has_all_url_fields": len(found_url_fields) == len(url_fields) } - return result else: return f"Index check failed: HTTP {r.status_code} - {r.text[:500]}" except Exception as e: @@ -193,7 +252,7 @@ def debug_check_index_schema(): def get_index_schema(): - """Get cached schema or fetch new one""" + """Get cached schema or fetch new one.""" if st.session_state.index_schema_cache: return st.session_state.index_schema_cache @@ -205,17 +264,41 @@ def get_index_schema(): raise RuntimeError(f"Cannot fetch index schema: {schema_info}") +def check_url_fields_status(): + """Check URL fields status with caching.""" + if st.session_state.url_fields_status: + return st.session_state.url_fields_status + + try: + schema = get_index_schema() + if isinstance(schema, dict): + result = { + 'fields_exist': schema.get('has_all_url_fields', False), + 'found_fields': schema.get('found_url_fields', []), + 'missing_fields': schema.get('missing_url_fields', []), + 'key_field': schema.get('key_field') + } + st.session_state.url_fields_status = result + return result + except: + pass + + return { + 'fields_exist': False, + 'found_fields': [], + 'missing_fields': ['source_url', 'source_type', 'processed_at'], + 'key_field': None + } + + # ============================================================================= -# DIRECT AZURE SPEECH API FUNCTIONS (BYPASS AZURE FUNCTION) +# AZURE SPEECH API FUNCTIONS # ============================================================================= def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any]: - """ - Submit transcription directly to Azure Speech API. - Bypasses the Azure Function with wrong API version. - """ + """Submit transcription directly to Azure Speech API.""" if not SPEECH_KEY: - raise RuntimeError("SPEECH_KEY not configured in environment") + raise RuntimeError("SPEECH_KEY not configured") endpoint = f"https://{SPEECH_REGION}.api.cognitive.microsoft.com/speechtotext/transcriptions:submit?api-version={SPEECH_API_VERSION}" @@ -241,7 +324,6 @@ def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any] r = requests.post(endpoint, headers=headers, json=payload, timeout=60) r.raise_for_status() - # Get operation URL from Location header (this is the operation status URL) operation_url = r.headers.get("Location") if not operation_url: result = r.json() @@ -253,32 +335,21 @@ def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any] return {"operation_url": operation_url, "video_id": video_id} except requests.exceptions.HTTPError as e: + error_msg = f"Speech API error {r.status_code}: {r.text}" if r.status_code == 401: - raise RuntimeError("Azure Speech API authentication failed. Check SPEECH_KEY.") - elif r.status_code == 400: - raise RuntimeError(f"Bad request: {r.text}") - else: - raise RuntimeError(f"Speech API error {r.status_code}: {r.text}") + error_msg = "Azure Speech API authentication failed. Check SPEECH_KEY." + raise RuntimeError(error_msg) def poll_transcription_operation(operation_url: str) -> Dict[str, Any]: - """Poll transcription operation status directly from Azure Speech API.""" + """Poll transcription operation status.""" if not SPEECH_KEY: raise RuntimeError("SPEECH_KEY not configured") - headers = { - "Ocp-Apim-Subscription-Key": SPEECH_KEY - } + headers = {"Ocp-Apim-Subscription-Key": SPEECH_KEY} try: - # CRITICAL FIX: Azure returns operation URL with :submit but we need to poll - # using the /transcriptions/{id} endpoint, not /transcriptions:submit/{id} - # The operation_url looks like: .../transcriptions:submit/{id}?api-version=... - # We need: .../transcriptions/{id}?api-version=... - poll_url = operation_url.replace("/transcriptions:submit/", "/transcriptions/") - - # Debug info st.session_state['debug_poll_url'] = poll_url r = requests.get(poll_url, headers=headers, timeout=30) @@ -290,38 +361,29 @@ def poll_transcription_operation(operation_url: str) -> Dict[str, Any]: def get_transcription_from_result(result_data: Dict) -> Dict[str, Any]: - """Get the actual transcription JSON from the result files.""" + """Get transcription JSON from result files.""" if not SPEECH_KEY: raise RuntimeError("SPEECH_KEY not configured") - headers = { - "Ocp-Apim-Subscription-Key": SPEECH_KEY - } + headers = {"Ocp-Apim-Subscription-Key": SPEECH_KEY} try: - # Get the result files URL from the completed operation links = result_data.get("links", {}) files_url = links.get("files") if not files_url: - # Try to construct from result data or get content directly if "combinedRecognizedPhrases" in result_data: - # Result might be embedded directly return result_data - raise RuntimeError("No files URL in result") - # Get list of files r = requests.get(files_url, headers=headers, timeout=30) r.raise_for_status() files_data = r.json() - # Find the transcription JSON file for file in files_data.get("values", []): if file.get("kind") == "Transcription": content_url = file.get("links", {}).get("contentUrl") if content_url: - # Download the actual transcription content content_r = requests.get(content_url, timeout=60) content_r.raise_for_status() return content_r.json() @@ -333,11 +395,11 @@ def get_transcription_from_result(result_data: Dict) -> Dict[str, Any]: # ============================================================================= -# DIRECT EMBEDDING AND INDEXING (BYPASS AZURE FUNCTION) +# EMBEDDING AND INDEXING WITH URL TRACKING # ============================================================================= def get_embeddings(texts: list) -> list: - """Get embeddings directly from Azure OpenAI.""" + """Get embeddings from Azure OpenAI.""" if not AZURE_OPENAI_ENDPOINT or not AZURE_OPENAI_KEY: raise RuntimeError("Azure OpenAI not configured") @@ -362,27 +424,28 @@ def get_embeddings(texts: list) -> list: raise RuntimeError(f"Embedding failed: {str(e)}") -def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]: +def index_segments_direct(video_id: str, segments: list, source_url: str = None, source_type: str = None) -> Dict[str, Any]: """ - Index segments directly to Azure Cognitive Search. - - CRITICAL: Automatically detects the key field from index schema instead of assuming 'id' + Index segments to Azure Cognitive Search with URL tracking. """ if not SEARCH_ENDPOINT or not SEARCH_KEY: raise RuntimeError("Azure Search not configured") - # Get the key field name from the index schema schema_info = get_index_schema() key_field = schema_info.get("key_field") + available_fields = {f.get("name") for f in schema_info.get("fields", [])} if not key_field: - available = [f.get("name") for f in schema_info.get("fields", [])] - raise RuntimeError(f"No key field found in index. Available fields: {available}") + raise RuntimeError("No key field found in index") - # Get list of available fields to ensure we only send existing fields - available_fields = {f.get("name") for f in schema_info.get("fields", [])} + # Check URL field availability + url_fields_available = { + 'source_url': 'source_url' in available_fields, + 'source_type': 'source_type' in available_fields, + 'processed_at': 'processed_at' in available_fields + } - # Generate embeddings for all segments + # Generate embeddings texts = [seg.get("text", "") for seg in segments] try: embeddings = get_embeddings(texts) @@ -390,21 +453,17 @@ def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]: st.warning(f"Embedding failed, indexing without vectors: {e}") embeddings = [None] * len(segments) - # Prepare search documents + # Prepare documents documents = [] + processed_timestamp = datetime.utcnow().isoformat() + "Z" + for i, (seg, embedding) in enumerate(zip(segments, embeddings)): safe_video_id = sanitize_id(video_id) doc_id = f"{safe_video_id}_{i}" - # Build document dynamically based on what fields actually exist in the index - doc = { - "@search.action": "upload" - } + doc = {"@search.action": "upload", key_field: doc_id} - # Add the key field (whatever it's actually named in your index) - doc[key_field] = doc_id - - # Map of our field names to potential index field names + # Core fields field_mappings = { "video_id": safe_video_id, "segment_id": str(seg.get("segment_id", i)), @@ -414,115 +473,216 @@ def index_segments_direct(video_id: str, segments: list) -> Dict[str, Any]: "pred_labels": seg.get("pred_labels", []) if seg.get("pred_labels") else [] } - # Only add fields that exist in the index schema + # URL tracking fields + if url_fields_available['source_url']: + field_mappings["source_url"] = str(source_url) if source_url else "" + if url_fields_available['source_type']: + field_mappings["source_type"] = str(source_type) if source_type else "unknown" + if url_fields_available['processed_at']: + field_mappings["processed_at"] = processed_timestamp + + # Only add existing fields for field_name, value in field_mappings.items(): if field_name in available_fields: doc[field_name] = value - # Handle embedding field - check for common naming variations - embedding_field = None - for possible_name in ["embedding", "embeddings", "vector", "vectors"]: - if possible_name in available_fields: - embedding_field = possible_name - break - - if embedding and isinstance(embedding, list) and len(embedding) > 0 and embedding_field: + # Handle embedding + embedding_field = next((f for f in ["embedding", "embeddings", "vector", "vectors"] if f in available_fields), None) + if embedding and embedding_field: try: doc[embedding_field] = [float(x) for x in embedding] except (ValueError, TypeError): - st.warning(f"Skipping embedding for segment {i} due to conversion error") + pass documents.append(doc) - # Upload to Azure Search + # Upload to search index url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/index?api-version=2024-07-01" - - headers = { - "api-key": SEARCH_KEY, - "Content-Type": "application/json" - } - - payload = { - "value": documents - } + headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"} + payload = {"value": documents} try: r = requests.post(url, headers=headers, json=payload, timeout=60) if r.status_code >= 400: - error_detail = "" + error_detail = r.text try: - error_json = r.json() - error_detail = json.dumps(error_json, indent=2) + error_detail = json.dumps(r.json(), indent=2) except: - error_detail = r.text - - raise RuntimeError(f"Indexing failed: HTTP {r.status_code}\nDetails: {error_detail}") + pass + raise RuntimeError(f"Indexing failed: HTTP {r.status_code}\n{error_detail}") result = r.json() - # Check for partial failures (207 Multi-Status) + # Check for partial failures if r.status_code == 207: failed_docs = [item for item in result.get("value", []) if not item.get("status", False)] if failed_docs: st.warning(f"Partial indexing failure: {len(failed_docs)} documents failed") - for fail in failed_docs[:3]: - st.error(f"Doc {fail.get('key', 'unknown')}: {fail.get('errorMessage', 'Unknown error')}") return { - "indexed": len(documents), - "video_id": video_id, + "indexed": len(documents), + "video_id": video_id, "key_field_used": key_field, - "api_response": result + "source_url_stored": bool(source_url and url_fields_available['source_url']), + "source_type_stored": bool(source_type and url_fields_available['source_type']), + "url_fields_available": url_fields_available } - except requests.exceptions.HTTPError as e: - raise RuntimeError(f"HTTP Error: {str(e)}") except Exception as e: raise RuntimeError(f"Indexing failed: {str(e)}") def process_transcription_to_segments(transcription_data: Dict, video_id: str) -> list: - """ - Convert Azure Speech transcription JSON to segments format. - """ + """Convert Azure Speech transcription to segments.""" segments = [] - # Parse phrases/segments from transcription - phrases = transcription_data.get("recognizedPhrases", []) - - for i, phrase in enumerate(phrases): - # Extract timing - offset = phrase.get("offsetInTicks", 0) // 10000 # Convert to ms + for i, phrase in enumerate(transcription_data.get("recognizedPhrases", [])): + offset = phrase.get("offsetInTicks", 0) // 10000 duration = phrase.get("durationInTicks", 0) // 10000 - # Extract text nbest = phrase.get("nBest", []) - if nbest: - text = nbest[0].get("display", "") - else: - text = "" + text = nbest[0].get("display", "") if nbest else "" - # Create segment - segment = { + segments.append({ "segment_id": i, "video_id": video_id, "text": text, "start_ms": offset, "end_ms": offset + duration, - "pred_labels": [] # Could add label prediction here - } - - segments.append(segment) + "pred_labels": [] + }) return segments # ============================================================================= -# STORAGE FUNCTIONS - FIXED UPLOAD +# VIDEO RETRIEVAL AND MANAGEMENT +# ============================================================================= + +def get_stored_videos(video_id: str = None, source_type: str = None, + include_missing: bool = True, limit: int = 1000) -> List[Dict]: + """ + Retrieve videos from search index with URL data. + """ + if not SEARCH_ENDPOINT or not SEARCH_KEY: + return [] + + url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/search?api-version=2024-07-01" + headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"} + + # Build filter + filters = [] + if video_id: + filters.append(f"video_id eq '{video_id}'") + if source_type: + filters.append(f"source_type eq '{source_type}'") + + filter_query = " and ".join(filters) if filters else None + + # Get available fields + schema = get_index_schema() + available_fields = {f['name'] for f in schema.get('fields', [])} + + # Build select + select_fields = ["video_id"] + for field in ["source_url", "source_type", "processed_at"]: + if field in available_fields: + select_fields.append(field) + + payload = { + "search": "*", + "select": ",".join(select_fields), + "top": limit + } + + if "processed_at" in available_fields: + payload["orderby"] = "processed_at desc" + if filter_query: + payload["filter"] = filter_query + + try: + r = requests.post(url, headers=headers, json=payload, timeout=30) + r.raise_for_status() + docs = r.json().get("value", []) + + # Deduplicate and normalize + seen = set() + unique_docs = [] + for doc in docs: + vid = doc.get('video_id') + if vid and vid not in seen: + seen.add(vid) + # Normalize missing values + doc['source_type'] = doc.get('source_type') or 'unknown' + doc['source_url'] = doc.get('source_url') or '' + doc['processed_at'] = doc.get('processed_at') or 'unknown' + unique_docs.append(doc) + + return unique_docs + + except Exception as e: + st.error(f"Failed to retrieve videos: {e}") + return [] + + +def delete_video_by_id(video_id: str) -> bool: + """Delete all segments for a video_id from the index.""" + if not SEARCH_ENDPOINT or not SEARCH_KEY: + return False + + # Find all documents + search_url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/search?api-version=2024-07-01" + headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"} + + payload = { + "search": "*", + "filter": f"video_id eq '{video_id}'", + "select": "video_id", + "top": 1000 + } + + try: + r = requests.post(search_url, headers=headers, json=payload, timeout=30) + r.raise_for_status() + docs = r.json().get("value", []) + + if not docs: + return False + + # Delete documents + schema = get_index_schema() + key_field = schema.get('key_field', 'id') + + delete_docs = [] + for doc in docs: + doc_key = doc.get(key_field) or doc.get('id') + if doc_key: + delete_docs.append({ + "@search.action": "delete", + key_field: doc_key + }) + + if not delete_docs: + return False + + delete_url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/index?api-version=2024-07-01" + r = requests.post(delete_url, headers=headers, json={"value": delete_docs}, timeout=60) + r.raise_for_status() + + return True + + except Exception as e: + st.error(f"Delete failed: {e}") + return False + + +# ============================================================================= +# AZURE STORAGE FUNCTIONS # ============================================================================= def generate_video_id(filename: str) -> str: + """Generate safe video ID from filename.""" clean_name = Path(filename).stem clean_name = re.sub(r'[^\w\s-]', '', clean_name) clean_name = re.sub(r'[-\s]+', '_', clean_name) @@ -531,91 +691,90 @@ def generate_video_id(filename: str) -> str: def test_sas_url(sas_url: str) -> Tuple[bool, str]: - """Test if SAS URL is accessible before sending to Speech API.""" + """Test if SAS URL is accessible.""" try: r = requests.head(sas_url, timeout=10, allow_redirects=True) - if r.status_code == 200: - return True, "SAS URL is accessible" - else: - return False, f"SAS URL returned HTTP {r.status_code}" + return (r.status_code == 200, f"HTTP {r.status_code}") except Exception as e: - return False, f"SAS URL test failed: {str(e)}" + return (False, str(e)) + + +def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> Optional[str]: + """Generate SAS token for blob access.""" + if not AZURE_STORAGE_KEY: + return None + + try: + expiry = datetime.now(timezone.utc) + timedelta(hours=expiry_hours) + expiry_str = expiry.strftime('%Y-%m-%dT%H:%M:%SZ') + + account_key = base64.b64decode(AZURE_STORAGE_KEY) + canonicalized_resource = f"/blob/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}" + + string_to_sign = ( + f"r\n\n{expiry_str}\n{canonicalized_resource}\n\n\nhttps\n2020-12-06\nb\n\n\n\n\n\n\n" + ) + + signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest() + signature = base64.b64encode(signed_hmac).decode('utf-8') + + sas_params = { + 'sv': '2020-12-06', + 'sr': 'b', + 'sp': 'r', + 'se': expiry_str, + 'spr': 'https', + 'sig': signature + } + + return '&'.join([f"{k}={urllib.parse.quote(v, safe='')}" for k, v in sas_params.items()]) + + except Exception as e: + st.error(f"SAS generation error: {e}") + return None def upload_to_azure_blob_fixed(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]: - """ - FIXED upload to Azure Blob using REST API. - Corrected string-to-sign format. - """ + """Upload to Azure Blob using REST API.""" if not AZURE_STORAGE_KEY: return None, "Azure Storage key not configured" try: - # Upload URL url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}" - # Create date header in the exact format Azure expects date_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT') content_length = len(file_bytes) - # ==================================================================== - # CRITICAL FIX: Correct string-to-sign format for Azure Blob Storage - # Format: VERB\nContent-Encoding\nContent-Language\nContent-Length\n - # Content-MD5\nContent-Type\nDate\nIf-Modified-Since\nIf-Match\n - # If-None-Match\nIf-Unmodified-Since\nRange\n - # CanonicalizedHeaders\nCanonicalizedResource - # ==================================================================== string_to_sign = ( - f"PUT\n" # HTTP method - f"\n" # Content-Encoding (empty) - f"\n" # Content-Language (empty) - f"{content_length}\n" # Content-Length (REQUIRED - must be exact) - f"\n" # Content-MD5 (empty) - f"application/octet-stream\n" # Content-Type (REQUIRED for PUT) - f"\n" # Date (empty, using x-ms-date instead) - f"\n" # If-Modified-Since (empty) - f"\n" # If-Match (empty) - f"\n" # If-None-Match (empty) - f"\n" # If-Unmodified-Since (empty) - f"\n" # Range (empty) - f"x-ms-blob-type:BlockBlob\n" # CanonicalizedHeaders (sorted alphabetically) - f"x-ms-date:{date_str}\n" - f"x-ms-version:2020-12-06\n" - f"/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}" # CanonicalizedResource + f"PUT\n\n\n{content_length}\n\napplication/octet-stream\n\n\n\n\n\n\n" + f"x-ms-blob-type:BlockBlob\nx-ms-date:{date_str}\nx-ms-version:2020-12-06\n" + f"/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}" ) - # Sign with HMAC-SHA256 account_key = base64.b64decode(AZURE_STORAGE_KEY) signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest() signature = base64.b64encode(signed_hmac).decode('utf-8') - # Build authorization header - auth_header = f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}" - - # Set headers - MUST match what was signed headers = { "x-ms-date": date_str, "x-ms-version": "2020-12-06", "x-ms-blob-type": "BlockBlob", "Content-Type": "application/octet-stream", "Content-Length": str(content_length), - "Authorization": auth_header + "Authorization": f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}" } - # Upload r = requests.put(url, data=file_bytes, headers=headers, timeout=300) if r.status_code not in [201, 200]: - return None, f"Upload failed: HTTP {r.status_code} - {r.text}" + return None, f"Upload failed: HTTP {r.status_code}" - # Generate SAS token for reading sas_token = generate_sas_token_fixed(blob_name) if not sas_token: return None, "Failed to generate SAS token" sas_url = f"{url}?{sas_token}" - # Test the SAS URL is_valid, test_msg = test_sas_url(sas_url) if not is_valid: return None, f"SAS URL validation failed: {test_msg}" @@ -624,98 +783,11 @@ def upload_to_azure_blob_fixed(file_bytes: bytes, blob_name: str) -> Tuple[Optio except Exception as e: import traceback - return None, f"Upload error: {str(e)}\n{traceback.format_exc()}" - - -def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> str: - """ - FIXED SAS token generation for Azure Blob - Service SAS format. - """ - if not AZURE_STORAGE_KEY: - return None - - try: - # Set expiry in UTC - expiry = datetime.now(timezone.utc) + timedelta(hours=expiry_hours) - expiry_str = expiry.strftime('%Y-%m-%dT%H:%M:%SZ') - - # Decode account key - account_key = base64.b64decode(AZURE_STORAGE_KEY) - - # ==================================================================== - # CRITICAL FIX: Service SAS string-to-sign format - # Reference: https://docs.microsoft.com/en-us/rest/api/storageservices/create-service-sas - # Format for Blob service SAS: - # StringToSign = signedPermissions + "\n" + - # signedStart + "\n" + - # signedExpiry + "\n" + - # canonicalizedResource + "\n" + - # signedIdentifier + "\n" + - # signedIP + "\n" + - # signedProtocol + "\n" + - # signedVersion + "\n" + - # signedResource + "\n" + - # signedSnapshotTime + "\n" + - # signedEncryptionScope + "\n" + - # signedCacheControl + "\n" + - # signedContentDisposition + "\n" + - # signedContentEncoding + "\n" + - # signedContentLanguage + "\n" + - # signedContentType - # ==================================================================== - - # Canonicalized resource for service SAS: /blob/{account}/{container}/{blob} - canonicalized_resource = f"/blob/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}" - - # Build string to sign for Service SAS - string_to_sign = ( - f"r\n" # signed permissions (read) - f"\n" # signed start (empty) - f"{expiry_str}\n" # signed expiry - f"{canonicalized_resource}\n" # canonicalized resource - f"\n" # signed identifier (empty) - f"\n" # signed IP (empty) - f"https\n" # signed protocol - f"2020-12-06\n" # signed version - f"b\n" # signed resource (b = blob) - f"\n" # signed snapshot time (empty) - f"\n" # signed encryption scope (empty) - f"\n" # signed cache control (empty) - f"\n" # signed content disposition (empty) - f"\n" # signed content encoding (empty) - f"\n" # signed content language (empty) - f"" # signed content type (empty, no newline at end) - ) - - # Sign with HMAC-SHA256 - signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest() - signature = base64.b64encode(signed_hmac).decode('utf-8') - - # Build query parameters - Order matters for some clients - sas_params = { - 'sv': '2020-12-06', # signed version - 'sr': 'b', # signed resource (blob) - 'sp': 'r', # signed permissions (read) - 'se': expiry_str, # signed expiry - 'spr': 'https', # signed protocol - 'sig': signature # signature - } - - # URL encode the signature and other values - sas_token = '&'.join([f"{k}={urllib.parse.quote(v, safe='')}" for k, v in sas_params.items()]) - return sas_token - - except Exception as e: - st.error(f"SAS generation error: {e}") - import traceback - st.error(traceback.format_exc()) - return None + return None, f"Upload error: {str(e)}" def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]: - """ - Upload using Azure SDK (more reliable, requires azure-storage-blob package). - """ + """Upload using Azure SDK (preferred method).""" try: from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions @@ -729,17 +801,14 @@ def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optiona blob_service = BlobServiceClient.from_connection_string(connection_string) container_client = blob_service.get_container_client(INPUT_CONTAINER) - # Ensure container exists try: container_client.create_container() except Exception: pass - # Upload blob blob_client = container_client.get_blob_client(blob_name) blob_client.upload_blob(file_bytes, overwrite=True) - # Generate SAS token sas_token = generate_blob_sas( account_name=AZURE_STORAGE_ACCOUNT, container_name=INPUT_CONTAINER, @@ -752,7 +821,6 @@ def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optiona sas_url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}?{sas_token}" - # Test the SAS URL is_valid, test_msg = test_sas_url(sas_url) if not is_valid: return None, f"SAS URL validation failed: {test_msg}" @@ -763,7 +831,7 @@ def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optiona return None, "azure-storage-blob not installed" except Exception as e: import traceback - return None, f"SDK upload failed: {str(e)}\n{traceback.format_exc()}" + return None, f"SDK upload failed: {str(e)}" def save_segments_to_blob(video_id: str, segments: list) -> str: @@ -779,28 +847,14 @@ def save_segments_to_blob(video_id: str, segments: list) -> str: date_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT') string_to_sign = ( - f"PUT\n" - f"\n" - f"\n" - f"{content_length}\n" - f"\n" - f"application/json\n" - f"\n" - f"\n" - f"\n" - f"\n" - f"\n" - f"\n" - f"x-ms-blob-type:BlockBlob\n" - f"x-ms-date:{date_str}\n" - f"x-ms-version:2020-12-06\n" + f"PUT\n\n\n{content_length}\n\napplication/json\n\n\n\n\n\n\n" + f"x-ms-blob-type:BlockBlob\nx-ms-date:{date_str}\nx-ms-version:2020-12-06\n" f"/{AZURE_STORAGE_ACCOUNT}/{SEGMENTS_CONTAINER}/{blob_name}" ) account_key = base64.b64decode(AZURE_STORAGE_KEY) signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest() signature = base64.b64encode(signed_hmac).decode('utf-8') - auth_header = f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}" headers = { "x-ms-date": date_str, @@ -808,7 +862,7 @@ def save_segments_to_blob(video_id: str, segments: list) -> str: "x-ms-blob-type": "BlockBlob", "Content-Type": "application/json", "Content-Length": str(content_length), - "Authorization": auth_header + "Authorization": f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}" } r = requests.put(url, data=json_bytes, headers=headers, timeout=60) @@ -817,16 +871,9 @@ def save_segments_to_blob(video_id: str, segments: list) -> str: return blob_name -def check_yt_dlp() -> bool: - try: - result = subprocess.run(["which", "yt-dlp"], capture_output=True, text=True) - return result.returncode == 0 - except: - return False - - -def download_youtube_audio(youtube_url: str, output_path: str, progress_callback=None) -> Tuple[Optional[str], Optional[str]]: - """Download YouTube audio to specific path.""" +def download_youtube_audio(youtube_url: str, output_path: str, + progress_callback=None) -> Tuple[Optional[str], Optional[str]]: + """Download audio from YouTube.""" if not check_yt_dlp(): return None, "yt-dlp not installed. Run: pip install yt-dlp" @@ -840,18 +887,16 @@ def download_youtube_audio(youtube_url: str, output_path: str, progress_callback "--extract-audio", "--audio-format", "m4a", "--audio-quality", "0", - "--no-check-certificate", # Added for compatibility - "--no-warnings", # Reduce noise + "--no-check-certificate", + "--no-warnings", "-o", output_path, youtube_url.strip() ] - # Try to use Node.js runtime if available, otherwise let yt-dlp handle it - # This fixes the "No supported JavaScript runtime" error + # Handle missing Node.js try: node_check = subprocess.run(["which", "node"], capture_output=True, text=True) if node_check.returncode != 0: - # No node.js, try to use legacy format that doesn't require JS cmd.extend(["--extractor-args", "youtube:player_client=web"]) except: pass @@ -863,16 +908,14 @@ def download_youtube_audio(youtube_url: str, output_path: str, progress_callback if result.returncode != 0: error_msg = result.stderr[:500] - # Provide helpful error message for JS runtime issues if "JavaScript runtime" in error_msg: error_msg += "\n\n💡 Tip: Install Node.js or run: pip install yt-dlp --upgrade" return None, f"yt-dlp failed: {error_msg}" - # Find the actual file + # Find downloaded file if os.path.exists(output_path): return output_path, None - # Try alternative extensions base = output_path.rsplit('.', 1)[0] for ext in ['.m4a', '.mp3', '.webm', '.opus']: alt_path = base + ext @@ -887,44 +930,16 @@ def download_youtube_audio(youtube_url: str, output_path: str, progress_callback return None, f"Error: {str(e)}" -def detect_url_type(url: str) -> str: - """Detect if URL is YouTube, direct media, or unknown.""" - if not url: - return "unknown" - - url_lower = str(url).lower().strip() - - # YouTube patterns - youtube_patterns = [ - r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)', - r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=', - r'(?:https?:\/\/)?(?:www\.)?youtu\.be\/', - r'youtube\.com\/shorts\/' - ] - - for pattern in youtube_patterns: - if re.search(pattern, url_lower): - return "youtube" - - # Direct media patterns - media_extensions = ['.mp4', '.m4a', '.mp3', '.wav', '.mov', '.avi', '.mkv', '.webm'] - if any(url_lower.endswith(ext) for ext in media_extensions): - return "direct" - - # Box.com, Google Drive, Dropbox, etc. - treat as direct - cloud_patterns = ['box.com', 'drive.google.com', 'dropbox.com', 'onedrive'] - if any(pattern in url_lower for pattern in cloud_patterns): - return "direct" - - return "unknown" - +# ============================================================================= +# MAIN VIDEO PROCESSING +# ============================================================================= -def process_single_video(url: str, custom_id: Optional[str] = None, - progress_bar=None, status_text=None, +def process_single_video(url: str, custom_id: Optional[str] = None, + source_type: str = "unknown", + progress_bar=None, status_text=None, overall_progress: Tuple[int, int] = (0, 1)) -> Dict[str, Any]: """ - Process a single video URL (YouTube or Direct). - Returns result dict with status and metadata. + Process a single video: download (if needed), transcribe, segment, index. """ result = { "url": url, @@ -932,27 +947,24 @@ def process_single_video(url: str, custom_id: Optional[str] = None, "status": "pending", "segments_count": 0, "error": None, - "index_status": None + "index_status": None, + "source_url": url, + "source_type": source_type, + "url_stored": False } try: - # Detect URL type + # Validate URL url_type = detect_url_type(url) - if url_type == "unknown": result["status"] = "failed" result["error"] = "Unknown URL type. Must be YouTube or direct media URL." return result # Generate video ID - if custom_id: - video_id = custom_id.strip() - else: - video_id = generate_video_id(f"batch_{url}") - + video_id = custom_id.strip() if custom_id else generate_video_id(f"batch_{url}") result["video_id"] = video_id - # Update progress current, total = overall_progress base_progress = int((current / total) * 100) if progress_bar else 0 @@ -981,7 +993,6 @@ def process_single_video(url: str, custom_id: Optional[str] = None, result["error"] = f"Download failed: {error}" return result - # Read and upload with open(downloaded_path, 'rb') as f: file_bytes = f.read() @@ -990,7 +1001,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None, if status_text: status_text.text(f"[{current}/{total}] Uploading to Azure...") - # Try SDK first + # Try SDK first, fallback to REST sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name) if error and ("not installed" in error or "SDK" in error): sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name) @@ -1013,7 +1024,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None, result["error"] = "No media URL available" return result - # Submit transcription + # Submit to Speech API if status_text: status_text.text(f"[{current}/{total}] Submitting to Speech API...") @@ -1034,9 +1045,8 @@ def process_single_video(url: str, custom_id: Optional[str] = None, poll_result = poll_transcription_operation(operation_url) status = poll_result.get("status", "unknown") - # Update progress during polling if progress_bar: - poll_progress = min(int((i / max_polls) * 20), 20) # 20% of progress for polling + poll_progress = min(int((i / max_polls) * 20), 20) overall = base_progress + int((1 / total) * 80) + int((1 / total) * poll_progress) progress_bar.progress(min(overall, 99)) @@ -1054,7 +1064,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None, result["error"] = "Transcription timed out" return result - # Process segments + # Process and index if status_text: status_text.text(f"[{current}/{total}] Processing segments...") @@ -1064,10 +1074,25 @@ def process_single_video(url: str, custom_id: Optional[str] = None, # Save to blob save_segments_to_blob(video_id, segments) - # Index to search + # Index with URL tracking try: - index_result = index_segments_direct(video_id, segments) - result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents (key: {index_result.get('key_field_used', 'unknown')})" + index_result = index_segments_direct( + video_id, + segments, + source_url=url, + source_type=source_type + ) + + result["url_stored"] = index_result.get('source_url_stored', False) + result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents" + + # Debug info + st.session_state['debug_info'][video_id] = { + 'url_fields_available': index_result.get('url_fields_available', {}), + 'source_url_stored': index_result.get('source_url_stored', False), + 'source_type_stored': index_result.get('source_type_stored', False) + } + except Exception as e: result["index_status"] = f"Indexing failed: {str(e)}" @@ -1083,7 +1108,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None, # ============================================================================= -# PAGE 1: SEARCH +# PAGE 1: SEARCH SEGMENTS # ============================================================================= if page == "🔎 Search Segments": @@ -1091,6 +1116,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None, if not SEARCH_FN_URL: st.error("SEARCH_FN_URL not configured. Cannot search.") + st.info("Please set SEARCH_FN_URL environment variable.") else: col1, col2 = st.columns([3, 1]) with col1: @@ -1116,7 +1142,17 @@ def process_single_video(url: str, custom_id: Optional[str] = None, start_ms, end_ms = h.get("start_ms", 0), h.get("end_ms", 0) vid, seg, score = h.get("video_id", ""), h.get("segment_id", ""), h.get("score") - header = f"{i}. {vid} | {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}" + # Show URL info if available + source_url = h.get('source_url', '') + source_type = h.get('source_type', '') + url_indicator = "" + + if source_url: + url_indicator = f" | 🔗 {source_type}: {source_url[:40]}..." + elif source_type and source_type != 'unknown': + url_indicator = f" | 📁 {source_type}" + + header = f"{i}. {vid} | {ms_to_ts(start_ms)}–{ms_to_ts(end_ms)}{url_indicator}" if seg: header += f" | seg={seg}" if score is not None: @@ -1126,23 +1162,40 @@ def process_single_video(url: str, custom_id: Optional[str] = None, st.write(h.get("text", "")) if h.get("pred_labels"): st.caption(f"Labels: {', '.join(h['pred_labels'])}") + if source_url: + st.caption(f"**Source:** [{source_url}]({source_url})") + st.caption(f"**Type:** {source_type}") + except Exception as e: st.error(f"Search failed: {e}") # ============================================================================= -# PAGE 2: UPLOAD (DIRECT API VERSION) +# PAGE 2: UPLOAD & TRANSCRIBE # ============================================================================= elif page == "⬆️ Upload & Transcribe": st.header("Upload Video for Transcription") - st.info(" Using direct Azure Speech API (bypassing Azure Function)") - # Check Azure config + # Check URL fields status + url_status = check_url_fields_status() + + if url_status['fields_exist']: + st.success("✅ URL Tracking Enabled - Original source URLs will be stored") + else: + st.warning(f""" + ⚠️ **Partial URL Tracking** - Missing fields: {', '.join(url_status['missing_fields'])} + + Videos will still be processed, but URL information will be limited. + Add missing fields to your Azure Search index for full functionality. + """) + + # Check Azure configuration azure_configured = bool(AZURE_STORAGE_KEY) and bool(SPEECH_KEY) if not azure_configured: st.error("⚠️ Azure Storage and Speech keys required. Check .env file.") + # Source selection source_type = st.radio("Select Source", ["File Upload", "Direct URL", "YouTube", "📁 Batch CSV Upload"], horizontal=True) @@ -1150,12 +1203,11 @@ def process_single_video(url: str, custom_id: Optional[str] = None, media_url = None video_id = None file_bytes = None - yt_url = None # Initialize to None + yt_url = None csv_df = None + detected_source_type = "unknown" - # ------------------------------------------------------------------------- - # FILE UPLOAD - # ------------------------------------------------------------------------- + # --- File Upload --- if source_type == "File Upload": if not azure_configured: st.info("Please configure Azure Storage to enable file upload") @@ -1170,143 +1222,105 @@ def process_single_video(url: str, custom_id: Optional[str] = None, st.success(f"📁 {uploaded_file.name} ({uploaded_file.size / 1024 / 1024:.1f} MB)") file_bytes = uploaded_file.getvalue() video_id = generate_video_id(uploaded_file.name) - st.info("File ready for upload to Azure") + detected_source_type = "upload" + st.info("File ready for upload") - # ------------------------------------------------------------------------- - # DIRECT URL - # ------------------------------------------------------------------------- + # --- Direct URL --- elif source_type == "Direct URL": - url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/...") + url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/ ...") if url_input.strip(): media_url = url_input.strip() video_id = generate_video_id(url_input) + detected_source_type = "direct" st.success("✅ URL validated") - # ------------------------------------------------------------------------- - # YOUTUBE - FIXED with session state - # ------------------------------------------------------------------------- + # --- YouTube --- elif source_type == "YouTube": - # Use session state to persist the URL yt_url = st.text_input( - "YouTube URL", - placeholder="https://youtube.com/watch?v=...", + "YouTube URL", + placeholder="https://youtube.com/watch?v= ...", value=st.session_state.yt_url_value, key="yt_url_input" ) - # Update session state when URL changes - FIXED: removed experimental_rerun + # Update session state if yt_url != st.session_state.yt_url_value: st.session_state.yt_url_value = yt_url - # Use st.rerun() instead of st.experimental_rerun() for newer Streamlit versions try: st.rerun() - except AttributeError: - # Fallback for older versions - try: - st.experimental_rerun() - except AttributeError: - pass # If neither exists, just continue without rerun + except: + pass + # Check yt-dlp if not check_yt_dlp(): st.warning("yt-dlp not installed") if st.button("Install yt-dlp"): with st.spinner("Installing..."): subprocess.run(["pip", "install", "-q", "yt-dlp"]) - # FIXED: Use st.rerun() instead of experimental_rerun try: st.rerun() - except AttributeError: - try: - st.experimental_rerun() - except AttributeError: - st.info("Please refresh the page manually") + except: + st.info("Please refresh the page") elif yt_url and yt_url.strip(): video_id = generate_video_id(f"yt_{yt_url.strip()}") + detected_source_type = "youtube" st.success("YouTube URL ready") - # ------------------------------------------------------------------------- - # BATCH CSV UPLOAD - NEW FEATURE - # ------------------------------------------------------------------------- + # --- Batch CSV Upload --- elif source_type == "📁 Batch CSV Upload": st.subheader("📁 Batch Process Videos from CSV") csv_file = st.file_uploader( "Upload CSV file", type=["csv"], - help="CSV must contain a column with video URLs (YouTube or direct links)" + help="CSV must contain a column with video URLs" ) if csv_file: try: - # Read CSV - handle various formats - # Try to detect if URLs are in header or rows - content = csv_file.read().decode('utf-8') - csv_file.seek(0) # Reset pointer - - # First attempt: standard read + # Read CSV with flexible parsing try: csv_df = pd.read_csv(csv_file) except Exception: - # Second attempt: maybe single column with no header csv_file.seek(0) csv_df = pd.read_csv(csv_file, header=None) csv_df.columns = [f"column_{i}" for i in range(len(csv_df.columns))] - # Check if column names look like URLs (common issue) + # Handle case where column names are URLs url_like_columns = [] for col in csv_df.columns: col_str = str(col).strip() - if detect_url_type(col_str) != "unknown" or col_str.startswith('http'): + if detect_url_type(col_str) != "unknown": url_like_columns.append(col) - # If column names look like URLs, treat them as data if url_like_columns and len(csv_df.columns) == 1: - # The column name is actually a URL, convert to data url_col_name = csv_df.columns[0] new_row = {url_col_name: url_col_name} csv_df = pd.concat([pd.DataFrame([new_row]), csv_df], ignore_index=True) - st.success(f"✅ Loaded CSV with {len(csv_df)} rows and {len(csv_df.columns)} columns") - - # Show available columns - st.write("**Available columns:**", list(csv_df.columns)) + st.success(f"✅ Loaded CSV with {len(csv_df)} rows") - # Let user select the URL column - url_column = st.selectbox( - "Select column containing video URLs", - options=csv_df.columns.tolist(), - help="Choose the column that contains YouTube or direct media URLs" - ) + # Column selection + url_column = st.selectbox("Select column containing video URLs", options=csv_df.columns.tolist()) - # Optional: Select custom ID column id_column_options = ["Auto-generate"] + [c for c in csv_df.columns if c != url_column] - id_column = st.selectbox( - "Select column for custom Video ID (optional)", - options=id_column_options, - index=0, - help="Optional: Choose a column to use as custom video ID (e.g., title, ID field)" - ) + id_column = st.selectbox("Select column for custom Video ID (optional)", options=id_column_options, index=0) # Extract and validate URLs urls_raw = csv_df[url_column].dropna().astype(str).tolist() - - # Clean URLs (remove whitespace) urls_to_process = [u.strip() for u in urls_raw if u.strip()] # Preview - with st.expander(f"Preview URLs to process ({len(urls_to_process)} found)"): + with st.expander(f"Preview URLs ({len(urls_to_process)} found)"): for i, url in enumerate(urls_to_process[:10], 1): url_type = detect_url_type(url) icon = "🎬" if url_type == "youtube" else "📄" if url_type == "direct" else "❓" st.text(f"{i}. {icon} {url[:80]}...") - if len(urls_to_process) > 10: - st.caption(f"... and {len(urls_to_process) - 10} more") - # Validate URLs + # Validate valid_urls = [] invalid_urls = [] - for url in urls_to_process: url_type = detect_url_type(str(url)) if url_type in ["youtube", "direct"]: @@ -1315,16 +1329,11 @@ def process_single_video(url: str, custom_id: Optional[str] = None, invalid_urls.append(url) col1, col2, col3 = st.columns(3) - col1.metric("Total URLs", len(urls_to_process)) - col2.metric("✅ Valid", len(valid_urls), f"{len(valid_urls)/len(urls_to_process)*100:.1f}%" if urls_to_process else "0%") + col1.metric("Total", len(urls_to_process)) + col2.metric("✅ Valid", len(valid_urls)) col3.metric("❌ Invalid", len(invalid_urls)) - if invalid_urls: - with st.expander(f"Show {len(invalid_urls)} invalid URLs"): - for url in invalid_urls[:10]: - st.text(f"❌ {url[:100]}...") - - # Store in session state for processing + # Store in session state st.session_state['batch_urls'] = valid_urls st.session_state['batch_df'] = csv_df st.session_state['batch_url_column'] = url_column @@ -1335,7 +1344,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None, import traceback st.error(traceback.format_exc()) - # Custom ID (for single uploads) + # Custom ID input custom_id = st.text_input("Custom Video ID (optional)") if custom_id.strip() and source_type != "📁 Batch CSV Upload": video_id = custom_id.strip() @@ -1347,7 +1356,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None, elif source_type == "Direct URL": can_process = media_url is not None and len(str(media_url).strip()) > 0 elif source_type == "YouTube": - yt_url_to_check = st.session_state.get('yt_url_value', '') or (yt_url if yt_url else '') + yt_url_to_check = st.session_state.get('yt_url_value', '') can_process = len(str(yt_url_to_check).strip()) > 0 and check_yt_dlp() elif source_type == "📁 Batch CSV Upload": can_process = (st.session_state.get('batch_urls') and @@ -1356,16 +1365,14 @@ def process_single_video(url: str, custom_id: Optional[str] = None, not st.session_state.get('batch_processing', False)) # Process button - button_text = " Start Transcription" + button_text = "🚀 Start Transcription" if source_type == "📁 Batch CSV Upload": count = len(st.session_state.get('batch_urls', [])) - button_text = f" Process {count} Videos from CSV" + button_text = f"🚀 Process {count} Videos from CSV" if st.button(button_text, type="primary", disabled=not can_process): - # --------------------------------------------------------------------- - # BATCH PROCESSING - # --------------------------------------------------------------------- + # --- BATCH PROCESSING --- if source_type == "📁 Batch CSV Upload": st.session_state.batch_processing = True st.session_state.batch_results = [] @@ -1376,31 +1383,32 @@ def process_single_video(url: str, custom_id: Optional[str] = None, id_column = st.session_state.get('batch_id_column') total = len(urls) - st.info(f"Starting batch processing of {total} videos...") - # Create progress containers + # Progress UI overall_progress = st.progress(0) status_text = st.empty() results_container = st.container() - # Process each URL results = [] for idx, url in enumerate(urls, 1): # Get custom ID if specified custom_vid_id = None if id_column != "Auto-generate": - # Find the row with this URL and get the ID row = csv_df[csv_df[url_column] == url] if not row.empty: custom_vid_id = str(row[id_column].iloc[0]) - # Sanitize ID custom_vid_id = re.sub(r'[^\w\s-]', '', custom_vid_id).strip().replace(' ', '_')[:50] - # Process video + # Detect source type + url_type = detect_url_type(url) + src_type = "youtube" if url_type == "youtube" else "direct" + + # Process result = process_single_video( url=url, custom_id=custom_vid_id, + source_type=src_type, progress_bar=overall_progress, status_text=status_text, overall_progress=(idx, total) @@ -1413,19 +1421,16 @@ def process_single_video(url: str, custom_id: Optional[str] = None, progress_pct = int((idx / total) * 100) overall_progress.progress(progress_pct) - # Show result in container + # Show result with results_container: if result['status'] == 'success': - st.success(f"✅ [{idx}/{total}] {result['video_id']}: {result['segments_count']} segments") + url_stored = "✅ URL saved" if result.get('url_stored') else "⚠️ URL not stored" + st.success(f"✅ [{idx}/{total}] {result['video_id']}: {result['segments_count']} segments ({url_stored})") else: - error_msg = result.get('error', 'Unknown error') - # Truncate long error messages - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - st.error(f"❌ [{idx}/{total}] Failed: {error_msg}") + error_msg = result.get('error', 'Unknown error')[:200] + st.error(f"❌ [{idx}/{total}] Failed: {error_msg}...") - # Small delay to prevent rate limiting - time.sleep(1) + time.sleep(1) # Rate limiting # Final summary overall_progress.progress(100) @@ -1442,14 +1447,16 @@ def process_single_video(url: str, custom_id: Optional[str] = None, col2.metric("Successful", len(successful), f"{len(successful)/total*100:.1f}%" if total > 0 else "0%") col3.metric("Failed", len(failed), f"{len(failed)/total*100:.1f}%" if total > 0 else "0%") - # Detailed results table + # Detailed results with st.expander("View Detailed Results"): results_df = pd.DataFrame([ { 'Video ID': r['video_id'], 'URL': r['url'][:50] + "..." if len(r['url']) > 50 else r['url'], + 'Source Type': r.get('source_type', 'unknown'), 'Status': r['status'], 'Segments': r.get('segments_count', 0), + 'URL Stored': r.get('url_stored', False), 'Indexing': r.get('index_status', 'N/A'), 'Error': (r.get('error', '')[:100] + '...') if r.get('error') else '' } @@ -1457,73 +1464,48 @@ def process_single_video(url: str, custom_id: Optional[str] = None, ]) st.dataframe(results_df) - # Download results as CSV + # Download results csv_buffer = io.StringIO() results_df.to_csv(csv_buffer, index=False) - st.download_button( - "Download Results CSV", - csv_buffer.getvalue(), - "batch_processing_results.csv", - "text/csv" - ) + st.download_button("Download Results CSV", csv_buffer.getvalue(), "batch_results.csv", "text/csv") # Search hint if successful: - st.info("💡 **Search processed videos using:**") + st.info("💡 **Search processed videos:**") video_ids = [r['video_id'] for r in successful[:5]] st.code(f"video_id:({' OR '.join(video_ids)})") st.session_state.batch_processing = False - + + # --- SINGLE VIDEO PROCESSING --- else: - # ----------------------------------------------------------------- - # SINGLE VIDEO PROCESSING (Original logic) - # ----------------------------------------------------------------- progress_bar = st.progress(0) status = st.empty() try: - # ------------------------------------------------------------- - # HANDLE FILE UPLOAD (Direct to Azure) - # ------------------------------------------------------------- + # Upload file if needed if source_type == "File Upload" and file_bytes: progress_bar.progress(10) status.text("Uploading to Azure Blob...") blob_name = f"upload_{video_id}_{int(time.time())}.m4a" - # Try SDK method first, fallback to fixed REST method - sas_url = None - error = None - - try: - sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name) - except Exception as e: - error = str(e) - + sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name) if error and ("not installed" in error or "SDK" in error): - st.info("Using REST API for upload...") sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name) if error: raise Exception(error) - if not sas_url: - raise Exception("Failed to generate SAS URL") - media_url = sas_url progress_bar.progress(50) - status.text("Upload complete, starting transcription...") - # ------------------------------------------------------------- - # HANDLE YOUTUBE (Download then Upload) - # ------------------------------------------------------------- + # Download YouTube if needed elif source_type == "YouTube": - # Get URL from session state yt_url = st.session_state.get('yt_url_value', '') if not yt_url or not yt_url.strip(): - raise Exception("YouTube URL is empty. Please enter a valid YouTube URL.") + raise Exception("YouTube URL is empty") import tempfile with tempfile.TemporaryDirectory() as tmpdir: @@ -1531,11 +1513,7 @@ def process_single_video(url: str, custom_id: Optional[str] = None, status.text("Downloading from YouTube...") output_path = f"{tmpdir}/youtube_{video_id}.m4a" - downloaded_path, error = download_youtube_audio( - yt_url.strip(), - output_path, - lambda p, m: (progress_bar.progress(p), status.text(m)) - ) + downloaded_path, error = download_youtube_audio(yt_url.strip(), output_path) if error: raise Exception(error) @@ -1543,42 +1521,25 @@ def process_single_video(url: str, custom_id: Optional[str] = None, progress_bar.progress(50) status.text("Uploading to Azure Blob...") - # Read file and upload with open(downloaded_path, 'rb') as f: file_bytes = f.read() blob_name = f"youtube_{video_id}_{int(time.time())}.m4a" - # Try SDK first, fallback to fixed REST - sas_url = None - error = None - - try: - sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name) - except Exception as e: - error = str(e) - - if error and ("not installed" in error or "SDK" in error): - st.info("Using REST API for upload...") + sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name) + if error and ("not installed" in error): sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name) if error: raise Exception(error) - if not sas_url: - raise Exception("Failed to generate SAS URL") - media_url = sas_url progress_bar.progress(75) - status.text("Processing with Azure Speech...") - # ------------------------------------------------------------- - # TRANSCRIBE (All paths lead here) - # ------------------------------------------------------------- if not media_url: raise Exception("No media URL available") - # Submit directly to Azure Speech API + # Transcribe status.text("Submitting to Azure Speech-to-Text...") result = submit_transcription_direct(video_id, media_url) operation_url = result.get("operation_url") @@ -1586,9 +1547,6 @@ def process_single_video(url: str, custom_id: Optional[str] = None, if not operation_url: raise Exception("No operation URL returned") - # Debug info - st.info(f"Debug: Operation URL received") - # Poll max_polls = 120 transcription_data = None @@ -1603,35 +1561,40 @@ def process_single_video(url: str, custom_id: Optional[str] = None, status.text(f"Transcribing... ({i * POLL_SECONDS // 60} min) - Status: {status_text}") if status_text.lower() == "succeeded": - status.text("Transcription complete, retrieving results...") transcription_data = get_transcription_from_result(poll_result) break - elif status_text.lower() == "failed": - error_msg = poll_result.get("properties", {}).get("error", {}).get("message", "Unknown error") - raise Exception(f"Transcription failed: {error_msg}") + raise Exception(f"Transcription failed: {poll_result.get('properties', {}).get('error', {}).get('message', 'Unknown error')}") if not transcription_data: raise Exception("Transcription timed out") - # ------------------------------------------------------------- - # PROCESS & INDEX (DIRECT) - # ------------------------------------------------------------- + # Process and index progress_bar.progress(98) status.text("Processing segments and indexing...") - # Convert to segments segments = process_transcription_to_segments(transcription_data, video_id) # Save to blob - blob_name = save_segments_to_blob(video_id, segments) + save_segments_to_blob(video_id, segments) - # Index to search - try: - index_result = index_segments_direct(video_id, segments) - index_msg = f"Indexed: {index_result.get('indexed', 0)} documents (key field: {index_result.get('key_field_used', 'unknown')})" - except Exception as e: - index_msg = f"Indexing failed: {str(e)}" + # Index with URL tracking + original_url = None + if source_type == "YouTube": + original_url = st.session_state.get('yt_url_value', '') + elif source_type == "Direct URL": + original_url = media_url + elif source_type == "File Upload": + original_url = f"uploaded_file://{video_id}" + + index_result = index_segments_direct( + video_id, + segments, + source_url=original_url, + source_type=detected_source_type + ) + + url_stored_msg = "✅ Source URL stored" if index_result.get('source_url_stored') else "⚠️ URL storage not available" progress_bar.progress(100) status.text("Complete!") @@ -1640,24 +1603,279 @@ def process_single_video(url: str, custom_id: Optional[str] = None, ✅ **Transcription Complete!** - Video ID: {video_id} - Segments: {len(segments)} - - {index_msg} + - Source Type: {detected_source_type} + - Indexed: {index_result.get('indexed', 0)} documents + - {url_stored_msg} """) + + if original_url: + st.info(f"**Original Source:** [{original_url}]({original_url})") + st.code(f'Search: video_id:{video_id}') - # Show sample segments with st.expander("View first 5 segments"): for seg in segments[:5]: st.write(f"**{ms_to_ts(seg['start_ms'])} - {ms_to_ts(seg['end_ms'])}:** {seg['text'][:100]}...") - + except Exception as e: st.error(f"❌ Error: {str(e)}") st.exception(e) + + +# ============================================================================= +# PAGE 3: MANAGE VIDEOS +# ============================================================================= + +elif page == "📚 Manage Videos": + st.header("📚 Manage Stored Videos") + st.info("View, search, and manage all processed videos and their source URLs") + + if not SEARCH_ENDPOINT or not SEARCH_KEY: + st.error("Azure Search not configured. Cannot retrieve video list.") + else: + # Check URL fields status + url_status = check_url_fields_status() + + if url_status['fields_exist']: + st.success("✅ URL tracking fields are configured") + else: + st.warning(f"⚠️ Missing URL fields: {', '.join(url_status['missing_fields'])}") + + # URL coverage analysis + if st.button("📊 Analyze URL Data Coverage"): + with st.spinner("Analyzing..."): + all_videos = get_stored_videos(include_missing=True) + + with_urls = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']] + without_urls = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']] + + col1, col2, col3 = st.columns(3) + col1.metric("Total Videos", len(all_videos)) + col2.metric("✅ With URL Data", len(with_urls), f"{len(with_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%") + col3.metric("⚠️ Missing URL Data", len(without_urls), f"{len(without_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%") + + # By type breakdown + st.subheader("Breakdown by Source Type") + type_counts = {} + for v in all_videos: + t = v.get('source_type') or 'unknown' + type_counts[t] = type_counts.get(t, 0) + 1 + + cols = st.columns(len(type_counts) if type_counts else 1) + for i, (stype, count) in enumerate(sorted(type_counts.items())): + icon = "🎬" if stype == "youtube" else "📄" if stype == "direct" else "📁" if stype == "upload" else "❓" + cols[i % len(cols)].metric(f"{icon} {stype}", count) - # Debug info - if 'debug_poll_url' in st.session_state: - st.error(f"Debug - Poll URL used: {st.session_state['debug_poll_url']}") + if without_urls: + with st.expander(f"Videos without URL data ({len(without_urls)})"): + st.info("These were likely processed before URL tracking was enabled") + for v in without_urls[:20]: + st.text(f"• {v.get('video_id')}") + + st.markdown("---") + + # Filters + st.subheader("Filter Videos") + col1, col2 = st.columns(2) + + with col1: + filter_video_id = st.text_input("Filter by Video ID (optional)") + with col2: + filter_options = ["All", "With URL Data Only", "Missing URL Data Only", "youtube", "direct", "upload", "unknown"] + filter_source_type = st.selectbox("Filter by Source Type", options=filter_options, index=0) + + # Load videos + if st.button("🔍 Load Videos", type="primary"): + with st.spinner("Retrieving videos..."): + + # Handle special filters + if filter_source_type == "Missing URL Data Only": + all_videos = get_stored_videos(include_missing=True) + videos = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']] + if filter_video_id.strip(): + videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()] + elif filter_source_type == "With URL Data Only": + all_videos = get_stored_videos(include_missing=True) + videos = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']] + if filter_video_id.strip(): + videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()] + else: + source_type = None if filter_source_type == "All" else filter_source_type + videos = get_stored_videos( + video_id=filter_video_id if filter_video_id.strip() else None, + source_type=source_type, + include_missing=True, + limit=1000 + ) + + st.session_state.stored_videos_cache = videos + st.success(f"Found {len(videos)} videos") + + # Display videos + if st.session_state.stored_videos_cache: + videos = st.session_state.stored_videos_cache + + # Metrics + st.markdown("---") + cols = st.columns(4) + + type_counts = {} + for v in videos: + t = v.get('source_type') or 'unknown' + type_counts[t] = type_counts.get(t, 0) + 1 + + cols[0].metric("Total", len(videos)) + cols[1].metric("YouTube", type_counts.get('youtube', 0)) + cols[2].metric("Direct", type_counts.get('direct', 0)) + cols[3].metric("Upload", type_counts.get('upload', 0)) + + # Group by type + st.markdown("---") + st.subheader("Video List") + + videos_by_type = {} + for v in videos: + stype = v.get('source_type') or 'unknown' + if stype not in videos_by_type: + videos_by_type[stype] = [] + videos_by_type[stype].append(v) + + # Display by category + for source_type in ['youtube', 'direct', 'upload', 'unknown']: + if source_type not in videos_by_type: + continue + + type_videos = videos_by_type[source_type] + icon = "🎬" if source_type == "youtube" else "📄" if source_type == "direct" else "📁" if source_type == "upload" else "❓" + + with st.expander(f"{icon} {source_type.upper()} ({len(type_videos)} videos)", expanded=(source_type == 'youtube')): + for i, video in enumerate(type_videos, 1): + vid = video.get('video_id', 'unknown') + src_url = video.get('source_url', '') + processed = video.get('processed_at', 'unknown') + + has_url = bool(src_url) + status_icon = "✅" if has_url else "⚠️" + + with st.container(): + cols = st.columns([4, 1]) + + with cols[0]: + st.write(f"**{status_icon} {i}. {vid}**") + st.caption(f"Processed: {processed}") + + if src_url: + display_url = src_url[:80] + "..." if len(str(src_url)) > 80 else src_url + st.code(display_url) + if str(src_url).startswith('http'): + st.markdown(f"[Open Source ↗]({src_url})") + else: + st.warning("No source URL stored") + + with cols[1]: + if st.button(f"🗑️ Delete", key=f"del_{vid}_{i}_{source_type}"): + if delete_video_by_id(vid): + st.success(f"Deleted {vid}") + st.session_state.stored_videos_cache = [ + v for v in videos if v.get('video_id') != vid + ] + try: + st.rerun() + except: + pass + + st.markdown("---") + + # Export + st.markdown("---") + if st.button("📥 Export to CSV"): + export_df = pd.DataFrame([ + { + 'video_id': v.get('video_id'), + 'source_type': v.get('source_type') or 'unknown', + 'source_url': v.get('source_url', ''), + 'has_url_data': bool(v.get('source_url')), + 'processed_at': v.get('processed_at', 'unknown') + } + for v in videos + ]) + + csv_buffer = io.StringIO() + export_df.to_csv(csv_buffer, index=False) + st.download_button("Download CSV", csv_buffer.getvalue(), "video_list.csv", "text/csv") + + +# ============================================================================= +# PAGE 4: SYSTEM DIAGNOSTICS +# ============================================================================= + +elif page == "⚙️ System Diagnostics": + st.header("⚙️ System Diagnostics") + st.info("Check system configuration and troubleshoot issues") + + # Configuration status + st.subheader("Configuration Status") + + config_checks = { + "Azure Speech (SPEECH_KEY)": bool(SPEECH_KEY), + "Azure OpenAI (AZURE_OPENAI_KEY)": bool(AZURE_OPENAI_KEY), + "Azure Search (SEARCH_KEY)": bool(SEARCH_KEY), + "Azure Storage (AZURE_STORAGE_KEY)": bool(AZURE_STORAGE_KEY), + "Search Function (SEARCH_FN_URL)": bool(SEARCH_FN_URL), + "yt-dlp installed": check_yt_dlp() + } + + cols = st.columns(2) + for i, (name, status) in enumerate(config_checks.items()): + icon = "✅" if status else "❌" + cols[i % 2].write(f"{icon} {name}: {'OK' if status else 'Not configured'}") + + # Index schema check + st.markdown("---") + st.subheader("Index Schema Check") + + if st.button("🔍 Check Index Schema"): + with st.spinner("Fetching schema..."): + schema = debug_check_index_schema() + + if isinstance(schema, dict): + st.success(f"Index: {schema['index_name']}") + st.write(f"Key Field: `{schema['key_field']}`") + + # URL fields status + if schema.get('has_all_url_fields'): + st.success("✅ All URL tracking fields present") + else: + st.warning(f"⚠️ Missing fields: {', '.join(schema.get('missing_url_fields', []))}") + + # Show all fields + with st.expander("View all fields"): + for field in schema['fields']: + key = "🔑" if field['key'] else "" + url = "🔗" if 'url' in field['name'].lower() else "" + st.caption(f"{key}{url} `{field['name']}` ({field['type']})") + + st.session_state.index_schema_cache = schema + else: + st.error(f"Schema check failed: {schema}") + + # Debug info + st.markdown("---") + st.subheader("Debug Information") + + with st.expander("Session State"): + st.json({ + k: str(v)[:100] + "..." if len(str(v)) > 100 else v + for k, v in st.session_state.items() + }) + + with st.expander("Recent Processing Debug"): + if st.session_state.get('debug_info'): + st.json(st.session_state['debug_info']) + else: + st.info("No debug info yet. Process a video first.") # Footer st.sidebar.markdown("---") -st.sidebar.caption("Video Annotation Platform v1.0 - Direct API Mode") \ No newline at end of file +st.sidebar.caption("Video Annotation Platform v2.1") \ No newline at end of file From 7ee4987fd789f7b47d20d4dc66e54bdaedf5a91d Mon Sep 17 00:00:00 2001 From: Martin Nwadiugwu Date: Mon, 23 Feb 2026 20:22:47 -0600 Subject: [PATCH 5/8] =?UTF-8?q?scripts=20to=20verify/enable=20URL=E2=80=91?= =?UTF-8?q?tracking?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- add_url_fields.py | 132 ++++++++++++++++++++++++++++++++++++++++++++++ verify_fields.py | 67 +++++++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 add_url_fields.py create mode 100644 verify_fields.py diff --git a/add_url_fields.py b/add_url_fields.py new file mode 100644 index 0000000..a9a7ed8 --- /dev/null +++ b/add_url_fields.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +""" +Add URL tracking fields to Azure Search index +""" + +import os +import requests +import sys + +# Load from .env file +env_path = os.path.join(os.path.dirname(__file__), "ui", ".env") +env_vars = {} + +if os.path.exists(env_path): + with open(env_path) as f: + for line in f: + if '=' in line and not line.startswith('#'): + key, value = line.strip().split('=', 1) + env_vars[key] = value + +SEARCH_ENDPOINT = env_vars.get("SEARCH_ENDPOINT") +SEARCH_KEY = env_vars.get("SEARCH_KEY") +SEARCH_INDEX_NAME = env_vars.get("SEARCH_INDEX_NAME", "segments") + +print(f"Endpoint: {SEARCH_ENDPOINT}") +print(f"Index: {SEARCH_INDEX_NAME}") +print(f"Key: {'*' * 10}{SEARCH_KEY[-4:] if SEARCH_KEY else 'NOT FOUND'}") +print() + +if not SEARCH_ENDPOINT or not SEARCH_KEY: + print("ERROR: Missing SEARCH_ENDPOINT or SEARCH_KEY in .env") + sys.exit(1) + +API_VERSION = "2024-07-01" + +def get_index(): + url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version={API_VERSION}" + headers = {"api-key": SEARCH_KEY} + + print(f"Fetching index: {url}") + response = requests.get(url, headers=headers) + + if response.status_code == 200: + return response.json() + else: + print(f"Failed to get index: {response.status_code}") + print(response.text) + return None + +def update_index(index_def): + url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version={API_VERSION}" + headers = { + "Content-Type": "application/json", + "api-key": SEARCH_KEY + } + + response = requests.put(url, headers=headers, json=index_def) + + if response.status_code in [200, 201]: + print("✅ Index updated successfully!") + return True + else: + print(f"❌ Failed to update: {response.status_code}") + print(response.text) + return False + +def main(): + print("Fetching current index...") + index = get_index() + if not index: + sys.exit(1) + + existing_fields = {f["name"] for f in index.get("fields", [])} + print(f"Existing fields: {existing_fields}") + print() + + new_fields = [ + { + "name": "source_url", + "type": "Edm.String", + "searchable": False, + "filterable": True, + "retrievable": True, + "sortable": False, + "facetable": False, + "key": False + }, + { + "name": "source_type", + "type": "Edm.String", + "searchable": False, + "filterable": True, + "retrievable": True, + "sortable": False, + "facetable": True, + "key": False + }, + { + "name": "processed_at", + "type": "Edm.DateTimeOffset", + "searchable": False, + "filterable": True, + "retrievable": True, + "sortable": True, + "facetable": False, + "key": False + } + ] + + added = 0 + for field in new_fields: + if field["name"] in existing_fields: + print(f"⚠️ Already exists: {field['name']}") + else: + print(f"➕ Adding: {field['name']}") + index["fields"].append(field) + added += 1 + + if added == 0: + print("\n✅ All fields already present!") + return + + print(f"\n💾 Saving with {added} new fields...") + if update_index(index): + print("\n🎉 SUCCESS! URL tracking fields added.") + print("\nNext steps:") + print("1. Restart your Streamlit app") + print("2. Go to 'System Diagnostics' page") + print("3. Click 'Check Index Schema' to verify") + +if __name__ == "__main__": + main() diff --git a/verify_fields.py b/verify_fields.py new file mode 100644 index 0000000..be7742b --- /dev/null +++ b/verify_fields.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +"""Verify URL fields were added to the index""" + +import os +import requests + +# Load .env +env_path = os.path.join(os.path.dirname(__file__), "ui", ".env") +env_vars = {} + +if os.path.exists(env_path): + with open(env_path) as f: + for line in f: + if '=' in line and not line.startswith('#'): + key, value = line.strip().split('=', 1) + env_vars[key] = value + +SEARCH_ENDPOINT = env_vars.get("SEARCH_ENDPOINT") +SEARCH_KEY = env_vars.get("SEARCH_KEY") +SEARCH_INDEX_NAME = env_vars.get("SEARCH_INDEX_NAME", "segments") + +API_VERSION = "2024-07-01" + +def check_index(): + url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version={API_VERSION}" + headers = {"api-key": SEARCH_KEY} + + response = requests.get(url, headers=headers) + + if response.status_code == 200: + index = response.json() + fields = {f["name"]: f["type"] for f in index.get("fields", [])} + + print("✅ Successfully connected to index!") + print(f"\nTotal fields: {len(fields)}") + print(f"\nChecking URL tracking fields:") + + url_fields = { + "source_url": "Edm.String", + "source_type": "Edm.String", + "processed_at": "Edm.DateTimeOffset" + } + + all_present = True + for field, expected_type in url_fields.items(): + if field in fields: + print(f" ✅ {field}: {fields[field]}") + else: + print(f" ❌ {field}: MISSING") + all_present = False + + if all_present: + print("\n🎉 SUCCESS! All URL tracking fields are present!") + print("\nYou can now:") + print("1. Restart your Streamlit app") + print("2. Process new videos - URLs will be stored automatically") + else: + print("\n⚠️ Some fields are missing. Run the add script again.") + + return all_present + else: + print(f"❌ Failed to get index: {response.status_code}") + print(response.text) + return False + +if __name__ == "__main__": + check_index() From 2f1555a2f47295773e9905b19604d86eae625ec9 Mon Sep 17 00:00:00 2001 From: Martin Nwadiugwu Date: Mon, 2 Mar 2026 18:46:08 -0600 Subject: [PATCH 6/8] Add .gitignore for macOS, JSONL, Python caches, and local venv --- .gitignore | 181 +----------------- .../__pycache__/speech_batch.cpython-311.pyc | Bin 22598 -> 0 bytes 2 files changed, 9 insertions(+), 172 deletions(-) delete mode 100644 shared/__pycache__/speech_batch.cpython-311.pyc diff --git a/.gitignore b/.gitignore index 6e659d7..e29811b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,175 +1,12 @@ -local.settings.json -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock +# macOS +.DS_Store -# UV -# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -#uv.lock +# JSON lines +*.jsonl -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ - -# Ruff stuff: -.ruff_cache/ +# Python bytecode +*.pyc +__pycache__/ -# PyPI configuration file -.pypirc +# Local virtual env +func_venv/ diff --git a/shared/__pycache__/speech_batch.cpython-311.pyc b/shared/__pycache__/speech_batch.cpython-311.pyc deleted file mode 100644 index 58ca2888792699e4a1a7d3ce5b2dab4fcc63daf0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22598 zcmdUXYm6LMc3xHWyQk;Lhn^IRA|+01In9SwT7|3C?vNab)KWv5Gn7}oR@yyXHDr_h z(ybZ}*{%VkwXpHVguUyR42ug)){)jS=@$RT4>%bnOC%1H zKxPmm-#NG5-OW&j10*%mw{P9|oQLnd(e?TXh?U=UUs_JE}X_pND^Eb!Ypb z>O=fq!2841hue=-AHjRE`lvBheat9TA2-IUyNrqI6J}OZUQvw6cNJsmLrqa`;v=6A z@paRGQL4MWl+#E#bKjJ@@nvJ1xyRUUPMCWh2}&?_n0r>U#?E&$)hFTKy{Z`xy_>G? zg_N@XqvlGGcSpQdjKk*;f6z-mSYFzA0r7`8{%?3?W{sl=I~*xK&^gnmxPO?da$2 zW~*s8O{-X3zSy+%cGp;InR-ujuQUzQ(i`1&PnZ|Yj@7(k>T7;#eYGp}^{%+w>|D^- zoA$+_H5R!=x*e-={_LrWe#+J_);mVaw2l_%^jEtpy0x~_Zdw+#2@!g)+iIa~M279v zjqZA<)vX(xqSF=adaK#5Q$LXISc^)ryP|%<)SDgKydXGApIw??($AketzR%ZrVKlv zuQt)kwH}pbm67{6n$s~`6mFY>dPS`;Oap0}4Zn`%X4}M&bgr0!TRMCA(4mf{+s(#h zxLv*7M5`?mDX7%qyl7l(+GfM1$vD?-nbg_iUA)MN(_76IQ5V;EH0!NaVx+n~uelbo zowrOwU%3_x$BKEL5~{a=R8b18uEc>aHBsmxvf_eF*t*#tWt`(;dHPHZDi{LDYi+Kn8Vt%?i?n zHPqHb%Q~vRV%m{hk{CcCmcA;wZLfPaMhInbAwJ>NFaWh+1dr+`h7qJ0DO$E6$Pwx5 zQ4j*rksTeySC`H#5*?xa9iuJ`Z#20L?fPZ2X4Ng~0`-7t8Eae;H=2$Wxk788-oW%( z#oHv!7R#zzFzR-_(W+ZkIpyZ@V$!U-1#%6uW!K$2dHdatSx&pD`OY;reWKa0-Sp|E zWxM$^%qi+EMCNJuupn%=2;W*q`t!ZIu*}+9mRmrm*=ckQ^CQ)rrWNV&p5v`1D737( z)7~O9Yt&Zik2}NZ>eg9PcpSh&!ks}vtR}nOM0Yv$Ll`#sx)jTDo1o9_IdV$mJPfBf zwO>&h@B|OA4?lO0;OjlbRDgbzs&-y6Qzr1viwI?(Mm{4|&G1{A-!kC4S+^vaDq6AH zyx^uTo7ddD*)e)uu$+GJ$e|<8%^f~8clbGXtln$ZJVvpIjQLuvQ*WEKnp>>ZB%6i5 zRI9zUR&RMRDa#h_WDR82>4Jo5?bd2y7uxcXau=?0@0EjK1j0Jlt_!=_>8#g{=Gukz zwSzt-S_g@5cjvqky5it)86h4;Y3zIs9_s*{t>|Bx-fk;fTJ+oVBS!4=vkxEMFCqHV z4sSKfgPrBU&S9|g?)Z}A=iX@NK-%54jx8QT@%Ktn?9DYhy*1mNg$aq#X$0^VX&~(tBHp_lIPWPoT>rVG`o7I*4nI(F6i1DN;0?{u ztQk8OBmum4l=qcyYagPln?ZSBQyl^uTK`F6n8JK(tqHiHo1H67(e04T(XZ4+v%b{l

T_OuIsv> zDZoM^{*Ub9dAz=ik4L_D(mzoClky*UMs6!Vpl^)%oyfTRVLYKR%FUw~9^c=A2jd%z zfuh(H?mdPYAE8)WV)Ta3hLK8?koC(}luIqxRaiLVe!P-Gp81p@2Lmf87~a7 zz4rRTxusKQ7W?Td&sL6LdT8~D$H^hiRL)9XLX!c~1qW;P#9@jh9xI+DXD>OyA|fGR z*JS%>J)&LW3rMkuk442`($X|-G>QJVoLCu>nC6_E?7>Mbpe!1iyWfZN*E6YhFh!WI zVAwY!p{UAxjFgd%PuHi)oiVcU+_!QcqMn;UUBn{$kIBjQaBpIkcg$DMpIfLcEuT3z z|H?vb{`m1T=NFgV>3A^Hz^TQv=a*~8&nzy_pITfvC(7uAJ9Xy#ayKzMQK$rK@e+? z6NKb|hW720quxrz2Oa~R?mu>1n7~dFV+E^};kn@R&Hzp~!! z06AKfj!DpG%#&UO;M`&ZoD>YSS#Jqa&6KsU$riD3pWAIiLR9O)CApZzNhC1vu?XeR zU`aX6qDb4yLRz6ipRx)=RHV_O?(eNkRvhqu%Q(cN860>$}mq{ zB|{3HC1tM?T>$$Okye#AIA&eBntDT7S0QVj^2Rx6soq@Gu^BMBrq$VR>*iIU(#j&R z>~T0~SQMg@vmPlR+$)7xZF^cohy3`bg5t8W%HD*z;oF7FHn6B3BC^ zk4J5#L!QKNepwuUyd}5W5|Gn*V%C8kapZaa}^C`E$^TPn3K%3H9 zuWny#1Zy10NkH)A405qS(XP_u4V@4N!u(be2&C*x8i!M+oB4m=WgZ1alXX|lUql=u-OTEoY>00&EPf~MXh0wxGvlwOFWh=4=};$_!*j`0{*2pGEv|Y0?X*K4W(Y38Q3Iky~jy|g~nJ#A2AzQ2iVRY^HV7dO`VjZ z+2diDk-n9Q=XJ7xIq2L?>L5QGIVbbJV&t3@+`N-R+d)URuj57;}Y4CM} zHk7_tU>-0Ll*GG5Nu7TaS==JDLEiq@&6-zxO>s>eMZ`Vzix9naN|ND~;HII2vfWhO zFx;$UhDfl21foF>l_p4f2op}ZAc)zxsZMuYFa@R7?1rGh5j1*YhMaBWY==|MOF2bL zQI^E(hdSlvSD@-4>Ba@SDQxa7!qvQlf>l(CDl$YD@n6Ho`Z+kD?Wri8V>v!3Plm$e(TX-f6?y@E~CMNBgG>r5<5QXMPRWp80yV_TK}RJ|8y z7!O{ux*dqXN6s{QbMw7sdk%`QKvz2l6v=wUCa2TAp40I69Xqnh-= zwO6|ki2E)jqGqCJGzuB&A@^<>;A97sxkjyJUNKv>Q0WIHk=?2>69Fknl}Cvv6fIy? znww>gZMr3+31v|Inxt9_)d(9-Ajk;P(0-E#>q}@Wp^QrikPntI7*WV@u%J`KE*vl> zL%X0lX%aU#0H|-IH*!&-6|y5I8)HGrrC@<4*h$XFfT4iuLnytGck-74_G0ahvJ$YF z{3W7eewg9pF9pnw*k-(Z?3(kNML&J?`$N2h$5a~yqu>;-sA48oThS??%*0+2Wg>NW zIKeFG&}cO&X)Y`JIo2astP+KKv(i809TnJJ+q`NY4E3UK? zk=n2!_2ZWNC=tqOqLbx7H@wgx6@cMR66Gv+PvfZIBncjD^~XHAScHZ{tfMe@%%h+4 zP$hz?3DR=Y0mN-m#-OU4N3?F5R)d?PGNHTa@AQ_BY^AaME0Y0GS__MblA z0b7K&9V&`I9X-n|6IzM=lxBY!`iHS_@9O4BSwRcYP4~Lky3#`7eGPOJm)+7iD4S3{ z>m}SVsZ|CbG)y;511N}dm5YMpzqmpU^-x?TCs+dD9jwh}*q`gwuhE$URV)?eZVFRn z5xMwm(uW%ld--CsM0#)WO{D&He0-+4o%I@6&i*$qaE(|bv%4I=3$)zt>_iG?mXf7% zthaC4nxfp2k^1GxZ;vf9-1YpPP9;FN0pgKukW4d&wsNmabedE1&P61ZU`A{0W@pVd zt$;_MM1zN2luJaz{KR1?z)nO17V2J{OdO(w zd?CKh8UBKwh1!>mY;+|jcS~veHryoZ))%Rje4V95Gbn;mQ@K#1O%P~|r17na4_a6i(-JNObm4#&;P)eTWt zSX#pg3U=p!PcJi<>F-*O9br;{-5WDJf{0z1H~J6)qgvsy&)&l)oEb*hCGVIhCX z^l-`uwX~??z_5nHo0@C@^AxwVEV-WKJQ=HHnmr&jW;EZT(td`VpQVQ94L}T>m6~pz zml@%&pQ6~GCg(jk7LiN9UF2S7C#M}`E#E=Xe?fVE8xGjPgvTi0zwO~JpG8PkKnM-# z@QJ4MS&5$}z9sURNVs37GNYm48?i0?A)eH`G7YJCldOkggIv2M14>~mVsI@$AgkzI zoVz$me1&YNQQfYihFk=GLNWZn&$Lc2fPU&~V)Z~yYm2S|{7s}S2!4Um8W&AaOBQM& zpeINk?D(;}{bh>G;4ICyVj7?lC<8iGoD-RV{J)A+{}mtWX*hBi!a=4X@IN$cGKL2u zrtirjFZJQ>_uwYRA)({hjKW=Hkxx<&PA3XLMp~5HGKNcy;IXN{GLKC|Arax*ev5|i z$@GKpQz$qx55@_5J4dEW@baE|0h?WXa!&Sy^5#zv4;@iQf-m>frm_K4I3jgEPNTi; zBU2XQDL0Bv(NeF!JhC0dc>11djBP%Mlo@lzj1r{74ktGp^E!!=!O6ej&q68%pO}c3D0Uf>4#v<*8&mZCsmAgUX` z=pzNK_%_87P)GB->!xtC*m2{;o{#M<64>SOP_RN@PD%_Pxw1-@O)glbx@oed5WhyH zrO{X+NP%1d2%DNe-S+ zGb1KM`cmu-6Te$gb@YJWmknFqiK!=ischOcWT_7Pc^b+{Y$9Z!ZBVYn!nk}dzqf0M zr}%^`8yYpN39-T{*L^FDrIuX^-B?3Ep?*tA8I^b>-C0upDLipW8EPv??Q<)kI^R$o zb@&PJhbSr30;n(nMM|n;;EAW?^OML90p}-=V^0y0g`vg+xP)XpKXGo;(B4p%l;FH2 zwJ3fbsXkH}1;Zrb>v)q`HCQ;}vk0Tr16w)qlkj_Bg_lGOs*DGQQoL@zhqP#-Vx5ME zunJ)l`t#|5rpNb3=`}byky(7)$C>#Ug*215w#_Tb$I9OMr&Aw4rNU(%5W_hE-W!~~ z0lcTh@SYapy&-d(GEzAsDl zS*~ztq@A9=j2#C~IO?6tH7Nx8WYFX*hL9p-bS?8OY)Tl}5}R`~5~5DP&rnwo0gLaF z^D7jeUxjfdnT-Gs|1yP={ebw0qBOD|>zL~jtgO&dX)y6phYZn4;kp<#}46r>AiC`bfPJRK{kzr!>B%EB^}NGOo}oi#`Zn!JWtDRRPj z?2LQ1rJE3&rXVv;GUW86rH->~>_xBnry*nv6hju3iX^Zw2Z>+op<|RgyKd?@yTF-m znak(BI^z)>J}@OP*?b)xN^Gh_S&zfOaIZ|HM#n8y&mwkCZI(hfM?x5l7k`N0XNk0q z!2{CblLMdYjNai!Syt4~MC4}JEI79R45Y~+)RUkk#nuq$J$?A&d>gZ+_V*q4Y)0>6 zcZ`ElH$|$2((qEs|+5ReKL;Ji`N zj5M}auW#U(VAIbK`*sqikYK^oTJLev(R0oC@)ggFI`@t*BC+$q(ls;{V)yzF8yT;~ zBl`dUV2iULy19&YyY&7q$@u^d_yg(V++q_Zi)3xxFxmRk&9JTo z@`(@w8o7ejxoLLQ%dUDQA(`Cq8Wmn|0L+u;&ynjSEu()34BT17^DBo%^_1z6xPNc^_y(<(}o= zqoZOAPQVw4jb-s+zVNrf^8S|ko&se^3ZJy2oKU{0zIkkvr2&rl2BCBos__VOMA_aG zUyBT-vkwBLH*%dajFBD`OlL!Q7UxbIxi^$f+6U9$P~UkAYLI|;`K_dHSOUiQ+D1O6 zp3mFFcfBXUiC%H?Vj-T|$b_8Gp7HZRWMGuSTXEtUBMaqyBrTK+y1gw-?yili}NHYGd3PcZ!%t_4*S|ag(6-z~dxTRv5vt&B$R;n87I}dwKnoGX{B@7H{6C z^%H3Qq(idzI%eto2jI)IjSeyw*=VaO)>Ww0>$x>nAtTun0;u)s1QN0@iEJ zv{9h#nKMm_vifr%Ll$NJjftqXjPKugcYxT);k;XX!QLE>?EoTEzYOz8QLKI>Wj<8+#~UAo6~5d+gl|AnQew0@*pMfA++ zi@3DoWup6I{wckzP_pc>qGhp$#KLUDXbuHIXoXupTtn z+B5L*$vp$W_BQBweba^)f1ru?@zt)}!ItLT~ z4UU_papNwD|Bm-amjo$PAG|l$AE-BB&2lXN2kIzKjntiqj}b3Z{u%i$!htH8?$~V~ z#!1M&4<=Ry3|qPamVA~8g$-r=O*rKlahv=B@Ny@x^lK3!D$qLhaG;D-LA#D1cW+)(cvPBda1 zqxz#XvVQ@G7Rr+ZQt<)eBxAk@*9=C?sz>Msx1blg9{?sSjTAuBA*Pn^kG*dzcZy-WMDuY>@8=yIhyG?|ER`Nl9|?gD>yGBfax47Sc>93p5|iwNQ^lZv zd>p(|Fy{aFN~RugYk2hJ1wuAg{*CSPW7@}Kd*=^lK0c(vb!Wn@1hh>x(iq3s1EF)? zLmUMQYfQ|vFd9U&Nz%!}M!3+pVY8$XW8I>}rNdU5VM{_c7tJjZ2 zbqZ0XpOe7+akK*S#_Y(P2{VFtPBwIc?emB}CGxTc1$bWEBSW50bnr3r@KH(Z3ZShi zjHqh>y&HuG+JC{waYTuU<6>$(aOV3!%VQ&QBs50A)1y)*=Wu*v$^`n~m~cw(e8m}m z=XGa7ERUSE1e%V|8W8j3$eanp9?!}1Jp~;XL4kqCnRKR{38Qo)(K{Tp-{_P_Qb}@- z)d!xrq98mrye(NL=M;_c!M0$(^#{mBSa9GmCa{B=jQ1IP7Hvf8sbQ&&X-8$uuL1LG z!2HDk^Uw6(V*tAtV0h1xG{%2)#{SToc74DcO4OtJSAWb0Q3r7Sl|a-1R(@@~=|OQ8 zx5Lv>)W)0oEUA#lBO51D87$WjIX6; zi#1svP}qMVhp|U0f=C7e2=2u=D`8}YlqAbZYa4}>OR$Sul*R3(@`vK7rvnrb*#He< z!+%OLpKM!MEU@5WG${TOVV|Z+VjD`j559ZA_I!U-d;S9Lf##5KJgdURACvE2kwchE z{4Y2FUifh%MNnEf^nVJUo0ZcCJ3MdRB2)3NDe1o*yCYP-o(K4O zG4wNLXYTwGRMP7-l~>968FGFWjyuI$&KitOn{|0N5TX1KFAXr#f5w`QA1Ea>PMN6e zNyKGis3jXQZ_I|~jd9%~%!|jocHj$KKdK`|TS6@)OcatSbAHZvnMfJYf`(TtMi`bQ zv;@$oKIE61Y>|`4aX%zhc;N$eg-{5l*Reil{c;c<$T=r(KkA3W_&`nE7%SLxD(O8= z#woxs05{NjX&6Q3$?%zOgX3l!%8uM)E84sLJV2va7|SKcZciAGQj53}lTO|PXKX|k z5d)8rz-LC`GM|6JH2bMAu~Br!hMH#Y57SWH@wSoP5 z|Jj0#abtqyw=@2M3S+pUQ;O0P3`?hY3{%GR4QTd0P=UuLpxl_LZwI?$3B2qbORSjQ z9a3H}@|S%`At!%QBh=$XfBi_FNO3Vswux)p>^)r3y-TTNkCwcS%1Mq?{9h`HLkS!y zbNsUvoekEL=II>y*~+M&5ArtJoQfUq9?wzz$K-9R0}*ZA0s0v(!}{U@T^`hLa1L1L z{1676FB|qJVy&B77No6#wBr-?l3pOKey!evuOH^v7QQ zqL_a%r@wcx8`Z6{2_f#mtzkCkw~A0al#w8JMf^K*s149au(bgjC*rd9wI7!YdIizdi>%fs;ayd+4zXC^IRHW;ce0!JgAi6tGEWA8_ z{`7JUKd4kYdFK4NrP{I6XO8h*8*ziOeuo@J2;U$d8SAlPqdX~gfea&q7>ttZ{RoZs zBM0t;_xohtyS#-=6Xh0UmoVP4(=a^5#RWD_U~Nx^YG5otiu?o=kjj@?f1Fm#xX!Ld z8ts~tBWznqSAYKuAL|7;&;~p$<(97%z$HnR-@fE0od|{asbX(g|5q8}v6RLO!DU0B zf#8GEyo;81W*`rxMC2MM5B@03yTU0AF~V{%nA3kaQFz*hPUHg3oRVgn=K6tMD&vg?9-A znOzAw1afEU^pk%TkHGl)u^?$W>=LiL-6!_5XHK84EiK?@KTa&U5BZ_Xr(RpYFMrft zJGFRzd0|QH1(~>y`LW>w34bl*pHNLC9_4R<>>=OtE=0d|*s0uGMY zyWN&tY`o6snuAp`?MUY5>CR_f-)=T0DV)3t7 zyD;jQU81$;c@`xelQbvkz^6n1I~pfb#M4E?Mx~{2ro%*?e%@$Z6b20=w~@mAPllMnV7p From 7a013521fc492a24cd88f89c8124380b66e708f1 Mon Sep 17 00:00:00 2001 From: Martin Nwadiugwu Date: Mon, 2 Mar 2026 19:06:16 -0600 Subject: [PATCH 7/8] Update ui_search, remove obsolete ui_search2, add .gitignore --- ui/manage_videos.py | 213 ++++++++ ui/system_diagnostics.py | 75 +++ ui/ui_search.py | 1124 +++++--------------------------------- ui/upload_transcribe.py | 461 ++++++++++++++++ 4 files changed, 872 insertions(+), 1001 deletions(-) create mode 100644 ui/manage_videos.py create mode 100644 ui/system_diagnostics.py create mode 100644 ui/upload_transcribe.py diff --git a/ui/manage_videos.py b/ui/manage_videos.py new file mode 100644 index 0000000..b5cce69 --- /dev/null +++ b/ui/manage_videos.py @@ -0,0 +1,213 @@ +""" +manage_videos.py - Manage Videos page for the Video Annotation Platform +""" + +import streamlit as st +import pandas as pd +import io +import time + +import ui_search + +def show_manage_videos_page(): + """Display the Manage Videos page.""" + st.header("📚 Manage Stored Videos") + st.info("View, search, and manage all processed videos and their source URLs") + + if not ui_search.SEARCH_ENDPOINT or not ui_search.SEARCH_KEY: + st.error("Azure Search not configured. Cannot retrieve video list.") + else: + # Check URL fields status + url_status = ui_search.check_url_fields_status() + + if url_status['fields_exist']: + st.success("✅ URL tracking fields are configured") + else: + st.warning(f"⚠️ Missing URL fields: {', '.join(url_status['missing_fields'])}") + + # URL coverage analysis + if st.button("📊 Analyze URL Data Coverage"): + with st.spinner("Analyzing..."): + all_videos = ui_search.get_stored_videos(include_missing=True) + + with_urls = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']] + without_urls = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']] + + col1, col2, col3 = st.columns(3) + col1.metric("Total Videos", len(all_videos)) + col2.metric("✅ With URL Data", len(with_urls), f"{len(with_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%") + col3.metric("⚠️ Missing URL Data", len(without_urls), f"{len(without_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%") + + # By type breakdown + st.subheader("Breakdown by Source Type") + type_counts = {} + for v in all_videos: + t = v.get('source_type') or 'unknown' + type_counts[t] = type_counts.get(t, 0) + 1 + + cols = st.columns(len(type_counts) if type_counts else 1) + for i, (stype, count) in enumerate(sorted(type_counts.items())): + icon = "🎬" if stype == "youtube" else "📄" if stype == "direct" else "📁" if stype == "upload" else "❓" + cols[i % len(cols)].metric(f"{icon} {stype}", count) + + if without_urls: + with st.expander(f"Videos without URL data ({len(without_urls)})"): + st.info("These were likely processed before URL tracking was enabled") + for v in without_urls[:20]: + st.text(f"• {v.get('video_id')}") + + st.markdown("---") + + # Filters + st.subheader("Filter Videos") + col1, col2 = st.columns(2) + + with col1: + filter_video_id = st.text_input("Filter by Video ID (optional)") + with col2: + filter_options = ["All", "With URL Data Only", "Missing URL Data Only", "youtube", "direct", "upload", "unknown"] + filter_source_type = st.selectbox("Filter by Source Type", options=filter_options, index=0) + + # Load videos button + load_clicked = st.button("🔍 Load Videos", type="primary") + + # Handle deletion using session state + if st.session_state.video_to_delete: + vid_to_delete = st.session_state.video_to_delete + + with st.spinner(f"Deleting {vid_to_delete}..."): + success = ui_search.delete_video_by_id(vid_to_delete) + + if success: + # Remove from cache immediately + if st.session_state.stored_videos_cache: + st.session_state.stored_videos_cache = [ + v for v in st.session_state.stored_videos_cache + if v.get('video_id') != vid_to_delete + ] + st.success(f"✅ Deleted {vid_to_delete}") + st.session_state.delete_success = True + else: + st.error(f"❌ Failed to delete {vid_to_delete}") + + # Clear the trigger + st.session_state.video_to_delete = None + time.sleep(0.5) + st.rerun() + + # Load videos if button clicked + if load_clicked: + with st.spinner("Retrieving videos..."): + + # Handle special filters + if filter_source_type == "Missing URL Data Only": + all_videos = ui_search.get_stored_videos(include_missing=True) + videos = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']] + if filter_video_id.strip(): + videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()] + elif filter_source_type == "With URL Data Only": + all_videos = ui_search.get_stored_videos(include_missing=True) + videos = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']] + if filter_video_id.strip(): + videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()] + else: + source_type = None if filter_source_type == "All" else filter_source_type + videos = ui_search.get_stored_videos( + video_id=filter_video_id if filter_video_id.strip() else None, + source_type=source_type, + include_missing=True, + limit=1000 + ) + + st.session_state.stored_videos_cache = videos + st.session_state.videos_loaded = True + st.success(f"Found {len(videos)} videos") + + # Display videos + if st.session_state.stored_videos_cache: + videos = st.session_state.stored_videos_cache + + # Metrics + st.markdown("---") + cols = st.columns(4) + + type_counts = {} + for v in videos: + t = v.get('source_type') or 'unknown' + type_counts[t] = type_counts.get(t, 0) + 1 + + cols[0].metric("Total", len(videos)) + cols[1].metric("YouTube", type_counts.get('youtube', 0)) + cols[2].metric("Direct", type_counts.get('direct', 0)) + cols[3].metric("Upload", type_counts.get('upload', 0)) + + # Group by type + st.markdown("---") + st.subheader("Video List") + + videos_by_type = {} + for v in videos: + stype = v.get('source_type') or 'unknown' + if stype not in videos_by_type: + videos_by_type[stype] = [] + videos_by_type[stype].append(v) + + # Display by category + for source_type in ['youtube', 'direct', 'upload', 'unknown']: + if source_type not in videos_by_type: + continue + + type_videos = videos_by_type[source_type] + icon = "🎬" if source_type == "youtube" else "📄" if source_type == "direct" else "📁" if source_type == "upload" else "❓" + + with st.expander(f"{icon} {source_type.upper()} ({len(type_videos)} videos)", expanded=(source_type == 'youtube')): + for i, video in enumerate(type_videos, 1): + vid = video.get('video_id', 'unknown') + src_url = video.get('source_url', '') + processed = video.get('processed_at', 'unknown') + + has_url = bool(src_url) + status_icon = "✅" if has_url else "⚠️" + + with st.container(): + cols = st.columns([4, 1]) + + with cols[0]: + st.write(f"**{status_icon} {i}. {vid}**") + st.caption(f"Processed: {processed}") + + if src_url: + display_url = src_url[:80] + "..." if len(str(src_url)) > 80 else src_url + st.code(display_url) + if str(src_url).startswith('http'): + st.markdown(f"[Open Source ↗]({src_url})") + else: + st.warning("No source URL stored") + + with cols[1]: + # Capture current vid value for callback + st.button( + f"🗑️ Delete", + key=f"del_{vid}_{i}_{source_type}", + on_click=lambda v=vid: setattr(st.session_state, 'video_to_delete', v) + ) + + st.markdown("---") + + # Export + st.markdown("---") + if st.button("📥 Export to CSV"): + export_df = pd.DataFrame([ + { + 'video_id': v.get('video_id'), + 'source_type': v.get('source_type') or 'unknown', + 'source_url': v.get('source_url', ''), + 'has_url_data': bool(v.get('source_url')), + 'processed_at': v.get('processed_at', 'unknown') + } + for v in videos + ]) + + csv_buffer = io.StringIO() + export_df.to_csv(csv_buffer, index=False) + st.download_button("Download CSV", csv_buffer.getvalue(), "video_list.csv", "text/csv") \ No newline at end of file diff --git a/ui/system_diagnostics.py b/ui/system_diagnostics.py new file mode 100644 index 0000000..790b7ec --- /dev/null +++ b/ui/system_diagnostics.py @@ -0,0 +1,75 @@ +""" +system_diagnostics.py - System Diagnostics page for the Video Annotation Platform +""" + +import streamlit as st + +import ui_search + +def show_system_diagnostics_page(): + """Display the System Diagnostics page.""" + st.header("⚙️ System Diagnostics") + st.info("Check system configuration and troubleshoot issues") + + # Configuration status + st.subheader("Configuration Status") + + config_checks = { + "Azure Speech (SPEECH_KEY)": bool(ui_search.SPEECH_KEY), + "Azure OpenAI (AZURE_OPENAI_KEY)": bool(ui_search.AZURE_OPENAI_KEY), + "Azure Search (SEARCH_KEY)": bool(ui_search.SEARCH_KEY), + "Azure Storage (AZURE_STORAGE_KEY)": bool(ui_search.AZURE_STORAGE_KEY), + "Search Function (SEARCH_FN_URL)": bool(ui_search.SEARCH_FN_URL), + "yt-dlp installed": ui_search.check_yt_dlp() + } + + cols = st.columns(2) + for i, (name, status) in enumerate(config_checks.items()): + icon = "✅" if status else "❌" + cols[i % 2].write(f"{icon} {name}: {'OK' if status else 'Not configured'}") + + # Index schema check + st.markdown("---") + st.subheader("Index Schema Check") + + if st.button("🔍 Check Index Schema"): + with st.spinner("Fetching schema..."): + schema = ui_search.debug_check_index_schema() + + if isinstance(schema, dict): + st.success(f"Index: {schema['index_name']}") + st.write(f"Key Field: `{schema['key_field']}`") + + # URL fields status + if schema.get('has_all_url_fields'): + st.success("✅ All URL tracking fields present") + else: + st.warning(f"⚠️ Missing fields: {', '.join(schema.get('missing_url_fields', []))}") + + # Show all fields + with st.expander("View all fields"): + for field in schema['fields']: + key = "🔑" if field['key'] else "" + url = "🔗" if 'url' in field['name'].lower() else "" + facet = "📊" if field.get('facetable') else "" + st.caption(f"{key}{url}{facet} `{field['name']}` ({field['type']}) - facetable: {field.get('facetable', False)}") + + st.session_state.index_schema_cache = schema + else: + st.error(f"Schema check failed: {schema}") + + # Debug info + st.markdown("---") + st.subheader("Debug Information") + + with st.expander("Session State"): + st.json({ + k: str(v)[:100] + "..." if len(str(v)) > 100 else v + for k, v in st.session_state.items() + }) + + with st.expander("Recent Processing Debug"): + if st.session_state.get('debug_info'): + st.json(st.session_state['debug_info']) + else: + st.info("No debug info yet. Process a video first.") \ No newline at end of file diff --git a/ui/ui_search.py b/ui/ui_search.py index 15ef358..eec7833 100644 --- a/ui/ui_search.py +++ b/ui/ui_search.py @@ -1,12 +1,11 @@ """ -ui_search.py - Streamlit Web Interface for Video Segment Search & Upload - -Features: -- Direct Azure Speech API integration (bypasses Azure Function) -- URL tracking for all processed videos (source_url, source_type, processed_at) -- Handles existing videos without URL data gracefully -- Batch processing with CSV upload -- Video management interface with filtering and deletion +ui_search.py - Main Streamlit entry point +Contains: +- Environment configuration +- Shared utility functions +- Sidebar navigation +- Search Segments page (default) +- Imports and calls the other three page modules """ import os @@ -80,46 +79,16 @@ 'index_schema_cache': None, 'stored_videos_cache': None, 'url_fields_status': None, - 'debug_info': {} + 'debug_info': {}, + 'video_to_delete': None, + 'delete_success': False, + 'videos_loaded': False } for key, value in session_state_defaults.items(): if key not in st.session_state: st.session_state[key] = value -# ============================================================================= -# SIDEBAR NAVIGATION -# ============================================================================= - -with st.sidebar: - st.header("Navigation") - page = st.radio("Select Page", [ - "🔎 Search Segments", - "⬆️ Upload & Transcribe", - "📚 Manage Videos", - "⚙️ System Diagnostics" - ]) - - # Settings for search page - if page == "🔎 Search Segments": - st.header("Search Settings") - mode = st.selectbox("Search Mode", ["keyword", "hybrid", "vector"], - index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE) if DEFAULT_MODE in ("keyword", "hybrid", "vector") else 1) - top = st.slider("Results", 1, 50, DEFAULT_TOP) - k = st.slider("Vector k", 5, 200, DEFAULT_K) - - # Quick actions - st.markdown("---") - if st.button("🔄 Refresh Schema Cache"): - st.session_state.index_schema_cache = None - st.session_state.url_fields_status = None - st.success("Cache cleared! Navigate to System Diagnostics to refresh.") - - st.markdown("---") - st.caption("Video Annotation Platform v2.1") - st.caption("With URL Tracking") - - # ============================================================================= # UTILITY FUNCTIONS # ============================================================================= @@ -131,53 +100,44 @@ def ms_to_ts(ms: int) -> str: h, m = divmod(m, 60) return f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}" - def sanitize_id(id_string: str) -> str: """Sanitize ID for Azure Search (alphanumeric, hyphens, underscores only).""" if not id_string: id_string = "unknown" - sanitized = re.sub(r'[^\w\-]', '_', str(id_string)) - + sanitized = re.sub(r'_+', '_', sanitized) + sanitized = sanitized.strip('_') + if not sanitized: + sanitized = "unknown" if sanitized.startswith('_') or sanitized.startswith('-'): sanitized = 'id' + sanitized - if len(sanitized) > 1024: hash_suffix = hashlib.md5(sanitized.encode()).hexdigest()[:16] sanitized = sanitized[:1000] + "_" + hash_suffix - return sanitized - def detect_url_type(url: str) -> str: """Detect if URL is YouTube, direct media, or unknown.""" if not url: return "unknown" - url_lower = str(url).lower().strip() - youtube_patterns = [ r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)', r'youtube\.com\/watch\?v=', r'youtu\.be\/', r'youtube\.com\/shorts\/' ] - for pattern in youtube_patterns: if re.search(pattern, url_lower): return "youtube" - media_extensions = ['.mp4', '.m4a', '.mp3', '.wav', '.mov', '.avi', '.mkv', '.webm'] if any(url_lower.endswith(ext) for ext in media_extensions): return "direct" - cloud_patterns = ['box.com', 'drive.google.com', 'dropbox.com', 'onedrive'] if any(pattern in url_lower for pattern in cloud_patterns): return "direct" - return "unknown" - def check_yt_dlp() -> bool: """Check if yt-dlp is installed.""" try: @@ -186,7 +146,6 @@ def check_yt_dlp() -> bool: except: return False - def call_api(url: str, payload: dict) -> dict: """Make API call to search function.""" try: @@ -196,7 +155,6 @@ def call_api(url: str, payload: dict) -> dict: except requests.exceptions.RequestException as e: raise RuntimeError(f"API call failed: {str(e)}") - # ============================================================================= # AZURE SEARCH SCHEMA FUNCTIONS # ============================================================================= @@ -205,20 +163,16 @@ def debug_check_index_schema(): """Check index schema and verify URL tracking fields.""" if not SEARCH_ENDPOINT or not SEARCH_KEY or not SEARCH_INDEX_NAME: return "Search not configured - check SEARCH_ENDPOINT, SEARCH_KEY, and SEARCH_INDEX_NAME" - url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}?api-version=2024-07-01" headers = {"api-key": SEARCH_KEY} - try: r = requests.get(url, headers=headers, timeout=30) if r.status_code == 200: schema = r.json() key_field = None fields_info = [] - url_fields = ['source_url', 'source_type', 'processed_at'] found_url_fields = [] - for field in schema.get("fields", []): field_info = { "name": field.get("name"), @@ -230,13 +184,10 @@ def debug_check_index_schema(): "facetable": field.get("facetable", False) } fields_info.append(field_info) - if field.get("key", False): key_field = field.get("name") - if field.get("name") in url_fields: found_url_fields.append(field.get("name")) - return { "index_name": schema.get("name"), "key_field": key_field, @@ -250,12 +201,10 @@ def debug_check_index_schema(): except Exception as e: return f"Error checking index: {str(e)}" - def get_index_schema(): """Get cached schema or fetch new one.""" if st.session_state.index_schema_cache: return st.session_state.index_schema_cache - schema_info = debug_check_index_schema() if isinstance(schema_info, dict): st.session_state.index_schema_cache = schema_info @@ -263,12 +212,10 @@ def get_index_schema(): else: raise RuntimeError(f"Cannot fetch index schema: {schema_info}") - def check_url_fields_status(): """Check URL fields status with caching.""" if st.session_state.url_fields_status: return st.session_state.url_fields_status - try: schema = get_index_schema() if isinstance(schema, dict): @@ -282,7 +229,6 @@ def check_url_fields_status(): return result except: pass - return { 'fields_exist': False, 'found_fields': [], @@ -290,7 +236,6 @@ def check_url_fields_status(): 'key_field': None } - # ============================================================================= # AZURE SPEECH API FUNCTIONS # ============================================================================= @@ -299,14 +244,11 @@ def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any] """Submit transcription directly to Azure Speech API.""" if not SPEECH_KEY: raise RuntimeError("SPEECH_KEY not configured") - endpoint = f"https://{SPEECH_REGION}.api.cognitive.microsoft.com/speechtotext/transcriptions:submit?api-version={SPEECH_API_VERSION}" - headers = { "Ocp-Apim-Subscription-Key": SPEECH_KEY, "Content-Type": "application/json" } - payload = { "contentUrls": [media_url], "locale": "en-US", @@ -319,67 +261,51 @@ def submit_transcription_direct(video_id: str, media_url: str) -> Dict[str, Any] "timeToLiveHours": 24 } } - try: r = requests.post(endpoint, headers=headers, json=payload, timeout=60) r.raise_for_status() - operation_url = r.headers.get("Location") if not operation_url: result = r.json() operation_url = result.get("self") or result.get("links", {}).get("self") - if not operation_url: raise RuntimeError("No operation URL returned from Speech API") - return {"operation_url": operation_url, "video_id": video_id} - except requests.exceptions.HTTPError as e: error_msg = f"Speech API error {r.status_code}: {r.text}" if r.status_code == 401: error_msg = "Azure Speech API authentication failed. Check SPEECH_KEY." raise RuntimeError(error_msg) - def poll_transcription_operation(operation_url: str) -> Dict[str, Any]: """Poll transcription operation status.""" if not SPEECH_KEY: raise RuntimeError("SPEECH_KEY not configured") - headers = {"Ocp-Apim-Subscription-Key": SPEECH_KEY} - try: poll_url = operation_url.replace("/transcriptions:submit/", "/transcriptions/") st.session_state['debug_poll_url'] = poll_url - r = requests.get(poll_url, headers=headers, timeout=30) r.raise_for_status() return r.json() - except requests.exceptions.RequestException as e: raise RuntimeError(f"Failed to poll transcription: {str(e)}") - def get_transcription_from_result(result_data: Dict) -> Dict[str, Any]: """Get transcription JSON from result files.""" if not SPEECH_KEY: raise RuntimeError("SPEECH_KEY not configured") - headers = {"Ocp-Apim-Subscription-Key": SPEECH_KEY} - try: links = result_data.get("links", {}) files_url = links.get("files") - if not files_url: if "combinedRecognizedPhrases" in result_data: return result_data raise RuntimeError("No files URL in result") - r = requests.get(files_url, headers=headers, timeout=30) r.raise_for_status() files_data = r.json() - for file in files_data.get("values", []): if file.get("kind") == "Transcription": content_url = file.get("links", {}).get("contentUrl") @@ -387,13 +313,10 @@ def get_transcription_from_result(result_data: Dict) -> Dict[str, Any]: content_r = requests.get(content_url, timeout=60) content_r.raise_for_status() return content_r.json() - raise RuntimeError("No transcription file found in results") - except requests.exceptions.RequestException as e: raise RuntimeError(f"Failed to get transcription result: {str(e)}") - # ============================================================================= # EMBEDDING AND INDEXING WITH URL TRACKING # ============================================================================= @@ -402,19 +325,15 @@ def get_embeddings(texts: list) -> list: """Get embeddings from Azure OpenAI.""" if not AZURE_OPENAI_ENDPOINT or not AZURE_OPENAI_KEY: raise RuntimeError("Azure OpenAI not configured") - url = f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{AZURE_OPENAI_DEPLOYMENT}/embeddings?api-version=2024-02-01" - headers = { "api-key": AZURE_OPENAI_KEY, "Content-Type": "application/json" } - payload = { "input": texts, "model": "text-embedding-3-small" } - try: r = requests.post(url, headers=headers, json=payload, timeout=60) r.raise_for_status() @@ -423,47 +342,34 @@ def get_embeddings(texts: list) -> list: except Exception as e: raise RuntimeError(f"Embedding failed: {str(e)}") - def index_segments_direct(video_id: str, segments: list, source_url: str = None, source_type: str = None) -> Dict[str, Any]: """ Index segments to Azure Cognitive Search with URL tracking. """ if not SEARCH_ENDPOINT or not SEARCH_KEY: raise RuntimeError("Azure Search not configured") - schema_info = get_index_schema() key_field = schema_info.get("key_field") available_fields = {f.get("name") for f in schema_info.get("fields", [])} - if not key_field: raise RuntimeError("No key field found in index") - - # Check URL field availability url_fields_available = { 'source_url': 'source_url' in available_fields, 'source_type': 'source_type' in available_fields, 'processed_at': 'processed_at' in available_fields } - - # Generate embeddings texts = [seg.get("text", "") for seg in segments] try: embeddings = get_embeddings(texts) except Exception as e: st.warning(f"Embedding failed, indexing without vectors: {e}") embeddings = [None] * len(segments) - - # Prepare documents documents = [] processed_timestamp = datetime.utcnow().isoformat() + "Z" - for i, (seg, embedding) in enumerate(zip(segments, embeddings)): safe_video_id = sanitize_id(video_id) doc_id = f"{safe_video_id}_{i}" - doc = {"@search.action": "upload", key_field: doc_id} - - # Core fields field_mappings = { "video_id": safe_video_id, "segment_id": str(seg.get("segment_id", i)), @@ -472,38 +378,27 @@ def index_segments_direct(video_id: str, segments: list, source_url: str = None, "end_ms": int(seg.get("end_ms", 0)), "pred_labels": seg.get("pred_labels", []) if seg.get("pred_labels") else [] } - - # URL tracking fields if url_fields_available['source_url']: field_mappings["source_url"] = str(source_url) if source_url else "" if url_fields_available['source_type']: field_mappings["source_type"] = str(source_type) if source_type else "unknown" if url_fields_available['processed_at']: field_mappings["processed_at"] = processed_timestamp - - # Only add existing fields for field_name, value in field_mappings.items(): if field_name in available_fields: doc[field_name] = value - - # Handle embedding embedding_field = next((f for f in ["embedding", "embeddings", "vector", "vectors"] if f in available_fields), None) if embedding and embedding_field: try: doc[embedding_field] = [float(x) for x in embedding] except (ValueError, TypeError): pass - documents.append(doc) - - # Upload to search index url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/index?api-version=2024-07-01" headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"} payload = {"value": documents} - try: r = requests.post(url, headers=headers, json=payload, timeout=60) - if r.status_code >= 400: error_detail = r.text try: @@ -511,15 +406,11 @@ def index_segments_direct(video_id: str, segments: list, source_url: str = None, except: pass raise RuntimeError(f"Indexing failed: HTTP {r.status_code}\n{error_detail}") - result = r.json() - - # Check for partial failures if r.status_code == 207: failed_docs = [item for item in result.get("value", []) if not item.get("status", False)] if failed_docs: st.warning(f"Partial indexing failure: {len(failed_docs)} documents failed") - return { "indexed": len(documents), "video_id": video_id, @@ -528,22 +419,17 @@ def index_segments_direct(video_id: str, segments: list, source_url: str = None, "source_type_stored": bool(source_type and url_fields_available['source_type']), "url_fields_available": url_fields_available } - except Exception as e: raise RuntimeError(f"Indexing failed: {str(e)}") - def process_transcription_to_segments(transcription_data: Dict, video_id: str) -> list: """Convert Azure Speech transcription to segments.""" segments = [] - for i, phrase in enumerate(transcription_data.get("recognizedPhrases", [])): offset = phrase.get("offsetInTicks", 0) // 10000 duration = phrase.get("durationInTicks", 0) // 10000 - nbest = phrase.get("nBest", []) text = nbest[0].get("display", "") if nbest else "" - segments.append({ "segment_id": i, "video_id": video_id, @@ -552,108 +438,107 @@ def process_transcription_to_segments(transcription_data: Dict, video_id: str) - "end_ms": offset + duration, "pred_labels": [] }) - return segments - # ============================================================================= -# VIDEO RETRIEVAL AND MANAGEMENT +# VIDEO RETRIEVAL AND MANAGEMENT (get_stored_videos, delete_video_by_id) # ============================================================================= def get_stored_videos(video_id: str = None, source_type: str = None, include_missing: bool = True, limit: int = 1000) -> List[Dict]: """ Retrieve videos from search index with URL data. + FALLBACK METHOD: Does not use faceting (requires facetable field). + Instead uses pagination to get all documents and deduplicates. """ if not SEARCH_ENDPOINT or not SEARCH_KEY: return [] - url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/search?api-version=2024-07-01" headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"} - - # Build filter + try: + schema = get_index_schema() + available_fields = {f['name'] for f in schema.get('fields', [])} + except: + available_fields = set() filters = [] if video_id: - filters.append(f"video_id eq '{video_id}'") - if source_type: - filters.append(f"source_type eq '{source_type}'") - + escaped_id = video_id.replace("'", "''") + filters.append(f"video_id eq '{escaped_id}'") + if source_type and source_type != "All": + escaped_type = source_type.replace("'", "''") + filters.append(f"source_type eq '{escaped_type}'") filter_query = " and ".join(filters) if filters else None - - # Get available fields - schema = get_index_schema() - available_fields = {f['name'] for f in schema.get('fields', [])} - - # Build select select_fields = ["video_id"] - for field in ["source_url", "source_type", "processed_at"]: - if field in available_fields: + optional_fields = ["source_url", "source_type", "processed_at"] + for field in optional_fields: + if not available_fields or field in available_fields: select_fields.append(field) - - payload = { - "search": "*", - "select": ",".join(select_fields), - "top": limit - } - - if "processed_at" in available_fields: - payload["orderby"] = "processed_at desc" - if filter_query: - payload["filter"] = filter_query - + all_videos = {} + skip = 0 + batch_size = 1000 + max_iterations = 100 try: - r = requests.post(url, headers=headers, json=payload, timeout=30) - r.raise_for_status() - docs = r.json().get("value", []) - - # Deduplicate and normalize - seen = set() - unique_docs = [] - for doc in docs: - vid = doc.get('video_id') - if vid and vid not in seen: - seen.add(vid) - # Normalize missing values - doc['source_type'] = doc.get('source_type') or 'unknown' - doc['source_url'] = doc.get('source_url') or '' - doc['processed_at'] = doc.get('processed_at') or 'unknown' - unique_docs.append(doc) - - return unique_docs - + for iteration in range(max_iterations): + payload = { + "search": "*", + "select": ",".join(select_fields), + "top": batch_size, + "skip": skip, + "count": True + } + if filter_query: + payload["filter"] = filter_query + if "processed_at" in available_fields: + payload["orderby"] = "processed_at desc" + r = requests.post(url, headers=headers, json=payload, timeout=30) + r.raise_for_status() + data = r.json() + docs = data.get("value", []) + total_count = data.get("@odata.count", 0) + if not docs: + break + for doc in docs: + vid = doc.get('video_id') + if vid and vid not in all_videos: + all_videos[vid] = { + 'video_id': vid, + 'source_type': doc.get('source_type') or 'unknown', + 'source_url': doc.get('source_url', ''), + 'processed_at': doc.get('processed_at', 'unknown') + } + skip += len(docs) + if skip >= total_count or len(docs) < batch_size: + break + videos = list(all_videos.values())[:limit] + return videos except Exception as e: st.error(f"Failed to retrieve videos: {e}") + import traceback + st.error(traceback.format_exc()) return [] - def delete_video_by_id(video_id: str) -> bool: """Delete all segments for a video_id from the index.""" if not SEARCH_ENDPOINT or not SEARCH_KEY: return False - - # Find all documents search_url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/search?api-version=2024-07-01" headers = {"api-key": SEARCH_KEY, "Content-Type": "application/json"} - + escaped_id = video_id.replace("'", "''") payload = { "search": "*", - "filter": f"video_id eq '{video_id}'", + "filter": f"video_id eq '{escaped_id}'", "select": "video_id", "top": 1000 } - try: r = requests.post(search_url, headers=headers, json=payload, timeout=30) r.raise_for_status() docs = r.json().get("value", []) - if not docs: + st.warning(f"No documents found for video_id: {video_id}") return False - - # Delete documents schema = get_index_schema() key_field = schema.get('key_field', 'id') - delete_docs = [] for doc in docs: doc_key = doc.get(key_field) or doc.get('id') @@ -662,21 +547,19 @@ def delete_video_by_id(video_id: str) -> bool: "@search.action": "delete", key_field: doc_key }) - if not delete_docs: + st.warning("No valid documents to delete") return False - delete_url = f"{SEARCH_ENDPOINT}/indexes/{SEARCH_INDEX_NAME}/docs/index?api-version=2024-07-01" r = requests.post(delete_url, headers=headers, json={"value": delete_docs}, timeout=60) r.raise_for_status() - return True - except Exception as e: st.error(f"Delete failed: {e}") + import traceback + st.error(traceback.format_exc()) return False - # ============================================================================= # AZURE STORAGE FUNCTIONS # ============================================================================= @@ -689,7 +572,6 @@ def generate_video_id(filename: str) -> str: hash_suffix = hashlib.md5(clean_name.encode()).hexdigest()[:8] return f"vid_{clean_name[:50]}_{hash_suffix}" - def test_sas_url(sas_url: str) -> Tuple[bool, str]: """Test if SAS URL is accessible.""" try: @@ -698,26 +580,20 @@ def test_sas_url(sas_url: str) -> Tuple[bool, str]: except Exception as e: return (False, str(e)) - def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> Optional[str]: """Generate SAS token for blob access.""" if not AZURE_STORAGE_KEY: return None - try: expiry = datetime.now(timezone.utc) + timedelta(hours=expiry_hours) expiry_str = expiry.strftime('%Y-%m-%dT%H:%M:%SZ') - account_key = base64.b64decode(AZURE_STORAGE_KEY) canonicalized_resource = f"/blob/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}" - string_to_sign = ( f"r\n\n{expiry_str}\n{canonicalized_resource}\n\n\nhttps\n2020-12-06\nb\n\n\n\n\n\n\n" ) - signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest() signature = base64.b64encode(signed_hmac).decode('utf-8') - sas_params = { 'sv': '2020-12-06', 'sr': 'b', @@ -726,35 +602,27 @@ def generate_sas_token_fixed(blob_name: str, expiry_hours: int = 24) -> Optional 'spr': 'https', 'sig': signature } - return '&'.join([f"{k}={urllib.parse.quote(v, safe='')}" for k, v in sas_params.items()]) - except Exception as e: st.error(f"SAS generation error: {e}") return None - def upload_to_azure_blob_fixed(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]: """Upload to Azure Blob using REST API.""" if not AZURE_STORAGE_KEY: return None, "Azure Storage key not configured" - try: url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}" - date_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT') content_length = len(file_bytes) - string_to_sign = ( f"PUT\n\n\n{content_length}\n\napplication/octet-stream\n\n\n\n\n\n\n" f"x-ms-blob-type:BlockBlob\nx-ms-date:{date_str}\nx-ms-version:2020-12-06\n" f"/{AZURE_STORAGE_ACCOUNT}/{INPUT_CONTAINER}/{blob_name}" ) - account_key = base64.b64decode(AZURE_STORAGE_KEY) signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest() signature = base64.b64encode(signed_hmac).decode('utf-8') - headers = { "x-ms-date": date_str, "x-ms-version": "2020-12-06", @@ -763,52 +631,39 @@ def upload_to_azure_blob_fixed(file_bytes: bytes, blob_name: str) -> Tuple[Optio "Content-Length": str(content_length), "Authorization": f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}" } - r = requests.put(url, data=file_bytes, headers=headers, timeout=300) - if r.status_code not in [201, 200]: return None, f"Upload failed: HTTP {r.status_code}" - sas_token = generate_sas_token_fixed(blob_name) if not sas_token: return None, "Failed to generate SAS token" - sas_url = f"{url}?{sas_token}" - is_valid, test_msg = test_sas_url(sas_url) if not is_valid: return None, f"SAS URL validation failed: {test_msg}" - return sas_url, None - except Exception as e: import traceback return None, f"Upload error: {str(e)}" - def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optional[str], Optional[str]]: """Upload using Azure SDK (preferred method).""" try: from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions - connection_string = ( f"DefaultEndpointsProtocol=https;" f"AccountName={AZURE_STORAGE_ACCOUNT};" f"AccountKey={AZURE_STORAGE_KEY};" f"EndpointSuffix=core.windows.net" ) - blob_service = BlobServiceClient.from_connection_string(connection_string) container_client = blob_service.get_container_client(INPUT_CONTAINER) - try: container_client.create_container() except Exception: pass - blob_client = container_client.get_blob_client(blob_name) blob_client.upload_blob(file_bytes, overwrite=True) - sas_token = generate_blob_sas( account_name=AZURE_STORAGE_ACCOUNT, container_name=INPUT_CONTAINER, @@ -818,44 +673,34 @@ def upload_to_azure_blob_sdk(file_bytes: bytes, blob_name: str) -> Tuple[Optiona expiry=datetime.now(timezone.utc) + timedelta(hours=24), protocol="https" ) - sas_url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{INPUT_CONTAINER}/{blob_name}?{sas_token}" - is_valid, test_msg = test_sas_url(sas_url) if not is_valid: return None, f"SAS URL validation failed: {test_msg}" - return sas_url, None - except ImportError: return None, "azure-storage-blob not installed" except Exception as e: import traceback return None, f"SDK upload failed: {str(e)}" - def save_segments_to_blob(video_id: str, segments: list) -> str: """Save segments JSON to blob storage.""" if not AZURE_STORAGE_KEY: raise RuntimeError("Azure Storage key not configured") - blob_name = f"{video_id}_segments.json" url = f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/{SEGMENTS_CONTAINER}/{blob_name}" - json_bytes = json.dumps(segments, indent=2).encode('utf-8') content_length = len(json_bytes) - date_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT') string_to_sign = ( f"PUT\n\n\n{content_length}\n\napplication/json\n\n\n\n\n\n\n" f"x-ms-blob-type:BlockBlob\nx-ms-date:{date_str}\nx-ms-version:2020-12-06\n" f"/{AZURE_STORAGE_ACCOUNT}/{SEGMENTS_CONTAINER}/{blob_name}" ) - account_key = base64.b64decode(AZURE_STORAGE_KEY) signed_hmac = hmac.new(account_key, string_to_sign.encode('utf-8'), hashlib.sha256).digest() signature = base64.b64encode(signed_hmac).decode('utf-8') - headers = { "x-ms-date": date_str, "x-ms-version": "2020-12-06", @@ -864,22 +709,17 @@ def save_segments_to_blob(video_id: str, segments: list) -> str: "Content-Length": str(content_length), "Authorization": f"SharedKey {AZURE_STORAGE_ACCOUNT}:{signature}" } - r = requests.put(url, data=json_bytes, headers=headers, timeout=60) r.raise_for_status() - return blob_name - def download_youtube_audio(youtube_url: str, output_path: str, progress_callback=None) -> Tuple[Optional[str], Optional[str]]: """Download audio from YouTube.""" if not check_yt_dlp(): return None, "yt-dlp not installed. Run: pip install yt-dlp" - if not youtube_url or not youtube_url.strip(): return None, "YouTube URL is empty" - try: cmd = [ "yt-dlp", @@ -892,46 +732,35 @@ def download_youtube_audio(youtube_url: str, output_path: str, "-o", output_path, youtube_url.strip() ] - - # Handle missing Node.js try: node_check = subprocess.run(["which", "node"], capture_output=True, text=True) if node_check.returncode != 0: cmd.extend(["--extractor-args", "youtube:player_client=web"]) except: pass - if progress_callback: progress_callback(15, "Downloading from YouTube...") - result = subprocess.run(cmd, capture_output=True, text=True, timeout=600) - if result.returncode != 0: error_msg = result.stderr[:500] if "JavaScript runtime" in error_msg: error_msg += "\n\n💡 Tip: Install Node.js or run: pip install yt-dlp --upgrade" return None, f"yt-dlp failed: {error_msg}" - - # Find downloaded file if os.path.exists(output_path): return output_path, None - base = output_path.rsplit('.', 1)[0] for ext in ['.m4a', '.mp3', '.webm', '.opus']: alt_path = base + ext if os.path.exists(alt_path): return alt_path, None - return None, "Download completed but file not found" - except subprocess.TimeoutExpired: return None, "Download timed out after 10 minutes" except Exception as e: return None, f"Error: {str(e)}" - # ============================================================================= -# MAIN VIDEO PROCESSING +# MAIN VIDEO PROCESSING (process_single_video) # ============================================================================= def process_single_video(url: str, custom_id: Optional[str] = None, @@ -952,104 +781,73 @@ def process_single_video(url: str, custom_id: Optional[str] = None, "source_type": source_type, "url_stored": False } - try: - # Validate URL url_type = detect_url_type(url) if url_type == "unknown": result["status"] = "failed" result["error"] = "Unknown URL type. Must be YouTube or direct media URL." return result - - # Generate video ID video_id = custom_id.strip() if custom_id else generate_video_id(f"batch_{url}") result["video_id"] = video_id - current, total = overall_progress base_progress = int((current / total) * 100) if progress_bar else 0 - if status_text: status_text.text(f"[{current}/{total}] Processing: {video_id}") - media_url = None - - # Handle YouTube if url_type == "youtube": if not check_yt_dlp(): result["status"] = "failed" result["error"] = "yt-dlp not installed" return result - import tempfile with tempfile.TemporaryDirectory() as tmpdir: if status_text: status_text.text(f"[{current}/{total}] Downloading from YouTube...") - output_path = f"{tmpdir}/youtube_{video_id}.m4a" downloaded_path, error = download_youtube_audio(url.strip(), output_path) - if error: result["status"] = "failed" result["error"] = f"Download failed: {error}" return result - with open(downloaded_path, 'rb') as f: file_bytes = f.read() - blob_name = f"batch_youtube_{video_id}_{int(time.time())}.m4a" - if status_text: status_text.text(f"[{current}/{total}] Uploading to Azure...") - - # Try SDK first, fallback to REST sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name) if error and ("not installed" in error or "SDK" in error): sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name) - if error: result["status"] = "failed" result["error"] = f"Upload failed: {error}" return result - media_url = sas_url - - # Handle Direct URL elif url_type == "direct": media_url = url.strip() if status_text: status_text.text(f"[{current}/{total}] Using direct URL...") - if not media_url: result["status"] = "failed" result["error"] = "No media URL available" return result - - # Submit to Speech API if status_text: status_text.text(f"[{current}/{total}] Submitting to Speech API...") - submit_result = submit_transcription_direct(video_id, media_url) operation_url = submit_result.get("operation_url") - if not operation_url: result["status"] = "failed" result["error"] = "No operation URL returned" return result - - # Poll for completion max_polls = 120 transcription_data = None - for i in range(max_polls): time.sleep(POLL_SECONDS) poll_result = poll_transcription_operation(operation_url) status = poll_result.get("status", "unknown") - if progress_bar: poll_progress = min(int((i / max_polls) * 20), 20) overall = base_progress + int((1 / total) * 80) + int((1 / total) * poll_progress) progress_bar.progress(min(overall, 99)) - if status.lower() == "succeeded": transcription_data = get_transcription_from_result(poll_result) break @@ -1058,23 +856,15 @@ def process_single_video(url: str, custom_id: Optional[str] = None, result["status"] = "failed" result["error"] = f"Transcription failed: {error_msg}" return result - if not transcription_data: result["status"] = "failed" result["error"] = "Transcription timed out" return result - - # Process and index if status_text: status_text.text(f"[{current}/{total}] Processing segments...") - segments = process_transcription_to_segments(transcription_data, video_id) result["segments_count"] = len(segments) - - # Save to blob save_segments_to_blob(video_id, segments) - - # Index with URL tracking try: index_result = index_segments_direct( video_id, @@ -1082,36 +872,71 @@ def process_single_video(url: str, custom_id: Optional[str] = None, source_url=url, source_type=source_type ) - result["url_stored"] = index_result.get('source_url_stored', False) result["index_status"] = f"Indexed {index_result.get('indexed', 0)} documents" - - # Debug info st.session_state['debug_info'][video_id] = { 'url_fields_available': index_result.get('url_fields_available', {}), 'source_url_stored': index_result.get('source_url_stored', False), 'source_type_stored': index_result.get('source_type_stored', False) } - except Exception as e: result["index_status"] = f"Indexing failed: {str(e)}" - result["status"] = "success" - except Exception as e: result["status"] = "failed" result["error"] = str(e) import traceback result["error"] += f"\n{traceback.format_exc()}" - return result +# ============================================================================= +# IMPORT PAGE MODULES (after utilities are defined) +# ============================================================================= + +import upload_transcribe +import manage_videos +import system_diagnostics # ============================================================================= -# PAGE 1: SEARCH SEGMENTS +# SIDEBAR NAVIGATION +# ============================================================================= + +with st.sidebar: + st.header("Navigation") + page = st.radio("Select Page", [ + "🔎 Search Segments", + "⬆️ Upload & Transcribe", + "📚 Manage Videos", + "⚙️ System Diagnostics" + ]) + + # Settings for search page + if page == "🔎 Search Segments": + st.header("Search Settings") + mode = st.selectbox("Search Mode", ["keyword", "hybrid", "vector"], + index=["keyword", "hybrid", "vector"].index(DEFAULT_MODE) if DEFAULT_MODE in ("keyword", "hybrid", "vector") else 1) + top = st.slider("Results", 1, 50, DEFAULT_TOP) + k = st.slider("Vector k", 5, 200, DEFAULT_K) + + # Quick actions + st.markdown("---") + if st.button("🔄 Refresh Schema Cache"): + st.session_state.index_schema_cache = None + st.session_state.url_fields_status = None + st.success("Cache cleared! Navigate to System Diagnostics to refresh.") + + st.markdown("---") + st.caption("Video Annotation Platform v2.1") + st.caption("With URL Tracking") + +# ============================================================================= +# PAGE ROUTING # ============================================================================= if page == "🔎 Search Segments": + # ------------------------------------------------------------------------- + # SEARCH SEGMENTS PAGE (embedded) + # ------------------------------------------------------------------------- st.header("Search Indexed Video Segments") if not SEARCH_FN_URL: @@ -1142,7 +967,6 @@ def process_single_video(url: str, custom_id: Optional[str] = None, start_ms, end_ms = h.get("start_ms", 0), h.get("end_ms", 0) vid, seg, score = h.get("video_id", ""), h.get("segment_id", ""), h.get("score") - # Show URL info if available source_url = h.get('source_url', '') source_type = h.get('source_type', '') url_indicator = "" @@ -1169,713 +993,11 @@ def process_single_video(url: str, custom_id: Optional[str] = None, except Exception as e: st.error(f"Search failed: {e}") - -# ============================================================================= -# PAGE 2: UPLOAD & TRANSCRIBE -# ============================================================================= - elif page == "⬆️ Upload & Transcribe": - st.header("Upload Video for Transcription") - - # Check URL fields status - url_status = check_url_fields_status() - - if url_status['fields_exist']: - st.success("✅ URL Tracking Enabled - Original source URLs will be stored") - else: - st.warning(f""" - ⚠️ **Partial URL Tracking** - Missing fields: {', '.join(url_status['missing_fields'])} - - Videos will still be processed, but URL information will be limited. - Add missing fields to your Azure Search index for full functionality. - """) - - # Check Azure configuration - azure_configured = bool(AZURE_STORAGE_KEY) and bool(SPEECH_KEY) - if not azure_configured: - st.error("⚠️ Azure Storage and Speech keys required. Check .env file.") - - # Source selection - source_type = st.radio("Select Source", - ["File Upload", "Direct URL", "YouTube", "📁 Batch CSV Upload"], - horizontal=True) - - media_url = None - video_id = None - file_bytes = None - yt_url = None - csv_df = None - detected_source_type = "unknown" - - # --- File Upload --- - if source_type == "File Upload": - if not azure_configured: - st.info("Please configure Azure Storage to enable file upload") - else: - uploaded_file = st.file_uploader( - "Choose video/audio file", - type=["mp4", "avi", "mov", "mkv", "m4a", "mp3", "wav"], - accept_multiple_files=False - ) - - if uploaded_file: - st.success(f"📁 {uploaded_file.name} ({uploaded_file.size / 1024 / 1024:.1f} MB)") - file_bytes = uploaded_file.getvalue() - video_id = generate_video_id(uploaded_file.name) - detected_source_type = "upload" - st.info("File ready for upload") - - # --- Direct URL --- - elif source_type == "Direct URL": - url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/ ...") - - if url_input.strip(): - media_url = url_input.strip() - video_id = generate_video_id(url_input) - detected_source_type = "direct" - st.success("✅ URL validated") - - # --- YouTube --- - elif source_type == "YouTube": - yt_url = st.text_input( - "YouTube URL", - placeholder="https://youtube.com/watch?v= ...", - value=st.session_state.yt_url_value, - key="yt_url_input" - ) - - # Update session state - if yt_url != st.session_state.yt_url_value: - st.session_state.yt_url_value = yt_url - try: - st.rerun() - except: - pass - - # Check yt-dlp - if not check_yt_dlp(): - st.warning("yt-dlp not installed") - if st.button("Install yt-dlp"): - with st.spinner("Installing..."): - subprocess.run(["pip", "install", "-q", "yt-dlp"]) - try: - st.rerun() - except: - st.info("Please refresh the page") - elif yt_url and yt_url.strip(): - video_id = generate_video_id(f"yt_{yt_url.strip()}") - detected_source_type = "youtube" - st.success("YouTube URL ready") - - # --- Batch CSV Upload --- - elif source_type == "📁 Batch CSV Upload": - st.subheader("📁 Batch Process Videos from CSV") - - csv_file = st.file_uploader( - "Upload CSV file", - type=["csv"], - help="CSV must contain a column with video URLs" - ) - - if csv_file: - try: - # Read CSV with flexible parsing - try: - csv_df = pd.read_csv(csv_file) - except Exception: - csv_file.seek(0) - csv_df = pd.read_csv(csv_file, header=None) - csv_df.columns = [f"column_{i}" for i in range(len(csv_df.columns))] - - # Handle case where column names are URLs - url_like_columns = [] - for col in csv_df.columns: - col_str = str(col).strip() - if detect_url_type(col_str) != "unknown": - url_like_columns.append(col) - - if url_like_columns and len(csv_df.columns) == 1: - url_col_name = csv_df.columns[0] - new_row = {url_col_name: url_col_name} - csv_df = pd.concat([pd.DataFrame([new_row]), csv_df], ignore_index=True) - - st.success(f"✅ Loaded CSV with {len(csv_df)} rows") - - # Column selection - url_column = st.selectbox("Select column containing video URLs", options=csv_df.columns.tolist()) - - id_column_options = ["Auto-generate"] + [c for c in csv_df.columns if c != url_column] - id_column = st.selectbox("Select column for custom Video ID (optional)", options=id_column_options, index=0) - - # Extract and validate URLs - urls_raw = csv_df[url_column].dropna().astype(str).tolist() - urls_to_process = [u.strip() for u in urls_raw if u.strip()] - - # Preview - with st.expander(f"Preview URLs ({len(urls_to_process)} found)"): - for i, url in enumerate(urls_to_process[:10], 1): - url_type = detect_url_type(url) - icon = "🎬" if url_type == "youtube" else "📄" if url_type == "direct" else "❓" - st.text(f"{i}. {icon} {url[:80]}...") - - # Validate - valid_urls = [] - invalid_urls = [] - for url in urls_to_process: - url_type = detect_url_type(str(url)) - if url_type in ["youtube", "direct"]: - valid_urls.append(url) - else: - invalid_urls.append(url) - - col1, col2, col3 = st.columns(3) - col1.metric("Total", len(urls_to_process)) - col2.metric("✅ Valid", len(valid_urls)) - col3.metric("❌ Invalid", len(invalid_urls)) - - # Store in session state - st.session_state['batch_urls'] = valid_urls - st.session_state['batch_df'] = csv_df - st.session_state['batch_url_column'] = url_column - st.session_state['batch_id_column'] = id_column - - except Exception as e: - st.error(f"Error reading CSV: {e}") - import traceback - st.error(traceback.format_exc()) - - # Custom ID input - custom_id = st.text_input("Custom Video ID (optional)") - if custom_id.strip() and source_type != "📁 Batch CSV Upload": - video_id = custom_id.strip() - - # Determine if we can process - can_process = False - if source_type == "File Upload": - can_process = file_bytes is not None and azure_configured - elif source_type == "Direct URL": - can_process = media_url is not None and len(str(media_url).strip()) > 0 - elif source_type == "YouTube": - yt_url_to_check = st.session_state.get('yt_url_value', '') - can_process = len(str(yt_url_to_check).strip()) > 0 and check_yt_dlp() - elif source_type == "📁 Batch CSV Upload": - can_process = (st.session_state.get('batch_urls') and - len(st.session_state.get('batch_urls', [])) > 0 and - azure_configured and - not st.session_state.get('batch_processing', False)) - - # Process button - button_text = "🚀 Start Transcription" - if source_type == "📁 Batch CSV Upload": - count = len(st.session_state.get('batch_urls', [])) - button_text = f"🚀 Process {count} Videos from CSV" - - if st.button(button_text, type="primary", disabled=not can_process): - - # --- BATCH PROCESSING --- - if source_type == "📁 Batch CSV Upload": - st.session_state.batch_processing = True - st.session_state.batch_results = [] - - urls = st.session_state.get('batch_urls', []) - csv_df = st.session_state.get('batch_df') - url_column = st.session_state.get('batch_url_column') - id_column = st.session_state.get('batch_id_column') - - total = len(urls) - st.info(f"Starting batch processing of {total} videos...") - - # Progress UI - overall_progress = st.progress(0) - status_text = st.empty() - results_container = st.container() - - results = [] - for idx, url in enumerate(urls, 1): - # Get custom ID if specified - custom_vid_id = None - if id_column != "Auto-generate": - row = csv_df[csv_df[url_column] == url] - if not row.empty: - custom_vid_id = str(row[id_column].iloc[0]) - custom_vid_id = re.sub(r'[^\w\s-]', '', custom_vid_id).strip().replace(' ', '_')[:50] - - # Detect source type - url_type = detect_url_type(url) - src_type = "youtube" if url_type == "youtube" else "direct" - - # Process - result = process_single_video( - url=url, - custom_id=custom_vid_id, - source_type=src_type, - progress_bar=overall_progress, - status_text=status_text, - overall_progress=(idx, total) - ) - - results.append(result) - st.session_state.batch_results = results - - # Update progress - progress_pct = int((idx / total) * 100) - overall_progress.progress(progress_pct) - - # Show result - with results_container: - if result['status'] == 'success': - url_stored = "✅ URL saved" if result.get('url_stored') else "⚠️ URL not stored" - st.success(f"✅ [{idx}/{total}] {result['video_id']}: {result['segments_count']} segments ({url_stored})") - else: - error_msg = result.get('error', 'Unknown error')[:200] - st.error(f"❌ [{idx}/{total}] Failed: {error_msg}...") - - time.sleep(1) # Rate limiting - - # Final summary - overall_progress.progress(100) - status_text.text("Batch processing complete!") - - successful = [r for r in results if r['status'] == 'success'] - failed = [r for r in results if r['status'] == 'failed'] - - st.markdown("---") - st.subheader("📊 Batch Processing Summary") - - col1, col2, col3 = st.columns(3) - col1.metric("Total", total) - col2.metric("Successful", len(successful), f"{len(successful)/total*100:.1f}%" if total > 0 else "0%") - col3.metric("Failed", len(failed), f"{len(failed)/total*100:.1f}%" if total > 0 else "0%") - - # Detailed results - with st.expander("View Detailed Results"): - results_df = pd.DataFrame([ - { - 'Video ID': r['video_id'], - 'URL': r['url'][:50] + "..." if len(r['url']) > 50 else r['url'], - 'Source Type': r.get('source_type', 'unknown'), - 'Status': r['status'], - 'Segments': r.get('segments_count', 0), - 'URL Stored': r.get('url_stored', False), - 'Indexing': r.get('index_status', 'N/A'), - 'Error': (r.get('error', '')[:100] + '...') if r.get('error') else '' - } - for r in results - ]) - st.dataframe(results_df) - - # Download results - csv_buffer = io.StringIO() - results_df.to_csv(csv_buffer, index=False) - st.download_button("Download Results CSV", csv_buffer.getvalue(), "batch_results.csv", "text/csv") - - # Search hint - if successful: - st.info("💡 **Search processed videos:**") - video_ids = [r['video_id'] for r in successful[:5]] - st.code(f"video_id:({' OR '.join(video_ids)})") - - st.session_state.batch_processing = False - - # --- SINGLE VIDEO PROCESSING --- - else: - progress_bar = st.progress(0) - status = st.empty() - - try: - # Upload file if needed - if source_type == "File Upload" and file_bytes: - progress_bar.progress(10) - status.text("Uploading to Azure Blob...") - - blob_name = f"upload_{video_id}_{int(time.time())}.m4a" - - sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name) - if error and ("not installed" in error or "SDK" in error): - sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name) - - if error: - raise Exception(error) - - media_url = sas_url - progress_bar.progress(50) - - # Download YouTube if needed - elif source_type == "YouTube": - yt_url = st.session_state.get('yt_url_value', '') - - if not yt_url or not yt_url.strip(): - raise Exception("YouTube URL is empty") - - import tempfile - with tempfile.TemporaryDirectory() as tmpdir: - progress_bar.progress(10) - status.text("Downloading from YouTube...") - - output_path = f"{tmpdir}/youtube_{video_id}.m4a" - downloaded_path, error = download_youtube_audio(yt_url.strip(), output_path) - - if error: - raise Exception(error) - - progress_bar.progress(50) - status.text("Uploading to Azure Blob...") - - with open(downloaded_path, 'rb') as f: - file_bytes = f.read() - - blob_name = f"youtube_{video_id}_{int(time.time())}.m4a" - - sas_url, error = upload_to_azure_blob_sdk(file_bytes, blob_name) - if error and ("not installed" in error): - sas_url, error = upload_to_azure_blob_fixed(file_bytes, blob_name) - - if error: - raise Exception(error) - - media_url = sas_url - progress_bar.progress(75) - - if not media_url: - raise Exception("No media URL available") - - # Transcribe - status.text("Submitting to Azure Speech-to-Text...") - result = submit_transcription_direct(video_id, media_url) - operation_url = result.get("operation_url") - - if not operation_url: - raise Exception("No operation URL returned") - - # Poll - max_polls = 120 - transcription_data = None - - for i in range(max_polls): - time.sleep(POLL_SECONDS) - poll_result = poll_transcription_operation(operation_url) - status_text = poll_result.get("status", "unknown") - - progress = min(75 + int((i / max_polls) * 20), 95) - progress_bar.progress(progress) - status.text(f"Transcribing... ({i * POLL_SECONDS // 60} min) - Status: {status_text}") - - if status_text.lower() == "succeeded": - transcription_data = get_transcription_from_result(poll_result) - break - elif status_text.lower() == "failed": - raise Exception(f"Transcription failed: {poll_result.get('properties', {}).get('error', {}).get('message', 'Unknown error')}") - - if not transcription_data: - raise Exception("Transcription timed out") - - # Process and index - progress_bar.progress(98) - status.text("Processing segments and indexing...") - - segments = process_transcription_to_segments(transcription_data, video_id) - - # Save to blob - save_segments_to_blob(video_id, segments) - - # Index with URL tracking - original_url = None - if source_type == "YouTube": - original_url = st.session_state.get('yt_url_value', '') - elif source_type == "Direct URL": - original_url = media_url - elif source_type == "File Upload": - original_url = f"uploaded_file://{video_id}" - - index_result = index_segments_direct( - video_id, - segments, - source_url=original_url, - source_type=detected_source_type - ) - - url_stored_msg = "✅ Source URL stored" if index_result.get('source_url_stored') else "⚠️ URL storage not available" - - progress_bar.progress(100) - status.text("Complete!") - - st.success(f""" - ✅ **Transcription Complete!** - - Video ID: {video_id} - - Segments: {len(segments)} - - Source Type: {detected_source_type} - - Indexed: {index_result.get('indexed', 0)} documents - - {url_stored_msg} - """) - - if original_url: - st.info(f"**Original Source:** [{original_url}]({original_url})") - - st.code(f'Search: video_id:{video_id}') - - with st.expander("View first 5 segments"): - for seg in segments[:5]: - st.write(f"**{ms_to_ts(seg['start_ms'])} - {ms_to_ts(seg['end_ms'])}:** {seg['text'][:100]}...") - - except Exception as e: - st.error(f"❌ Error: {str(e)}") - st.exception(e) - - -# ============================================================================= -# PAGE 3: MANAGE VIDEOS -# ============================================================================= + upload_transcribe.show_upload_transcribe_page() elif page == "📚 Manage Videos": - st.header("📚 Manage Stored Videos") - st.info("View, search, and manage all processed videos and their source URLs") - - if not SEARCH_ENDPOINT or not SEARCH_KEY: - st.error("Azure Search not configured. Cannot retrieve video list.") - else: - # Check URL fields status - url_status = check_url_fields_status() - - if url_status['fields_exist']: - st.success("✅ URL tracking fields are configured") - else: - st.warning(f"⚠️ Missing URL fields: {', '.join(url_status['missing_fields'])}") - - # URL coverage analysis - if st.button("📊 Analyze URL Data Coverage"): - with st.spinner("Analyzing..."): - all_videos = get_stored_videos(include_missing=True) - - with_urls = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']] - without_urls = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']] - - col1, col2, col3 = st.columns(3) - col1.metric("Total Videos", len(all_videos)) - col2.metric("✅ With URL Data", len(with_urls), f"{len(with_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%") - col3.metric("⚠️ Missing URL Data", len(without_urls), f"{len(without_urls)/len(all_videos)*100:.1f}%" if all_videos else "0%") - - # By type breakdown - st.subheader("Breakdown by Source Type") - type_counts = {} - for v in all_videos: - t = v.get('source_type') or 'unknown' - type_counts[t] = type_counts.get(t, 0) + 1 - - cols = st.columns(len(type_counts) if type_counts else 1) - for i, (stype, count) in enumerate(sorted(type_counts.items())): - icon = "🎬" if stype == "youtube" else "📄" if stype == "direct" else "📁" if stype == "upload" else "❓" - cols[i % len(cols)].metric(f"{icon} {stype}", count) - - if without_urls: - with st.expander(f"Videos without URL data ({len(without_urls)})"): - st.info("These were likely processed before URL tracking was enabled") - for v in without_urls[:20]: - st.text(f"• {v.get('video_id')}") - - st.markdown("---") - - # Filters - st.subheader("Filter Videos") - col1, col2 = st.columns(2) - - with col1: - filter_video_id = st.text_input("Filter by Video ID (optional)") - with col2: - filter_options = ["All", "With URL Data Only", "Missing URL Data Only", "youtube", "direct", "upload", "unknown"] - filter_source_type = st.selectbox("Filter by Source Type", options=filter_options, index=0) - - # Load videos - if st.button("🔍 Load Videos", type="primary"): - with st.spinner("Retrieving videos..."): - - # Handle special filters - if filter_source_type == "Missing URL Data Only": - all_videos = get_stored_videos(include_missing=True) - videos = [v for v in all_videos if not v.get('source_url') or v.get('source_type') in ['', 'unknown']] - if filter_video_id.strip(): - videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()] - elif filter_source_type == "With URL Data Only": - all_videos = get_stored_videos(include_missing=True) - videos = [v for v in all_videos if v.get('source_url') and v.get('source_type') not in ['', 'unknown']] - if filter_video_id.strip(): - videos = [v for v in videos if filter_video_id.strip().lower() in v.get('video_id', '').lower()] - else: - source_type = None if filter_source_type == "All" else filter_source_type - videos = get_stored_videos( - video_id=filter_video_id if filter_video_id.strip() else None, - source_type=source_type, - include_missing=True, - limit=1000 - ) - - st.session_state.stored_videos_cache = videos - st.success(f"Found {len(videos)} videos") - - # Display videos - if st.session_state.stored_videos_cache: - videos = st.session_state.stored_videos_cache - - # Metrics - st.markdown("---") - cols = st.columns(4) - - type_counts = {} - for v in videos: - t = v.get('source_type') or 'unknown' - type_counts[t] = type_counts.get(t, 0) + 1 - - cols[0].metric("Total", len(videos)) - cols[1].metric("YouTube", type_counts.get('youtube', 0)) - cols[2].metric("Direct", type_counts.get('direct', 0)) - cols[3].metric("Upload", type_counts.get('upload', 0)) - - # Group by type - st.markdown("---") - st.subheader("Video List") - - videos_by_type = {} - for v in videos: - stype = v.get('source_type') or 'unknown' - if stype not in videos_by_type: - videos_by_type[stype] = [] - videos_by_type[stype].append(v) - - # Display by category - for source_type in ['youtube', 'direct', 'upload', 'unknown']: - if source_type not in videos_by_type: - continue - - type_videos = videos_by_type[source_type] - icon = "🎬" if source_type == "youtube" else "📄" if source_type == "direct" else "📁" if source_type == "upload" else "❓" - - with st.expander(f"{icon} {source_type.upper()} ({len(type_videos)} videos)", expanded=(source_type == 'youtube')): - for i, video in enumerate(type_videos, 1): - vid = video.get('video_id', 'unknown') - src_url = video.get('source_url', '') - processed = video.get('processed_at', 'unknown') - - has_url = bool(src_url) - status_icon = "✅" if has_url else "⚠️" - - with st.container(): - cols = st.columns([4, 1]) - - with cols[0]: - st.write(f"**{status_icon} {i}. {vid}**") - st.caption(f"Processed: {processed}") - - if src_url: - display_url = src_url[:80] + "..." if len(str(src_url)) > 80 else src_url - st.code(display_url) - if str(src_url).startswith('http'): - st.markdown(f"[Open Source ↗]({src_url})") - else: - st.warning("No source URL stored") - - with cols[1]: - if st.button(f"🗑️ Delete", key=f"del_{vid}_{i}_{source_type}"): - if delete_video_by_id(vid): - st.success(f"Deleted {vid}") - st.session_state.stored_videos_cache = [ - v for v in videos if v.get('video_id') != vid - ] - try: - st.rerun() - except: - pass - - st.markdown("---") - - # Export - st.markdown("---") - if st.button("📥 Export to CSV"): - export_df = pd.DataFrame([ - { - 'video_id': v.get('video_id'), - 'source_type': v.get('source_type') or 'unknown', - 'source_url': v.get('source_url', ''), - 'has_url_data': bool(v.get('source_url')), - 'processed_at': v.get('processed_at', 'unknown') - } - for v in videos - ]) - - csv_buffer = io.StringIO() - export_df.to_csv(csv_buffer, index=False) - st.download_button("Download CSV", csv_buffer.getvalue(), "video_list.csv", "text/csv") - - -# ============================================================================= -# PAGE 4: SYSTEM DIAGNOSTICS -# ============================================================================= + manage_videos.show_manage_videos_page() elif page == "⚙️ System Diagnostics": - st.header("⚙️ System Diagnostics") - st.info("Check system configuration and troubleshoot issues") - - # Configuration status - st.subheader("Configuration Status") - - config_checks = { - "Azure Speech (SPEECH_KEY)": bool(SPEECH_KEY), - "Azure OpenAI (AZURE_OPENAI_KEY)": bool(AZURE_OPENAI_KEY), - "Azure Search (SEARCH_KEY)": bool(SEARCH_KEY), - "Azure Storage (AZURE_STORAGE_KEY)": bool(AZURE_STORAGE_KEY), - "Search Function (SEARCH_FN_URL)": bool(SEARCH_FN_URL), - "yt-dlp installed": check_yt_dlp() - } - - cols = st.columns(2) - for i, (name, status) in enumerate(config_checks.items()): - icon = "✅" if status else "❌" - cols[i % 2].write(f"{icon} {name}: {'OK' if status else 'Not configured'}") - - # Index schema check - st.markdown("---") - st.subheader("Index Schema Check") - - if st.button("🔍 Check Index Schema"): - with st.spinner("Fetching schema..."): - schema = debug_check_index_schema() - - if isinstance(schema, dict): - st.success(f"Index: {schema['index_name']}") - st.write(f"Key Field: `{schema['key_field']}`") - - # URL fields status - if schema.get('has_all_url_fields'): - st.success("✅ All URL tracking fields present") - else: - st.warning(f"⚠️ Missing fields: {', '.join(schema.get('missing_url_fields', []))}") - - # Show all fields - with st.expander("View all fields"): - for field in schema['fields']: - key = "🔑" if field['key'] else "" - url = "🔗" if 'url' in field['name'].lower() else "" - st.caption(f"{key}{url} `{field['name']}` ({field['type']})") - - st.session_state.index_schema_cache = schema - else: - st.error(f"Schema check failed: {schema}") - - # Debug info - st.markdown("---") - st.subheader("Debug Information") - - with st.expander("Session State"): - st.json({ - k: str(v)[:100] + "..." if len(str(v)) > 100 else v - for k, v in st.session_state.items() - }) - - with st.expander("Recent Processing Debug"): - if st.session_state.get('debug_info'): - st.json(st.session_state['debug_info']) - else: - st.info("No debug info yet. Process a video first.") - - -# Footer -st.sidebar.markdown("---") -st.sidebar.caption("Video Annotation Platform v2.1") \ No newline at end of file + system_diagnostics.show_system_diagnostics_page() \ No newline at end of file diff --git a/ui/upload_transcribe.py b/ui/upload_transcribe.py new file mode 100644 index 0000000..96414dc --- /dev/null +++ b/ui/upload_transcribe.py @@ -0,0 +1,461 @@ +""" +upload_transcribe.py - Upload & Transcribe page for the Video Annotation Platform +""" + +import streamlit as st +import time +import pandas as pd +import io +import tempfile +from typing import Tuple, Optional + +# Import shared utilities from ui_search (must be in same directory) +import ui_search + +def show_upload_transcribe_page(): + """Display the Upload & Transcribe page.""" + st.header("Upload Video for Transcription") + + # Check URL fields status + url_status = ui_search.check_url_fields_status() + + if url_status['fields_exist']: + st.success("✅ URL Tracking Enabled - Original source URLs will be stored") + else: + st.warning(f""" + ⚠️ **Partial URL Tracking** - Missing fields: {', '.join(url_status['missing_fields'])} + + Videos will still be processed, but URL information will be limited. + Add missing fields to your Azure Search index for full functionality. + """) + + # Check Azure configuration + azure_configured = bool(ui_search.AZURE_STORAGE_KEY) and bool(ui_search.SPEECH_KEY) + if not azure_configured: + st.error("⚠️ Azure Storage and Speech keys required. Check .env file.") + + # Source selection + source_type = st.radio("Select Source", + ["File Upload", "Direct URL", "YouTube", "📁 Batch CSV Upload"], + horizontal=True) + + media_url = None + video_id = None + file_bytes = None + yt_url = None + csv_df = None + detected_source_type = "unknown" + + # --- File Upload --- + if source_type == "File Upload": + if not azure_configured: + st.info("Please configure Azure Storage to enable file upload") + else: + uploaded_file = st.file_uploader( + "Choose video/audio file", + type=["mp4", "avi", "mov", "mkv", "m4a", "mp3", "wav"], + accept_multiple_files=False + ) + + if uploaded_file: + st.success(f"📁 {uploaded_file.name} ({uploaded_file.size / 1024 / 1024:.1f} MB)") + file_bytes = uploaded_file.getvalue() + video_id = ui_search.generate_video_id(uploaded_file.name) + detected_source_type = "upload" + st.info("File ready for upload") + + # --- Direct URL --- + elif source_type == "Direct URL": + url_input = st.text_input("Media URL", placeholder="https://tulane.box.com/shared/static/ ...") + + if url_input.strip(): + media_url = url_input.strip() + video_id = ui_search.generate_video_id(url_input) + detected_source_type = "direct" + st.success("✅ URL validated") + + # --- YouTube --- + elif source_type == "YouTube": + yt_url = st.text_input( + "YouTube URL", + placeholder="https://youtube.com/watch?v= ...", + value=st.session_state.yt_url_value, + key="yt_url_input" + ) + + # Update session state + if yt_url != st.session_state.yt_url_value: + st.session_state.yt_url_value = yt_url + try: + st.rerun() + except: + pass + + # Check yt-dlp + if not ui_search.check_yt_dlp(): + st.warning("yt-dlp not installed") + if st.button("Install yt-dlp"): + with st.spinner("Installing..."): + import subprocess + subprocess.run(["pip", "install", "-q", "yt-dlp"]) + try: + st.rerun() + except: + st.info("Please refresh the page") + elif yt_url and yt_url.strip(): + video_id = ui_search.generate_video_id(f"yt_{yt_url.strip()}") + detected_source_type = "youtube" + st.success("YouTube URL ready") + + # --- Batch CSV Upload --- + elif source_type == "📁 Batch CSV Upload": + st.subheader("📁 Batch Process Videos from CSV") + + csv_file = st.file_uploader( + "Upload CSV file", + type=["csv"], + help="CSV must contain a column with video URLs" + ) + + if csv_file: + try: + # Read CSV with flexible parsing + try: + csv_df = pd.read_csv(csv_file) + except Exception: + csv_file.seek(0) + csv_df = pd.read_csv(csv_file, header=None) + csv_df.columns = [f"column_{i}" for i in range(len(csv_df.columns))] + + # Handle case where column names are URLs + url_like_columns = [] + for col in csv_df.columns: + col_str = str(col).strip() + if ui_search.detect_url_type(col_str) != "unknown": + url_like_columns.append(col) + + if url_like_columns and len(csv_df.columns) == 1: + url_col_name = csv_df.columns[0] + new_row = {url_col_name: url_col_name} + csv_df = pd.concat([pd.DataFrame([new_row]), csv_df], ignore_index=True) + + st.success(f"✅ Loaded CSV with {len(csv_df)} rows") + + # Column selection + url_column = st.selectbox("Select column containing video URLs", options=csv_df.columns.tolist()) + + id_column_options = ["Auto-generate"] + [c for c in csv_df.columns if c != url_column] + id_column = st.selectbox("Select column for custom Video ID (optional)", options=id_column_options, index=0) + + # Extract and validate URLs + urls_raw = csv_df[url_column].dropna().astype(str).tolist() + urls_to_process = [u.strip() for u in urls_raw if u.strip()] + + # Preview + with st.expander(f"Preview URLs ({len(urls_to_process)} found)"): + for i, url in enumerate(urls_to_process[:10], 1): + url_type = ui_search.detect_url_type(url) + icon = "🎬" if url_type == "youtube" else "📄" if url_type == "direct" else "❓" + st.text(f"{i}. {icon} {url[:80]}...") + + # Validate + valid_urls = [] + invalid_urls = [] + for url in urls_to_process: + url_type = ui_search.detect_url_type(str(url)) + if url_type in ["youtube", "direct"]: + valid_urls.append(url) + else: + invalid_urls.append(url) + + col1, col2, col3 = st.columns(3) + col1.metric("Total", len(urls_to_process)) + col2.metric("✅ Valid", len(valid_urls)) + col3.metric("❌ Invalid", len(invalid_urls)) + + # Store in session state + st.session_state['batch_urls'] = valid_urls + st.session_state['batch_df'] = csv_df + st.session_state['batch_url_column'] = url_column + st.session_state['batch_id_column'] = id_column + + except Exception as e: + st.error(f"Error reading CSV: {e}") + import traceback + st.error(traceback.format_exc()) + + # Custom ID input + custom_id = st.text_input("Custom Video ID (optional)") + if custom_id.strip() and source_type != "📁 Batch CSV Upload": + video_id = custom_id.strip() + + # Determine if we can process + can_process = False + if source_type == "File Upload": + can_process = file_bytes is not None and azure_configured + elif source_type == "Direct URL": + can_process = media_url is not None and len(str(media_url).strip()) > 0 + elif source_type == "YouTube": + yt_url_to_check = st.session_state.get('yt_url_value', '') + can_process = len(str(yt_url_to_check).strip()) > 0 and ui_search.check_yt_dlp() + elif source_type == "📁 Batch CSV Upload": + can_process = (st.session_state.get('batch_urls') and + len(st.session_state.get('batch_urls', [])) > 0 and + azure_configured and + not st.session_state.get('batch_processing', False)) + + # Process button + button_text = "🚀 Start Transcription" + if source_type == "📁 Batch CSV Upload": + count = len(st.session_state.get('batch_urls', [])) + button_text = f"🚀 Process {count} Videos from CSV" + + if st.button(button_text, type="primary", disabled=not can_process): + + # --- BATCH PROCESSING --- + if source_type == "📁 Batch CSV Upload": + st.session_state.batch_processing = True + st.session_state.batch_results = [] + + urls = st.session_state.get('batch_urls', []) + csv_df = st.session_state.get('batch_df') + url_column = st.session_state.get('batch_url_column') + id_column = st.session_state.get('batch_id_column') + + total = len(urls) + st.info(f"Starting batch processing of {total} videos...") + + # Progress UI + overall_progress = st.progress(0) + status_text = st.empty() + results_container = st.container() + + results = [] + for idx, url in enumerate(urls, 1): + # Get custom ID if specified + custom_vid_id = None + if id_column != "Auto-generate": + row = csv_df[csv_df[url_column] == url] + if not row.empty: + custom_vid_id = str(row[id_column].iloc[0]) + custom_vid_id = re.sub(r'[^\w\s-]', '', custom_vid_id).strip().replace(' ', '_')[:50] + + # Detect source type + url_type = ui_search.detect_url_type(url) + src_type = "youtube" if url_type == "youtube" else "direct" + + # Process + result = ui_search.process_single_video( + url=url, + custom_id=custom_vid_id, + source_type=src_type, + progress_bar=overall_progress, + status_text=status_text, + overall_progress=(idx, total) + ) + + results.append(result) + st.session_state.batch_results = results + + # Update progress + progress_pct = int((idx / total) * 100) + overall_progress.progress(progress_pct) + + # Show result + with results_container: + if result['status'] == 'success': + url_stored = "✅ URL saved" if result.get('url_stored') else "⚠️ URL not stored" + st.success(f"✅ [{idx}/{total}] {result['video_id']}: {result['segments_count']} segments ({url_stored})") + else: + error_msg = result.get('error', 'Unknown error')[:200] + st.error(f"❌ [{idx}/{total}] Failed: {error_msg}...") + + time.sleep(1) # Rate limiting + + # Final summary + overall_progress.progress(100) + status_text.text("Batch processing complete!") + + successful = [r for r in results if r['status'] == 'success'] + failed = [r for r in results if r['status'] == 'failed'] + + st.markdown("---") + st.subheader("📊 Batch Processing Summary") + + col1, col2, col3 = st.columns(3) + col1.metric("Total", total) + col2.metric("Successful", len(successful), f"{len(successful)/total*100:.1f}%" if total > 0 else "0%") + col3.metric("Failed", len(failed), f"{len(failed)/total*100:.1f}%" if total > 0 else "0%") + + # Detailed results + with st.expander("View Detailed Results"): + results_df = pd.DataFrame([ + { + 'Video ID': r['video_id'], + 'URL': r['url'][:50] + "..." if len(r['url']) > 50 else r['url'], + 'Source Type': r.get('source_type', 'unknown'), + 'Status': r['status'], + 'Segments': r.get('segments_count', 0), + 'URL Stored': r.get('url_stored', False), + 'Indexing': r.get('index_status', 'N/A'), + 'Error': (r.get('error', '')[:100] + '...') if r.get('error') else '' + } + for r in results + ]) + st.dataframe(results_df) + + # Download results + csv_buffer = io.StringIO() + results_df.to_csv(csv_buffer, index=False) + st.download_button("Download Results CSV", csv_buffer.getvalue(), "batch_results.csv", "text/csv") + + # Search hint + if successful: + st.info("💡 **Search processed videos:**") + video_ids = [r['video_id'] for r in successful[:5]] + st.code(f"video_id:({' OR '.join(video_ids)})") + + st.session_state.batch_processing = False + + # --- SINGLE VIDEO PROCESSING --- + else: + progress_bar = st.progress(0) + status = st.empty() + + try: + # Upload file if needed + if source_type == "File Upload" and file_bytes: + progress_bar.progress(10) + status.text("Uploading to Azure Blob...") + + blob_name = f"upload_{video_id}_{int(time.time())}.m4a" + + sas_url, error = ui_search.upload_to_azure_blob_sdk(file_bytes, blob_name) + if error and ("not installed" in error or "SDK" in error): + sas_url, error = ui_search.upload_to_azure_blob_fixed(file_bytes, blob_name) + + if error: + raise Exception(error) + + media_url = sas_url + progress_bar.progress(50) + + # Download YouTube if needed + elif source_type == "YouTube": + yt_url = st.session_state.get('yt_url_value', '') + + if not yt_url or not yt_url.strip(): + raise Exception("YouTube URL is empty") + + with tempfile.TemporaryDirectory() as tmpdir: + progress_bar.progress(10) + status.text("Downloading from YouTube...") + + output_path = f"{tmpdir}/youtube_{video_id}.m4a" + downloaded_path, error = ui_search.download_youtube_audio(yt_url.strip(), output_path) + + if error: + raise Exception(error) + + progress_bar.progress(50) + status.text("Uploading to Azure Blob...") + + with open(downloaded_path, 'rb') as f: + file_bytes = f.read() + + blob_name = f"youtube_{video_id}_{int(time.time())}.m4a" + + sas_url, error = ui_search.upload_to_azure_blob_sdk(file_bytes, blob_name) + if error and ("not installed" in error): + sas_url, error = ui_search.upload_to_azure_blob_fixed(file_bytes, blob_name) + + if error: + raise Exception(error) + + media_url = sas_url + progress_bar.progress(75) + + if not media_url: + raise Exception("No media URL available") + + # Transcribe + status.text("Submitting to Azure Speech-to-Text...") + result = ui_search.submit_transcription_direct(video_id, media_url) + operation_url = result.get("operation_url") + + if not operation_url: + raise Exception("No operation URL returned") + + # Poll + max_polls = 120 + transcription_data = None + + for i in range(max_polls): + time.sleep(ui_search.POLL_SECONDS) + poll_result = ui_search.poll_transcription_operation(operation_url) + status_text = poll_result.get("status", "unknown") + + progress = min(75 + int((i / max_polls) * 20), 95) + progress_bar.progress(progress) + status.text(f"Transcribing... ({i * ui_search.POLL_SECONDS // 60} min) - Status: {status_text}") + + if status_text.lower() == "succeeded": + transcription_data = ui_search.get_transcription_from_result(poll_result) + break + elif status_text.lower() == "failed": + raise Exception(f"Transcription failed: {poll_result.get('properties', {}).get('error', {}).get('message', 'Unknown error')}") + + if not transcription_data: + raise Exception("Transcription timed out") + + # Process and index + progress_bar.progress(98) + status.text("Processing segments and indexing...") + + segments = ui_search.process_transcription_to_segments(transcription_data, video_id) + + # Save to blob + ui_search.save_segments_to_blob(video_id, segments) + + # Index with URL tracking + original_url = None + if source_type == "YouTube": + original_url = st.session_state.get('yt_url_value', '') + elif source_type == "Direct URL": + original_url = media_url + elif source_type == "File Upload": + original_url = f"uploaded_file://{video_id}" + + index_result = ui_search.index_segments_direct( + video_id, + segments, + source_url=original_url, + source_type=detected_source_type + ) + + url_stored_msg = "✅ Source URL stored" if index_result.get('source_url_stored') else "⚠️ URL storage not available" + + progress_bar.progress(100) + status.text("Complete!") + + st.success(f""" + ✅ **Transcription Complete!** + - Video ID: {video_id} + - Segments: {len(segments)} + - Source Type: {detected_source_type} + - Indexed: {index_result.get('indexed', 0)} documents + - {url_stored_msg} + """) + + if original_url: + st.info(f"**Original Source:** [{original_url}]({original_url})") + + st.code(f'Search: video_id:{video_id}') + + with st.expander("View first 5 segments"): + for seg in segments[:5]: + st.write(f"**{ui_search.ms_to_ts(seg['start_ms'])} - {ui_search.ms_to_ts(seg['end_ms'])}:** {seg['text'][:100]}...") + + except Exception as e: + st.error(f"❌ Error: {str(e)}") + st.exception(e) \ No newline at end of file From 892ef5f4b3ca57fd085e2d2ed67ff655ddfb5b51 Mon Sep 17 00:00:00 2001 From: Martin Nwadiugwu Date: Mon, 2 Mar 2026 21:04:32 -0600 Subject: [PATCH 8/8] Revert .gitignore to upstream version --- .gitignore | 181 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 172 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index e29811b..6e659d7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,175 @@ -# macOS -.DS_Store +local.settings.json +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class -# JSON lines -*.jsonl +# C extensions +*.so -# Python bytecode -*.pyc -__pycache__/ +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Ruff stuff: +.ruff_cache/ -# Local virtual env -func_venv/ +# PyPI configuration file +.pypirc