# Review notes. These lines precede the first "diff --git" header and belong to no hunk.
# NOTE(review): the line breaks of this patch appear to have been collapsed into spaces,
# so +/- markers, blank context lines and hunk line counts cannot be confirmed from this
# text; the patch body below is kept in that same collapsed form.
#
# Bot/app/api/meetings.py
#   * Adds `import re` and `import html`.
#   * get_meetings resolves each event's meeting URL in this order: the first
#     conferenceData entry point whose entryPointType is 'video' and that has a uri,
#     then hangoutLink, then the first URL extracted from the html-unescaped
#     location, description and summary fields.
#   * extract_meeting_url_from_text is imported from
#     attendee.bots.tasks.sync_calendar_task, then bots.tasks.sync_calendar_task;
#     when both imports fail a local copy is defined (apparently inside get_meetings).
#   * NOTE(review): when meeting_url_utils cannot be imported either, the stand-in
#     meeting_type_from_url returns True for every input, so the local copy returns
#     the first http(s) link found in the text regardless of host.
#   * NOTE(review): the local copy also returns the candidate when
#     meeting_type_from_url raises, rather than skipping it.
#   * NOTE(review): URL_CANDIDATE's character class excludes '>', so the added
#     rstrip('>') cannot remove anything from a match.
#
# Bot/app/core/config.py
#   * _fetch_user_id_async: removes the docstring, the repeated `if val:` branch and
#     the 'session table returned no rows' warning; the query string moves into a
#     local `query` variable.
#   * Diagnostics go through the module `logger` at debug level. The database URL
#     value itself is not logged, only whether one was resolved (presumably the DSN
#     can embed a password; verify against _build_db_url_from_env).
#
# attendee/bots/meeting_url_utils.py
#   * Adds SCHEME_LESS_PATTERNS: regex strings for zoom.us (optionally with a
#     subdomain prefix), meet.google.com and teams.microsoft.com links.
#
# attendee/bots/tasks/sync_calendar_task.py
#   * URL_CANDIDATE is compiled with re.IGNORECASE.
#   * extract_meeting_url_from_text gains a second pass over SCHEME_LESS_PATTERNS
#     that prepends "https://" to matches not starting with "http" and returns the
#     first one meeting_type_from_url accepts.
#   * _remote_event_to_calendar_event_data unescapes location/description/summary,
#     tries conferenceData, hangoutLink, location, description, summary in that
#     order, records which source matched, and logs the result at debug level.
#   * NOTE(review): the final "loose_text_zoom" search uses `\S+` where
#     SCHEME_LESS_PATTERNS uses `[^\s<>\"']+`, so it can capture trailing quotes or
#     angle brackets; it runs only after the pattern-based pass found nothing.
#
# docker-compose.yml
#   * Comments out the chrome_seccomp.json profile and adds `seccomp:unconfined`
#     under security_opt of the service running run_webpage_streamer.py.
#   * NOTE(review): presumably this disables seccomp filtering for that container;
#     confirm it is intended outside local debugging.
diff --git a/Bot/app/api/meetings.py b/Bot/app/api/meetings.py index 40747b0..4894f4a 100644 --- a/Bot/app/api/meetings.py +++ b/Bot/app/api/meetings.py @@ -14,6 +14,8 @@ import json from app.core import config import os +import re +import html # Now you can access the values like this: CLIENT_ID = os.getenv("CLIENT_ID") @@ -70,10 +72,90 @@ def get_meetings(body: ScheduleMeeting, token: str = Depends(OAUTH2_SCHEME)): events = events_result.get('items', []) scheduled_meetings = [] meetings_map = {} - for event in events: - meeting_url = event.get('hangoutLink') + # Try to import the canonical extraction logic from the attendee package. + # In some deployment setups the `attendee` package may not be on sys.path + # (separate services). Fall back to a local implementation that mirrors + # the attendee behaviour and will attempt to use meeting_type_from_url if + # that helper is importable. + try: + from attendee.bots.tasks.sync_calendar_task import extract_meeting_url_from_text # type: ignore + except Exception: + try: + # Try alternative import path used in the attendee codebase + from bots.tasks.sync_calendar_task import extract_meeting_url_from_text # type: ignore + except Exception: + # Fallback implementation copied from attendee/bots/tasks/sync_calendar_task.py + from typing import Optional + + URL_CANDIDATE = re.compile(r"https?://[^\s<>\"']+", re.IGNORECASE) - # check is bot request is aleady sent for the meeting + try: + # Prefer to reuse meeting_type_from_url and the canonical patterns when available + from bots.meeting_url_utils import meeting_type_from_url, SCHEME_LESS_PATTERNS # type: ignore + except Exception: + try: + from attendee.bots.meeting_url_utils import meeting_type_from_url, SCHEME_LESS_PATTERNS # type: ignore + except Exception: + # Last-resort: accept any url (best-effort) and provide default patterns + def meeting_type_from_url(url: str): + return True + SCHEME_LESS_PATTERNS = [ + r"(?:[\w.-]+\.)?zoom\.us/[^\s<>\"']+", + 
r"meet\.google\.com/[^\s<>\"']+", + r"teams\.microsoft\.com/[^\s<>\"']+", + ] + + def extract_meeting_url_from_text(text: str) -> Optional[str]: + if not text: + return None + # First pass: look for normal https:// links (case-insensitive) + for m in URL_CANDIDATE.finditer(text): + url = m.group(0).rstrip(").,;]}") + # strip trailing '>' that sometimes remains from markdown/angle-bracket wrapping + url = url.rstrip('>') + try: + if meeting_type_from_url(url): + return url + except Exception: + return url + + # Fallback: links without scheme (e.g., "zoom.us/j/12345") or mixed-case scheme + for pat in SCHEME_LESS_PATTERNS: + for m in re.finditer(pat, text, flags=re.IGNORECASE): + candidate = m.group(0) + if not candidate.lower().startswith("http"): + candidate = "https://" + candidate + try: + if meeting_type_from_url(candidate): + return candidate + except Exception: + return candidate + return None + + for event in events: + # Prefer structured conferenceData entryPoints (video) when available + meeting_url = None + conf = event.get('conferenceData') + if conf: + entry_points = conf.get('entryPoints', []) + for ep in entry_points: + if ep.get('entryPointType') == 'video' and ep.get('uri'): + meeting_url = ep.get('uri') + break + # fallbacks + if not meeting_url: + meeting_url = event.get('hangoutLink') + if not meeting_url: + # check location, description, summary for embedded links + for field in ('location', 'description', 'summary'): + val = event.get(field) + if val: + val = html.unescape(val) + meeting_url = extract_meeting_url_from_text(val) + if meeting_url: + break + + # check is bot request is already sent for the meeting json_str = redis_client.get(BOT_ADDED_IN_MEETING_KEY) if json_str: meetings_map = json.loads(json_str) diff --git a/Bot/app/core/config.py b/Bot/app/core/config.py index 75ba517..5f3c554 100644 --- a/Bot/app/core/config.py +++ b/Bot/app/core/config.py @@ -74,38 +74,27 @@ def _set_user_id_in_redis(value, key='bot:user_id', 
ttl=REDIS_USER_ID_TTL): logger.exception('Error writing USER_ID to Redis') return False - async def _fetch_user_id_async(db_url: str): - """Connect to Postgres and return a probable user id from the session table. - - This is intentionally defensive: it looks for common column names and falls - back to the first column value if necessary. Returns empty string on errors. - """ try: conn = await asyncpg.connect(dsn=db_url) - try: - # Prefer a non-expired session (expires_at in the future), most - # recently created first. If none found, fall back to the most - # recent session regardless of expiry. - val = await conn.fetchval( - "SELECT user_id FROM session LIMIT 1" - ) - if val: - return str(val) + logger.debug("Connected successfully!") + try: + query = "SELECT user_id FROM session LIMIT 1" + val = await conn.fetchval(query) if val: - print("Fetched user_id from session table:", val) return str(val) - - logger.warning('session table returned no rows') return '' + finally: await conn.close() - except Exception: - logger.exception('Error fetching USER_ID from DB') + + except Exception: + logger.exception("Error fetching USER_ID from DB") return '' + def get_user_id(): """Synchronous helper that returns the USER_ID, fetching it from DB if needed. @@ -129,6 +118,7 @@ def get_user_id(): pass db_url = DATABASE_URL or _build_db_url_from_env() + logger.debug("Database URL configured: %s", bool(db_url)) if not db_url: logger.error('No DATABASE_URL or PG_* env vars set; cannot fetch USER_ID') _USER_ID_CACHE = '' diff --git a/attendee/bots/meeting_url_utils.py b/attendee/bots/meeting_url_utils.py index 3ef56ac..dc4a76c 100644 --- a/attendee/bots/meeting_url_utils.py +++ b/attendee/bots/meeting_url_utils.py @@ -11,6 +11,14 @@ HTTP_URL_RE = re.compile(r"https?://[^\s<>\"']+") +# Patterns used to detect scheme-less meeting URLs (e.g. "zoom.us/j/123456") +# Keep these here as the canonical source so other modules can reuse them. 
+SCHEME_LESS_PATTERNS = [ + r"(?:[\w.-]+\.)?zoom\.us/[^\s<>\"']+", + r"meet\.google\.com/[^\s<>\"']+", + r"teams\.microsoft\.com/[^\s<>\"']+", +] + def contains_multiple_urls(url: str): if not url: diff --git a/attendee/bots/tasks/sync_calendar_task.py b/attendee/bots/tasks/sync_calendar_task.py index 85645c6..25ad288 100644 --- a/attendee/bots/tasks/sync_calendar_task.py +++ b/attendee/bots/tasks/sync_calendar_task.py @@ -7,6 +7,7 @@ from zoneinfo import ZoneInfo import dateutil.parser +import html import requests from celery import shared_task from django.db import transaction @@ -14,23 +15,36 @@ from bots.bots_api_utils import delete_bot, patch_bot from bots.calendars_api_utils import remove_bots_from_calendar -from bots.meeting_url_utils import meeting_type_from_url +from bots.meeting_url_utils import meeting_type_from_url, SCHEME_LESS_PATTERNS from bots.models import Bot, BotStates, Calendar, CalendarEvent, CalendarPlatform, CalendarStates, WebhookTriggerTypes from bots.webhook_payloads import calendar_webhook_payload from bots.webhook_utils import trigger_webhook logger = logging.getLogger(__name__) -URL_CANDIDATE = re.compile(r"https?://[^\s<>\"']+") +URL_CANDIDATE = re.compile(r"https?://[^\s<>\"']+", re.IGNORECASE) def extract_meeting_url_from_text(text: str) -> Optional[str]: if not text: return None + # First pass: look for normal https:// links (case-insensitive) for m in URL_CANDIDATE.finditer(text): - url = m.group(0).rstrip(").,;]}>") + url = m.group(0).rstrip(").,;]}") + # strip trailing '>' that sometimes remains from markdown/angle-bracket wrapping + url = url.rstrip('>') if meeting_type_from_url(url): return url + + # Fallback: links without scheme (e.g., "zoom.us/j/12345") or mixed-case scheme + # Try to find common meeting host patterns and prepend https:// when detected + for pat in SCHEME_LESS_PATTERNS: + for m in re.finditer(pat, text, flags=re.IGNORECASE): + candidate = m.group(0) + if not candidate.lower().startswith("http"): + candidate = 
"https://" + candidate + if meeting_type_from_url(candidate): + return candidate return None @@ -457,13 +471,86 @@ def _remote_event_to_calendar_event_data(self, google_event: dict) -> dict: # Extract meeting URL if present meeting_url_from_conference_data = None + entry_points = [] if "conferenceData" in google_event: entry_points = google_event["conferenceData"].get("entryPoints", []) for entry_point in entry_points: if entry_point.get("entryPointType") == "video": meeting_url_from_conference_data = entry_point.get("uri") break - meeting_url = extract_meeting_url_from_text(meeting_url_from_conference_data) or extract_meeting_url_from_text(google_event.get("hangoutLink")) or extract_meeting_url_from_text(google_event.get("location")) or extract_meeting_url_from_text(google_event.get("description")) or extract_meeting_url_from_text(google_event.get("summary")) + + # Normalize/unescape free-text fields before extraction + hangout_link = google_event.get("hangoutLink") + location_text = google_event.get("location") + description_text = google_event.get("description") + summary_text = google_event.get("summary") + + if description_text: + description_text = html.unescape(description_text) + if location_text: + location_text = html.unescape(location_text) + if summary_text: + summary_text = html.unescape(summary_text) + + meeting_url = None + meeting_url_source = None + + # Check in order and record the source field for logging + if meeting_url_from_conference_data: + meeting_url = extract_meeting_url_from_text(meeting_url_from_conference_data) + if meeting_url: + meeting_url_source = "conferenceData.entryPoints" + + if not meeting_url and hangout_link: + meeting_url = extract_meeting_url_from_text(hangout_link) + if meeting_url: + meeting_url_source = "hangoutLink" + + if not meeting_url and location_text: + meeting_url = extract_meeting_url_from_text(location_text) + if meeting_url: + meeting_url_source = "location" + + if not meeting_url and description_text: + 
meeting_url = extract_meeting_url_from_text(description_text) + if meeting_url: + meeting_url_source = "description" + + if not meeting_url and summary_text: + meeting_url = extract_meeting_url_from_text(summary_text) + if meeting_url: + meeting_url_source = "summary" + + # Loose fallback: try to detect scheme-less Zoom/Meet/Teams links inside text + if not meeting_url: + # Try entry points liberally (some providers set non-standard URIs) + if entry_points: + for ep in entry_points: + uri = ep.get("uri") + if uri: + candidate = uri + if not candidate.lower().startswith("http"): + candidate = "https://" + candidate + if meeting_type_from_url(candidate): + meeting_url = extract_meeting_url_from_text(candidate) or candidate + meeting_url_source = "conferenceData.entryPoints.loose" + break + + # Try scheme-less patterns in text fields + if not meeting_url: + loose_text = "\n".join(filter(None, [location_text, description_text, summary_text])) + if loose_text: + # common patterns + m = re.search(r"(?:[\w.-]+\.)?zoom\.us/\S+", loose_text, flags=re.IGNORECASE) + if m: + candidate = m.group(0) + if not candidate.lower().startswith("http"): + candidate = "https://" + candidate + if meeting_type_from_url(candidate): + meeting_url = candidate + meeting_url_source = "loose_text_zoom" + + logger.debug("Event %s: extracted meeting_url=%s source=%s", google_event.get("id"), meeting_url, meeting_url_source) # Extract attendees attendees = [] diff --git a/docker-compose.yml b/docker-compose.yml index f8704b9..447fbe7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -155,7 +155,8 @@ services: # - "8001:8000" command: "python bots/webpage_streamer/run_webpage_streamer.py" security_opt: - - seccomp=attendee/bots/web_bot_adapter/chrome_seccomp.json + # - seccomp=attendee/bots/web_bot_adapter/chrome_seccomp.json + - seccomp:unconfined depends_on: - postgres-attendee - redis