From 0e25241b6855fa1d9e964105d777380b0ca6d623 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 9 Mar 2026 10:47:11 -0500 Subject: [PATCH 01/25] feat: Integrate IBM COS connector with HMAC as default connection method --- .../mutations/useConnectConnectorMutation.ts | 5 + .../mutations/useIBMCOSConfigureMutation.ts | 40 ++ .../app/api/mutations/useSyncConnector.ts | 4 + .../api/queries/useIBMCOSBucketStatusQuery.ts | 34 ++ .../app/api/queries/useIBMCOSBucketsQuery.ts | 23 ++ .../app/api/queries/useIBMCOSDefaultsQuery.ts | 28 ++ frontend/app/knowledge/page.tsx | 4 + .../settings/_components/connector-card.tsx | 59 ++- .../settings/_components/connector-cards.tsx | 76 ++-- .../_components/ibm-cos-settings-dialog.tsx | 198 ++++++++++ .../_components/ibm-cos-settings-form.tsx | 305 +++++++++++++++ frontend/app/upload/[provider]/page.tsx | 204 +++++++++- .../components/cloud-picker/file-item.tsx | 3 + frontend/components/icons/ibm-cos-icon.tsx | 63 +++ pyproject.toml | 1 + src/api/connectors.py | 322 ++++++++++++++- src/connectors/__init__.py | 2 + src/connectors/connection_manager.py | 9 + src/connectors/ibm_cos/__init__.py | 3 + src/connectors/ibm_cos/auth.py | 183 +++++++++ src/connectors/ibm_cos/connector.py | 368 ++++++++++++++++++ src/connectors/langflow_connector_service.py | 59 +-- src/main.py | 5 + src/services/auth_service.py | 66 +++- src/tui/config_fields.py | 30 ++ src/tui/managers/env_manager.py | 17 + src/utils/file_utils.py | 22 +- uv.lock | 40 +- 28 files changed, 2085 insertions(+), 88 deletions(-) create mode 100644 frontend/app/api/mutations/useIBMCOSConfigureMutation.ts create mode 100644 frontend/app/api/queries/useIBMCOSBucketStatusQuery.ts create mode 100644 frontend/app/api/queries/useIBMCOSBucketsQuery.ts create mode 100644 frontend/app/api/queries/useIBMCOSDefaultsQuery.ts create mode 100644 frontend/app/settings/_components/ibm-cos-settings-dialog.tsx create mode 100644 frontend/app/settings/_components/ibm-cos-settings-form.tsx create 
mode 100644 frontend/components/icons/ibm-cos-icon.tsx create mode 100644 src/connectors/ibm_cos/__init__.py create mode 100644 src/connectors/ibm_cos/auth.py create mode 100644 src/connectors/ibm_cos/connector.py diff --git a/frontend/app/api/mutations/useConnectConnectorMutation.ts b/frontend/app/api/mutations/useConnectConnectorMutation.ts index 893a90086..6e0faf4b8 100644 --- a/frontend/app/api/mutations/useConnectConnectorMutation.ts +++ b/frontend/app/api/mutations/useConnectConnectorMutation.ts @@ -81,6 +81,11 @@ export const useConnectConnectorMutation = () => { `state=${result.connection_id}`; window.location.href = authUrl; + } else { + // Direct-auth connector (e.g. IBM COS) — credentials already verified, + // no OAuth redirect needed. Refresh connector status. + queryClient.invalidateQueries({ queryKey: ["connectors"] }); + toast.success(`${connector.name} connected successfully`); } }, }); diff --git a/frontend/app/api/mutations/useIBMCOSConfigureMutation.ts b/frontend/app/api/mutations/useIBMCOSConfigureMutation.ts new file mode 100644 index 000000000..89e0e23f4 --- /dev/null +++ b/frontend/app/api/mutations/useIBMCOSConfigureMutation.ts @@ -0,0 +1,40 @@ +import { useMutation, useQueryClient } from "@tanstack/react-query"; + +export interface IBMCOSConfigurePayload { + auth_mode: "iam" | "hmac"; + endpoint: string; + // IAM + api_key?: string; + service_instance_id?: string; + auth_endpoint?: string; + // HMAC + hmac_access_key?: string; + hmac_secret_key?: string; + // Bucket selection + bucket_names?: string[]; + // Updating an existing connection + connection_id?: string; +} + +async function configureIBMCOS(payload: IBMCOSConfigurePayload) { + const res = await fetch("/api/connectors/ibm_cos/configure", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + }); + const data = await res.json(); + if (!res.ok) throw new Error(data.error || "Failed to configure IBM COS"); + return data as { 
connection_id: string; status: string }; +} + +export function useIBMCOSConfigureMutation() { + const queryClient = useQueryClient(); + + return useMutation({ + mutationFn: configureIBMCOS, + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ["connectors"] }); + queryClient.invalidateQueries({ queryKey: ["ibm-cos-defaults"] }); + }, + }); +} diff --git a/frontend/app/api/mutations/useSyncConnector.ts b/frontend/app/api/mutations/useSyncConnector.ts index 9ff22e47e..fe2c7ef2e 100644 --- a/frontend/app/api/mutations/useSyncConnector.ts +++ b/frontend/app/api/mutations/useSyncConnector.ts @@ -47,6 +47,10 @@ const syncConnector = async ({ size?: number; }>; settings?: any; + /** When true, ingest all files from the connector (bypasses the re-sync gate). */ + sync_all?: boolean; + /** Restrict ingest to these bucket names (IBM COS). */ + bucket_filter?: string[]; }; }): Promise => { const response = await fetch(`/api/connectors/${connectorType}/sync`, { diff --git a/frontend/app/api/queries/useIBMCOSBucketStatusQuery.ts b/frontend/app/api/queries/useIBMCOSBucketStatusQuery.ts new file mode 100644 index 000000000..2a4b81b89 --- /dev/null +++ b/frontend/app/api/queries/useIBMCOSBucketStatusQuery.ts @@ -0,0 +1,34 @@ +import { useQuery } from "@tanstack/react-query"; + +export interface IBMCOSBucketStatus { + name: string; + ingested_count: number; + is_synced: boolean; +} + +async function fetchIBMCOSBucketStatus( + connectionId: string, +): Promise { + const res = await fetch( + `/api/connectors/ibm_cos/${connectionId}/bucket-status`, + ); + if (!res.ok) { + const err = await res.json().catch(() => ({})); + throw new Error(err.error || "Failed to fetch bucket status"); + } + const data = await res.json(); + return data.buckets as IBMCOSBucketStatus[]; +} + +export function useIBMCOSBucketStatusQuery( + connectionId: string | null | undefined, + options?: { enabled?: boolean }, +) { + return useQuery({ + queryKey: ["ibm-cos-bucket-status", connectionId], + 
queryFn: () => fetchIBMCOSBucketStatus(connectionId!), + enabled: (options?.enabled ?? true) && !!connectionId, + staleTime: 0, + refetchOnMount: "always", + }); +} diff --git a/frontend/app/api/queries/useIBMCOSBucketsQuery.ts b/frontend/app/api/queries/useIBMCOSBucketsQuery.ts new file mode 100644 index 000000000..a7daa619f --- /dev/null +++ b/frontend/app/api/queries/useIBMCOSBucketsQuery.ts @@ -0,0 +1,23 @@ +import { useQuery } from "@tanstack/react-query"; + +async function fetchIBMCOSBuckets(connectionId: string): Promise { + const res = await fetch(`/api/connectors/ibm_cos/${connectionId}/buckets`); + if (!res.ok) { + const err = await res.json().catch(() => ({})); + throw new Error(err.error || "Failed to list buckets"); + } + const data = await res.json(); + return data.buckets as string[]; +} + +export function useIBMCOSBucketsQuery( + connectionId: string | null | undefined, + options?: { enabled?: boolean }, +) { + return useQuery({ + queryKey: ["ibm-cos-buckets", connectionId], + queryFn: () => fetchIBMCOSBuckets(connectionId!), + enabled: (options?.enabled ?? 
true) && !!connectionId, + staleTime: 30_000, + }); +} diff --git a/frontend/app/api/queries/useIBMCOSDefaultsQuery.ts b/frontend/app/api/queries/useIBMCOSDefaultsQuery.ts new file mode 100644 index 000000000..b44ee4c5b --- /dev/null +++ b/frontend/app/api/queries/useIBMCOSDefaultsQuery.ts @@ -0,0 +1,28 @@ +import { useQuery } from "@tanstack/react-query"; + +export interface IBMCOSDefaults { + api_key_set: boolean; + service_instance_id: string; + endpoint: string; + hmac_access_key_set: boolean; + hmac_secret_key_set: boolean; + auth_mode: "iam" | "hmac"; + bucket_names: string[]; + connection_id: string | null; + disable_iam: boolean; +} + +async function fetchIBMCOSDefaults(): Promise { + const res = await fetch("/api/connectors/ibm_cos/defaults"); + if (!res.ok) throw new Error("Failed to fetch IBM COS defaults"); + return res.json(); +} + +export function useIBMCOSDefaultsQuery(options?: { enabled?: boolean }) { + return useQuery({ + queryKey: ["ibm-cos-defaults"], + queryFn: fetchIBMCOSDefaults, + enabled: options?.enabled ?? 
true, + staleTime: 0, + }); +} diff --git a/frontend/app/knowledge/page.tsx b/frontend/app/knowledge/page.tsx index a634423e8..6a99deab8 100644 --- a/frontend/app/knowledge/page.tsx +++ b/frontend/app/knowledge/page.tsx @@ -41,6 +41,8 @@ import { formatFilesToDelete, } from "../../components/delete-confirmation-dialog"; import GoogleDriveIcon from "../../components/icons/google-drive-logo"; +import IBMLogo from "../../components/icons/ibm-logo"; +import IBMCOSIcon from "../../components/icons/ibm-cos-icon"; import OneDriveIcon from "../../components/icons/one-drive-logo"; import SharePointIcon from "../../components/icons/share-point-logo"; import { useDeleteDocument } from "../api/mutations/useDeleteDocument"; @@ -63,6 +65,8 @@ function getSourceIcon(connectorType?: string) { return ; case "s3": return ; + case "ibm_cos": + return ; default: return ( diff --git a/frontend/app/settings/_components/connector-card.tsx b/frontend/app/settings/_components/connector-card.tsx index aa132d99d..37c08dc77 100644 --- a/frontend/app/settings/_components/connector-card.tsx +++ b/frontend/app/settings/_components/connector-card.tsx @@ -1,6 +1,6 @@ "use client"; -import { Loader2, Plus, RefreshCcw, Trash2 } from "lucide-react"; +import { Loader2, Plus, RefreshCcw, Settings2, Trash2 } from "lucide-react"; import Link from "next/link"; import { Button } from "@/components/ui/button"; import { @@ -29,6 +29,8 @@ interface ConnectorCardProps { onConnect: (connector: Connector) => void; onDisconnect: (connector: Connector) => void; onNavigateToKnowledge: (connector: Connector) => void; + /** Optional: open a connector-specific settings/edit dialog */ + onConfigure?: (connector: Connector) => void; } export default function ConnectorCard({ @@ -38,6 +40,7 @@ export default function ConnectorCard({ onConnect, onDisconnect, onNavigateToKnowledge, + onConfigure, }: ConnectorCardProps) { console.log(connector); const isConnected = @@ -91,19 +94,32 @@ export default function ConnectorCard({ 
Add Knowledge - + {onConfigure ? ( + + ) : ( + + )} )} + ) : onConfigure ? ( + // Connector supports UI-based configuration (e.g. IBM COS) + ) : (

diff --git a/frontend/app/settings/_components/connector-cards.tsx b/frontend/app/settings/_components/connector-cards.tsx index 2e9f31a73..00f207584 100644 --- a/frontend/app/settings/_components/connector-cards.tsx +++ b/frontend/app/settings/_components/connector-cards.tsx @@ -1,7 +1,7 @@ "use client"; +import { useCallback, useState } from "react"; import { useRouter } from "next/navigation"; -import { useCallback } from "react"; import { useConnectConnectorMutation } from "@/app/api/mutations/useConnectConnectorMutation"; import { useDisconnectConnectorMutation } from "@/app/api/mutations/useDisconnectConnectorMutation"; import { @@ -9,15 +9,18 @@ import { useGetConnectorsQuery, } from "@/app/api/queries/useGetConnectorsQuery"; import GoogleDriveIcon from "@/components/icons/google-drive-logo"; +import IBMCOSIcon from "@/components/icons/ibm-cos-icon"; import OneDriveIcon from "@/components/icons/one-drive-logo"; import SharePointIcon from "@/components/icons/share-point-logo"; import { useAuth } from "@/contexts/auth-context"; import ConnectorCard, { type Connector } from "./connector-card"; import ConnectorsSkeleton from "./connectors-skeleton"; +import IBMCOSSettingsDialog from "./ibm-cos-settings-dialog"; export default function ConnectorCards() { const { isAuthenticated, isNoAuthMode } = useAuth(); const router = useRouter(); + const [ibmCOSDialogOpen, setIBMCOSDialogOpen] = useState(false); const { data: queryConnectors = [], isLoading: connectorsLoading } = useGetConnectorsQuery({ @@ -32,6 +35,7 @@ export default function ConnectorCards() { "google-drive": , sharepoint: , onedrive: , + "ibm-cos": , }; return ( iconMap[iconName] || ( @@ -63,37 +67,53 @@ export default function ConnectorCards() { router.push(`/upload/${provider}`); }; + // Connectors that use a settings dialog instead of OAuth for configuration + const getConfigureHandler = (connector: Connector) => { + if (connector.type === "ibm_cos") { + return () => setIBMCOSDialogOpen(true); + } + 
return undefined; + }; + if (!connectorsLoading && connectors.length === 0) { return null; } return ( -

- {connectorsLoading ? ( - <> - - - - - ) : ( - connectors.map((connector) => ( - - )) - )} -
+ <> +
+ {connectorsLoading ? ( + <> + + + + + ) : ( + connectors.map((connector) => ( + + )) + )} +
+ + + ); } diff --git a/frontend/app/settings/_components/ibm-cos-settings-dialog.tsx b/frontend/app/settings/_components/ibm-cos-settings-dialog.tsx new file mode 100644 index 000000000..0d6a9b367 --- /dev/null +++ b/frontend/app/settings/_components/ibm-cos-settings-dialog.tsx @@ -0,0 +1,198 @@ +"use client"; + +import { useState } from "react"; +import { FormProvider, useForm } from "react-hook-form"; +import { toast } from "sonner"; +import { useQueryClient } from "@tanstack/react-query"; +import IBMCOSIcon from "@/components/icons/ibm-cos-icon"; +import { Button } from "@/components/ui/button"; +import { + Dialog, + DialogContent, + DialogFooter, + DialogHeader, + DialogTitle, +} from "@/components/ui/dialog"; +import { IBMCOSSettingsForm, type IBMCOSFormData } from "./ibm-cos-settings-form"; +import { useIBMCOSDefaultsQuery } from "@/app/api/queries/useIBMCOSDefaultsQuery"; +import { useIBMCOSConfigureMutation } from "@/app/api/mutations/useIBMCOSConfigureMutation"; + +interface IBMCOSSettingsDialogProps { + open: boolean; + setOpen: (open: boolean) => void; +} + +export default function IBMCOSSettingsDialog({ + open, + setOpen, +}: IBMCOSSettingsDialogProps) { + const queryClient = useQueryClient(); + + // Fetch env-based defaults to pre-fill the form + const { data: defaults } = useIBMCOSDefaultsQuery({ enabled: open }); + + const disableIam = defaults?.disable_iam ?? false; + + const methods = useForm({ + mode: "onSubmit", + values: { + auth_mode: disableIam ? "hmac" : (defaults?.auth_mode ?? "hmac"), + endpoint: defaults?.endpoint ?? "", + api_key: "", + service_instance_id: defaults?.service_instance_id ?? "", + hmac_access_key: "", + hmac_secret_key: "", + }, + }); + + const { handleSubmit } = methods; + + // Bucket state + const [buckets, setBuckets] = useState( + defaults?.bucket_names?.length ? defaults.bucket_names : null, + ); + const [selectedBuckets, setSelectedBuckets] = useState( + defaults?.bucket_names ?? 
[], + ); + const [isFetchingBuckets, setIsFetchingBuckets] = useState(false); + const [bucketsError, setBucketsError] = useState(null); + const [formError, setFormError] = useState(null); + + const configureMutation = useIBMCOSConfigureMutation(); + + // Inline bucket test: calls the configure endpoint without saving, + // then uses the returned connection_id to list buckets. + const handleTestConnection = handleSubmit(async (data) => { + setIsFetchingBuckets(true); + setBucketsError(null); + setFormError(null); + + try { + // First configure (creates/updates connection) to get a connection_id + const result = await configureMutation.mutateAsync({ + auth_mode: data.auth_mode, + endpoint: data.endpoint, + api_key: data.api_key || undefined, + service_instance_id: data.service_instance_id || undefined, + hmac_access_key: data.hmac_access_key || undefined, + hmac_secret_key: data.hmac_secret_key || undefined, + connection_id: defaults?.connection_id ?? undefined, + }); + + // Then list buckets using the connection + const res = await fetch( + `/api/connectors/ibm_cos/${result.connection_id}/buckets`, + ); + const json = await res.json(); + if (!res.ok) throw new Error(json.error || "Failed to list buckets"); + + const fetchedBuckets: string[] = json.buckets; + setBuckets(fetchedBuckets); + + // Keep any previously selected buckets that still exist + setSelectedBuckets((prev) => + prev.filter((b) => fetchedBuckets.includes(b)), + ); + + // Refresh defaults so we have the new connection_id + queryClient.invalidateQueries({ queryKey: ["ibm-cos-defaults"] }); + } catch (err: any) { + setBucketsError(err.message ?? 
"Connection failed"); + } finally { + setIsFetchingBuckets(false); + } + }); + + const onSubmit = handleSubmit(async (data) => { + setFormError(null); + if (buckets === null) { + setFormError("Test the connection first to validate credentials."); + return; + } + + try { + // Refresh defaults to get latest connection_id + const latestDefaults = await queryClient.fetchQuery({ + queryKey: ["ibm-cos-defaults"], + queryFn: async () => { + const res = await fetch("/api/connectors/ibm_cos/defaults"); + return res.json(); + }, + staleTime: 0, + }); + + await configureMutation.mutateAsync({ + auth_mode: data.auth_mode, + endpoint: data.endpoint, + api_key: data.api_key || undefined, + service_instance_id: data.service_instance_id || undefined, + hmac_access_key: data.hmac_access_key || undefined, + hmac_secret_key: data.hmac_secret_key || undefined, + bucket_names: selectedBuckets, + connection_id: latestDefaults?.connection_id ?? defaults?.connection_id ?? undefined, + }); + + toast.success("IBM Cloud Object Storage configured", { + description: + selectedBuckets.length > 0 + ? `Will ingest from: ${selectedBuckets.join(", ")}` + : "Will auto-discover and ingest all accessible buckets.", + icon: , + }); + + queryClient.invalidateQueries({ queryKey: ["connectors"] }); + setOpen(false); + } catch (err: any) { + setFormError(err.message ?? "Failed to save configuration"); + } + }); + + return ( + + + +
+ + +
+ +
+ IBM Cloud Object Storage Setup +
+
+ + void} + apiKeySet={defaults?.api_key_set} + hmacAccessKeySet={defaults?.hmac_access_key_set} + hmacSecretKeySet={defaults?.hmac_secret_key_set} + formError={formError} + disableIam={disableIam} + /> + + + + + + +
+
+
+ ); +} diff --git a/frontend/app/settings/_components/ibm-cos-settings-form.tsx b/frontend/app/settings/_components/ibm-cos-settings-form.tsx new file mode 100644 index 000000000..481aad35b --- /dev/null +++ b/frontend/app/settings/_components/ibm-cos-settings-form.tsx @@ -0,0 +1,305 @@ +"use client"; + +import { useFormContext, Controller } from "react-hook-form"; +import { LabelWrapper } from "@/components/label-wrapper"; +import { Input } from "@/components/ui/input"; +import { Label } from "@/components/ui/label"; +import { Tabs, TabsList, TabsTrigger, TabsContent } from "@/components/ui/tabs"; +import { Loader2, RefreshCcw } from "lucide-react"; +import { Button } from "@/components/ui/button"; + +export interface IBMCOSFormData { + auth_mode: "iam" | "hmac"; + endpoint: string; + // IAM + api_key: string; + service_instance_id: string; + // HMAC + hmac_access_key: string; + hmac_secret_key: string; +} + +interface IBMCOSSettingsFormProps { + /** Available buckets after a successful test — null means not yet tested */ + buckets: string[] | null; + selectedBuckets: string[]; + onSelectedBucketsChange: (buckets: string[]) => void; + isFetchingBuckets: boolean; + bucketsError: string | null; + onTestConnection: () => void; + apiKeySet?: boolean; + hmacAccessKeySet?: boolean; + hmacSecretKeySet?: boolean; + formError?: string | null; + /** When true, IAM tab is greyed out and HMAC is the only selectable option */ + disableIam?: boolean; +} + +export function IBMCOSSettingsForm({ + buckets, + selectedBuckets, + onSelectedBucketsChange, + isFetchingBuckets, + bucketsError, + onTestConnection, + apiKeySet, + hmacAccessKeySet, + hmacSecretKeySet, + formError, + disableIam = false, +}: IBMCOSSettingsFormProps) { + const { + register, + control, + formState: { errors }, + } = useFormContext(); + + const toggleBucket = (name: string, checked: boolean) => { + if (checked) { + onSelectedBucketsChange([...selectedBuckets, name]); + } else { + 
onSelectedBucketsChange(selectedBuckets.filter((b) => b !== name)); + } + }; + + const toggleAll = (checked: boolean) => { + onSelectedBucketsChange(checked ? (buckets ?? []) : []); + }; + + return ( +
+ {/* Auth mode selector using Tabs */} +
+ + ( + { + if (disableIam && v === "iam") return; + field.onChange(v); + }} + > + + + HMAC + + Access Key + Secret Key + + + + IAM + + API Key + Resource Instance ID + + + + + {/* HMAC fields — first tab */} + +
+
+ + v?.trim() })} + id="ibm-cos-hmac-key" + type="password" + placeholder={ + hmacAccessKeySet + ? "•••••••• (loaded from env)" + : "cos_hmac_keys.access_key_id" + } + autoComplete="off" + /> + +
+
+ + v?.trim() })} + id="ibm-cos-hmac-secret" + type="password" + placeholder={ + hmacSecretKeySet + ? "•••••••• (loaded from env)" + : "cos_hmac_keys.secret_access_key" + } + autoComplete="off" + /> + +
+
+
+ + {/* IAM fields — second tab */} + +
+
+ + v?.trim() })} + id="ibm-cos-api-key" + type="password" + placeholder={ + apiKeySet + ? "•••••••• (loaded from env)" + : 'apikey value from Service Credentials' + } + autoComplete="off" + /> + +
+
+ + v?.trim() })} + id="ibm-cos-svc-id" + placeholder="crn:v1:bluemix:public:cloud-object-storage:..." + /> + +
+
+
+
+ )} + /> +
+ + {/* Endpoint — shared by both auth modes */} +
+ + v?.trim() })} + id="ibm-cos-endpoint" + placeholder="https://s3.us-south.cloud-object-storage.appdomain.cloud" + className={errors.endpoint ? "!border-destructive" : ""} + /> + + {errors.endpoint && ( +

{errors.endpoint.message}

+ )} +
+ + {/* Test connection */} + + + {bucketsError && ( +

+ {bucketsError} +

+ )} + + {formError && ( +

+ {formError} +

+ )} + + {/* Bucket selector — native checkboxes styled with Tailwind */} + {buckets !== null && ( +
+
+ + {buckets.length > 1 && ( + + )} +
+ + {buckets.length === 0 ? ( +

+ No buckets found for this account. +

+ ) : ( +
+ {buckets.map((bucket) => ( + + ))} +
+ )} +
+ )} +
+ ); +} diff --git a/frontend/app/upload/[provider]/page.tsx b/frontend/app/upload/[provider]/page.tsx index bf96214f7..277fc965e 100644 --- a/frontend/app/upload/[provider]/page.tsx +++ b/frontend/app/upload/[provider]/page.tsx @@ -1,11 +1,14 @@ "use client"; -import { AlertCircle, ArrowLeft } from "lucide-react"; +import { AlertCircle, ArrowLeft, RefreshCw } from "lucide-react"; import { useParams, useRouter } from "next/navigation"; -import { useEffect, useState } from "react"; +import { useState } from "react"; +import { toast } from "sonner"; +import { useQueryClient } from "@tanstack/react-query"; import { useSyncConnector } from "@/app/api/mutations/useSyncConnector"; import { useGetConnectorsQuery } from "@/app/api/queries/useGetConnectorsQuery"; import { useGetConnectorTokenQuery } from "@/app/api/queries/useGetConnectorTokenQuery"; +import { useIBMCOSBucketStatusQuery } from "@/app/api/queries/useIBMCOSBucketStatusQuery"; import { type CloudFile, UnifiedCloudPicker } from "@/components/cloud-picker"; import type { IngestSettings } from "@/components/cloud-picker/types"; import { Button } from "@/components/ui/button"; @@ -16,6 +19,179 @@ import { } from "@/components/ui/tooltip"; import { useTask } from "@/contexts/task-context"; +// Connectors that sync entire buckets/repositories without a file picker +const DIRECT_SYNC_PROVIDERS = ["ibm_cos"]; + +// --------------------------------------------------------------------------- +// IBM COS bucket list with sync status +// --------------------------------------------------------------------------- + +function IBMCOSBucketView({ + connector, + syncMutation, + addTask, + onBack, + onDone, +}: { + connector: any; + syncMutation: ReturnType; + addTask: (id: string) => void; + onBack: () => void; + onDone: () => void; +}) { + const queryClient = useQueryClient(); + const { data: buckets, isLoading, refetch } = useIBMCOSBucketStatusQuery( + connector.connectionId, + { enabled: true }, + ); + + const 
[syncingBucket, setSyncingBucket] = useState(null); + + const invalidateBucketStatus = () => { + queryClient.invalidateQueries({ queryKey: ["ibm-cos-bucket-status", connector.connectionId] }); + }; + + const syncAll = () => { + syncMutation.mutate( + { + connectorType: connector.type, + body: { + connection_id: connector.connectionId!, + selected_files: [], + sync_all: true, + }, + }, + { + onSuccess: (result) => { + invalidateBucketStatus(); + if (result.task_ids?.length) { + addTask(result.task_ids[0]); + onDone(); + } else { + toast.info("No files found in any bucket."); + } + }, + onError: (err) => { + toast.error(err instanceof Error ? err.message : "Sync failed"); + }, + }, + ); + }; + + const syncBucket = (bucketName: string) => { + setSyncingBucket(bucketName); + syncMutation.mutate( + { + connectorType: connector.type, + body: { + connection_id: connector.connectionId!, + selected_files: [], + bucket_filter: [bucketName], + }, + }, + { + onSuccess: (result) => { + setSyncingBucket(null); + invalidateBucketStatus(); + if (result.task_ids?.length) { + addTask(result.task_ids[0]); + onDone(); + } else { + toast.info(`No files found in bucket "${bucketName}".`); + } + }, + onError: (err) => { + setSyncingBucket(null); + toast.error(err instanceof Error ? err.message : "Sync failed"); + }, + }, + ); + }; + + return ( + <> +
+ +

+ Add from {connector.name} +

+
+ +
+
+

+ Select a bucket to ingest, or sync everything at once. +

+
+ + +
+
+ + {isLoading ? ( +
+
+
+ ) : !buckets?.length ? ( +
+ No buckets found. Check your IBM COS credentials and endpoint. +
+ ) : ( +
+ {buckets.map((bucket) => ( +
+
+
+

{bucket.name}

+ {bucket.ingested_count > 0 && ( +

+ {bucket.ingested_count} document{bucket.ingested_count !== 1 ? "s" : ""} ingested +

+ )} +
+
+ +
+ ))} +
+ )} +
+ + ); +} + // CloudFile interface is now imported from the unified cloud picker export default function UploadProviderPage() { @@ -31,6 +207,8 @@ export default function UploadProviderPage() { } = useGetConnectorsQuery(); const connector = connectors.find((c) => c.type === provider); + const isDirectSyncProvider = DIRECT_SYNC_PROVIDERS.includes(provider); + const { data: tokenData, isLoading: tokenLoading } = useGetConnectorTokenQuery( { @@ -42,7 +220,11 @@ export default function UploadProviderPage() { : undefined, }, { - enabled: !!connector && connector.status === "connected", + // Direct-sync providers (e.g. IBM COS) don't use OAuth tokens + enabled: + !!connector && + connector.status === "connected" && + !isDirectSyncProvider, }, ); @@ -61,7 +243,8 @@ export default function UploadProviderPage() { }); const accessToken = tokenData?.access_token || null; - const isLoading = connectorsLoading || tokenLoading; + const isLoading = + connectorsLoading || (!isDirectSyncProvider && tokenLoading); const isIngesting = syncMutation.isPending; // Error handling @@ -193,6 +376,19 @@ export default function UploadProviderPage() { ); } + // Direct-sync providers (e.g. IBM COS) show a bucket list with sync status. 
+ if (isDirectSyncProvider && connector.status === "connected") { + return ( + router.back()} + onDone={() => router.push("/knowledge")} + /> + ); + } + if (!accessToken) { return ( <> diff --git a/frontend/components/cloud-picker/file-item.tsx b/frontend/components/cloud-picker/file-item.tsx index 617e54433..fa4d079fc 100644 --- a/frontend/components/cloud-picker/file-item.tsx +++ b/frontend/components/cloud-picker/file-item.tsx @@ -2,6 +2,7 @@ import { FileText, Folder, Trash2 } from "lucide-react"; import GoogleDriveIcon from "@/components/icons/google-drive-logo"; +import IBMCOSIcon from "@/components/icons/ibm-cos-icon"; import OneDriveIcon from "@/components/icons/one-drive-logo"; import SharePointIcon from "@/components/icons/share-point-logo"; import { Button } from "@/components/ui/button"; @@ -54,6 +55,8 @@ const getProviderIcon = (provider: string) => { return ; case "sharepoint": return ; + case "ibm_cos": + return ; default: return ; } diff --git a/frontend/components/icons/ibm-cos-icon.tsx b/frontend/components/icons/ibm-cos-icon.tsx new file mode 100644 index 000000000..a8fc4b324 --- /dev/null +++ b/frontend/components/icons/ibm-cos-icon.tsx @@ -0,0 +1,63 @@ +export default function IBMCOSIcon(props: React.SVGProps) { + return ( + + + + + + + + + + + + + + + + + + + + + + + + ); +} diff --git a/pyproject.toml b/pyproject.toml index 422c052bd..dfb583aac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "fastapi>=0.115.0", "uvicorn>=0.35.0", "boto3>=1.35.0", + "ibm-cos-sdk>=2.13.0", "psutil>=7.0.0", "rich>=13.0.0", "textual>=0.45.0", diff --git a/src/api/connectors.py b/src/api/connectors.py index 73c096d44..921df2bd8 100644 --- a/src/api/connectors.py +++ b/src/api/connectors.py @@ -88,6 +88,11 @@ async def get_synced_file_ids_for_connector( class ConnectorSyncBody(BaseModel): max_files: Optional[int] = None selected_files: Optional[List[Any]] = None + # When True, ingest ALL files from the connector (bypasses the 
existing-files gate). + # Used by direct-sync providers like IBM COS on initial ingest. + sync_all: bool = False + # When set, only ingest files from these buckets (IBM COS specific). + bucket_filter: Optional[List[str]] = None async def list_connectors( @@ -200,6 +205,51 @@ async def connector_sync( jwt_token=jwt_token, file_infos=file_infos, ) + elif body.sync_all or body.bucket_filter: + # Full ingest: discover and ingest all files (or files from specific buckets). + # Used by direct-sync providers (IBM COS) on initial ingest or per-bucket sync. + logger.info( + "Full connector ingest requested", + connector_type=connector_type, + bucket_filter=body.bucket_filter, + ) + connector = await connector_service.get_connector(working_connection.connection_id) + if body.bucket_filter: + # List only files from the requested buckets, then sync_specific_files + original_buckets = connector.bucket_names + connector.bucket_names = body.bucket_filter + try: + all_file_ids = [] + page_token = None + while True: + result = await connector.list_files(page_token=page_token) + for f in result.get("files", []): + all_file_ids.append(f["id"]) + page_token = result.get("next_page_token") + if not page_token: + break + finally: + connector.bucket_names = original_buckets + + if not all_file_ids: + return JSONResponse( + {"status": "no_files", "message": "No files found in the selected buckets."}, + status_code=200, + ) + task_id = await connector_service.sync_specific_files( + working_connection.connection_id, + user.user_id, + all_file_ids, + jwt_token=jwt_token, + ) + else: + # sync_all: ingest everything the connector can see + task_id = await connector_service.sync_connector_files( + working_connection.connection_id, + user.user_id, + max_files=max_files, + jwt_token=jwt_token, + ) else: # No files specified - sync only files already in OpenSearch for this connector # This ensures deleted files stay deleted @@ -209,7 +259,7 @@ async def connector_sync( 
# --- src/api/connectors.py ---------------------------------------------------
# IBM COS-specific endpoints
# ---------------------------------------------------------------------------

class IBMCOSConfigureBody(BaseModel):
    """Request body for POST /connectors/ibm_cos/configure."""

    auth_mode: str  # "iam" or "hmac"
    endpoint: str
    # IAM fields
    api_key: Optional[str] = None
    service_instance_id: Optional[str] = None
    auth_endpoint: Optional[str] = None
    # HMAC fields
    hmac_access_key: Optional[str] = None
    hmac_secret_key: Optional[str] = None
    # Optional bucket selection
    bucket_names: Optional[List[str]] = None
    # Optional: update an existing connection instead of creating a new one
    connection_id: Optional[str] = None


def _list_cos_buckets(cfg: dict) -> List[str]:
    """Return all bucket names reachable with the credentials in ``cfg``.

    HMAC mode uses the resource API (pure S3 protocol, works with MinIO);
    IAM mode uses the client API (avoids the ibm_botocore discovery-call bug
    seen against the real IBM COS API). Exceptions propagate to the caller,
    which translates them into HTTP error responses.
    """
    from connectors.ibm_cos.auth import create_ibm_cos_client, create_ibm_cos_resource

    if cfg.get("auth_mode", "iam") == "hmac":
        cos = create_ibm_cos_resource(cfg)
        return [b.name for b in cos.buckets.all()]
    cos = create_ibm_cos_client(cfg)
    return [b["Name"] for b in cos.list_buckets().get("Buckets", [])]


async def ibm_cos_defaults(
    connector_service=Depends(get_connector_service),
    user: User = Depends(get_current_user),
):
    """Return current IBM COS env-var defaults for pre-filling the config dialog.

    Sensitive values (API key, HMAC secret) are masked — only whether they
    are set is reported, never the values themselves.
    """
    import os

    api_key = os.getenv("IBM_COS_API_KEY", "")
    service_instance_id = os.getenv("IBM_COS_SERVICE_INSTANCE_ID", "")
    endpoint = os.getenv("IBM_COS_ENDPOINT", "")
    hmac_access_key = os.getenv("IBM_COS_HMAC_ACCESS_KEY_ID", "")
    hmac_secret_key = os.getenv("IBM_COS_HMAC_SECRET_ACCESS_KEY", "")
    # IAM UI is opt-in: unless OPENRAG_IBM_COS_IAM_UI is truthy, IAM is hidden.
    disable_iam = os.getenv("OPENRAG_IBM_COS_IAM_UI", "").lower() not in ("1", "true", "yes")

    # Prefer values saved on an existing connection over env-var defaults.
    connections = await connector_service.connection_manager.list_connections(
        user_id=user.user_id, connector_type="ibm_cos"
    )
    conn_config = (connections[0].config or {}) if connections else {}

    def _pick(conn_key, env_val):
        """Prefer connection config value over env var."""
        return conn_config.get(conn_key) or env_val

    return JSONResponse({
        "api_key_set": bool(api_key or conn_config.get("api_key")),
        "service_instance_id": _pick("service_instance_id", service_instance_id),
        "endpoint": _pick("endpoint_url", endpoint),
        "hmac_access_key_set": bool(hmac_access_key or conn_config.get("hmac_access_key")),
        "hmac_secret_key_set": bool(hmac_secret_key or conn_config.get("hmac_secret_key")),
        # Previously used auth mode; default to hmac when IAM is disabled or unconfigured.
        "auth_mode": conn_config.get(
            "auth_mode",
            "hmac" if (disable_iam or not (api_key or conn_config.get("api_key"))) else "iam",
        ),
        "disable_iam": disable_iam,
        # Bucket selection saved on the existing connection (if any).
        "bucket_names": conn_config.get("bucket_names", []),
        "connection_id": connections[0].connection_id if connections else None,
    })


async def ibm_cos_configure(
    body: IBMCOSConfigureBody,
    connector_service=Depends(get_connector_service),
    user: User = Depends(get_current_user),
):
    """Create or update an IBM COS connection with explicit credentials.

    Tests the credentials by listing buckets, then persists the connection.
    Credentials are stored in the connection config dict (not env vars) so
    the connector works even without system-level env vars.

    Credential resolution order for each field: request body → environment
    variable → existing connection config for this user.
    """
    import os

    conn_config: dict = {
        "auth_mode": body.auth_mode,
        "endpoint_url": body.endpoint,
    }

    # An existing connection's config (if any) is the last-resort fallback.
    existing_connections = await connector_service.connection_manager.list_connections(
        user_id=user.user_id, connector_type="ibm_cos"
    )
    existing_cfg = (existing_connections[0].config or {}) if existing_connections else {}

    if body.auth_mode == "iam":
        api_key = body.api_key or os.getenv("IBM_COS_API_KEY") or existing_cfg.get("api_key")
        svc_id = (
            body.service_instance_id
            or os.getenv("IBM_COS_SERVICE_INSTANCE_ID")
            or existing_cfg.get("service_instance_id")
        )
        if not api_key or not svc_id:
            return JSONResponse(
                {"error": "IAM mode requires api_key and service_instance_id"},
                status_code=400,
            )
        conn_config["api_key"] = api_key
        conn_config["service_instance_id"] = svc_id
        if body.auth_endpoint:
            conn_config["auth_endpoint"] = body.auth_endpoint
    else:
        # HMAC mode
        hmac_access = (
            body.hmac_access_key
            or os.getenv("IBM_COS_HMAC_ACCESS_KEY_ID")
            or existing_cfg.get("hmac_access_key")
        )
        hmac_secret = (
            body.hmac_secret_key
            or os.getenv("IBM_COS_HMAC_SECRET_ACCESS_KEY")
            or existing_cfg.get("hmac_secret_key")
        )
        if not hmac_access or not hmac_secret:
            return JSONResponse(
                {"error": "HMAC mode requires hmac_access_key and hmac_secret_key"},
                status_code=400,
            )
        conn_config["hmac_access_key"] = hmac_access
        conn_config["hmac_secret_key"] = hmac_secret

    if body.bucket_names is not None:
        conn_config["bucket_names"] = body.bucket_names

    # Verify the credentials actually work before persisting anything.
    try:
        _list_cos_buckets(conn_config)
    except Exception as exc:
        return JSONResponse(
            {"error": f"Could not connect to IBM COS: {exc}"},
            status_code=400,
        )

    # Persist: update existing connection or create a new one.
    if body.connection_id:
        existing = await connector_service.connection_manager.get_connection(body.connection_id)
        if existing and existing.user_id == user.user_id:
            await connector_service.connection_manager.update_connection(
                connection_id=body.connection_id,
                config=conn_config,
            )
            # Evict cached connector so the next call gets a fresh instance.
            connector_service.connection_manager.active_connectors.pop(body.connection_id, None)
            return JSONResponse({"connection_id": body.connection_id, "status": "connected"})

    connection_id = await connector_service.connection_manager.create_connection(
        connector_type="ibm_cos",
        name="IBM Cloud Object Storage",
        config=conn_config,
        user_id=user.user_id,
    )
    return JSONResponse({"connection_id": connection_id, "status": "connected"})


async def ibm_cos_list_buckets(
    connection_id: str,
    connector_service=Depends(get_connector_service),
    user: User = Depends(get_current_user),
):
    """List all buckets accessible with the stored IBM COS credentials."""
    connection = await connector_service.connection_manager.get_connection(connection_id)
    if not connection or connection.user_id != user.user_id:
        return JSONResponse({"error": "Connection not found"}, status_code=404)
    if connection.connector_type != "ibm_cos":
        return JSONResponse({"error": "Not an IBM COS connection"}, status_code=400)

    try:
        return JSONResponse({"buckets": _list_cos_buckets(connection.config)})
    except Exception as exc:
        return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500)


async def ibm_cos_bucket_status(
    connection_id: str,
    connector_service=Depends(get_connector_service),
    session_manager=Depends(get_session_manager),
    user: User = Depends(get_current_user),
):
    """Return all buckets for an IBM COS connection with their ingestion status.

    Each entry includes the bucket name, whether it has been ingested
    (is_synced), and the count of indexed documents from that bucket.
    """
    connection = await connector_service.connection_manager.get_connection(connection_id)
    if not connection or connection.user_id != user.user_id:
        return JSONResponse({"error": "Connection not found"}, status_code=404)
    if connection.connector_type != "ibm_cos":
        return JSONResponse({"error": "Not an IBM COS connection"}, status_code=400)

    # 1. List all buckets from COS.
    try:
        all_buckets = _list_cos_buckets(connection.config)
    except Exception as exc:
        return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500)

    # 2. Count indexed documents per bucket from OpenSearch. IBM COS document
    #    IDs are composite "<bucket>::<key>" strings, so the bucket name is
    #    the part before the first "::".
    ingested_counts: dict = {}
    try:
        opensearch_client = session_manager.get_user_opensearch_client(
            user.user_id, user.jwt_token
        )
        query_body = {
            "size": 0,
            "query": {"term": {"connector_type": "ibm_cos"}},
            "aggs": {
                "doc_ids": {
                    "terms": {"field": "document_id", "size": 50000}
                }
            },
        }
        index_name = get_index_name(user.user_id)
        os_resp = opensearch_client.search(index=index_name, body=query_body)
        for bucket_entry in os_resp.get("aggregations", {}).get("doc_ids", {}).get("buckets", []):
            doc_id = bucket_entry["key"]
            if "::" in doc_id:
                bucket_name = doc_id.split("::")[0]
                ingested_counts[bucket_name] = ingested_counts.get(bucket_name, 0) + 1
    except Exception as exc:
        # Best effort: if OpenSearch is unavailable we still return the bucket
        # list, just with zero counts — but leave a trace for debugging.
        logger.debug(f"IBM COS bucket-status: could not query OpenSearch: {exc}")

    result = [
        {
            "name": bucket,
            "ingested_count": ingested_counts.get(bucket, 0),
            "is_synced": ingested_counts.get(bucket, 0) > 0,
        }
        for bucket in all_buckets
    ]
    return JSONResponse({"buckets": result})
a/src/connectors/__init__.py b/src/connectors/__init__.py index 83e77b8cd..a49dff219 100644 --- a/src/connectors/__init__.py +++ b/src/connectors/__init__.py @@ -2,10 +2,12 @@ from .google_drive import GoogleDriveConnector from .sharepoint import SharePointConnector from .onedrive import OneDriveConnector +from .ibm_cos import IBMCOSConnector __all__ = [ "BaseConnector", "GoogleDriveConnector", "SharePointConnector", "OneDriveConnector", + "IBMCOSConnector", ] diff --git a/src/connectors/connection_manager.py b/src/connectors/connection_manager.py index fd207be6e..35016e389 100644 --- a/src/connectors/connection_manager.py +++ b/src/connectors/connection_manager.py @@ -13,6 +13,7 @@ from .google_drive import GoogleDriveConnector from .sharepoint import SharePointConnector from .onedrive import OneDriveConnector +from .ibm_cos import IBMCOSConnector @dataclass @@ -351,6 +352,12 @@ def get_available_connector_types(self) -> Dict[str, Dict[str, Any]]: "icon": OneDriveConnector.CONNECTOR_ICON, "available": self._is_connector_available("onedrive"), }, + "ibm_cos": { + "name": IBMCOSConnector.CONNECTOR_NAME, + "description": IBMCOSConnector.CONNECTOR_DESCRIPTION, + "icon": IBMCOSConnector.CONNECTOR_ICON, + "available": self._is_connector_available("ibm_cos"), + }, } def _is_connector_available(self, connector_type: str) -> bool: @@ -379,6 +386,8 @@ def _create_connector(self, config: ConnectionConfig) -> BaseConnector: return SharePointConnector(config.config) elif config.connector_type == "onedrive": return OneDriveConnector(config.config) + elif config.connector_type == "ibm_cos": + return IBMCOSConnector(config.config) elif config.connector_type == "box": raise NotImplementedError("Box connector not implemented yet") elif config.connector_type == "dropbox": diff --git a/src/connectors/ibm_cos/__init__.py b/src/connectors/ibm_cos/__init__.py new file mode 100644 index 000000000..32f19defc --- /dev/null +++ b/src/connectors/ibm_cos/__init__.py @@ -0,0 +1,3 @@ +from 
.connector import IBMCOSConnector + +__all__ = ["IBMCOSConnector"] diff --git a/src/connectors/ibm_cos/auth.py b/src/connectors/ibm_cos/auth.py new file mode 100644 index 000000000..54012d724 --- /dev/null +++ b/src/connectors/ibm_cos/auth.py @@ -0,0 +1,183 @@ +"""IBM Cloud Object Storage authentication and client factory.""" + +import os +from typing import Dict, Any + +from utils.logging_config import get_logger + +logger = get_logger(__name__) + +# IAM auth endpoint default +_DEFAULT_AUTH_ENDPOINT = "https://iam.cloud.ibm.com/identity/token" + + +def _resolve_credentials(config: Dict[str, Any]): + """Resolve IBM COS credentials from config dict → environment variable fallback. + + Returns a dict with the resolved values needed to build a boto3 client/resource. + Raises ValueError if neither IAM nor HMAC credentials are available. + """ + endpoint_url = config.get("endpoint_url") or os.getenv("IBM_COS_ENDPOINT") + if not endpoint_url: + raise ValueError( + "IBM COS endpoint URL is required. Set IBM_COS_ENDPOINT or provide " + "'endpoint_url' in the connector config." + ) + + api_key = config.get("api_key") or os.getenv("IBM_COS_API_KEY") + service_instance_id = ( + config.get("service_instance_id") or os.getenv("IBM_COS_SERVICE_INSTANCE_ID") + ) + hmac_access_key = ( + config.get("hmac_access_key") or os.getenv("IBM_COS_HMAC_ACCESS_KEY_ID") + ) + hmac_secret_key = ( + config.get("hmac_secret_key") or os.getenv("IBM_COS_HMAC_SECRET_ACCESS_KEY") + ) + auth_endpoint = ( + config.get("auth_endpoint") + or os.getenv("IBM_COS_AUTH_ENDPOINT") + or _DEFAULT_AUTH_ENDPOINT + ) + + return { + "endpoint_url": endpoint_url, + "api_key": api_key, + "service_instance_id": service_instance_id, + "hmac_access_key": hmac_access_key, + "hmac_secret_key": hmac_secret_key, + "auth_endpoint": auth_endpoint, + } + + +def _build_resource(config: Dict[str, Any], creds: Dict[str, Any]): + """Build an S3-compatible resource using resolved credentials. 
+ + HMAC mode uses standard boto3 (no IBM-specific calls, pure S3 protocol). + IAM mode uses ibm_boto3 with OAuth signature. + """ + auth_mode = config.get("auth_mode", "iam") + + if auth_mode == "hmac": + if not (creds["hmac_access_key"] and creds["hmac_secret_key"]): + raise ValueError( + "HMAC mode requires hmac_access_key and hmac_secret_key." + ) + try: + import boto3 + except ImportError as exc: + raise ImportError( + "boto3 is required for IBM COS HMAC mode. " + "Install it with: pip install boto3" + ) from exc + logger.debug("Creating IBM COS resource with HMAC authentication (boto3)") + return boto3.resource( + "s3", + aws_access_key_id=creds["hmac_access_key"], + aws_secret_access_key=creds["hmac_secret_key"], + endpoint_url=creds["endpoint_url"], + ) + + # IAM mode (default) — requires ibm_boto3 for OAuth token handling + try: + import ibm_boto3 + from ibm_botocore.client import Config + except ImportError as exc: + raise ImportError( + "ibm-cos-sdk is required for IBM COS IAM mode. " + "Install it with: pip install ibm-cos-sdk" + ) from exc + if not (creds["api_key"] and creds["service_instance_id"]): + raise ValueError( + "IAM mode requires api_key and service_instance_id." + ) + logger.debug("Creating IBM COS resource with IAM authentication (ibm_boto3)") + return ibm_boto3.resource( + "s3", + ibm_api_key_id=creds["api_key"], + ibm_service_instance_id=creds["service_instance_id"], + ibm_auth_endpoint=creds["auth_endpoint"], + config=Config(signature_version="oauth"), + endpoint_url=creds["endpoint_url"], + ) + + +def _build_client(config: Dict[str, Any], creds: Dict[str, Any]): + """Build an S3-compatible client using resolved credentials. + + HMAC mode uses standard boto3 (no IBM-specific calls, pure S3 protocol). + IAM mode uses ibm_boto3 with OAuth signature. 
+ """ + auth_mode = config.get("auth_mode", "iam") + + if auth_mode == "hmac": + if not (creds["hmac_access_key"] and creds["hmac_secret_key"]): + raise ValueError( + "HMAC mode requires hmac_access_key and hmac_secret_key." + ) + try: + import boto3 + except ImportError as exc: + raise ImportError( + "boto3 is required for IBM COS HMAC mode. " + "Install it with: pip install boto3" + ) from exc + logger.debug("Creating IBM COS client with HMAC authentication (boto3)") + return boto3.client( + "s3", + aws_access_key_id=creds["hmac_access_key"], + aws_secret_access_key=creds["hmac_secret_key"], + endpoint_url=creds["endpoint_url"], + ) + + # IAM mode (default) — requires ibm_boto3 for OAuth token handling + try: + import ibm_boto3 + from ibm_botocore.client import Config + except ImportError as exc: + raise ImportError( + "ibm-cos-sdk is required for IBM COS IAM mode. " + "Install it with: pip install ibm-cos-sdk" + ) from exc + if not (creds["api_key"] and creds["service_instance_id"]): + raise ValueError( + "IAM mode requires api_key and service_instance_id." + ) + logger.debug("Creating IBM COS client with IAM authentication (ibm_boto3)") + return ibm_boto3.client( + "s3", + ibm_api_key_id=creds["api_key"], + ibm_service_instance_id=creds["service_instance_id"], + ibm_auth_endpoint=creds["auth_endpoint"], + config=Config(signature_version="oauth"), + endpoint_url=creds["endpoint_url"], + ) + + +def create_ibm_cos_resource(config: Dict[str, Any]): + """Return an S3 resource handle (high-level API). + + HMAC mode returns a standard boto3.resource (pure S3, no IBM discovery calls). + IAM mode returns an ibm_boto3.resource (OAuth token handling). + + Auth mode is determined by config["auth_mode"]: + - "iam" (default): IBM_COS_API_KEY + IBM_COS_SERVICE_INSTANCE_ID + - "hmac": IBM_COS_HMAC_ACCESS_KEY_ID + IBM_COS_HMAC_SECRET_ACCESS_KEY + + Resolution order for each credential: config dict → environment variable. 
# --- src/connectors/ibm_cos/auth.py (public factories) ----------------------

def create_ibm_cos_resource(config: Dict[str, Any]):
    """Return an S3 resource handle (high-level API).

    HMAC mode returns a standard boto3.resource (pure S3, no IBM discovery calls).
    IAM mode returns an ibm_boto3.resource (OAuth token handling).

    Auth mode is determined by config["auth_mode"]:
    - "iam" (default): IBM_COS_API_KEY + IBM_COS_SERVICE_INSTANCE_ID
    - "hmac": IBM_COS_HMAC_ACCESS_KEY_ID + IBM_COS_HMAC_SECRET_ACCESS_KEY

    Resolution order for each credential: config dict → environment variable.
    """
    creds = _resolve_credentials(config)
    return _build_resource(config, creds)


def create_ibm_cos_client(config: Dict[str, Any]):
    """Return an S3 low-level client.

    HMAC mode returns a standard boto3.client (pure S3, no IBM discovery calls).
    IAM mode returns an ibm_boto3.client (OAuth token handling).

    Used by API endpoints that need raw client operations (e.g. get_object_acl).
    For bucket/object listing and download, prefer create_ibm_cos_resource().
    """
    creds = _resolve_credentials(config)
    return _build_client(config, creds)


# --- src/connectors/ibm_cos/connector.py -------------------------------------
"""IBM Cloud Object Storage connector for OpenRAG."""

import mimetypes
import os
from datetime import datetime, timezone
from posixpath import basename
from typing import Any, Dict, List, Optional

from connectors.base import BaseConnector, ConnectorDocument, DocumentACL
from utils.logging_config import get_logger

from .auth import create_ibm_cos_client, create_ibm_cos_resource

logger = get_logger(__name__)

# Separator used in composite file IDs: "<bucket>::<key>"
_ID_SEPARATOR = "::"


def _make_file_id(bucket: str, key: str) -> str:
    """Compose a connector file ID from bucket and object key."""
    return f"{bucket}{_ID_SEPARATOR}{key}"


def _split_file_id(file_id: str):
    """Split a composite file ID into (bucket, key). Raises ValueError if invalid."""
    if _ID_SEPARATOR not in file_id:
        raise ValueError(f"Invalid IBM COS file ID (missing separator): {file_id!r}")
    bucket, key = file_id.split(_ID_SEPARATOR, 1)
    return bucket, key


class IBMCOSConnector(BaseConnector):
    """Connector for IBM Cloud Object Storage.

    Supports IAM (API key) and HMAC credential modes. Credentials are read
    from the connector config dict first, then from environment variables.

    Config dict keys:
        bucket_names (list[str]): Buckets to ingest from. When empty, all
            accessible buckets are auto-discovered (see _resolve_bucket_names).
        prefix (str): Optional object key prefix filter.
        endpoint_url (str): Overrides IBM_COS_ENDPOINT.
        api_key (str): Overrides IBM_COS_API_KEY.
        service_instance_id (str): Overrides IBM_COS_SERVICE_INSTANCE_ID.
        hmac_access_key (str): HMAC mode – overrides IBM_COS_HMAC_ACCESS_KEY_ID.
        hmac_secret_key (str): HMAC mode – overrides IBM_COS_HMAC_SECRET_ACCESS_KEY.
        connection_id (str): Connection identifier used for logging.
    """

    CONNECTOR_NAME = "IBM Cloud Object Storage"
    CONNECTOR_DESCRIPTION = "Add knowledge from IBM Cloud Object Storage"
    CONNECTOR_ICON = "ibm-cos"

    # BaseConnector uses these to check env-var availability for IAM mode.
    # HMAC-only setups will show as "unavailable" in the UI but can still be
    # used when credentials are supplied in the config dict directly.
    CLIENT_ID_ENV_VAR = "IBM_COS_API_KEY"
    CLIENT_SECRET_ENV_VAR = "IBM_COS_SERVICE_INSTANCE_ID"

    def get_client_id(self) -> str:
        """Return IAM API key, or HMAC access key ID as fallback."""
        val = os.getenv("IBM_COS_API_KEY") or os.getenv("IBM_COS_HMAC_ACCESS_KEY_ID")
        if val:
            return val
        raise ValueError(
            "IBM COS credentials not set. Provide IBM_COS_API_KEY (IAM) "
            "or IBM_COS_HMAC_ACCESS_KEY_ID (HMAC)."
        )

    def get_client_secret(self) -> str:
        """Return IAM service instance ID, or HMAC secret key as fallback."""
        val = os.getenv("IBM_COS_SERVICE_INSTANCE_ID") or os.getenv("IBM_COS_HMAC_SECRET_ACCESS_KEY")
        if val:
            return val
        raise ValueError(
            "IBM COS credentials not set. Provide IBM_COS_SERVICE_INSTANCE_ID (IAM) "
            "or IBM_COS_HMAC_SECRET_ACCESS_KEY (HMAC)."
        )

    def __init__(self, config: Dict[str, Any]):
        if config is None:
            config = {}
        super().__init__(config)

        self.bucket_names: List[str] = config.get("bucket_names") or []
        self.prefix: str = config.get("prefix", "")
        self.connection_id: str = config.get("connection_id", "default")

        # Resolved service instance ID used as ACL owner fallback
        self._service_instance_id: str = (
            config.get("service_instance_id")
            or os.getenv("IBM_COS_SERVICE_INSTANCE_ID", "")
        )

        self._handle = None  # Lazy-initialised on first use
        # IAM mode uses the low-level client to avoid internal service-instance
        # discovery calls that cause XML-parse errors against the real IBM COS API.
        # HMAC mode uses the resource API (confirmed working with MinIO and S3).
        self._is_hmac: bool = (config.get("auth_mode", "iam") == "hmac")

    def _get_handle(self):
        """Return (and cache) the appropriate boto3 handle for the configured auth mode.

        - HMAC → resource (S3-compatible, works with MinIO)
        - IAM → client (avoids ibm_botocore service-discovery calls that break
          against the real IBM COS API)
        """
        if self._handle is None:
            if self._is_hmac:
                self._handle = create_ibm_cos_resource(self.config)
            else:
                self._handle = create_ibm_cos_client(self.config)
        return self._handle

    # ------------------------------------------------------------------
    # BaseConnector abstract method implementations
    # ------------------------------------------------------------------

    async def authenticate(self) -> bool:
        """Validate credentials by listing buckets on the COS service."""
        try:
            handle = self._get_handle()
            if self._is_hmac:
                list(handle.buckets.all())  # resource API
            else:
                handle.list_buckets()  # client API
            self._authenticated = True
            logger.debug(f"IBM COS authenticated for connection {self.connection_id}")
            return True
        except Exception as exc:
            logger.warning(f"IBM COS authentication failed: {exc}")
            self._authenticated = False
            return False

    def _resolve_bucket_names(self) -> List[str]:
        """Return configured bucket names, or auto-discover all accessible buckets."""
        if self.bucket_names:
            return self.bucket_names
        try:
            handle = self._get_handle()
            if self._is_hmac:
                buckets = [b.name for b in handle.buckets.all()]
            else:
                resp = handle.list_buckets()
                buckets = [b["Name"] for b in resp.get("Buckets", [])]
            logger.debug(f"IBM COS auto-discovered {len(buckets)} bucket(s): {buckets}")
            return buckets
        except Exception as exc:
            logger.warning(f"IBM COS could not auto-discover buckets: {exc}")
            return []

    async def list_files(
        self,
        page_token: Optional[str] = None,
        max_files: Optional[int] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """List objects across all configured (or auto-discovered) buckets.

        ``page_token`` is accepted for interface compatibility but unused:
        pagination is handled internally (resource iterator for HMAC, manual
        continuation tokens for IAM), so all objects are returned in one call.

        Returns:
            dict with keys:
                "files": list of file dicts (id, name, bucket, key, size,
                    modified_time)
                "next_page_token": always None
        """
        handle = self._get_handle()
        files: List[Dict[str, Any]] = []
        bucket_names = self._resolve_bucket_names()

        for bucket_name in bucket_names:
            try:
                if self._is_hmac:
                    # resource API: Bucket.objects.all() handles pagination internally
                    bucket = handle.Bucket(bucket_name)
                    objects = (
                        bucket.objects.filter(Prefix=self.prefix)
                        if self.prefix
                        else bucket.objects.all()
                    )
                    for obj in objects:
                        # Skip zero-byte "directory" placeholder keys
                        if obj.key.endswith("/"):
                            continue
                        files.append(
                            {
                                "id": _make_file_id(bucket_name, obj.key),
                                "name": basename(obj.key) or obj.key,
                                "bucket": bucket_name,
                                "key": obj.key,
                                "size": obj.size,
                                "modified_time": obj.last_modified.isoformat()
                                if obj.last_modified
                                else None,
                            }
                        )
                        if max_files and len(files) >= max_files:
                            return {"files": files, "next_page_token": None}
                else:
                    # client API: list_objects_v2 with manual pagination.
                    # NOTE: named list_params (not kwargs) to avoid shadowing
                    # the **kwargs parameter of this method.
                    list_params: Dict[str, Any] = {"Bucket": bucket_name}
                    if self.prefix:
                        list_params["Prefix"] = self.prefix
                    while True:
                        resp = handle.list_objects_v2(**list_params)
                        for obj in resp.get("Contents", []):
                            key = obj["Key"]
                            if key.endswith("/"):
                                continue
                            files.append(
                                {
                                    "id": _make_file_id(bucket_name, key),
                                    "name": basename(key) or key,
                                    "bucket": bucket_name,
                                    "key": key,
                                    "size": obj.get("Size", 0),
                                    "modified_time": obj["LastModified"].isoformat()
                                    if obj.get("LastModified")
                                    else None,
                                }
                            )
                            if max_files and len(files) >= max_files:
                                return {"files": files, "next_page_token": None}
                        if resp.get("IsTruncated"):
                            list_params["ContinuationToken"] = resp["NextContinuationToken"]
                        else:
                            break

            except Exception as exc:
                logger.error(f"Failed to list objects in bucket {bucket_name!r}: {exc}")
                continue

        return {"files": files, "next_page_token": None}

    async def get_file_content(self, file_id: str) -> ConnectorDocument:
        """Download an object from IBM COS and return a ConnectorDocument.

        Args:
            file_id: Composite ID in the form "<bucket>::<key>".

        Returns:
            ConnectorDocument with content bytes, ACL, and metadata.

        Raises:
            ValueError: if file_id lacks the "::" separator.
        """
        bucket_name, key = _split_file_id(file_id)
        handle = self._get_handle()

        # Both client.get_object() and resource.Object().get() return the same
        # response dict: Body stream + ContentType, ContentLength, LastModified.
        if self._is_hmac:
            response = handle.Object(bucket_name, key).get()  # resource
        else:
            response = handle.get_object(Bucket=bucket_name, Key=key)  # client
        content: bytes = response["Body"].read()

        last_modified: datetime = response.get("LastModified") or datetime.now(timezone.utc)
        size: int = response.get("ContentLength", len(content))

        # MIME type detection: prefer filename extension over generic S3 content-type.
        # IBM COS often stores "application/octet-stream" for all objects regardless
        # of their real type, so we treat that as "unknown" and fall back to the
        # extension-based guess which is more reliable for named files.
        raw_content_type = response.get("ContentType", "")
        if raw_content_type and raw_content_type != "application/octet-stream":
            mime_type: str = raw_content_type
        else:
            mime_type = mimetypes.guess_type(key)[0] or "application/octet-stream"

        filename = basename(key) or key

        acl = await self._extract_acl(bucket_name, key)

        return ConnectorDocument(
            id=file_id,
            filename=filename,
            mimetype=mime_type,
            content=content,
            source_url=f"cos://{bucket_name}/{key}",
            acl=acl,
            modified_time=last_modified,
            created_time=last_modified,  # IBM COS does not expose creation time
            metadata={
                "ibm_cos_bucket": bucket_name,
                "ibm_cos_key": key,
                "size": size,
            },
        )

    async def _extract_acl(self, bucket: str, key: str) -> DocumentACL:
        """Fetch object ACL from IBM COS and map it to DocumentACL.

        Grantees with FULL_CONTROL or READ become allowed_users. Falls back to
        a minimal ACL (owner = service instance ID) on any failure.
        """
        try:
            handle = self._get_handle()
            # For resource (HMAC), access the underlying client via meta.client.
            # For client (IAM), call directly.
            client = handle.meta.client if self._is_hmac else handle
            acl_response = client.get_object_acl(Bucket=bucket, Key=key)

            owner_id: str = (
                acl_response.get("Owner", {}).get("DisplayName")
                or acl_response.get("Owner", {}).get("ID")
                or self._service_instance_id
            )

            allowed_users: List[str] = []
            for grant in acl_response.get("Grants", []):
                grantee = grant.get("Grantee", {})
                permission = grant.get("Permission", "")
                if permission in ("FULL_CONTROL", "READ"):
                    user_id = (
                        grantee.get("DisplayName")
                        or grantee.get("ID")
                        or grantee.get("EmailAddress")
                    )
                    if user_id and user_id not in allowed_users:
                        allowed_users.append(user_id)

            return DocumentACL(
                owner=owner_id,
                allowed_users=allowed_users,
                allowed_groups=[],
            )
        except Exception as exc:
            logger.warning(
                f"Could not fetch ACL for cos://{bucket}/{key}: {exc}. "
                "Using fallback ACL."
            )
            return DocumentACL(
                owner=self._service_instance_id or None,
                allowed_users=[],
                allowed_groups=[],
            )

    # ------------------------------------------------------------------
    # Webhook / subscription (stub — IBM COS events require IBM Event
    # Notifications service; not in scope for this connector version)
    # ------------------------------------------------------------------

    async def setup_subscription(self) -> str:
        """No-op: IBM COS event notifications are out of scope for this connector."""
        return ""

    async def handle_webhook(self, payload: Dict[str, Any]) -> List[str]:
        """No-op: webhooks are not supported in this connector version."""
        return []

    def extract_webhook_channel_id(
        self, payload: Dict[str, Any], headers: Dict[str, str]
    ) -> Optional[str]:
        """No-op: no webhook channel to identify."""
        return None

    async def cleanup_subscription(self, subscription_id: str) -> bool:
        """No-op: no subscription to clean up."""
        return True


if __name__ == "__main__":
    # Ad-hoc smoke test: relies on IBM_COS_* environment variables.
    # authenticate() and list_files() are coroutines — they must be awaited,
    # otherwise only coroutine objects are printed.
    import asyncio

    async def _demo() -> None:
        connector = IBMCOSConnector({})
        print(await connector.authenticate())
        print(await connector.list_files())

    asyncio.run(_demo())
IBM COS) receive pre-filtered file IDs and must NOT call list_files() + # here — doing so would re-list all files from all buckets, overwriting the + # carefully selected IDs passed in. + if cfg is not None: + try: cfg.file_ids = file_ids # type: ignore cfg.folder_ids = None # type: ignore - # Get the expanded list of file IDs (folders will be expanded to their contents) - # This uses the connector's list_files() which calls _iter_selected_items() - result = await connector.list_files() - expanded_file_ids = [f["id"] for f in result.get("files", [])] + # Expand file IDs — folders become their individual file contents + result = await connector.list_files() + expanded_file_ids = [f["id"] for f in result.get("files", [])] + + if not expanded_file_ids: + logger.warning( + f"No files found after expanding file_ids. " + f"Original IDs: {file_ids}. This may indicate all IDs were folders " + f"with no contents, or files that were filtered out." + ) + # If we have file_infos with download URLs, use original file_ids + # (OneDrive sharing IDs can't be expanded but can be downloaded directly) + if file_infos: + logger.info("Using original file IDs with cached download URLs") + expanded_file_ids = file_ids + else: + raise ValueError("No files to sync after expanding folders") - if not expanded_file_ids: - logger.warning( - f"No files found after expanding file_ids. " - f"Original IDs: {file_ids}. This may indicate all IDs were folders " - f"with no contents, or files that were filtered out." 
- ) - # If we have file_infos with download URLs, use original file_ids - # (OneDrive sharing IDs can't be expanded but can be downloaded directly) - if file_infos: - logger.info("Using original file IDs with cached download URLs") - expanded_file_ids = file_ids - else: - raise ValueError("No files to sync after expanding folders") - - except Exception as e: - logger.error(f"Failed to expand file_ids via list_files(): {e}") - # Fallback to original file_ids if expansion fails - expanded_file_ids = file_ids - finally: - # Restore original config values - if cfg is not None: + except Exception as e: + logger.error(f"Failed to expand file_ids via list_files(): {e}") + # Fallback to original file_ids if expansion fails + expanded_file_ids = file_ids + finally: cfg.file_ids = original_file_ids # type: ignore cfg.folder_ids = original_folder_ids # type: ignore diff --git a/src/main.py b/src/main.py index 598263f78..016a86ff7 100644 --- a/src/main.py +++ b/src/main.py @@ -762,6 +762,11 @@ async def create_app(): # Connector endpoints app.add_api_route("/connectors", connectors.list_connectors, methods=["GET"], tags=["internal"]) + # IBM COS-specific routes (registered before generic /{connector_type}/... 
to avoid shadowing) + app.add_api_route("/connectors/ibm_cos/defaults", connectors.ibm_cos_defaults, methods=["GET"], tags=["internal"]) + app.add_api_route("/connectors/ibm_cos/configure", connectors.ibm_cos_configure, methods=["POST"], tags=["internal"]) + app.add_api_route("/connectors/ibm_cos/{connection_id}/buckets", connectors.ibm_cos_list_buckets, methods=["GET"], tags=["internal"]) + app.add_api_route("/connectors/ibm_cos/{connection_id}/bucket-status", connectors.ibm_cos_bucket_status, methods=["GET"], tags=["internal"]) app.add_api_route("/connectors/{connector_type}/sync", connectors.connector_sync, methods=["POST"], tags=["internal"]) app.add_api_route("/connectors/sync-all", connectors.sync_all_connectors, methods=["POST"], tags=["internal"]) app.add_api_route("/connectors/{connector_type}/status", connectors.connector_status, methods=["GET"], tags=["internal"]) diff --git a/src/services/auth_service.py b/src/services/auth_service.py index d2fe43971..277638c6a 100644 --- a/src/services/auth_service.py +++ b/src/services/auth_service.py @@ -1,4 +1,3 @@ -import os import uuid import json import httpx @@ -20,6 +19,9 @@ from connectors.onedrive import OneDriveConnector from connectors.sharepoint import SharePointConnector +# Connectors that authenticate directly (no OAuth redirect required) +_DIRECT_AUTH_CONNECTORS = {"ibm_cos"} + class AuthService: def __init__(self, session_manager: SessionManager, connector_service=None, langflow_mcp_service: LangflowMCPService | None = None): @@ -56,6 +58,7 @@ async def init_oauth( "google_drive", "onedrive", "sharepoint", + "ibm_cos", ]: raise ValueError(f"Unsupported connector type: {connector_type}") elif purpose not in ["app_auth", "data_source"]: @@ -91,6 +94,10 @@ async def init_oauth( ) ) + # Direct-auth connectors (HMAC/API-key based, no OAuth redirect) + if connector_type in _DIRECT_AUTH_CONNECTORS: + return await self._init_direct_connection(connector_type, connection_id) + # Get OAuth configuration from 
connector and OAuth classes import os @@ -147,6 +154,61 @@ def _assert_env_key(name, val): return {"connection_id": connection_id, "oauth_config": oauth_config} + async def _init_direct_connection( + self, connector_type: str, connection_id: str + ) -> dict: + """Authenticate a non-OAuth connector immediately using env var credentials. + + Creates the connection record (already done by the caller) and verifies + that the credentials work by calling authenticate() on the connector. + Returns a response without oauth_config so the frontend knows no redirect + is needed. + """ + try: + connection_config = ( + await self.connector_service.connection_manager.get_connection( + connection_id + ) + ) + if not connection_config: + raise ValueError("Connection not found") + + connector = self.connector_service.connection_manager._create_connector( + connection_config + ) + authenticated = await connector.authenticate() + if not authenticated: + # Remove the connection so the user can retry after fixing credentials + await self.connector_service.connection_manager.delete_connection( + connection_id + ) + raise ValueError( + f"Could not authenticate with {connector_type}. " + "Check that your credentials and endpoint are correct." 
+ ) + + # Cache the authenticated connector + self.connector_service.connection_manager.active_connectors[ + connection_id + ] = connector + + except ValueError: + raise + except Exception as exc: + await self.connector_service.connection_manager.delete_connection( + connection_id + ) + raise ValueError( + f"Failed to connect {connector_type}: {exc}" + ) from exc + + return { + "connection_id": connection_id, + "status": "connected", + "connector_type": connector_type, + # No oauth_config — frontend must not attempt an OAuth redirect + } + async def handle_oauth_callback( self, connection_id: str, @@ -414,7 +476,7 @@ async def _handle_data_source_auth( else: logger.warning("_handle_data_source_auth: _detect_base_url returned None") else: - logger.warning(f"_handle_data_source_auth: Connector not available or doesn't have _detect_base_url") + logger.warning("_handle_data_source_auth: Connector not available or doesn't have _detect_base_url") # Clear the cached connector so next get_connector() creates a fresh instance # with the updated config (including base_url) diff --git a/src/tui/config_fields.py b/src/tui/config_fields.py index c13e2e126..6a78a883e 100644 --- a/src/tui/config_fields.py +++ b/src/tui/config_fields.py @@ -183,6 +183,36 @@ class ConfigSection: ), ], advanced=True, gate_prompt="Configure AWS credentials?"), + # ── IBM Cloud Object Storage ───────────────────────────────── + ConfigSection("IBM Cloud Object Storage", [ + ConfigField( + "ibm_cos_api_key", "IBM_COS_API_KEY", "API Key", + placeholder="", + helper_text="Create API key at https://cloud.ibm.com/iam/apikeys", + secret=True, + ), + ConfigField( + "ibm_cos_service_instance_id", "IBM_COS_SERVICE_INSTANCE_ID", + "Service Instance ID (CRN)", + placeholder="crn:v1:bluemix:...", + ), + ConfigField( + "ibm_cos_endpoint", "IBM_COS_ENDPOINT", "Service Endpoint", + placeholder="https://s3.us-south.cloud-object-storage.appdomain.cloud", + helper_text="Endpoints: 
https://cloud.ibm.com/docs/cloud-object-storage?topic=cloud-object-storage-endpoints", + ), + ConfigField( + "ibm_cos_hmac_access_key_id", "IBM_COS_HMAC_ACCESS_KEY_ID", + "HMAC Access Key ID (optional)", + placeholder="", + ), + ConfigField( + "ibm_cos_hmac_secret_access_key", "IBM_COS_HMAC_SECRET_ACCESS_KEY", + "HMAC Secret Access Key (optional)", + placeholder="", secret=True, + ), + ], advanced=True, gate_prompt="Configure IBM Cloud Object Storage?"), + # ── Langfuse ──────────────────────────────────────────────── ConfigSection("Langfuse", [ ConfigField( diff --git a/src/tui/managers/env_manager.py b/src/tui/managers/env_manager.py index b940f2a0e..b16c38af2 100644 --- a/src/tui/managers/env_manager.py +++ b/src/tui/managers/env_manager.py @@ -59,6 +59,13 @@ class EnvConfig: aws_secret_access_key: str = "" langflow_public_url: str = "" + # IBM Cloud Object Storage settings + ibm_cos_api_key: str = "" + ibm_cos_service_instance_id: str = "" + ibm_cos_endpoint: str = "" + ibm_cos_hmac_access_key_id: str = "" + ibm_cos_hmac_secret_access_key: str = "" + # Langfuse settings (optional) langfuse_secret_key: str = "" langfuse_public_key: str = "" @@ -194,6 +201,11 @@ def load_existing_env(self) -> bool: "AWS_ACCESS_KEY_ID": "aws_access_key_id", "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key", # pragma: allowlist secret "LANGFLOW_PUBLIC_URL": "langflow_public_url", + "IBM_COS_API_KEY": "ibm_cos_api_key", # pragma: allowlist secret + "IBM_COS_SERVICE_INSTANCE_ID": "ibm_cos_service_instance_id", + "IBM_COS_ENDPOINT": "ibm_cos_endpoint", + "IBM_COS_HMAC_ACCESS_KEY_ID": "ibm_cos_hmac_access_key_id", + "IBM_COS_HMAC_SECRET_ACCESS_KEY": "ibm_cos_hmac_secret_access_key", # pragma: allowlist secret "OPENRAG_DOCUMENTS_PATHS": "openrag_documents_paths", "OPENRAG_DOCUMENTS_PATH": "openrag_documents_path", "OPENRAG_KEYS_PATH": "openrag_keys_path", @@ -521,6 +533,11 @@ def save_env_file(self) -> bool: ("AWS_ACCESS_KEY_ID", self.config.aws_access_key_id), ("AWS_SECRET_ACCESS_KEY", 
self.config.aws_secret_access_key), ("LANGFLOW_PUBLIC_URL", self.config.langflow_public_url), + ("IBM_COS_API_KEY", self.config.ibm_cos_api_key), + ("IBM_COS_SERVICE_INSTANCE_ID", self.config.ibm_cos_service_instance_id), + ("IBM_COS_ENDPOINT", self.config.ibm_cos_endpoint), + ("IBM_COS_HMAC_ACCESS_KEY_ID", self.config.ibm_cos_hmac_access_key_id), + ("IBM_COS_HMAC_SECRET_ACCESS_KEY", self.config.ibm_cos_hmac_secret_access_key), ] optional_written = False diff --git a/src/utils/file_utils.py b/src/utils/file_utils.py index 764f90694..44d0bb98e 100644 --- a/src/utils/file_utils.py +++ b/src/utils/file_utils.py @@ -61,7 +61,7 @@ def safe_unlink(path: str) -> None: def get_file_extension(mimetype: str) -> str: - """Get file extension based on MIME type""" + """Get file extension based on MIME type. Returns None if the type is unknown.""" mime_to_ext = { "application/pdf": ".pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", @@ -69,19 +69,33 @@ def get_file_extension(mimetype: str) -> str: "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx", "application/vnd.ms-powerpoint": ".ppt", "text/plain": ".txt", + "text/markdown": ".md", + "text/x-markdown": ".md", "text/html": ".html", + "text/csv": ".csv", + "application/json": ".json", + "application/xml": ".xml", + "text/xml": ".xml", "application/rtf": ".rtf", "application/vnd.google-apps.document": ".pdf", # Exported as PDF "application/vnd.google-apps.presentation": ".pdf", "application/vnd.google-apps.spreadsheet": ".pdf", } - return mime_to_ext.get(mimetype, ".bin") + return mime_to_ext.get(mimetype) def clean_connector_filename(filename: str, mimetype: str) -> str: - """Clean filename and ensure correct extension""" - suffix = get_file_extension(mimetype) + """Clean filename and ensure correct extension. + + If the MIME type maps to a known extension, it is enforced. 
+ If the MIME type is unknown, the original filename (and its extension) is kept as-is + rather than appending a meaningless .bin suffix. + """ clean_name = filename.replace(" ", "_").replace("/", "_") + suffix = get_file_extension(mimetype) + if suffix is None: + # Unknown type — keep whatever extension the file already has + return clean_name if not clean_name.lower().endswith(suffix.lower()): return clean_name + suffix return clean_name \ No newline at end of file diff --git a/uv.lock b/uv.lock index 8b98c160b..56f6a9646 100644 --- a/uv.lock +++ b/uv.lock @@ -819,6 +819,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/ae/2f6d96b4e6c5478d87d606a1934b5d436c4a2bce6bb7c6fdece891c128e3/huggingface_hub-1.4.1-py3-none-any.whl", hash = "sha256:9931d075fb7a79af5abc487106414ec5fba2c0ae86104c0c62fd6cae38873d18", size = 553326, upload-time = "2026-02-06T09:20:00.728Z" }, ] +[[package]] +name = "ibm-cos-sdk" +version = "2.16.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ibm-cos-sdk-core" }, + { name = "ibm-cos-sdk-s3transfer" }, + { name = "jmespath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d8/6c/ffbe556fd148e58d5b8e4b1f71fce604dcf531d791a8599cc8a7a1ee2a09/ibm_cos_sdk-2.16.0.tar.gz", hash = "sha256:ef11ceb121dc5c90050e87a82b6d27e67394c58f1c7abcc991a3d9a0964a290a", size = 58939, upload-time = "2026-01-06T10:30:11.235Z" } + +[[package]] +name = "ibm-cos-sdk-core" +version = "2.16.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "requests" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/26/bb/4d6908fd4fe9d1994ad6aae483e352859b4ae7e501b8743f5db5c4840124/ibm_cos_sdk_core-2.16.0.tar.gz", hash = "sha256:2707d0ca62dd6f85455c4ac02ed37aa35ea1f2b143666271237f979d4fe904f7", size = 1119533, upload-time = "2026-01-06T10:30:01.931Z" } + +[[package]] +name = "ibm-cos-sdk-s3transfer" 
+version = "2.16.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ibm-cos-sdk-core" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/6f/660a6e0b7015a512304487c17c4ac87c280180b0a6142ee196570efc221e/ibm_cos_sdk_s3transfer-2.16.0.tar.gz", hash = "sha256:5cca69c48dcb7a1442b39cadb3a635162fbc2cf002820c8cd6cb170056d27c5c", size = 141138, upload-time = "2026-01-06T10:30:06.375Z" } + [[package]] name = "idna" version = "3.11" @@ -914,11 +946,11 @@ wheels = [ [[package]] name = "jmespath" -version = "1.1.0" +version = "1.0.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" } +sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843, upload-time = "2022-06-17T18:00:12.224Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, + { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" }, ] [[package]] @@ -1265,6 +1297,7 @@ dependencies = [ { name = "google-auth-httplib2" }, { name = "google-auth-oauthlib" }, { name = "httpx" }, + { name = "ibm-cos-sdk" }, { name = "msal" }, { name = 
"opensearch-py", extra = ["async"] }, { name = "psutil" }, @@ -1298,6 +1331,7 @@ requires-dist = [ { name = "google-auth-httplib2", specifier = ">=0.2.0" }, { name = "google-auth-oauthlib", specifier = ">=1.2.0" }, { name = "httpx", specifier = ">=0.27.0" }, + { name = "ibm-cos-sdk", specifier = ">=2.13.0" }, { name = "msal", specifier = ">=1.29.0" }, { name = "opensearch-py", extras = ["async"], specifier = ">=3.0.0" }, { name = "psutil", specifier = ">=7.0.0" }, From 671f294ba18ccdd5f3e365024c5fb2b049cc5210 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 9 Mar 2026 14:11:57 -0500 Subject: [PATCH 02/25] Add Amazon S3 connector and UI Introduce a full Amazon S3 / S3-compatible connector and integrate it into the app. Backend: add new S3 connector implementation and auth helpers (src/connectors/aws_s3), register API routes for defaults/configure/list/bucket-status, wire S3 into connection manager and connector registry, and include AWS S3 in sync_all. Frontend: add S3 settings form/dialog, React Query hooks (defaults, configure, bucket status), connect S3 UI into connector cards, cloud picker, and upload flow with an S3 bucket view and direct-sync support. CLI/TUI: add S3-related env fields and config prompts. Misc: small UI icon usages and query invalidation added to keep state in sync. 
--- .../api/mutations/useS3ConfigureMutation.ts | 33 +++ .../app/api/queries/useS3BucketStatusQuery.ts | 30 ++ .../app/api/queries/useS3DefaultsQuery.ts | 25 ++ frontend/app/knowledge/page.tsx | 3 + .../settings/_components/connector-cards.tsx | 11 + .../_components/s3-settings-dialog.tsx | 179 ++++++++++++ .../settings/_components/s3-settings-form.tsx | 223 +++++++++++++++ frontend/app/upload/[provider]/page.tsx | 186 ++++++++++++- .../components/cloud-picker/file-item.tsx | 3 + src/api/connectors.py | 201 +++++++++++++- src/connectors/__init__.py | 2 + src/connectors/aws_s3/__init__.py | 5 + src/connectors/aws_s3/auth.py | 96 +++++++ src/connectors/aws_s3/connector.py | 260 ++++++++++++++++++ src/connectors/connection_manager.py | 9 + src/main.py | 4 + src/tui/config_fields.py | 11 + src/tui/managers/env_manager.py | 4 + 18 files changed, 1282 insertions(+), 3 deletions(-) create mode 100644 frontend/app/api/mutations/useS3ConfigureMutation.ts create mode 100644 frontend/app/api/queries/useS3BucketStatusQuery.ts create mode 100644 frontend/app/api/queries/useS3DefaultsQuery.ts create mode 100644 frontend/app/settings/_components/s3-settings-dialog.tsx create mode 100644 frontend/app/settings/_components/s3-settings-form.tsx create mode 100644 src/connectors/aws_s3/__init__.py create mode 100644 src/connectors/aws_s3/auth.py create mode 100644 src/connectors/aws_s3/connector.py diff --git a/frontend/app/api/mutations/useS3ConfigureMutation.ts b/frontend/app/api/mutations/useS3ConfigureMutation.ts new file mode 100644 index 000000000..1e4e16a5e --- /dev/null +++ b/frontend/app/api/mutations/useS3ConfigureMutation.ts @@ -0,0 +1,33 @@ +import { useMutation, useQueryClient } from "@tanstack/react-query"; + +export interface S3ConfigurePayload { + access_key?: string; + secret_key?: string; + endpoint_url?: string; + region?: string; + bucket_names?: string[]; + connection_id?: string; +} + +async function configureS3(payload: S3ConfigurePayload) { + const res = await 
fetch("/api/connectors/aws_s3/configure", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + }); + const data = await res.json(); + if (!res.ok) throw new Error(data.error || "Failed to configure S3"); + return data as { connection_id: string; status: string }; +} + +export function useS3ConfigureMutation() { + const queryClient = useQueryClient(); + + return useMutation({ + mutationFn: configureS3, + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ["connectors"] }); + queryClient.invalidateQueries({ queryKey: ["s3-defaults"] }); + }, + }); +} diff --git a/frontend/app/api/queries/useS3BucketStatusQuery.ts b/frontend/app/api/queries/useS3BucketStatusQuery.ts new file mode 100644 index 000000000..595920933 --- /dev/null +++ b/frontend/app/api/queries/useS3BucketStatusQuery.ts @@ -0,0 +1,30 @@ +import { useQuery } from "@tanstack/react-query"; + +export interface S3BucketStatus { + name: string; + ingested_count: number; + is_synced: boolean; +} + +async function fetchS3BucketStatus(connectionId: string): Promise { + const res = await fetch(`/api/connectors/aws_s3/${connectionId}/bucket-status`); + if (!res.ok) { + const err = await res.json().catch(() => ({})); + throw new Error(err.error || "Failed to fetch bucket status"); + } + const data = await res.json(); + return data.buckets as S3BucketStatus[]; +} + +export function useS3BucketStatusQuery( + connectionId: string | null | undefined, + options?: { enabled?: boolean }, +) { + return useQuery({ + queryKey: ["s3-bucket-status", connectionId], + queryFn: () => fetchS3BucketStatus(connectionId!), + enabled: (options?.enabled ?? 
true) && !!connectionId, + staleTime: 0, + refetchOnMount: "always", + }); +} diff --git a/frontend/app/api/queries/useS3DefaultsQuery.ts b/frontend/app/api/queries/useS3DefaultsQuery.ts new file mode 100644 index 000000000..9c7d24560 --- /dev/null +++ b/frontend/app/api/queries/useS3DefaultsQuery.ts @@ -0,0 +1,25 @@ +import { useQuery } from "@tanstack/react-query"; + +export interface S3Defaults { + access_key_set: boolean; + secret_key_set: boolean; + endpoint: string; + region: string; + bucket_names: string[]; + connection_id: string | null; +} + +async function fetchS3Defaults(): Promise { + const res = await fetch("/api/connectors/aws_s3/defaults"); + if (!res.ok) throw new Error("Failed to fetch S3 defaults"); + return res.json(); +} + +export function useS3DefaultsQuery(options?: { enabled?: boolean }) { + return useQuery({ + queryKey: ["s3-defaults"], + queryFn: fetchS3Defaults, + enabled: options?.enabled ?? true, + staleTime: 0, + }); +} diff --git a/frontend/app/knowledge/page.tsx b/frontend/app/knowledge/page.tsx index 6a99deab8..a17f95f8a 100644 --- a/frontend/app/knowledge/page.tsx +++ b/frontend/app/knowledge/page.tsx @@ -45,6 +45,7 @@ import IBMLogo from "../../components/icons/ibm-logo"; import IBMCOSIcon from "../../components/icons/ibm-cos-icon"; import OneDriveIcon from "../../components/icons/one-drive-logo"; import SharePointIcon from "../../components/icons/share-point-logo"; +import AwsLogo from "../../components/icons/aws-logo"; import { useDeleteDocument } from "../api/mutations/useDeleteDocument"; import { useSyncAllConnectors } from "../api/mutations/useSyncConnector"; @@ -67,6 +68,8 @@ function getSourceIcon(connectorType?: string) { return ; case "ibm_cos": return ; + case "aws_s3": + return ; default: return ( diff --git a/frontend/app/settings/_components/connector-cards.tsx b/frontend/app/settings/_components/connector-cards.tsx index 00f207584..1121fad94 100644 --- a/frontend/app/settings/_components/connector-cards.tsx +++ 
b/frontend/app/settings/_components/connector-cards.tsx @@ -12,15 +12,18 @@ import GoogleDriveIcon from "@/components/icons/google-drive-logo"; import IBMCOSIcon from "@/components/icons/ibm-cos-icon"; import OneDriveIcon from "@/components/icons/one-drive-logo"; import SharePointIcon from "@/components/icons/share-point-logo"; +import AwsLogo from "@/components/icons/aws-logo"; import { useAuth } from "@/contexts/auth-context"; import ConnectorCard, { type Connector } from "./connector-card"; import ConnectorsSkeleton from "./connectors-skeleton"; import IBMCOSSettingsDialog from "./ibm-cos-settings-dialog"; +import S3SettingsDialog from "./s3-settings-dialog"; export default function ConnectorCards() { const { isAuthenticated, isNoAuthMode } = useAuth(); const router = useRouter(); const [ibmCOSDialogOpen, setIBMCOSDialogOpen] = useState(false); + const [s3DialogOpen, setS3DialogOpen] = useState(false); const { data: queryConnectors = [], isLoading: connectorsLoading } = useGetConnectorsQuery({ @@ -36,6 +39,7 @@ export default function ConnectorCards() { sharepoint: , onedrive: , "ibm-cos": , + "aws-s3": , }; return ( iconMap[iconName] || ( @@ -72,6 +76,9 @@ export default function ConnectorCards() { if (connector.type === "ibm_cos") { return () => setIBMCOSDialogOpen(true); } + if (connector.type === "aws_s3") { + return () => setS3DialogOpen(true); + } return undefined; }; @@ -114,6 +121,10 @@ export default function ConnectorCards() { open={ibmCOSDialogOpen} setOpen={setIBMCOSDialogOpen} /> + ); } diff --git a/frontend/app/settings/_components/s3-settings-dialog.tsx b/frontend/app/settings/_components/s3-settings-dialog.tsx new file mode 100644 index 000000000..d2ee705ee --- /dev/null +++ b/frontend/app/settings/_components/s3-settings-dialog.tsx @@ -0,0 +1,179 @@ +"use client"; + +import { useState } from "react"; +import { FormProvider, useForm } from "react-hook-form"; +import { toast } from "sonner"; +import { useQueryClient } from "@tanstack/react-query"; 
+import AwsLogo from "@/components/icons/aws-logo"; +import { Button } from "@/components/ui/button"; +import { + Dialog, + DialogContent, + DialogFooter, + DialogHeader, + DialogTitle, +} from "@/components/ui/dialog"; +import { S3SettingsForm, type S3FormData } from "./s3-settings-form"; +import { useS3DefaultsQuery } from "@/app/api/queries/useS3DefaultsQuery"; +import { useS3ConfigureMutation } from "@/app/api/mutations/useS3ConfigureMutation"; + +interface S3SettingsDialogProps { + open: boolean; + setOpen: (open: boolean) => void; +} + +export default function S3SettingsDialog({ + open, + setOpen, +}: S3SettingsDialogProps) { + const queryClient = useQueryClient(); + + const { data: defaults } = useS3DefaultsQuery({ enabled: open }); + + const methods = useForm({ + mode: "onSubmit", + values: { + access_key: "", + secret_key: "", + endpoint_url: defaults?.endpoint ?? "", + region: defaults?.region ?? "", + }, + }); + + const { handleSubmit } = methods; + + const [buckets, setBuckets] = useState( + defaults?.bucket_names?.length ? defaults.bucket_names : null, + ); + const [selectedBuckets, setSelectedBuckets] = useState( + defaults?.bucket_names ?? [], + ); + const [isFetchingBuckets, setIsFetchingBuckets] = useState(false); + const [bucketsError, setBucketsError] = useState(null); + const [formError, setFormError] = useState(null); + + const configureMutation = useS3ConfigureMutation(); + + const handleTestConnection = handleSubmit(async (data) => { + setIsFetchingBuckets(true); + setBucketsError(null); + setFormError(null); + + try { + const result = await configureMutation.mutateAsync({ + access_key: data.access_key || undefined, + secret_key: data.secret_key || undefined, + endpoint_url: data.endpoint_url || undefined, + region: data.region || undefined, + connection_id: defaults?.connection_id ?? 
undefined, + }); + + const res = await fetch( + `/api/connectors/aws_s3/${result.connection_id}/buckets`, + ); + const json = await res.json(); + if (!res.ok) throw new Error(json.error || "Failed to list buckets"); + + const fetchedBuckets: string[] = json.buckets; + setBuckets(fetchedBuckets); + + setSelectedBuckets((prev) => + prev.filter((b) => fetchedBuckets.includes(b)), + ); + + queryClient.invalidateQueries({ queryKey: ["s3-defaults"] }); + } catch (err: any) { + setBucketsError(err.message ?? "Connection failed"); + } finally { + setIsFetchingBuckets(false); + } + }); + + const onSubmit = handleSubmit(async (data) => { + setFormError(null); + if (buckets === null) { + setFormError("Test the connection first to validate credentials."); + return; + } + + try { + const latestDefaults = await queryClient.fetchQuery({ + queryKey: ["s3-defaults"], + queryFn: async () => { + const res = await fetch("/api/connectors/aws_s3/defaults"); + return res.json(); + }, + staleTime: 0, + }); + + await configureMutation.mutateAsync({ + access_key: data.access_key || undefined, + secret_key: data.secret_key || undefined, + endpoint_url: data.endpoint_url || undefined, + region: data.region || undefined, + bucket_names: selectedBuckets, + connection_id: latestDefaults?.connection_id ?? defaults?.connection_id ?? undefined, + }); + + toast.success("Amazon S3 configured", { + description: + selectedBuckets.length > 0 + ? `Will ingest from: ${selectedBuckets.join(", ")}` + : "Will auto-discover and ingest all accessible buckets.", + icon: , + }); + + queryClient.invalidateQueries({ queryKey: ["connectors"] }); + setOpen(false); + } catch (err: any) { + setFormError(err.message ?? "Failed to save configuration"); + } + }); + + return ( + + + +
+ + +
+ +
+ Amazon S3 Setup +
+
+ + void} + accessKeySet={defaults?.access_key_set} + secretKeySet={defaults?.secret_key_set} + formError={formError} + /> + + + + + + +
+
+
+ ); +} diff --git a/frontend/app/settings/_components/s3-settings-form.tsx b/frontend/app/settings/_components/s3-settings-form.tsx new file mode 100644 index 000000000..22d36ac50 --- /dev/null +++ b/frontend/app/settings/_components/s3-settings-form.tsx @@ -0,0 +1,223 @@ +"use client"; + +import { useFormContext } from "react-hook-form"; +import { LabelWrapper } from "@/components/label-wrapper"; +import { Input } from "@/components/ui/input"; +import { Label } from "@/components/ui/label"; +import { Loader2, RefreshCcw } from "lucide-react"; +import { Button } from "@/components/ui/button"; + +export interface S3FormData { + access_key: string; + secret_key: string; + endpoint_url: string; + region: string; +} + +interface S3SettingsFormProps { + /** Available buckets after a successful test — null means not yet tested */ + buckets: string[] | null; + selectedBuckets: string[]; + onSelectedBucketsChange: (buckets: string[]) => void; + isFetchingBuckets: boolean; + bucketsError: string | null; + onTestConnection: () => void; + accessKeySet?: boolean; + secretKeySet?: boolean; + formError?: string | null; +} + +export function S3SettingsForm({ + buckets, + selectedBuckets, + onSelectedBucketsChange, + isFetchingBuckets, + bucketsError, + onTestConnection, + accessKeySet, + secretKeySet, + formError, +}: S3SettingsFormProps) { + const { register } = useFormContext(); + + const toggleBucket = (name: string, checked: boolean) => { + if (checked) { + onSelectedBucketsChange([...selectedBuckets, name]); + } else { + onSelectedBucketsChange(selectedBuckets.filter((b) => b !== name)); + } + }; + + const toggleAll = (checked: boolean) => { + onSelectedBucketsChange(checked ? (buckets ?? []) : []); + }; + + return ( +
+ {/* Access Key ID */} +
+ + v?.trim() })} + id="s3-access-key" + type="password" + placeholder={ + accessKeySet + ? "•••••••• (loaded from env)" + : "AKIAIOSFODNN7EXAMPLE" + } + autoComplete="off" + /> + +
+ + {/* Secret Access Key */} +
+ + v?.trim() })} + id="s3-secret-key" + type="password" + placeholder={ + secretKeySet + ? "•••••••• (loaded from env)" + : "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + } + autoComplete="off" + /> + +
+ + {/* Endpoint URL (optional) */} +
+ + v?.trim() })} + id="s3-endpoint" + placeholder="https://your-minio.example.com" + autoComplete="off" + /> + +
+ + {/* Region (optional) */} +
+ + v?.trim() })} + id="s3-region" + placeholder="us-east-1" + autoComplete="off" + /> + +
+ + {/* Test connection */} + + + {bucketsError && ( +

+ {bucketsError} +

+ )} + + {formError && ( +

+ {formError} +

+ )} + + {/* Bucket selector */} + {buckets !== null && ( +
+
+ + {buckets.length > 1 && ( + + )} +
+ + {buckets.length === 0 ? ( +

+ No buckets found for this account. +

+ ) : ( +
+ {buckets.map((bucket) => ( + + ))} +
+ )} +
+ )} +
+ ); +} diff --git a/frontend/app/upload/[provider]/page.tsx b/frontend/app/upload/[provider]/page.tsx index 277fc965e..464e7c283 100644 --- a/frontend/app/upload/[provider]/page.tsx +++ b/frontend/app/upload/[provider]/page.tsx @@ -9,6 +9,7 @@ import { useSyncConnector } from "@/app/api/mutations/useSyncConnector"; import { useGetConnectorsQuery } from "@/app/api/queries/useGetConnectorsQuery"; import { useGetConnectorTokenQuery } from "@/app/api/queries/useGetConnectorTokenQuery"; import { useIBMCOSBucketStatusQuery } from "@/app/api/queries/useIBMCOSBucketStatusQuery"; +import { useS3BucketStatusQuery } from "@/app/api/queries/useS3BucketStatusQuery"; import { type CloudFile, UnifiedCloudPicker } from "@/components/cloud-picker"; import type { IngestSettings } from "@/components/cloud-picker/types"; import { Button } from "@/components/ui/button"; @@ -20,7 +21,7 @@ import { import { useTask } from "@/contexts/task-context"; // Connectors that sync entire buckets/repositories without a file picker -const DIRECT_SYNC_PROVIDERS = ["ibm_cos"]; +const DIRECT_SYNC_PROVIDERS = ["ibm_cos", "aws_s3"]; // --------------------------------------------------------------------------- // IBM COS bucket list with sync status @@ -192,6 +193,176 @@ function IBMCOSBucketView({ ); } +// --------------------------------------------------------------------------- +// Amazon S3 bucket list with sync status +// --------------------------------------------------------------------------- + +function S3BucketView({ + connector, + syncMutation, + addTask, + onBack, + onDone, +}: { + connector: any; + syncMutation: ReturnType; + addTask: (id: string) => void; + onBack: () => void; + onDone: () => void; +}) { + const queryClient = useQueryClient(); + const { data: buckets, isLoading, refetch } = useS3BucketStatusQuery( + connector.connectionId, + { enabled: true }, + ); + + const [syncingBucket, setSyncingBucket] = useState(null); + + const invalidateBucketStatus = () => { + 
queryClient.invalidateQueries({ queryKey: ["s3-bucket-status", connector.connectionId] }); + }; + + const syncAll = () => { + syncMutation.mutate( + { + connectorType: connector.type, + body: { + connection_id: connector.connectionId!, + selected_files: [], + sync_all: true, + }, + }, + { + onSuccess: (result) => { + invalidateBucketStatus(); + if (result.task_ids?.length) { + addTask(result.task_ids[0]); + onDone(); + } else { + toast.info("No files found in any bucket."); + } + }, + onError: (err) => { + toast.error(err instanceof Error ? err.message : "Sync failed"); + }, + }, + ); + }; + + const syncBucket = (bucketName: string) => { + setSyncingBucket(bucketName); + syncMutation.mutate( + { + connectorType: connector.type, + body: { + connection_id: connector.connectionId!, + selected_files: [], + bucket_filter: [bucketName], + }, + }, + { + onSuccess: (result) => { + setSyncingBucket(null); + invalidateBucketStatus(); + if (result.task_ids?.length) { + addTask(result.task_ids[0]); + onDone(); + } else { + toast.info(`No files found in bucket "${bucketName}".`); + } + }, + onError: (err) => { + setSyncingBucket(null); + toast.error(err instanceof Error ? err.message : "Sync failed"); + }, + }, + ); + }; + + return ( + <> +
+ +

+ Add from {connector.name} +

+
+ +
+
+

+ Select a bucket to ingest, or sync everything at once. +

+
+ + +
+
+ + {isLoading ? ( +
+
+
+ ) : !buckets?.length ? ( +
+ No buckets found. Check your S3 credentials and endpoint. +
+ ) : ( +
+ {buckets.map((bucket) => ( +
+
+
+

{bucket.name}

+ {bucket.ingested_count > 0 && ( +

+ {bucket.ingested_count} document{bucket.ingested_count !== 1 ? "s" : ""} ingested +

+ )} +
+
+ +
+ ))} +
+ )} +
+ + ); +} + // CloudFile interface is now imported from the unified cloud picker export default function UploadProviderPage() { @@ -376,8 +547,19 @@ export default function UploadProviderPage() { ); } - // Direct-sync providers (e.g. IBM COS) show a bucket list with sync status. + // Direct-sync providers show a bucket list with sync status. if (isDirectSyncProvider && connector.status === "connected") { + if (provider === "aws_s3") { + return ( + router.back()} + onDone={() => router.push("/knowledge")} + /> + ); + } return ( { return ; case "ibm_cos": return ; + case "aws_s3": + return ; default: return ; } diff --git a/src/api/connectors.py b/src/api/connectors.py index 921df2bd8..8f4f4130b 100644 --- a/src/api/connectors.py +++ b/src/api/connectors.py @@ -916,6 +916,205 @@ async def ibm_cos_bucket_status( return JSONResponse({"buckets": result}) +# --------------------------------------------------------------------------- +# Amazon S3 / S3-compatible endpoints +# --------------------------------------------------------------------------- + +class S3ConfigureBody(BaseModel): + access_key: Optional[str] = None + secret_key: Optional[str] = None + endpoint_url: Optional[str] = None + region: Optional[str] = None + bucket_names: Optional[List[str]] = None + connection_id: Optional[str] = None + + +async def s3_defaults( + connector_service=Depends(get_connector_service), + user: User = Depends(get_current_user), +): + """Return current S3 env-var defaults for pre-filling the config dialog. + + Sensitive values (secret key) are masked — only whether they are set is returned. 
+ """ + import os + + access_key = os.getenv("AWS_ACCESS_KEY_ID", "") + secret_key = os.getenv("AWS_SECRET_ACCESS_KEY", "") + endpoint_url = os.getenv("AWS_S3_ENDPOINT", "") + region = os.getenv("AWS_REGION", "") + + connections = await connector_service.connection_manager.list_connections( + user_id=user.user_id, connector_type="aws_s3" + ) + conn_config = {} + if connections: + conn_config = connections[0].config or {} + + def _pick(conn_key, env_val): + return conn_config.get(conn_key) or env_val + + return JSONResponse({ + "access_key_set": bool(access_key or conn_config.get("access_key")), + "secret_key_set": bool(secret_key or conn_config.get("secret_key")), + "endpoint": _pick("endpoint_url", endpoint_url), + "region": _pick("region", region), + "bucket_names": conn_config.get("bucket_names", []), + "connection_id": connections[0].connection_id if connections else None, + }) + + +async def s3_configure( + body: S3ConfigureBody, + connector_service=Depends(get_connector_service), + user: User = Depends(get_current_user), +): + """Create or update an S3 connection with explicit credentials. + + Tests the credentials by listing buckets, then persists the connection. 
+ """ + import os + from connectors.aws_s3.auth import create_s3_resource + + access_key = body.access_key or os.getenv("AWS_ACCESS_KEY_ID") + secret_key = body.secret_key or os.getenv("AWS_SECRET_ACCESS_KEY") + + # Fall back to existing connection config + existing_connections = await connector_service.connection_manager.list_connections( + user_id=user.user_id, connector_type="aws_s3" + ) + if not access_key and existing_connections: + access_key = existing_connections[0].config.get("access_key") + if not secret_key and existing_connections: + secret_key = existing_connections[0].config.get("secret_key") + + if not access_key or not secret_key: + return JSONResponse( + {"error": "access_key and secret_key are required"}, + status_code=400, + ) + + conn_config: dict = { + "access_key": access_key.strip(), + "secret_key": secret_key.strip(), + } + if body.endpoint_url: + conn_config["endpoint_url"] = body.endpoint_url.strip() + if body.region: + conn_config["region"] = body.region.strip() + if body.bucket_names is not None: + conn_config["bucket_names"] = body.bucket_names + + # Test credentials + try: + s3 = create_s3_resource(conn_config) + list(s3.buckets.all()) + except Exception as exc: + return JSONResponse( + {"error": f"Could not connect to S3: {exc}"}, + status_code=400, + ) + + # Persist: update existing connection or create a new one + if body.connection_id: + existing = await connector_service.connection_manager.get_connection(body.connection_id) + if existing and existing.user_id == user.user_id: + await connector_service.connection_manager.update_connection( + connection_id=body.connection_id, + config=conn_config, + ) + connector_service.connection_manager.active_connectors.pop(body.connection_id, None) + return JSONResponse({"connection_id": body.connection_id, "status": "connected"}) + + connection_id = await connector_service.connection_manager.create_connection( + connector_type="aws_s3", + name="Amazon S3", + config=conn_config, + 
user_id=user.user_id, + ) + return JSONResponse({"connection_id": connection_id, "status": "connected"}) + + +async def s3_list_buckets( + connection_id: str, + connector_service=Depends(get_connector_service), + user: User = Depends(get_current_user), +): + """List all buckets accessible with the stored S3 credentials.""" + from connectors.aws_s3.auth import create_s3_resource + + connection = await connector_service.connection_manager.get_connection(connection_id) + if not connection or connection.user_id != user.user_id: + return JSONResponse({"error": "Connection not found"}, status_code=404) + if connection.connector_type != "aws_s3": + return JSONResponse({"error": "Not an S3 connection"}, status_code=400) + + try: + s3 = create_s3_resource(connection.config) + buckets = [b.name for b in s3.buckets.all()] + return JSONResponse({"buckets": buckets}) + except Exception as exc: + return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) + + +async def s3_bucket_status( + connection_id: str, + connector_service=Depends(get_connector_service), + session_manager=Depends(get_session_manager), + user: User = Depends(get_current_user), +): + """Return all buckets for an S3 connection with their ingestion status.""" + from connectors.aws_s3.auth import create_s3_resource + + connection = await connector_service.connection_manager.get_connection(connection_id) + if not connection or connection.user_id != user.user_id: + return JSONResponse({"error": "Connection not found"}, status_code=404) + if connection.connector_type != "aws_s3": + return JSONResponse({"error": "Not an S3 connection"}, status_code=400) + + # 1. List all buckets from S3 + try: + s3 = create_s3_resource(connection.config) + all_buckets = [b.name for b in s3.buckets.all()] + except Exception as exc: + return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) + + # 2. 
Count indexed documents per bucket from OpenSearch + ingested_counts: dict = {} + try: + opensearch_client = session_manager.get_user_opensearch_client( + user.user_id, user.jwt_token + ) + query_body = { + "size": 0, + "query": {"term": {"connector_type": "aws_s3"}}, + "aggs": { + "doc_ids": { + "terms": {"field": "document_id", "size": 50000} + } + }, + } + index_name = get_index_name(user.user_id) + os_resp = opensearch_client.search(index=index_name, body=query_body) + for bucket_entry in os_resp.get("aggregations", {}).get("doc_ids", {}).get("buckets", []): + doc_id = bucket_entry["key"] + if "::" in doc_id: + bucket_name = doc_id.split("::")[0] + ingested_counts[bucket_name] = ingested_counts.get(bucket_name, 0) + 1 + except Exception: + pass # OpenSearch unavailable — show zero counts + + result = [ + { + "name": bucket, + "ingested_count": ingested_counts.get(bucket, 0), + "is_synced": ingested_counts.get(bucket, 0) > 0, + } + for bucket in all_buckets + ] + return JSONResponse({"buckets": result}) + + # --------------------------------------------------------------------------- async def sync_all_connectors( @@ -931,7 +1130,7 @@ async def sync_all_connectors( jwt_token = user.jwt_token # Cloud connector types to sync - cloud_connector_types = ["google_drive", "onedrive", "sharepoint", "ibm_cos"] + cloud_connector_types = ["google_drive", "onedrive", "sharepoint", "ibm_cos", "aws_s3"] all_task_ids = [] synced_connectors = [] diff --git a/src/connectors/__init__.py b/src/connectors/__init__.py index a49dff219..323a76285 100644 --- a/src/connectors/__init__.py +++ b/src/connectors/__init__.py @@ -3,6 +3,7 @@ from .sharepoint import SharePointConnector from .onedrive import OneDriveConnector from .ibm_cos import IBMCOSConnector +from .aws_s3 import S3Connector __all__ = [ "BaseConnector", @@ -10,4 +11,5 @@ "SharePointConnector", "OneDriveConnector", "IBMCOSConnector", + "S3Connector", ] diff --git a/src/connectors/aws_s3/__init__.py 
b/src/connectors/aws_s3/__init__.py new file mode 100644 index 000000000..ee9a5ccab --- /dev/null +++ b/src/connectors/aws_s3/__init__.py @@ -0,0 +1,5 @@ +"""Amazon S3 / S3-compatible connector for OpenRAG.""" + +from .connector import S3Connector + +__all__ = ["S3Connector"] diff --git a/src/connectors/aws_s3/auth.py b/src/connectors/aws_s3/auth.py new file mode 100644 index 000000000..5542debf7 --- /dev/null +++ b/src/connectors/aws_s3/auth.py @@ -0,0 +1,96 @@ +"""Amazon S3 / S3-compatible storage authentication and client factory.""" + +import os +from typing import Any, Dict, Optional + +from utils.logging_config import get_logger + +logger = get_logger(__name__) + +_DEFAULT_REGION = "us-east-1" + + +def _resolve_credentials(config: Dict[str, Any]) -> Dict[str, Any]: + """Resolve S3 credentials from config dict with environment variable fallback. + + Resolution order for each value: config dict → environment variable → default. + + Raises: + ValueError: If access_key or secret_key cannot be resolved. + """ + access_key: Optional[str] = config.get("access_key") or os.getenv("AWS_ACCESS_KEY_ID") + secret_key: Optional[str] = config.get("secret_key") or os.getenv("AWS_SECRET_ACCESS_KEY") + + if not access_key or not secret_key: + raise ValueError( + "S3 credentials are required. Provide 'access_key' and 'secret_key' in the " + "connector config, or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars." 
+ ) + + # endpoint_url is optional — only inject when non-empty (real AWS users don't set it) + endpoint_url: Optional[str] = config.get("endpoint_url") or os.getenv("AWS_S3_ENDPOINT") or None + + region: str = config.get("region") or os.getenv("AWS_REGION") or _DEFAULT_REGION + + return { + "access_key": access_key, + "secret_key": secret_key, + "endpoint_url": endpoint_url, + "region": region, + } + + +def _build_boto3_kwargs(creds: Dict[str, Any]) -> Dict[str, Any]: + """Build the keyword arguments for boto3.resource / boto3.client.""" + kwargs: Dict[str, Any] = { + "aws_access_key_id": creds["access_key"], + "aws_secret_access_key": creds["secret_key"], + "region_name": creds["region"], + } + if creds["endpoint_url"]: + kwargs["endpoint_url"] = creds["endpoint_url"] + return kwargs + + +def create_s3_resource(config: Dict[str, Any]): + """Return a boto3 S3 resource (high-level API) for bucket/object access. + + Works with AWS S3, MinIO, Cloudflare R2, and any S3-compatible service. + """ + try: + import boto3 + except ImportError as exc: + raise ImportError( + "boto3 is required for the S3 connector. " + "Install it with: pip install boto3" + ) from exc + + creds = _resolve_credentials(config) + kwargs = _build_boto3_kwargs(creds) + logger.debug( + "Creating S3 resource with HMAC authentication (boto3), " + f"endpoint={creds['endpoint_url']!r}, region={creds['region']!r}" + ) + return boto3.resource("s3", **kwargs) + + +def create_s3_client(config: Dict[str, Any]): + """Return a boto3 S3 low-level client. + + Used for operations such as list_buckets() and get_object_acl(). + """ + try: + import boto3 + except ImportError as exc: + raise ImportError( + "boto3 is required for the S3 connector. 
" + "Install it with: pip install boto3" + ) from exc + + creds = _resolve_credentials(config) + kwargs = _build_boto3_kwargs(creds) + logger.debug( + "Creating S3 client with HMAC authentication (boto3), " + f"endpoint={creds['endpoint_url']!r}, region={creds['region']!r}" + ) + return boto3.client("s3", **kwargs) diff --git a/src/connectors/aws_s3/connector.py b/src/connectors/aws_s3/connector.py new file mode 100644 index 000000000..917e002bd --- /dev/null +++ b/src/connectors/aws_s3/connector.py @@ -0,0 +1,260 @@ +"""Amazon S3 / S3-compatible storage connector for OpenRAG.""" + +import mimetypes +import os +from datetime import datetime, timezone +from posixpath import basename +from typing import Any, Dict, List, Optional + +from connectors.base import BaseConnector, ConnectorDocument, DocumentACL +from utils.logging_config import get_logger + +from .auth import create_s3_client, create_s3_resource + +logger = get_logger(__name__) + +# Separator used in composite file IDs: "::" +_ID_SEPARATOR = "::" + + +def _make_file_id(bucket: str, key: str) -> str: + return f"{bucket}{_ID_SEPARATOR}{key}" + + +def _split_file_id(file_id: str): + """Split a composite file ID into (bucket, key). Raises ValueError if invalid.""" + if _ID_SEPARATOR not in file_id: + raise ValueError(f"Invalid S3 file ID (missing separator): {file_id!r}") + bucket, key = file_id.split(_ID_SEPARATOR, 1) + return bucket, key + + +class S3Connector(BaseConnector): + """Connector for Amazon S3 and S3-compatible object storage. + + Uses HMAC (Access Key + Secret Key) authentication. Supports AWS S3, + MinIO, Cloudflare R2, and any service that speaks the S3 API. + + Config dict keys: + access_key (str): Overrides AWS_ACCESS_KEY_ID. + secret_key (str): Overrides AWS_SECRET_ACCESS_KEY. + endpoint_url (str): Optional; overrides AWS_S3_ENDPOINT. Leave empty for AWS S3. + region (str): Optional; overrides AWS_REGION. Default: us-east-1. + bucket_names (list[str]): Buckets to ingest from. 
If empty, all accessible buckets are used. + connection_id (str): Connection identifier used for logging. + """ + + CONNECTOR_NAME = "Amazon S3" + CONNECTOR_DESCRIPTION = "Add knowledge from Amazon S3 or any S3-compatible storage" + CONNECTOR_ICON = "aws-s3" + + CLIENT_ID_ENV_VAR = "AWS_ACCESS_KEY_ID" + CLIENT_SECRET_ENV_VAR = "AWS_SECRET_ACCESS_KEY" + + def __init__(self, config: Dict[str, Any]): + if config is None: + config = {} + super().__init__(config) + + self.bucket_names: List[str] = config.get("bucket_names") or [] + self.prefix: str = config.get("prefix", "") + self.connection_id: str = config.get("connection_id", "default") + + self._resource = None # Lazy-initialised on first use + self._client = None + + def _get_resource(self): + if self._resource is None: + self._resource = create_s3_resource(self.config) + return self._resource + + def _get_client(self): + if self._client is None: + self._client = create_s3_client(self.config) + return self._client + + # ------------------------------------------------------------------ + # BaseConnector abstract method implementations + # ------------------------------------------------------------------ + + async def authenticate(self) -> bool: + """Validate credentials by listing accessible buckets.""" + try: + resource = self._get_resource() + list(resource.buckets.all()) + self._authenticated = True + logger.debug(f"S3 authenticated for connection {self.connection_id}") + return True + except Exception as exc: + logger.warning(f"S3 authentication failed: {exc}") + self._authenticated = False + return False + + def _resolve_bucket_names(self) -> List[str]: + """Return configured bucket names, or auto-discover all accessible buckets.""" + if self.bucket_names: + return self.bucket_names + try: + resource = self._get_resource() + buckets = [b.name for b in resource.buckets.all()] + logger.debug(f"S3 auto-discovered {len(buckets)} bucket(s): {buckets}") + return buckets + except Exception as exc: + 
logger.warning(f"S3 could not auto-discover buckets: {exc}") + return [] + + async def list_files( + self, + page_token: Optional[str] = None, + max_files: Optional[int] = None, + **kwargs, + ) -> Dict[str, Any]: + """List objects across all configured (or auto-discovered) buckets. + + Uses the boto3 resource API: Bucket.objects.all() handles pagination + internally so all objects are returned without manual continuation tokens. + + Returns: + dict with keys: + "files": list of file dicts (id, name, bucket, size, modified_time) + "next_page_token": always None (SDK handles pagination internally) + """ + resource = self._get_resource() + files: List[Dict[str, Any]] = [] + bucket_names = self._resolve_bucket_names() + + for bucket_name in bucket_names: + try: + bucket = resource.Bucket(bucket_name) + objects = ( + bucket.objects.filter(Prefix=self.prefix) + if self.prefix + else bucket.objects.all() + ) + for obj in objects: + if obj.key.endswith("/"): + continue + files.append( + { + "id": _make_file_id(bucket_name, obj.key), + "name": basename(obj.key) or obj.key, + "bucket": bucket_name, + "key": obj.key, + "size": obj.size, + "modified_time": obj.last_modified.isoformat() + if obj.last_modified + else None, + } + ) + if max_files and len(files) >= max_files: + return {"files": files, "next_page_token": None} + except Exception as exc: + logger.error(f"Failed to list objects in S3 bucket {bucket_name!r}: {exc}") + continue + + return {"files": files, "next_page_token": None} + + async def get_file_content(self, file_id: str) -> ConnectorDocument: + """Download an object from S3 and return a ConnectorDocument. + + Args: + file_id: Composite ID in the form "::". + + Returns: + ConnectorDocument with content bytes, ACL, and metadata. 
+ """ + bucket_name, key = _split_file_id(file_id) + resource = self._get_resource() + + response = resource.Object(bucket_name, key).get() + content: bytes = response["Body"].read() + + last_modified: datetime = response.get("LastModified") or datetime.now(timezone.utc) + size: int = response.get("ContentLength", len(content)) + + # Prefer filename extension over generic S3 content-type (often application/octet-stream) + raw_content_type = response.get("ContentType", "") + if raw_content_type and raw_content_type != "application/octet-stream": + mime_type: str = raw_content_type + else: + mime_type = mimetypes.guess_type(key)[0] or "application/octet-stream" + + filename = basename(key) or key + acl = await self._extract_acl(bucket_name, key) + + return ConnectorDocument( + id=file_id, + filename=filename, + mimetype=mime_type, + content=content, + source_url=f"s3://{bucket_name}/{key}", + acl=acl, + modified_time=last_modified, + created_time=last_modified, # S3 does not expose creation time + metadata={ + "s3_bucket": bucket_name, + "s3_key": key, + "size": size, + }, + ) + + async def _extract_acl(self, bucket: str, key: str) -> DocumentACL: + """Fetch object ACL from S3 and map it to DocumentACL. + + Falls back to a minimal ACL on failure (e.g. ACLs disabled on the bucket). 
+ """ + try: + client = self._get_client() + acl_response = client.get_object_acl(Bucket=bucket, Key=key) + + owner_id: str = ( + acl_response.get("Owner", {}).get("DisplayName") + or acl_response.get("Owner", {}).get("ID") + or "" + ) + + allowed_users: List[str] = [] + for grant in acl_response.get("Grants", []): + grantee = grant.get("Grantee", {}) + permission = grant.get("Permission", "") + if permission in ("FULL_CONTROL", "READ"): + user_id = ( + grantee.get("DisplayName") + or grantee.get("ID") + or grantee.get("EmailAddress") + ) + if user_id and user_id not in allowed_users: + allowed_users.append(user_id) + + return DocumentACL( + owner=owner_id or None, + allowed_users=allowed_users, + allowed_groups=[], + ) + except Exception as exc: + logger.warning( + f"Could not fetch ACL for s3://{bucket}/{key}: {exc}. " + "Using fallback ACL." + ) + return DocumentACL(owner=None, allowed_users=[], allowed_groups=[]) + + # ------------------------------------------------------------------ + # Webhook / subscription stubs (S3 event notifications are out of scope) + # ------------------------------------------------------------------ + + async def setup_subscription(self) -> str: + """No-op: S3 event notifications are out of scope for this connector.""" + return "" + + async def handle_webhook(self, payload: Dict[str, Any]) -> List[str]: + """No-op: webhooks are not supported in this connector version.""" + return [] + + def extract_webhook_channel_id( + self, payload: Dict[str, Any], headers: Dict[str, str] + ) -> Optional[str]: + return None + + async def cleanup_subscription(self, subscription_id: str) -> bool: + """No-op: no subscription to clean up.""" + return True diff --git a/src/connectors/connection_manager.py b/src/connectors/connection_manager.py index 35016e389..c172a93ca 100644 --- a/src/connectors/connection_manager.py +++ b/src/connectors/connection_manager.py @@ -14,6 +14,7 @@ from .sharepoint import SharePointConnector from .onedrive import 
OneDriveConnector from .ibm_cos import IBMCOSConnector +from .aws_s3 import S3Connector @dataclass @@ -358,6 +359,12 @@ def get_available_connector_types(self) -> Dict[str, Dict[str, Any]]: "icon": IBMCOSConnector.CONNECTOR_ICON, "available": self._is_connector_available("ibm_cos"), }, + "aws_s3": { + "name": S3Connector.CONNECTOR_NAME, + "description": S3Connector.CONNECTOR_DESCRIPTION, + "icon": S3Connector.CONNECTOR_ICON, + "available": self._is_connector_available("aws_s3"), + }, } def _is_connector_available(self, connector_type: str) -> bool: @@ -388,6 +395,8 @@ def _create_connector(self, config: ConnectionConfig) -> BaseConnector: return OneDriveConnector(config.config) elif config.connector_type == "ibm_cos": return IBMCOSConnector(config.config) + elif config.connector_type == "aws_s3": + return S3Connector(config.config) elif config.connector_type == "box": raise NotImplementedError("Box connector not implemented yet") elif config.connector_type == "dropbox": diff --git a/src/main.py b/src/main.py index 016a86ff7..8f61723b5 100644 --- a/src/main.py +++ b/src/main.py @@ -767,6 +767,10 @@ async def create_app(): app.add_api_route("/connectors/ibm_cos/configure", connectors.ibm_cos_configure, methods=["POST"], tags=["internal"]) app.add_api_route("/connectors/ibm_cos/{connection_id}/buckets", connectors.ibm_cos_list_buckets, methods=["GET"], tags=["internal"]) app.add_api_route("/connectors/ibm_cos/{connection_id}/bucket-status", connectors.ibm_cos_bucket_status, methods=["GET"], tags=["internal"]) + app.add_api_route("/connectors/aws_s3/defaults", connectors.s3_defaults, methods=["GET"], tags=["internal"]) + app.add_api_route("/connectors/aws_s3/configure", connectors.s3_configure, methods=["POST"], tags=["internal"]) + app.add_api_route("/connectors/aws_s3/{connection_id}/buckets", connectors.s3_list_buckets, methods=["GET"], tags=["internal"]) + app.add_api_route("/connectors/aws_s3/{connection_id}/bucket-status", connectors.s3_bucket_status, 
methods=["GET"], tags=["internal"]) app.add_api_route("/connectors/{connector_type}/sync", connectors.connector_sync, methods=["POST"], tags=["internal"]) app.add_api_route("/connectors/sync-all", connectors.sync_all_connectors, methods=["POST"], tags=["internal"]) app.add_api_route("/connectors/{connector_type}/status", connectors.connector_status, methods=["GET"], tags=["internal"]) diff --git a/src/tui/config_fields.py b/src/tui/config_fields.py index 6a78a883e..184365a32 100644 --- a/src/tui/config_fields.py +++ b/src/tui/config_fields.py @@ -181,6 +181,17 @@ class ConfigSection: "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY", "Secret Access Key", placeholder="", secret=True, ), + ConfigField( + "aws_s3_endpoint", "AWS_S3_ENDPOINT", "S3 Endpoint URL (optional)", + placeholder="", + helper_text="Leave empty for AWS S3. For MinIO, R2, or other S3-compatible services, enter the endpoint URL.", + ), + ConfigField( + "aws_region", "AWS_REGION", "AWS Region (optional)", + placeholder="us-east-1", + default="us-east-1", + helper_text="AWS region (e.g. us-east-1, eu-west-1). 
Default: us-east-1.", + ), ], advanced=True, gate_prompt="Configure AWS credentials?"), # ── IBM Cloud Object Storage ───────────────────────────────── diff --git a/src/tui/managers/env_manager.py b/src/tui/managers/env_manager.py index b16c38af2..7fba12917 100644 --- a/src/tui/managers/env_manager.py +++ b/src/tui/managers/env_manager.py @@ -57,6 +57,8 @@ class EnvConfig: webhook_base_url: str = "" aws_access_key_id: str = "" aws_secret_access_key: str = "" + aws_s3_endpoint: str = "" + aws_region: str = "" langflow_public_url: str = "" # IBM Cloud Object Storage settings @@ -200,6 +202,8 @@ def load_existing_env(self) -> bool: "WEBHOOK_BASE_URL": "webhook_base_url", "AWS_ACCESS_KEY_ID": "aws_access_key_id", "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key", # pragma: allowlist secret + "AWS_S3_ENDPOINT": "aws_s3_endpoint", + "AWS_REGION": "aws_region", "LANGFLOW_PUBLIC_URL": "langflow_public_url", "IBM_COS_API_KEY": "ibm_cos_api_key", # pragma: allowlist secret "IBM_COS_SERVICE_INSTANCE_ID": "ibm_cos_service_instance_id", From 7658d538ee2bdafd80f0cf4c293bfdda743f8688 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 9 Mar 2026 14:42:30 -0500 Subject: [PATCH 03/25] S3: env fallback & show bucket errors Add env-var fallback getters for S3 credentials and clearer errors: implement get_client_id and get_client_secret to read from config or AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY and raise ValueError when missing. Frontend fixes: treat a connected connector as available in the connector card UI, and surface S3 bucket loading errors on the upload page by including the query error in the response and rendering a descriptive error message when bucket fetch fails. 
--- .../settings/_components/connector-card.tsx | 2 +- frontend/app/upload/[provider]/page.tsx | 6 +++++- src/connectors/aws_s3/connector.py | 20 +++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/frontend/app/settings/_components/connector-card.tsx b/frontend/app/settings/_components/connector-card.tsx index 37c08dc77..93711ea06 100644 --- a/frontend/app/settings/_components/connector-card.tsx +++ b/frontend/app/settings/_components/connector-card.tsx @@ -80,7 +80,7 @@ export default function ConnectorCard({
- {connector?.available ? ( + {connector?.available || isConnected ? (
{isConnected ? (
diff --git a/frontend/app/upload/[provider]/page.tsx b/frontend/app/upload/[provider]/page.tsx index 464e7c283..1b43564df 100644 --- a/frontend/app/upload/[provider]/page.tsx +++ b/frontend/app/upload/[provider]/page.tsx @@ -211,7 +211,7 @@ function S3BucketView({ onDone: () => void; }) { const queryClient = useQueryClient(); - const { data: buckets, isLoading, refetch } = useS3BucketStatusQuery( + const { data: buckets, isLoading, error: bucketsError, refetch } = useS3BucketStatusQuery( connector.connectionId, { enabled: true }, ); @@ -321,6 +321,10 @@ function S3BucketView({
+ ) : bucketsError ? ( +
+ {(bucketsError as Error).message || "Failed to load buckets. Check your S3 credentials and endpoint."} +
) : !buckets?.length ? (
No buckets found. Check your S3 credentials and endpoint. diff --git a/src/connectors/aws_s3/connector.py b/src/connectors/aws_s3/connector.py index 917e002bd..0b5ec749a 100644 --- a/src/connectors/aws_s3/connector.py +++ b/src/connectors/aws_s3/connector.py @@ -51,6 +51,26 @@ class S3Connector(BaseConnector): CLIENT_ID_ENV_VAR = "AWS_ACCESS_KEY_ID" CLIENT_SECRET_ENV_VAR = "AWS_SECRET_ACCESS_KEY" + def get_client_id(self) -> str: + """Return access key from config dict, or AWS_ACCESS_KEY_ID env var as fallback.""" + val = self.config.get("access_key") or os.getenv("AWS_ACCESS_KEY_ID") + if val: + return val + raise ValueError( + "S3 credentials not set. Provide 'access_key' in the connector config " + "or set the AWS_ACCESS_KEY_ID environment variable." + ) + + def get_client_secret(self) -> str: + """Return secret key from config dict, or AWS_SECRET_ACCESS_KEY env var as fallback.""" + val = self.config.get("secret_key") or os.getenv("AWS_SECRET_ACCESS_KEY") + if val: + return val + raise ValueError( + "S3 credentials not set. Provide 'secret_key' in the connector config " + "or set the AWS_SECRET_ACCESS_KEY environment variable." + ) + def __init__(self, config: Dict[str, Any]): if config is None: config = {} From 8676f1518dab4612956344a4c9bf5f7b3f1c649d Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 9 Mar 2026 14:58:45 -0500 Subject: [PATCH 04/25] Improve connector UI, sync S3, fix AWS logo Connector card: use isConnected alone for active state, treat a connector as configured when isConnected or connector.available, and show a "Configure" action (with Settings2 icon) when onConfigure is provided; also keep existing loading state. S3 settings dialog: import useEffect and add an effect to sync buckets and selectedBuckets when defaults.bucket_names load asynchronously so defaults are applied after dialog mount. AWS logo: replace fill="currentColor" with an explicit color (#232F3E) for consistent rendering. 
--- .../app/settings/_components/connector-card.tsx | 9 +++++++-- .../app/settings/_components/s3-settings-dialog.tsx | 13 ++++++++++++- frontend/components/icons/aws-logo.tsx | 2 +- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/frontend/app/settings/_components/connector-card.tsx b/frontend/app/settings/_components/connector-card.tsx index 93711ea06..f18f19cb0 100644 --- a/frontend/app/settings/_components/connector-card.tsx +++ b/frontend/app/settings/_components/connector-card.tsx @@ -54,7 +54,7 @@ export default function ConnectorCard({
{connector.icon} @@ -71,7 +71,7 @@ export default function ConnectorCard({ {connector.name} - {connector?.available + {isConnected || connector?.available ? `${connector.name} is configured.` : "Not configured."} @@ -148,6 +148,11 @@ export default function ConnectorCard({ Connecting... + ) : onConfigure ? ( + <> + + Configure + ) : ( <>Connect )} diff --git a/frontend/app/settings/_components/s3-settings-dialog.tsx b/frontend/app/settings/_components/s3-settings-dialog.tsx index d2ee705ee..1f32dd0e5 100644 --- a/frontend/app/settings/_components/s3-settings-dialog.tsx +++ b/frontend/app/settings/_components/s3-settings-dialog.tsx @@ -1,6 +1,6 @@ "use client"; -import { useState } from "react"; +import { useState, useEffect } from "react"; import { FormProvider, useForm } from "react-hook-form"; import { toast } from "sonner"; import { useQueryClient } from "@tanstack/react-query"; @@ -48,6 +48,17 @@ export default function S3SettingsDialog({ const [selectedBuckets, setSelectedBuckets] = useState( defaults?.bucket_names ?? [], ); + + // Sync bucket state when defaults load asynchronously after dialog mount + useEffect(() => { + if (defaults?.bucket_names?.length) { + setBuckets(defaults.bucket_names); + setSelectedBuckets((prev) => + prev.length ? 
prev : defaults.bucket_names, + ); + } + }, [defaults?.bucket_names?.join(",")]); + const [isFetchingBuckets, setIsFetchingBuckets] = useState(false); const [bucketsError, setBucketsError] = useState(null); const [formError, setFormError] = useState(null); diff --git a/frontend/components/icons/aws-logo.tsx b/frontend/components/icons/aws-logo.tsx index 221eea0a9..4ed347b3e 100644 --- a/frontend/components/icons/aws-logo.tsx +++ b/frontend/components/icons/aws-logo.tsx @@ -11,7 +11,7 @@ const AwsLogo = ({ className }: { className?: string }) => { AWS Logo Date: Mon, 9 Mar 2026 15:27:53 -0500 Subject: [PATCH 05/25] Update connector-card.tsx --- .../settings/_components/connector-card.tsx | 135 ++++++++---------- 1 file changed, 61 insertions(+), 74 deletions(-) diff --git a/frontend/app/settings/_components/connector-card.tsx b/frontend/app/settings/_components/connector-card.tsx index f18f19cb0..2a958e2ec 100644 --- a/frontend/app/settings/_components/connector-card.tsx +++ b/frontend/app/settings/_components/connector-card.tsx @@ -80,96 +80,83 @@ export default function ConnectorCard({
- {connector?.available || isConnected ? ( -
- {isConnected ? ( -
- - {onConfigure ? ( - - ) : ( - - )} - -
+ {isConnected ? ( +
+ + {onConfigure ? ( + ) : ( )} +
) : onConfigure ? ( - // Connector supports UI-based configuration (e.g. IBM COS) + ) : connector?.available ? ( + ) : (

From ad7780402e9c72d2ab15aad6cce2bdd949404183 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 9 Mar 2026 19:39:55 -0500 Subject: [PATCH 06/25] Potential fix for code scanning alert no. 94: Information exposure through an exception Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- src/api/connectors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/api/connectors.py b/src/api/connectors.py index 8f4f4130b..4c3857e69 100644 --- a/src/api/connectors.py +++ b/src/api/connectors.py @@ -1077,7 +1077,8 @@ async def s3_bucket_status( s3 = create_s3_resource(connection.config) all_buckets = [b.name for b in s3.buckets.all()] except Exception as exc: - return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) + logger.exception("Failed to list buckets from S3 for connection %s", connection_id) + return JSONResponse({"error": "Failed to list buckets"}, status_code=500) # 2. Count indexed documents per bucket from OpenSearch ingested_counts: dict = {} From a322b44f7d4f4753ad25a979c6186eea386809db Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 9 Mar 2026 19:40:05 -0500 Subject: [PATCH 07/25] Potential fix for code scanning alert no. 
89: Information exposure through an exception Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- src/api/connectors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/api/connectors.py b/src/api/connectors.py index 4c3857e69..d22e0f7da 100644 --- a/src/api/connectors.py +++ b/src/api/connectors.py @@ -795,8 +795,9 @@ async def ibm_cos_configure( cos = create_ibm_cos_client(conn_config) cos.list_buckets() except Exception as exc: + logger.exception("Failed to connect to IBM COS during credential test.") return JSONResponse( - {"error": f"Could not connect to IBM COS: {exc}"}, + {"error": "Could not connect to IBM COS with the provided configuration."}, status_code=400, ) From 547e6a910cce9d9758d48ceafd79faf2aa942b87 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 9 Mar 2026 19:40:20 -0500 Subject: [PATCH 08/25] Potential fix for code scanning alert no. 90: Information exposure through an exception Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- src/api/connectors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/api/connectors.py b/src/api/connectors.py index d22e0f7da..80a3feac8 100644 --- a/src/api/connectors.py +++ b/src/api/connectors.py @@ -846,7 +846,8 @@ async def ibm_cos_list_buckets( cos = create_ibm_cos_client(cfg) buckets = [b["Name"] for b in cos.list_buckets().get("Buckets", [])] return JSONResponse({"buckets": buckets}) - except Exception as exc: + logger.exception("Failed to list IBM COS buckets for connection %s", connection_id) + return JSONResponse({"error": "Failed to list buckets"}, status_code=500) return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) From 62697d00015f5d36b1f8172afbe3afa8533d3c42 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 9 Mar 2026 19:40:46 -0500 Subject: [PATCH 09/25] Potential fix for code scanning alert 
no. 91: Information exposure through an exception Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- src/api/connectors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/api/connectors.py b/src/api/connectors.py index 80a3feac8..6bd4ee45e 100644 --- a/src/api/connectors.py +++ b/src/api/connectors.py @@ -878,7 +878,8 @@ async def ibm_cos_bucket_status( all_buckets = [b.name for b in cos.buckets.all()] else: cos = create_ibm_cos_client(cfg) - all_buckets = [b["Name"] for b in cos.list_buckets().get("Buckets", [])] + logger.exception("Failed to list IBM COS buckets for connection %s", connection_id) + return JSONResponse({"error": "Failed to list buckets"}, status_code=500) except Exception as exc: return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) From 8b9adf5fcb0ec6ddd86901bd05ab5a0e25024fb9 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Tue, 10 Mar 2026 00:44:10 -0500 Subject: [PATCH 10/25] Fix IBM COS bucket handling; update .gitignore Clean up IBM COS bucket listing and error handling: consolidate bucket enumeration to use the COS resource API, move logger.exception into the except blocks, and return consistent error responses. This removes unreachable/duplicated code paths and prevents leaking exception details in one handler. Also add /opensearch-data-new-lf to .gitignore. 
--- .gitignore | 1 + src/api/connectors.py | 14 +++++--------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 9de3e2574..7da9d1140 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,4 @@ documents/warmup_ocr.pdf documents/openrag-documentation.pdf documents/ibm_anthropic.pdf documents/docling.pdf +/opensearch-data-new-lf diff --git a/src/api/connectors.py b/src/api/connectors.py index 6bd4ee45e..32ec5008a 100644 --- a/src/api/connectors.py +++ b/src/api/connectors.py @@ -846,9 +846,9 @@ async def ibm_cos_list_buckets( cos = create_ibm_cos_client(cfg) buckets = [b["Name"] for b in cos.list_buckets().get("Buckets", [])] return JSONResponse({"buckets": buckets}) - logger.exception("Failed to list IBM COS buckets for connection %s", connection_id) + except Exception as exc: return JSONResponse({"error": "Failed to list buckets"}, status_code=500) - return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) + async def ibm_cos_bucket_status( @@ -873,14 +873,10 @@ async def ibm_cos_bucket_status( # 1. List all buckets from COS try: cfg = connection.config - if cfg.get("auth_mode", "iam") == "hmac": - cos = create_ibm_cos_resource(cfg) - all_buckets = [b.name for b in cos.buckets.all()] - else: - cos = create_ibm_cos_client(cfg) - logger.exception("Failed to list IBM COS buckets for connection %s", connection_id) - return JSONResponse({"error": "Failed to list buckets"}, status_code=500) + cos = create_ibm_cos_resource(cfg) + all_buckets = [b.name for b in cos.buckets.all()] except Exception as exc: + logger.exception("Failed to list IBM COS buckets for connection %s", connection_id) return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) # 2. 
Count indexed documents per bucket from OpenSearch From 0d3d074308177ae86a45a703cfb3053745445ed0 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Tue, 10 Mar 2026 01:14:45 -0500 Subject: [PATCH 11/25] Add COS env vars and improve connector logging Expose additional IBM COS and S3 env vars in docker-compose and tidy up connector code. - docker-compose: added AWS_S3_ENDPOINT, AWS_REGION and multiple IBM_COS_* env vars for configuring IBM COS and custom S3 endpoints. - api/connectors: import adjusted to use create_ibm_cos_resource only. - aws_s3.auth: shortened debug messages to avoid logging endpoint/region details. - aws_s3.connector & ibm_cos.connector: switched to printf-style logging (avoid f-strings) and normalized error/warning messages for listing objects and ACL fallbacks. - ibm_cos.connector: prefer values from connector config (api_key, hmac_access_key, service_instance_id, hmac_secret_key) before falling back to environment variables. These changes improve configuration flexibility, reduce accidental logging of potentially sensitive details, and standardize connector logging. 
--- docker-compose.yml | 8 ++++++++ src/api/connectors.py | 2 +- src/connectors/aws_s3/auth.py | 10 ++-------- src/connectors/aws_s3/connector.py | 9 +++------ src/connectors/ibm_cos/connector.py | 23 +++++++++++++++-------- 5 files changed, 29 insertions(+), 23 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 0aef3a172..4f97d288b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -80,6 +80,14 @@ services: - WEBHOOK_BASE_URL=${WEBHOOK_BASE_URL} - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} + - AWS_S3_ENDPOINT=${AWS_S3_ENDPOINT} + - AWS_REGION=${AWS_REGION} + - IBM_COS_API_KEY=${IBM_COS_API_KEY} + - IBM_COS_SERVICE_INSTANCE_ID=${IBM_COS_SERVICE_INSTANCE_ID} + - IBM_COS_ENDPOINT=${IBM_COS_ENDPOINT} + - IBM_COS_HMAC_ACCESS_KEY_ID=${IBM_COS_HMAC_ACCESS_KEY_ID} + - IBM_COS_HMAC_SECRET_ACCESS_KEY=${IBM_COS_HMAC_SECRET_ACCESS_KEY} + - IBM_COS_AUTH_ENDPOINT=${IBM_COS_AUTH_ENDPOINT} - OPENSEARCH_INDEX_NAME=${OPENSEARCH_INDEX_NAME:-documents} - LANGFLOW_KEY=${LANGFLOW_KEY} - LANGFLOW_KEY_RETRIES=${LANGFLOW_KEY_RETRIES:-15} diff --git a/src/api/connectors.py b/src/api/connectors.py index 32ec5008a..53ded30db 100644 --- a/src/api/connectors.py +++ b/src/api/connectors.py @@ -862,7 +862,7 @@ async def ibm_cos_bucket_status( Each entry includes the bucket name, whether it has been ingested (is_synced), and the count of indexed documents from that bucket. 
""" - from connectors.ibm_cos.auth import create_ibm_cos_client, create_ibm_cos_resource + from connectors.ibm_cos.auth import create_ibm_cos_resource connection = await connector_service.connection_manager.get_connection(connection_id) if not connection or connection.user_id != user.user_id: diff --git a/src/connectors/aws_s3/auth.py b/src/connectors/aws_s3/auth.py index 5542debf7..78894eca5 100644 --- a/src/connectors/aws_s3/auth.py +++ b/src/connectors/aws_s3/auth.py @@ -67,10 +67,7 @@ def create_s3_resource(config: Dict[str, Any]): creds = _resolve_credentials(config) kwargs = _build_boto3_kwargs(creds) - logger.debug( - "Creating S3 resource with HMAC authentication (boto3), " - f"endpoint={creds['endpoint_url']!r}, region={creds['region']!r}" - ) + logger.debug("Creating S3 resource with HMAC authentication (boto3)") return boto3.resource("s3", **kwargs) @@ -89,8 +86,5 @@ def create_s3_client(config: Dict[str, Any]): creds = _resolve_credentials(config) kwargs = _build_boto3_kwargs(creds) - logger.debug( - "Creating S3 client with HMAC authentication (boto3), " - f"endpoint={creds['endpoint_url']!r}, region={creds['region']!r}" - ) + logger.debug("Creating S3 client with HMAC authentication (boto3)") return boto3.client("s3", **kwargs) diff --git a/src/connectors/aws_s3/connector.py b/src/connectors/aws_s3/connector.py index 0b5ec749a..13929e41b 100644 --- a/src/connectors/aws_s3/connector.py +++ b/src/connectors/aws_s3/connector.py @@ -117,7 +117,7 @@ def _resolve_bucket_names(self) -> List[str]: try: resource = self._get_resource() buckets = [b.name for b in resource.buckets.all()] - logger.debug(f"S3 auto-discovered {len(buckets)} bucket(s): {buckets}") + logger.debug("S3 auto-discovered %d bucket(s)", len(buckets)) return buckets except Exception as exc: logger.warning(f"S3 could not auto-discover buckets: {exc}") @@ -169,7 +169,7 @@ async def list_files( if max_files and len(files) >= max_files: return {"files": files, "next_page_token": None} except 
Exception as exc: - logger.error(f"Failed to list objects in S3 bucket {bucket_name!r}: {exc}") + logger.error("Failed to list objects in S3 bucket: %s", exc) continue return {"files": files, "next_page_token": None} @@ -252,10 +252,7 @@ async def _extract_acl(self, bucket: str, key: str) -> DocumentACL: allowed_groups=[], ) except Exception as exc: - logger.warning( - f"Could not fetch ACL for s3://{bucket}/{key}: {exc}. " - "Using fallback ACL." - ) + logger.warning("Could not fetch S3 object ACL, using fallback: %s", exc) return DocumentACL(owner=None, allowed_users=[], allowed_groups=[]) # ------------------------------------------------------------------ diff --git a/src/connectors/ibm_cos/connector.py b/src/connectors/ibm_cos/connector.py index 4a41a5b7d..46d21a2ee 100644 --- a/src/connectors/ibm_cos/connector.py +++ b/src/connectors/ibm_cos/connector.py @@ -58,7 +58,12 @@ class IBMCOSConnector(BaseConnector): def get_client_id(self) -> str: """Return IAM API key, or HMAC access key ID as fallback.""" - val = os.getenv("IBM_COS_API_KEY") or os.getenv("IBM_COS_HMAC_ACCESS_KEY_ID") + val = ( + self.config.get("api_key") + or self.config.get("hmac_access_key") + or os.getenv("IBM_COS_API_KEY") + or os.getenv("IBM_COS_HMAC_ACCESS_KEY_ID") + ) if val: return val raise ValueError( @@ -68,7 +73,12 @@ def get_client_id(self) -> str: def get_client_secret(self) -> str: """Return IAM service instance ID, or HMAC secret key as fallback.""" - val = os.getenv("IBM_COS_SERVICE_INSTANCE_ID") or os.getenv("IBM_COS_HMAC_SECRET_ACCESS_KEY") + val = ( + self.config.get("service_instance_id") + or self.config.get("hmac_secret_key") + or os.getenv("IBM_COS_SERVICE_INSTANCE_ID") + or os.getenv("IBM_COS_HMAC_SECRET_ACCESS_KEY") + ) if val: return val raise ValueError( @@ -142,7 +152,7 @@ def _resolve_bucket_names(self) -> List[str]: else: resp = handle.list_buckets() buckets = [b["Name"] for b in resp.get("Buckets", [])] - logger.debug(f"IBM COS auto-discovered {len(buckets)} 
bucket(s): {buckets}") + logger.debug("IBM COS auto-discovered %d bucket(s)", len(buckets)) return buckets except Exception as exc: logger.warning(f"IBM COS could not auto-discover buckets: {exc}") @@ -228,7 +238,7 @@ async def list_files( break except Exception as exc: - logger.error(f"Failed to list objects in bucket {bucket_name!r}: {exc}") + logger.error("Failed to list objects in IBM COS bucket: %s", exc) continue return {"files": files, "next_page_token": None} @@ -326,10 +336,7 @@ async def _extract_acl(self, bucket: str, key: str) -> DocumentACL: allowed_groups=[], ) except Exception as exc: - logger.warning( - f"Could not fetch ACL for cos://{bucket}/{key}: {exc}. " - "Using fallback ACL." - ) + logger.warning("Could not fetch IBM COS object ACL, using fallback: %s", exc) return DocumentACL( owner=self._service_instance_id or None, allowed_users=[], From 28694aba4e4d66e9c551bf1394cc697f6245a106 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Tue, 10 Mar 2026 01:33:09 -0500 Subject: [PATCH 12/25] Refactor S3 and IBM COS connector APIs Move S3 and IBM COS-specific FastAPI route handlers out of the large api/connectors.py into dedicated modules under connectors/aws_s3 and connectors/ibm_cos. Add Pydantic models (models.py) and pure helper logic (support.py) for credential resolution and config construction, and factor common credential-testing logic into these new modules. Update package __init__.py exports to include the new models and API functions, and adjust main.py to register the new route handlers directly. This modularizes connector code, keeps route handlers thin, and centralizes validation/config-building for easier maintenance. 
--- src/api/connectors.py | 463 ----------------------------- src/connectors/aws_s3/__init__.py | 16 +- src/connectors/aws_s3/api.py | 173 +++++++++++ src/connectors/aws_s3/models.py | 13 + src/connectors/aws_s3/support.py | 51 ++++ src/connectors/ibm_cos/__init__.py | 16 +- src/connectors/ibm_cos/api.py | 201 +++++++++++++ src/connectors/ibm_cos/models.py | 20 ++ src/connectors/ibm_cos/support.py | 68 +++++ src/main.py | 29 +- 10 files changed, 577 insertions(+), 473 deletions(-) create mode 100644 src/connectors/aws_s3/api.py create mode 100644 src/connectors/aws_s3/models.py create mode 100644 src/connectors/aws_s3/support.py create mode 100644 src/connectors/ibm_cos/api.py create mode 100644 src/connectors/ibm_cos/models.py create mode 100644 src/connectors/ibm_cos/support.py diff --git a/src/api/connectors.py b/src/api/connectors.py index 53ded30db..6d2431f9b 100644 --- a/src/api/connectors.py +++ b/src/api/connectors.py @@ -652,469 +652,6 @@ async def connector_disconnect( ) -# --------------------------------------------------------------------------- -# IBM COS-specific endpoints -# --------------------------------------------------------------------------- - -class IBMCOSConfigureBody(BaseModel): - auth_mode: str # "iam" or "hmac" - endpoint: str - # IAM fields - api_key: Optional[str] = None - service_instance_id: Optional[str] = None - auth_endpoint: Optional[str] = None - # HMAC fields - hmac_access_key: Optional[str] = None - hmac_secret_key: Optional[str] = None - # Optional bucket selection - bucket_names: Optional[List[str]] = None - # Optional: update an existing connection - connection_id: Optional[str] = None - - -async def ibm_cos_defaults( - connector_service=Depends(get_connector_service), - user: User = Depends(get_current_user), -): - """Return current IBM COS env-var defaults for pre-filling the config dialog. - - Sensitive values (API key, HMAC secret) are masked — only whether they are - set is returned, not the actual values. 
- """ - import os - - api_key = os.getenv("IBM_COS_API_KEY", "") - service_instance_id = os.getenv("IBM_COS_SERVICE_INSTANCE_ID", "") - endpoint = os.getenv("IBM_COS_ENDPOINT", "") - hmac_access_key = os.getenv("IBM_COS_HMAC_ACCESS_KEY_ID", "") - hmac_secret_key = os.getenv("IBM_COS_HMAC_SECRET_ACCESS_KEY", "") - disable_iam = os.getenv("OPENRAG_IBM_COS_IAM_UI", "").lower() not in ("1", "true", "yes") - - # Try to read existing connection config for this user too - connections = await connector_service.connection_manager.list_connections( - user_id=user.user_id, connector_type="ibm_cos" - ) - conn_config = {} - if connections: - conn_config = connections[0].config or {} - - def _pick(conn_key, env_val): - """Prefer connection config value over env var.""" - return conn_config.get(conn_key) or env_val - - return JSONResponse({ - "api_key_set": bool(api_key or conn_config.get("api_key")), - "service_instance_id": _pick("service_instance_id", service_instance_id), - "endpoint": _pick("endpoint_url", endpoint), - "hmac_access_key_set": bool(hmac_access_key or conn_config.get("hmac_access_key")), - "hmac_secret_key_set": bool(hmac_secret_key or conn_config.get("hmac_secret_key")), - # Return which auth mode was previously used; default to hmac when IAM is disabled - "auth_mode": conn_config.get("auth_mode", "hmac" if (disable_iam or not (api_key or conn_config.get("api_key"))) else "iam"), - "disable_iam": disable_iam, - # Return bucket_names from existing connection (if any) - "bucket_names": conn_config.get("bucket_names", []), - # Return connection_id if an existing connection exists - "connection_id": connections[0].connection_id if connections else None, - }) - - -async def ibm_cos_configure( - body: IBMCOSConfigureBody, - connector_service=Depends(get_connector_service), - user: User = Depends(get_current_user), -): - """Create or update an IBM COS connection with explicit credentials. - - Tests the credentials by listing buckets, then persists the connection. 
- Credentials are stored in the connection config dict (not env vars) so - the connector works even without system-level env vars. - """ - import os - from connectors.ibm_cos.auth import create_ibm_cos_client, create_ibm_cos_resource - - # Build the config dict that will be stored in the connection - conn_config: dict = { - "auth_mode": body.auth_mode, - "endpoint_url": body.endpoint, - } - - if body.auth_mode == "iam": - # Resolve: use supplied value, fall back to env var, fall back to existing connection - api_key = body.api_key or os.getenv("IBM_COS_API_KEY") - svc_id = body.service_instance_id or os.getenv("IBM_COS_SERVICE_INSTANCE_ID") - - # If still empty, pull from existing connection config - existing_connections = await connector_service.connection_manager.list_connections( - user_id=user.user_id, connector_type="ibm_cos" - ) - if not api_key and existing_connections: - api_key = existing_connections[0].config.get("api_key") - if not svc_id and existing_connections: - svc_id = existing_connections[0].config.get("service_instance_id") - - if not api_key or not svc_id: - return JSONResponse( - {"error": "IAM mode requires api_key and service_instance_id"}, - status_code=400, - ) - conn_config["api_key"] = api_key - conn_config["service_instance_id"] = svc_id - if body.auth_endpoint: - conn_config["auth_endpoint"] = body.auth_endpoint - else: - # HMAC mode - hmac_access = body.hmac_access_key or os.getenv("IBM_COS_HMAC_ACCESS_KEY_ID") - hmac_secret = body.hmac_secret_key or os.getenv("IBM_COS_HMAC_SECRET_ACCESS_KEY") - - existing_connections = await connector_service.connection_manager.list_connections( - user_id=user.user_id, connector_type="ibm_cos" - ) - if not hmac_access and existing_connections: - hmac_access = existing_connections[0].config.get("hmac_access_key") - if not hmac_secret and existing_connections: - hmac_secret = existing_connections[0].config.get("hmac_secret_key") - - if not hmac_access or not hmac_secret: - return JSONResponse( - 
{"error": "HMAC mode requires hmac_access_key and hmac_secret_key"}, - status_code=400, - ) - conn_config["hmac_access_key"] = hmac_access - conn_config["hmac_secret_key"] = hmac_secret - - if body.bucket_names is not None: - conn_config["bucket_names"] = body.bucket_names - - # Test credentials — IAM uses client (avoids ibm_botocore discovery-call bug), - # HMAC uses resource (S3-compatible, works with MinIO). - try: - if conn_config.get("auth_mode", "iam") == "hmac": - cos = create_ibm_cos_resource(conn_config) - list(cos.buckets.all()) - else: - cos = create_ibm_cos_client(conn_config) - cos.list_buckets() - except Exception as exc: - logger.exception("Failed to connect to IBM COS during credential test.") - return JSONResponse( - {"error": "Could not connect to IBM COS with the provided configuration."}, - status_code=400, - ) - - # Persist: update existing connection or create a new one - if body.connection_id: - existing = await connector_service.connection_manager.get_connection(body.connection_id) - if existing and existing.user_id == user.user_id: - await connector_service.connection_manager.update_connection( - connection_id=body.connection_id, - config=conn_config, - ) - # Evict cached connector so next call gets a fresh instance - connector_service.connection_manager.active_connectors.pop(body.connection_id, None) - return JSONResponse({"connection_id": body.connection_id, "status": "connected"}) - - # Create a fresh connection - connection_id = await connector_service.connection_manager.create_connection( - connector_type="ibm_cos", - name="IBM Cloud Object Storage", - config=conn_config, - user_id=user.user_id, - ) - return JSONResponse({"connection_id": connection_id, "status": "connected"}) - - -async def ibm_cos_list_buckets( - connection_id: str, - connector_service=Depends(get_connector_service), - user: User = Depends(get_current_user), -): - """List all buckets accessible with the stored IBM COS credentials.""" - from connectors.ibm_cos.auth 
import create_ibm_cos_client, create_ibm_cos_resource - - connection = await connector_service.connection_manager.get_connection(connection_id) - if not connection or connection.user_id != user.user_id: - return JSONResponse({"error": "Connection not found"}, status_code=404) - if connection.connector_type != "ibm_cos": - return JSONResponse({"error": "Not an IBM COS connection"}, status_code=400) - - try: - cfg = connection.config - if cfg.get("auth_mode", "iam") == "hmac": - cos = create_ibm_cos_resource(cfg) - buckets = [b.name for b in cos.buckets.all()] - else: - cos = create_ibm_cos_client(cfg) - buckets = [b["Name"] for b in cos.list_buckets().get("Buckets", [])] - return JSONResponse({"buckets": buckets}) - except Exception as exc: - return JSONResponse({"error": "Failed to list buckets"}, status_code=500) - - - -async def ibm_cos_bucket_status( - connection_id: str, - connector_service=Depends(get_connector_service), - session_manager=Depends(get_session_manager), - user: User = Depends(get_current_user), -): - """Return all buckets for an IBM COS connection with their ingestion status. - - Each entry includes the bucket name, whether it has been ingested (is_synced), - and the count of indexed documents from that bucket. - """ - from connectors.ibm_cos.auth import create_ibm_cos_resource - - connection = await connector_service.connection_manager.get_connection(connection_id) - if not connection or connection.user_id != user.user_id: - return JSONResponse({"error": "Connection not found"}, status_code=404) - if connection.connector_type != "ibm_cos": - return JSONResponse({"error": "Not an IBM COS connection"}, status_code=400) - - # 1. 
List all buckets from COS - try: - cfg = connection.config - cos = create_ibm_cos_resource(cfg) - all_buckets = [b.name for b in cos.buckets.all()] - except Exception as exc: - logger.exception("Failed to list IBM COS buckets for connection %s", connection_id) - return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) - - # 2. Count indexed documents per bucket from OpenSearch - ingested_counts: dict = {} - try: - opensearch_client = session_manager.get_user_opensearch_client( - user.user_id, user.jwt_token - ) - query_body = { - "size": 0, - "query": {"term": {"connector_type": "ibm_cos"}}, - "aggs": { - "doc_ids": { - "terms": {"field": "document_id", "size": 50000} - } - }, - } - index_name = get_index_name(user.user_id) - os_resp = opensearch_client.search(index=index_name, body=query_body) - for bucket_entry in os_resp.get("aggregations", {}).get("doc_ids", {}).get("buckets", []): - doc_id = bucket_entry["key"] - if "::" in doc_id: - bucket_name = doc_id.split("::")[0] - ingested_counts[bucket_name] = ingested_counts.get(bucket_name, 0) + 1 - except Exception: - pass # OpenSearch unavailable — show zero counts - - result = [ - { - "name": bucket, - "ingested_count": ingested_counts.get(bucket, 0), - "is_synced": ingested_counts.get(bucket, 0) > 0, - } - for bucket in all_buckets - ] - return JSONResponse({"buckets": result}) - - -# --------------------------------------------------------------------------- -# Amazon S3 / S3-compatible endpoints -# --------------------------------------------------------------------------- - -class S3ConfigureBody(BaseModel): - access_key: Optional[str] = None - secret_key: Optional[str] = None - endpoint_url: Optional[str] = None - region: Optional[str] = None - bucket_names: Optional[List[str]] = None - connection_id: Optional[str] = None - - -async def s3_defaults( - connector_service=Depends(get_connector_service), - user: User = Depends(get_current_user), -): - """Return current S3 env-var defaults 
for pre-filling the config dialog. - - Sensitive values (secret key) are masked — only whether they are set is returned. - """ - import os - - access_key = os.getenv("AWS_ACCESS_KEY_ID", "") - secret_key = os.getenv("AWS_SECRET_ACCESS_KEY", "") - endpoint_url = os.getenv("AWS_S3_ENDPOINT", "") - region = os.getenv("AWS_REGION", "") - - connections = await connector_service.connection_manager.list_connections( - user_id=user.user_id, connector_type="aws_s3" - ) - conn_config = {} - if connections: - conn_config = connections[0].config or {} - - def _pick(conn_key, env_val): - return conn_config.get(conn_key) or env_val - - return JSONResponse({ - "access_key_set": bool(access_key or conn_config.get("access_key")), - "secret_key_set": bool(secret_key or conn_config.get("secret_key")), - "endpoint": _pick("endpoint_url", endpoint_url), - "region": _pick("region", region), - "bucket_names": conn_config.get("bucket_names", []), - "connection_id": connections[0].connection_id if connections else None, - }) - - -async def s3_configure( - body: S3ConfigureBody, - connector_service=Depends(get_connector_service), - user: User = Depends(get_current_user), -): - """Create or update an S3 connection with explicit credentials. - - Tests the credentials by listing buckets, then persists the connection. 
- """ - import os - from connectors.aws_s3.auth import create_s3_resource - - access_key = body.access_key or os.getenv("AWS_ACCESS_KEY_ID") - secret_key = body.secret_key or os.getenv("AWS_SECRET_ACCESS_KEY") - - # Fall back to existing connection config - existing_connections = await connector_service.connection_manager.list_connections( - user_id=user.user_id, connector_type="aws_s3" - ) - if not access_key and existing_connections: - access_key = existing_connections[0].config.get("access_key") - if not secret_key and existing_connections: - secret_key = existing_connections[0].config.get("secret_key") - - if not access_key or not secret_key: - return JSONResponse( - {"error": "access_key and secret_key are required"}, - status_code=400, - ) - - conn_config: dict = { - "access_key": access_key.strip(), - "secret_key": secret_key.strip(), - } - if body.endpoint_url: - conn_config["endpoint_url"] = body.endpoint_url.strip() - if body.region: - conn_config["region"] = body.region.strip() - if body.bucket_names is not None: - conn_config["bucket_names"] = body.bucket_names - - # Test credentials - try: - s3 = create_s3_resource(conn_config) - list(s3.buckets.all()) - except Exception as exc: - return JSONResponse( - {"error": f"Could not connect to S3: {exc}"}, - status_code=400, - ) - - # Persist: update existing connection or create a new one - if body.connection_id: - existing = await connector_service.connection_manager.get_connection(body.connection_id) - if existing and existing.user_id == user.user_id: - await connector_service.connection_manager.update_connection( - connection_id=body.connection_id, - config=conn_config, - ) - connector_service.connection_manager.active_connectors.pop(body.connection_id, None) - return JSONResponse({"connection_id": body.connection_id, "status": "connected"}) - - connection_id = await connector_service.connection_manager.create_connection( - connector_type="aws_s3", - name="Amazon S3", - config=conn_config, - 
user_id=user.user_id, - ) - return JSONResponse({"connection_id": connection_id, "status": "connected"}) - - -async def s3_list_buckets( - connection_id: str, - connector_service=Depends(get_connector_service), - user: User = Depends(get_current_user), -): - """List all buckets accessible with the stored S3 credentials.""" - from connectors.aws_s3.auth import create_s3_resource - - connection = await connector_service.connection_manager.get_connection(connection_id) - if not connection or connection.user_id != user.user_id: - return JSONResponse({"error": "Connection not found"}, status_code=404) - if connection.connector_type != "aws_s3": - return JSONResponse({"error": "Not an S3 connection"}, status_code=400) - - try: - s3 = create_s3_resource(connection.config) - buckets = [b.name for b in s3.buckets.all()] - return JSONResponse({"buckets": buckets}) - except Exception as exc: - return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) - - -async def s3_bucket_status( - connection_id: str, - connector_service=Depends(get_connector_service), - session_manager=Depends(get_session_manager), - user: User = Depends(get_current_user), -): - """Return all buckets for an S3 connection with their ingestion status.""" - from connectors.aws_s3.auth import create_s3_resource - - connection = await connector_service.connection_manager.get_connection(connection_id) - if not connection or connection.user_id != user.user_id: - return JSONResponse({"error": "Connection not found"}, status_code=404) - if connection.connector_type != "aws_s3": - return JSONResponse({"error": "Not an S3 connection"}, status_code=400) - - # 1. List all buckets from S3 - try: - s3 = create_s3_resource(connection.config) - all_buckets = [b.name for b in s3.buckets.all()] - except Exception as exc: - logger.exception("Failed to list buckets from S3 for connection %s", connection_id) - return JSONResponse({"error": "Failed to list buckets"}, status_code=500) - - # 2. 
Count indexed documents per bucket from OpenSearch - ingested_counts: dict = {} - try: - opensearch_client = session_manager.get_user_opensearch_client( - user.user_id, user.jwt_token - ) - query_body = { - "size": 0, - "query": {"term": {"connector_type": "aws_s3"}}, - "aggs": { - "doc_ids": { - "terms": {"field": "document_id", "size": 50000} - } - }, - } - index_name = get_index_name(user.user_id) - os_resp = opensearch_client.search(index=index_name, body=query_body) - for bucket_entry in os_resp.get("aggregations", {}).get("doc_ids", {}).get("buckets", []): - doc_id = bucket_entry["key"] - if "::" in doc_id: - bucket_name = doc_id.split("::")[0] - ingested_counts[bucket_name] = ingested_counts.get(bucket_name, 0) + 1 - except Exception: - pass # OpenSearch unavailable — show zero counts - - result = [ - { - "name": bucket, - "ingested_count": ingested_counts.get(bucket, 0), - "is_synced": ingested_counts.get(bucket, 0) > 0, - } - for bucket in all_buckets - ] - return JSONResponse({"buckets": result}) - - # --------------------------------------------------------------------------- async def sync_all_connectors( diff --git a/src/connectors/aws_s3/__init__.py b/src/connectors/aws_s3/__init__.py index ee9a5ccab..32c63a878 100644 --- a/src/connectors/aws_s3/__init__.py +++ b/src/connectors/aws_s3/__init__.py @@ -1,5 +1,19 @@ """Amazon S3 / S3-compatible connector for OpenRAG.""" from .connector import S3Connector +from .models import S3ConfigureBody +from .api import ( + s3_defaults, + s3_configure, + s3_list_buckets, + s3_bucket_status, +) -__all__ = ["S3Connector"] +__all__ = [ + "S3Connector", + "S3ConfigureBody", + "s3_defaults", + "s3_configure", + "s3_list_buckets", + "s3_bucket_status", +] diff --git a/src/connectors/aws_s3/api.py b/src/connectors/aws_s3/api.py new file mode 100644 index 000000000..2612ca698 --- /dev/null +++ b/src/connectors/aws_s3/api.py @@ -0,0 +1,173 @@ +"""FastAPI route handlers for AWS S3-specific endpoints.""" + +import os + +from 
fastapi import Depends +from fastapi.responses import JSONResponse + +from config.settings import get_index_name +from dependencies import get_connector_service, get_session_manager, get_current_user +from session_manager import User +from utils.logging_config import get_logger + +from .auth import create_s3_resource +from .models import S3ConfigureBody +from .support import build_s3_config + +logger = get_logger(__name__) + + +async def s3_defaults( + connector_service=Depends(get_connector_service), + user: User = Depends(get_current_user), +): + """Return current S3 env-var defaults for pre-filling the config dialog. + + Sensitive values (secret key) are masked — only whether they are set is returned. + """ + access_key = os.getenv("AWS_ACCESS_KEY_ID", "") + secret_key = os.getenv("AWS_SECRET_ACCESS_KEY", "") + endpoint_url = os.getenv("AWS_S3_ENDPOINT", "") + region = os.getenv("AWS_REGION", "") + + connections = await connector_service.connection_manager.list_connections( + user_id=user.user_id, connector_type="aws_s3" + ) + conn_config = connections[0].config or {} if connections else {} + + def _pick(conn_key, env_val): + return conn_config.get(conn_key) or env_val + + return JSONResponse({ + "access_key_set": bool(access_key or conn_config.get("access_key")), + "secret_key_set": bool(secret_key or conn_config.get("secret_key")), + "endpoint": _pick("endpoint_url", endpoint_url), + "region": _pick("region", region), + "bucket_names": conn_config.get("bucket_names", []), + "connection_id": connections[0].connection_id if connections else None, + }) + + +async def s3_configure( + body: S3ConfigureBody, + connector_service=Depends(get_connector_service), + user: User = Depends(get_current_user), +): + """Create or update an S3 connection with explicit credentials. + + Tests the credentials by listing buckets, then persists the connection. 
+ """ + existing_connections = await connector_service.connection_manager.list_connections( + user_id=user.user_id, connector_type="aws_s3" + ) + existing_config = existing_connections[0].config if existing_connections else {} + + conn_config, error = build_s3_config(body, existing_config) + if error: + return JSONResponse({"error": error}, status_code=400) + + # Test credentials + try: + s3 = create_s3_resource(conn_config) + list(s3.buckets.all()) + except Exception as exc: + return JSONResponse( + {"error": f"Could not connect to S3: {exc}"}, + status_code=400, + ) + + # Persist: update existing connection or create a new one + if body.connection_id: + existing = await connector_service.connection_manager.get_connection(body.connection_id) + if existing and existing.user_id == user.user_id: + await connector_service.connection_manager.update_connection( + connection_id=body.connection_id, + config=conn_config, + ) + connector_service.connection_manager.active_connectors.pop(body.connection_id, None) + return JSONResponse({"connection_id": body.connection_id, "status": "connected"}) + + connection_id = await connector_service.connection_manager.create_connection( + connector_type="aws_s3", + name="Amazon S3", + config=conn_config, + user_id=user.user_id, + ) + return JSONResponse({"connection_id": connection_id, "status": "connected"}) + + +async def s3_list_buckets( + connection_id: str, + connector_service=Depends(get_connector_service), + user: User = Depends(get_current_user), +): + """List all buckets accessible with the stored S3 credentials.""" + connection = await connector_service.connection_manager.get_connection(connection_id) + if not connection or connection.user_id != user.user_id: + return JSONResponse({"error": "Connection not found"}, status_code=404) + if connection.connector_type != "aws_s3": + return JSONResponse({"error": "Not an S3 connection"}, status_code=400) + + try: + s3 = create_s3_resource(connection.config) + buckets = [b.name for b 
in s3.buckets.all()] + return JSONResponse({"buckets": buckets}) + except Exception as exc: + return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) + + +async def s3_bucket_status( + connection_id: str, + connector_service=Depends(get_connector_service), + session_manager=Depends(get_session_manager), + user: User = Depends(get_current_user), +): + """Return all buckets for an S3 connection with their ingestion status.""" + connection = await connector_service.connection_manager.get_connection(connection_id) + if not connection or connection.user_id != user.user_id: + return JSONResponse({"error": "Connection not found"}, status_code=404) + if connection.connector_type != "aws_s3": + return JSONResponse({"error": "Not an S3 connection"}, status_code=400) + + # 1. List all buckets from S3 + try: + s3 = create_s3_resource(connection.config) + all_buckets = [b.name for b in s3.buckets.all()] + except Exception as exc: + logger.exception("Failed to list buckets from S3 for connection %s", connection_id) + return JSONResponse({"error": "Failed to list buckets"}, status_code=500) + + # 2. 
Count indexed documents per bucket from OpenSearch + ingested_counts: dict = {} + try: + opensearch_client = session_manager.get_user_opensearch_client( + user.user_id, user.jwt_token + ) + query_body = { + "size": 0, + "query": {"term": {"connector_type": "aws_s3"}}, + "aggs": { + "doc_ids": { + "terms": {"field": "document_id", "size": 50000} + } + }, + } + index_name = get_index_name() + os_resp = opensearch_client.search(index=index_name, body=query_body) + for bucket_entry in os_resp.get("aggregations", {}).get("doc_ids", {}).get("buckets", []): + doc_id = bucket_entry["key"] + if "::" in doc_id: + bucket_name = doc_id.split("::")[0] + ingested_counts[bucket_name] = ingested_counts.get(bucket_name, 0) + 1 + except Exception: + pass # OpenSearch unavailable — show zero counts + + result = [ + { + "name": bucket, + "ingested_count": ingested_counts.get(bucket, 0), + "is_synced": ingested_counts.get(bucket, 0) > 0, + } + for bucket in all_buckets + ] + return JSONResponse({"buckets": result}) diff --git a/src/connectors/aws_s3/models.py b/src/connectors/aws_s3/models.py new file mode 100644 index 000000000..582111bd2 --- /dev/null +++ b/src/connectors/aws_s3/models.py @@ -0,0 +1,13 @@ +"""Pydantic request/response models for AWS S3 API endpoints.""" + +from typing import List, Optional +from pydantic import BaseModel + + +class S3ConfigureBody(BaseModel): + access_key: Optional[str] = None + secret_key: Optional[str] = None + endpoint_url: Optional[str] = None + region: Optional[str] = None + bucket_names: Optional[List[str]] = None + connection_id: Optional[str] = None diff --git a/src/connectors/aws_s3/support.py b/src/connectors/aws_s3/support.py new file mode 100644 index 000000000..ccaa069b7 --- /dev/null +++ b/src/connectors/aws_s3/support.py @@ -0,0 +1,51 @@ +"""Support helpers for AWS S3 API endpoints. + +Contains pure (non-async) business logic for credential resolution and +config dict construction, keeping the route handlers thin. 
+""" + +import os +from typing import Dict, Optional, Tuple + +from .models import S3ConfigureBody + + +def build_s3_config( + body: S3ConfigureBody, + existing_config: Dict, +) -> Tuple[Dict, Optional[str]]: + """Resolve S3 credentials and build the connection config dict. + + Resolution order for each credential: request body → environment variable + → existing connection config. + + Returns: + (conn_config, None) on success + ({}, error_message) on validation failure + """ + access_key = ( + body.access_key + or os.getenv("AWS_ACCESS_KEY_ID") + or existing_config.get("access_key") + ) + secret_key = ( + body.secret_key + or os.getenv("AWS_SECRET_ACCESS_KEY") + or existing_config.get("secret_key") + ) + + if not access_key or not secret_key: + return {}, "access_key and secret_key are required" + + conn_config: Dict = { + "access_key": access_key.strip(), + "secret_key": secret_key.strip(), + } + if body.endpoint_url: + conn_config["endpoint_url"] = body.endpoint_url.strip() + if body.region: + conn_config["region"] = body.region.strip() + if body.bucket_names is not None: + conn_config["bucket_names"] = body.bucket_names + + return conn_config, None diff --git a/src/connectors/ibm_cos/__init__.py b/src/connectors/ibm_cos/__init__.py index 32f19defc..5bf01ad43 100644 --- a/src/connectors/ibm_cos/__init__.py +++ b/src/connectors/ibm_cos/__init__.py @@ -1,3 +1,17 @@ from .connector import IBMCOSConnector +from .models import IBMCOSConfigureBody +from .api import ( + ibm_cos_defaults, + ibm_cos_configure, + ibm_cos_list_buckets, + ibm_cos_bucket_status, +) -__all__ = ["IBMCOSConnector"] +__all__ = [ + "IBMCOSConnector", + "IBMCOSConfigureBody", + "ibm_cos_defaults", + "ibm_cos_configure", + "ibm_cos_list_buckets", + "ibm_cos_bucket_status", +] diff --git a/src/connectors/ibm_cos/api.py b/src/connectors/ibm_cos/api.py new file mode 100644 index 000000000..e6793f2f0 --- /dev/null +++ b/src/connectors/ibm_cos/api.py @@ -0,0 +1,201 @@ +"""FastAPI route handlers for IBM 
COS-specific endpoints.""" + +import os + +from fastapi import Depends +from fastapi.responses import JSONResponse + +from config.settings import get_index_name +from dependencies import get_connector_service, get_session_manager, get_current_user +from session_manager import User +from utils.logging_config import get_logger + +from .auth import create_ibm_cos_client, create_ibm_cos_resource +from .models import IBMCOSConfigureBody +from .support import build_ibm_cos_config + +logger = get_logger(__name__) + + +async def ibm_cos_defaults( + connector_service=Depends(get_connector_service), + user: User = Depends(get_current_user), +): + """Return current IBM COS env-var defaults for pre-filling the config dialog. + + Sensitive values (API key, HMAC secret) are masked — only whether they are + set is returned, not the actual values. + """ + api_key = os.getenv("IBM_COS_API_KEY", "") + service_instance_id = os.getenv("IBM_COS_SERVICE_INSTANCE_ID", "") + endpoint = os.getenv("IBM_COS_ENDPOINT", "") + hmac_access_key = os.getenv("IBM_COS_HMAC_ACCESS_KEY_ID", "") + hmac_secret_key = os.getenv("IBM_COS_HMAC_SECRET_ACCESS_KEY", "") + disable_iam = os.getenv("OPENRAG_IBM_COS_IAM_UI", "").lower() not in ("1", "true", "yes") + + connections = await connector_service.connection_manager.list_connections( + user_id=user.user_id, connector_type="ibm_cos" + ) + conn_config = connections[0].config or {} if connections else {} + + def _pick(conn_key, env_val): + """Prefer connection config value over env var.""" + return conn_config.get(conn_key) or env_val + + return JSONResponse({ + "api_key_set": bool(api_key or conn_config.get("api_key")), + "service_instance_id": _pick("service_instance_id", service_instance_id), + "endpoint": _pick("endpoint_url", endpoint), + "hmac_access_key_set": bool(hmac_access_key or conn_config.get("hmac_access_key")), + "hmac_secret_key_set": bool(hmac_secret_key or conn_config.get("hmac_secret_key")), + "auth_mode": conn_config.get( + "auth_mode", + 
"hmac" if (disable_iam or not (api_key or conn_config.get("api_key"))) else "iam", + ), + "disable_iam": disable_iam, + "bucket_names": conn_config.get("bucket_names", []), + "connection_id": connections[0].connection_id if connections else None, + }) + + +async def ibm_cos_configure( + body: IBMCOSConfigureBody, + connector_service=Depends(get_connector_service), + user: User = Depends(get_current_user), +): + """Create or update an IBM COS connection with explicit credentials. + + Tests the credentials by listing buckets, then persists the connection. + Credentials are stored in the connection config dict (not env vars) so + the connector works even without system-level env vars. + """ + existing_connections = await connector_service.connection_manager.list_connections( + user_id=user.user_id, connector_type="ibm_cos" + ) + existing_config = existing_connections[0].config if existing_connections else {} + + conn_config, error = build_ibm_cos_config(body, existing_config) + if error: + return JSONResponse({"error": error}, status_code=400) + + # Test credentials — IAM uses client (avoids ibm_botocore discovery-call bug), + # HMAC uses resource (S3-compatible, works with MinIO). 
+ try: + if conn_config.get("auth_mode", "iam") == "hmac": + cos = create_ibm_cos_resource(conn_config) + list(cos.buckets.all()) + else: + cos = create_ibm_cos_client(conn_config) + cos.list_buckets() + except Exception: + logger.exception("Failed to connect to IBM COS during credential test.") + return JSONResponse( + {"error": "Could not connect to IBM COS with the provided configuration."}, + status_code=400, + ) + + # Persist: update existing connection or create a new one + if body.connection_id: + existing = await connector_service.connection_manager.get_connection(body.connection_id) + if existing and existing.user_id == user.user_id: + await connector_service.connection_manager.update_connection( + connection_id=body.connection_id, + config=conn_config, + ) + connector_service.connection_manager.active_connectors.pop(body.connection_id, None) + return JSONResponse({"connection_id": body.connection_id, "status": "connected"}) + + connection_id = await connector_service.connection_manager.create_connection( + connector_type="ibm_cos", + name="IBM Cloud Object Storage", + config=conn_config, + user_id=user.user_id, + ) + return JSONResponse({"connection_id": connection_id, "status": "connected"}) + + +async def ibm_cos_list_buckets( + connection_id: str, + connector_service=Depends(get_connector_service), + user: User = Depends(get_current_user), +): + """List all buckets accessible with the stored IBM COS credentials.""" + connection = await connector_service.connection_manager.get_connection(connection_id) + if not connection or connection.user_id != user.user_id: + return JSONResponse({"error": "Connection not found"}, status_code=404) + if connection.connector_type != "ibm_cos": + return JSONResponse({"error": "Not an IBM COS connection"}, status_code=400) + + try: + cfg = connection.config + if cfg.get("auth_mode", "iam") == "hmac": + cos = create_ibm_cos_resource(cfg) + buckets = [b.name for b in cos.buckets.all()] + else: + cos = 
create_ibm_cos_client(cfg) + buckets = [b["Name"] for b in cos.list_buckets().get("Buckets", [])] + return JSONResponse({"buckets": buckets}) + except Exception as exc: + return JSONResponse({"error": "Failed to list buckets"}, status_code=500) + + +async def ibm_cos_bucket_status( + connection_id: str, + connector_service=Depends(get_connector_service), + session_manager=Depends(get_session_manager), + user: User = Depends(get_current_user), +): + """Return all buckets for an IBM COS connection with their ingestion status. + + Each entry includes the bucket name, whether it has been ingested (is_synced), + and the count of indexed documents from that bucket. + """ + connection = await connector_service.connection_manager.get_connection(connection_id) + if not connection or connection.user_id != user.user_id: + return JSONResponse({"error": "Connection not found"}, status_code=404) + if connection.connector_type != "ibm_cos": + return JSONResponse({"error": "Not an IBM COS connection"}, status_code=400) + + # 1. List all buckets from COS + try: + cfg = connection.config + cos = create_ibm_cos_resource(cfg) + all_buckets = [b.name for b in cos.buckets.all()] + except Exception as exc: + logger.exception("Failed to list IBM COS buckets for connection %s", connection_id) + return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) + + # 2. 
Count indexed documents per bucket from OpenSearch + ingested_counts: dict = {} + try: + opensearch_client = session_manager.get_user_opensearch_client( + user.user_id, user.jwt_token + ) + query_body = { + "size": 0, + "query": {"term": {"connector_type": "ibm_cos"}}, + "aggs": { + "doc_ids": { + "terms": {"field": "document_id", "size": 50000} + } + }, + } + index_name = get_index_name() + os_resp = opensearch_client.search(index=index_name, body=query_body) + for bucket_entry in os_resp.get("aggregations", {}).get("doc_ids", {}).get("buckets", []): + doc_id = bucket_entry["key"] + if "::" in doc_id: + bucket_name = doc_id.split("::")[0] + ingested_counts[bucket_name] = ingested_counts.get(bucket_name, 0) + 1 + except Exception: + pass # OpenSearch unavailable — show zero counts + + result = [ + { + "name": bucket, + "ingested_count": ingested_counts.get(bucket, 0), + "is_synced": ingested_counts.get(bucket, 0) > 0, + } + for bucket in all_buckets + ] + return JSONResponse({"buckets": result}) diff --git a/src/connectors/ibm_cos/models.py b/src/connectors/ibm_cos/models.py new file mode 100644 index 000000000..59f2324f9 --- /dev/null +++ b/src/connectors/ibm_cos/models.py @@ -0,0 +1,20 @@ +"""Pydantic request/response models for IBM COS API endpoints.""" + +from typing import List, Optional +from pydantic import BaseModel + + +class IBMCOSConfigureBody(BaseModel): + auth_mode: str # "iam" or "hmac" + endpoint: str + # IAM fields + api_key: Optional[str] = None + service_instance_id: Optional[str] = None + auth_endpoint: Optional[str] = None + # HMAC fields + hmac_access_key: Optional[str] = None + hmac_secret_key: Optional[str] = None + # Optional bucket selection + bucket_names: Optional[List[str]] = None + # Optional: update an existing connection + connection_id: Optional[str] = None diff --git a/src/connectors/ibm_cos/support.py b/src/connectors/ibm_cos/support.py new file mode 100644 index 000000000..25e5e5bcb --- /dev/null +++ 
b/src/connectors/ibm_cos/support.py @@ -0,0 +1,68 @@ +"""Support helpers for IBM COS API endpoints. + +Contains pure (non-async) business logic for credential resolution and +config dict construction, keeping the route handlers thin. +""" + +import os +from typing import Dict, Optional, Tuple + +from .models import IBMCOSConfigureBody + + +def build_ibm_cos_config( + body: IBMCOSConfigureBody, + existing_config: Dict, +) -> Tuple[Dict, Optional[str]]: + """Resolve IBM COS credentials and build the connection config dict. + + Resolution order for each credential: request body → environment variable + → existing connection config. + + Returns: + (conn_config, None) on success + ({}, error_message) on validation failure + """ + conn_config: Dict = { + "auth_mode": body.auth_mode, + "endpoint_url": body.endpoint, + } + + if body.auth_mode == "iam": + api_key = ( + body.api_key + or os.getenv("IBM_COS_API_KEY") + or existing_config.get("api_key") + ) + svc_id = ( + body.service_instance_id + or os.getenv("IBM_COS_SERVICE_INSTANCE_ID") + or existing_config.get("service_instance_id") + ) + if not api_key or not svc_id: + return {}, "IAM mode requires api_key and service_instance_id" + conn_config["api_key"] = api_key + conn_config["service_instance_id"] = svc_id + if body.auth_endpoint: + conn_config["auth_endpoint"] = body.auth_endpoint + else: + # HMAC mode + hmac_access = ( + body.hmac_access_key + or os.getenv("IBM_COS_HMAC_ACCESS_KEY_ID") + or existing_config.get("hmac_access_key") + ) + hmac_secret = ( + body.hmac_secret_key + or os.getenv("IBM_COS_HMAC_SECRET_ACCESS_KEY") + or existing_config.get("hmac_secret_key") + ) + if not hmac_access or not hmac_secret: + return {}, "HMAC mode requires hmac_access_key and hmac_secret_key" + conn_config["hmac_access_key"] = hmac_access + conn_config["hmac_secret_key"] = hmac_secret + + if body.bucket_names is not None: + conn_config["bucket_names"] = body.bucket_names + + return conn_config, None diff --git a/src/main.py 
b/src/main.py index 01f5c479f..e48c41744 100644 --- a/src/main.py +++ b/src/main.py @@ -38,6 +38,18 @@ ) from api.connector_router import ConnectorRouter +from connectors.ibm_cos.api import ( + ibm_cos_defaults, + ibm_cos_configure, + ibm_cos_list_buckets, + ibm_cos_bucket_status, +) +from connectors.aws_s3.api import ( + s3_defaults, + s3_configure, + s3_list_buckets, + s3_bucket_status, +) from services.api_key_service import APIKeyService from api import keys as api_keys from api.v1 import chat as v1_chat, search as v1_search, documents as v1_documents, settings as v1_settings, models as v1_models, knowledge_filters as v1_knowledge_filters @@ -756,14 +768,15 @@ async def create_app(): # Connector endpoints app.add_api_route("/connectors", connectors.list_connectors, methods=["GET"], tags=["internal"]) # IBM COS-specific routes (registered before generic /{connector_type}/... to avoid shadowing) - app.add_api_route("/connectors/ibm_cos/defaults", connectors.ibm_cos_defaults, methods=["GET"], tags=["internal"]) - app.add_api_route("/connectors/ibm_cos/configure", connectors.ibm_cos_configure, methods=["POST"], tags=["internal"]) - app.add_api_route("/connectors/ibm_cos/{connection_id}/buckets", connectors.ibm_cos_list_buckets, methods=["GET"], tags=["internal"]) - app.add_api_route("/connectors/ibm_cos/{connection_id}/bucket-status", connectors.ibm_cos_bucket_status, methods=["GET"], tags=["internal"]) - app.add_api_route("/connectors/aws_s3/defaults", connectors.s3_defaults, methods=["GET"], tags=["internal"]) - app.add_api_route("/connectors/aws_s3/configure", connectors.s3_configure, methods=["POST"], tags=["internal"]) - app.add_api_route("/connectors/aws_s3/{connection_id}/buckets", connectors.s3_list_buckets, methods=["GET"], tags=["internal"]) - app.add_api_route("/connectors/aws_s3/{connection_id}/bucket-status", connectors.s3_bucket_status, methods=["GET"], tags=["internal"]) + app.add_api_route("/connectors/ibm_cos/defaults", ibm_cos_defaults, 
methods=["GET"], tags=["internal"]) + app.add_api_route("/connectors/ibm_cos/configure", ibm_cos_configure, methods=["POST"], tags=["internal"]) + app.add_api_route("/connectors/ibm_cos/{connection_id}/buckets", ibm_cos_list_buckets, methods=["GET"], tags=["internal"]) + app.add_api_route("/connectors/ibm_cos/{connection_id}/bucket-status", ibm_cos_bucket_status, methods=["GET"], tags=["internal"]) + # AWS S3-specific routes (registered before generic /{connector_type}/... to avoid shadowing) + app.add_api_route("/connectors/aws_s3/defaults", s3_defaults, methods=["GET"], tags=["internal"]) + app.add_api_route("/connectors/aws_s3/configure", s3_configure, methods=["POST"], tags=["internal"]) + app.add_api_route("/connectors/aws_s3/{connection_id}/buckets", s3_list_buckets, methods=["GET"], tags=["internal"]) + app.add_api_route("/connectors/aws_s3/{connection_id}/bucket-status", s3_bucket_status, methods=["GET"], tags=["internal"]) app.add_api_route("/connectors/{connector_type}/sync", connectors.connector_sync, methods=["POST"], tags=["internal"]) app.add_api_route("/connectors/sync-all", connectors.sync_all_connectors, methods=["POST"], tags=["internal"]) app.add_api_route("/connectors/{connector_type}/status", connectors.connector_status, methods=["GET"], tags=["internal"]) From 54b10f018a2e439f9359d2794dae0c8ea05ceccc Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Tue, 10 Mar 2026 01:50:10 -0500 Subject: [PATCH 13/25] Improve S3/IBM COS error handling and logging Replace returning raw exception details to clients with generic error messages and add logger.exception calls to capture stack traces. Changes affect aws_s3 (s3_configure, s3_list_buckets) and ibm_cos (ibm_cos_list_buckets, ibm_cos_bucket_status) handlers: removed f-string exception exposures, standardized response bodies, and added contextual logging (including connection_id) to aid server-side debugging without leaking internals to API consumers. 
--- src/connectors/aws_s3/api.py | 10 ++++++---- src/connectors/ibm_cos/api.py | 7 ++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/connectors/aws_s3/api.py b/src/connectors/aws_s3/api.py index 2612ca698..178c78ab5 100644 --- a/src/connectors/aws_s3/api.py +++ b/src/connectors/aws_s3/api.py @@ -70,9 +70,10 @@ async def s3_configure( try: s3 = create_s3_resource(conn_config) list(s3.buckets.all()) - except Exception as exc: + except Exception: + logger.exception("Failed to connect to S3 during credential test.") return JSONResponse( - {"error": f"Could not connect to S3: {exc}"}, + {"error": "Could not connect to S3 with the provided configuration."}, status_code=400, ) @@ -112,8 +113,9 @@ async def s3_list_buckets( s3 = create_s3_resource(connection.config) buckets = [b.name for b in s3.buckets.all()] return JSONResponse({"buckets": buckets}) - except Exception as exc: - return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) + except Exception: + logger.exception("Failed to list S3 buckets for connection %s", connection_id) + return JSONResponse({"error": "Failed to list buckets"}, status_code=500) async def s3_bucket_status( diff --git a/src/connectors/ibm_cos/api.py b/src/connectors/ibm_cos/api.py index e6793f2f0..ee4ef97b8 100644 --- a/src/connectors/ibm_cos/api.py +++ b/src/connectors/ibm_cos/api.py @@ -135,7 +135,8 @@ async def ibm_cos_list_buckets( cos = create_ibm_cos_client(cfg) buckets = [b["Name"] for b in cos.list_buckets().get("Buckets", [])] return JSONResponse({"buckets": buckets}) - except Exception as exc: + except Exception: + logger.exception("Failed to list IBM COS buckets for connection %s", connection_id) return JSONResponse({"error": "Failed to list buckets"}, status_code=500) @@ -161,9 +162,9 @@ async def ibm_cos_bucket_status( cfg = connection.config cos = create_ibm_cos_resource(cfg) all_buckets = [b.name for b in cos.buckets.all()] - except Exception as exc: + except Exception: 
logger.exception("Failed to list IBM COS buckets for connection %s", connection_id) - return JSONResponse({"error": f"Failed to list buckets: {exc}"}, status_code=500) + return JSONResponse({"error": "Failed to list buckets"}, status_code=500) # 2. Count indexed documents per bucket from OpenSearch ingested_counts: dict = {} From 44ec1cf88f1a0d3c045149d9fa2836defb76ee84 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Tue, 10 Mar 2026 01:53:18 -0500 Subject: [PATCH 14/25] Add IBM COS auth endpoint and AWS S3/region envs Introduce an optional ibm_cos_auth_endpoint in EnvConfig to allow overriding the IBM IAM token endpoint. Add IBM_COS_AUTH_ENDPOINT to the env-to-config mapping and include it in the list of environment variables written out. Also add AWS_S3_ENDPOINT and AWS_REGION to the environment write-out list so those settings are persisted. --- src/tui/managers/env_manager.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/tui/managers/env_manager.py b/src/tui/managers/env_manager.py index 7fba12917..c9a77dd8d 100644 --- a/src/tui/managers/env_manager.py +++ b/src/tui/managers/env_manager.py @@ -67,6 +67,7 @@ class EnvConfig: ibm_cos_endpoint: str = "" ibm_cos_hmac_access_key_id: str = "" ibm_cos_hmac_secret_access_key: str = "" + ibm_cos_auth_endpoint: str = "" # Optional: override IAM token endpoint # Langfuse settings (optional) langfuse_secret_key: str = "" @@ -210,6 +211,7 @@ def load_existing_env(self) -> bool: "IBM_COS_ENDPOINT": "ibm_cos_endpoint", "IBM_COS_HMAC_ACCESS_KEY_ID": "ibm_cos_hmac_access_key_id", "IBM_COS_HMAC_SECRET_ACCESS_KEY": "ibm_cos_hmac_secret_access_key", # pragma: allowlist secret + "IBM_COS_AUTH_ENDPOINT": "ibm_cos_auth_endpoint", "OPENRAG_DOCUMENTS_PATHS": "openrag_documents_paths", "OPENRAG_DOCUMENTS_PATH": "openrag_documents_path", "OPENRAG_KEYS_PATH": "openrag_keys_path", @@ -536,12 +538,15 @@ def save_env_file(self) -> bool: ("WEBHOOK_BASE_URL", self.config.webhook_base_url), ("AWS_ACCESS_KEY_ID", 
self.config.aws_access_key_id), ("AWS_SECRET_ACCESS_KEY", self.config.aws_secret_access_key), + ("AWS_S3_ENDPOINT", self.config.aws_s3_endpoint), + ("AWS_REGION", self.config.aws_region), ("LANGFLOW_PUBLIC_URL", self.config.langflow_public_url), ("IBM_COS_API_KEY", self.config.ibm_cos_api_key), ("IBM_COS_SERVICE_INSTANCE_ID", self.config.ibm_cos_service_instance_id), ("IBM_COS_ENDPOINT", self.config.ibm_cos_endpoint), ("IBM_COS_HMAC_ACCESS_KEY_ID", self.config.ibm_cos_hmac_access_key_id), ("IBM_COS_HMAC_SECRET_ACCESS_KEY", self.config.ibm_cos_hmac_secret_access_key), + ("IBM_COS_AUTH_ENDPOINT", self.config.ibm_cos_auth_endpoint), ] optional_written = False From 37227cc74ef353a31a93f887b51c55cbe43fd1b3 Mon Sep 17 00:00:00 2001 From: Mike Fortman Date: Wed, 11 Mar 2026 14:23:16 -0500 Subject: [PATCH 15/25] aws logo --- frontend/components/icons/aws-logo.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/components/icons/aws-logo.tsx b/frontend/components/icons/aws-logo.tsx index 4ed347b3e..221eea0a9 100644 --- a/frontend/components/icons/aws-logo.tsx +++ b/frontend/components/icons/aws-logo.tsx @@ -11,7 +11,7 @@ const AwsLogo = ({ className }: { className?: string }) => { AWS Logo Date: Wed, 11 Mar 2026 16:11:44 -0500 Subject: [PATCH 16/25] update ingest page --- frontend/app/upload/[provider]/page.tsx | 338 ++++++++++-------------- 1 file changed, 137 insertions(+), 201 deletions(-) diff --git a/frontend/app/upload/[provider]/page.tsx b/frontend/app/upload/[provider]/page.tsx index 1b43564df..ace13be5a 100644 --- a/frontend/app/upload/[provider]/page.tsx +++ b/frontend/app/upload/[provider]/page.tsx @@ -1,17 +1,18 @@ "use client"; +import { useQueryClient } from "@tanstack/react-query"; import { AlertCircle, ArrowLeft, RefreshCw } from "lucide-react"; import { useParams, useRouter } from "next/navigation"; import { useState } from "react"; import { toast } from "sonner"; -import { useQueryClient } from "@tanstack/react-query"; import { 
useSyncConnector } from "@/app/api/mutations/useSyncConnector"; import { useGetConnectorsQuery } from "@/app/api/queries/useGetConnectorsQuery"; import { useGetConnectorTokenQuery } from "@/app/api/queries/useGetConnectorTokenQuery"; import { useIBMCOSBucketStatusQuery } from "@/app/api/queries/useIBMCOSBucketStatusQuery"; import { useS3BucketStatusQuery } from "@/app/api/queries/useS3BucketStatusQuery"; import { type CloudFile, UnifiedCloudPicker } from "@/components/cloud-picker"; -import type { IngestSettings } from "@/components/cloud-picker/types"; +import { IngestSettings } from "@/components/cloud-picker/ingest-settings"; +import type { IngestSettings as IngestSettingsType } from "@/components/cloud-picker/types"; import { Button } from "@/components/ui/button"; import { Tooltip, @@ -24,32 +25,45 @@ import { useTask } from "@/contexts/task-context"; const DIRECT_SYNC_PROVIDERS = ["ibm_cos", "aws_s3"]; // --------------------------------------------------------------------------- -// IBM COS bucket list with sync status +// Shared bucket view — used by both IBM COS and S3 // --------------------------------------------------------------------------- -function IBMCOSBucketView({ +function BucketView({ connector, + buckets, + isLoading, + bucketsError, + onRefetch, + invalidateQueryKey, syncMutation, addTask, onBack, onDone, }: { connector: any; + buckets: Array<{ name: string; ingested_count: number }> | undefined; + isLoading: boolean; + bucketsError?: Error | null; + onRefetch: () => void; + invalidateQueryKey: readonly unknown[]; syncMutation: ReturnType; addTask: (id: string) => void; onBack: () => void; onDone: () => void; }) { const queryClient = useQueryClient(); - const { data: buckets, isLoading, refetch } = useIBMCOSBucketStatusQuery( - connector.connectionId, - { enabled: true }, - ); - const [syncingBucket, setSyncingBucket] = useState(null); + const [ingestSettings, setIngestSettings] = useState({ + chunkSize: 1000, + chunkOverlap: 200, + ocr: 
false, + pictureDescriptions: false, + embeddingModel: "text-embedding-3-small", + }); + const [isSettingsOpen, setIsSettingsOpen] = useState(false); - const invalidateBucketStatus = () => { - queryClient.invalidateQueries({ queryKey: ["ibm-cos-bucket-status", connector.connectionId] }); + const invalidate = () => { + queryClient.invalidateQueries({ queryKey: invalidateQueryKey }); }; const syncAll = () => { @@ -60,11 +74,12 @@ function IBMCOSBucketView({ connection_id: connector.connectionId!, selected_files: [], sync_all: true, + settings: ingestSettings, }, }, { onSuccess: (result) => { - invalidateBucketStatus(); + invalidate(); if (result.task_ids?.length) { addTask(result.task_ids[0]); onDone(); @@ -88,12 +103,13 @@ function IBMCOSBucketView({ connection_id: connector.connectionId!, selected_files: [], bucket_filter: [bucketName], + settings: ingestSettings, }, }, { onSuccess: (result) => { setSyncingBucket(null); - invalidateBucketStatus(); + invalidate(); if (result.task_ids?.length) { addTask(result.task_ids[0]); onDone(); @@ -125,35 +141,29 @@ function IBMCOSBucketView({

Select a bucket to ingest, or sync everything at once.

-
- - -
+
{isLoading ? (
+ ) : bucketsError ? ( +
+ {bucketsError.message || + "Failed to load buckets. Check your credentials and endpoint."} +
) : !buckets?.length ? (
- No buckets found. Check your IBM COS credentials and endpoint. + No buckets found. Check your credentials and endpoint.
) : (
@@ -162,15 +172,14 @@ function IBMCOSBucketView({ key={bucket.name} className="flex items-center justify-between px-4 py-3" > -
-
-

{bucket.name}

- {bucket.ingested_count > 0 && ( -

- {bucket.ingested_count} document{bucket.ingested_count !== 1 ? "s" : ""} ingested -

- )} -
+
+

{bucket.name}

+ {bucket.ingested_count > 0 && ( +

+ {bucket.ingested_count} document + {bucket.ingested_count !== 1 ? "s" : ""} ingested +

+ )}
+ +
+
+ + +
); } // --------------------------------------------------------------------------- -// Amazon S3 bucket list with sync status +// IBM COS wrapper // --------------------------------------------------------------------------- -function S3BucketView({ +function IBMCOSBucketView({ connector, syncMutation, addTask, @@ -210,160 +248,62 @@ function S3BucketView({ onBack: () => void; onDone: () => void; }) { - const queryClient = useQueryClient(); - const { data: buckets, isLoading, error: bucketsError, refetch } = useS3BucketStatusQuery( - connector.connectionId, - { enabled: true }, + const { + data: buckets, + isLoading, + refetch, + } = useIBMCOSBucketStatusQuery(connector.connectionId, { enabled: true }); + return ( + ); +} - const [syncingBucket, setSyncingBucket] = useState(null); - - const invalidateBucketStatus = () => { - queryClient.invalidateQueries({ queryKey: ["s3-bucket-status", connector.connectionId] }); - }; - - const syncAll = () => { - syncMutation.mutate( - { - connectorType: connector.type, - body: { - connection_id: connector.connectionId!, - selected_files: [], - sync_all: true, - }, - }, - { - onSuccess: (result) => { - invalidateBucketStatus(); - if (result.task_ids?.length) { - addTask(result.task_ids[0]); - onDone(); - } else { - toast.info("No files found in any bucket."); - } - }, - onError: (err) => { - toast.error(err instanceof Error ? 
err.message : "Sync failed"); - }, - }, - ); - }; - - const syncBucket = (bucketName: string) => { - setSyncingBucket(bucketName); - syncMutation.mutate( - { - connectorType: connector.type, - body: { - connection_id: connector.connectionId!, - selected_files: [], - bucket_filter: [bucketName], - }, - }, - { - onSuccess: (result) => { - setSyncingBucket(null); - invalidateBucketStatus(); - if (result.task_ids?.length) { - addTask(result.task_ids[0]); - onDone(); - } else { - toast.info(`No files found in bucket "${bucketName}".`); - } - }, - onError: (err) => { - setSyncingBucket(null); - toast.error(err instanceof Error ? err.message : "Sync failed"); - }, - }, - ); - }; +// --------------------------------------------------------------------------- +// Amazon S3 wrapper +// --------------------------------------------------------------------------- +function S3BucketView({ + connector, + syncMutation, + addTask, + onBack, + onDone, +}: { + connector: any; + syncMutation: ReturnType; + addTask: (id: string) => void; + onBack: () => void; + onDone: () => void; +}) { + const { + data: buckets, + isLoading, + error: bucketsError, + refetch, + } = useS3BucketStatusQuery(connector.connectionId, { enabled: true }); return ( - <> -
- -

- Add from {connector.name} -

-
- -
-
-

- Select a bucket to ingest, or sync everything at once. -

-
- - -
-
- - {isLoading ? ( -
-
-
- ) : bucketsError ? ( -
- {(bucketsError as Error).message || "Failed to load buckets. Check your S3 credentials and endpoint."} -
- ) : !buckets?.length ? ( -
- No buckets found. Check your S3 credentials and endpoint. -
- ) : ( -
- {buckets.map((bucket) => ( -
-
-
-

{bucket.name}

- {bucket.ingested_count > 0 && ( -

- {bucket.ingested_count} document{bucket.ingested_count !== 1 ? "s" : ""} ingested -

- )} -
-
- -
- ))} -
- )} -
- + ); } @@ -373,7 +313,7 @@ export default function UploadProviderPage() { const params = useParams(); const router = useRouter(); const provider = params.provider as string; - const { addTask, tasks } = useTask(); + const { addTask } = useTask(); const { data: connectors = [], @@ -406,10 +346,7 @@ export default function UploadProviderPage() { const syncMutation = useSyncConnector(); const [selectedFiles, setSelectedFiles] = useState([]); - const [currentSyncTaskId, setCurrentSyncTaskId] = useState( - null, - ); - const [ingestSettings, setIngestSettings] = useState({ + const [ingestSettings, setIngestSettings] = useState({ chunkSize: 1000, chunkOverlap: 200, ocr: false, @@ -459,7 +396,6 @@ export default function UploadProviderPage() { if (taskIds && taskIds.length > 0) { const taskId = taskIds[0]; // Use the first task ID addTask(taskId); - setCurrentSyncTaskId(taskId); // Redirect to knowledge page already to show the syncing document router.push("/knowledge"); } From d820160416ed1d8b916a6fc390df5a80ef7d19e0 Mon Sep 17 00:00:00 2001 From: Mike Fortman Date: Wed, 11 Mar 2026 16:15:42 -0500 Subject: [PATCH 17/25] merge fix --- .../app/settings/_components/connector-card.tsx | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/frontend/app/settings/_components/connector-card.tsx b/frontend/app/settings/_components/connector-card.tsx index a2229ff02..5bdfe7694 100644 --- a/frontend/app/settings/_components/connector-card.tsx +++ b/frontend/app/settings/_components/connector-card.tsx @@ -133,19 +133,6 @@ export default function ConnectorCard({ )} )} -
) : onConfigure ? ( )}
- ) : onConfigure ? ( - - ) : connector?.available ? ( - ) : (

For more details see our

diff --git a/frontend/app/upload/[provider]/page.tsx b/frontend/app/upload/[provider]/page.tsx index ace13be5a..2e73bb838 100644 --- a/frontend/app/upload/[provider]/page.tsx +++ b/frontend/app/upload/[provider]/page.tsx @@ -1,7 +1,7 @@ "use client"; import { useQueryClient } from "@tanstack/react-query"; -import { AlertCircle, ArrowLeft, RefreshCw } from "lucide-react"; +import { AlertCircle, ArrowLeft, FolderOpen, RefreshCw } from "lucide-react"; import { useParams, useRouter } from "next/navigation"; import { useState } from "react"; import { toast } from "sonner"; @@ -52,7 +52,9 @@ function BucketView({ onDone: () => void; }) { const queryClient = useQueryClient(); - const [syncingBucket, setSyncingBucket] = useState(null); + const [selectedBuckets, setSelectedBuckets] = useState>( + new Set(), + ); const [ingestSettings, setIngestSettings] = useState({ chunkSize: 1000, chunkOverlap: 200, @@ -66,59 +68,40 @@ function BucketView({ queryClient.invalidateQueries({ queryKey: invalidateQueryKey }); }; - const syncAll = () => { - syncMutation.mutate( - { - connectorType: connector.type, - body: { - connection_id: connector.connectionId!, - selected_files: [], - sync_all: true, - settings: ingestSettings, - }, - }, - { - onSuccess: (result) => { - invalidate(); - if (result.task_ids?.length) { - addTask(result.task_ids[0]); - onDone(); - } else { - toast.info("No files found in any bucket."); - } - }, - onError: (err) => { - toast.error(err instanceof Error ? 
err.message : "Sync failed"); - }, - }, - ); + const toggleBucket = (bucketName: string) => { + setSelectedBuckets((prev) => { + const next = new Set(prev); + if (next.has(bucketName)) { + next.delete(bucketName); + } else { + next.add(bucketName); + } + return next; + }); }; - const syncBucket = (bucketName: string) => { - setSyncingBucket(bucketName); + const ingestSelected = () => { syncMutation.mutate( { connectorType: connector.type, body: { connection_id: connector.connectionId!, selected_files: [], - bucket_filter: [bucketName], + bucket_filter: Array.from(selectedBuckets), settings: ingestSettings, }, }, { onSuccess: (result) => { - setSyncingBucket(null); invalidate(); if (result.task_ids?.length) { addTask(result.task_ids[0]); onDone(); } else { - toast.info(`No files found in bucket "${bucketName}".`); + toast.info("No files found in the selected buckets."); } }, onError: (err) => { - setSyncingBucket(null); toast.error(err instanceof Error ? err.message : "Sync failed"); }, }, @@ -139,17 +122,32 @@ function BucketView({

- Select a bucket to ingest, or sync everything at once. + Select buckets to ingest.

- +
+ + +
{isLoading ? ( @@ -167,34 +165,54 @@ function BucketView({
) : (
- {buckets.map((bucket) => ( -
-
-

{bucket.name}

- {bucket.ingested_count > 0 && ( -

- {bucket.ingested_count} document - {bucket.ingested_count !== 1 ? "s" : ""} ingested -

- )} -
- -
- ))} +
+ {isSelected && ( + + + + )} +
+
+
+ +
+
+

+ {bucket.name} +

+ {bucket.ingested_count > 0 && ( +

+ {bucket.ingested_count} document + {bucket.ingested_count !== 1 ? "s" : ""} ingested +

+ )} +
+
+
+ ); + })}
)} @@ -217,13 +235,15 @@ function BucketView({
diff --git a/frontend/components/icons/ibm-cos-icon.tsx b/frontend/components/icons/ibm-cos-icon.tsx index a8fc4b324..1637afbf3 100644 --- a/frontend/components/icons/ibm-cos-icon.tsx +++ b/frontend/components/icons/ibm-cos-icon.tsx @@ -1,63 +1,64 @@ export default function IBMCOSIcon(props: React.SVGProps) { return ( + + + + + + + + + + - - + + - - + + - - - - - - - - - - - ); } From 985bb73ebf616bb1cca860e45794fd8b42294aae Mon Sep 17 00:00:00 2001 From: Mike Fortman Date: Fri, 13 Mar 2026 15:40:48 -0500 Subject: [PATCH 19/25] select buttons --- frontend/app/upload/[provider]/page.tsx | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/frontend/app/upload/[provider]/page.tsx b/frontend/app/upload/[provider]/page.tsx index 2e73bb838..832d313c7 100644 --- a/frontend/app/upload/[provider]/page.tsx +++ b/frontend/app/upload/[provider]/page.tsx @@ -125,6 +125,15 @@ function BucketView({ Select buckets to ingest.

+ {selectedBuckets.size > 0 && ( + + )}
From f05568c630d270002c17d61adde7c408f5319d47 Mon Sep 17 00:00:00 2001 From: Rico Furtado Date: Fri, 13 Mar 2026 16:50:26 -0400 Subject: [PATCH 20/25] fix: Enhance connector availability checks to be user-scoped by adding user_id parameter --- src/api/connectors.py | 4 +-- src/connectors/connection_manager.py | 51 ++++++++++++++++++++++------ 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/src/api/connectors.py b/src/api/connectors.py index 6d2431f9b..ea2e50eaa 100644 --- a/src/api/connectors.py +++ b/src/api/connectors.py @@ -101,8 +101,8 @@ async def list_connectors( ): """List available connector types with metadata""" try: - connector_types = ( - connector_service.connection_manager.get_available_connector_types() + connector_types = connector_service.connection_manager.get_available_connector_types( + user_id=user.user_id ) return JSONResponse({"connectors": connector_types}) except Exception as e: diff --git a/src/connectors/connection_manager.py b/src/connectors/connection_manager.py index c172a93ca..0600b11a3 100644 --- a/src/connectors/connection_manager.py +++ b/src/connectors/connection_manager.py @@ -332,43 +332,71 @@ async def get_connector(self, connection_id: str) -> Optional[BaseConnector]: logger.warning(f"Authentication failed for {connection_id}") return None - def get_available_connector_types(self) -> Dict[str, Dict[str, Any]]: - """Get available connector types with their metadata""" + def get_available_connector_types( + self, user_id: Optional[str] = None + ) -> Dict[str, Dict[str, Any]]: + """Get available connector types with their metadata. + + Availability is user-scoped when ``user_id`` is provided: + a connector is considered available if either: + 1) its required env credentials are present, or + 2) the user has an active saved connection with usable credentials. 
+ """ return { "google_drive": { "name": GoogleDriveConnector.CONNECTOR_NAME, "description": GoogleDriveConnector.CONNECTOR_DESCRIPTION, "icon": GoogleDriveConnector.CONNECTOR_ICON, - "available": self._is_connector_available("google_drive"), + "available": self._is_connector_available("google_drive", user_id), }, "sharepoint": { "name": SharePointConnector.CONNECTOR_NAME, "description": SharePointConnector.CONNECTOR_DESCRIPTION, "icon": SharePointConnector.CONNECTOR_ICON, - "available": self._is_connector_available("sharepoint"), + "available": self._is_connector_available("sharepoint", user_id), }, "onedrive": { "name": OneDriveConnector.CONNECTOR_NAME, "description": OneDriveConnector.CONNECTOR_DESCRIPTION, "icon": OneDriveConnector.CONNECTOR_ICON, - "available": self._is_connector_available("onedrive"), + "available": self._is_connector_available("onedrive", user_id), }, "ibm_cos": { "name": IBMCOSConnector.CONNECTOR_NAME, "description": IBMCOSConnector.CONNECTOR_DESCRIPTION, "icon": IBMCOSConnector.CONNECTOR_ICON, - "available": self._is_connector_available("ibm_cos"), + "available": self._is_connector_available("ibm_cos", user_id), }, "aws_s3": { "name": S3Connector.CONNECTOR_NAME, "description": S3Connector.CONNECTOR_DESCRIPTION, "icon": S3Connector.CONNECTOR_ICON, - "available": self._is_connector_available("aws_s3"), + "available": self._is_connector_available("aws_s3", user_id), }, } - def _is_connector_available(self, connector_type: str) -> bool: - """Check if a connector type is available (has required env vars)""" + def _has_saved_credentials_for_user( + self, connector_type: str, user_id: Optional[str] + ) -> bool: + """Check if user has an active saved connection with usable credentials.""" + for connection in self.connections.values(): + if connection.connector_type != connector_type or not connection.is_active: + continue + if user_id is not None and connection.user_id != user_id: + continue + try: + connector = self._create_connector(connection) 
+ connector.get_client_id() + connector.get_client_secret() + return True + except (ValueError, NotImplementedError, RuntimeError): + continue + return False + + def _is_connector_available( + self, connector_type: str, user_id: Optional[str] = None + ) -> bool: + """Check whether connector is available for use by the given user.""" try: temp_config = ConnectionConfig( connection_id="temp", @@ -381,8 +409,9 @@ def _is_connector_available(self, connector_type: str) -> bool: connector.get_client_id() connector.get_client_secret() return True - except (ValueError, NotImplementedError): - return False + except (ValueError, NotImplementedError, RuntimeError): + # Fallback: saved per-user connection config (e.g. aws_s3 / ibm_cos) + return self._has_saved_credentials_for_user(connector_type, user_id) def _create_connector(self, config: ConnectionConfig) -> BaseConnector: """Factory method to create connector instances""" From 9fee2e80090296715f9bc8415beb8a178562f63a Mon Sep 17 00:00:00 2001 From: Mike Fortman Date: Mon, 16 Mar 2026 09:08:33 -0500 Subject: [PATCH 21/25] Add logout support --- src/api/auth.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/api/auth.py b/src/api/auth.py index 7c5e97a75..b88f88991 100644 --- a/src/api/auth.py +++ b/src/api/auth.py @@ -108,15 +108,18 @@ async def auth_logout( auth_service=Depends(get_auth_service), user: User = Depends(get_current_user), ): - """Logout user by clearing auth cookie""" + """Logout user by clearing auth cookie(s)""" + from config.settings import IBM_AUTH_ENABLED, IBM_SESSION_COOKIE_NAME + await TelemetryClient.send_event(Category.AUTHENTICATION, MessageId.ORB_AUTH_LOGOUT) response = JSONResponse( {"status": "logged_out", "message": "Successfully logged out"} ) - # Clear the auth cookie - response.delete_cookie( - key="auth_token", httponly=True, secure=False, samesite="lax" - ) + if IBM_AUTH_ENABLED: + response.delete_cookie(key=IBM_SESSION_COOKIE_NAME, httponly=True, 
samesite="lax") + response.delete_cookie(key="ibm-auth-basic", httponly=True, samesite="lax") + else: + response.delete_cookie(key="auth_token", httponly=True, secure=False, samesite="lax") return response From 2513cdc61c72c7a8cbc43f6f68584e220eb3a775 Mon Sep 17 00:00:00 2001 From: Mike Fortman Date: Mon, 16 Mar 2026 09:10:37 -0500 Subject: [PATCH 22/25] Revert "Add logout support" This reverts commit 9fee2e80090296715f9bc8415beb8a178562f63a. --- src/api/auth.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/api/auth.py b/src/api/auth.py index b88f88991..7c5e97a75 100644 --- a/src/api/auth.py +++ b/src/api/auth.py @@ -108,18 +108,15 @@ async def auth_logout( auth_service=Depends(get_auth_service), user: User = Depends(get_current_user), ): - """Logout user by clearing auth cookie(s)""" - from config.settings import IBM_AUTH_ENABLED, IBM_SESSION_COOKIE_NAME - + """Logout user by clearing auth cookie""" await TelemetryClient.send_event(Category.AUTHENTICATION, MessageId.ORB_AUTH_LOGOUT) response = JSONResponse( {"status": "logged_out", "message": "Successfully logged out"} ) - if IBM_AUTH_ENABLED: - response.delete_cookie(key=IBM_SESSION_COOKIE_NAME, httponly=True, samesite="lax") - response.delete_cookie(key="ibm-auth-basic", httponly=True, samesite="lax") - else: - response.delete_cookie(key="auth_token", httponly=True, secure=False, samesite="lax") + # Clear the auth cookie + response.delete_cookie( + key="auth_token", httponly=True, secure=False, samesite="lax" + ) return response From 18446ddb75f13c8299543a27e894849a955344f3 Mon Sep 17 00:00:00 2001 From: Mike Fortman Date: Mon, 16 Mar 2026 10:32:49 -0500 Subject: [PATCH 23/25] make bucket connectors available by default --- src/connectors/connection_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/connectors/connection_manager.py b/src/connectors/connection_manager.py index 0600b11a3..3ebe266a1 100644 ---
a/src/connectors/connection_manager.py +++ b/src/connectors/connection_manager.py @@ -365,13 +365,13 @@ "name": IBMCOSConnector.CONNECTOR_NAME, "description": IBMCOSConnector.CONNECTOR_DESCRIPTION, "icon": IBMCOSConnector.CONNECTOR_ICON, - "available": self._is_connector_available("ibm_cos", user_id), + "available": True, # Credentials provided via settings form }, "aws_s3": { "name": S3Connector.CONNECTOR_NAME, "description": S3Connector.CONNECTOR_DESCRIPTION, "icon": S3Connector.CONNECTOR_ICON, - "available": self._is_connector_available("aws_s3", user_id), + "available": True, # Credentials provided via settings form }, } From 7801acbf313861b499de0d9f547d3eada4b5d1bd Mon Sep 17 00:00:00 2001 From: Mike Fortman Date: Mon, 16 Mar 2026 11:44:31 -0500 Subject: [PATCH 24/25] support adding via the add knowledge on the knowledge page --- frontend/components/knowledge-dropdown.tsx | 152 +++++++-------------- 1 file changed, 48 insertions(+), 104 deletions(-) diff --git a/frontend/components/knowledge-dropdown.tsx b/frontend/components/knowledge-dropdown.tsx index 1eba42fba..a5e1563d5 100644 --- a/frontend/components/knowledge-dropdown.tsx +++ b/frontend/components/knowledge-dropdown.tsx @@ -3,7 +3,6 @@ import { useQueryClient } from "@tanstack/react-query"; import { ChevronDown, - Cloud, File as FileIcon, Folder, FolderOpen, @@ -18,6 +17,7 @@ import { useGetTasksQuery } from "@/app/api/queries/useGetTasksQuery"; import { DuplicateHandlingDialog } from "@/components/duplicate-handling-dialog"; import AwsIcon from "@/components/icons/aws-logo"; import GoogleDriveIcon from "@/components/icons/google-drive-logo"; +import IBMCOSIcon from "@/components/icons/ibm-cos-icon"; import OneDriveIcon from "@/components/icons/one-drive-logo"; import SharePointIcon from "@/components/icons/share-point-logo"; import { Button } from "@/components/ui/button"; @@ -88,16 +88,14 @@ export function KnowledgeDropdown() { const [mounted, setMounted] =
useState(false); const [isMenuOpen, setIsMenuOpen] = useState(false); const [showFolderDialog, setShowFolderDialog] = useState(false); - const [showS3Dialog, setShowS3Dialog] = useState(false); const [showDuplicateDialog, setShowDuplicateDialog] = useState(false); - const [awsEnabled, setAwsEnabled] = useState(false); const [uploadBatchSize, setUploadBatchSize] = useState(25); const [folderPath, setFolderPath] = useState(""); - const [bucketUrl, setBucketUrl] = useState("s3://"); const [folderLoading, setFolderLoading] = useState(false); - const [s3Loading, setS3Loading] = useState(false); const [fileUploading, setFileUploading] = useState(false); const [isNavigatingToCloud, setIsNavigatingToCloud] = useState(false); + const [ibmCosConfigured, setIbmCosConfigured] = useState(false); + const [s3Configured, setS3Configured] = useState(false); const [pendingFile, setPendingFile] = useState(null); const [duplicateFilename, setDuplicateFilename] = useState(""); const [cloudConnectors, setCloudConnectors] = useState<{ @@ -115,19 +113,41 @@ export function KnowledgeDropdown() { useEffect(() => { const checkAvailability = async () => { try { - // Check AWS and upload batch size - const awsRes = await fetch("/api/upload_options"); - if (awsRes.ok) { - const awsData = await awsRes.json(); - setAwsEnabled(Boolean(awsData.aws)); + // Check upload batch size and bucket connector availability in parallel + const [uploadOptionsRes, ibmCosRes, s3Res] = await Promise.all([ + fetch("/api/upload_options"), + fetch("/api/connectors/ibm_cos/defaults"), + fetch("/api/connectors/aws_s3/defaults"), + ]); + + if (uploadOptionsRes.ok) { + const uploadOptionsData = await uploadOptionsRes.json(); if ( - typeof awsData.upload_batch_size === "number" && - awsData.upload_batch_size > 0 + typeof uploadOptionsData.upload_batch_size === "number" && + uploadOptionsData.upload_batch_size > 0 ) { - setUploadBatchSize(awsData.upload_batch_size); + 
setUploadBatchSize(uploadOptionsData.upload_batch_size); } } + if (ibmCosRes.ok) { + const ibmCosData = await ibmCosRes.json(); + setIbmCosConfigured( + Boolean( + ibmCosData.connection_id || + ibmCosData.api_key_set || + ibmCosData.hmac_access_key_set, + ), + ); + } + + if (s3Res.ok) { + const s3Data = await s3Res.json(); + setS3Configured( + Boolean(s3Data.connection_id || s3Data.access_key_set), + ); + } + // Check cloud connectors const connectorsRes = await fetch("/api/connectors"); if (connectorsRes.ok) { @@ -461,49 +481,6 @@ export function KnowledgeDropdown() { } }; - const handleS3Upload = async () => { - if (!bucketUrl.trim()) return; - - setS3Loading(true); - setShowS3Dialog(false); - - try { - const response = await fetch("/api/upload_bucket", { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ s3_url: bucketUrl }), - }); - - const result = await response.json(); - - if (response.status === 201) { - const taskId = result.task_id || result.id; - - if (!taskId) { - throw new Error("No task ID received from server"); - } - - addTask(taskId); - setBucketUrl("s3://"); - // Refetch tasks to show the new task - refetchTasks(); - } else { - console.error("S3 upload failed:", result.error); - if (response.status === 400) { - toast.error("Upload failed", { - description: result.error || "Bad request", - }); - } - } - } catch (error) { - console.error("S3 upload error:", error); - } finally { - setS3Loading(false); - } - }; - // Icon mapping for cloud connectors const connectorIconMap = { google_drive: GoogleDriveIcon, @@ -544,12 +521,21 @@ export function KnowledgeDropdown() { icon: FolderIconWithColor, onClick: () => folderInputRef.current?.click(), }, - ...(awsEnabled + ...(s3Configured ? [ { label: "Amazon S3", icon: AwsIcon, - onClick: () => setShowS3Dialog(true), + onClick: () => router.push("/upload/aws_s3"), + }, + ] + : []), + ...(ibmCosConfigured + ? 
[ + { + label: "IBM Cloud Object Storage", + icon: IBMCOSIcon, + onClick: () => router.push("/upload/ibm_cos"), }, ] : []), @@ -557,8 +543,7 @@ export function KnowledgeDropdown() { ]; // Comprehensive loading state - const isLoading = - fileUploading || folderLoading || s3Loading || isNavigatingToCloud; + const isLoading = fileUploading || folderLoading || isNavigatingToCloud; if (!mounted) { return ( @@ -581,11 +566,9 @@ export function KnowledgeDropdown() { ? "Uploading..." : folderLoading ? "Processing Folder..." - : s3Loading - ? "Processing S3..." - : isNavigatingToCloud - ? "Loading..." - : "Processing..." + : isNavigatingToCloud + ? "Loading..." + : "Processing..." : "Add Knowledge"} {!isLoading && ( @@ -673,45 +656,6 @@ export function KnowledgeDropdown() { - {/* Process S3 Bucket Dialog */} - - - - - - Process S3 Bucket - - - Process all documents from an S3 bucket. AWS credentials must be - configured. - - -
-
- - setBucketUrl(e.target.value)} - /> -
-
- - -
-
-
-
- {/* Duplicate Handling Dialog */} Date: Mon, 16 Mar 2026 15:53:49 -0500 Subject: [PATCH 25/25] feature flag for new connectors --- .../settings/_components/connector-cards.tsx | 24 ++++++++++--------- frontend/components/knowledge-dropdown.tsx | 6 +++-- frontend/contexts/auth-context.tsx | 5 ++++ src/config/settings.py | 2 ++ src/connectors/connection_manager.py | 5 ++-- src/services/auth_service.py | 7 ++++-- 6 files changed, 32 insertions(+), 17 deletions(-) diff --git a/frontend/app/settings/_components/connector-cards.tsx b/frontend/app/settings/_components/connector-cards.tsx index 1121fad94..021de5558 100644 --- a/frontend/app/settings/_components/connector-cards.tsx +++ b/frontend/app/settings/_components/connector-cards.tsx @@ -1,18 +1,18 @@ "use client"; -import { useCallback, useState } from "react"; import { useRouter } from "next/navigation"; +import { useCallback, useState } from "react"; import { useConnectConnectorMutation } from "@/app/api/mutations/useConnectConnectorMutation"; import { useDisconnectConnectorMutation } from "@/app/api/mutations/useDisconnectConnectorMutation"; import { type Connector as QueryConnector, useGetConnectorsQuery, } from "@/app/api/queries/useGetConnectorsQuery"; +import AwsLogo from "@/components/icons/aws-logo"; import GoogleDriveIcon from "@/components/icons/google-drive-logo"; import IBMCOSIcon from "@/components/icons/ibm-cos-icon"; import OneDriveIcon from "@/components/icons/one-drive-logo"; import SharePointIcon from "@/components/icons/share-point-logo"; -import AwsLogo from "@/components/icons/aws-logo"; import { useAuth } from "@/contexts/auth-context"; import ConnectorCard, { type Connector } from "./connector-card"; import ConnectorsSkeleton from "./connectors-skeleton"; @@ -20,7 +20,7 @@ import IBMCOSSettingsDialog from "./ibm-cos-settings-dialog"; import S3SettingsDialog from "./s3-settings-dialog"; export default function ConnectorCards() { - const { isAuthenticated, isNoAuthMode } = useAuth(); + const 
{ isAuthenticated, isNoAuthMode, isIbmAuthMode } = useAuth(); const router = useRouter(); const [ibmCOSDialogOpen, setIBMCOSDialogOpen] = useState(false); const [s3DialogOpen, setS3DialogOpen] = useState(false); @@ -50,10 +50,15 @@ export default function ConnectorCards() { ); }, []); - const connectors = queryConnectors.map((c) => ({ - ...c, - icon: getConnectorIcon(c.icon), - })) as Connector[]; + const connectors = queryConnectors + .filter((c) => { + if (c.type === "ibm_cos" || c.type === "aws_s3") return isIbmAuthMode; + return true; + }) + .map((c) => ({ + ...c, + icon: getConnectorIcon(c.icon), + })) as Connector[]; const handleConnect = async (connector: Connector) => { connectMutation.mutate({ @@ -121,10 +126,7 @@ export default function ConnectorCards() { open={ibmCOSDialogOpen} setOpen={setIBMCOSDialogOpen} /> - + ); } diff --git a/frontend/components/knowledge-dropdown.tsx b/frontend/components/knowledge-dropdown.tsx index a5e1563d5..0895f9ba1 100644 --- a/frontend/components/knowledge-dropdown.tsx +++ b/frontend/components/knowledge-dropdown.tsx @@ -36,6 +36,7 @@ import { } from "@/components/ui/dropdown-menu"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; +import { useAuth } from "@/contexts/auth-context"; import { useTask } from "@/contexts/task-context"; import { duplicateCheck, @@ -81,6 +82,7 @@ const FolderIconWithColor = ({ className }: { className?: string }) => ( ); export function KnowledgeDropdown() { + const { isIbmAuthMode } = useAuth(); const { addTask } = useTask(); const { refetch: refetchTasks } = useGetTasksQuery(); const queryClient = useQueryClient(); @@ -521,7 +523,7 @@ export function KnowledgeDropdown() { icon: FolderIconWithColor, onClick: () => folderInputRef.current?.click(), }, - ...(s3Configured + ...(isIbmAuthMode && s3Configured ? 
[ { label: "Amazon S3", @@ -530,7 +532,7 @@ export function KnowledgeDropdown() { }, ] : []), - ...(ibmCosConfigured + ...(isIbmAuthMode && ibmCosConfigured ? [ { label: "IBM Cloud Object Storage", diff --git a/frontend/contexts/auth-context.tsx b/frontend/contexts/auth-context.tsx index 004045314..7eb0182b2 100644 --- a/frontend/contexts/auth-context.tsx +++ b/frontend/contexts/auth-context.tsx @@ -23,6 +23,7 @@ interface AuthContextType { isLoading: boolean; isAuthenticated: boolean; isNoAuthMode: boolean; + isIbmAuthMode: boolean; login: () => void; logout: () => Promise; refreshAuth: () => Promise; @@ -46,6 +47,7 @@ export function AuthProvider({ children }: AuthProviderProps) { const [user, setUser] = useState(null); const [isLoading, setIsLoading] = useState(true); const [isNoAuthMode, setIsNoAuthMode] = useState(false); + const [isIbmAuthMode, setIsIbmAuthMode] = useState(false); const checkAuth = useCallback(async () => { try { @@ -60,6 +62,8 @@ export function AuthProvider({ children }: AuthProviderProps) { const data = await response.json(); + setIsIbmAuthMode(!!data.ibm_auth_mode); + // Check if we're in no-auth mode if (data.no_auth_mode) { setIsNoAuthMode(true); @@ -176,6 +180,7 @@ export function AuthProvider({ children }: AuthProviderProps) { isLoading, isAuthenticated: !!user, isNoAuthMode, + isIbmAuthMode, login, logout, refreshAuth, diff --git a/src/config/settings.py b/src/config/settings.py index b1df97074..b31fe2679 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -51,6 +51,8 @@ GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET") DOCLING_OCR_ENGINE = os.getenv("DOCLING_OCR_ENGINE") +IBM_AUTH_ENABLED = os.getenv("IBM_AUTH_ENABLED", "false").lower() in ("true", "1", "yes") + # Ingestion configuration DISABLE_INGEST_WITH_LANGFLOW = os.getenv( "DISABLE_INGEST_WITH_LANGFLOW", "false" diff --git a/src/connectors/connection_manager.py b/src/connectors/connection_manager.py index 3ebe266a1..65674bda6 100644 --- 
a/src/connectors/connection_manager.py +++ b/src/connectors/connection_manager.py @@ -1,4 +1,5 @@ import json +import os import uuid import aiofiles from typing import Dict, List, Any, Optional @@ -365,13 +366,13 @@ def get_available_connector_types( "name": IBMCOSConnector.CONNECTOR_NAME, "description": IBMCOSConnector.CONNECTOR_DESCRIPTION, "icon": IBMCOSConnector.CONNECTOR_ICON, - "available": True, # Credentials provided via settings form + "available": os.environ.get("IBM_AUTH_ENABLED", "").lower() in ("1", "true", "yes"), }, "aws_s3": { "name": S3Connector.CONNECTOR_NAME, "description": S3Connector.CONNECTOR_DESCRIPTION, "icon": S3Connector.CONNECTOR_ICON, - "available": True, # Credentials provided via settings form + "available": os.environ.get("IBM_AUTH_ENABLED", "").lower() in ("1", "true", "yes"), }, } diff --git a/src/services/auth_service.py b/src/services/auth_service.py index 277638c6a..54e9b4b0a 100644 --- a/src/services/auth_service.py +++ b/src/services/auth_service.py @@ -492,6 +492,8 @@ async def _handle_data_source_auth( async def get_user_info(self, request) -> Optional[dict]: """Get current user information from request""" + from config.settings import IBM_AUTH_ENABLED + # In no-auth mode, return a consistent response if is_no_auth_mode(): return {"authenticated": False, "user": None, "no_auth_mode": True} @@ -501,6 +503,7 @@ async def get_user_info(self, request) -> Optional[dict]: if user: user_data = { "authenticated": True, + "ibm_auth_mode": IBM_AUTH_ENABLED, "user": { "user_id": user.user_id, "email": user.email, @@ -512,7 +515,7 @@ async def get_user_info(self, request) -> Optional[dict]: else None, }, } - + return user_data else: - return {"authenticated": False, "user": None} + return {"authenticated": False, "ibm_auth_mode": IBM_AUTH_ENABLED, "user": None}