2 changes: 2 additions & 0 deletions .gitignore
@@ -54,6 +54,7 @@ lib/
!05-blueprints/**/lib/
!02-use-cases/visa-b2b-account-payable-agent/infrastructure/lib/
!04-infrastructure-as-code/cdk/typescript/knowledge-base-rag-agent/infrastructure/lib/
!04-infrastructure-as-code/cdk/typescript/digital-preservation-agent/infrastructure/lib/
!01-tutorials/01-AgentCore-runtime/01-hosting-agent/05-java-agents/01-springai-with-bedrock-model/infra/lib/
!01-tutorials/01-AgentCore-runtime/01-hosting-agent/05-java-agents/02-embabel-with-bedrock-model/infra/lib/
lib64/
@@ -255,4 +256,5 @@ Test-Downloads/

### Docker ###
Dockerfile
!04-infrastructure-as-code/cdk/typescript/digital-preservation-agent/**/Dockerfile
.dockerignore
3 changes: 2 additions & 1 deletion 04-infrastructure-as-code/README.md
@@ -117,7 +117,8 @@ For Terraform samples, also install:
│ │ ├── multi-agent-runtime/
│ │ └── end-to-end-weather-agent/
│ └── typescript/ # TypeScript CDK samples
│ └── knowledge-base-rag-agent/
│ ├── knowledge-base-rag-agent/
│ └── digital-preservation-agent/
└── terraform/ # Terraform samples
├── README.md # Terraform-specific guide
├── basic-runtime/
3 changes: 2 additions & 1 deletion 04-infrastructure-as-code/cdk/README.md
@@ -7,7 +7,7 @@ Deploy Amazon Bedrock AgentCore resources using AWS CDK in Python or TypeScript.
| Language | Description | Samples |
|----------|-------------|---------|
| **[Python](./python/)** | Familiar syntax for Python developers, quick prototyping | 4 samples |
| **[TypeScript](./typescript/)** | Strong typing, rich npm ecosystem, compile-time checks | 1 sample |
| **[TypeScript](./typescript/)** | Strong typing, rich npm ecosystem, compile-time checks | 2 samples |

## Prerequisites

@@ -41,6 +41,7 @@
| Sample | Description |
|--------|-------------|
| [knowledge-base-rag-agent](./typescript/knowledge-base-rag-agent/) | Full-stack RAG agent with Knowledge Base, OpenSearch Serverless, web interface, and Cognito authentication |
| [digital-preservation-agent](./typescript/digital-preservation-agent/) | Digital preservation agent with Apache Tika, Siegfried, DROID, and MediaInfo on ECS Fargate, AgentCore Gateway (MCP), and AgentCore Runtime |

## CDK Advantages Over CloudFormation

1 change: 1 addition & 0 deletions 04-infrastructure-as-code/cdk/typescript/README.md
@@ -26,6 +26,7 @@ cdk deploy --all
## Samples

- **[knowledge-base-rag-agent/](./knowledge-base-rag-agent/)** - Full-stack RAG agent with Knowledge Base, OpenSearch Serverless, web interface, and Cognito authentication
- **[digital-preservation-agent/](./digital-preservation-agent/)** - Digital preservation agent with Apache Tika, Siegfried, DROID, and MediaInfo on ECS Fargate, AgentCore Gateway, and AgentCore Runtime

## TypeScript CDK Advantages

@@ -0,0 +1,11 @@
node_modules/
cdk.out/
cdk.context.json
dist/
*.js
*.d.ts
*.js.map
!jest.config.js
.env
__pycache__/
*.pyc
@@ -0,0 +1,149 @@
# Digital Preservation Agent (CDK TypeScript)

Deploy a digital preservation agent using Amazon Bedrock AgentCore with multiple file analysis tools running on ECS Fargate: [Apache Tika](https://tika.apache.org/), [Siegfried](https://www.itforarchivists.com/siegfried), [DROID](https://digital-preservation.github.io/droid/), and [MediaInfo](https://mediaarea.net/en/MediaInfo). An AgentCore Gateway exposes all tools via MCP, Lambda functions bridge tool calls to each service, and an AgentCore Runtime hosts a Strands agent that orchestrates analysis workflows.

## Architecture

```
User → AgentCore Runtime (Strands Agent, Claude 3.5 Haiku)
         ↓ MCP
AgentCore Gateway
         ↓
Lambda functions (tool bridges)
         ↓
Internal ALB (path-based routing)
  ├── /tika*, /detect/*, /meta* → ECS Fargate (Apache Tika :9998)
  ├── /identify/*               → ECS Fargate (Siegfried :5138)
  ├── /api/*                    → ECS Fargate (DROID :8080)
  └── /mediainfo/*              → ECS Fargate (MediaInfo :8081)
         ↓
S3 Bucket (document uploads + reports)
```

## Prerequisites

- [Node.js 18+](https://nodejs.org/) and npm
- [Docker](https://docs.docker.com/get-docker/) (required for building container images)
- [AWS CDK v2](https://docs.aws.amazon.com/cdk/v2/guide/getting-started.html) (`npm install -g aws-cdk`)
- [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) configured with appropriate credentials
- [Bedrock Foundation model access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) — ensure Claude 3.5 Haiku is enabled in your target region

## Deployment

```bash
cd infrastructure
npm install
npx cdk bootstrap # first time only
npx cdk deploy
```

> Use `npx cdk` to run the project-local CDK CLI. A globally installed CDK may be too old for the `aws-cdk-lib` version used here.

### Stack Outputs

| Output | Description |
|---|---|
| `RuntimeArn` | AgentCore Runtime ARN (use to invoke the agent) |
| `GatewayUrl` | AgentCore Gateway URL (MCP endpoint) |
| `GatewayId` | AgentCore Gateway ID |
| `AlbDns` | Internal ALB DNS name |
| `DocsBucketName` | S3 bucket for uploads and reports |

## Usage

1. Upload a file:
```bash
aws s3 cp my-report.pdf s3://<DocsBucketName>/my-report.pdf
```

2. Invoke the agent:
```python
import boto3, json

client = boto3.client("bedrock-agentcore")
response = client.invoke_agent_runtime(
    agentRuntimeArn="<RuntimeArn>",
    payload=json.dumps(
        {"prompt": "Identify the format of my-report.pdf using Siegfried and extract its text with Tika"}
    ).encode("utf-8"),
)
print(response["response"].read().decode("utf-8"))
```

## Available Tools

| Tool | Service | Description |
|---|---|---|
| `tika_process` | Tika | Fetch S3 file → extract text, metadata, or detect MIME type (handles archives directly) |
| `siegfried_identify` | Siegfried | Identify file format using PRONOM registry (requires extraction for archives) |
| `droid_profile` | DROID | Profile file format using DROID (requires extraction for archives) |
| `mediainfo_analyze` | MediaInfo | Analyze media file technical metadata (requires extraction for archives) |
| `extract_archive` | Lambda | Extract ZIP/TAR archives to S3 |
| `save_report_to_s3` | Lambda | Save analysis report as JSON to S3 |

## ALB Routing

| Path Pattern | Target | Port |
|---|---|---|
| `/tika*`, `/detect/*`, `/meta*` | Apache Tika | 9998 |
| `/identify/*` | Siegfried | 5138 |
| `/api/*` | DROID | 8080 |
| `/mediainfo/*` | MediaInfo | 8081 |
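The routing table above can be read as a first-match lookup from path pattern to service and container port. A sketch of that resolution logic, useful as a mental model when adding a new tool service (pattern matching here uses `fnmatch` as an approximation of the ALB rule engine, whose wildcard semantics differ slightly):

```python
from fnmatch import fnmatch

# Path pattern → (service, container port), mirroring the ALB listener rules
ROUTES = [
    (("/tika*", "/detect/*", "/meta*"), ("tika", 9998)),
    (("/identify/*",), ("siegfried", 5138)),
    (("/api/*",), ("droid", 8080)),
    (("/mediainfo/*",), ("mediainfo", 8081)),
]

def resolve(path: str):
    """Return the (service, port) target for a request path, or None."""
    for patterns, target in ROUTES:
        if any(fnmatch(path, p) for p in patterns):
            return target
    return None

print(resolve("/identify/some-file"))  # → ('siegfried', 5138)
```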

## Sample Prompts

- Identify the format of my-report.pdf using Siegfried
- Extract text from presentation.pptx
- Analyze the media metadata of video.mp4
- Run a full preservation analysis on archive.zip — extract it, then identify all files
- Profile document.docx with DROID and save the results as a report

## Project Structure

```
digital-preservation-agent/
├── agent/
│ ├── main.py # Strands agent for AgentCore Runtime
│ ├── requirements.txt
│ └── Dockerfile
├── backend/
│ ├── tika_handler.py # Tika tool bridge
│ ├── siegfried_handler.py # Siegfried tool bridge
│ ├── droid_handler.py # DROID tool bridge
│ ├── mediainfo_handler.py # MediaInfo tool bridge
│ ├── extract_handler.py # Archive extraction (S3 only)
│ └── s3_report_handler.py # Report persistence (S3 only)
├── containers/
│ ├── droid/ # DROID Docker image (eclipse-temurin:17-jre)
│ └── mediainfo/ # MediaInfo Docker image (alpine:3.20)
├── infrastructure/
│ ├── bin/app.ts
│ ├── lib/stacks/
│ │ └── digital-preservation-stack.ts
│ ├── config.json
│ ├── package.json
│ ├── tsconfig.json
│ └── cdk.json
├── .gitignore
└── README.md
```

## Configuration

Edit `infrastructure/config.json`:

| Key | Default | Description |
|---|---|---|
| `agentModelId` | `eu.anthropic.claude-3-5-haiku-20241022-v1:0` | Foundation model ([cross-region inference profile](https://docs.aws.amazon.com/bedrock/latest/userguide/cross-region-inference.html)) |
| `tikaImageTag` | `3.2.3.0-full` | Apache Tika Docker image tag |
| `fargateMemoryMiB` | `2048` | Fargate task memory (shared across all services) |
| `fargateCpu` | `1024` | Fargate task CPU |
| `desiredCount` | `1` | Number of Fargate tasks per service |

> Siegfried uses the pre-built `ghcr.io/keeps/siegfried:v1.10.1` image directly (no Dockerfile). DROID and MediaInfo have custom Dockerfiles in `containers/` and are built with `--platform linux/amd64` for ECS Fargate compatibility.

> The default `agentModelId` uses the `eu.` cross-region inference prefix, which routes requests to EU-based Bedrock endpoints. If deploying to a non-EU region, change this to a region-appropriate prefix (e.g., `us.anthropic.claude-3-5-haiku-20241022-v1:0`) or use the base model ID `anthropic.claude-3-5-haiku-20241022-v1:0`. See [cross-region inference](https://docs.aws.amazon.com/bedrock/latest/userguide/cross-region-inference.html) for available prefixes.
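If you deploy the same stack to several partitions, the prefix swap described above can be scripted. A hypothetical helper (the `us.`, `eu.`, and `apac.` prefixes are real inference-profile prefixes, but verify your region's coverage in the cross-region inference docs before relying on this mapping):

```python
BASE_MODEL = "anthropic.claude-3-5-haiku-20241022-v1:0"

def inference_profile_for(region: str, base_model: str = BASE_MODEL) -> str:
    # Hypothetical helper: pick a cross-region inference prefix from the
    # AWS region name. Not exhaustive — check regional availability.
    if region.startswith("eu-"):
        prefix = "eu"
    elif region.startswith("ap-"):
        prefix = "apac"
    else:
        prefix = "us"
    return f"{prefix}.{base_model}"

print(inference_profile_for("eu-west-1"))
# → eu.anthropic.claude-3-5-haiku-20241022-v1:0
```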

## Clean Up

```bash
cd infrastructure
npx cdk destroy
```
@@ -0,0 +1,11 @@
FROM python:3.12-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY main.py .

EXPOSE 8080
CMD ["opentelemetry-instrument", "python", "main.py"]
@@ -0,0 +1,82 @@
"""
Strands agent for digital preservation with Apache Tika, Siegfried, DROID, and MediaInfo.

Deployed to AgentCore Runtime, connects to an AgentCore Gateway
that exposes tools via MCP. Uses SigV4 signing for IAM-authenticated
Gateway access.
"""

import os
import logging

from bedrock_agentcore.runtime import BedrockAgentCoreApp
from strands import Agent
from strands.models import BedrockModel
from strands.tools.mcp import MCPClient
from mcp_proxy_for_aws.client import aws_iam_streamablehttp_client

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = BedrockAgentCoreApp()

# --- Module-level config (read once at container startup) ---
GATEWAY_URL = os.environ.get("GATEWAY_URL", "")
AWS_REGION = os.environ.get("AWS_REGION", "us-east-1")
MODEL_ID = os.environ.get("MODEL_ID", "eu.anthropic.claude-3-5-haiku-20241022-v1:0")
AGENT_INSTRUCTION = os.environ.get(
"AGENT_INSTRUCTION",
"You are a digital preservation assistant with access to Apache Tika, "
"Siegfried, DROID, and MediaInfo for file format identification, text "
"extraction, metadata retrieval, and media analysis. You can also "
"extract archives and save analysis reports to S3. Apache Tika can "
"process archives directly, but Siegfried, DROID, and MediaInfo require "
"files to be extracted first using extract_archive before analysis.",
)

# Cache the model instance — it's stateless and safe to reuse across requests.
model = BedrockModel(model_id=MODEL_ID, region_name=AWS_REGION)


def _create_mcp_client():
"""Create an MCP client factory for the AgentCore Gateway."""
return MCPClient(
lambda: aws_iam_streamablehttp_client(
endpoint=GATEWAY_URL,
aws_region=AWS_REGION,
aws_service="bedrock-agentcore",
)
)


@app.entrypoint
def handler(payload: dict) -> dict:
"""Handle incoming agent requests from AgentCore Runtime."""
prompt = payload.get("prompt", payload.get("message", ""))
if not prompt:
return {"error": "No prompt provided", "status": "error"}

logger.info(
"Received prompt: %s | model=%s region=%s", prompt[:200], MODEL_ID, AWS_REGION
)

# MCPClient uses a context manager to manage the MCP session lifecycle.
# Each request gets its own session to avoid stale connection issues.
mcp_client = _create_mcp_client()

with mcp_client:
tools = list(mcp_client.list_tools_sync())
logger.info("Available tools: %s", [t.tool_name for t in tools])

agent = Agent(
model=model,
tools=tools,
system_prompt=AGENT_INSTRUCTION,
)

result = agent(prompt)
return {"response": str(result.message), "status": "success"}


if __name__ == "__main__":
app.run()
@@ -0,0 +1,7 @@
bedrock-agentcore
strands-agents
strands-agents-tools
mcp
mcp-proxy-for-aws
boto3
aws-opentelemetry-distro>=0.10.0
@@ -0,0 +1,60 @@
"""Lambda bridging AgentCore Gateway MCP tool calls to DROID on Fargate."""

import json
import logging
import os
import urllib.request
import boto3
from botocore.exceptions import ClientError

logger = logging.getLogger()
logger.setLevel(logging.INFO)

ALB_URL = os.environ["ALB_URL"]
DOCS_BUCKET = os.environ.get("DOCS_BUCKET", "")
s3 = boto3.client("s3")
MAX_FILE_SIZE = 100 * 1024 * 1024


def handler(event, context):
logger.info("Event: %s", json.dumps(event, default=str))

s3_key = event.get("s3_key", "")
if not s3_key:
return _resp({"error": "s3_key is required"})

if s3_key.startswith("s3://"):
s3_key = s3_key[5:].split("/", 1)[-1]

try:
head = s3.head_object(Bucket=DOCS_BUCKET, Key=s3_key)
if head.get("ContentLength", 0) > MAX_FILE_SIZE:
return _resp(
{"error": f"File exceeds {MAX_FILE_SIZE // (1024 * 1024)} MB limit"}
)
obj = s3.get_object(Bucket=DOCS_BUCKET, Key=s3_key)
file_bytes = obj["Body"].read()
except ClientError as e:
code = e.response["Error"]["Code"]
if code in ("NoSuchKey", "NoSuchBucket"):
return _resp({"error": f"Not found: s3://{DOCS_BUCKET}/{s3_key}"})
return _resp({"error": "Failed to retrieve file from S3"})

filename = s3_key.split("/")[-1]
try:
req = urllib.request.Request(
f"{ALB_URL}/api/identify/{filename}",
data=file_bytes,
headers={"Content-Type": "application/octet-stream"},
method="POST",
)
with urllib.request.urlopen(req, timeout=120) as resp:
result = json.loads(resp.read().decode("utf-8"))
return _resp({"s3_key": s3_key, "profile": result})
except Exception:
logger.exception("DROID call failed")
return _resp({"error": "DROID profiling failed"})


def _resp(body):
return {"output": json.dumps(body)}