diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 000000000..79a1e2881 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,196 @@ +name: Documentation + +on: + push: + branches: [main] + paths: + - 'api/**' + - 'docs/**' + - 'crd-ref-docs.config.yaml' + - '.github/workflows/docs.yml' + - 'Makefile' + pull_request: + paths: + - 'api/**' + - 'docs/**' + - 'crd-ref-docs.config.yaml' + - '.github/workflows/docs.yml' + - 'Makefile' + +env: + GO_VERSION: '1.21' + PYTHON_VERSION: '3.11' + +jobs: + build-docs: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v4 + with: + go-version: ${{ env.GO_VERSION }} + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Cache Go modules + uses: actions/cache@v3 + with: + path: ~/go/pkg/mod + key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go- + + - name: Cache Python dependencies + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install Go dependencies + run: | + make crd-ref-docs gen-crd-api-reference-docs + + - name: Install Python dependencies + run: | + pip install -r docs/requirements.txt + + - name: Generate API documentation + run: | + make api-docs + + - name: Build documentation site + run: | + make docs-build + + - name: Upload documentation artifacts + uses: actions/upload-artifact@v4 + with: + name: documentation-site + path: docs/site/ + retention-days: 30 + + - name: Upload API documentation + uses: actions/upload-artifact@v4 + with: + name: api-documentation + path: docs/content/reference/api.md + retention-days: 30 + + deploy-preview: + if: github.event_name == 'pull_request' + needs: build-docs + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Download documentation artifacts + uses: actions/download-artifact@v4 + with: + name: documentation-site + path: docs/site/ + + - name: Deploy to Cloudflare Pages (Preview) + uses: cloudflare/pages-action@v1 + with: + apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} + accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} + projectName: llamastack-k8s-operator-docs + directory: docs/site + gitHubToken: ${{ secrets.GITHUB_TOKEN }} + wranglerVersion: '3' + + deploy-production: + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + needs: build-docs + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Download documentation artifacts + uses: actions/download-artifact@v4 + with: + name: documentation-site + path: docs/site/ + + - name: Deploy to Cloudflare Pages (Production) + uses: cloudflare/pages-action@v1 + with: + apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} + accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} + projectName: llamastack-k8s-operator-docs + directory: docs/site + gitHubToken: ${{ secrets.GITHUB_TOKEN }} + wranglerVersion: '3' + + - name: Update legacy API docs (backward compatibility) + run: | + make api-docs || echo "Legacy API docs target not found, skipping" + + - name: Commit updated API docs + if: github.ref == 'refs/heads/main' + run: | + git config --local user.email "action@github.com" + git config --local user.name "GitHub Action" + git add docs/api-overview.md || true + git diff --staged --quiet || git commit -m "docs: update API 
documentation [skip ci]" + git push || true + + validate-docs: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + pip install -r docs/requirements.txt + pip install linkchecker + + - name: Validate MkDocs configuration + run: | + cd docs && mkdocs build --clean --verbose + + - name: Check for broken links (if built) + run: | + if [ -d "docs/site" ]; then + cd docs/site + python -m http.server 8000 & + sleep 5 + linkchecker http://localhost:8000 --no-warnings --ignore-url=".*\.css$" --ignore-url=".*\.js$" || true + kill %1 2>/dev/null || true + fi + + security-scan: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + scan-type: 'fs' + scan-ref: 'docs/' + format: 'sarif' + output: 'trivy-results.sarif' + + - name: Upload Trivy scan results + uses: github/codeql-action/upload-sarif@v3 + if: always() + with: + sarif_file: 'trivy-results.sarif' diff --git a/.gitignore b/.gitignore index b0f43ee7d..75aecfa52 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,26 @@ catalog-item config/manifests/bases/llama-stack-k8s-operator.clusterserviceversion.yaml .DS_Store + +# Documentation build artifacts +docs/site/ +docs/content/reference/api.md + +# Python virtual environments and cache (for MkDocs) +docs/.venv/ +docs/venv/ +docs/__pycache__/ +*.pyc +*.pyo + +# MkDocs temporary files +docs/.mkdocs_cache/ + +# Wrangler/Cloudflare Pages +.wrangler/ +wrangler.toml.bak + +# Documentation tool binaries (specific to our setup) +bin/crd-ref-docs* +bin/gen-crd-api-reference-docs* +DOCUMENTATION_STRUCTURE.md diff --git a/Makefile b/Makefile index d6014f984..78a59a899 100644 --- a/Makefile +++ b/Makefile @@ -244,6 +244,7 @@ GOLANGCI_LINT ?= $(LOCALBIN)/golangci-lint YQ ?= $(LOCALBIN)/yq YAMLFMT ?= $(LOCALBIN)/yamlfmt CRD_REF_DOCS ?= $(LOCALBIN)/crd-ref-docs +GEN_CRD_API_REF_DOCS ?= $(LOCALBIN)/gen-crd-api-reference-docs ## Tool Versions KUSTOMIZE_VERSION ?= v5.4.3 @@ -253,6 +254,7 @@ GOLANGCI_LINT_VERSION ?= v1.64.4 YQ_VERSION ?= v4.45.3 YAMLFMT_VERSION ?= v0.12.0 CRD_REF_DOCS_VERSION = v0.2.0 +GEN_CRD_API_REF_DOCS_VERSION = v0.3.0 .PHONY: kustomize kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary. @@ -289,6 +291,11 @@ crd-ref-docs: $(CRD_REF_DOCS) ## Download crd-ref-docs locally if necessary. $(CRD_REF_DOCS): $(LOCALBIN) $(call go-install-tool,$(CRD_REF_DOCS),github.com/elastic/crd-ref-docs,$(CRD_REF_DOCS_VERSION)) +.PHONY: gen-crd-api-reference-docs +gen-crd-api-reference-docs: $(GEN_CRD_API_REF_DOCS) ## Download gen-crd-api-reference-docs locally if necessary. 
+$(GEN_CRD_API_REF_DOCS): $(LOCALBIN) + $(call go-install-tool,$(GEN_CRD_API_REF_DOCS),github.com/ahmetb/gen-crd-api-reference-docs,$(GEN_CRD_API_REF_DOCS_VERSION)) + # go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist # $1 - target path with name of binary # $2 - package url which can be installed @@ -338,10 +345,13 @@ OPERATOR_SDK = $(shell which operator-sdk) endif endif +##@ Documentation + .PHONY: api-docs -API_DOCS_PATH = ./docs/api-overview.md -api-docs: crd-ref-docs ## Creates API docs using https://github.com/elastic/crd-ref-docs - mkdir -p docs +API_DOCS_PATH = ./docs/content/reference/api.md +api-docs: crd-ref-docs gen-crd-api-reference-docs ## Generate comprehensive API documentation (HyperShift-style) + mkdir -p docs/content/reference + @echo "Generating API documentation..." $(CRD_REF_DOCS) --source-path ./ --output-path $(API_DOCS_PATH) --renderer markdown --config ./crd-ref-docs.config.yaml @# Combined command to remove .io links, ensure a trailing newline, and collapse multiple blank lines. @sed -i.bak -e '/^$$/N;/^\n$$/D' $(API_DOCS_PATH) @@ -351,6 +361,41 @@ api-docs: crd-ref-docs ## Creates API docs using https://github.com/elastic/crd- sed -i.bak -e '$${/^$$/d}' -e '$${N;/^\n$$/d}' $(API_DOCS_PATH); \ fi rm -f $(API_DOCS_PATH).bak + @echo "API documentation generated at $(API_DOCS_PATH)" + +.PHONY: docs-build +docs-build: api-docs ## Build complete documentation site + @echo "Building documentation site..." + @if [ ! -f docs/requirements.txt ]; then echo "Error: docs/requirements.txt not found"; exit 1; fi + @if command -v pip >/dev/null 2>&1; then \ + pip install -r docs/requirements.txt; \ + else \ + echo "Warning: pip not found, assuming dependencies are installed"; \ + fi + cd docs && mkdocs build + @echo "Documentation site built in docs/site/" + +.PHONY: docs-serve +docs-serve: docs-build ## Serve documentation locally (like HyperShift's serve-containerized) + @echo "Starting documentation server at http://localhost:8000" + cd docs && mkdocs serve --dev-addr 0.0.0.0:8000 + +.PHONY: docs-clean +docs-clean: ## Clean documentation build artifacts + rm -rf docs/site/ + rm -f docs/content/reference/api.md + +# Legacy target for backward compatibility +.PHONY: api-docs-legacy +API_DOCS_LEGACY_PATH = ./docs/api-overview.md +api-docs-legacy: crd-ref-docs ## Creates legacy API docs (backward compatibility) + mkdir -p docs + $(CRD_REF_DOCS) --source-path ./ --output-path $(API_DOCS_LEGACY_PATH) --renderer markdown --config ./crd-ref-docs.config.yaml + @sed -i.bak -e '/\.io\/[^v][^1].*)/d' -e '/^$$/N;/^\n$$/D' $(API_DOCS_LEGACY_PATH) + @if sed --version >/dev/null 2>&1; then \ + sed -i.bak -e '$${/^$$/d}' -e '$${N;/^\n$$/d}' $(API_DOCS_LEGACY_PATH); \ + fi + rm -f $(API_DOCS_LEGACY_PATH).bak .PHONY: bundle bundle: manifests kustomize operator-sdk ## Generate bundle manifests and metadata, then validate generated files. diff --git a/crd-ref-docs.config.yaml b/crd-ref-docs.config.yaml index 970e755ad..632eb0db8 100644 --- a/crd-ref-docs.config.yaml +++ b/crd-ref-docs.config.yaml @@ -3,6 +3,32 @@ render: # RE2 regular expressions describing types that should be excluded from the generated documentation. ignoreTypes: - "(LlamaStackDistribution)List$" - - # Version of Kubernetes to use when generating links to Kubernetes API documentation. 
+ - ".*Status$" + # Add custom type mappings + typeDisplayNamePrefixOverrides: + "github.com/llamastack/llama-stack-k8s-operator/api/v1alpha1": "" + + # Enhanced rendering options kubernetesVersion: 1.31 + markdownDisabled: false + frontMatter: + title: "API Reference" + description: "Complete API reference for LlamaStack Kubernetes Operator" + weight: 100 + + # Custom sections + sections: + - title: "Overview" + content: | + This document contains the API reference for the LlamaStack Kubernetes Operator. + The operator manages LlamaStack distributions in Kubernetes clusters. + + ## Quick Links + + - [LlamaStackDistribution](#llamastackdistribution) - Main resource for deploying LlamaStack + - [Getting Started Guide](../getting-started/quick-start/) - Quick start tutorial + - [Examples](../examples/) - Real-world configuration examples + + - title: "Resource Types" + content: | + The LlamaStack Operator defines the following Kubernetes custom resources: diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..a91f2bcd7 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,321 @@ +# LlamaStack Operator Documentation + +This directory contains the documentation for the LlamaStack Kubernetes Operator, built using [MkDocs](https://www.mkdocs.org/) with the [Material theme](https://squidfunk.github.io/mkdocs-material/). + +## πŸ—οΈ Architecture + +The documentation follows HyperShift's approach with enhanced features: + +```mermaid +graph TD + A[API Types] --> B[Documentation Generator] + B --> C[MkDocs Build] + C --> D[Static Site] + D --> E[Cloudflare Pages] + + F[GitHub Actions] --> G[Auto Build] + G --> H[Deploy Preview] + G --> I[Deploy Production] + + J[Pull Request] --> K[Preview Deployment] + K --> L[Review & Test] + L --> M[Merge & Deploy] +``` + +## πŸ“ Structure + +``` +docs/ +β”œβ”€β”€ mkdocs.yml # MkDocs configuration +β”œβ”€β”€ requirements.txt # Python dependencies +β”œβ”€β”€ README.md # This file +β”œβ”€β”€ api-doc-gen/ # API documentation generation config +β”‚ └── config.json # HyperShift-style API doc config +└── content/ # Documentation content + β”œβ”€β”€ index.md # Homepage + β”œβ”€β”€ getting-started/ # Installation and setup guides + β”‚ β”œβ”€β”€ installation.md + β”‚ β”œβ”€β”€ quick-start.md + β”‚ └── configuration.md + β”œβ”€β”€ how-to/ # Task-oriented guides + β”‚ β”œβ”€β”€ deploy-llamastack.md + β”‚ β”œβ”€β”€ configure-storage.md + β”‚ β”œβ”€β”€ scaling.md + β”‚ β”œβ”€β”€ monitoring.md + β”‚ └── troubleshooting.md + β”œβ”€β”€ reference/ # API and configuration reference + β”‚ β”œβ”€β”€ api.md # Generated API documentation + β”‚ β”œβ”€β”€ configuration.md + β”‚ └── cli.md + β”œβ”€β”€ examples/ # Real-world examples + β”‚ β”œβ”€β”€ basic-deployment.md + β”‚ β”œβ”€β”€ production-setup.md + β”‚ └── custom-images.md + β”œβ”€β”€ contributing/ # Development guides + β”‚ β”œβ”€β”€ development.md + β”‚ β”œβ”€β”€ testing.md + β”‚ └── documentation.md + β”œβ”€β”€ stylesheets/ # Custom CSS + β”‚ └── extra.css + └── javascripts/ # Custom JavaScript + └── extra.js +``` + +## πŸš€ Quick Start + +### Prerequisites + +- **Go 1.24+** - For API documentation generation tools +- **Python 3.8+** - For MkDocs and dependencies +- **pip3** - Python package manager + +### Local Development + +1. **Install dependencies**: + ```bash + pip install -r requirements.txt + ``` + +2. **Generate API documentation**: + ```bash + make api-docs + ``` + +3. **Build and serve locally**: + ```bash + make docs-serve + ``` + +4. 
**Or use the build script**: + ```bash + ./scripts/build-docs.sh + ``` + +The documentation will be available at `http://localhost:8000`. + +## πŸ› οΈ Development + +### Adding New Content + +1. **Create new markdown files** in the appropriate `content/` subdirectory +2. **Update navigation** in `mkdocs.yml` +3. **Test locally** with `make docs-serve` +4. **Submit a pull request** + +### API Documentation + +The API documentation is automatically generated from Go type definitions: + +- **Source**: `api/v1alpha1/llamastackdistribution_types.go` +- **Config**: `crd-ref-docs.config.yaml` +- **Output**: `docs/content/reference/api.md` + +To regenerate API docs: +```bash +make api-docs +``` + +### Styling and Theming + +- **CSS**: Add custom styles to `content/stylesheets/extra.css` +- **JavaScript**: Add functionality to `content/javascripts/extra.js` +- **Theme**: Configure in `mkdocs.yml` under the `theme` section + +### Interactive Features + +The documentation includes several interactive features: + +- **Code copy buttons** - Automatic copy-to-clipboard for code blocks +- **YAML validator** - Validate LlamaStackDistribution configurations +- **Search enhancements** - Improved search with suggestions +- **Navigation aids** - Breadcrumbs, edit links, and page navigation + +## πŸ”§ Configuration + +### MkDocs Configuration + +The main configuration is in [`mkdocs.yml`](mkdocs.yml): + +- **Site metadata** - Title, description, URLs +- **Theme configuration** - Material theme with custom colors +- **Navigation structure** - Page organization +- **Plugins** - Search, Mermaid diagrams +- **Markdown extensions** - Code highlighting, admonitions, etc. + +### API Documentation Configuration + +API documentation generation is configured in: + +- **[`crd-ref-docs.config.yaml`](../crd-ref-docs.config.yaml)** - Main configuration +- **[`api-doc-gen/config.json`](api-doc-gen/config.json)** - HyperShift-style configuration + +### Build Configuration + +- **[`requirements.txt`](requirements.txt)** - Python dependencies +- **[`../Makefile`](../Makefile)** - Build targets and tool installation +- **[`../scripts/build-docs.sh`](../scripts/build-docs.sh)** - Comprehensive build script + +## πŸš€ Deployment + +### Cloudflare Pages + +The documentation is automatically deployed to Cloudflare Pages: + +- **Production**: `https://llamastack-k8s-operator.pages.dev` +- **Preview**: Automatic preview deployments for pull requests +- **Configuration**: [`../wrangler.toml`](../wrangler.toml) + +### GitHub Actions + +Automated builds and deployments are handled by GitHub Actions: + +- **Workflow**: [`.github/workflows/docs.yml`](../.github/workflows/docs.yml) +- **Triggers**: Changes to `api/`, `docs/`, or configuration files +- **Features**: Build validation, security scanning, preview deployments + +### Manual Deployment + +For manual deployments: + +```bash +# Build the documentation +make docs-build + +# Deploy to Cloudflare Pages (requires wrangler CLI) +wrangler pages deploy docs/site --project-name llamastack-k8s-operator-docs +``` + +## πŸ“Š Analytics and Monitoring + +### Performance Monitoring + +- **Build time tracking** - Monitor documentation build performance +- **Page load metrics** - Track user experience +- **Search analytics** - Understand user search patterns + +### Content Analytics + +- **Popular pages** - Identify most-visited content +- **User flow** - Understand navigation patterns +- **Feedback collection** - Gather user feedback + +## πŸ” Quality Assurance + +### Automated Checks + +- 
**Link validation** - Check for broken internal and external links +- **Markup validation** - Ensure valid HTML output +- **Accessibility testing** - WCAG 2.1 AA compliance +- **Performance testing** - Page load speed optimization + +### Manual Review Process + +1. **Content review** - Technical accuracy and clarity +2. **Design review** - Visual consistency and usability +3. **Accessibility review** - Screen reader compatibility +4. **Mobile testing** - Responsive design validation + +## πŸ›‘οΈ Security + +### Content Security + +- **Input validation** - Sanitize user-generated content +- **XSS protection** - Prevent cross-site scripting +- **CSRF protection** - Secure form submissions + +### Deployment Security + +- **HTTPS enforcement** - All traffic encrypted +- **Security headers** - Comprehensive security header configuration +- **Dependency scanning** - Regular security updates + +## 🀝 Contributing + +### Documentation Guidelines + +1. **Write for your audience** - Consider user experience level +2. **Use clear headings** - Organize content hierarchically +3. **Include examples** - Provide practical, working examples +4. **Test instructions** - Verify all commands and procedures +5. **Update navigation** - Ensure new content is discoverable + +### Style Guide + +- **Tone**: Professional but approachable +- **Voice**: Active voice preferred +- **Formatting**: Consistent use of markdown features +- **Code examples**: Complete, runnable examples +- **Screenshots**: High-quality, up-to-date images + +### Review Process + +1. **Create feature branch** - `docs/feature-name` +2. **Make changes** - Follow style guide +3. **Test locally** - Verify build and functionality +4. **Submit pull request** - Include preview link +5. **Address feedback** - Respond to review comments +6. 
**Merge and deploy** - Automatic deployment on merge + +## πŸ“š Resources + +### Documentation Tools + +- **[MkDocs](https://www.mkdocs.org/)** - Static site generator +- **[Material for MkDocs](https://squidfunk.github.io/mkdocs-material/)** - Theme +- **[crd-ref-docs](https://github.com/elastic/crd-ref-docs)** - API documentation generator +- **[Mermaid](https://mermaid-js.github.io/)** - Diagram generation + +### Deployment Platforms + +- **[Cloudflare Pages](https://pages.cloudflare.com/)** - Static site hosting +- **[GitHub Actions](https://github.com/features/actions)** - CI/CD automation +- **[Wrangler](https://developers.cloudflare.com/workers/wrangler/)** - Cloudflare CLI + +### Best Practices + +- **[DiΓ‘taxis](https://diataxis.fr/)** - Documentation framework +- **[Google Developer Documentation Style Guide](https://developers.google.com/style)** +- **[Write the Docs](https://www.writethedocs.org/)** - Documentation community + +## πŸ†˜ Troubleshooting + +### Common Issues + +**Build failures:** +```bash +# Check dependencies +pip install -r requirements.txt +make crd-ref-docs + +# Clean and rebuild +make docs-clean +make docs-build +``` + +**API documentation not updating:** +```bash +# Regenerate API docs +make api-docs + +# Check source files +git status api/v1alpha1/ +``` + +**Local server issues:** +```bash +# Check port availability +lsof -i :8000 + +# Use different port +cd docs && mkdocs serve --dev-addr 0.0.0.0:8001 +``` + +### Getting Help + +- **GitHub Issues**: [Report bugs and request features](https://github.com/llamastack/llama-stack-k8s-operator/issues) +- **Discussions**: [Community discussions](https://github.com/llamastack/llama-stack-k8s-operator/discussions) +- **Documentation**: [Contributing guide](content/contributing/documentation.md) + +## πŸ“„ License + +This documentation is licensed under the Apache License 2.0. See the [LICENSE](../LICENSE) file for details. diff --git a/docs/api-doc-gen/config.json b/docs/api-doc-gen/config.json new file mode 100644 index 000000000..3a144f96a --- /dev/null +++ b/docs/api-doc-gen/config.json @@ -0,0 +1,30 @@ +{ + "hideMemberFields": [ + "TypeMeta" + ], + "hideTypePatterns": [ + "(LlamaStackDistribution)List$" + ], + "externalPackages": [ + { + "typeMatchPrefix": "k8s.io/api/", + "docsURLTemplate": "https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#{{lower .TypeIdentifier}}-{{arrIndex .PackageSegments -1}}-{{arrIndex .PackageSegments -2}}" + }, + { + "typeMatchPrefix": "k8s.io/apimachinery/pkg/apis/meta/v1", + "docsURLTemplate": "https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#{{lower .TypeIdentifier}}-v1-meta" + }, + { + "typeMatchPrefix": "k8s.io/api/core/v1", + "docsURLTemplate": "https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#{{lower .TypeIdentifier}}-v1-core" + }, + { + "typeMatchPrefix": "k8s.io/apimachinery/pkg/api/resource", + "docsURLTemplate": "https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#quantity-resource-core" + } + ], + "typeDisplayNamePrefixOverrides": { + "github.com/llamastack/llama-stack-k8s-operator/api/v1alpha1": "" + }, + "markdownDisabled": false +} diff --git a/docs/content/contributing/development.md b/docs/content/contributing/development.md new file mode 100644 index 000000000..aa2065d79 --- /dev/null +++ b/docs/content/contributing/development.md @@ -0,0 +1,56 @@ +# Development Guide + +Guide for contributing to the LlamaStack Kubernetes Operator. 
+ +## Development Setup + +### Prerequisites + +- Go 1.24+ +- Docker +- Kubernetes cluster (kind/minikube for local development) +- kubectl +- make + +### Local Development + +```bash +# Clone the repository +git clone https://github.com/llamastack/llama-stack-k8s-operator.git +cd llama-stack-k8s-operator + +# Install dependencies +make deps + +# Run tests +make test + +# Build operator +make build + +# Run locally +make run +``` + +## Contributing + +### Code Style + +- Follow Go conventions +- Use `gofmt` for formatting +- Add tests for new features +- Update documentation + +### Pull Request Process + +1. Fork the repository +2. Create a feature branch +3. Make your changes +4. Add tests +5. Update documentation +6. Submit a pull request + +## Next Steps + +- [Testing Guide](testing.md) +- [Documentation Guide](documentation.md) diff --git a/docs/content/contributing/documentation.md b/docs/content/contributing/documentation.md new file mode 100644 index 000000000..2492c1ddb --- /dev/null +++ b/docs/content/contributing/documentation.md @@ -0,0 +1,75 @@ +# Documentation Guide + +Guide for contributing to the LlamaStack Kubernetes Operator documentation. + +## Documentation Structure + +The documentation is built with MkDocs and follows this structure: + +``` +docs/ +β”œβ”€β”€ content/ +β”‚ β”œβ”€β”€ index.md +β”‚ β”œβ”€β”€ getting-started/ +β”‚ β”œβ”€β”€ how-to/ +β”‚ β”œβ”€β”€ reference/ +β”‚ β”œβ”€β”€ examples/ +β”‚ └── contributing/ +└── mkdocs.yml +``` + +## Writing Documentation + +### Markdown Guidelines + +- Use clear, concise language +- Include code examples +- Add diagrams where helpful +- Follow the existing style + +### Code Examples + +```yaml +# Always include complete, working examples +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: example +spec: + image: llamastack/llamastack:latest +``` + +## Building Documentation + +### Local Development + +```bash +# Install dependencies +pip install -r docs/requirements.txt + +# Serve locally +make docs-serve + +# Build static site +make docs-build +``` + +### API Documentation + +API documentation is auto-generated from Go types: + +```bash +# Generate API docs +make api-docs +``` + +## Contributing + +1. Edit markdown files in `docs/content/` +2. Test locally with `make docs-serve` +3. Submit a pull request + +## Next Steps + +- [Development Guide](development.md) +- [Testing Guide](testing.md) diff --git a/docs/content/contributing/testing.md b/docs/content/contributing/testing.md new file mode 100644 index 000000000..8be17fcb2 --- /dev/null +++ b/docs/content/contributing/testing.md @@ -0,0 +1,51 @@ +# Testing Guide + +Testing guidelines for the LlamaStack Kubernetes Operator. + +## Test Types + +### Unit Tests + +```bash +# Run unit tests +make test + +# Run with coverage +make test-coverage + +# Run specific package +go test ./controllers/... 
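+
+# Run a single test by name (e.g. the controller test shown below)
+go test ./controllers/... -run TestLlamaStackDistributionController -v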
+``` + +### Integration Tests + +```bash +# Run integration tests +make test-integration + +# Run e2e tests +make test-e2e +``` + +## Writing Tests + +### Controller Tests + +```go +func TestLlamaStackDistributionController(t *testing.T) { + // Test implementation +} +``` + +### E2E Tests + +```go +func TestE2EDeployment(t *testing.T) { + // E2E test implementation +} +``` + +## Next Steps + +- [Development Guide](development.md) +- [Documentation Guide](documentation.md) diff --git a/docs/content/distributions/bedrock.md b/docs/content/distributions/bedrock.md new file mode 100644 index 000000000..ee7d7d429 --- /dev/null +++ b/docs/content/distributions/bedrock.md @@ -0,0 +1,436 @@ +# AWS Bedrock Distribution + +!!! warning "Distribution Availability" + The Bedrock distribution container image may not be currently maintained or available. + Please verify the image exists at `docker.io/llamastack/distribution-bedrock:latest` before using this distribution. + For production use, consider using the `ollama` or `vllm` distributions which are actively maintained. + +The **Bedrock** distribution enables seamless integration with Amazon Bedrock, AWS's fully managed service for foundation models. This distribution allows you to leverage AWS Bedrock's powerful models through the LlamaStack Kubernetes Operator. + +## Overview + +Amazon Bedrock provides access to high-performing foundation models from leading AI companies through a single API. The Bedrock distribution: + +- **Connects to AWS Bedrock** for model inference +- **Manages AWS credentials** securely +- **Provides unified API** through LlamaStack +- **Supports multiple Bedrock models** (Claude, Llama, Titan, etc.) + +## Distribution Details + +| Property | Value | +|----------|-------| +| **Distribution Name** | `bedrock` | +| **Image** | `docker.io/llamastack/distribution-bedrock:latest` | +| **Use Case** | AWS Bedrock model integration | +| **Requirements** | AWS credentials and Bedrock access | +| **Recommended For** | AWS users, enterprise deployments | + +## Prerequisites + +### 1. AWS Account Setup + +- AWS account with Bedrock access +- IAM user/role with Bedrock permissions +- Bedrock model access enabled in your AWS region + +### 2. Required AWS Permissions + +Your AWS credentials need the following permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "bedrock:InvokeModel", + "bedrock:InvokeModelWithResponseStream", + "bedrock:ListFoundationModels", + "bedrock:GetFoundationModel" + ], + "Resource": "*" + } + ] +} +``` + +### 3. Enable Bedrock Models + +Enable the models you want to use in the AWS Bedrock console: +- Anthropic Claude models +- Meta Llama models +- Amazon Titan models +- Cohere Command models + +## Quick Start + +### 1. Create AWS Credentials Secret + +```bash +kubectl create secret generic aws-credentials \ + --from-literal=AWS_ACCESS_KEY_ID=your-access-key \ + --from-literal=AWS_SECRET_ACCESS_KEY=your-secret-key \ + --from-literal=AWS_DEFAULT_REGION=us-east-1 +``` + +### 2. 
Create Bedrock Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-bedrock-llamastack + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "bedrock" + containerSpec: + port: 8321 + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1" + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_SECRET_ACCESS_KEY + - name: AWS_DEFAULT_REGION + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_DEFAULT_REGION + - name: BEDROCK_MODEL_ID + value: "anthropic.claude-3-sonnet-20240229-v1:0" + storage: + size: "10Gi" +``` + +### 3. Deploy the Distribution + +```bash +kubectl apply -f bedrock-distribution.yaml +``` + +### 4. Verify Deployment + +```bash +# Check the distribution status +kubectl get llamastackdistribution my-bedrock-llamastack + +# Check the pods +kubectl get pods -l app=llama-stack + +# Check logs for AWS connection +kubectl logs -l app=llama-stack +``` + +## Configuration Options + +### Supported Bedrock Models + +Configure different Bedrock models using the `BEDROCK_MODEL_ID` environment variable: + +#### Anthropic Claude Models +```yaml +env: + - name: BEDROCK_MODEL_ID + value: "anthropic.claude-3-sonnet-20240229-v1:0" # Claude 3 Sonnet + # value: "anthropic.claude-3-haiku-20240307-v1:0" # Claude 3 Haiku + # value: "anthropic.claude-v2:1" # Claude 2.1 +``` + +#### Meta Llama Models +```yaml +env: + - name: BEDROCK_MODEL_ID + value: "meta.llama2-70b-chat-v1" # Llama 2 70B Chat + # value: "meta.llama2-13b-chat-v1" # Llama 2 13B Chat +``` + +#### Amazon Titan Models +```yaml +env: + - name: BEDROCK_MODEL_ID + value: "amazon.titan-text-express-v1" # Titan Text Express + # value: "amazon.titan-text-lite-v1" # Titan Text Lite +``` + +### AWS Authentication Methods + +#### Method 1: Access Keys (Secrets) +```yaml +env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_SECRET_ACCESS_KEY + - name: AWS_DEFAULT_REGION + value: "us-east-1" +``` + +#### Method 2: IAM Roles for Service Accounts (IRSA) +```yaml +spec: + server: + podOverrides: + serviceAccountName: bedrock-service-account + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::123456789012:role/BedrockRole +``` + +#### Method 3: Instance Profile (EKS Nodes) +```yaml +# No additional configuration needed if EKS nodes have Bedrock permissions +env: + - name: AWS_DEFAULT_REGION + value: "us-east-1" +``` + +### Environment Variables + +```yaml +env: + - name: BEDROCK_MODEL_ID + value: "anthropic.claude-3-sonnet-20240229-v1:0" + - name: AWS_DEFAULT_REGION + value: "us-east-1" + - name: BEDROCK_MAX_TOKENS + value: "4096" + - name: BEDROCK_TEMPERATURE + value: "0.7" + - name: LOG_LEVEL + value: "INFO" +``` + +## Advanced Configuration + +### Multi-Model Setup + +Deploy multiple Bedrock distributions for different models: + +```yaml +# Claude 3 Sonnet Distribution +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: bedrock-claude-sonnet +spec: + server: + distribution: + name: "bedrock" + containerSpec: + env: + - name: BEDROCK_MODEL_ID + value: "anthropic.claude-3-sonnet-20240229-v1:0" +--- +# Llama 2 70B Distribution +apiVersion: llamastack.io/v1alpha1 +kind: 
LlamaStackDistribution +metadata: + name: bedrock-llama2-70b +spec: + server: + distribution: + name: "bedrock" + containerSpec: + env: + - name: BEDROCK_MODEL_ID + value: "meta.llama2-70b-chat-v1" +``` + +### Production Configuration + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: production-bedrock + namespace: production +spec: + replicas: 3 + server: + distribution: + name: "bedrock" + containerSpec: + resources: + requests: + memory: "2Gi" + cpu: "1" + limits: + memory: "4Gi" + cpu: "2" + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_SECRET_ACCESS_KEY + - name: AWS_DEFAULT_REGION + value: "us-east-1" + - name: BEDROCK_MODEL_ID + value: "anthropic.claude-3-sonnet-20240229-v1:0" + - name: LOG_LEVEL + value: "WARNING" + - name: ENABLE_TELEMETRY + value: "true" + storage: + size: "20Gi" +``` + +## Use Cases + +### 1. Enterprise AI Applications + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: enterprise-bedrock + namespace: enterprise +spec: + replicas: 5 + server: + distribution: + name: "bedrock" + containerSpec: + resources: + requests: + memory: "4Gi" + cpu: "2" + limits: + memory: "8Gi" + cpu: "4" + env: + - name: BEDROCK_MODEL_ID + value: "anthropic.claude-3-sonnet-20240229-v1:0" + - name: AWS_DEFAULT_REGION + value: "us-east-1" +``` + +### 2. Development and Testing + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: dev-bedrock + namespace: development +spec: + replicas: 1 + server: + distribution: + name: "bedrock" + containerSpec: + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" + env: + - name: BEDROCK_MODEL_ID + value: "anthropic.claude-3-haiku-20240307-v1:0" # Faster, cheaper model + - name: LOG_LEVEL + value: "DEBUG" +``` + +## Monitoring and Troubleshooting + +### Health Checks + +```bash +# Check distribution status +kubectl get llamastackdistribution + +# Check pod logs for AWS connectivity +kubectl logs -l app=llama-stack + +# Test AWS credentials +kubectl exec -it -- aws bedrock list-foundation-models +``` + +### Common Issues + +1. **AWS Credentials Invalid** + ```bash + # Verify credentials in secret + kubectl get secret aws-credentials -o yaml + + # Test credentials + kubectl exec -it -- aws sts get-caller-identity + ``` + +2. **Model Access Denied** + - Enable model access in AWS Bedrock console + - Verify IAM permissions include `bedrock:InvokeModel` + - Check if model is available in your AWS region + +3. 
**Region Issues** + - Ensure Bedrock is available in your region + - Verify `AWS_DEFAULT_REGION` matches model availability + +### Cost Monitoring + +Monitor AWS Bedrock costs: +- Use AWS Cost Explorer to track Bedrock usage +- Set up billing alerts for unexpected usage +- Consider using cheaper models for development + +## Best Practices + +### Security +- Use IAM roles instead of access keys when possible +- Store credentials in Kubernetes Secrets +- Implement least-privilege IAM policies +- Enable AWS CloudTrail for audit logging + +### Performance +- Choose appropriate models for your use case +- Use Haiku for speed, Sonnet for balance, Opus for quality +- Scale replicas based on request volume +- Monitor response times and adjust accordingly + +### Cost Optimization +- Use smaller models for development/testing +- Implement request caching where appropriate +- Monitor token usage and optimize prompts +- Set up cost alerts and budgets + +## Next Steps + +- [Configure Storage](../how-to-guides/storage.md) +- [Set up Monitoring](../how-to-guides/monitoring.md) +- [Scaling Guide](../how-to-guides/scaling.md) +- [Security Best Practices](../how-to-guides/security.md) + +## API Reference + +For complete API documentation, see: +- [API Reference](../reference/api.md) +- [Configuration Reference](../reference/configuration.md) diff --git a/docs/content/distributions/bring-your-own.md b/docs/content/distributions/bring-your-own.md new file mode 100644 index 000000000..4667c7a1b --- /dev/null +++ b/docs/content/distributions/bring-your-own.md @@ -0,0 +1,562 @@ +# Bring Your Own (BYO) Distributions + +The LlamaStack Kubernetes operator supports both pre-built distributions and custom "Bring Your Own" (BYO) distributions. This guide shows you how to build, customize, and deploy your own LlamaStack distributions. + +## Overview + +### Supported vs BYO Distributions + +| Type | Description | Use Case | Configuration | +|------|-------------|----------|---------------| +| **Supported** | Pre-built distributions maintained by the LlamaStack team | Quick deployment, standard configurations | Use `distribution.name` field | +| **BYO** | Custom distributions you build and maintain | Custom providers, specialized configurations | Use `distribution.image` field | + +### Why Build Custom Distributions? + +- **Custom Providers**: Integrate with proprietary or specialized inference engines +- **Specific Configurations**: Tailor the stack for your exact requirements +- **External Dependencies**: Include additional libraries or tools +- **Security Requirements**: Control the entire build process and dependencies +- **Performance Optimization**: Optimize for your specific hardware or use case + +## Building LlamaStack Distributions + +### Prerequisites + +1. **Install LlamaStack CLI**: + ```bash + pip install llama-stack + ``` + +2. **Docker or Podman** (for container builds): + ```bash + # Verify Docker is running + docker --version + ``` + +3. **Conda** (for conda builds): + ```bash + # Verify Conda is available + conda --version + ``` + +### Quick Start: Building from Templates + +#### 1. List Available Templates + +```bash +llama stack build --list-templates +``` + +This shows available templates like: +- `ollama` - Ollama-based inference +- `vllm-gpu` - vLLM with GPU support +- `meta-reference-gpu` - Meta's reference implementation +- `bedrock` - AWS Bedrock integration +- `fireworks` - Fireworks AI integration + +#### 2. 
Build from Template + +```bash +# Build a container image from Ollama template +llama stack build --template ollama --image-type container + +# Build a conda environment from vLLM template +llama stack build --template vllm-gpu --image-type conda + +# Build with custom name +llama stack build --template ollama --image-type container --image-name my-custom-ollama +``` + +#### 3. Interactive Build + +```bash +llama stack build +``` + +This launches an interactive wizard: + +``` +> Enter a name for your Llama Stack (e.g. my-local-stack): my-custom-stack +> Enter the image type you want your Llama Stack to be built as (container or conda or venv): container + +Llama Stack is composed of several APIs working together. Let's select +the provider types (implementations) you want to use for these APIs. + +> Enter provider for API inference: inline::meta-reference +> Enter provider for API safety: inline::llama-guard +> Enter provider for API agents: inline::meta-reference +> Enter provider for API memory: inline::faiss +> Enter provider for API datasetio: inline::meta-reference +> Enter provider for API scoring: inline::meta-reference +> Enter provider for API eval: inline::meta-reference +> Enter provider for API telemetry: inline::meta-reference + +> (Optional) Enter a short description for your Llama Stack: My custom distribution +``` + +### Advanced: Custom Configuration Files + +#### 1. Create a Custom Build Configuration + +Create `my-custom-build.yaml`: + +```yaml +name: my-custom-stack +distribution_spec: + description: Custom distribution with external Ollama + providers: + inference: remote::ollama + memory: inline::faiss + safety: inline::llama-guard + agents: inline::meta-reference + telemetry: inline::meta-reference + datasetio: inline::meta-reference + scoring: inline::meta-reference + eval: inline::meta-reference +image_name: my-custom-stack +image_type: container + +# Optional: External providers directory +external_providers_dir: ~/.llama/providers.d +``` + +#### 2. Build from Custom Configuration + +```bash +llama stack build --config my-custom-build.yaml +``` + +### Image Types + +#### Container Images + +Best for production deployments and Kubernetes: + +```bash +llama stack build --template ollama --image-type container +``` + +**Advantages**: +- Consistent across environments +- Easy to deploy in Kubernetes +- Isolated dependencies +- Reproducible builds + +#### Conda Environments + +Good for development and local testing: + +```bash +llama stack build --template ollama --image-type conda +``` + +**Advantages**: +- Fast iteration during development +- Easy dependency management +- Good for experimentation + +#### Virtual Environments + +Lightweight option for Python-only setups: + +```bash +llama stack build --template ollama --image-type venv +``` + +## Custom Providers + +### Adding External Providers + +#### 1. Create Provider Configuration + +Create `~/.llama/providers.d/custom-ollama.yaml`: + +```yaml +adapter: + adapter_type: custom_ollama + pip_packages: + - ollama + - aiohttp + - llama-stack-provider-ollama + config_class: llama_stack_ollama_provider.config.OllamaImplConfig + module: llama_stack_ollama_provider +api_dependencies: [] +optional_api_dependencies: [] +``` + +#### 2. 
Reference in Build Configuration + +```yaml +name: custom-external-stack +distribution_spec: + description: Custom distro with external providers + providers: + inference: remote::custom_ollama + memory: inline::faiss + safety: inline::llama-guard + agents: inline::meta-reference + telemetry: inline::meta-reference +image_type: container +image_name: custom-external-stack +external_providers_dir: ~/.llama/providers.d +``` + +## Using Custom Distributions with Kubernetes + +### 1. Build and Push Container Image + +```bash +# Build the distribution +llama stack build --template ollama --image-type container --image-name my-ollama-dist + +# Tag for your registry +docker tag distribution-my-ollama-dist:dev my-registry.com/my-ollama-dist:v1.0.0 + +# Push to registry +docker push my-registry.com/my-ollama-dist:v1.0.0 +``` + +### 2. Deploy with Kubernetes Operator + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-custom-distribution + namespace: default +spec: + replicas: 1 + server: + distribution: + image: "my-registry.com/my-ollama-dist:v1.0.0" # Custom image + containerSpec: + port: 8321 + resources: + requests: + memory: "8Gi" + cpu: "4" + limits: + memory: "16Gi" + cpu: "8" + env: + - name: INFERENCE_MODEL + value: "llama3.2:1b" + - name: OLLAMA_URL + value: "http://ollama-server:11434" + storage: + size: "20Gi" +``` + +### 3. Verify Deployment + +```bash +kubectl get llamastackdistribution my-custom-distribution +kubectl get pods -l app=llama-stack +kubectl logs -l app=llama-stack +``` + +## Examples + +### Example 1: Custom Ollama Distribution + +#### Build Configuration (`custom-ollama-build.yaml`) + +```yaml +name: custom-ollama +distribution_spec: + description: Custom Ollama distribution with additional tools + providers: + inference: remote::ollama + memory: inline::faiss + safety: inline::llama-guard + agents: inline::meta-reference + telemetry: inline::meta-reference +image_name: custom-ollama +image_type: container +``` + +#### Build and Deploy + +```bash +# Build the distribution +llama stack build --config custom-ollama-build.yaml + +# Tag and push +docker tag distribution-custom-ollama:dev my-registry.com/custom-ollama:latest +docker push my-registry.com/custom-ollama:latest +``` + +#### Kubernetes Deployment + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: custom-ollama-dist +spec: + replicas: 2 + server: + distribution: + image: "my-registry.com/custom-ollama:latest" + containerSpec: + resources: + requests: + memory: "8Gi" + cpu: "4" + limits: + memory: "16Gi" + cpu: "8" + env: + - name: INFERENCE_MODEL + value: "llama3.2:3b" + - name: OLLAMA_URL + value: "http://ollama-service:11434" +``` + +### Example 2: Custom vLLM Distribution + +#### Build Configuration (`custom-vllm-build.yaml`) + +```yaml +name: custom-vllm +distribution_spec: + description: Custom vLLM distribution with GPU optimization + providers: + inference: inline::vllm + memory: inline::faiss + safety: inline::llama-guard + agents: inline::meta-reference + telemetry: inline::meta-reference +image_name: custom-vllm +image_type: container +``` + +#### Enhanced Dockerfile + +Create a custom Dockerfile to extend the base distribution: + +```dockerfile +FROM distribution-custom-vllm:dev + +# Install additional dependencies +RUN pip install custom-optimization-library + +# Add custom configuration +COPY custom-vllm-config.json /app/config.json + +# Set environment variables +ENV VLLM_OPTIMIZATION_LEVEL=high +ENV 
CUSTOM_GPU_SETTINGS=enabled + +# Expose port +EXPOSE 8321 +``` + +#### Build and Deploy + +```bash +# Build the LlamaStack distribution +llama stack build --config custom-vllm-build.yaml + +# Build enhanced Docker image +docker build -t my-registry.com/enhanced-vllm:latest . + +# Push to registry +docker push my-registry.com/enhanced-vllm:latest +``` + +#### Kubernetes Deployment + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: enhanced-vllm-dist +spec: + replicas: 1 + server: + distribution: + image: "my-registry.com/enhanced-vllm:latest" + containerSpec: + resources: + requests: + nvidia.com/gpu: "2" + memory: "32Gi" + cpu: "8" + limits: + nvidia.com/gpu: "2" + memory: "64Gi" + cpu: "16" + env: + - name: INFERENCE_MODEL + value: "meta-llama/Llama-2-13b-chat-hf" + - name: VLLM_GPU_MEMORY_UTILIZATION + value: "0.9" + - name: VLLM_TENSOR_PARALLEL_SIZE + value: "2" +``` + +### Example 3: Multi-Provider Distribution + +#### Build Configuration (`multi-provider-build.yaml`) + +```yaml +name: multi-provider +distribution_spec: + description: Distribution with multiple inference providers + providers: + inference: + - remote::ollama + - remote::vllm + memory: inline::faiss + safety: inline::llama-guard + agents: inline::meta-reference + telemetry: inline::meta-reference +image_name: multi-provider +image_type: container +``` + +## Testing Custom Distributions + +### Local Testing + +#### 1. Run Locally with Docker + +```bash +# Set environment variables +export LLAMA_STACK_PORT=8321 +export INFERENCE_MODEL="llama3.2:1b" + +# Run the custom distribution +docker run -d \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ + distribution-custom-ollama:dev \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env OLLAMA_URL=http://host.docker.internal:11434 +``` + +#### 2. Test API Endpoints + +```bash +# Health check +curl http://localhost:8321/v1/health + +# List providers +curl http://localhost:8321/v1/providers + +# Test inference +curl -X POST http://localhost:8321/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama3.2:1b", + "prompt": "Hello, world!", + "max_tokens": 50 + }' +``` + +### Kubernetes Testing + +#### 1. Deploy to Test Namespace + +```bash +kubectl create namespace llama-test +kubectl apply -f custom-distribution.yaml -n llama-test +``` + +#### 2. Port Forward for Testing + +```bash +kubectl port-forward svc/my-custom-distribution-service 8321:8321 -n llama-test +``` + +#### 3. Run Tests + +```bash +# Test from within cluster +kubectl run test-pod --image=curlimages/curl --rm -it -- \ + curl http://my-custom-distribution-service:8321/v1/health +``` + +## Best Practices + +### Security + +1. **Use Private Registries**: Store custom images in private container registries +2. **Scan Images**: Use container scanning tools to check for vulnerabilities +3. **Minimal Base Images**: Use slim or distroless base images when possible +4. **Secrets Management**: Use Kubernetes secrets for API keys and credentials + +### Performance + +1. **Multi-stage Builds**: Use multi-stage Dockerfiles to reduce image size +2. **Layer Caching**: Optimize Dockerfile layer ordering for better caching +3. **Resource Limits**: Set appropriate CPU and memory limits +4. **GPU Optimization**: Configure GPU settings for inference workloads + +### Maintenance + +1. **Version Tags**: Use semantic versioning for your custom images +2. 
**Documentation**: Document your custom configurations and dependencies +3. **Testing**: Implement automated testing for custom distributions +4. **Monitoring**: Set up monitoring and logging for custom deployments + +### Development Workflow + +1. **Local Development**: Use conda/venv builds for rapid iteration +2. **CI/CD Integration**: Automate building and testing of custom distributions +3. **Staging Environment**: Test in staging before production deployment +4. **Rollback Strategy**: Maintain previous versions for quick rollbacks + +## Troubleshooting + +### Common Issues + +#### Build Failures + +```bash +# Check build logs +llama stack build --template ollama --image-type container --verbose + +# Verify dependencies +llama stack build --config my-build.yaml --print-deps-only +``` + +#### Runtime Issues + +```bash +# Check container logs +docker logs + +# Debug with interactive shell +docker run -it --entrypoint /bin/bash distribution-custom:dev +``` + +#### Kubernetes Issues + +```bash +# Check pod status +kubectl describe pod + +# View logs +kubectl logs -f + +# Check events +kubectl get events --sort-by=.metadata.creationTimestamp +``` + +### Getting Help + +1. **LlamaStack Documentation**: [Official docs](https://github.com/meta-llama/llama-stack) +2. **GitHub Issues**: Report bugs and ask questions +3. **Community Forums**: Join the LlamaStack community discussions +4. **Operator Documentation**: Check the Kubernetes operator guides + +## Next Steps + +- [vLLM Distribution](vllm.md) - Learn about vLLM-specific configurations +- [Ollama Distribution](ollama.md) - Explore Ollama distribution options +- [Configuration Reference](../reference/configuration.md) - Complete API reference +- [Scaling Guide](../how-to/scaling.md) - Scale your custom distributions diff --git a/docs/content/distributions/ollama.md b/docs/content/distributions/ollama.md new file mode 100644 index 000000000..5ffb01657 --- /dev/null +++ b/docs/content/distributions/ollama.md @@ -0,0 +1,733 @@ +# Ollama Distribution + +Ollama is a user-friendly platform for running large language models locally. The LlamaStack Kubernetes operator provides built-in support for Ollama through a pre-configured distribution. + +## Overview + +Ollama offers several advantages: + +- **Ease of Use**: Simple model management and deployment +- **Local Execution**: Run models entirely on your infrastructure +- **Model Library**: Access to a curated collection of popular models +- **Resource Efficiency**: Optimized for various hardware configurations +- **API Compatibility**: OpenAI-compatible API endpoints + +## Pre-Built Ollama Distribution + +The operator includes one pre-configured Ollama distribution: + +### ollama +- **Image**: `docker.io/llamastack/distribution-ollama:latest` +- **Purpose**: Standard Ollama deployment +- **Requirements**: CPU or GPU resources depending on model +- **Use Case**: General-purpose local LLM inference + +## Quick Start with Ollama + +### 1. Create a LlamaStackDistribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-ollama-distribution + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "ollama" # Use supported distribution + containerSpec: + port: 8321 + resources: + requests: + memory: "8Gi" + cpu: "4" + limits: + memory: "16Gi" + cpu: "8" + env: + - name: INFERENCE_MODEL + value: "llama3.2:1b" + storage: + size: "20Gi" + mountPath: "/.llama" +``` + +### 2. 
Deploy the Distribution + +```bash +kubectl apply -f ollama-distribution.yaml +``` + +### 3. Verify Deployment + +```bash +kubectl get llamastackdistribution my-ollama-distribution +kubectl get pods -l app=llama-stack +``` + +## Configuration Options + +### Container Specification + +The `containerSpec` section allows you to configure the container: + +```yaml +spec: + server: + containerSpec: + name: "llama-stack" # Optional, defaults to "llama-stack" + port: 8321 # Optional, defaults to 8321 + resources: + requests: + memory: "8Gi" + cpu: "4" + limits: + memory: "16Gi" + cpu: "8" + env: + - name: INFERENCE_MODEL + value: "llama3.2:1b" + - name: OLLAMA_HOST + value: "0.0.0.0:11434" + - name: OLLAMA_ORIGINS + value: "*" +``` + +### Environment Variables + +Configure Ollama behavior through environment variables: + +```yaml +env: + - name: INFERENCE_MODEL + value: "llama2:7b" + - name: OLLAMA_HOST + value: "0.0.0.0:11434" + - name: OLLAMA_ORIGINS + value: "*" + - name: OLLAMA_NUM_PARALLEL + value: "4" + - name: OLLAMA_MAX_LOADED_MODELS + value: "3" +``` + +### Popular Models + +You can specify different models using the `INFERENCE_MODEL` environment variable: + +```yaml +# Llama 2 variants +- name: INFERENCE_MODEL + value: "llama2:7b" # 7B parameter model +# value: "llama2:13b" # 13B parameter model +# value: "llama2:70b" # 70B parameter model + +# Code-focused models +# value: "codellama:7b" # Code generation +# value: "codellama:13b" # Larger code model + +# Chat-optimized models +# value: "llama2:7b-chat" +# value: "llama2:13b-chat" + +# Other popular models +# value: "mistral:7b" # Mistral 7B +# value: "neural-chat:7b" # Intel's neural chat +# value: "orca-mini:3b" # Smaller, efficient model +``` + +### Resource Requirements + +```yaml +resources: + requests: + memory: "8Gi" + cpu: "4" + limits: + memory: "16Gi" + cpu: "8" +``` + +### GPU Support + +For GPU acceleration: + +```yaml +resources: + requests: + nvidia.com/gpu: "1" + memory: "8Gi" + cpu: "2" + limits: + nvidia.com/gpu: "1" + memory: "16Gi" + cpu: "4" +env: + - name: INFERENCE_MODEL + value: "llama2:7b" + - name: OLLAMA_GPU_LAYERS + value: "35" # Number of layers to run on GPU +``` + +### Storage Configuration + +```yaml +storage: + size: "20Gi" + mountPath: "/.llama" # Optional, defaults to "/.llama" +``` + +## Advanced Configuration + +### Custom Model Management with Pod Overrides + +```yaml +spec: + server: + podOverrides: + volumes: + - name: ollama-models + persistentVolumeClaim: + claimName: ollama-models-pvc + volumeMounts: + - name: ollama-models + mountPath: /root/.ollama + containerSpec: + env: + - name: INFERENCE_MODEL + value: "llama3.2:1b" + - name: OLLAMA_MODELS + value: "/root/.ollama/models" +``` + +### Multiple Model Setup + +```yaml +spec: + server: + containerSpec: + env: + - name: INFERENCE_MODEL + value: "llama3.2:1b" # Primary model + - name: OLLAMA_MAX_LOADED_MODELS + value: "3" + - name: ADDITIONAL_MODELS + value: "codellama:7b,mistral:7b" # Additional models to pull + resources: + requests: + memory: "24Gi" + cpu: "8" + limits: + memory: "48Gi" + cpu: "16" +``` + +### Scaling with Multiple Replicas + +```yaml +spec: + replicas: 2 + server: + distribution: + name: "ollama" + containerSpec: + resources: + requests: + memory: "8Gi" + cpu: "4" + limits: + memory: "16Gi" + cpu: "8" +``` + +## Using Ollama with the Kubernetes Operator + +The LlamaStack Kubernetes operator supports Ollama in two ways: + +### 1. 
Pre-Built Distribution (Recommended) + +Use the pre-built, maintained distribution with the `distribution.name` field: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: ollama-distribution + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "ollama" # Supported distribution + containerSpec: + port: 8321 + resources: + requests: + memory: "8Gi" + cpu: "4" + limits: + memory: "16Gi" + cpu: "8" + env: + - name: INFERENCE_MODEL + value: "llama3.2:1b" + - name: OLLAMA_URL + value: "http://ollama-server-service.ollama-dist.svc.cluster.local:11434" + storage: + size: "20Gi" +``` + +#### With GPU Support + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: ollama-gpu-distribution + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "ollama" # Supported distribution + containerSpec: + resources: + requests: + nvidia.com/gpu: "1" + memory: "16Gi" + cpu: "8" + limits: + nvidia.com/gpu: "1" + memory: "32Gi" + cpu: "16" + env: + - name: INFERENCE_MODEL + value: "llama2:7b" + - name: OLLAMA_GPU_LAYERS + value: "35" + - name: OLLAMA_NUM_PARALLEL + value: "4" + storage: + size: "50Gi" +``` + +### 2. Bring Your Own (BYO) Custom Images + +Use custom-built distributions with the `distribution.image` field: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: custom-ollama-distribution + namespace: default +spec: + replicas: 1 + server: + distribution: + image: "my-registry.com/custom-ollama:v1.0.0" # Custom image + containerSpec: + resources: + requests: + memory: "16Gi" + cpu: "8" + limits: + memory: "32Gi" + cpu: "16" + env: + - name: INFERENCE_MODEL + value: "custom-model:latest" + - name: CUSTOM_OLLAMA_SETTING + value: "optimized" + storage: + size: "100Gi" +``` + +## Building Custom Ollama Distributions + +### Step 1: Build with LlamaStack CLI + +#### Option A: From Template + +```bash +# Install LlamaStack CLI +pip install llama-stack + +# Build from Ollama template +llama stack build --template ollama --image-type container --image-name my-ollama-dist +``` + +#### Option B: Custom Configuration + +Create `custom-ollama-build.yaml`: + +```yaml +name: custom-ollama +distribution_spec: + description: Custom Ollama distribution with pre-loaded models + providers: + inference: remote::ollama + memory: inline::faiss + safety: inline::llama-guard + agents: inline::meta-reference + telemetry: inline::meta-reference +image_name: custom-ollama +image_type: container +``` + +Build the distribution: + +```bash +llama stack build --config custom-ollama-build.yaml +``` + +### Step 2: Enhance with Custom Dockerfile + +Create `Dockerfile.enhanced`: + +```dockerfile +FROM distribution-custom-ollama:dev + +# Install additional tools +RUN apt-get update && apt-get install -y \ + curl \ + jq \ + htop \ + && rm -rf /var/lib/apt/lists/* + +# Pre-pull popular models +RUN ollama pull llama3.2:1b && \ + ollama pull llama3.2:3b && \ + ollama pull codellama:7b && \ + ollama pull mistral:7b + +# Add custom model management scripts +COPY scripts/model-manager.sh /usr/local/bin/model-manager +COPY scripts/health-check.sh /usr/local/bin/health-check +RUN chmod +x /usr/local/bin/model-manager /usr/local/bin/health-check + +# Add custom Ollama configuration +COPY ollama-config.json /etc/ollama/config.json + +# Set optimized environment variables +ENV OLLAMA_HOST=0.0.0.0:11434 +ENV OLLAMA_ORIGINS=* +ENV OLLAMA_NUM_PARALLEL=4 +ENV OLLAMA_MAX_LOADED_MODELS=3 
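+# NUM_PARALLEL bounds concurrent requests per loaded model, MAX_LOADED_MODELS caps resident models,
+# and KEEP_ALIVE below sets how long an idle model stays loaded in memory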
+ENV OLLAMA_KEEP_ALIVE=5m + +# Add health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ + CMD health-check + +EXPOSE 8321 11434 +``` + +### Step 3: Deploy with Operator + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: enhanced-ollama-dist + namespace: production +spec: + replicas: 2 + server: + distribution: + image: "my-registry.com/enhanced-ollama:v1.0.0" + containerSpec: + resources: + requests: + memory: "16Gi" + cpu: "8" + nvidia.com/gpu: "1" + limits: + memory: "32Gi" + cpu: "16" + nvidia.com/gpu: "1" + env: + - name: INFERENCE_MODEL + value: "llama3.2:3b" + - name: OLLAMA_NUM_PARALLEL + value: "4" + - name: OLLAMA_MAX_LOADED_MODELS + value: "2" + - name: CUSTOM_OPTIMIZATION + value: "enabled" + storage: + size: "200Gi" + podOverrides: + volumes: + - name: model-cache + persistentVolumeClaim: + claimName: shared-model-cache + volumeMounts: + - name: model-cache + mountPath: /shared-models +``` + +## Comparison: Pre-Built vs BYO + +| Aspect | Pre-Built Distribution | BYO Custom Images | +|--------|----------------------|-------------------| +| **Setup Complexity** | Simple - just specify `name` | Complex - build and maintain images | +| **Maintenance** | Maintained by LlamaStack team | You maintain the images | +| **Model Management** | Runtime model pulling | Pre-loaded models possible | +| **Customization** | Limited to environment variables | Full control over Ollama configuration | +| **Security** | Vetted by maintainers | You control security scanning and updates | +| **Performance** | Standard Ollama setup | Custom optimizations possible | +| **Support** | Community and official support | Self-supported | +| **Updates** | Automatic with operator updates | Manual image rebuilds required | + +### When to Use Pre-Built Distribution + +- **Quick deployment** and standard use cases +- **Production environments** where stability is key +- **Dynamic model management** (pull models at runtime) +- **Teams without container expertise** +- **Standard Ollama configurations** + +### When to Use BYO Custom Images + +- **Pre-loaded models** for faster startup +- **Custom Ollama configurations** or patches +- **Additional tools** and utilities +- **Compliance requirements** for image provenance +- **Integration** with existing model management systems +- **Custom model formats** or converters + +## Model Management + +### Accessing the Ollama Container + +```bash +# Connect to running Ollama pod +kubectl exec -it -- bash + +# Pull models +ollama pull llama2:7b + +# List available models +ollama list + +# Remove unused models +ollama rm old-model:tag +``` + +### Model Information + +```bash +# Show model details +kubectl exec -it -- ollama show llama2:7b + +# Check model size and parameters +kubectl exec -it -- ollama show llama2:7b --modelfile +``` + +## API Usage + +### REST API + +Ollama provides OpenAI-compatible endpoints: + +```bash +# Generate completion +curl -X POST http://ollama-service:8321/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama2:7b", + "prompt": "Why is the sky blue?", + "max_tokens": 100 + }' + +# Chat completion +curl -X POST http://ollama-service:8321/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama2:7b", + "messages": [ + {"role": "user", "content": "Hello!"} + ] + }' +``` + +### Python Client + +```python +import requests + +# Generate text +response = requests.post( + "http://ollama-service:8321/v1/completions", + 
json={ + "model": "llama2:7b", + "prompt": "Explain quantum computing", + "max_tokens": 200 + } +) + +print(response.json()) +``` + +## Monitoring and Troubleshooting + +### Health Checks + +```bash +# Check pod status +kubectl get pods -l app=llama-stack + +# View logs +kubectl logs -l app=llama-stack + +# Test API endpoint +kubectl port-forward svc/my-ollama-distribution-service 8321:8321 +curl http://localhost:8321/v1/health +``` + +### Performance Monitoring + +```bash +# Monitor resource usage +kubectl top pods -l app=llama-stack + +# Check model loading status +kubectl exec -it -- ollama ps +``` + +### Common Issues + +1. **Model Download Failures** + - Check internet connectivity + - Verify sufficient storage space + - Ensure proper permissions + +2. **Out of Memory** + - Use smaller models (3b, 7b instead of 13b, 70b) + - Increase memory limits + - Reduce concurrent requests + +3. **Slow Performance** + - Enable GPU acceleration + - Use faster storage for model cache + - Optimize model selection for hardware + +## Best Practices + +### Resource Planning + +- **Memory**: Allocate 2-4x model size in RAM +- **Storage**: Plan for model downloads and cache +- **CPU**: More cores improve concurrent request handling + +### Model Selection + +```yaml +# For development/testing +env: + - name: INFERENCE_MODEL + value: "orca-mini:3b" # Fast, lightweight + +# For general use +env: + - name: INFERENCE_MODEL + value: "llama2:7b" # Good balance of quality/performance + +# For high-quality responses +env: + - name: INFERENCE_MODEL + value: "llama2:13b" # Better quality, more resources + +# For code generation +env: + - name: INFERENCE_MODEL + value: "codellama:7b" # Specialized for coding tasks +``` + +### Security Considerations + +- Use private registries for custom images +- Implement network policies for API access +- Secure model storage with appropriate permissions +- Monitor API usage and implement rate limiting + +## Examples + +### Production Setup + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: production-ollama + namespace: llama-production +spec: + replicas: 2 + server: + distribution: + name: "ollama" + containerSpec: + resources: + requests: + memory: "16Gi" + cpu: "8" + nvidia.com/gpu: "1" + limits: + memory: "32Gi" + cpu: "16" + nvidia.com/gpu: "1" + env: + - name: INFERENCE_MODEL + value: "llama3.2:1b" + - name: OLLAMA_NUM_PARALLEL + value: "4" + - name: OLLAMA_MAX_LOADED_MODELS + value: "2" + storage: + size: "100Gi" +``` + +### Development Setup + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: dev-ollama + namespace: development +spec: + replicas: 1 + server: + distribution: + name: "ollama" + containerSpec: + resources: + requests: + memory: "4Gi" + cpu: "2" + limits: + memory: "8Gi" + cpu: "4" + env: + - name: INFERENCE_MODEL + value: "orca-mini:3b" + storage: + size: "20Gi" +``` + +## API Reference + +For complete API documentation, see: +- [API Reference](../reference/api.md) +- [Configuration Reference](../reference/configuration.md) + +## Next Steps + +- [Configure Storage](../how-to/configure-storage.md) +- [Scaling Guide](../how-to/scaling.md) +- [Monitoring Setup](../how-to/monitoring.md) +- [vLLM Distribution](vllm.md) +- [Understanding Distributions](../getting-started/distributions.md) diff --git a/docs/content/distributions/starter.md b/docs/content/distributions/starter.md new file mode 100644 index 000000000..c5114da4d --- /dev/null +++ 
b/docs/content/distributions/starter.md @@ -0,0 +1,363 @@ +# Starter Distribution + +The **Starter** distribution is the recommended default distribution for new users of the LlamaStack Kubernetes Operator. It provides a general-purpose LlamaStack deployment that's easy to set up and suitable for most use cases. + +## Overview + +The Starter distribution is designed to: + +- **Get you started quickly** with minimal configuration +- **Provide a stable foundation** for LlamaStack applications +- **Support common use cases** out of the box +- **Serve as a learning platform** for understanding LlamaStack concepts + +## Distribution Details + +| Property | Value | +|----------|-------| +| **Distribution Name** | `starter` | +| **Image** | `docker.io/llamastack/distribution-starter:latest` | +| **Use Case** | General-purpose LlamaStack deployment | +| **Requirements** | Basic Kubernetes resources | +| **Recommended For** | New users, development, prototyping | + +## Quick Start + +### 1. Create a Basic Starter Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-starter-llamastack + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "starter" + containerSpec: + port: 8321 + resources: + requests: + memory: "2Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "1" + storage: + size: "20Gi" +``` + +### 2. Deploy the Distribution + +```bash +kubectl apply -f starter-distribution.yaml +``` + +### 3. Verify Deployment + +```bash +# Check the distribution status +kubectl get llamastackdistribution my-starter-llamastack + +# Check the pods +kubectl get pods -l app=llama-stack + +# Check the service +kubectl get svc my-starter-llamastack-service +``` + +## Configuration Options + +### Basic Configuration + +```yaml +spec: + replicas: 1 + server: + distribution: + name: "starter" + containerSpec: + port: 8321 + resources: + requests: + memory: "2Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "1" + env: + - name: LOG_LEVEL + value: "INFO" + storage: + size: "20Gi" + mountPath: "/.llama" +``` + +### Environment Variables + +Common environment variables for the Starter distribution: + +```yaml +env: + - name: LOG_LEVEL + value: "INFO" # DEBUG, INFO, WARNING, ERROR + - name: SERVER_PORT + value: "8321" + - name: ENABLE_TELEMETRY + value: "true" +``` + +### Resource Requirements + +#### Development Setup +```yaml +resources: + requests: + memory: "1Gi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "500m" +``` + +#### Production Setup +```yaml +resources: + requests: + memory: "4Gi" + cpu: "1" + limits: + memory: "8Gi" + cpu: "2" +``` + +## Advanced Configuration + +### Using ConfigMaps + +You can provide custom configuration using ConfigMaps: + +```yaml +spec: + server: + distribution: + name: "starter" + userConfig: + configMapName: "my-llamastack-config" + configMapNamespace: "default" # Optional, defaults to same namespace +``` + +Create the ConfigMap: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: my-llamastack-config + namespace: default +data: + run.yaml: | + built_with: llama-stack-0.0.53 + called_from: /tmp + distribution: + description: Built by `llama stack build` from `starter` template + name: starter + providers: + agents: meta-reference + inference: meta-reference + memory: meta-reference + safety: meta-reference + telemetry: meta-reference + image_name: starter +``` + +### Scaling + +Scale your Starter distribution horizontally: + +```yaml +spec: + replicas: 3 + server: + distribution: + name: 
"starter" + containerSpec: + resources: + requests: + memory: "2Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "1" +``` + +### Custom Storage + +Configure persistent storage for your data: + +```yaml +spec: + server: + storage: + size: "50Gi" + mountPath: "/.llama" +``` + +## Use Cases + +### 1. Learning and Development + +Perfect for developers new to LlamaStack: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: learning-llamastack + namespace: development +spec: + replicas: 1 + server: + distribution: + name: "starter" + containerSpec: + resources: + requests: + memory: "1Gi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "500m" + env: + - name: LOG_LEVEL + value: "DEBUG" + storage: + size: "10Gi" +``` + +### 2. Prototyping Applications + +For building and testing LlamaStack applications: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: prototype-llamastack + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "starter" + containerSpec: + resources: + requests: + memory: "2Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "1" + storage: + size: "20Gi" +``` + +### 3. Small Production Workloads + +For lightweight production deployments: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: production-starter + namespace: production +spec: + replicas: 2 + server: + distribution: + name: "starter" + containerSpec: + resources: + requests: + memory: "4Gi" + cpu: "1" + limits: + memory: "8Gi" + cpu: "2" + env: + - name: LOG_LEVEL + value: "WARNING" + - name: ENABLE_TELEMETRY + value: "true" + storage: + size: "100Gi" +``` + +## Monitoring and Troubleshooting + +### Health Checks + +Check the health of your Starter distribution: + +```bash +# Check pod status +kubectl get pods -l app=llama-stack + +# View logs +kubectl logs -l app=llama-stack + +# Check service endpoints +kubectl get svc -l app=llama-stack +``` + +### Common Issues + +1. **Pod Not Starting** + - Check resource availability in your cluster + - Verify image pull permissions + - Review pod events: `kubectl describe pod ` + +2. **Service Not Accessible** + - Verify service creation: `kubectl get svc` + - Check port configuration + - Ensure network policies allow traffic + +3. **Storage Issues** + - Verify PVC creation: `kubectl get pvc` + - Check storage class availability + - Ensure sufficient cluster storage + +## Best Practices + +### Resource Planning +- Start with minimal resources and scale up as needed +- Monitor resource usage with `kubectl top pods` +- Use resource requests to ensure scheduling + +### Configuration Management +- Use ConfigMaps for complex configurations +- Store sensitive data in Secrets +- Version your configuration files + +### Monitoring +- Enable telemetry for production deployments +- Set up log aggregation +- Monitor pod health and resource usage + +## Next Steps + +Once you're comfortable with the Starter distribution, consider: + +1. **[Ollama Distribution](ollama.md)** - For local inference with Ollama +2. **[vLLM Distribution](vllm.md)** - For high-performance GPU inference +3. **[Bedrock Distribution](bedrock.md)** - For AWS Bedrock integration +4. 
**[Custom Images](bring-your-own.md)** - For specialized requirements + +## API Reference + +For complete API documentation, see: +- [API Reference](../reference/api.md) +- [Configuration Reference](../reference/configuration.md) diff --git a/docs/content/distributions/tgi.md b/docs/content/distributions/tgi.md new file mode 100644 index 000000000..c01fb1e69 --- /dev/null +++ b/docs/content/distributions/tgi.md @@ -0,0 +1,519 @@ +# Text Generation Inference (TGI) Distribution + +!!! warning "Distribution Availability" + The TGI distribution container image may not be currently maintained or available. + Please verify the image exists at `docker.io/llamastack/distribution-tgi:latest` before using this distribution. + For production use, consider using the `ollama` or `vllm` distributions which are actively maintained. + +The **TGI** distribution integrates with Hugging Face's Text Generation Inference (TGI) server, providing high-performance inference for large language models with optimized serving capabilities. + +## Overview + +Text Generation Inference (TGI) is Hugging Face's solution for deploying and serving Large Language Models. The TGI distribution: + +- **Connects to TGI servers** for optimized model inference +- **Supports streaming responses** for real-time applications +- **Provides high throughput** with batching and optimization +- **Compatible with Hugging Face models** from the Hub + +## Distribution Details + +| Property | Value | +|----------|-------| +| **Distribution Name** | `tgi` | +| **Image** | `docker.io/llamastack/distribution-tgi:latest` | +| **Use Case** | Hugging Face TGI server integration | +| **Requirements** | TGI server endpoint | +| **Recommended For** | High-performance inference, Hugging Face ecosystem | + +## Prerequisites + +### 1. TGI Server Setup + +You need a running TGI server. You can deploy one using: + +#### Option A: Deploy TGI in Kubernetes +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tgi-server +spec: + replicas: 1 + selector: + matchLabels: + app: tgi-server + template: + metadata: + labels: + app: tgi-server + spec: + containers: + - name: tgi + image: ghcr.io/huggingface/text-generation-inference:latest + ports: + - containerPort: 80 + env: + - name: MODEL_ID + value: "microsoft/DialoGPT-medium" + - name: PORT + value: "80" + resources: + requests: + memory: "4Gi" + cpu: "2" + limits: + memory: "8Gi" + cpu: "4" +--- +apiVersion: v1 +kind: Service +metadata: + name: tgi-server-service +spec: + selector: + app: tgi-server + ports: + - port: 80 + targetPort: 80 +``` + +#### Option B: External TGI Server +Use an existing TGI deployment (cloud or on-premises). + +## Quick Start + +### 1. Create TGI Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-tgi-llamastack + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "tgi" + containerSpec: + port: 8321 + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1" + env: + - name: TGI_URL + value: "http://tgi-server-service:80" + - name: MODEL_ID + value: "microsoft/DialoGPT-medium" + storage: + size: "10Gi" +``` + +### 2. Deploy the Distribution + +```bash +kubectl apply -f tgi-distribution.yaml +``` + +### 3. 
Verify Deployment + +```bash +# Check the distribution status +kubectl get llamastackdistribution my-tgi-llamastack + +# Check the pods +kubectl get pods -l app=llama-stack + +# Test TGI connectivity +kubectl logs -l app=llama-stack +``` + +## Configuration Options + +### Environment Variables + +Configure TGI connection and behavior: + +```yaml +env: + - name: TGI_URL + value: "http://tgi-server-service:80" + - name: MODEL_ID + value: "microsoft/DialoGPT-medium" + - name: TGI_TIMEOUT + value: "30" # Request timeout in seconds + - name: TGI_MAX_TOKENS + value: "512" + - name: TGI_TEMPERATURE + value: "0.7" + - name: TGI_TOP_P + value: "0.9" + - name: LOG_LEVEL + value: "INFO" +``` + +### TGI Server Configuration + +Common TGI server models and configurations: + +#### Small Models (Development) +```yaml +env: + - name: TGI_URL + value: "http://tgi-server:80" + - name: MODEL_ID + value: "microsoft/DialoGPT-small" # ~117M parameters + - name: TGI_MAX_TOKENS + value: "256" +``` + +#### Medium Models (Production) +```yaml +env: + - name: TGI_URL + value: "http://tgi-server:80" + - name: MODEL_ID + value: "microsoft/DialoGPT-medium" # ~345M parameters + - name: TGI_MAX_TOKENS + value: "512" +``` + +#### Large Models (High Performance) +```yaml +env: + - name: TGI_URL + value: "http://tgi-server:80" + - name: MODEL_ID + value: "microsoft/DialoGPT-large" # ~762M parameters + - name: TGI_MAX_TOKENS + value: "1024" +``` + +### Resource Requirements + +#### Lightweight Setup +```yaml +resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" +``` + +#### Standard Setup +```yaml +resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1" +``` + +#### High-Performance Setup +```yaml +resources: + requests: + memory: "2Gi" + cpu: "1" + limits: + memory: "4Gi" + cpu: "2" +``` + +## Advanced Configuration + +### Multiple TGI Servers + +Connect to multiple TGI servers for load balancing: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: multi-tgi-llamastack +spec: + replicas: 2 + server: + distribution: + name: "tgi" + containerSpec: + env: + - name: TGI_URLS + value: "http://tgi-server-1:80,http://tgi-server-2:80" + - name: TGI_LOAD_BALANCE + value: "round_robin" # round_robin, random, least_connections +``` + +### TGI with GPU Support + +For GPU-accelerated TGI servers: + +```yaml +# TGI Server with GPU +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tgi-gpu-server +spec: + template: + spec: + containers: + - name: tgi + image: ghcr.io/huggingface/text-generation-inference:latest + env: + - name: MODEL_ID + value: "meta-llama/Llama-2-7b-chat-hf" + - name: CUDA_VISIBLE_DEVICES + value: "0" + resources: + requests: + nvidia.com/gpu: "1" + memory: "16Gi" + limits: + nvidia.com/gpu: "1" + memory: "32Gi" +``` + +### Custom TGI Configuration + +Use ConfigMaps for complex TGI configurations: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: tgi-config +data: + tgi-settings.json: | + { + "max_concurrent_requests": 128, + "max_best_of": 2, + "max_stop_sequences": 4, + "max_input_length": 1024, + "max_total_tokens": 2048, + "waiting_served_ratio": 1.2, + "max_batch_prefill_tokens": 4096, + "max_batch_total_tokens": 8192 + } +--- +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: custom-tgi-llamastack +spec: + server: + distribution: + name: "tgi" + containerSpec: + env: + - name: TGI_CONFIG_FILE + value: "/config/tgi-settings.json" + 
podOverrides: + volumes: + - name: tgi-config + configMap: + name: tgi-config + volumeMounts: + - name: tgi-config + mountPath: /config +``` + +## Use Cases + +### 1. Development Environment + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: dev-tgi + namespace: development +spec: + replicas: 1 + server: + distribution: + name: "tgi" + containerSpec: + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" + env: + - name: TGI_URL + value: "http://tgi-dev-server:80" + - name: MODEL_ID + value: "microsoft/DialoGPT-small" + - name: LOG_LEVEL + value: "DEBUG" + storage: + size: "5Gi" +``` + +### 2. Production Deployment + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: production-tgi + namespace: production +spec: + replicas: 3 + server: + distribution: + name: "tgi" + containerSpec: + resources: + requests: + memory: "2Gi" + cpu: "1" + limits: + memory: "4Gi" + cpu: "2" + env: + - name: TGI_URL + value: "http://tgi-prod-server:80" + - name: MODEL_ID + value: "meta-llama/Llama-2-7b-chat-hf" + - name: TGI_MAX_TOKENS + value: "1024" + - name: TGI_TEMPERATURE + value: "0.7" + - name: ENABLE_TELEMETRY + value: "true" + storage: + size: "50Gi" +``` + +### 3. High-Throughput Setup + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: high-throughput-tgi + namespace: production +spec: + replicas: 5 + server: + distribution: + name: "tgi" + containerSpec: + resources: + requests: + memory: "4Gi" + cpu: "2" + limits: + memory: "8Gi" + cpu: "4" + env: + - name: TGI_URLS + value: "http://tgi-1:80,http://tgi-2:80,http://tgi-3:80" + - name: TGI_LOAD_BALANCE + value: "least_connections" + - name: TGI_TIMEOUT + value: "60" + - name: TGI_MAX_CONCURRENT_REQUESTS + value: "256" +``` + +## Monitoring and Troubleshooting + +### Health Checks + +```bash +# Check distribution status +kubectl get llamastackdistribution + +# Check TGI connectivity +kubectl logs -l app=llama-stack | grep -i tgi + +# Test TGI server directly +kubectl exec -it -- curl http://tgi-server:80/health +``` + +### Performance Monitoring + +```bash +# Monitor resource usage +kubectl top pods -l app=llama-stack + +# Check TGI server metrics +kubectl exec -it -- curl http://localhost:80/metrics + +# Monitor request latency +kubectl logs -l app=llama-stack | grep -i "response_time" +``` + +### Common Issues + +1. **TGI Server Unreachable** + ```bash + # Check TGI server status + kubectl get pods -l app=tgi-server + kubectl logs -l app=tgi-server + + # Test connectivity + kubectl exec -it -- curl http://tgi-server:80/health + ``` + +2. **Model Loading Failures** + - Verify model ID exists on Hugging Face Hub + - Check TGI server has sufficient resources + - Ensure model is compatible with TGI + +3. 
**Timeout Issues** + - Increase `TGI_TIMEOUT` value + - Check TGI server performance + - Monitor network latency + +## Best Practices + +### Performance Optimization +- Use appropriate batch sizes for your workload +- Configure TGI server with optimal parameters +- Monitor and tune timeout values +- Use multiple TGI servers for high availability + +### Resource Management +- Size TGI servers based on model requirements +- Monitor GPU utilization if using GPU acceleration +- Scale LlamaStack replicas based on request volume +- Use resource requests and limits + +### Security +- Secure TGI server endpoints with authentication +- Use network policies to restrict access +- Monitor API usage and implement rate limiting +- Keep TGI server images updated + +### Model Management +- Version your models and TGI configurations +- Test model changes in development first +- Monitor model performance and accuracy +- Have rollback procedures for model updates + +## Next Steps + +- [Configure Scaling](../how-to-guides/scaling.md) +- [Set up Monitoring](../how-to-guides/monitoring.md) +- [Security Configuration](../how-to-guides/security.md) +- [Performance Tuning](../how-to-guides/performance.md) + +## API Reference + +For complete API documentation, see: +- [API Reference](../reference/api.md) +- [Configuration Reference](../reference/configuration.md) diff --git a/docs/content/distributions/together.md b/docs/content/distributions/together.md new file mode 100644 index 000000000..31cf59337 --- /dev/null +++ b/docs/content/distributions/together.md @@ -0,0 +1,553 @@ +# Together AI Distribution + +!!! warning "Distribution Availability" + The Together distribution container image may not be currently maintained or available. + Please verify the image exists at `docker.io/llamastack/distribution-together:latest` before using this distribution. + For production use, consider using the `ollama` or `vllm` distributions which are actively maintained. + +The **Together** distribution integrates with Together AI's inference platform, providing access to a wide variety of open-source models through their optimized API service. + +## Overview + +Together AI offers fast, scalable inference for open-source language models. The Together distribution: + +- **Connects to Together AI API** for model inference +- **Supports multiple open-source models** (Llama, Mistral, CodeLlama, etc.) +- **Provides high-performance inference** with optimized serving +- **Offers cost-effective scaling** with pay-per-use pricing + +## Distribution Details + +| Property | Value | +|----------|-------| +| **Distribution Name** | `together` | +| **Image** | `docker.io/llamastack/distribution-together:latest` | +| **Use Case** | Together AI API integration | +| **Requirements** | Together AI API key | +| **Recommended For** | Open-source models, cost-effective inference | + +## Prerequisites + +### 1. Together AI Account + +- Sign up at [together.ai](https://together.ai) +- Get your API key from the dashboard +- Choose your preferred models + +### 2. API Key Setup + +Create a Kubernetes secret with your Together AI API key: + +```bash +kubectl create secret generic together-api-key \ + --from-literal=TOGETHER_API_KEY=your-api-key-here +``` + +## Quick Start + +### 1. 
Create Together Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-together-llamastack + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "together" + containerSpec: + port: 8321 + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1" + env: + - name: TOGETHER_API_KEY + valueFrom: + secretKeyRef: + name: together-api-key + key: TOGETHER_API_KEY + - name: TOGETHER_MODEL + value: "meta-llama/Llama-2-7b-chat-hf" + storage: + size: "10Gi" +``` + +### 2. Deploy the Distribution + +```bash +kubectl apply -f together-distribution.yaml +``` + +### 3. Verify Deployment + +```bash +# Check the distribution status +kubectl get llamastackdistribution my-together-llamastack + +# Check the pods +kubectl get pods -l app=llama-stack + +# Check logs for Together AI connectivity +kubectl logs -l app=llama-stack +``` + +## Configuration Options + +### Supported Models + +Together AI supports many popular open-source models: + +#### Meta Llama Models +```yaml +env: + - name: TOGETHER_MODEL + value: "meta-llama/Llama-2-7b-chat-hf" + # value: "meta-llama/Llama-2-13b-chat-hf" + # value: "meta-llama/Llama-2-70b-chat-hf" + # value: "meta-llama/CodeLlama-7b-Instruct-hf" + # value: "meta-llama/CodeLlama-13b-Instruct-hf" +``` + +#### Mistral Models +```yaml +env: + - name: TOGETHER_MODEL + value: "mistralai/Mistral-7B-Instruct-v0.1" + # value: "mistralai/Mixtral-8x7B-Instruct-v0.1" +``` + +#### Other Popular Models +```yaml +env: + - name: TOGETHER_MODEL + value: "togethercomputer/RedPajama-INCITE-7B-Chat" + # value: "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO" + # value: "teknium/OpenHermes-2.5-Mistral-7B" +``` + +### Environment Variables + +Configure Together AI connection and model parameters: + +```yaml +env: + - name: TOGETHER_API_KEY + valueFrom: + secretKeyRef: + name: together-api-key + key: TOGETHER_API_KEY + - name: TOGETHER_MODEL + value: "meta-llama/Llama-2-7b-chat-hf" + - name: TOGETHER_MAX_TOKENS + value: "512" + - name: TOGETHER_TEMPERATURE + value: "0.7" + - name: TOGETHER_TOP_P + value: "0.9" + - name: TOGETHER_TOP_K + value: "50" + - name: TOGETHER_REPETITION_PENALTY + value: "1.0" + - name: TOGETHER_TIMEOUT + value: "30" # Request timeout in seconds + - name: LOG_LEVEL + value: "INFO" +``` + +### Resource Requirements + +#### Development Setup +```yaml +resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" +``` + +#### Production Setup +```yaml +resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1" +``` + +#### High-Throughput Setup +```yaml +resources: + requests: + memory: "2Gi" + cpu: "1" + limits: + memory: "4Gi" + cpu: "2" +``` + +## Advanced Configuration + +### Multiple Models + +Deploy different distributions for different models: + +```yaml +# Llama 2 7B for general chat +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: together-llama2-7b +spec: + server: + distribution: + name: "together" + containerSpec: + env: + - name: TOGETHER_MODEL + value: "meta-llama/Llama-2-7b-chat-hf" +--- +# CodeLlama for code generation +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: together-codellama +spec: + server: + distribution: + name: "together" + containerSpec: + env: + - name: TOGETHER_MODEL + value: "meta-llama/CodeLlama-7b-Instruct-hf" +``` + +### Production Configuration + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: 
LlamaStackDistribution +metadata: + name: production-together + namespace: production +spec: + replicas: 3 + server: + distribution: + name: "together" + containerSpec: + resources: + requests: + memory: "2Gi" + cpu: "1" + limits: + memory: "4Gi" + cpu: "2" + env: + - name: TOGETHER_API_KEY + valueFrom: + secretKeyRef: + name: together-api-key + key: TOGETHER_API_KEY + - name: TOGETHER_MODEL + value: "meta-llama/Llama-2-13b-chat-hf" + - name: TOGETHER_MAX_TOKENS + value: "1024" + - name: TOGETHER_TEMPERATURE + value: "0.7" + - name: TOGETHER_TIMEOUT + value: "60" + - name: LOG_LEVEL + value: "WARNING" + - name: ENABLE_TELEMETRY + value: "true" + storage: + size: "20Gi" +``` + +### Custom Configuration with ConfigMap + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: together-config +data: + together-settings.json: | + { + "default_model": "meta-llama/Llama-2-7b-chat-hf", + "max_tokens": 512, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50, + "repetition_penalty": 1.0, + "stop_sequences": ["", "[INST]", "[/INST]"], + "retry_config": { + "max_retries": 3, + "backoff_factor": 2, + "max_backoff": 60 + } + } +--- +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: custom-together +spec: + server: + distribution: + name: "together" + containerSpec: + env: + - name: TOGETHER_CONFIG_FILE + value: "/config/together-settings.json" + podOverrides: + volumes: + - name: together-config + configMap: + name: together-config + volumeMounts: + - name: together-config + mountPath: /config +``` + +## Use Cases + +### 1. Development and Prototyping + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: dev-together + namespace: development +spec: + replicas: 1 + server: + distribution: + name: "together" + containerSpec: + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" + env: + - name: TOGETHER_API_KEY + valueFrom: + secretKeyRef: + name: together-api-key + key: TOGETHER_API_KEY + - name: TOGETHER_MODEL + value: "meta-llama/Llama-2-7b-chat-hf" + - name: TOGETHER_MAX_TOKENS + value: "256" + - name: LOG_LEVEL + value: "DEBUG" + storage: + size: "5Gi" +``` + +### 2. Code Generation Service + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: code-generation-together + namespace: default +spec: + replicas: 2 + server: + distribution: + name: "together" + containerSpec: + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1" + env: + - name: TOGETHER_API_KEY + valueFrom: + secretKeyRef: + name: together-api-key + key: TOGETHER_API_KEY + - name: TOGETHER_MODEL + value: "meta-llama/CodeLlama-13b-Instruct-hf" + - name: TOGETHER_MAX_TOKENS + value: "2048" + - name: TOGETHER_TEMPERATURE + value: "0.1" # Lower temperature for code + storage: + size: "15Gi" +``` + +### 3. 
High-Volume Production + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: high-volume-together + namespace: production +spec: + replicas: 5 + server: + distribution: + name: "together" + containerSpec: + resources: + requests: + memory: "2Gi" + cpu: "1" + limits: + memory: "4Gi" + cpu: "2" + env: + - name: TOGETHER_API_KEY + valueFrom: + secretKeyRef: + name: together-api-key + key: TOGETHER_API_KEY + - name: TOGETHER_MODEL + value: "meta-llama/Llama-2-70b-chat-hf" + - name: TOGETHER_MAX_TOKENS + value: "1024" + - name: TOGETHER_TIMEOUT + value: "120" + - name: ENABLE_TELEMETRY + value: "true" + storage: + size: "50Gi" +``` + +## Monitoring and Troubleshooting + +### Health Checks + +```bash +# Check distribution status +kubectl get llamastackdistribution + +# Check API connectivity +kubectl logs -l app=llama-stack | grep -i together + +# Test API key +kubectl exec -it -- curl -H "Authorization: Bearer $TOGETHER_API_KEY" \ + https://api.together.xyz/v1/models +``` + +### Performance Monitoring + +```bash +# Monitor resource usage +kubectl top pods -l app=llama-stack + +# Check API response times +kubectl logs -l app=llama-stack | grep -i "response_time" + +# Monitor API usage +kubectl logs -l app=llama-stack | grep -i "api_usage" +``` + +### Common Issues + +1. **Invalid API Key** + ```bash + # Verify API key in secret + kubectl get secret together-api-key -o yaml + + # Test API key manually + kubectl exec -it -- env | grep TOGETHER_API_KEY + ``` + +2. **Model Not Available** + - Check if model exists in Together AI catalog + - Verify model name spelling and format + - Some models may have usage restrictions + +3. **Rate Limiting** + - Monitor API usage and limits + - Implement request queuing + - Consider upgrading Together AI plan + +4. 
**Timeout Issues** + - Increase `TOGETHER_TIMEOUT` value + - Check network connectivity + - Monitor Together AI service status + +## Best Practices + +### Cost Optimization +- Choose appropriate models for your use case +- Monitor token usage and optimize prompts +- Use smaller models for development/testing +- Implement caching for repeated requests +- Set up usage alerts and budgets + +### Performance +- Scale replicas based on request volume +- Use connection pooling and keep-alive +- Implement request batching where possible +- Monitor and optimize timeout values + +### Security +- Store API keys in Kubernetes Secrets +- Use least-privilege access controls +- Monitor API usage for anomalies +- Rotate API keys regularly +- Implement rate limiting and request validation + +### Reliability +- Implement retry logic with exponential backoff +- Use multiple replicas for high availability +- Monitor Together AI service status +- Have fallback mechanisms for service outages + +## Cost Management + +### Usage Monitoring +```yaml +env: + - name: ENABLE_USAGE_TRACKING + value: "true" + - name: USAGE_LOG_LEVEL + value: "INFO" + - name: COST_ALERT_THRESHOLD + value: "100" # Alert when daily cost exceeds $100 +``` + +### Budget Controls +- Set up billing alerts in Together AI dashboard +- Implement request quotas per user/application +- Monitor token usage patterns +- Use smaller models for non-critical workloads + +## Next Steps + +- [Configure Monitoring](../how-to-guides/monitoring.md) +- [Set up Scaling](../how-to-guides/scaling.md) +- [Security Best Practices](../how-to-guides/security.md) +- [Cost Optimization](../how-to-guides/cost-optimization.md) + +## API Reference + +For complete API documentation, see: +- [API Reference](../reference/api.md) +- [Configuration Reference](../reference/configuration.md) +- [Together AI API Documentation](https://docs.together.ai/) diff --git a/docs/content/distributions/vllm.md b/docs/content/distributions/vllm.md new file mode 100644 index 000000000..a1afd214d --- /dev/null +++ b/docs/content/distributions/vllm.md @@ -0,0 +1,613 @@ +# vLLM Distribution + +vLLM is a high-performance inference engine optimized for large language models. The LlamaStack Kubernetes operator provides built-in support for vLLM through pre-configured distributions. + +## Overview + +vLLM offers excellent performance characteristics: + +- **High Throughput**: Optimized for serving multiple concurrent requests +- **Memory Efficiency**: Advanced memory management and attention mechanisms +- **GPU Acceleration**: Native CUDA support for NVIDIA GPUs +- **Model Compatibility**: Supports a wide range of popular model architectures + +## Pre-Built vLLM Distributions + +The operator includes two pre-built vLLM distributions: + +### vllm-gpu (Self-Hosted) +- **Image**: `docker.io/llamastack/distribution-vllm-gpu:latest` +- **Purpose**: GPU-accelerated vLLM inference with local model serving +- **Requirements**: NVIDIA GPU with CUDA support +- **Infrastructure**: You provide GPU infrastructure +- **Use Case**: High-performance inference for production workloads + +### remote-vllm (External Connection) +- **Image**: `docker.io/llamastack/distribution-remote-vllm:latest` +- **Purpose**: Connect to external vLLM server +- **Requirements**: Access to external vLLM endpoint +- **Infrastructure**: External vLLM server required +- **Use Case**: Using existing vLLM deployments or managed services + +## Quick Start with vLLM + +### 1. 
Create a LlamaStackDistribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-vllm-distribution + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "vllm-gpu" # Use supported distribution + containerSpec: + port: 8321 + resources: + requests: + nvidia.com/gpu: "1" + memory: "16Gi" + cpu: "4" + limits: + nvidia.com/gpu: "1" + memory: "32Gi" + cpu: "8" + env: + - name: INFERENCE_MODEL + value: "meta-llama/Llama-2-7b-chat-hf" + storage: + size: "50Gi" + mountPath: "/.llama" +``` + +### 2. Deploy the Distribution + +```bash +kubectl apply -f vllm-distribution.yaml +``` + +### 3. Verify Deployment + +```bash +kubectl get llamastackdistribution my-vllm-distribution +kubectl get pods -l app=llama-stack +``` + +## Configuration Options + +### Container Specification + +The `containerSpec` section allows you to configure the container: + +```yaml +spec: + server: + containerSpec: + name: "llama-stack" # Optional, defaults to "llama-stack" + port: 8321 # Optional, defaults to 8321 + resources: + requests: + nvidia.com/gpu: "1" + memory: "16Gi" + cpu: "4" + limits: + nvidia.com/gpu: "1" + memory: "32Gi" + cpu: "8" + env: + - name: INFERENCE_MODEL + value: "meta-llama/Llama-2-7b-chat-hf" + - name: VLLM_GPU_MEMORY_UTILIZATION + value: "0.9" + - name: VLLM_MAX_SEQ_LEN + value: "4096" +``` + +### Environment Variables + +Configure vLLM behavior through environment variables: + +```yaml +env: + - name: INFERENCE_MODEL + value: "meta-llama/Llama-2-7b-chat-hf" + - name: VLLM_GPU_MEMORY_UTILIZATION + value: "0.9" + - name: VLLM_MAX_SEQ_LEN + value: "4096" + - name: VLLM_MAX_BATCH_SIZE + value: "32" + - name: VLLM_TENSOR_PARALLEL_SIZE + value: "1" +``` + +### Resource Requirements + +```yaml +resources: + requests: + nvidia.com/gpu: "1" + memory: "16Gi" + cpu: "4" + limits: + nvidia.com/gpu: "1" + memory: "32Gi" + cpu: "8" +``` + +### Storage Configuration + +```yaml +storage: + size: "50Gi" + mountPath: "/.llama" # Optional, defaults to "/.llama" +``` + +## Advanced Configuration + +### Multi-GPU Setup + +For larger models requiring multiple GPUs: + +```yaml +spec: + server: + containerSpec: + resources: + requests: + nvidia.com/gpu: "4" + memory: "64Gi" + cpu: "16" + limits: + nvidia.com/gpu: "4" + memory: "128Gi" + cpu: "32" + env: + - name: INFERENCE_MODEL + value: "meta-llama/Llama-2-70b-chat-hf" + - name: VLLM_TENSOR_PARALLEL_SIZE + value: "4" +``` + +### Custom Volumes with Pod Overrides + +```yaml +spec: + server: + podOverrides: + volumes: + - name: model-cache + persistentVolumeClaim: + claimName: model-cache-pvc + volumeMounts: + - name: model-cache + mountPath: /models + containerSpec: + env: + - name: INFERENCE_MODEL + value: "/models/custom-llama-model" +``` + +### Scaling with Multiple Replicas + +```yaml +spec: + replicas: 3 + server: + distribution: + name: "vllm-gpu" + containerSpec: + resources: + requests: + nvidia.com/gpu: "1" + memory: "16Gi" + limits: + nvidia.com/gpu: "1" + memory: "32Gi" +``` + +## Using vLLM with the Kubernetes Operator + +The LlamaStack Kubernetes operator supports vLLM in two ways: + +### 1. 
Pre-Built Distributions (Recommended) + +Use pre-built, maintained distributions with the `distribution.name` field: + +#### vllm-gpu Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: vllm-gpu-distribution + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "vllm-gpu" # Supported distribution + containerSpec: + resources: + requests: + nvidia.com/gpu: "1" + memory: "16Gi" + cpu: "4" + limits: + nvidia.com/gpu: "1" + memory: "32Gi" + cpu: "8" + env: + - name: INFERENCE_MODEL + value: "meta-llama/Llama-2-7b-chat-hf" + - name: VLLM_GPU_MEMORY_UTILIZATION + value: "0.9" + storage: + size: "50Gi" +``` + +#### remote-vllm Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: remote-vllm-distribution + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "remote-vllm" # Supported distribution + containerSpec: + resources: + requests: + memory: "4Gi" + cpu: "2" + limits: + memory: "8Gi" + cpu: "4" + env: + - name: INFERENCE_MODEL + value: "meta-llama/Llama-2-7b-chat-hf" + - name: VLLM_URL + value: "http://external-vllm-service:8000" +``` + +### 2. Bring Your Own (BYO) Custom Images + +Use custom-built distributions with the `distribution.image` field: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: custom-vllm-distribution + namespace: default +spec: + replicas: 1 + server: + distribution: + image: "my-registry.com/custom-vllm:v1.0.0" # Custom image + containerSpec: + resources: + requests: + nvidia.com/gpu: "2" + memory: "32Gi" + cpu: "8" + limits: + nvidia.com/gpu: "2" + memory: "64Gi" + cpu: "16" + env: + - name: INFERENCE_MODEL + value: "my-custom-model" + - name: CUSTOM_VLLM_SETTING + value: "optimized" + storage: + size: "100Gi" +``` + +## Building Custom vLLM Distributions + +### Step 1: Build with LlamaStack CLI + +#### Option A: From Template + +```bash +# Install LlamaStack CLI +pip install llama-stack + +# Build from vLLM template +llama stack build --template vllm-gpu --image-type container --image-name my-vllm-dist +``` + +#### Option B: Custom Configuration + +Create `custom-vllm-build.yaml`: + +```yaml +name: custom-vllm +distribution_spec: + description: Custom vLLM distribution with optimizations + providers: + inference: inline::vllm + memory: inline::faiss + safety: inline::llama-guard + agents: inline::meta-reference + telemetry: inline::meta-reference +image_name: custom-vllm +image_type: container +``` + +Build the distribution: + +```bash +llama stack build --config custom-vllm-build.yaml +``` + +### Step 2: Enhance with Custom Dockerfile + +Create `Dockerfile.enhanced`: + +```dockerfile +FROM distribution-custom-vllm:dev + +# Install additional dependencies +RUN pip install \ + flash-attn \ + custom-optimization-lib \ + monitoring-tools + +# Add custom configurations +COPY vllm-config.json /app/config.json +COPY custom-models/ /app/models/ + +# Set optimization environment variables +ENV VLLM_USE_FLASH_ATTN=1 +ENV VLLM_OPTIMIZATION_LEVEL=high +ENV CUSTOM_GPU_SETTINGS=enabled + +# Add health check script +COPY health-check.sh /app/health-check.sh +RUN chmod +x /app/health-check.sh + +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ + CMD /app/health-check.sh + +EXPOSE 8321 +``` + +Build the enhanced image: + +```bash +docker build -f Dockerfile.enhanced -t my-registry.com/enhanced-vllm:v1.0.0 . 
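+# Note: if your build host architecture differs from your cluster nodes
+# (for example building on Apple Silicon for x86_64 GPU nodes), add
+# --platform linux/amd64 to the build command above.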
+``` + +### Step 3: Push to Registry + +```bash +# Tag for your registry +docker tag my-registry.com/enhanced-vllm:v1.0.0 my-registry.com/enhanced-vllm:latest + +# Push to registry +docker push my-registry.com/enhanced-vllm:v1.0.0 +docker push my-registry.com/enhanced-vllm:latest +``` + +### Step 4: Deploy with Operator + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: enhanced-vllm-dist + namespace: production +spec: + replicas: 2 + server: + distribution: + image: "my-registry.com/enhanced-vllm:v1.0.0" + containerSpec: + resources: + requests: + nvidia.com/gpu: "2" + memory: "32Gi" + cpu: "8" + limits: + nvidia.com/gpu: "2" + memory: "64Gi" + cpu: "16" + env: + - name: INFERENCE_MODEL + value: "meta-llama/Llama-2-13b-chat-hf" + - name: VLLM_TENSOR_PARALLEL_SIZE + value: "2" + - name: VLLM_GPU_MEMORY_UTILIZATION + value: "0.85" + - name: CUSTOM_OPTIMIZATION + value: "enabled" + storage: + size: "200Gi" + podOverrides: + volumes: + - name: model-cache + persistentVolumeClaim: + claimName: shared-model-cache + volumeMounts: + - name: model-cache + mountPath: /shared-models +``` + +## Comparison: Pre-Built vs BYO + +| Aspect | Pre-Built Distributions | BYO Custom Images | +|--------|------------------------|-------------------| +| **Setup Complexity** | Simple - just specify `name` | Complex - build and maintain images | +| **Maintenance** | Maintained by LlamaStack team | You maintain the images | +| **Customization** | Limited to environment variables | Full control over dependencies and configuration | +| **Security** | Vetted by maintainers | You control security scanning and updates | +| **Performance** | Standard optimizations | Custom optimizations possible | +| **Support** | Community and official support | Self-supported | +| **Updates** | Automatic with operator updates | Manual image rebuilds required | + +### When to Use Pre-Built Distributions + +- **Quick deployment** and standard use cases +- **Production environments** where stability is key +- **Limited customization** requirements +- **Teams without container expertise** + +### When to Use BYO Custom Images + +- **Specialized models** or inference engines +- **Custom optimizations** for specific hardware +- **Additional dependencies** not in standard images +- **Compliance requirements** for image provenance +- **Integration** with existing infrastructure + +## Monitoring and Troubleshooting + +### Health Checks + +The vLLM distribution includes built-in health checks: + +```bash +# Check pod status +kubectl get pods -l app=llama-stack + +# View logs +kubectl logs -l app=llama-stack + +# Check service endpoints +kubectl get svc my-vllm-distribution-service +``` + +### Performance Monitoring + +```bash +# Monitor GPU utilization +kubectl exec -it -- nvidia-smi + +# Check memory usage +kubectl top pods -l app=llama-stack +``` + +### Common Issues + +1. **GPU Not Available** + - Ensure NVIDIA device plugin is installed + - Verify GPU resources in node capacity + +2. **Out of Memory** + - Reduce `VLLM_GPU_MEMORY_UTILIZATION` + - Increase memory limits + - Use smaller models + +3. 
**Model Loading Failures** + - Check model path and permissions + - Verify sufficient storage space + - Check environment variable values + +## Best Practices + +### Resource Planning + +- **GPU Memory**: Ensure sufficient VRAM for model + batch processing +- **CPU**: Allocate adequate CPU for preprocessing and coordination +- **Storage**: Use fast storage (NVMe SSD) for model loading + +### Environment Variable Guidelines + +- Use `INFERENCE_MODEL` to specify the model to load +- Set `VLLM_GPU_MEMORY_UTILIZATION` to control GPU memory usage (0.8-0.9 recommended) +- Configure `VLLM_MAX_SEQ_LEN` based on your use case requirements +- Use `VLLM_TENSOR_PARALLEL_SIZE` for multi-GPU setups + +### Security + +- Use private registries for custom images +- Implement proper RBAC for distribution management +- Secure model storage with appropriate access controls + +## Examples + +### Production Setup + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: production-vllm + namespace: llama-production +spec: + replicas: 2 + server: + distribution: + name: "vllm-gpu" + containerSpec: + resources: + requests: + nvidia.com/gpu: "2" + memory: "32Gi" + cpu: "8" + limits: + nvidia.com/gpu: "2" + memory: "64Gi" + cpu: "16" + env: + - name: INFERENCE_MODEL + value: "meta-llama/Llama-2-13b-chat-hf" + - name: VLLM_TENSOR_PARALLEL_SIZE + value: "2" + - name: VLLM_GPU_MEMORY_UTILIZATION + value: "0.85" + storage: + size: "100Gi" +``` + +### Development Setup + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: dev-vllm + namespace: development +spec: + replicas: 1 + server: + distribution: + name: "vllm-gpu" + containerSpec: + resources: + requests: + nvidia.com/gpu: "1" + memory: "8Gi" + cpu: "2" + limits: + nvidia.com/gpu: "1" + memory: "16Gi" + cpu: "4" + env: + - name: INFERENCE_MODEL + value: "microsoft/DialoGPT-small" + storage: + size: "20Gi" +``` + +## API Reference + +For complete API documentation, see: +- [API Reference](../reference/api.md) +- [Configuration Reference](../reference/configuration.md) + +## Next Steps + +- [Configure Storage](../how-to/configure-storage.md) +- [Scaling Guide](../how-to/scaling.md) +- [Monitoring Setup](../how-to/monitoring.md) +- [Ollama Distribution](ollama.md) diff --git a/docs/content/examples/basic-deployment.md b/docs/content/examples/basic-deployment.md new file mode 100644 index 000000000..0ce40fd0a --- /dev/null +++ b/docs/content/examples/basic-deployment.md @@ -0,0 +1,328 @@ +# Basic Deployment Example + +This example demonstrates a simple LlamaStack deployment suitable for development and testing environments. + +## Overview + +This configuration creates a single-replica LlamaStack instance using the ollama distribution with basic resource allocation. + +## Configuration + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: basic-llamastack + namespace: default + labels: + app: llamastack + environment: development +spec: + replicas: 1 + server: + distribution: + name: "ollama" + containerSpec: + name: "llama-stack" + port: 8321 + resources: + requests: + memory: "2Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "1" + env: + - name: LOG_LEVEL + value: "info" + - name: INFERENCE_MODEL + value: "meta-llama/Llama-3.2-3B-Instruct" +``` + +## Deployment Steps + +1. **Save the configuration** to a file named `basic-deployment.yaml` + +2. **Apply the configuration**: + ```bash + kubectl apply -f basic-deployment.yaml + ``` + +3. 
**Verify the deployment**: + ```bash + kubectl get llamastackdistribution basic-llamastack + kubectl get pods -l app=llama-stack + ``` + +4. **Check the status**: + ```bash + kubectl describe llamastackdistribution basic-llamastack + ``` + +## Expected Resources + +This deployment will create: + +- **Deployment**: `basic-llamastack` with 1 replica +- **Service**: `basic-llamastack` exposing port 8321 +- **ConfigMap**: Configuration for the LlamaStack instance +- **Pod**: Single pod running the LlamaStack container + +## Accessing the Service + +### Port Forward (Development) + +```bash +kubectl port-forward service/basic-llamastack 8321:8321 +``` + +Access at: `http://localhost:8321` + +### Service Exposure (Testing) + +Create a NodePort service for external access: + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: basic-llamastack-nodeport +spec: + type: NodePort + selector: + app: llama-stack + llamastack.io/instance: basic-llamastack + ports: + - port: 8321 + targetPort: 8321 + nodePort: 30321 + protocol: TCP +``` + +## Testing the Deployment + +### Health Check + +```bash +curl http://localhost:8321/health +``` + +Expected response: +```json +{ + "status": "healthy", + "version": "0.0.1", + "distribution": "meta-reference" +} +``` + +### API Endpoints + +```bash +# List providers +curl http://localhost:8321/providers + +# Get distribution info +curl http://localhost:8321/distribution/info + +# List available models +curl http://localhost:8321/models +``` + +## Resource Usage + +This basic deployment typically uses: + +- **CPU**: 0.5-1 core +- **Memory**: 2-4 GB +- **Storage**: Ephemeral (no persistent storage) +- **Network**: Single service port (8321) + +## Monitoring + +### Pod Status + +```bash +# Check pod status +kubectl get pods -l app=llama-stack + +# View pod details +kubectl describe pod -l app=llama-stack + +# Check resource usage +kubectl top pod -l app=llama-stack +``` + +### Logs + +```bash +# View recent logs +kubectl logs deployment/basic-llamastack + +# Follow logs in real-time +kubectl logs -f deployment/basic-llamastack + +# View logs with timestamps +kubectl logs deployment/basic-llamastack --timestamps +``` + +## Scaling + +### Manual Scaling + +Scale the deployment to multiple replicas: + +```bash +# Scale to 3 replicas +kubectl scale llamastackdistribution basic-llamastack --replicas=3 + +# Verify scaling +kubectl get pods -l app=llama-stack +``` + +### Resource Updates + +Update resource allocations: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: basic-llamastack +spec: + replicas: 1 + server: + distribution: + name: "meta-reference" + containerSpec: + port: 8321 + resources: + requests: + memory: "4Gi" # Increased from 2Gi + cpu: "1" # Increased from 500m + limits: + memory: "8Gi" # Increased from 4Gi + cpu: "2" # Increased from 1 +``` + +Apply the update: +```bash +kubectl apply -f basic-deployment.yaml +``` + +## Troubleshooting + +### Common Issues + +**Pod not starting:** +```bash +# Check pod events +kubectl describe pod -l app=llama-stack + +# Check resource constraints +kubectl describe node +``` + +**Service not accessible:** +```bash +# Check service endpoints +kubectl get endpoints basic-llamastack + +# Verify service configuration +kubectl describe service basic-llamastack +``` + +**Application errors:** +```bash +# Check application logs +kubectl logs deployment/basic-llamastack --tail=50 + +# Check for configuration issues +kubectl get configmap -l app=llama-stack +``` + +### Debug Commands 
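+
+If the pod is healthy but requests still fail, an in-cluster probe helps separate Service/DNS problems from application problems. The snippet below is a sketch that assumes the Service name and port from this example (`basic-llamastack` on 8321):
+
+```bash
+# Launch a throwaway curl pod and query the health endpoint through the Service
+kubectl run curl-test --rm -it --restart=Never \
+  --image=curlimages/curl --command -- \
+  curl -s http://basic-llamastack:8321/health
+```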
+ +```bash +# Get detailed resource information +kubectl get llamastackdistribution basic-llamastack -o yaml + +# Check events in the namespace +kubectl get events --sort-by=.metadata.creationTimestamp + +# Exec into the pod for debugging +kubectl exec -it deployment/basic-llamastack -- /bin/bash +``` + +## Cleanup + +Remove the deployment: + +```bash +# Delete the LlamaStack instance +kubectl delete llamastackdistribution basic-llamastack + +# Verify cleanup +kubectl get pods -l app=llama-stack +kubectl get services -l app=llama-stack +``` + +## Next Steps + +After successfully deploying this basic example: + +1. **[Try the production setup](production-setup.md)** - Learn about production-ready configurations +2. **[Add persistent storage](../how-to/configure-storage.md)** - Configure persistent volumes +3. **[Set up monitoring](../how-to/monitoring.md)** - Add observability +4. **[Configure scaling](../how-to/scaling.md)** - Learn about auto-scaling + +## Variations + +### Different Distribution + +Use the Ollama distribution instead: + +```yaml +spec: + server: + distribution: + name: "ollama" + containerSpec: + port: 8321 + env: + - name: OLLAMA_HOST + value: "0.0.0.0" +``` + +### Custom Environment Variables + +Add custom configuration: + +```yaml +spec: + server: + containerSpec: + env: + - name: LLAMASTACK_CONFIG_PATH + value: "/config/llamastack.yaml" + - name: MODEL_CACHE_DIR + value: "/tmp/models" + - name: MAX_CONCURRENT_REQUESTS + value: "10" +``` + +### Resource Constraints + +For resource-constrained environments: + +```yaml +spec: + server: + containerSpec: + resources: + requests: + memory: "1Gi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "500m" diff --git a/docs/content/examples/custom-images.md b/docs/content/examples/custom-images.md new file mode 100644 index 000000000..26f45b877 --- /dev/null +++ b/docs/content/examples/custom-images.md @@ -0,0 +1,78 @@ +# Custom Images + +Guide for building and using custom LlamaStack images with the Kubernetes operator. + +## Building Custom Images + +### Base Dockerfile + +```dockerfile +FROM llamastack/llamastack:latest + +# Add custom models +COPY models/ /models/ + +# Add custom configurations +COPY config/ /config/ + +# Install additional dependencies +RUN pip install custom-package + +# Set custom entrypoint +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] +``` + +### Multi-stage Build + +```dockerfile +# Build stage +FROM python:3.11-slim as builder + +WORKDIR /app +COPY requirements.txt . 
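+# Copying requirements.txt before the application code keeps this layer
+# cacheable: the pip install below only re-runs when dependencies change,
+# not on every source edit.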
+RUN pip install --no-cache-dir -r requirements.txt + +# Runtime stage +FROM llamastack/llamastack:latest + +COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages +COPY custom-code/ /app/ + +CMD ["python", "/app/main.py"] +``` + +## Using Custom Images + +### Basic Configuration + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: custom-llamastack +spec: + image: "myregistry.com/custom-llamastack:v1.0.0" + imagePullPolicy: Always + imagePullSecrets: + - name: registry-credentials +``` + +### With Custom Configuration + +```yaml +spec: + image: "myregistry.com/llamastack-custom:latest" + config: + models: + - name: "custom-model" + path: "/models/custom-model" + provider: "custom-provider" +``` + +## Next Steps + +- [Production Setup](production-setup.md) +- [Basic Deployment](basic-deployment.md) diff --git a/docs/content/examples/production-setup.md b/docs/content/examples/production-setup.md new file mode 100644 index 000000000..4c3399c9e --- /dev/null +++ b/docs/content/examples/production-setup.md @@ -0,0 +1,670 @@ +# Production Setup + +Complete guide for deploying LlamaStack in production environments. + +## Production Architecture + +### High-Level Overview + +```mermaid +graph TB + LB[Load Balancer] --> IG[Ingress Gateway] + IG --> SVC[LlamaStack Service] + SVC --> POD1[LlamaStack Pod 1] + SVC --> POD2[LlamaStack Pod 2] + SVC --> POD3[LlamaStack Pod 3] + + POD1 --> PV1[Persistent Volume 1] + POD2 --> PV2[Persistent Volume 2] + POD3 --> PV3[Persistent Volume 3] + + MON[Monitoring] --> POD1 + MON --> POD2 + MON --> POD3 +``` + +### Infrastructure Requirements + +- **Kubernetes**: v1.24+ +- **Nodes**: 3+ worker nodes with GPU support +- **Storage**: High-performance SSD storage +- **Network**: Low-latency networking +- **Monitoring**: Prometheus + Grafana stack + +## Production Configuration + +### Complete Production Manifest + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: llamastack-production + namespace: llamastack-prod + labels: + app: llamastack + environment: production + version: v1.0.0 +spec: + # Image configuration + image: llamastack/llamastack:v1.0.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: + - name: registry-credentials + + # Scaling configuration + replicas: 3 + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + maxSurge: 1 + + # Resource configuration + resources: + requests: + cpu: "4" + memory: "8Gi" + nvidia.com/gpu: "1" + limits: + cpu: "8" + memory: "16Gi" + nvidia.com/gpu: "1" + + # Storage configuration + storage: + models: + size: "1Ti" + storageClass: "fast-ssd" + mountPath: "/models" + accessMode: ReadWriteOnce + data: + size: "500Gi" + storageClass: "standard-ssd" + mountPath: "/data" + accessMode: ReadWriteMany + cache: + size: "100Gi" + storageClass: "fast-ssd" + mountPath: "/cache" + accessMode: ReadWriteOnce + + # LlamaStack configuration + config: + models: + - name: "llama2-70b-chat" + path: "/models/llama2-70b-chat" + provider: "meta-reference" + config: + max_seq_len: 4096 + max_batch_size: 4 + - name: "llama2-13b-chat" + path: "/models/llama2-13b-chat" + provider: "meta-reference" + config: + max_seq_len: 4096 + max_batch_size: 8 + + inference: + provider: "meta-reference" + config: + model: "llama2-70b-chat" + max_tokens: 2048 + temperature: 0.7 + top_p: 0.9 + + safety: + provider: "llama-guard" + config: + model: "llama-guard-7b" + enable_prompt_guard: true + enable_response_guard: 
true + + memory: + provider: "faiss" + config: + vector_store: + provider: "faiss" + config: + dimension: 4096 + index_type: "IndexFlatIP" + + # Security configuration + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + + containerSecurityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + + # Scheduling configuration + nodeSelector: + node-type: "gpu" + zone: "us-west-2a" + + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "dedicated" + operator: "Equal" + value: "llamastack" + effect: "NoSchedule" + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "kubernetes.io/arch" + operator: "In" + values: ["amd64"] + - key: "node-type" + operator: "In" + values: ["gpu"] + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: "app" + operator: "In" + values: ["llamastack"] + topologyKey: "kubernetes.io/hostname" + + # Service configuration + service: + type: ClusterIP + port: 8080 + targetPort: 8080 + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9090" + prometheus.io/path: "/metrics" + + # Ingress configuration + ingress: + enabled: true + className: "nginx" + annotations: + nginx.ingress.kubernetes.io/rewrite-target: / + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: "letsencrypt-prod" + hosts: + - host: "api.llamastack.example.com" + paths: + - path: "/" + pathType: "Prefix" + tls: + - secretName: "llamastack-tls" + hosts: + - "api.llamastack.example.com" + + # Health checks + healthCheck: + livenessProbe: + httpGet: + path: "/health" + port: 8080 + initialDelaySeconds: 60 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + readinessProbe: + httpGet: + path: "/ready" + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + + # Monitoring + metrics: + enabled: true + port: 9090 + path: "/metrics" + serviceMonitor: + enabled: true + interval: "30s" + scrapeTimeout: "10s" + + # Environment variables + env: + - name: LLAMASTACK_LOG_LEVEL + value: "INFO" + - name: LLAMASTACK_METRICS_ENABLED + value: "true" + - name: LLAMASTACK_CACHE_ENABLED + value: "true" + - name: LLAMASTACK_MAX_WORKERS + value: "4" +``` + +## Supporting Resources + +### Namespace + +```yaml +apiVersion: v1 +kind: Namespace +metadata: + name: llamastack-prod + labels: + name: llamastack-prod + environment: production +``` + +### Storage Classes + +```yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: fast-ssd +provisioner: kubernetes.io/aws-ebs +parameters: + type: gp3 + iops: "10000" + throughput: "1000" +allowVolumeExpansion: true +reclaimPolicy: Retain +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: standard-ssd +provisioner: kubernetes.io/aws-ebs +parameters: + type: gp3 + iops: "3000" + throughput: "125" +allowVolumeExpansion: true +reclaimPolicy: Retain +``` + +### Network Policies + +```yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: llamastack-netpol + namespace: llamastack-prod +spec: + podSelector: + matchLabels: + app: llamastack + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: + matchLabels: + name: ingress-nginx + - 
podSelector: + matchLabels: + app: prometheus + ports: + - protocol: TCP + port: 8080 + - protocol: TCP + port: 9090 + egress: + - to: [] + ports: + - protocol: TCP + port: 53 + - protocol: UDP + port: 53 + - to: [] + ports: + - protocol: TCP + port: 443 + - protocol: TCP + port: 80 +``` + +## Auto Scaling + +### Horizontal Pod Autoscaler + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: llamastack-hpa + namespace: llamastack-prod +spec: + scaleTargetRef: + apiVersion: llamastack.io/v1alpha1 + kind: LlamaStackDistribution + name: llamastack-production + minReplicas: 3 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + - type: Pods + pods: + metric: + name: llamastack_active_requests + target: + type: AverageValue + averageValue: "100" + behavior: + scaleUp: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 50 + periodSeconds: 60 + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 25 + periodSeconds: 60 +``` + +### Vertical Pod Autoscaler + +```yaml +apiVersion: autoscaling.k8s.io/v1 +kind: VerticalPodAutoscaler +metadata: + name: llamastack-vpa + namespace: llamastack-prod +spec: + targetRef: + apiVersion: llamastack.io/v1alpha1 + kind: LlamaStackDistribution + name: llamastack-production + updatePolicy: + updateMode: "Auto" + resourcePolicy: + containerPolicies: + - containerName: llamastack + maxAllowed: + cpu: "16" + memory: "32Gi" + minAllowed: + cpu: "2" + memory: "4Gi" + controlledResources: ["cpu", "memory"] +``` + +## Monitoring Setup + +### ServiceMonitor + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: llamastack-monitor + namespace: llamastack-prod +spec: + selector: + matchLabels: + app: llamastack + endpoints: + - port: metrics + interval: 30s + path: /metrics + scrapeTimeout: 10s +``` + +### PrometheusRule + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: llamastack-alerts + namespace: llamastack-prod +spec: + groups: + - name: llamastack.rules + rules: + - alert: LlamaStackDown + expr: up{job="llamastack"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "LlamaStack instance is down" + description: "LlamaStack instance {{ $labels.instance }} has been down for more than 1 minute." + + - alert: HighErrorRate + expr: rate(llamastack_requests_total{status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "High error rate detected" + description: "Error rate is {{ $value }} errors per second." + + - alert: HighLatency + expr: histogram_quantile(0.95, rate(llamastack_request_duration_seconds_bucket[5m])) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "High latency detected" + description: "95th percentile latency is {{ $value }} seconds." 
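+    # Optional example rule (not part of the source manifest): memory-pressure alert
+    # mirroring the HighMemoryUsage rule used in the monitoring guide; adjust the
+    # threshold to match your container limits.
+    - alert: HighMemoryUsage
+      expr: container_memory_usage_bytes / container_spec_memory_limit_bytes > 0.9
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "High memory usage"
+        description: "Memory usage is above 90% of the container limit."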
+``` + +## Backup Strategy + +### Automated Backups + +```yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: llamastack-backup + namespace: llamastack-prod +spec: + schedule: "0 2 * * *" # Daily at 2 AM + jobTemplate: + spec: + template: + spec: + containers: + - name: backup + image: velero/velero:latest + command: + - /bin/sh + - -c + - | + velero backup create llamastack-$(date +%Y%m%d-%H%M%S) \ + --include-namespaces llamastack-prod \ + --storage-location default \ + --ttl 720h0m0s + restartPolicy: OnFailure +``` + +## Security Hardening + +### Pod Security Policy + +```yaml +apiVersion: policy/v1beta1 +kind: PodSecurityPolicy +metadata: + name: llamastack-psp +spec: + privileged: false + allowPrivilegeEscalation: false + requiredDropCapabilities: + - ALL + volumes: + - 'configMap' + - 'emptyDir' + - 'projected' + - 'secret' + - 'downwardAPI' + - 'persistentVolumeClaim' + runAsUser: + rule: 'MustRunAsNonRoot' + seLinux: + rule: 'RunAsAny' + fsGroup: + rule: 'RunAsAny' +``` + +### RBAC + +```yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: llamastack-prod + name: llamastack-role +rules: +- apiGroups: [""] + resources: ["configmaps", "secrets"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: llamastack-rolebinding + namespace: llamastack-prod +subjects: +- kind: ServiceAccount + name: llamastack + namespace: llamastack-prod +roleRef: + kind: Role + name: llamastack-role + apiGroup: rbac.authorization.k8s.io +``` + +## Deployment Process + +### 1. Pre-deployment Checklist + +- [ ] Kubernetes cluster ready (v1.24+) +- [ ] GPU nodes available and labeled +- [ ] Storage classes configured +- [ ] Monitoring stack deployed +- [ ] Ingress controller configured +- [ ] TLS certificates ready +- [ ] Image registry accessible +- [ ] Backup solution configured + +### 2. Deployment Steps + +```bash +# 1. Create namespace +kubectl apply -f namespace.yaml + +# 2. Create storage classes +kubectl apply -f storage-classes.yaml + +# 3. Create RBAC resources +kubectl apply -f rbac.yaml + +# 4. Create network policies +kubectl apply -f network-policies.yaml + +# 5. Deploy LlamaStack +kubectl apply -f llamastack-production.yaml + +# 6. Create HPA +kubectl apply -f hpa.yaml + +# 7. Create monitoring resources +kubectl apply -f monitoring.yaml + +# 8. Verify deployment +kubectl get llamastackdistribution -n llamastack-prod +kubectl get pods -n llamastack-prod +``` + +### 3. 
Post-deployment Verification + +```bash +# Check pod status +kubectl get pods -n llamastack-prod -l app=llamastack + +# Check service endpoints +kubectl get endpoints -n llamastack-prod + +# Test health endpoints +kubectl exec -n llamastack-prod -it -- curl http://localhost:8080/health + +# Check metrics +kubectl port-forward -n llamastack-prod svc/llamastack-production 9090:9090 +curl http://localhost:9090/metrics + +# Test ingress +curl -k https://api.llamastack.example.com/health +``` + +## Maintenance + +### Rolling Updates + +```bash +# Update image version +kubectl patch llamastackdistribution llamastack-production -n llamastack-prod \ + -p '{"spec":{"image":"llamastack/llamastack:v1.1.0"}}' + +# Monitor rollout +kubectl rollout status deployment/llamastack-production -n llamastack-prod +``` + +### Scaling Operations + +```bash +# Manual scaling +kubectl scale llamastackdistribution llamastack-production -n llamastack-prod --replicas=5 + +# Check HPA status +kubectl get hpa -n llamastack-prod +``` + +### Backup and Recovery + +```bash +# Manual backup +velero backup create llamastack-manual --include-namespaces llamastack-prod + +# List backups +velero backup get + +# Restore from backup +velero restore create --from-backup llamastack-20240101-120000 +``` + +## Next Steps + +- [Custom Images Guide](custom-images.md) +- [Monitoring Setup](../how-to/monitoring.md) +- [Scaling Guide](../how-to/scaling.md) +- [Troubleshooting](../how-to/troubleshooting.md) diff --git a/docs/content/getting-started/configuration.md b/docs/content/getting-started/configuration.md new file mode 100644 index 000000000..ee96f1aa0 --- /dev/null +++ b/docs/content/getting-started/configuration.md @@ -0,0 +1,69 @@ +# Configuration + +This guide covers how to configure the LlamaStack Kubernetes Operator for your environment. + +## Basic Configuration + +The operator can be configured through various methods: + +### Environment Variables + +Key environment variables for the operator: + +```bash +# Operator configuration +OPERATOR_NAMESPACE=llamastack-system +LOG_LEVEL=info +METRICS_ADDR=:8080 +``` + +### ConfigMaps + +The operator uses ConfigMaps for distribution configurations: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: llamastack-config + namespace: llamastack-system +data: + config.yaml: | + distributions: + - name: default + image: llamastack/llamastack:latest +``` + +## Advanced Configuration + +### Resource Limits + +Configure resource limits for LlamaStack distributions: + +```yaml +spec: + resources: + limits: + cpu: "2" + memory: "4Gi" + requests: + cpu: "1" + memory: "2Gi" +``` + +### Storage Configuration + +Configure persistent storage for your distributions: + +```yaml +spec: + storage: + size: "10Gi" + storageClass: "fast-ssd" +``` + +## Next Steps + +- [Quick Start Guide](quick-start.md) +- [API Reference](../reference/api.md) +- [Troubleshooting](../how-to/troubleshooting.md) diff --git a/docs/content/getting-started/distributions.md b/docs/content/getting-started/distributions.md new file mode 100644 index 000000000..bcb461285 --- /dev/null +++ b/docs/content/getting-started/distributions.md @@ -0,0 +1,357 @@ +# Understanding LlamaStack Distributions + +This guide explains the different ways to deploy LlamaStack using the Kubernetes operator, focusing on the distinction between **Supported Distributions** and **Bring-Your-Own (BYO) Distributions**. 
+ +## Distribution Types Overview + +The LlamaStack Kubernetes Operator supports two main approaches for deploying LlamaStack: + +### 🎯 **Supported Distributions** (Recommended) +Pre-configured, tested distributions maintained by the LlamaStack team with specific provider integrations. + +### πŸ› οΈ **Bring-Your-Own (BYO) Distributions** +Custom container images that you build and maintain yourself. + +## Supported Distributions + +### What are Supported Distributions? + +Supported distributions are **pre-built, tested container images** that include: +- βœ… **Specific provider integrations** (Ollama, vLLM, NVIDIA, etc.) +- βœ… **Optimized configurations** for each provider +- βœ… **Tested compatibility** with the operator +- βœ… **Regular updates** and security patches +- βœ… **Documentation and examples** + +### Available Pre-Built Distributions + +The operator currently supports **7 pre-built distributions** that are actively maintained and tested: + +| Distribution | Image | Use Case | Requirements | +|--------------|-------|----------|--------------| +| `starter` | `docker.io/llamastack/distribution-starter:latest` | **Recommended default** - General purpose LlamaStack | Basic Kubernetes resources | +| `ollama` | `docker.io/llamastack/distribution-ollama:latest` | Local inference with Ollama integration | Ollama server | +| `bedrock` | `docker.io/llamastack/distribution-bedrock:latest` | AWS Bedrock models | AWS credentials | +| `remote-vllm` | `docker.io/llamastack/distribution-remote-vllm:latest` | Remote vLLM server integration | External vLLM server | +| `tgi` | `docker.io/llamastack/distribution-tgi:latest` | Hugging Face Text Generation Inference | TGI server setup | +| `together` | `docker.io/llamastack/distribution-together:latest` | Together AI API integration | Together API key | +| `vllm-gpu` | `docker.io/llamastack/distribution-vllm-gpu:latest` | High-performance GPU inference with vLLM | GPU infrastructure | + +!!! note "Distribution Selection" + - **New users**: Start with `starter` distribution + - **Ollama users**: Use `ollama` distribution + - **GPU inference**: Use `vllm-gpu` distribution + - **Cloud APIs**: Use `bedrock` or `together` distributions + +### Using Supported Distributions + +#### Basic Syntax + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-distribution +spec: + server: + distribution: + name: "distribution-name" # Use distribution name + # ... 
other configuration +``` + +#### Example: Starter Distribution (Recommended) + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-llamastack +spec: + replicas: 1 + server: + distribution: + name: "starter" + containerSpec: + port: 8321 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "2" + memory: "4Gi" + storage: + size: "20Gi" +``` + +#### Example: Ollama Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: ollama-llamastack +spec: + replicas: 1 + server: + distribution: + name: "ollama" + containerSpec: + port: 8321 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "2" + memory: "4Gi" + env: + - name: OLLAMA_URL + value: "http://ollama-server-service.ollama-dist.svc.cluster.local:11434" + storage: + size: "20Gi" +``` + +#### Example: vLLM GPU Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: vllm-gpu-llamastack +spec: + replicas: 1 + server: + distribution: + name: "vllm-gpu" + containerSpec: + port: 8321 + resources: + requests: + cpu: "2" + memory: "8Gi" + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" + env: + - name: MODEL_NAME + value: "meta-llama/Llama-2-7b-chat-hf" + - name: TENSOR_PARALLEL_SIZE + value: "1" + storage: + size: "50Gi" +``` + +### Benefits of Supported Distributions + +- **πŸš€ Quick Setup**: No need to build custom images +- **πŸ”’ Security**: Regular security updates from LlamaStack team +- **πŸ“š Documentation**: Comprehensive guides and examples +- **πŸ§ͺ Tested**: Thoroughly tested with the operator +- **πŸ”§ Optimized**: Pre-configured for optimal performance +- **πŸ†˜ Support**: Community and official support available + +## Bring-Your-Own (BYO) Distributions + +### What are BYO Distributions? + +BYO distributions allow you to use **custom container images** that you build and maintain: +- πŸ› οΈ **Custom integrations** not available in supported distributions +- 🎨 **Specialized configurations** for your use case +- πŸ”§ **Custom dependencies** and libraries +- πŸ“¦ **Private or proprietary** model integrations + +### Using BYO Distributions + +#### Basic Syntax + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-custom-distribution +spec: + server: + distribution: + image: "your-registry.com/custom-llamastack:tag" # Use custom image + # ... 
other configuration +``` + +#### Example: Custom Image + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: custom-llamastack +spec: + replicas: 1 + server: + distribution: + image: "myregistry.com/custom-llamastack:v1.0.0" + containerSpec: + port: 8321 + resources: + requests: + cpu: "2" + memory: "4Gi" + limits: + cpu: "4" + memory: "8Gi" + env: + - name: CUSTOM_CONFIG_PATH + value: "/app/config/custom.yaml" + - name: API_KEY + valueFrom: + secretKeyRef: + name: custom-credentials + key: api-key + storage: + size: "100Gi" + podOverrides: + volumes: + - name: custom-config + configMap: + name: custom-llamastack-config + volumeMounts: + - name: custom-config + mountPath: "/app/config" + readOnly: true +``` + +### Building Custom Images + +#### Example Dockerfile + +```dockerfile +# Start from a supported distribution or base image +FROM llamastack/llamastack:latest + +# Add your custom dependencies +RUN pip install custom-package-1 custom-package-2 + +# Copy custom configuration +COPY custom-config/ /app/config/ + +# Copy custom code +COPY src/ /app/src/ + +# Set custom environment variables +ENV CUSTOM_SETTING=value + +# Override entrypoint if needed +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh +ENTRYPOINT ["/entrypoint.sh"] +``` + +#### Building and Pushing + +```bash +# Build your custom image +docker build -t myregistry.com/custom-llamastack:v1.0.0 . + +# Push to your registry +docker push myregistry.com/custom-llamastack:v1.0.0 +``` + +### BYO Distribution Considerations + +#### Advantages +- **🎯 Full Control**: Complete customization of the stack +- **πŸ”§ Custom Integrations**: Add proprietary or specialized providers +- **πŸ“¦ Private Models**: Include private or fine-tuned models +- **⚑ Optimizations**: Custom performance optimizations + +#### Responsibilities +- **πŸ”’ Security**: You maintain security updates +- **πŸ§ͺ Testing**: You test compatibility with the operator +- **πŸ“š Documentation**: You document your custom setup +- **πŸ†˜ Support**: Limited community support for custom images +- **πŸ”„ Updates**: You manage updates and compatibility + +## Key Differences Summary + +| Aspect | Supported Distributions | BYO Distributions | +|--------|------------------------|-------------------| +| **Setup Complexity** | βœ… Simple (just specify name) | πŸ”§ Complex (build & maintain image) | +| **Maintenance** | βœ… Handled by LlamaStack team | ❌ Your responsibility | +| **Security Updates** | βœ… Automatic | ❌ Manual | +| **Documentation** | βœ… Comprehensive | ❌ You create | +| **Support** | βœ… Community + Official | ⚠️ Limited | +| **Customization** | ⚠️ Limited to configuration | βœ… Full control | +| **Testing** | βœ… Pre-tested | ❌ You test | +| **Time to Deploy** | βœ… Minutes | ⏱️ Hours/Days | + +## Choosing the Right Approach + +### Use Supported Distributions When: +- βœ… Your use case matches available providers (Ollama, vLLM, etc.) 
+- βœ… You want quick setup and deployment +- βœ… You prefer maintained and tested solutions +- βœ… You need community support +- βœ… Security and updates are important + +### Use BYO Distributions When: +- πŸ› οΈ You need custom provider integrations +- πŸ”§ You have specialized requirements +- πŸ“¦ You use proprietary or private models +- ⚑ You need specific performance optimizations +- 🎯 You have the expertise to maintain custom images + +## Migration Between Approaches + +### From Supported to BYO +```yaml +# Before (supported) +spec: + server: + distribution: + name: "ollama" + +# After (BYO) +spec: + server: + distribution: + image: "myregistry.com/custom-ollama:v1.0.0" +``` + +### From BYO to Supported +```yaml +# Before (BYO) +spec: + server: + distribution: + image: "myregistry.com/custom-vllm:v1.0.0" + +# After (supported) +spec: + server: + distribution: + name: "vllm-gpu" +``` + +## Best Practices + +### For Supported Distributions +1. **Start Simple**: Begin with basic configuration +2. **Use Environment Variables**: Configure via `env` section +3. **Monitor Resources**: Set appropriate resource limits +4. **Check Documentation**: Review provider-specific guides + +### For BYO Distributions +1. **Base on Supported Images**: Start from `llamastack/llamastack:latest` +2. **Document Everything**: Maintain clear documentation +3. **Test Thoroughly**: Test with the operator before production +4. **Version Control**: Tag and version your custom images +5. **Security Scanning**: Regularly scan for vulnerabilities + +## Next Steps + +- [Configuration Reference](../reference/configuration.md) - Detailed configuration options +- [Basic Deployment](../examples/basic-deployment.md) - Simple deployment examples +- [Production Setup](../examples/production-setup.md) - Production-ready configurations +- [Custom Images Guide](../examples/custom-images.md) - Building custom images +- [Troubleshooting](../how-to/troubleshooting.md) - Common issues and solutions diff --git a/docs/content/getting-started/installation.md b/docs/content/getting-started/installation.md new file mode 100644 index 000000000..e9bacca1a --- /dev/null +++ b/docs/content/getting-started/installation.md @@ -0,0 +1,273 @@ +# Installation Guide + +This guide walks you through installing the LlamaStack Kubernetes Operator in your cluster. 
+ +## Prerequisites + +Before installing the operator, ensure you have: + +- **Kubernetes cluster** (version 1.25 or later) +- **kubectl** configured to access your cluster +- **Cluster admin permissions** to install CRDs and RBAC resources +- **Container runtime** that supports pulling images from public registries + +## Installation Methods + +### Method 1: Kustomize (Recommended) + +The recommended way to install the operator is using Kustomize: + +```bash +# Clone the repository +git clone https://github.com/llamastack/llama-stack-k8s-operator.git +cd llama-stack-k8s-operator + +# Install using Kustomize +kubectl apply -k config/default +``` + +This will: +- Install the Custom Resource Definitions (CRDs) +- Create the necessary RBAC resources +- Deploy the operator in the `llama-stack-k8s-operator-system` namespace + +### Method 2: Build from Source + +For development or customized builds: + +```bash +# Clone the repository +git clone https://github.com/llamastack/llama-stack-k8s-operator.git +cd llama-stack-k8s-operator + +# Build and deploy +make docker-build docker-push IMG=/llama-stack-k8s-operator:tag +make deploy IMG=/llama-stack-k8s-operator:tag +``` + +## Verification + +After installation, verify that the operator is running: + +```bash +# Check operator deployment +kubectl get deployment -n llama-stack-k8s-operator-system llama-stack-k8s-operator-controller-manager + +# Check operator logs +kubectl logs -n llama-stack-k8s-operator-system deployment/llama-stack-k8s-operator-controller-manager + +# Verify CRDs are installed +kubectl get crd llamastackdistributions.llamastack.io +``` + +Expected output: +``` +NAME CREATED AT +llamastackdistributions.llamastack.io 2024-01-15T10:30:00Z +``` + +## Configuration + +### Resource Requirements + +The operator has minimal resource requirements: + +```yaml +resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi +``` + +### Environment Variables + +Configure the operator behavior using environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `METRICS_BIND_ADDRESS` | Metrics server bind address | `:8080` | +| `HEALTH_PROBE_BIND_ADDRESS` | Health probe bind address | `:8081` | +| `LEADER_ELECT` | Enable leader election | `false` | +| `LOG_LEVEL` | Logging level | `info` | + +### Custom Configuration + +For custom configurations, create a `kustomization.yaml`: + +```yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- https://github.com/llamastack/llama-stack-k8s-operator/config/default + +patchesStrategicMerge: +- manager_config_patch.yaml + +images: +- name: quay.io/llamastack/llama-stack-k8s-operator + newTag: v0.1.0 +``` + +## Namespace Configuration + +### Default Namespace + +By default, the operator watches all namespaces. 
To restrict to specific namespaces: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llamastack-operator-controller-manager +spec: + template: + spec: + containers: + - name: manager + env: + - name: WATCH_NAMESPACE + value: "llamastack-system,production" +``` + +### Multi-tenant Setup + +For multi-tenant environments, install the operator with namespace restrictions: + +```bash +# Install operator in tenant namespace +kubectl create namespace tenant-a +kubectl apply -f operator.yaml -n tenant-a + +# Configure RBAC for tenant isolation +kubectl apply -f tenant-rbac.yaml +``` + +## Security Configuration + +### RBAC + +The operator requires the following permissions: + +```yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: llamastack-operator-manager-role +rules: +- apiGroups: ["llamastack.io"] + resources: ["llamastackdistributions"] + verbs: ["create", "delete", "get", "list", "patch", "update", "watch"] +- apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["create", "delete", "get", "list", "patch", "update", "watch"] +- apiGroups: [""] + resources: ["services", "configmaps", "persistentvolumeclaims"] + verbs: ["create", "delete", "get", "list", "patch", "update", "watch"] +``` + +### Network Policies + +Secure your deployment with network policies: + +```yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: llamastack-operator-netpol + namespace: llama-stack-k8s-operator-system +spec: + podSelector: + matchLabels: + control-plane: controller-manager + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: {} + ports: + - protocol: TCP + port: 8080 + - protocol: TCP + port: 8081 +``` + +## Troubleshooting + +### Common Issues + +**1. CRD Installation Failed** +```bash +# Check if CRDs exist +kubectl get crd | grep llamastack + +# Manually install CRDs +kubectl apply -f https://raw.githubusercontent.com/llamastack/llama-stack-k8s-operator/main/config/crd/bases/llamastack.io_llamastackdistributions.yaml +``` + +**2. Operator Pod Not Starting** +```bash +# Check pod status +kubectl get pods -n llama-stack-k8s-operator-system + +# Check events +kubectl describe pod -n llama-stack-k8s-operator-system + +# Check logs +kubectl logs -n llama-stack-k8s-operator-system +``` + +**3. Permission Denied Errors** +```bash +# Check RBAC configuration +kubectl auth can-i create llamastackdistributions --as=system:serviceaccount:llama-stack-k8s-operator-system:llama-stack-k8s-operator-controller-manager + +# Verify service account +kubectl get serviceaccount -n llama-stack-k8s-operator-system +``` + +### Debug Mode + +Enable debug logging for troubleshooting: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llamastack-operator-controller-manager +spec: + template: + spec: + containers: + - name: manager + env: + - name: LOG_LEVEL + value: "debug" +``` + +## Uninstallation + +To remove the operator: + +```bash +# Delete operator deployment +kubectl delete -f https://github.com/llamastack/llama-stack-k8s-operator/releases/latest/download/operator.yaml + +# Clean up CRDs (this will delete all LlamaStackDistribution resources) +kubectl delete crd llamastackdistributions.llamastack.io +``` + +!!! warning "Data Loss Warning" + Deleting the CRD will remove all LlamaStackDistribution resources and their associated data. Make sure to backup any important configurations before uninstalling. + +## Next Steps + +After successful installation: + +1. 
[Deploy your first LlamaStack instance](quick-start.md) +2. [Learn about configuration options](configuration.md) +3. [Explore examples](../examples/basic-deployment.md) diff --git a/docs/content/getting-started/quick-start.md b/docs/content/getting-started/quick-start.md new file mode 100644 index 000000000..a17733b84 --- /dev/null +++ b/docs/content/getting-started/quick-start.md @@ -0,0 +1,407 @@ +# Quick Start Guide + +This guide will help you deploy your first LlamaStack instance using the Kubernetes operator in just a few minutes. + +## Prerequisites + +- LlamaStack Operator installed ([Installation Guide](installation.md)) +- kubectl configured and connected to your cluster +- At least 4GB of available memory in your cluster + +## Step 1: Create a Basic LlamaStack Instance + +Create a file named `basic-llamastack.yaml`: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-first-llamastack + namespace: default +spec: + replicas: 1 + server: + distribution: + name: "starter" + containerSpec: + port: 8321 + resources: + requests: + memory: "2Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "1" +``` + +## Step 2: Deploy the Instance + +Apply the configuration to your cluster: + +```bash +kubectl apply -f basic-llamastack.yaml +``` + +## Step 3: Monitor the Deployment + +Watch the deployment progress: + +```bash +# Check the LlamaStackDistribution status +kubectl get llamastackdistribution my-first-llamastack + +# Watch the pods being created +kubectl get pods -l app=llama-stack -w + +# Check deployment status +kubectl get deployment my-first-llamastack +``` + +Expected output: +``` +NAME READY STATUS RESTARTS AGE +my-first-llamastack 1/1 Running 0 2m +``` + +## Step 4: Access Your LlamaStack Instance + +### Port Forward (Development) + +For development and testing, use port forwarding: + +```bash +kubectl port-forward service/my-first-llamastack 8321:8321 +``` + +Now you can access LlamaStack at `http://localhost:8321`. + +### Service Exposure (Production) + +For production access, expose the service: + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: my-first-llamastack-external +spec: + type: LoadBalancer + selector: + app: llama-stack + llamastack.io/instance: my-first-llamastack + ports: + - port: 80 + targetPort: 8321 + protocol: TCP +``` + +Apply the service: +```bash +kubectl apply -f service.yaml +``` + +## Step 5: Test the API + +Test that your LlamaStack instance is working: + +```bash +# Health check +curl http://localhost:8321/health + +# List available providers +curl http://localhost:8321/providers + +# Get distribution info +curl http://localhost:8321/distribution/info +``` + +Expected response for health check: +```json +{ + "status": "healthy", + "version": "0.0.1", + "distribution": "meta-reference" +} +``` + +## Step 6: Explore the API + +LlamaStack provides a comprehensive API for AI applications. 
Here are some key endpoints: + +### Models API +```bash +# List available models +curl http://localhost:8321/models + +# Get model info +curl http://localhost:8321/models/{model_id} +``` + +### Inference API +```bash +# Text completion +curl -X POST http://localhost:8321/inference/completion \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-2-7b-chat-hf", + "prompt": "Hello, how are you?", + "max_tokens": 100 + }' +``` + +### Memory API +```bash +# Create memory bank +curl -X POST http://localhost:8321/memory/create \ + -H "Content-Type: application/json" \ + -d '{ + "bank_id": "my-memory", + "config": { + "type": "vector", + "embedding_model": "all-MiniLM-L6-v2" + } + }' +``` + +## Configuration Examples + +### Custom Distribution + +Use a different LlamaStack distribution: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: ollama-llamastack +spec: + replicas: 1 + server: + distribution: + name: "ollama" + containerSpec: + port: 8321 + env: + - name: OLLAMA_HOST + value: "0.0.0.0" +``` + +### Custom Container Image + +Use your own LlamaStack image: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: custom-llamastack +spec: + replicas: 1 + server: + distribution: + image: "my-registry.com/llamastack:custom" + containerSpec: + port: 8321 +``` + +### With Persistent Storage + +Add persistent storage for models and data: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: persistent-llamastack +spec: + replicas: 1 + server: + distribution: + name: "meta-reference" + containerSpec: + port: 8321 + storage: + size: "50Gi" + mountPath: "/.llama" +``` + +### High Availability Setup + +Deploy multiple replicas with load balancing: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: ha-llamastack +spec: + replicas: 3 + server: + distribution: + name: "meta-reference" + containerSpec: + port: 8321 + resources: + requests: + memory: "4Gi" + cpu: "1" + limits: + memory: "8Gi" + cpu: "2" + storage: + size: "100Gi" + mountPath: "/.llama" +``` + +## Monitoring and Observability + +### Check Resource Usage + +Monitor resource consumption: + +```bash +# Pod resource usage +kubectl top pods -l app=llama-stack + +# Node resource usage +kubectl top nodes +``` + +### View Logs + +Access application logs: + +```bash +# View recent logs +kubectl logs deployment/my-first-llamastack + +# Follow logs in real-time +kubectl logs -f deployment/my-first-llamastack + +# View logs from all replicas +kubectl logs -l app=llama-stack --tail=100 +``` + +### Health Checks + +The operator automatically configures health checks: + +```yaml +# Readiness probe +readinessProbe: + httpGet: + path: /health + port: 8321 + initialDelaySeconds: 30 + periodSeconds: 10 + +# Liveness probe +livenessProbe: + httpGet: + path: /health + port: 8321 + initialDelaySeconds: 60 + periodSeconds: 30 +``` + +## Scaling + +### Manual Scaling + +Scale your deployment manually: + +```bash +# Scale to 3 replicas +kubectl patch llamastackdistribution my-first-llamastack -p '{"spec":{"replicas":3}}' + +# Verify scaling +kubectl get pods -l app=llama-stack +``` + +### Resource Updates + +Update resource allocations: + +```bash +kubectl patch llamastackdistribution my-first-llamastack -p '{ + "spec": { + "server": { + "containerSpec": { + "resources": { + "requests": { + "memory": "4Gi", + "cpu": "1" + }, + "limits": { + "memory": "8Gi", + "cpu": "2" + } 
+ } + } + } + } +}' +``` + +## Cleanup + +When you're done experimenting, clean up the resources: + +```bash +# Delete the LlamaStack instance +kubectl delete llamastackdistribution my-first-llamastack + +# Delete any additional services +kubectl delete service my-first-llamastack-external + +# Verify cleanup +kubectl get pods -l app=llama-stack +``` + +## Troubleshooting + +### Common Issues + +**Pod not starting:** +```bash +# Check pod events +kubectl describe pod + +# Check resource constraints +kubectl describe node +``` + +**Service not accessible:** +```bash +# Check service endpoints +kubectl get endpoints my-first-llamastack + +# Verify port forwarding +kubectl port-forward service/my-first-llamastack 8321:8321 --address 0.0.0.0 +``` + +**API errors:** +```bash +# Check application logs +kubectl logs deployment/my-first-llamastack + +# Verify configuration +kubectl get llamastackdistribution my-first-llamastack -o yaml +``` + +## Next Steps + +Now that you have a working LlamaStack instance: + +1. **[Learn about configuration options](configuration.md)** - Explore advanced configuration +2. **[Check out examples](../examples/basic-deployment.md)** - See real-world use cases +3. **[Read the API reference](../reference/api.md)** - Understand all available options +4. **[Set up monitoring](../how-to/monitoring.md)** - Add observability to your deployment + +## Getting Help + +If you encounter issues: + +- Check the [troubleshooting guide](../how-to/troubleshooting.md) +- Review [GitHub issues](https://github.com/llamastack/llama-stack-k8s-operator/issues) +- Join the [community discussions](https://github.com/llamastack/llama-stack-k8s-operator/discussions) diff --git a/docs/content/how-to/configure-storage.md b/docs/content/how-to/configure-storage.md new file mode 100644 index 000000000..216ce5fc2 --- /dev/null +++ b/docs/content/how-to/configure-storage.md @@ -0,0 +1,265 @@ +# Configure Storage + +Learn how to configure persistent storage for your LlamaStack distributions. 
+
+## Storage Overview
+
+LlamaStack distributions can use persistent storage for:
+
+- Model files and weights
+- Configuration data
+- Logs and metrics
+- User data and sessions
+
+## Basic Storage Configuration
+
+### Default Storage
+
+By default, LlamaStack uses ephemeral storage:
+
+```yaml
+apiVersion: llamastack.io/v1alpha1
+kind: LlamaStackDistribution
+metadata:
+  name: basic-llamastack
+spec:
+  image: llamastack/llamastack:latest
+  # No storage configuration = ephemeral storage
+```
+
+### Persistent Storage
+
+Enable persistent storage:
+
+```yaml
+apiVersion: llamastack.io/v1alpha1
+kind: LlamaStackDistribution
+metadata:
+  name: persistent-llamastack
+spec:
+  image: llamastack/llamastack:latest
+  storage:
+    size: "50Gi"
+    storageClass: "standard"
+    accessMode: "ReadWriteOnce"
+```
+
+## Storage Classes
+
+### Available Storage Classes
+
+Common storage classes and their use cases:
+
+| Storage Class | Performance | Use Case |
+|---------------|-------------|----------|
+| `standard` | Standard | General purpose |
+| `fast-ssd` | High | Model inference |
+| `slow-hdd` | Low | Archival storage |
+
+### Custom Storage Class
+
+Create a custom storage class for LlamaStack:
+
+```yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: llamastack-storage
+provisioner: kubernetes.io/aws-ebs
+parameters:
+  type: gp3
+  iops: "3000"
+  throughput: "125"
+allowVolumeExpansion: true
+```
+
+## Advanced Storage Configurations
+
+### Multiple Volumes
+
+Configure separate volumes for different purposes:
+
+```yaml
+apiVersion: llamastack.io/v1alpha1
+kind: LlamaStackDistribution
+metadata:
+  name: multi-volume-llamastack
+spec:
+  image: llamastack/llamastack:latest
+  storage:
+    models:
+      size: "100Gi"
+      storageClass: "fast-ssd"
+      mountPath: "/models"
+    data:
+      size: "50Gi"
+      storageClass: "standard"
+      mountPath: "/data"
+    logs:
+      size: "10Gi"
+      storageClass: "standard"
+      mountPath: "/logs"
+```
+
+### Shared Storage
+
+Configure shared storage across replicas:
+
+```yaml
+apiVersion: llamastack.io/v1alpha1
+kind: LlamaStackDistribution
+metadata:
+  name: shared-storage-llamastack
+spec:
+  image: llamastack/llamastack:latest
+  replicas: 3
+  storage:
+    size: "200Gi"
+    storageClass: "nfs"
+    accessMode: "ReadWriteMany" # Allows multiple pods to mount
+```
+
+## Storage Optimization
+
+### Performance Tuning
+
+Optimize storage for model inference:
+
+```yaml
+spec:
+  storage:
+    size: "500Gi"
+    storageClass: "nvme-ssd"
+    iops: 10000
+    throughput: "1000MB/s"
+```
+
+### Cost Optimization
+
+Use tiered storage for cost efficiency:
+
+```yaml
+spec:
+  storage:
+    hot:
+      size: "50Gi"
+      storageClass: "fast-ssd"
+      mountPath: "/models/active"
+    warm:
+      size: "200Gi"
+      storageClass: "standard"
+      mountPath: "/models/cache"
+    cold:
+      size: "1Ti"
+      storageClass: "slow-hdd"
+      mountPath: "/models/archive"
+```
+
+## Backup and Recovery
+
+### Automated Backups
+
+Configure automated backups:
+
+```yaml
+spec:
+  backup:
+    enabled: true
+    schedule: "0 2 * * *" # Daily at 2 AM
+    retention: "30d"
+    destination: "s3://my-backup-bucket"
+```
+
+### Manual Backup
+
+Create manual backups. The snapshot and restore manifests below are a minimal sketch that assumes the CSI snapshot controller (`snapshot.storage.k8s.io` API) is installed; adjust names and sizes for your environment:
+
+```bash
+# Create a snapshot of the LlamaStack PVC
+kubectl apply -f - <<EOF
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshot
+metadata:
+  name: llamastack-backup
+spec:
+  source:
+    persistentVolumeClaimName: llamastack-storage
+EOF
+
+# Restore from snapshot by creating a new PVC from it
+kubectl apply -f - <<EOF
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: llamastack-storage-restored
+spec:
+  dataSource:
+    name: llamastack-backup
+    kind: VolumeSnapshot
+    apiGroup: snapshot.storage.k8s.io
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 50Gi
+EOF
+```
+
+## Monitoring Storage
+
+### Usage Monitoring
+
+```bash
+# Check disk usage inside the pod
+kubectl exec -it <pod-name> -- df -h
+
+# Check I/O metrics
+kubectl top pods --containers
+```
+
+### Alerts
+
+Set up storage alerts:
+
+```yaml
+# Prometheus alert for high storage usage
+- alert: HighStorageUsage
+  expr: kubelet_volume_stats_used_bytes /
kubelet_volume_stats_capacity_bytes > 0.8 + for: 5m + labels: + severity: warning + annotations: + summary: "High storage usage on {{ $labels.persistentvolumeclaim }}" +``` + +## Troubleshooting + +### Common Issues + +**PVC Stuck in Pending:** +```bash +# Check storage class +kubectl get storageclass + +# Check events +kubectl describe pvc +``` + +**Out of Space:** +```bash +# Expand volume (if supported) +kubectl patch pvc -p '{"spec":{"resources":{"requests":{"storage":"100Gi"}}}}' +``` + +**Performance Issues:** +```bash +# Check I/O wait +kubectl exec -it -- iostat -x 1 + +# Check storage class parameters +kubectl describe storageclass +``` + +## Next Steps + +- [Scaling Guide](scaling.md) +- [Monitoring Setup](monitoring.md) +- [Troubleshooting](troubleshooting.md) diff --git a/docs/content/how-to/deploy-llamastack.md b/docs/content/how-to/deploy-llamastack.md new file mode 100644 index 000000000..50ef66dde --- /dev/null +++ b/docs/content/how-to/deploy-llamastack.md @@ -0,0 +1,133 @@ +# Deploy LlamaStack + +This guide walks you through deploying a LlamaStack distribution using the Kubernetes operator. + +## Prerequisites + +- Kubernetes cluster (v1.20+) +- LlamaStack Kubernetes Operator installed +- `kubectl` configured to access your cluster + +## Basic Deployment + +### 1. Create a LlamaStackDistribution + +Create a basic LlamaStack distribution: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: my-llamastack + namespace: default +spec: + image: llamastack/llamastack:latest + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "2" + memory: "4Gi" +``` + +### 2. Apply the Configuration + +```bash +kubectl apply -f llamastack-distribution.yaml +``` + +### 3. Verify Deployment + +Check the status of your deployment: + +```bash +# Check the distribution +kubectl get llamastackdistribution my-llamastack + +# Check the pods +kubectl get pods -l app=my-llamastack + +# Check logs +kubectl logs -l app=my-llamastack +``` + +## Advanced Deployment Options + +### With Persistent Storage + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: llamastack-with-storage +spec: + image: llamastack/llamastack:latest + storage: + size: "20Gi" + storageClass: "fast-ssd" + persistence: + enabled: true +``` + +### With Custom Configuration + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: llamastack-custom +spec: + image: llamastack/llamastack:latest + config: + models: + - name: "llama2-7b" + path: "/models/llama2-7b" + inference: + provider: "meta-reference" +``` + +## Scaling + +### Horizontal Scaling + +Scale your deployment by adjusting replicas: + +```bash +kubectl patch llamastackdistribution my-llamastack -p '{"spec":{"replicas":3}}' +``` + +### Vertical Scaling + +Update resource limits: + +```yaml +spec: + resources: + requests: + cpu: "2" + memory: "4Gi" + limits: + cpu: "4" + memory: "8Gi" +``` + +## Monitoring + +Monitor your deployment: + +```bash +# Check resource usage +kubectl top pods -l app=my-llamastack + +# Check events +kubectl get events --field-selector involvedObject.name=my-llamastack +``` + +## Next Steps + +- [Configure Storage](configure-storage.md) +- [Set up Monitoring](monitoring.md) +- [Scaling Guide](scaling.md) +- [Troubleshooting](troubleshooting.md) diff --git a/docs/content/how-to/monitoring.md b/docs/content/how-to/monitoring.md new file mode 100644 index 000000000..64985cd35 --- /dev/null +++ 
b/docs/content/how-to/monitoring.md @@ -0,0 +1,407 @@ +# Monitoring + +Set up comprehensive monitoring for your LlamaStack distributions. + +## Monitoring Overview + +Monitor your LlamaStack deployments with: + +- **Metrics**: Performance and resource usage +- **Logs**: Application and system logs +- **Alerts**: Proactive issue detection +- **Dashboards**: Visual monitoring + +## Metrics Collection + +### Prometheus Setup + +Deploy Prometheus for metrics collection: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config +data: + prometheus.yml: | + global: + scrape_interval: 15s + scrape_configs: + - job_name: 'llamastack' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app] + action: keep + regex: llamastack +``` + +### ServiceMonitor + +Create a ServiceMonitor for automatic discovery: + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: llamastack-monitor +spec: + selector: + matchLabels: + app: llamastack + endpoints: + - port: metrics + interval: 30s + path: /metrics +``` + +## Key Metrics + +### Application Metrics + +Monitor LlamaStack-specific metrics: + +```yaml +# Custom metrics exposed by LlamaStack +llamastack_requests_total +llamastack_request_duration_seconds +llamastack_active_connections +llamastack_model_load_time_seconds +llamastack_inference_latency_seconds +``` + +### Resource Metrics + +Track resource usage: + +```yaml +# CPU and Memory +container_cpu_usage_seconds_total +container_memory_usage_bytes +container_memory_working_set_bytes + +# Network +container_network_receive_bytes_total +container_network_transmit_bytes_total + +# Storage +kubelet_volume_stats_used_bytes +kubelet_volume_stats_capacity_bytes +``` + +## Logging + +### Centralized Logging + +Set up log aggregation with Fluentd: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluentd-config +data: + fluent.conf: | + + @type tail + path /var/log/containers/*llamastack*.log + pos_file /var/log/fluentd-containers.log.pos + tag kubernetes.* + format json + + + + @type elasticsearch + host elasticsearch.logging.svc.cluster.local + port 9200 + index_name llamastack-logs + +``` + +### Log Levels + +Configure appropriate log levels: + +```yaml +spec: + env: + - name: LOG_LEVEL + value: "info" # debug, info, warn, error + - name: LOG_FORMAT + value: "json" # json, text +``` + +## Dashboards + +### Grafana Dashboard + +Create a comprehensive dashboard: + +```json +{ + "dashboard": { + "title": "LlamaStack Monitoring", + "panels": [ + { + "title": "Request Rate", + "type": "graph", + "targets": [ + { + "expr": "rate(llamastack_requests_total[5m])", + "legendFormat": "{{instance}}" + } + ] + }, + { + "title": "Response Time", + "type": "graph", + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(llamastack_request_duration_seconds_bucket[5m]))", + "legendFormat": "95th percentile" + } + ] + }, + { + "title": "Resource Usage", + "type": "graph", + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total[5m])", + "legendFormat": "CPU" + }, + { + "expr": "container_memory_usage_bytes", + "legendFormat": "Memory" + } + ] + } + ] + } +} +``` + +## Alerting + +### Prometheus Alerts + +Define critical alerts: + +```yaml +groups: +- name: llamastack.rules + rules: + - alert: LlamaStackDown + expr: up{job="llamastack"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "LlamaStack instance is down" + description: "LlamaStack instance {{ $labels.instance }} 
has been down for more than 1 minute." + + - alert: HighErrorRate + expr: rate(llamastack_requests_total{status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "High error rate detected" + description: "Error rate is {{ $value }} errors per second." + + - alert: HighLatency + expr: histogram_quantile(0.95, rate(llamastack_request_duration_seconds_bucket[5m])) > 2 + for: 5m + labels: + severity: warning + annotations: + summary: "High latency detected" + description: "95th percentile latency is {{ $value }} seconds." + + - alert: HighMemoryUsage + expr: container_memory_usage_bytes / container_spec_memory_limit_bytes > 0.9 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage" + description: "Memory usage is above 90%." +``` + +### AlertManager Configuration + +Configure alert routing: + +```yaml +global: + smtp_smarthost: 'localhost:587' + smtp_from: 'alerts@example.com' + +route: + group_by: ['alertname'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 'web.hook' + +receivers: +- name: 'web.hook' + email_configs: + - to: 'admin@example.com' + subject: 'LlamaStack Alert: {{ .GroupLabels.alertname }}' + body: | + {{ range .Alerts }} + Alert: {{ .Annotations.summary }} + Description: {{ .Annotations.description }} + {{ end }} +``` + +## Health Checks + +### Liveness Probe + +Configure liveness probes: + +```yaml +spec: + containers: + - name: llamastack + livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 +``` + +### Readiness Probe + +Configure readiness probes: + +```yaml +spec: + containers: + - name: llamastack + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 +``` + +## Performance Monitoring + +### Custom Metrics + +Expose custom application metrics: + +```python +# Example Python code for custom metrics +from prometheus_client import Counter, Histogram, Gauge + +REQUEST_COUNT = Counter('llamastack_requests_total', 'Total requests', ['method', 'endpoint']) +REQUEST_LATENCY = Histogram('llamastack_request_duration_seconds', 'Request latency') +ACTIVE_CONNECTIONS = Gauge('llamastack_active_connections', 'Active connections') + +# In your application code +REQUEST_COUNT.labels(method='POST', endpoint='/inference').inc() +REQUEST_LATENCY.observe(response_time) +ACTIVE_CONNECTIONS.set(current_connections) +``` + +### Distributed Tracing + +Set up distributed tracing with Jaeger: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: jaeger-config +data: + config.yaml: | + jaeger: + endpoint: "http://jaeger-collector:14268/api/traces" + service_name: "llamastack" + sampler: + type: "probabilistic" + param: 0.1 +``` + +## Monitoring Best Practices + +### Resource Monitoring + +Monitor these key resources: + +```bash +# CPU usage +kubectl top pods -l app=llamastack + +# Memory usage +kubectl top pods -l app=llamastack --containers + +# Storage usage +kubectl exec -it -- df -h + +# Network usage +kubectl exec -it -- netstat -i +``` + +### Log Analysis + +Analyze logs for issues: + +```bash +# Check error logs +kubectl logs -l app=llamastack | grep ERROR + +# Check recent logs +kubectl logs -l app=llamastack --since=1h + +# Follow logs in real-time +kubectl logs -f -l app=llamastack +``` + +## Troubleshooting Monitoring + +### Common Issues + +**Metrics Not Appearing:** +```bash +# Check ServiceMonitor +kubectl 
get servicemonitor + +# Check Prometheus targets +kubectl port-forward svc/prometheus 9090:9090 +# Visit http://localhost:9090/targets +``` + +**High Resource Usage:** +```bash +# Check resource limits +kubectl describe pod + +# Check node resources +kubectl describe node +``` + +**Alert Fatigue:** +```bash +# Review alert thresholds +kubectl get prometheusrule + +# Check alert history +kubectl logs -l app=alertmanager +``` + +## Next Steps + +- [Troubleshooting Guide](troubleshooting.md) +- [Scaling Guide](scaling.md) +- [Configure Storage](configure-storage.md) diff --git a/docs/content/how-to/scaling.md b/docs/content/how-to/scaling.md new file mode 100644 index 000000000..7a7819768 --- /dev/null +++ b/docs/content/how-to/scaling.md @@ -0,0 +1,326 @@ +# Scaling + +Learn how to scale your LlamaStack distributions for optimal performance and cost efficiency. + +## Scaling Overview + +LlamaStack supports both horizontal and vertical scaling: + +- **Horizontal Scaling**: Add more replicas +- **Vertical Scaling**: Increase resources per replica +- **Auto Scaling**: Automatic scaling based on metrics + +## Horizontal Scaling + +### Manual Scaling + +Scale replicas manually: + +```bash +# Scale to 3 replicas +kubectl patch llamastackdistribution my-llamastack \ + -p '{"spec":{"replicas":3}}' + +# Or edit the resource directly +kubectl edit llamastackdistribution my-llamastack +``` + +### Declarative Scaling + +Update your YAML configuration: + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: scaled-llamastack +spec: + image: llamastack/llamastack:latest + replicas: 5 # Scale to 5 replicas + resources: + requests: + cpu: "1" + memory: "2Gi" +``` + +## Vertical Scaling + +### Resource Adjustment + +Increase CPU and memory: + +```yaml +spec: + resources: + requests: + cpu: "2" # Increased from 1 + memory: "4Gi" # Increased from 2Gi + limits: + cpu: "4" # Increased from 2 + memory: "8Gi" # Increased from 4Gi +``` + +### GPU Scaling + +Add GPU resources: + +```yaml +spec: + resources: + requests: + nvidia.com/gpu: "1" + limits: + nvidia.com/gpu: "2" +``` + +## Auto Scaling + +### Horizontal Pod Autoscaler (HPA) + +Create an HPA for automatic scaling: + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: llamastack-hpa +spec: + scaleTargetRef: + apiVersion: llamastack.io/v1alpha1 + kind: LlamaStackDistribution + name: my-llamastack + minReplicas: 2 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +``` + +### Vertical Pod Autoscaler (VPA) + +Enable automatic resource adjustment: + +```yaml +apiVersion: autoscaling.k8s.io/v1 +kind: VerticalPodAutoscaler +metadata: + name: llamastack-vpa +spec: + targetRef: + apiVersion: llamastack.io/v1alpha1 + kind: LlamaStackDistribution + name: my-llamastack + updatePolicy: + updateMode: "Auto" + resourcePolicy: + containerPolicies: + - containerName: llamastack + maxAllowed: + cpu: "4" + memory: "8Gi" + minAllowed: + cpu: "100m" + memory: "128Mi" +``` + +## Performance Considerations + +### Load Balancing + +Configure load balancing for multiple replicas: + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: llamastack-service +spec: + selector: + app: my-llamastack + ports: + - port: 8080 + targetPort: 8080 + type: LoadBalancer + sessionAffinity: None # Round-robin +``` + +### Resource Requests vs 
Limits + +Best practices for resource configuration: + +```yaml +spec: + resources: + requests: + cpu: "1" # Guaranteed resources + memory: "2Gi" + limits: + cpu: "2" # Maximum allowed (2x requests) + memory: "4Gi" # Maximum allowed (2x requests) +``` + +## Monitoring Scaling + +### Scaling Metrics + +Monitor key scaling metrics: + +```bash +# Check HPA status +kubectl get hpa + +# Check resource usage +kubectl top pods -l app=my-llamastack + +# Check scaling events +kubectl describe hpa llamastack-hpa +``` + +### Custom Metrics + +Scale based on custom metrics: + +```yaml +metrics: +- type: Pods + pods: + metric: + name: requests_per_second + target: + type: AverageValue + averageValue: "100" +``` + +## Scaling Strategies + +### Blue-Green Scaling + +Deploy new version alongside old: + +```yaml +# Blue deployment (current) +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: llamastack-blue +spec: + image: llamastack/llamastack:v1.0 + replicas: 3 + +--- +# Green deployment (new) +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: llamastack-green +spec: + image: llamastack/llamastack:v1.1 + replicas: 3 +``` + +### Canary Scaling + +Gradual rollout with traffic splitting: + +```yaml +# Main deployment (90% traffic) +spec: + replicas: 9 + version: "stable" + +--- +# Canary deployment (10% traffic) +spec: + replicas: 1 + version: "canary" +``` + +## Cost Optimization + +### Spot Instances + +Use spot instances for cost savings: + +```yaml +spec: + nodeSelector: + node-type: "spot" + tolerations: + - key: "spot" + operator: "Equal" + value: "true" + effect: "NoSchedule" +``` + +### Scheduled Scaling + +Scale down during off-hours: + +```yaml +# CronJob for scaling down +apiVersion: batch/v1 +kind: CronJob +metadata: + name: scale-down-llamastack +spec: + schedule: "0 18 * * *" # 6 PM daily + jobTemplate: + spec: + template: + spec: + containers: + - name: kubectl + image: bitnami/kubectl + command: + - kubectl + - patch + - llamastackdistribution + - my-llamastack + - -p + - '{"spec":{"replicas":1}}' +``` + +## Troubleshooting Scaling + +### Common Issues + +**Pods Not Scaling:** +```bash +# Check HPA conditions +kubectl describe hpa llamastack-hpa + +# Check resource metrics +kubectl top nodes +kubectl top pods +``` + +**Resource Constraints:** +```bash +# Check node capacity +kubectl describe nodes + +# Check resource quotas +kubectl describe resourcequota +``` + +**Scaling Too Aggressive:** +```bash +# Adjust HPA behavior +kubectl patch hpa llamastack-hpa -p '{"spec":{"behavior":{"scaleUp":{"stabilizationWindowSeconds":300}}}}' +``` + +## Next Steps + +- [Monitoring Setup](monitoring.md) +- [Configure Storage](configure-storage.md) +- [Troubleshooting](troubleshooting.md) diff --git a/docs/content/how-to/troubleshooting.md b/docs/content/how-to/troubleshooting.md new file mode 100644 index 000000000..652353ada --- /dev/null +++ b/docs/content/how-to/troubleshooting.md @@ -0,0 +1,371 @@ +# Troubleshooting + +Common issues and solutions for LlamaStack Kubernetes Operator. 
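+
+ The commands below select pods with label filters such as `-l app=llamastack`. If a selector matches nothing in your cluster, check which labels the operator actually applied before assuming a pod is missing (a quick sanity check using the same command shown later in this guide):
+
+ ```bash
+ # Show pod labels so the -l selectors used below can be adjusted to your cluster
+ kubectl get pods --show-labels
+ ```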
+ +## Quick Diagnostics + +### Check Operator Status + +```bash +# Check operator pod +kubectl get pods -n llamastack-system + +# Check operator logs +kubectl logs -n llamastack-system -l app=llamastack-operator + +# Check CRD installation +kubectl get crd llamastackdistributions.llamastack.io +``` + +### Check Distribution Status + +```bash +# List all distributions +kubectl get llamastackdistribution + +# Check specific distribution +kubectl describe llamastackdistribution + +# Check distribution events +kubectl get events --field-selector involvedObject.name= +``` + +## Common Issues + +### 1. Operator Not Starting + +**Symptoms:** +- Operator pod in CrashLoopBackOff +- No CRDs created + +**Diagnosis:** +```bash +kubectl logs -n llamastack-system -l app=llamastack-operator +kubectl describe pod -n llamastack-system -l app=llamastack-operator +``` + +**Solutions:** +```bash +# Check RBAC permissions +kubectl auth can-i create llamastackdistributions --as=system:serviceaccount:llamastack-system:llamastack-operator + +# Reinstall operator +kubectl delete -f operator.yaml +kubectl apply -f operator.yaml + +# Check resource limits +kubectl describe pod -n llamastack-system -l app=llamastack-operator +``` + +### 2. Distribution Not Creating Pods + +**Symptoms:** +- LlamaStackDistribution exists but no pods created +- Status shows "Pending" or "Failed" + +**Diagnosis:** +```bash +kubectl describe llamastackdistribution +kubectl get events --field-selector involvedObject.name= +``` + +**Solutions:** +```bash +# Check image availability +kubectl run test --image= --dry-run=client + +# Check resource quotas +kubectl describe resourcequota + +# Check node capacity +kubectl describe nodes +``` + +### 3. Pods Failing to Start + +**Symptoms:** +- Pods in CrashLoopBackOff or Error state +- Container exits immediately + +**Diagnosis:** +```bash +kubectl logs +kubectl describe pod +kubectl get events --field-selector involvedObject.name= +``` + +**Solutions:** +```bash +# Check image pull secrets +kubectl get secrets + +# Check resource limits +kubectl describe pod + +# Check volume mounts +kubectl exec -it -- ls -la /mnt +``` + +### 4. Storage Issues + +**Symptoms:** +- PVCs stuck in Pending +- Pods can't mount volumes +- Out of disk space + +**Diagnosis:** +```bash +kubectl get pvc +kubectl describe pvc +kubectl get storageclass +``` + +**Solutions:** +```bash +# Check storage class +kubectl describe storageclass + +# Check available storage +kubectl get nodes -o custom-columns=NAME:.metadata.name,CAPACITY:.status.capacity.storage + +# Expand volume (if supported) +kubectl patch pvc -p '{"spec":{"resources":{"requests":{"storage":"100Gi"}}}}' +``` + +### 5. 
Network Connectivity Issues + +**Symptoms:** +- Services not accessible +- Pods can't communicate +- External traffic not reaching pods + +**Diagnosis:** +```bash +kubectl get svc +kubectl describe svc +kubectl get endpoints +``` + +**Solutions:** +```bash +# Check service selector +kubectl get pods --show-labels +kubectl describe svc + +# Test connectivity +kubectl exec -it -- wget -qO- http://:8080/health + +# Check network policies +kubectl get networkpolicy +``` + +## Performance Issues + +### High CPU Usage + +**Diagnosis:** +```bash +kubectl top pods -l app=llamastack +kubectl exec -it -- top +``` + +**Solutions:** +```bash +# Increase CPU limits +kubectl patch llamastackdistribution -p '{"spec":{"resources":{"limits":{"cpu":"4"}}}}' + +# Scale horizontally +kubectl patch llamastackdistribution -p '{"spec":{"replicas":3}}' +``` + +### High Memory Usage + +**Diagnosis:** +```bash +kubectl top pods -l app=llamastack --containers +kubectl exec -it -- free -h +``` + +**Solutions:** +```bash +# Increase memory limits +kubectl patch llamastackdistribution -p '{"spec":{"resources":{"limits":{"memory":"8Gi"}}}}' + +# Check for memory leaks +kubectl exec -it -- ps aux --sort=-%mem +``` + +### Slow Response Times + +**Diagnosis:** +```bash +# Check application logs +kubectl logs -l app=llamastack | grep -i latency + +# Test response time +kubectl exec -it -- curl -w "@curl-format.txt" http://localhost:8080/health +``` + +**Solutions:** +```bash +# Optimize resource allocation +kubectl patch llamastackdistribution -p '{"spec":{"resources":{"requests":{"cpu":"2","memory":"4Gi"}}}}' + +# Enable caching +kubectl patch llamastackdistribution -p '{"spec":{"config":{"cache":{"enabled":true}}}}' +``` + +## Debugging Tools + +### Log Analysis + +```bash +# Get all logs +kubectl logs -l app=llamastack --all-containers=true + +# Follow logs +kubectl logs -f -l app=llamastack + +# Get previous container logs +kubectl logs --previous + +# Search for errors +kubectl logs -l app=llamastack | grep -i error +``` + +### Resource Monitoring + +```bash +# Check resource usage +kubectl top pods -l app=llamastack +kubectl top nodes + +# Check resource limits +kubectl describe pod | grep -A 5 "Limits\|Requests" + +# Check resource quotas +kubectl describe resourcequota +``` + +### Network Debugging + +```bash +# Test DNS resolution +kubectl exec -it -- nslookup kubernetes.default + +# Test service connectivity +kubectl exec -it -- telnet 8080 + +# Check iptables rules +kubectl exec -it -- iptables -L +``` + +## Advanced Debugging + +### Debug Container + +Run a debug container in the same network namespace: + +```bash +kubectl debug -it --image=nicolaka/netshoot +``` + +### Port Forwarding + +Access services directly: + +```bash +# Forward to pod +kubectl port-forward 8080:8080 + +# Forward to service +kubectl port-forward svc/ 8080:8080 +``` + +### Exec into Container + +Access container shell: + +```bash +# Get shell access +kubectl exec -it -- /bin/bash + +# Run specific commands +kubectl exec -- ps aux +kubectl exec -- netstat -tulpn +``` + +## Configuration Issues + +### Invalid YAML + +**Symptoms:** +- kubectl apply fails +- Validation errors + +**Solutions:** +```bash +# Validate YAML syntax +kubectl apply --dry-run=client -f distribution.yaml + +# Check API version +kubectl api-resources | grep llamastack + +# Validate against schema +kubectl explain llamastackdistribution.spec +``` + +### Missing Dependencies + +**Symptoms:** +- Operator fails to start +- Missing CRDs + +**Solutions:** +```bash +# Install 
CRDs +kubectl apply -f https://raw.githubusercontent.com/llamastack/llama-stack-k8s-operator/main/config/crd/bases/llamastack.io_llamastackdistributions.yaml + +# Check operator dependencies +kubectl get deployment -n llamastack-system +``` + +## Getting Help + +### Collect Debug Information + +```bash +#!/bin/bash +# Debug information collection script + +echo "=== Operator Status ===" +kubectl get pods -n llamastack-system +kubectl logs -n llamastack-system -l app=llamastack-operator --tail=100 + +echo "=== Distributions ===" +kubectl get llamastackdistribution -o wide +kubectl describe llamastackdistribution + +echo "=== Pods ===" +kubectl get pods -l app=llamastack -o wide +kubectl describe pods -l app=llamastack + +echo "=== Events ===" +kubectl get events --sort-by=.metadata.creationTimestamp + +echo "=== Resources ===" +kubectl top nodes +kubectl top pods -l app=llamastack +``` + +### Support Channels + +- **GitHub Issues**: [Report bugs and feature requests](https://github.com/llamastack/llama-stack-k8s-operator/issues) +- **Documentation**: [Official documentation](https://llamastack-k8s-operator.pages.dev) +- **Community**: [Join the discussion](https://github.com/llamastack/llama-stack-k8s-operator/discussions) + +## Next Steps + +- [Monitoring Setup](monitoring.md) +- [Scaling Guide](scaling.md) +- [Configure Storage](configure-storage.md) diff --git a/docs/content/index.md b/docs/content/index.md new file mode 100644 index 000000000..59c7cc64e --- /dev/null +++ b/docs/content/index.md @@ -0,0 +1,130 @@ +# LlamaStack Kubernetes Operator + +The LlamaStack Kubernetes Operator provides a simple and efficient way to deploy and manage LlamaStack distributions in Kubernetes clusters. + +## Overview + +LlamaStack is a comprehensive framework for building AI applications with Large Language Models (LLMs). This Kubernetes operator simplifies the deployment and management of LlamaStack distributions, providing: + +- **Easy Deployment**: Deploy LlamaStack with a single Kubernetes resource +- **Scalability**: Automatically scale LlamaStack instances based on demand +- **Storage Management**: Persistent storage for models and data +- **Configuration Management**: Flexible configuration options for different use cases +- **Monitoring**: Built-in observability and health checks + +## Quick Start + +Get started with the LlamaStack Operator in just a few steps: + +1. **Install the Operator** + ```bash + kubectl apply -f https://github.com/llamastack/llama-stack-k8s-operator/releases/latest/download/operator.yaml + ``` + +2. **Deploy a LlamaStack Instance** + ```yaml + apiVersion: llamastack.io/v1alpha1 + kind: LlamaStackDistribution + metadata: + name: my-llamastack + spec: + replicas: 1 + server: + distribution: + name: "starter" + containerSpec: + port: 8321 + resources: + requests: + memory: "2Gi" + cpu: "1" + ``` + +3. **Apply the Configuration** + ```bash + kubectl apply -f llamastack.yaml + ``` + +## Key Features + +### πŸš€ **Simple Deployment** +Deploy LlamaStack distributions with minimal configuration using Kubernetes-native resources. + +### πŸ“ˆ **Auto-scaling** +Automatically scale your LlamaStack instances based on resource utilization and demand. + +### πŸ’Ύ **Persistent Storage** +Built-in support for persistent storage to maintain models, cache, and application data. + +### πŸ”§ **Flexible Configuration** +Support for multiple LlamaStack distributions and custom container images. 
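+
+ For example, a distribution can be selected by name or, alternatively, by pointing the same field at a custom container image (the two options are mutually exclusive; the image value below is only an illustration -- see the Configuration Reference for details):
+
+ ```yaml
+ spec:
+   server:
+     distribution:
+       name: "starter"
+       # image: "myregistry.com/custom-llamastack:v1.0"  # use instead of name for a custom image
+ ```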
+ +### πŸ“Š **Observability** +Integrated monitoring, logging, and health checks for production deployments. + +### πŸ”’ **Security** +Security best practices with RBAC, network policies, and secure defaults. + +## Architecture + +```mermaid +graph TD + A[LlamaStackDistribution CRD] --> B[Operator Controller] + B --> C[Deployment] + B --> D[Service] + B --> E[ConfigMap] + B --> F[PersistentVolumeClaim] + + C --> G[LlamaStack Pod 1] + C --> H[LlamaStack Pod 2] + C --> I[LlamaStack Pod N] + + G --> J[Storage Volume] + H --> J + I --> J + + D --> K[Load Balancer] + K --> L[External Access] +``` + +## Use Cases + +### Development and Testing +- Quick setup for development environments +- Testing different LlamaStack configurations +- Prototyping AI applications + +### Production Deployments +- Scalable LlamaStack deployments +- High availability configurations +- Enterprise-grade security and monitoring + +### Multi-tenant Environments +- Isolated LlamaStack instances per team +- Resource quotas and limits +- Namespace-based separation + +## Getting Started + +Ready to get started? Check out our comprehensive guides: + +- [Installation Guide](getting-started/installation.md) - Install the operator in your cluster +- [Quick Start Tutorial](getting-started/quick-start.md) - Deploy your first LlamaStack instance +- [Configuration Guide](getting-started/configuration.md) - Learn about configuration options + +## Documentation + +- **[API Reference](reference/api.md)** - Complete API documentation +- **[How-to Guides](how-to/deploy-llamastack.md)** - Task-oriented guides +- **[Examples](examples/basic-deployment.md)** - Real-world configuration examples +- **[Contributing](contributing/development.md)** - Development and contribution guide + +## Community + +- **GitHub**: [llamastack/llama-stack-k8s-operator](https://github.com/llamastack/llama-stack-k8s-operator) +- **Issues**: [Report bugs and request features](https://github.com/llamastack/llama-stack-k8s-operator/issues) +- **Discussions**: [Community discussions](https://github.com/llamastack/llama-stack-k8s-operator/discussions) + +## License + +This project is licensed under the Apache License 2.0. See the [LICENSE](https://github.com/llamastack/llama-stack-k8s-operator/blob/main/LICENSE) file for details. 
diff --git a/docs/content/javascripts/extra.js b/docs/content/javascripts/extra.js new file mode 100644 index 000000000..c6d014845 --- /dev/null +++ b/docs/content/javascripts/extra.js @@ -0,0 +1,617 @@ +// Enhanced functionality for LlamaStack Operator Documentation + +document.addEventListener('DOMContentLoaded', function() { + // Initialize all enhancements + initializeCodeCopyButtons(); + initializeAPIExplorer(); + initializeSearchEnhancements(); + initializeNavigationEnhancements(); + initializeAccessibility(); + initializeAnalytics(); +}); + +/** + * Enhanced copy buttons for code blocks + */ +function initializeCodeCopyButtons() { + // Add copy success feedback + document.addEventListener('clipboard-success', function(e) { + const button = e.detail.trigger; + const originalText = button.textContent; + + button.textContent = 'Copied!'; + button.style.background = '#10b981'; + + setTimeout(() => { + button.textContent = originalText; + button.style.background = ''; + }, 2000); + }); + + // Add copy error feedback + document.addEventListener('clipboard-error', function(e) { + const button = e.detail.trigger; + const originalText = button.textContent; + + button.textContent = 'Failed'; + button.style.background = '#ef4444'; + + setTimeout(() => { + button.textContent = originalText; + button.style.background = ''; + }, 2000); + }); +} + +/** + * Interactive API Explorer + */ +function initializeAPIExplorer() { + // YAML validator for LlamaStackDistribution specs + const yamlInputs = document.querySelectorAll('.yaml-validator'); + + yamlInputs.forEach(input => { + const validateButton = document.createElement('button'); + validateButton.textContent = 'Validate YAML'; + validateButton.className = 'md-button md-button--primary yaml-validate-btn'; + + validateButton.addEventListener('click', function() { + validateYAML(input); + }); + + input.parentNode.insertBefore(validateButton, input.nextSibling); + }); + + // Add interactive examples + addInteractiveExamples(); +} + +/** + * Validate YAML content + */ +function validateYAML(input) { + const content = input.value; + const resultDiv = getOrCreateResultDiv(input); + + try { + // Basic YAML validation (you might want to use a proper YAML parser) + if (!content.trim()) { + throw new Error('Empty YAML content'); + } + + // Check for basic LlamaStackDistribution structure + if (!content.includes('apiVersion: llamastack.io/v1alpha1')) { + throw new Error('Missing required apiVersion'); + } + + if (!content.includes('kind: LlamaStackDistribution')) { + throw new Error('Missing required kind'); + } + + resultDiv.innerHTML = ` +
+ <div>
+ <strong>βœ… Valid YAML</strong>
+ <p>Your LlamaStackDistribution configuration appears to be valid.</p>
+ </div>
+ `;
+ resultDiv.className = 'validation-result success';
+
+ } catch (error) {
+ resultDiv.innerHTML = `
+ <div>
+ <strong>❌ Invalid YAML</strong>
+ <p>Error: ${error.message}</p>
+ </div>
+ `; + resultDiv.className = 'validation-result error'; + } +} + +/** + * Get or create result div for validation + */ +function getOrCreateResultDiv(input) { + let resultDiv = input.parentNode.querySelector('.validation-result'); + if (!resultDiv) { + resultDiv = document.createElement('div'); + resultDiv.className = 'validation-result'; + input.parentNode.appendChild(resultDiv); + } + return resultDiv; +} + +/** + * Add interactive examples + */ +function addInteractiveExamples() { + const examples = document.querySelectorAll('.interactive-example'); + + examples.forEach(example => { + const tryButton = document.createElement('button'); + tryButton.textContent = 'Try this example'; + tryButton.className = 'md-button try-example-btn'; + + tryButton.addEventListener('click', function() { + const codeBlock = example.querySelector('code'); + if (codeBlock) { + copyToClipboard(codeBlock.textContent); + showNotification('Example copied to clipboard!'); + } + }); + + example.appendChild(tryButton); + }); +} + +/** + * Enhanced search functionality + */ +function initializeSearchEnhancements() { + const searchInput = document.querySelector('.md-search__input'); + if (!searchInput) return; + + // Add search suggestions + const suggestionsDiv = document.createElement('div'); + suggestionsDiv.className = 'search-suggestions'; + suggestionsDiv.style.display = 'none'; + searchInput.parentNode.appendChild(suggestionsDiv); + + // Popular search terms + const popularSearches = [ + 'installation', + 'quick start', + 'API reference', + 'examples', + 'troubleshooting', + 'configuration', + 'scaling', + 'storage' + ]; + + searchInput.addEventListener('focus', function() { + if (!this.value) { + showSearchSuggestions(popularSearches, suggestionsDiv); + } + }); + + searchInput.addEventListener('blur', function() { + setTimeout(() => { + suggestionsDiv.style.display = 'none'; + }, 200); + }); + + // Search analytics + searchInput.addEventListener('input', function() { + if (this.value.length > 2) { + trackSearchQuery(this.value); + } + }); +} + +/** + * Show search suggestions + */ +function showSearchSuggestions(suggestions, container) { + container.innerHTML = suggestions.map(term => + `
<div class="search-suggestion" onclick="performSearch('${term}')">${term}</div>
` + ).join(''); + container.style.display = 'block'; +} + +/** + * Perform search + */ +function performSearch(term) { + const searchInput = document.querySelector('.md-search__input'); + if (searchInput) { + searchInput.value = term; + searchInput.dispatchEvent(new Event('input')); + searchInput.focus(); + } +} + +/** + * Navigation enhancements + */ +function initializeNavigationEnhancements() { + // Add breadcrumb navigation + addBreadcrumbs(); + + // Add "Edit this page" links + addEditLinks(); + + // Add page navigation (previous/next) + addPageNavigation(); + + // Smooth scrolling for anchor links + document.querySelectorAll('a[href^="#"]').forEach(anchor => { + anchor.addEventListener('click', function (e) { + e.preventDefault(); + const target = document.querySelector(this.getAttribute('href')); + if (target) { + target.scrollIntoView({ + behavior: 'smooth', + block: 'start' + }); + } + }); + }); +} + +/** + * Add breadcrumb navigation + */ +function addBreadcrumbs() { + const nav = document.querySelector('.md-nav--primary'); + if (!nav) return; + + const currentPath = window.location.pathname; + const pathParts = currentPath.split('/').filter(part => part); + + if (pathParts.length > 1) { + const breadcrumbContainer = document.createElement('nav'); + breadcrumbContainer.className = 'breadcrumb-nav'; + breadcrumbContainer.setAttribute('aria-label', 'Breadcrumb'); + + let breadcrumbHTML = ''; + breadcrumbContainer.innerHTML = breadcrumbHTML; + + const content = document.querySelector('.md-content'); + if (content) { + content.insertBefore(breadcrumbContainer, content.firstChild); + } + } +} + +/** + * Add edit links + */ +function addEditLinks() { + const repoUrl = 'https://github.com/llamastack/llama-stack-k8s-operator'; + const currentPath = window.location.pathname; + const editUrl = `${repoUrl}/edit/main/docs/content${currentPath.replace(/\/$/, '')}.md`; + + const editLink = document.createElement('a'); + editLink.href = editUrl; + editLink.textContent = '✏️ Edit this page'; + editLink.className = 'edit-link'; + editLink.target = '_blank'; + editLink.rel = 'noopener noreferrer'; + + const article = document.querySelector('article'); + if (article) { + article.appendChild(editLink); + } +} + +/** + * Add page navigation + */ +function addPageNavigation() { + // This would require parsing the navigation structure + // Implementation depends on MkDocs navigation data +} + +/** + * Accessibility enhancements + */ +function initializeAccessibility() { + // Add skip to content link + const skipLink = document.createElement('a'); + skipLink.href = '#main-content'; + skipLink.textContent = 'Skip to main content'; + skipLink.className = 'skip-link'; + document.body.insertBefore(skipLink, document.body.firstChild); + + // Mark main content + const mainContent = document.querySelector('.md-content'); + if (mainContent) { + mainContent.id = 'main-content'; + } + + // Enhance keyboard navigation + document.addEventListener('keydown', function(e) { + // Alt + S for search + if (e.altKey && e.key === 's') { + e.preventDefault(); + const searchInput = document.querySelector('.md-search__input'); + if (searchInput) { + searchInput.focus(); + } + } + + // Alt + H for home + if (e.altKey && e.key === 'h') { + e.preventDefault(); + window.location.href = '/'; + } + }); + + // Add ARIA labels to interactive elements + document.querySelectorAll('.md-nav__link').forEach(link => { + if (!link.getAttribute('aria-label')) { + link.setAttribute('aria-label', `Navigate to ${link.textContent.trim()}`); + } 
+ }); +} + +/** + * Analytics and tracking + */ +function initializeAnalytics() { + // Track page views + trackPageView(); + + // Track user interactions + trackUserInteractions(); + + // Track performance metrics + trackPerformanceMetrics(); +} + +/** + * Track page view + */ +function trackPageView() { + // Implementation depends on your analytics provider + console.log('Page view tracked:', window.location.pathname); +} + +/** + * Track search queries + */ +function trackSearchQuery(query) { + // Implementation depends on your analytics provider + console.log('Search query tracked:', query); +} + +/** + * Track user interactions + */ +function trackUserInteractions() { + // Track copy button clicks + document.addEventListener('click', function(e) { + if (e.target.classList.contains('md-clipboard')) { + console.log('Copy button clicked'); + } + + if (e.target.classList.contains('try-example-btn')) { + console.log('Try example button clicked'); + } + }); +} + +/** + * Track performance metrics + */ +function trackPerformanceMetrics() { + // Track page load time + window.addEventListener('load', function() { + const loadTime = performance.now(); + console.log('Page load time:', loadTime); + }); +} + +/** + * Utility functions + */ + +/** + * Copy text to clipboard + */ +function copyToClipboard(text) { + if (navigator.clipboard) { + navigator.clipboard.writeText(text); + } else { + // Fallback for older browsers + const textArea = document.createElement('textarea'); + textArea.value = text; + document.body.appendChild(textArea); + textArea.select(); + document.execCommand('copy'); + document.body.removeChild(textArea); + } +} + +/** + * Show notification + */ +function showNotification(message, type = 'info') { + const notification = document.createElement('div'); + notification.className = `notification notification--${type}`; + notification.textContent = message; + + document.body.appendChild(notification); + + // Animate in + setTimeout(() => { + notification.classList.add('notification--visible'); + }, 100); + + // Remove after 3 seconds + setTimeout(() => { + notification.classList.remove('notification--visible'); + setTimeout(() => { + document.body.removeChild(notification); + }, 300); + }, 3000); +} + +/** + * Debounce function + */ +function debounce(func, wait) { + let timeout; + return function executedFunction(...args) { + const later = () => { + clearTimeout(timeout); + func(...args); + }; + clearTimeout(timeout); + timeout = setTimeout(later, wait); + }; +} + +/** + * Throttle function + */ +function throttle(func, limit) { + let inThrottle; + return function() { + const args = arguments; + const context = this; + if (!inThrottle) { + func.apply(context, args); + inThrottle = true; + setTimeout(() => inThrottle = false, limit); + } + }; +} + +// Add CSS for notifications and other dynamic elements +const dynamicStyles = ` + .notification { + position: fixed; + top: 20px; + right: 20px; + padding: 12px 20px; + border-radius: 6px; + color: white; + font-weight: 500; + z-index: 1000; + transform: translateX(100%); + transition: transform 0.3s ease; + } + + .notification--visible { + transform: translateX(0); + } + + .notification--info { + background: #2563eb; + } + + .notification--success { + background: #10b981; + } + + .notification--error { + background: #ef4444; + } + + .breadcrumb-nav { + margin-bottom: 2rem; + padding: 1rem 0; + border-bottom: 1px solid var(--md-default-fg-color--lightest); + } + + .breadcrumb { + list-style: none; + padding: 0; + margin: 0; + display: 
flex; + align-items: center; + gap: 0.5rem; + } + + .breadcrumb li:not(:last-child)::after { + content: "β€Ί"; + margin-left: 0.5rem; + color: var(--md-default-fg-color--light); + } + + .breadcrumb a { + color: var(--md-default-fg-color--light); + text-decoration: none; + } + + .breadcrumb a:hover { + color: var(--md-primary-fg-color); + } + + .edit-link { + display: inline-block; + margin-top: 2rem; + padding: 0.5rem 1rem; + background: var(--md-default-fg-color--lightest); + border-radius: 4px; + text-decoration: none; + font-size: 0.9em; + } + + .edit-link:hover { + background: var(--md-default-fg-color--lighter); + } + + .search-suggestions { + position: absolute; + top: 100%; + left: 0; + right: 0; + background: var(--md-default-bg-color); + border: 1px solid var(--md-default-fg-color--lightest); + border-radius: 4px; + box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); + z-index: 100; + } + + .search-suggestion { + padding: 0.5rem 1rem; + cursor: pointer; + border-bottom: 1px solid var(--md-default-fg-color--lightest); + } + + .search-suggestion:hover { + background: var(--md-default-fg-color--lightest); + } + + .search-suggestion:last-child { + border-bottom: none; + } + + .validation-result { + margin-top: 1rem; + padding: 1rem; + border-radius: 4px; + } + + .validation-result.success { + background: rgba(16, 185, 129, 0.1); + border: 1px solid #10b981; + } + + .validation-result.error { + background: rgba(239, 68, 68, 0.1); + border: 1px solid #ef4444; + } + + .yaml-validate-btn { + margin-top: 0.5rem; + margin-bottom: 1rem; + } + + .try-example-btn { + margin-top: 1rem; + } +`; + +// Inject dynamic styles +const styleSheet = document.createElement('style'); +styleSheet.textContent = dynamicStyles; +document.head.appendChild(styleSheet); diff --git a/docs/content/reference/cli.md b/docs/content/reference/cli.md new file mode 100644 index 000000000..73fb4a9a1 --- /dev/null +++ b/docs/content/reference/cli.md @@ -0,0 +1,456 @@ +# CLI Reference + +Command-line interface reference for LlamaStack Kubernetes Operator. 
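+
+ This reference covers day-to-day `kubectl` operations against the operator's custom resources. Most examples need a distribution name, pod name, or namespace filled in; one option is to export the values from your own cluster once and reuse them (the names below are the illustrative ones used throughout this documentation):
+
+ ```bash
+ # Illustrative values -- replace with the resources in your cluster
+ export DIST_NAME=my-llamastack        # LlamaStackDistribution name
+ export OPERATOR_NS=llamastack-system  # namespace where the operator runs
+ ```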
+ +## kubectl Commands + +### Basic Operations + +#### List Distributions + +```bash +# List all LlamaStack distributions +kubectl get llamastackdistribution + +# List with additional details +kubectl get llamastackdistribution -o wide + +# List in all namespaces +kubectl get llamastackdistribution --all-namespaces +``` + +#### Describe Distribution + +```bash +# Get detailed information +kubectl describe llamastackdistribution + +# Get YAML output +kubectl get llamastackdistribution -o yaml + +# Get JSON output +kubectl get llamastackdistribution -o json +``` + +#### Create Distribution + +```bash +# Create from file +kubectl apply -f llamastack-distribution.yaml + +# Create from URL +kubectl apply -f https://example.com/llamastack.yaml + +# Dry run to validate +kubectl apply -f llamastack.yaml --dry-run=client +``` + +#### Update Distribution + +```bash +# Apply changes from file +kubectl apply -f llamastack-distribution.yaml + +# Edit directly +kubectl edit llamastackdistribution + +# Patch specific fields +kubectl patch llamastackdistribution -p '{"spec":{"replicas":3}}' +``` + +#### Delete Distribution + +```bash +# Delete by name +kubectl delete llamastackdistribution + +# Delete from file +kubectl delete -f llamastack-distribution.yaml + +# Delete all distributions +kubectl delete llamastackdistribution --all +``` + +### Advanced Operations + +#### Scale Distribution + +```bash +# Scale to specific replica count +kubectl scale llamastackdistribution --replicas=5 + +# Scale multiple distributions +kubectl scale llamastackdistribution --all --replicas=3 +``` + +#### Rollout Management + +```bash +# Check rollout status +kubectl rollout status deployment/ + +# Rollout history +kubectl rollout history deployment/ + +# Rollback to previous version +kubectl rollout undo deployment/ + +# Rollback to specific revision +kubectl rollout undo deployment/ --to-revision=2 +``` + +#### Resource Management + +```bash +# Get resource usage +kubectl top pods -l app= + +# Get node usage +kubectl top nodes + +# Describe resource quotas +kubectl describe resourcequota +``` + +## Operator Management + +### Installation + +```bash +# Install operator +kubectl apply -f https://github.com/llamastack/llama-stack-k8s-operator/releases/latest/download/operator.yaml + +# Install specific version +kubectl apply -f https://github.com/llamastack/llama-stack-k8s-operator/releases/download/v1.0.0/operator.yaml + +# Install from local file +kubectl apply -f operator.yaml +``` + +### Operator Status + +```bash +# Check operator pods +kubectl get pods -n llamastack-system + +# Check operator logs +kubectl logs -n llamastack-system -l app=llamastack-operator + +# Follow operator logs +kubectl logs -n llamastack-system -l app=llamastack-operator -f +``` + +### Operator Configuration + +```bash +# Get operator configuration +kubectl get configmap -n llamastack-system llamastack-config -o yaml + +# Update operator configuration +kubectl patch configmap -n llamastack-system llamastack-config -p '{"data":{"config.yaml":"..."}}' +``` + +## Debugging Commands + +### Pod Operations + +```bash +# List pods for a distribution +kubectl get pods -l app= + +# Get pod logs +kubectl logs + +# Follow pod logs +kubectl logs -f + +# Get previous container logs +kubectl logs --previous + +# Execute commands in pod +kubectl exec -it -- /bin/bash + +# Copy files to/from pod +kubectl cp : +kubectl cp : +``` + +### Service Operations + +```bash +# List services +kubectl get svc -l app= + +# Describe service +kubectl describe svc + +# Get 
service endpoints +kubectl get endpoints + +# Port forward to service +kubectl port-forward svc/ 8080:8080 +``` + +### Network Debugging + +```bash +# Test DNS resolution +kubectl exec -it -- nslookup + +# Test connectivity +kubectl exec -it -- curl http://:8080/health + +# Check network policies +kubectl get networkpolicy +``` + +### Storage Operations + +```bash +# List persistent volume claims +kubectl get pvc -l app= + +# Describe PVC +kubectl describe pvc + +# Check storage usage +kubectl exec -it -- df -h + +# List storage classes +kubectl get storageclass +``` + +## Monitoring Commands + +### Resource Monitoring + +```bash +# Get resource usage for pods +kubectl top pods -l app= + +# Get resource usage for nodes +kubectl top nodes + +# Get resource usage with containers +kubectl top pods -l app= --containers +``` + +### Event Monitoring + +```bash +# Get events for a distribution +kubectl get events --field-selector involvedObject.name= + +# Get recent events +kubectl get events --sort-by=.metadata.creationTimestamp + +# Watch events in real-time +kubectl get events --watch +``` + +### Metrics Access + +```bash +# Port forward to metrics endpoint +kubectl port-forward 9090:9090 + +# Access metrics via curl +kubectl exec -it -- curl http://localhost:9090/metrics +``` + +## Configuration Management + +### ConfigMaps + +```bash +# Create ConfigMap from file +kubectl create configmap llamastack-config --from-file=config.yaml + +# Create ConfigMap from literal values +kubectl create configmap llamastack-config --from-literal=key1=value1 + +# Update ConfigMap +kubectl patch configmap llamastack-config -p '{"data":{"key":"new-value"}}' + +# Get ConfigMap +kubectl get configmap llamastack-config -o yaml +``` + +### Secrets + +```bash +# Create secret from file +kubectl create secret generic llamastack-secret --from-file=secret.txt + +# Create secret from literal values +kubectl create secret generic llamastack-secret --from-literal=password=secret123 + +# Get secret (base64 encoded) +kubectl get secret llamastack-secret -o yaml + +# Decode secret +kubectl get secret llamastack-secret -o jsonpath='{.data.password}' | base64 -d +``` + +## Backup and Recovery + +### Backup + +```bash +# Backup distribution configuration +kubectl get llamastackdistribution -o yaml > backup.yaml + +# Backup all distributions +kubectl get llamastackdistribution -o yaml > all-distributions-backup.yaml + +# Create volume snapshot +kubectl create volumesnapshot --from-pvc= +``` + +### Recovery + +```bash +# Restore from backup +kubectl apply -f backup.yaml + +# Restore from volume snapshot +kubectl apply -f - < + kind: VolumeSnapshot + apiGroup: snapshot.storage.k8s.io + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi +EOF +``` + +## Useful Aliases + +Add these to your shell configuration: + +```bash +# Basic aliases +alias k='kubectl' +alias kgd='kubectl get llamastackdistribution' +alias kdd='kubectl describe llamastackdistribution' +alias ked='kubectl edit llamastackdistribution' + +# Pod aliases +alias kgp='kubectl get pods' +alias kdp='kubectl describe pod' +alias kl='kubectl logs' +alias kex='kubectl exec -it' + +# Service aliases +alias kgs='kubectl get svc' +alias kds='kubectl describe svc' +alias kpf='kubectl port-forward' + +# Monitoring aliases +alias ktop='kubectl top' +alias kge='kubectl get events --sort-by=.metadata.creationTimestamp' +``` + +## Bash Completion + +Enable kubectl completion: + +```bash +# For bash +echo 'source <(kubectl completion bash)' >>~/.bashrc + +# For 
zsh +echo 'source <(kubectl completion zsh)' >>~/.zshrc + +# For fish +kubectl completion fish | source +``` + +## Common Workflows + +### Development Workflow + +```bash +# 1. Create development distribution +kubectl apply -f dev-llamastack.yaml + +# 2. Check status +kubectl get llamastackdistribution dev-llamastack + +# 3. Check pods +kubectl get pods -l app=dev-llamastack + +# 4. View logs +kubectl logs -f -l app=dev-llamastack + +# 5. Test connectivity +kubectl port-forward svc/dev-llamastack 8080:8080 + +# 6. Update configuration +kubectl edit llamastackdistribution dev-llamastack + +# 7. Clean up +kubectl delete llamastackdistribution dev-llamastack +``` + +### Production Deployment + +```bash +# 1. Validate configuration +kubectl apply -f prod-llamastack.yaml --dry-run=client + +# 2. Deploy +kubectl apply -f prod-llamastack.yaml + +# 3. Monitor rollout +kubectl rollout status deployment/prod-llamastack + +# 4. Verify health +kubectl get pods -l app=prod-llamastack +kubectl logs -l app=prod-llamastack | grep "Ready" + +# 5. Scale if needed +kubectl scale llamastackdistribution prod-llamastack --replicas=5 + +# 6. Monitor metrics +kubectl top pods -l app=prod-llamastack +``` + +### Troubleshooting Workflow + +```bash +# 1. Check distribution status +kubectl describe llamastackdistribution + +# 2. Check pod status +kubectl get pods -l app= +kubectl describe pod + +# 3. Check logs +kubectl logs +kubectl logs --previous + +# 4. Check events +kubectl get events --field-selector involvedObject.name= + +# 5. Debug network +kubectl exec -it -- curl http://localhost:8080/health + +# 6. Check resources +kubectl top pods -l app= +kubectl describe node +``` + +## Next Steps + +- [API Reference](api.md) +- [Configuration Reference](configuration.md) +- [Troubleshooting Guide](../how-to/troubleshooting.md) diff --git a/docs/content/reference/configuration.md b/docs/content/reference/configuration.md new file mode 100644 index 000000000..befc08ce8 --- /dev/null +++ b/docs/content/reference/configuration.md @@ -0,0 +1,513 @@ +# Configuration Reference + +Complete reference for configuring LlamaStack Kubernetes Operator based on the actual API. + +## LlamaStackDistribution Specification + +### Basic Structure + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: string + namespace: string +spec: + replicas: integer # Default: 1 + server: + distribution: + # Either name OR image (mutually exclusive) + name: string # Distribution name from supported distributions + image: string # Direct container image reference + containerSpec: + name: string # Default: "llama-stack" + port: integer # Default: 8321 + resources: + requests: + cpu: string + memory: string + limits: + cpu: string + memory: string + env: + - name: string + value: string + podOverrides: # Optional pod-level customization + volumes: + - name: string + # ... 
volume spec + volumeMounts: + - name: string + mountPath: string + storage: # Optional persistent storage + size: string # Default: "10Gi" + mountPath: string # Default: "/.llama" +``` + +## Core Configuration + +### Distribution Configuration + +You can specify either a distribution name OR a direct image reference: + +```yaml +# Option 1: Use a named distribution (recommended) +spec: + server: + distribution: + name: "starter" # Maps to supported distributions + +# Option 2: Use a direct image +spec: + server: + distribution: + image: "llamastack/llamastack:latest" +``` + +### Supported Distribution Names + +The operator supports the following **7 pre-configured distributions**: + +| Distribution Name | Image | Description | +|-------------------|-------|-------------| +| `starter` | `docker.io/llamastack/distribution-starter:latest` | **Recommended default** - General purpose LlamaStack distribution | +| `ollama` | `docker.io/llamastack/distribution-ollama:latest` | Ollama-based distribution for local inference | +| `bedrock` | `docker.io/llamastack/distribution-bedrock:latest` | AWS Bedrock distribution for cloud-based models | +| `remote-vllm` | `docker.io/llamastack/distribution-remote-vllm:latest` | Remote vLLM server integration | +| `tgi` | `docker.io/llamastack/distribution-tgi:latest` | Hugging Face Text Generation Inference | +| `together` | `docker.io/llamastack/distribution-together:latest` | Together AI API integration | +| `vllm-gpu` | `docker.io/llamastack/distribution-vllm-gpu:latest` | High-performance GPU inference with vLLM | +| `remote-vllm` | `docker.io/llamastack/distribution-remote-vllm:latest` | Remote vLLM distribution | +| `sambanova` | `docker.io/llamastack/distribution-sambanova:latest` | SambaNova distribution | +| `tgi` | `docker.io/llamastack/distribution-tgi:latest` | Text Generation Inference distribution | +| `together` | `docker.io/llamastack/distribution-together:latest` | Together AI distribution | +| `vllm-gpu` | `docker.io/llamastack/distribution-vllm-gpu:latest` | vLLM GPU distribution | +| `watsonx` | `docker.io/llamastack/distribution-watsonx:latest` | IBM watsonx distribution | +| `fireworks` | `docker.io/llamastack/distribution-fireworks:latest` | Fireworks AI distribution | + +**Examples:** + +```yaml +# Ollama distribution +spec: + server: + distribution: + name: "ollama" + +# Hugging Face Endpoint +spec: + server: + distribution: + name: "hf-endpoint" + +# NVIDIA distribution +spec: + server: + distribution: + name: "nvidia" + +# vLLM GPU distribution +spec: + server: + distribution: + name: "vllm-gpu" +``` + +### Replica Configuration + +```yaml +spec: + replicas: 3 # Default: 1 +``` + +### Container Configuration + +```yaml +spec: + server: + containerSpec: + name: "llama-stack" # Default container name + port: 8321 # Default port + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "2" + memory: "4Gi" + env: + - name: "INFERENCE_MODEL" + value: "llama2-7b" + - name: "LOG_LEVEL" + value: "INFO" +``` + +## Storage Configuration + +### Basic Storage + +```yaml +spec: + server: + storage: + size: "50Gi" # Default: "10Gi" + mountPath: "/.llama" # Default mount path +``` + +### Custom Mount Path + +```yaml +spec: + server: + storage: + size: "100Gi" + mountPath: "/custom/path" +``` + +## Advanced Pod Customization + +### Additional Volumes + +```yaml +spec: + server: + podOverrides: + volumes: + - name: "model-cache" + emptyDir: + sizeLimit: "20Gi" + - name: "config" + configMap: + name: "llamastack-config" + volumeMounts: + - 
name: "model-cache" + mountPath: "/cache" + - name: "config" + mountPath: "/config" + readOnly: true +``` + +### ConfigMap Integration + +```yaml +spec: + server: + podOverrides: + volumes: + - name: "llamastack-config" + configMap: + name: "my-llamastack-config" + volumeMounts: + - name: "llamastack-config" + mountPath: "/app/config" +``` + +## Configuration Examples + +### Minimal Configuration + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: simple-llamastack +spec: + server: + distribution: + name: "ollama" +``` + +### Development Configuration + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: llamastack-dev +spec: + replicas: 1 + server: + distribution: + image: "llamastack/llamastack:latest" + containerSpec: + port: 8321 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1" + memory: "2Gi" + env: + - name: "LOG_LEVEL" + value: "DEBUG" + storage: + size: "20Gi" +``` + +### Production Configuration + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: llamastack-prod +spec: + replicas: 3 + server: + distribution: + image: "llamastack/llamastack:v1.0.0" + containerSpec: + name: "llama-stack" + port: 8321 + resources: + requests: + cpu: "2" + memory: "4Gi" + limits: + cpu: "4" + memory: "8Gi" + env: + - name: "INFERENCE_MODEL" + value: "llama2-70b" + - name: "MAX_WORKERS" + value: "4" + storage: + size: "500Gi" + mountPath: "/.llama" + podOverrides: + volumes: + - name: "model-cache" + emptyDir: + sizeLimit: "100Gi" + volumeMounts: + - name: "model-cache" + mountPath: "/cache" +``` + +### Custom Image with Configuration + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: custom-llamastack +spec: + replicas: 2 + server: + distribution: + image: "myregistry.com/custom-llamastack:v1.0" + containerSpec: + port: 8321 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "2" + memory: "4Gi" + env: + - name: "CUSTOM_CONFIG" + value: "/config/custom.yaml" + storage: + size: "100Gi" + podOverrides: + volumes: + - name: "custom-config" + configMap: + name: "llamastack-custom-config" + volumeMounts: + - name: "custom-config" + mountPath: "/config" + readOnly: true +``` + +### Distribution-Specific Examples + +#### Ollama Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: ollama-llamastack +spec: + replicas: 1 + server: + distribution: + name: "ollama" + containerSpec: + port: 8321 + env: + - name: OLLAMA_URL + value: "http://ollama-server-service.ollama-dist.svc.cluster.local:11434" + storage: + size: "20Gi" +``` + +#### Hugging Face Endpoint + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: hf-endpoint-llamastack +spec: + server: + distribution: + name: "hf-endpoint" + containerSpec: + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-credentials + key: token + - name: HF_MODEL_ID + value: "meta-llama/Llama-2-7b-chat-hf" +``` + +#### NVIDIA Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: nvidia-llamastack +spec: + server: + distribution: + name: "nvidia" + containerSpec: + resources: + requests: + nvidia.com/gpu: "1" + limits: + nvidia.com/gpu: "1" + env: + - name: NVIDIA_API_KEY + valueFrom: + secretKeyRef: + name: nvidia-credentials + key: api-key +``` + +#### vLLM GPU Distribution + +```yaml +apiVersion: 
llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: vllm-gpu-llamastack +spec: + server: + distribution: + name: "vllm-gpu" + containerSpec: + resources: + requests: + nvidia.com/gpu: "1" + memory: "8Gi" + limits: + nvidia.com/gpu: "1" + memory: "16Gi" + env: + - name: MODEL_NAME + value: "meta-llama/Llama-2-7b-chat-hf" + storage: + size: "50Gi" +``` + +#### AWS Bedrock Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: bedrock-llamastack +spec: + server: + distribution: + name: "bedrock" + containerSpec: + env: + - name: AWS_REGION + value: "us-east-1" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-credentials + key: access-key-id + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-credentials + key: secret-access-key +``` + +#### Together AI Distribution + +```yaml +apiVersion: llamastack.io/v1alpha1 +kind: LlamaStackDistribution +metadata: + name: together-llamastack +spec: + server: + distribution: + name: "together" + containerSpec: + env: + - name: TOGETHER_API_KEY + valueFrom: + secretKeyRef: + name: together-credentials + key: api-key + - name: MODEL_NAME + value: "meta-llama/Llama-2-7b-chat-hf" +``` + +## Status Information + +The operator provides status information about the distribution: + +```yaml +status: + version: "1.0.0" + ready: true + distributionConfig: + activeDistribution: "meta-reference" + providers: + - api: "inference" + provider_id: "meta-reference" + provider_type: "inference" + availableDistributions: + "meta-reference": "llamastack/llamastack:latest" +``` + +## Constants and Defaults + +The API defines several constants: + +- **Default Container Name**: `llama-stack` +- **Default Server Port**: `8321` +- **Default Service Port Name**: `http` +- **Default Mount Path**: `/.llama` +- **Default Storage Size**: `10Gi` +- **Default Label Key**: `app` +- **Default Label Value**: `llama-stack` + +## Validation Rules + +The API includes validation: + +- **Distribution**: Only one of `name` or `image` can be specified +- **Port**: Must be a valid port number +- **Resources**: Follow Kubernetes resource requirements format +- **Storage Size**: Must be a valid Kubernetes quantity + +## Next Steps + +- [API Reference](api.md) +- [CLI Reference](cli.md) +- [How-to Guides](../how-to/deploy-llamastack.md) diff --git a/docs/content/stylesheets/extra.css b/docs/content/stylesheets/extra.css new file mode 100644 index 000000000..45413ff29 --- /dev/null +++ b/docs/content/stylesheets/extra.css @@ -0,0 +1,322 @@ +/* Custom styles for LlamaStack Operator Documentation */ + +:root { + --llamastack-primary: #2563eb; + --llamastack-secondary: #64748b; + --llamastack-accent: #0ea5e9; + --llamastack-success: #10b981; + --llamastack-warning: #f59e0b; + --llamastack-error: #ef4444; +} + +/* Enhanced code blocks */ +.highlight { + border-radius: 8px; + overflow: hidden; +} + +.highlight pre { + margin: 0; + padding: 1rem; + background: var(--md-code-bg-color); +} + +/* Copy button styling */ +.md-clipboard { + border-radius: 4px; + transition: all 0.2s ease; +} + +.md-clipboard:hover { + background-color: var(--llamastack-primary); + color: white; +} + +/* Enhanced admonitions */ +.md-typeset .admonition { + border-radius: 8px; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); +} + +.md-typeset .admonition.note { + border-left: 4px solid var(--llamastack-primary); +} + +.md-typeset .admonition.tip { + border-left: 4px solid var(--llamastack-success); +} + +.md-typeset 
.admonition.warning { + border-left: 4px solid var(--llamastack-warning); +} + +.md-typeset .admonition.danger { + border-left: 4px solid var(--llamastack-error); +} + +/* API reference styling */ +.api-section { + margin: 2rem 0; + padding: 1.5rem; + border: 1px solid var(--md-default-fg-color--lightest); + border-radius: 8px; + background: var(--md-code-bg-color); +} + +.api-section h3 { + margin-top: 0; + color: var(--llamastack-primary); +} + +.api-field { + margin: 1rem 0; + padding: 0.5rem; + background: rgba(37, 99, 235, 0.05); + border-radius: 4px; +} + +.api-field-name { + font-family: var(--md-code-font); + font-weight: bold; + color: var(--llamastack-primary); +} + +.api-field-type { + font-family: var(--md-code-font); + color: var(--llamastack-secondary); + font-size: 0.9em; +} + +.api-field-description { + margin-top: 0.5rem; + color: var(--md-default-fg-color); +} + +/* Enhanced tables */ +.md-typeset table:not([class]) { + border-radius: 8px; + overflow: hidden; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); +} + +.md-typeset table:not([class]) th { + background: var(--llamastack-primary); + color: white; + font-weight: 600; +} + +.md-typeset table:not([class]) tr:nth-child(even) { + background: rgba(37, 99, 235, 0.05); +} + +/* Navigation enhancements */ +.md-nav__item--active > .md-nav__link { + color: var(--llamastack-primary); + font-weight: 600; +} + +.md-nav__link:hover { + color: var(--llamastack-accent); +} + +/* Search enhancements */ +.md-search__input { + border-radius: 8px; +} + +.md-search__input:focus { + border-color: var(--llamastack-primary); + box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.2); +} + +/* Button styling */ +.md-button { + border-radius: 6px; + transition: all 0.2s ease; +} + +.md-button--primary { + background: var(--llamastack-primary); + border-color: var(--llamastack-primary); +} + +.md-button--primary:hover { + background: var(--llamastack-accent); + border-color: var(--llamastack-accent); + transform: translateY(-1px); +} + +/* Code snippet enhancements */ +.md-typeset code { + background: rgba(37, 99, 235, 0.1); + color: var(--llamastack-primary); + padding: 0.2em 0.4em; + border-radius: 4px; + font-size: 0.9em; +} + +/* Mermaid diagram styling */ +.mermaid { + text-align: center; + margin: 2rem 0; +} + +.mermaid svg { + max-width: 100%; + height: auto; +} + +/* Status badges */ +.status-badge { + display: inline-block; + padding: 0.25rem 0.5rem; + border-radius: 4px; + font-size: 0.8em; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.5px; +} + +.status-badge--stable { + background: var(--llamastack-success); + color: white; +} + +.status-badge--beta { + background: var(--llamastack-warning); + color: white; +} + +.status-badge--alpha { + background: var(--llamastack-error); + color: white; +} + +/* Command line styling */ +.command-line { + background: #1e293b; + color: #e2e8f0; + padding: 1rem; + border-radius: 8px; + font-family: var(--md-code-font); + overflow-x: auto; +} + +.command-line::before { + content: "$ "; + color: var(--llamastack-success); + font-weight: bold; +} + +/* Responsive adjustments */ +@media screen and (max-width: 768px) { + .api-section { + padding: 1rem; + margin: 1rem 0; + } + + .md-typeset table:not([class]) { + font-size: 0.9em; + } + + .highlight pre { + padding: 0.75rem; + font-size: 0.9em; + } +} + +/* Dark mode adjustments */ +[data-md-color-scheme="slate"] { + --llamastack-primary: #60a5fa; + --llamastack-secondary: #94a3b8; + --llamastack-accent: #38bdf8; +} + 
+[data-md-color-scheme="slate"] .api-section { + background: rgba(255, 255, 255, 0.05); + border-color: rgba(255, 255, 255, 0.1); +} + +[data-md-color-scheme="slate"] .api-field { + background: rgba(96, 165, 250, 0.1); +} + +[data-md-color-scheme="slate"] .md-typeset table:not([class]) tr:nth-child(even) { + background: rgba(96, 165, 250, 0.05); +} + +/* Print styles */ +@media print { + .md-header, + .md-sidebar, + .md-footer { + display: none !important; + } + + .md-main__inner { + margin: 0 !important; + } + + .md-content { + max-width: none !important; + } + + .highlight { + break-inside: avoid; + } + + .api-section { + break-inside: avoid; + box-shadow: none; + border: 1px solid #ccc; + } +} + +/* Accessibility improvements */ +.md-nav__link:focus, +.md-search__input:focus, +.md-button:focus { + outline: 2px solid var(--llamastack-primary); + outline-offset: 2px; +} + +/* Skip to content link */ +.skip-link { + position: absolute; + top: -40px; + left: 6px; + background: var(--llamastack-primary); + color: white; + padding: 8px; + text-decoration: none; + border-radius: 4px; + z-index: 1000; +} + +.skip-link:focus { + top: 6px; +} + +/* Loading animation for dynamic content */ +.loading { + display: inline-block; + width: 20px; + height: 20px; + border: 3px solid rgba(37, 99, 235, 0.3); + border-radius: 50%; + border-top-color: var(--llamastack-primary); + animation: spin 1s ease-in-out infinite; +} + +@keyframes spin { + to { transform: rotate(360deg); } +} + +/* Enhanced footer */ +.md-footer { + background: linear-gradient(135deg, var(--llamastack-primary), var(--llamastack-accent)); +} + +.md-footer-meta { + background: rgba(0, 0, 0, 0.1); +} diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml new file mode 100644 index 000000000..09675af65 --- /dev/null +++ b/docs/mkdocs.yml @@ -0,0 +1,118 @@ +site_name: LlamaStack Kubernetes Operator +site_description: Kubernetes operator for managing LlamaStack distributions +site_url: https://llamastack-k8s-operator.pages.dev +repo_url: https://github.com/llamastack/llama-stack-k8s-operator +repo_name: llamastack/llama-stack-k8s-operator + +# Directory configuration +docs_dir: content +site_dir: site + +# Theme configuration (Material Design like HyperShift) +theme: + name: material + palette: + - scheme: default + primary: blue + accent: blue + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - scheme: slate + primary: blue + accent: blue + toggle: + icon: material/brightness-4 + name: Switch to light mode + features: + - navigation.tabs + - navigation.sections + - navigation.expand + - navigation.top + - search.highlight + - search.share + - content.code.copy + - content.code.annotate + - navigation.footer + +# Navigation structure +nav: + - Home: index.md + - Getting Started: + - Installation: getting-started/installation.md + - Quick Start: getting-started/quick-start.md + - Understanding Distributions: getting-started/distributions.md + - Configuration: getting-started/configuration.md + - Distributions: + - Starter: distributions/starter.md + - vLLM: distributions/vllm.md + - Ollama: distributions/ollama.md + - AWS Bedrock: distributions/bedrock.md + - Text Generation Inference: distributions/tgi.md + - Together AI: distributions/together.md + - Bring Your Own: distributions/bring-your-own.md + - How-to Guides: + - Deploy LlamaStack: how-to/deploy-llamastack.md + - Configure Storage: how-to/configure-storage.md + - Scaling: how-to/scaling.md + - Monitoring: how-to/monitoring.md + - Troubleshooting: how-to/troubleshooting.md 
+ - Reference: + - API Reference: reference/api.md + - Configuration Reference: reference/configuration.md + - CLI Reference: reference/cli.md + - Examples: + - Basic Deployment: examples/basic-deployment.md + - Production Setup: examples/production-setup.md + - Custom Images: examples/custom-images.md + - Contributing: + - Development Guide: contributing/development.md + - Testing: contributing/testing.md + - Documentation: contributing/documentation.md + +# Plugins +plugins: + - search + - mermaid2 + +# Markdown extensions +markdown_extensions: + - admonition + - pymdownx.details + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.tabbed: + alternate_style: true + - attr_list + - md_in_html + - toc: + permalink: true + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + +# Extra CSS and JavaScript +extra_css: + - stylesheets/extra.css + +extra_javascript: + - javascripts/extra.js + +# Footer +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/llamastack/llama-stack-k8s-operator + - icon: fontawesome/brands/docker + link: https://quay.io/repository/llamastack/llama-stack-k8s-operator + version: + provider: mike diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..6cc0f31b7 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,11 @@ +# MkDocs and theme +mkdocs>=1.5.0 +mkdocs-material>=9.4.0 + +# Plugins +mkdocs-mermaid2-plugin>=1.1.0 + +# Python dependencies for documentation generation +PyYAML>=6.0 +Jinja2>=3.1.0 +Markdown>=3.5.0 diff --git a/scripts/build-docs.sh b/scripts/build-docs.sh new file mode 100755 index 000000000..d4fbec679 --- /dev/null +++ b/scripts/build-docs.sh @@ -0,0 +1,189 @@ +#!/bin/bash +set -e + +# Build script for LlamaStack Operator Documentation +# This script builds the documentation site locally for development and testing + +echo "πŸš€ Building LlamaStack Operator Documentation..." + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check if we're in the right directory +if [ ! -f "Makefile" ] || [ ! -d "docs" ]; then + print_error "This script must be run from the repository root directory" + exit 1 +fi + +# Check dependencies +print_status "Checking dependencies..." + +# Check Go +if ! command -v go &> /dev/null; then + print_error "Go is not installed. Please install Go 1.21 or later." + exit 1 +fi + +GO_VERSION=$(go version | grep -oE 'go[0-9]+\.[0-9]+' | sed 's/go//') +REQUIRED_GO_VERSION="1.21" + +if [ "$(printf '%s\n' "$REQUIRED_GO_VERSION" "$GO_VERSION" | sort -V | head -n1)" != "$REQUIRED_GO_VERSION" ]; then + print_error "Go version $REQUIRED_GO_VERSION or later is required. Found: $GO_VERSION" + exit 1 +fi + +print_success "Go version $GO_VERSION found" + +# Check Python +if ! command -v python3 &> /dev/null; then + print_error "Python 3 is not installed. Please install Python 3.8 or later." 
+ exit 1 +fi + +PYTHON_VERSION=$(python3 --version | grep -oE '[0-9]+\.[0-9]+') +REQUIRED_PYTHON_VERSION="3.8" + +if [ "$(printf '%s\n' "$REQUIRED_PYTHON_VERSION" "$PYTHON_VERSION" | sort -V | head -n1)" != "$REQUIRED_PYTHON_VERSION" ]; then + print_error "Python version $REQUIRED_PYTHON_VERSION or later is required. Found: $PYTHON_VERSION" + exit 1 +fi + +print_success "Python version $PYTHON_VERSION found" + +# Check pip +if ! command -v pip3 &> /dev/null; then + print_error "pip3 is not installed. Please install pip3." + exit 1 +fi + +print_success "pip3 found" + +# Install Go tools +print_status "Installing Go documentation tools..." + +if ! make crd-ref-docs &> /dev/null; then + print_error "Failed to install crd-ref-docs" + exit 1 +fi + +if ! make gen-crd-api-reference-docs &> /dev/null; then + print_warning "gen-crd-api-reference-docs installation failed, continuing with crd-ref-docs only" +fi + +print_success "Go tools installed" + +# Install Python dependencies +print_status "Installing Python dependencies..." + +if [ -f "docs/requirements.txt" ]; then + if ! pip3 install -r docs/requirements.txt; then + print_error "Failed to install Python dependencies" + exit 1 + fi + print_success "Python dependencies installed" +else + print_error "docs/requirements.txt not found" + exit 1 +fi + +# Generate API documentation +print_status "Generating API documentation..." + +if ! make api-docs; then + print_error "Failed to generate API documentation" + exit 1 +fi + +print_success "API documentation generated" + +# Build documentation site +print_status "Building documentation site..." + +if ! make docs-build; then + print_error "Failed to build documentation site" + exit 1 +fi + +print_success "Documentation site built successfully" + +# Check if site directory exists and has content +if [ ! -d "docs/site" ]; then + print_error "Documentation site directory not found" + exit 1 +fi + +SITE_SIZE=$(du -sh docs/site | cut -f1) +print_success "Documentation site built (${SITE_SIZE})" + +# Validate the build +print_status "Validating build..." + +# Check for index.html +if [ ! -f "docs/site/index.html" ]; then + print_error "index.html not found in build output" + exit 1 +fi + +# Check for API documentation +if [ ! -f "docs/site/reference/api/index.html" ]; then + print_warning "API reference page not found, but continuing..." +fi + +# Check for assets +if [ ! -d "docs/site/assets" ]; then + print_warning "Assets directory not found, but continuing..." +fi + +print_success "Build validation completed" + +# Display build information +echo "" +echo "πŸ“Š Build Summary:" +echo "==================" +echo "πŸ“ Output directory: docs/site/" +echo "πŸ“ Site size: ${SITE_SIZE}" +echo "πŸ”— Local preview: http://localhost:8000" +echo "" + +# Offer to serve the site locally +read -p "🌐 Would you like to serve the documentation locally? (y/N): " -n 1 -r +echo +if [[ $REPLY =~ ^[Yy]$ ]]; then + print_status "Starting local server..." + echo "πŸ“– Documentation will be available at: http://localhost:8000" + echo "πŸ›‘ Press Ctrl+C to stop the server" + echo "" + + cd docs && python3 -m mkdocs serve --dev-addr 0.0.0.0:8000 +else + echo "" + print_success "Documentation build completed successfully!" 
+    echo ""
+    echo "To serve the documentation locally, run:"
+    echo "  cd docs && mkdocs serve"
+    echo ""
+    echo "Or use the Makefile target:"
+    echo "  make docs-serve"
+    echo ""
+fi
diff --git a/wrangler.toml b/wrangler.toml
new file mode 100644
index 000000000..eed412e6b
--- /dev/null
+++ b/wrangler.toml
@@ -0,0 +1,81 @@
+name = "llamastack-k8s-operator-docs"
+compatibility_date = "2024-01-15"
+
+[env.production]
+name = "llamastack-k8s-operator-docs"
+
+[env.preview]
+name = "llamastack-k8s-operator-docs-preview"
+
+# Build configuration
+[build]
+command = "make docs-build"
+cwd = "."
+watch_dir = ["docs", "api", "crd-ref-docs.config.yaml"]
+
+[build.environment_variables]
+GO_VERSION = "1.21"
+PYTHON_VERSION = "3.11"
+NODE_VERSION = "18"
+
+# Pages configuration
+[pages]
+build_output_dir = "docs/site"
+build_caching = true
+
+# Custom headers for security and performance
+[[pages.headers]]
+for = "/*"
+[pages.headers.values]
+X-Frame-Options = "DENY"
+X-Content-Type-Options = "nosniff"
+X-XSS-Protection = "1; mode=block"
+Referrer-Policy = "strict-origin-when-cross-origin"
+Permissions-Policy = "camera=(), microphone=(), geolocation=()"
+
+[[pages.headers]]
+for = "*.css"
+[pages.headers.values]
+Cache-Control = "public, max-age=31536000, immutable"
+
+[[pages.headers]]
+for = "*.js"
+[pages.headers.values]
+Cache-Control = "public, max-age=31536000, immutable"
+
+[[pages.headers]]
+for = "*.woff2"
+[pages.headers.values]
+Cache-Control = "public, max-age=31536000, immutable"
+
+[[pages.headers]]
+for = "*.png"
+[pages.headers.values]
+Cache-Control = "public, max-age=31536000, immutable"
+
+[[pages.headers]]
+for = "*.jpg"
+[pages.headers.values]
+Cache-Control = "public, max-age=31536000, immutable"
+
+# Redirects for better UX
+[[pages.redirects]]
+from = "/api"
+to = "/reference/api/"
+status = 301
+
+[[pages.redirects]]
+from = "/docs"
+to = "/"
+status = 301
+
+[[pages.redirects]]
+from = "/getting-started"
+to = "/getting-started/installation/"
+status = 301
+
+# Serve the custom 404 page for unmatched routes
+[[pages.redirects]]
+from = "/*"
+to = "/404.html"
+status = 404
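+
+# For reference, Cloudflare Pages can also read plain-text `_headers` and
+# `_redirects` files from the build output directory. If the
+# [[pages.headers]]/[[pages.redirects]] tables above are not honored by the
+# Pages build, equivalent rules (hypothetical files under docs/site, mirroring
+# the tables above) would look like:
+#
+#   docs/site/_redirects:
+#     /api              /reference/api/                 301
+#     /docs             /                               301
+#     /getting-started  /getting-started/installation/  301
+#
+#   docs/site/_headers:
+#     /*
+#       X-Frame-Options: DENY
+#       X-Content-Type-Options: nosniff
+#       Referrer-Policy: strict-origin-when-cross-origin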