diff --git a/.claude/project.json b/.claude/project.json deleted file mode 100644 index c906f02..0000000 --- a/.claude/project.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "name": "nutrient-dws-client-python", - "description": "Python client for Nutrient DWS", - "workspace": "/Users/admin/Projects/nutrient-dws-client-python", - "mcpServers": { - "claude-code-mcp": { - "enabled": true, - "workspace": "/Users/admin/Projects/nutrient-dws-client-python" - }, - "claude-code-github": { - "enabled": true, - "repository": "nutrient-dws-client-python" - } - } -} \ No newline at end of file diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..f7e1872 --- /dev/null +++ b/.env.example @@ -0,0 +1,7 @@ +# Nutrient DWS Processor API Configuration for Testing +NUTRIENT_API_KEY=your_api_key_here +NUTRIENT_BASE_URL=https://api.nutrient.io + +# Development Settings +DEBUG=true +PYTHON_ENV=development diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1eee5eb..d6e2225 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,81 +1,44 @@ name: CI -# Integration Test Strategy: -# - Fork PRs: Cannot access secrets, so integration tests are skipped with informative feedback -# - Same-repo PRs: Have access to secrets, integration tests run normally -# - Push to main/develop: Integration tests always run to catch any issues after merge -# - Manual trigger: Allows maintainers to run integration tests on demand -# -# This ensures security while still validating integration tests before release - on: push: - branches: [ main, develop ] + branches: [ main ] pull_request: - branches: [ main, develop ] - # Run integration tests after PR is merged - workflow_dispatch: # Allow manual trigger for integration tests + branches: [ main ] jobs: - test: + lint-and-type-check: runs-on: ubuntu-latest - strategy: - matrix: - python-version: ['3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} 
+ - name: Set up Python uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} - - - name: Cache pip dependencies - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }} - restore-keys: | - ${{ runner.os }}-pip- + python-version: '3.12' + cache: 'pip' - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" + pip install ".[dev]" - - name: Run linting with ruff - if: matrix.python-version == '3.10' + - name: Run linting run: | - python -m ruff check . - python -m ruff format --check . - - - name: Run type checking with mypy - run: python -m mypy --python-version=${{ matrix.python-version }} src tests + python -m ruff check src/ - - name: Run unit tests with pytest - run: python -m pytest tests/unit/ -v --cov=nutrient_dws --cov-report=xml --cov-report=term + - name: Run type checking + run: python -m mypy src/ - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - files: ./coverage.xml - flags: unittests - name: codecov-umbrella - fail_ci_if_error: false + unit-tests: + runs-on: ${{ matrix.os }} + needs: lint-and-type-check - integration-test: - runs-on: ubuntu-latest - # Run on: pushes to main/develop, PRs from same repo, and manual triggers - if: | - github.event_name == 'push' || - github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) strategy: matrix: python-version: ['3.10', '3.11', '3.12'] - + os: [ubuntu-latest, windows-latest, macos-latest] + steps: - uses: actions/checkout@v4 @@ -83,108 +46,26 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - - name: Cache pip dependencies - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }} - restore-keys: | - ${{ runner.os 
}}-pip- + cache: 'pip' - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" + pip install ".[dev]" - - name: Check for API key availability - run: | - if [ -z "${{ secrets.NUTRIENT_DWS_API_KEY }}" ]; then - echo "::warning::NUTRIENT_DWS_API_KEY secret not found, skipping integration tests" - echo "skip_tests=true" >> $GITHUB_ENV - - # Provide context about why this might be happening - if [ "${{ github.event_name }}" == "pull_request" ]; then - if [ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]; then - echo "::notice::This appears to be a PR from a fork. Secrets are not available for security reasons." - else - echo "::error::This is a PR from the same repository but the API key is missing. Please check repository secrets configuration." - fi - else - echo "::error::Running on ${{ github.event_name }} event but API key is missing. Please configure NUTRIENT_DWS_API_KEY secret." - fi - else - echo "::notice::API key found, integration tests will run" - echo "skip_tests=false" >> $GITHUB_ENV - fi - - - name: Create integration config with API key - if: env.skip_tests != 'true' - run: | - python -c " - import os - with open('tests/integration/integration_config.py', 'w') as f: - f.write(f'API_KEY = \"{os.environ[\"NUTRIENT_DWS_API_KEY\"]}\"\n') - " - env: - NUTRIENT_DWS_API_KEY: ${{ secrets.NUTRIENT_DWS_API_KEY }} - - - name: Run integration tests - if: env.skip_tests != 'true' - run: python -m pytest tests/integration/ -v - - - name: Cleanup integration config - if: always() - run: rm -f tests/integration/integration_config.py - - # Provide feedback for fork PRs where integration tests can't run - integration-test-fork-feedback: - runs-on: ubuntu-latest - if: | - github.event_name == 'pull_request' && - github.event.pull_request.head.repo.full_name != github.repository - steps: - - name: Comment on PR about integration tests - uses: actions/github-script@v7 + - name: Run unit tests 
with coverage + run: python -m pytest tests/unit/ -v --cov=nutrient_dws --cov-report=xml --cov-report=term + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12' with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const issue_number = context.issue.number; - const owner = context.repo.owner; - const repo = context.repo.repo; - - // Check if we've already commented - const comments = await github.rest.issues.listComments({ - owner, - repo, - issue_number, - }); - - const botComment = comments.data.find(comment => - comment.user.type === 'Bot' && - comment.body.includes('Integration tests are skipped for pull requests from forks') - ); - - if (!botComment) { - await github.rest.issues.createComment({ - owner, - repo, - issue_number, - body: `## Integration Tests Status\n\n` + - `Integration tests are skipped for pull requests from forks due to security restrictions. ` + - `These tests will run automatically after the PR is merged.\n\n` + - `**What this means:**\n` + - `- Unit tests, linting, and type checking have passed ✅\n` + - `- Integration tests require API credentials that aren't available to fork PRs\n` + - `- A maintainer will review your changes and merge if appropriate\n` + - `- Integration tests will run on the main branch after merge\n\n` + - `Thank you for your contribution! 
🙏` - }); - } + files: ./coverage.xml + flags: unittests + name: codecov-umbrella build: runs-on: ubuntu-latest - needs: test + needs: [lint-and-type-check, unit-tests] steps: - uses: actions/checkout@v4 @@ -193,20 +74,14 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.12' + cache: 'pip' - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" + pip install ".[dev]" - name: Build package run: python -m build - - name: Check package with twine + - name: Verify build outputs run: twine check dist/* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - name: dist - path: dist/ diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 0000000..15fd475 --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,54 @@ +name: Integration Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + integration-tests: + runs-on: ubuntu-latest + needs: [] # Run in parallel with other workflows + + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + + - name: Install dependencies + run: | + pip install ".[dev]" + + - name: Check for API key + id: check-api-key + env: + NUTRIENT_API_KEY: ${{ secrets.NUTRIENT_API_KEY }} + run: | + if [ -n "$NUTRIENT_API_KEY" ] && [ "$NUTRIENT_API_KEY" != "fake_key" ] && [ ${#NUTRIENT_API_KEY} -gt 10 ]; then + echo "has_api_key=true" >> $GITHUB_OUTPUT + echo "✅ Valid API key detected" + else + echo "has_api_key=false" >> $GITHUB_OUTPUT + echo "⏭️ No valid API key - Integration tests will be skipped" + fi + + - name: Run integration tests + if: steps.check-api-key.outputs.has_api_key == 'true' + env: + NUTRIENT_API_KEY: ${{ secrets.NUTRIENT_API_KEY }} + run: 
python -m pytest tests/test_integration.py -v + + - name: Skip integration tests (no API key) + if: steps.check-api-key.outputs.has_api_key == 'false' + run: | + echo "✅ Integration tests skipped - no valid API key available" + echo "This is expected for forks and external PRs" diff --git a/.github/workflows/publish-existing-tag.yml b/.github/workflows/publish-existing-tag.yml deleted file mode 100644 index 44babb1..0000000 --- a/.github/workflows/publish-existing-tag.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Publish Existing Tag to PyPI - -on: - workflow_dispatch: - inputs: - tag: - description: 'Tag to publish (e.g., v1.0.2)' - required: true - default: 'v1.0.2' - -jobs: - build-and-publish: - name: Build and Publish to PyPI - runs-on: ubuntu-latest - - # IMPORTANT: This permission is required for trusted publishing - permissions: - id-token: write - - steps: - - name: Checkout specific tag - uses: actions/checkout@v4 - with: - ref: ${{ github.event.inputs.tag }} - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install build dependencies - run: | - python -m pip install --upgrade pip - python -m pip install build - - - name: Build distribution - run: python -m build - - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - # No need for username/password with trusted publishing! 
\ No newline at end of file diff --git a/.github/workflows/publish-manual.yml b/.github/workflows/publish.yml similarity index 87% rename from .github/workflows/publish-manual.yml rename to .github/workflows/publish.yml index fc0a83d..db74c79 100644 --- a/.github/workflows/publish-manual.yml +++ b/.github/workflows/publish.yml @@ -7,27 +7,27 @@ jobs: publish: name: Publish to PyPI runs-on: ubuntu-latest - + permissions: id-token: write contents: read - + steps: # Use current branch/tag - uses: actions/checkout@v4 - + - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.10' - + - name: Install build dependencies run: | python -m pip install --upgrade pip python -m pip install build - + - name: Build distribution run: python -m build - + - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index 99cd5f6..0000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: Release - -on: - release: - types: [published] # Changed from 'created' to 'published' for better control - # Allow manual trigger - workflow_dispatch: - -jobs: - deploy: - runs-on: ubuntu-latest - - # IMPORTANT: Required for trusted publishing - permissions: - id-token: write - contents: read - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install build dependencies - run: | - python -m pip install --upgrade pip - python -m pip install build - - - name: Build package - run: python -m build - - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - # No API token needed with trusted publishing! 
\ No newline at end of file diff --git a/.github/workflows/scheduled-integration-tests.yml b/.github/workflows/scheduled-integration-tests.yml new file mode 100644 index 0000000..cd6e679 --- /dev/null +++ b/.github/workflows/scheduled-integration-tests.yml @@ -0,0 +1,154 @@ +name: Scheduled Integration Tests + +on: + schedule: + # Run every day at 2 AM UTC + - cron: '0 2 * * *' + workflow_dispatch: # Allow manual triggering + +jobs: + scheduled-integration-tests: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install dependencies + run: | + pip install ".[dev]" + + - name: Run all integration tests + if: secrets.NUTRIENT_API_KEY != '' + env: + NUTRIENT_API_KEY: ${{ secrets.NUTRIENT_API_KEY }} + run: | + echo "Running scheduled integration tests to detect API changes..." + python -m pytest tests/test_integration.py -v --tb=short + timeout-minutes: 20 + continue-on-error: true + id: test-run + + - name: Skip scheduled tests (no API key) + if: secrets.NUTRIENT_API_KEY == '' + run: | + echo "⏭️ Skipping scheduled integration tests - NUTRIENT_API_KEY not available" + echo "Configure NUTRIENT_API_KEY secret to enable scheduled API validation" + + - name: Generate detailed test report + if: always() + run: | + python -m pytest tests/test_integration.py -v --tb=short --junit-xml=scheduled-test-results.xml || true + + # Create summary + echo "## Integration Test Summary" > test-summary.md + echo "Date: $(date)" >> test-summary.md + echo "Status: ${{ steps.test-run.outcome }}" >> test-summary.md + + # Extract test counts if possible + if [ -f scheduled-test-results.xml ]; then + echo "### Test Results" >> test-summary.md + python -c " + import xml.etree.ElementTree as ET + import os + if os.path.exists('scheduled-test-results.xml'): + tree = ET.parse('scheduled-test-results.xml') + root = tree.getroot() + tests = root.get('tests', '0') + 
failures = root.get('failures', '0') + errors = root.get('errors', '0') + skipped = root.get('skipped', '0') + passed = str(int(tests) - int(failures) - int(errors) - int(skipped)) + print(f'- Total Tests: {tests}') + print(f'- Passed: {passed}') + print(f'- Failed: {failures}') + print(f'- Errors: {errors}') + print(f'- Skipped: {skipped}') + " >> test-summary.md + fi + + - name: Create issue if tests fail + if: failure() && steps.test-run.outcome == 'failure' + uses: actions/github-script@v7 + with: + script: | + const date = new Date().toISOString().split('T')[0]; + const title = `🚨 Integration Tests Failed - ${date}`; + + // Check if issue already exists + const issues = await github.rest.issues.listForRepo({ + owner: context.repo.owner, + repo: context.repo.repo, + labels: ['integration-failure', 'automated'], + state: 'open' + }); + + const existingIssue = issues.data.find(issue => issue.title.includes(date)); + + if (!existingIssue) { + await github.rest.issues.create({ + owner: context.repo.owner, + repo: context.repo.repo, + title: title, + body: `## Scheduled Integration tests failed + + The scheduled integration test run has detected failures. This could indicate: + - API changes that need to be addressed + - Service degradation + - Test flakiness + + ### Action Required + 1. Check the [workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for details + 2. Investigate any API changes + 3. Update tests if needed + 4. 
Close this issue once resolved + + ### Test Summary + See the workflow artifacts for detailed test results.`, + labels: ['integration-failure', 'automated', 'high-priority'] + }); + } + + - name: Upload test artifacts + uses: actions/upload-artifact@v4 + if: always() + with: + name: scheduled-integration-results-${{ github.run_number }} + path: | + scheduled-test-results.xml + test-summary.md + retention-days: 30 + + - name: Notify on success after previous failure + if: success() && steps.test-run.outcome == 'success' + uses: actions/github-script@v7 + with: + script: | + // Close any open integration failure issues + const issues = await github.rest.issues.listForRepo({ + owner: context.repo.owner, + repo: context.repo.repo, + labels: ['integration-failure', 'automated'], + state: 'open' + }); + + for (const issue of issues.data) { + await github.rest.issues.update({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issue.number, + state: 'closed' + }); + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issue.number, + body: '✅ **Resolved**: Integration tests are now passing.' + }); + } diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml new file mode 100644 index 0000000..a47e70d --- /dev/null +++ b/.github/workflows/security.yml @@ -0,0 +1,128 @@ +name: Security Checks + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + schedule: + - cron: '0 0 * * 0' # Weekly on Sunday + +jobs: + secret-scanning: + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Run Gitleaks + uses: gitleaks/gitleaks-action@v2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITLEAKS_LICENSE: ${{ secrets.GITLEAKS_LICENSE }} + continue-on-error: true + + - name: Check for hardcoded secrets + run: | + echo "🔍 Scanning for hardcoded secrets..." 
+ + # Check for potential API keys + if grep -r "pdf_live_" --include="*.py" --include="*.json" --exclude-dir=.venv --exclude-dir=__pycache__ --exclude-dir=.pytest_cache . 2>/dev/null; then + echo "❌ Found hardcoded API keys!" + exit 1 + fi + + # Check for base64 encoded secrets (common Nutrient patterns) + if grep -r "cGRmX2xpdmVf" --include="*.py" --include="*.json" --exclude-dir=.venv --exclude-dir=__pycache__ --exclude-dir=.pytest_cache . 2>/dev/null; then + echo "❌ Found base64 encoded API keys!" + exit 1 + fi + + # Check for other common secret patterns + if grep -rE "(sk_|pk_|nutr_sk_)" --include="*.py" --include="*.json" --exclude-dir=.venv --exclude-dir=__pycache__ --exclude-dir=.pytest_cache . 2>/dev/null; then + echo "❌ Found potential secret keys!" + exit 1 + fi + + echo "✅ No hardcoded secrets found" + + dependency-check: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install dependencies + run: | + pip install ".[dev]" + pip install safety bandit + + - name: Run Safety check + run: | + echo "🔍 Running Safety security scan..." + safety check --json --output safety-report.json || echo "⚠️ Safety found issues but continuing..." + + # Display summary if report exists + if [ -f safety-report.json ]; then + echo "Safety report generated - check artifacts for details" + fi + continue-on-error: true + + - name: Run Bandit security linter + run: | + echo "🔍 Running Bandit security linter..." + bandit -r src/ -f json -o bandit-report.json || echo "⚠️ Bandit found issues but continuing..." 
+ + # Display summary + bandit -r src/ --severity-level medium || echo "⚠️ Medium+ severity issues found" + continue-on-error: true + + - name: Upload security scan results + uses: actions/upload-artifact@v4 + if: always() + with: + name: security-reports-${{ github.run_number }} + path: | + safety-report.json + bandit-report.json + retention-days: 30 + + - name: Run pip audit (if available) + run: | + echo "🔍 Running pip audit..." + pip install pip-audit || echo "pip-audit not available" + pip-audit --format=json --output=pip-audit-report.json || echo "⚠️ pip-audit found issues but continuing..." + continue-on-error: true + + code-quality: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install dependencies + run: | + pip install ".[dev]" + + - name: Run additional security checks with ruff + run: | + echo "🔍 Running security-focused linting..." + python -m ruff check . 
--select=S # Security rules + continue-on-error: true diff --git a/.gitignore b/.gitignore index d62bed8..0e553f9 100644 --- a/.gitignore +++ b/.gitignore @@ -154,5 +154,5 @@ openapi_spec.yml .pixi .claude/settings.local.json -# Integration test configuration -tests/integration/integration_config.py +# Example output +examples/output diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4464cc4..6e18f1e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,17 +12,19 @@ repos: - id: debug-statements - id: mixed-line-ending + - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.11 + rev: v0.12.8 hooks: - id: ruff args: [--fix] + files: ^src - id: ruff-format + files: ^src - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.8.0 + rev: v1.17.1 hooks: - id: mypy - additional_dependencies: [types-requests] - args: [--strict, --no-implicit-reexport] - files: ^src/ \ No newline at end of file + additional_dependencies: [types-aiofiles, httpx] + files: ^src diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 216ead5..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,136 +0,0 @@ -# Changelog - -All notable changes to the nutrient-dws Python client library will be documented in this file. - -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - -## [1.0.2] - 2025-01-03 - -### Added - -#### Direct API Methods -- `create_redactions_preset()` - Create redactions using predefined patterns (SSN, email, phone, etc.) 
-- `create_redactions_regex()` - Create redactions using custom regex patterns -- `create_redactions_text()` - Create redactions for specific text strings -- `optimize_pdf()` - Optimize PDF file size and performance -- `password_protect_pdf()` - Add password protection to PDFs -- `set_pdf_metadata()` - Update PDF metadata (title, author, subject, keywords) -- `split_pdf()` - Split PDFs into multiple files based on page ranges -- `duplicate_pdf_pages()` - Duplicate specific pages within a PDF -- `delete_pdf_pages()` - Remove specific pages from a PDF -- `add_page()` - Insert blank pages at specific positions -- `apply_instant_json()` - Apply PSPDFKit Instant JSON annotations -- `apply_xfdf()` - Apply XFDF annotations to PDFs -- `set_page_label()` - Set custom page labels (Roman numerals, letters, etc.) - -#### Enhancements -- Image file support for `watermark_pdf()` method - now accepts PNG/JPEG images as watermarks -- Improved CI/CD integration test strategy with better error reporting -- Enhanced test coverage for all new Direct API methods - -### Fixed -- Critical API compatibility issues in Direct API integration -- Python 3.9 and 3.10 syntax compatibility across the codebase -- Comprehensive CI failure resolution based on multi-model analysis -- Integration test fixes to match actual API behavior patterns -- Ruff linting and formatting issues throughout the project -- MyPy type checking errors and improved type annotations -- Removed unsupported parameters (stroke_width, base_url) from API calls -- Corrected API parameter formats for various operations -- Fixed page range handling in split_pdf with proper defaults -- Resolved runtime errors with isinstance union syntax -- Updated test fixtures to use valid PNG images - -### Changed -- Minimum Python version maintained at 3.10+ as per project design -- Improved error messages for better debugging experience -- Standardized code formatting with ruff across entire codebase - -## [1.0.1] - 2025-06-20 - -### Fixed - 
-#### Critical Bug Fixes -- Fix README.md documentation to use `NutrientTimeoutError` instead of `TimeoutError` -- Resolve inconsistency where code exported `NutrientTimeoutError` but docs referenced `TimeoutError` - -#### Testing Improvements -- Added comprehensive unit tests (31 tests total) -- Added integration test framework for CI -- Improved test stability and coverage - -## [1.0.0] - 2024-06-17 - -### Added - -#### Core Features -- **NutrientClient**: Main client class with support for both Direct API and Builder API patterns -- **Direct API Methods**: Convenient methods for single operations: - - `convert_to_pdf()` - Convert Office documents to PDF (uses implicit conversion) - - `flatten_annotations()` - Flatten PDF annotations and form fields - - `rotate_pages()` - Rotate specific or all pages - - `ocr_pdf()` - Apply OCR to make PDFs searchable - - `watermark_pdf()` - Add text or image watermarks - - `apply_redactions()` - Apply existing redaction annotations - - `merge_pdfs()` - Merge multiple PDFs and Office documents - -- **Builder API**: Fluent interface for chaining multiple operations: - ```python - client.build(input_file="document.docx") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .add_step("ocr-pdf", {"language": "english"}) \ - .execute(output_path="processed.pdf") - ``` - -#### Infrastructure -- **HTTP Client**: - - Connection pooling for performance - - Automatic retry logic with exponential backoff - - Bearer token authentication - - Comprehensive error handling - -- **File Handling**: - - Support for multiple input types (paths, Path objects, bytes, file-like objects) - - Automatic streaming for large files (>10MB) - - Memory-efficient processing - -- **Exception Hierarchy**: - - `NutrientError` - Base exception - - `AuthenticationError` - API key issues - - `APIError` - General API errors with status codes - - `ValidationError` - Request validation failures - - `TimeoutError` - Request timeouts - - `FileProcessingError` - File 
operation failures - -#### Development Tools -- **Testing**: 82 unit tests with 92.46% code coverage -- **Type Safety**: Full mypy type checking support -- **Linting**: Configured with ruff -- **Pre-commit Hooks**: Automated code quality checks -- **CI/CD**: GitHub Actions for testing, linting, and releases -- **Documentation**: Comprehensive README with examples - -### Changed -- Package name updated from `nutrient` to `nutrient-dws` for PyPI -- Source directory renamed from `src/nutrient` to `src/nutrient_dws` -- API endpoint updated to https://api.pspdfkit.com -- Authentication changed from X-Api-Key header to Bearer token - -### Discovered -- **Implicit Document Conversion**: The API automatically converts Office documents (DOCX, XLSX, PPTX) to PDF when processing, eliminating the need for explicit conversion steps - -### Fixed -- Watermark operation now correctly requires width/height parameters -- OCR language codes properly mapped (e.g., "en" → "english") -- All API operations updated to use the Build API endpoint -- Type annotations corrected throughout the codebase - -### Security -- API keys are never logged or exposed -- Support for environment variable configuration -- Secure handling of authentication tokens - -[1.0.2]: https://github.com/PSPDFKit/nutrient-dws-client-python/releases/tag/v1.0.2 -[1.0.1]: https://github.com/PSPDFKit/nutrient-dws-client-python/releases/tag/v1.0.1 -[1.0.0]: https://github.com/PSPDFKit/nutrient-dws-client-python/releases/tag/v1.0.0 \ No newline at end of file diff --git a/DEVELOPMENT_ROADMAP.md b/DEVELOPMENT_ROADMAP.md deleted file mode 100644 index aef5356..0000000 --- a/DEVELOPMENT_ROADMAP.md +++ /dev/null @@ -1,100 +0,0 @@ -# Development Roadmap - Nutrient DWS Python Client - -## 📊 Issue Review & Recommendations - -After reviewing all open issues and analyzing the codebase, here are my recommendations for what to tackle next: - -### 🥇 **Top Priority: Quick Wins (1-2 days each)** - -#### 1. 
**Issue #11: Image Watermark Support** ⭐⭐⭐⭐⭐ -- **Why**: 80% already implemented! Just needs file upload support -- **Current**: Supports `image_url` parameter -- **Add**: `image_file` parameter for local image files -- **Effort**: Very Low - mostly parameter handling -- **Value**: High - common user request - -#### 2. **Issue #10: Multi-Language OCR Support** ⭐⭐⭐⭐ -- **Why**: Small change with big impact -- **Current**: Single language string -- **Add**: Accept `List[str]` for multiple languages -- **Effort**: Low - update parameter handling and validation -- **Value**: High - enables multi-lingual document processing - -### 🥈 **Second Priority: Core Features (3-5 days each)** - -#### 3. **Issue #13: Create Redactions Method** ⭐⭐⭐⭐ -- **Why**: Complements existing `apply_redactions()` -- **Value**: Complete redaction workflow -- **Complexity**: Medium - new API patterns for search strategies -- **Use cases**: Compliance, privacy, legal docs - -#### 4. **Issue #12: Selective Annotation Flattening** ⭐⭐⭐ -- **Why**: Enhancement to existing `flatten_annotations()` -- **Add**: `annotation_ids` parameter -- **Effort**: Low-Medium -- **Value**: More control over flattening - -### 🥉 **Third Priority: High-Value Features (1 week each)** - -#### 5. **Issue #16: Convert to PDF/A** ⭐⭐⭐⭐ -- **Why**: Critical for archival/compliance -- **Value**: Legal requirement for many organizations -- **Complexity**: Medium - new output format handling - -#### 6. 
**Issue #17: Convert PDF to Images** ⭐⭐⭐⭐ -- **Why**: Very common use case -- **Value**: Thumbnails, previews, web display -- **Complexity**: Medium - handle multiple output files - -### 📋 **Issues to Defer** - -- **Issue #20: AI-Powered Redaction** - Requires AI endpoint investigation -- **Issue #21: Digital Signatures** - Complex, needs certificate handling -- **Issue #22: Batch Processing** - Client-side enhancement, do after core features -- **Issue #19: Office Formats** - Lower priority, complex format handling - -### 🎯 **Recommended Implementation Order** - -**Sprint 1 (Week 1):** -1. Image Watermark Support (1 day) -2. Multi-Language OCR (1 day) -3. Selective Annotation Flattening (2 days) - -**Sprint 2 (Week 2):** -4. Create Redactions Method (4 days) - -**Sprint 3 (Week 3):** -5. Convert to PDF/A (3 days) -6. Convert PDF to Images (3 days) - -### 💡 **Why This Order?** - -1. **Quick Wins First**: Build momentum with easy enhancements -2. **Complete Workflows**: Redaction creation completes the redaction workflow -3. **High User Value**: PDF/A and image conversion are frequently requested -4. **Incremental Complexity**: Start simple, build up to more complex features -5. 
**API Coverage**: These 6 features would increase API coverage significantly - -### 📈 **Expected Outcomes** - -After implementing these 6 features: -- **Methods**: 18 total (up from 12) -- **API Coverage**: ~50% (up from ~30%) -- **User Satisfaction**: Address most common feature requests -- **Time**: ~3 weeks of development - -## 🚀 Current Status - -As of the last update: -- **PR #7 (Direct API Methods)**: ✅ Merged - Added 5 new methods -- **PR #23 (OpenAPI Compliance)**: ✅ Merged - Added comprehensive documentation -- **Current Methods**: 12 Direct API methods -- **Test Coverage**: 94% -- **Python Support**: 3.8 - 3.12 - -## 📝 Notes - -- All features should maintain backward compatibility -- Each feature should include comprehensive tests -- Documentation should reference OpenAPI spec where applicable -- Integration tests should be added for each new method \ No newline at end of file diff --git a/GITHUB_ABOUT.md b/GITHUB_ABOUT.md deleted file mode 100644 index e54d040..0000000 --- a/GITHUB_ABOUT.md +++ /dev/null @@ -1,132 +0,0 @@ -# GitHub Repository Settings - -## About Section - -### Description -Official Python client library for Nutrient Document Web Services API - PDF processing, OCR, watermarking, and document manipulation with automatic Office format conversion - -### Website -https://www.nutrient.io/ - -### Topics (Tags) -Add these topics to make your repository more discoverable: - -**Core Technologies:** -- `python` -- `python3` -- `api-client` -- `sdk` -- `rest-api` - -**PDF & Document Processing:** -- `pdf` -- `pdf-processing` -- `pdf-manipulation` -- `pdf-generation` -- `document-processing` -- `document-conversion` -- `document-automation` - -**Features:** -- `ocr` -- `optical-character-recognition` -- `watermark` -- `pdf-merge` -- `pdf-rotation` -- `office-conversion` -- `docx-to-pdf` - -**Brand/Product:** -- `nutrient` -- `pspdfkit` -- `nutrient-api` -- `dws` - -**Development:** -- `type-hints` -- `async-ready` -- `well-tested` -- 
`developer-tools` - -## Recommended Repository Settings - -### ✅ Features to Enable: -- **Issues** - For bug reports and feature requests -- **Discussions** - For Q&A and community support -- **Wiki** - For additional documentation (optional) -- **Projects** - For tracking development roadmap - -### 🔧 Settings: -- **Allow forking** - Enable community contributions -- **Sponsorships** - If you want to accept sponsorships -- **Preserve this repository** - For long-term stability - -### 📌 Pinned Issues: -Consider pinning: -1. "Getting Started Guide" -2. "API Key Request" -3. "Roadmap & Feature Requests" - -### 🏷️ Issue Labels: -Add these custom labels: -- `api-question` - Questions about API usage -- `office-conversion` - Related to DOCX/XLSX/PPTX conversion -- `performance` - Performance-related issues -- `security` - Security-related issues - -### 📋 Issue Templates: -Consider adding templates for: -1. Bug Report -2. Feature Request -3. API Question -4. Documentation Issue - -## Social Preview - -Consider adding a social preview image that shows: -- Nutrient DWS logo -- "Python Client Library" -- Key features (PDF, OCR, Watermark, etc.) -- Code snippet example - -## Repository Insights to Highlight - -### In your README badges: -```markdown - - - - -``` - -### Quick Stats: -- **Language**: Python 100% -- **Test Coverage**: 92.46% -- **Dependencies**: Minimal (just `requests`) -- **Python Support**: 3.8, 3.9, 3.10, 3.11, 3.12 -- **API Operations**: 7 supported operations -- **Development Time**: Rapid implementation with comprehensive testing - -## Suggested Bio/Tagline Options - -1. "🚀 Transform documents at scale with Nutrient's Python SDK - PDF processing made simple" - -2. "📄 Enterprise-ready Python client for Nutrient DWS - Convert, OCR, watermark, and manipulate PDFs with ease" - -3. "🔧 The official Python SDK for Nutrient Document Web Services - Your toolkit for PDF automation" - -4. 
"⚡ Fast, reliable document processing in Python - Powered by Nutrient's cloud API" - -## SEO Keywords for Better Discovery - -Include these naturally in your README: -- Python PDF library -- Document automation API -- PDF OCR Python -- Office to PDF conversion -- PDF watermarking Python -- Document processing SDK -- Nutrient API Python -- Cloud PDF API -- PDF manipulation library -- Enterprise document processing \ No newline at end of file diff --git a/LLM_DOC.md b/LLM_DOC.md new file mode 100644 index 0000000..53f18cb --- /dev/null +++ b/LLM_DOC.md @@ -0,0 +1,1920 @@ +# Nutrient DWS Python Client Documentation + +> Nutrient DWS is a document processing service which provides document processing operations including conversion, merging, compression, watermarking, signage, and text extraction. + +## Authentication + +### Direct API Key + +Provide your API key directly: + +```python +from nutrient_dws import NutrientClient + +client = NutrientClient(api_key='your_api_key') +``` + +### Token Provider + +Use an async token provider to fetch tokens from a secure source: + +```python +import httpx +from nutrient_dws import NutrientClient + +async def get_token(): + async with httpx.AsyncClient() as http_client: + response = await http_client.get('/api/get-nutrient-token') + data = response.json() + return data['token'] + +client = NutrientClient(api_key=get_token) +``` + +## NutrientClient + +The main client for interacting with the Nutrient DWS Processor API. 
+ +### Constructor + +```python +NutrientClient(api_key: str | Callable[[], Awaitable[str] | str], base_url: str | None = None, timeout: int | None = None) +``` + +Parameters: +- `api_key` (required): Your API key string or async function returning a token +- `base_url` (optional): Custom API base URL (defaults to `https://api.nutrient.io`) +- `timeout` (optional): Request timeout in milliseconds + +## Direct Methods + +The client provides numerous async methods for document processing: + +### Account Methods + +#### get_account_info() +Gets account information for the current API key. + +**Returns**: `AccountInfo` - Account information dictionary + +```python +account_info = await client.get_account_info() + +# Access subscription information +print(account_info['subscriptionType']) +``` + +#### create_token(params) +Creates a new authentication token. + +**Parameters**: +- `params: CreateAuthTokenParameters` - Parameters for creating the token + +**Returns**: `CreateAuthTokenResponse` - The created token information + +```python +token = await client.create_token({ + 'expirationTime': 3600 +}) +print(token['id']) + +# Store the token for future use +token_id = token['id'] +token_value = token['accessToken'] +``` + +#### delete_token(id) +Deletes an authentication token. + +**Parameters**: +- `id: str` - ID of the token to delete + +**Returns**: `None` + +```python +await client.delete_token('token-id-123') + +# Example in a token management function +async def revoke_user_token(token_id: str) -> bool: + try: + await client.delete_token(token_id) + print(f'Token {token_id} successfully revoked') + return True + except Exception as error: + print(f'Failed to revoke token: {error}') + return False +``` + +### Document Processing Methods + +#### sign(file, data?, options?) +Signs a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to sign +- `data: CreateDigitalSignature | None` - Signature data (optional) +- `options: SignRequestOptions | None` - Additional options (image, graphicImage) (optional) + +**Returns**: `BufferOutput` - The signed PDF file output + +```python +result = await client.sign('document.pdf', { + 'signatureType': 'cms', + 'flatten': False, + 'cadesLevel': 'b-lt' +}) + +# Access the signed PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('signed-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### create_redactions_ai(file, criteria, redaction_state?, pages?, options?) +Uses AI to redact sensitive information in a document. + +**Parameters**: +- `file: FileInput` - The PDF file to redact +- `criteria: str` - AI redaction criteria +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional pages to redact +- `options: RedactOptions | None` - Optional redaction options + +**Returns**: `BufferOutput` - The redacted document + +```python +# Stage redactions +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all emails' +) + +# Apply redactions immediately +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all PII', + 'apply' +) + +# Redact only specific pages +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all emails', + 'stage', + {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4 +) + +# Redact only the last 3 pages +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all PII', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) + +# Access the redacted PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with 
open('redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### ocr(file, language) +Performs OCR (Optical Character Recognition) on a document. + +**Parameters**: +- `file: FileInput` - The input file to perform OCR on +- `language: OcrLanguage | list[OcrLanguage]` - The language(s) to use for OCR + +**Returns**: `BufferOutput` - The OCR result + +```python +result = await client.ocr('scanned-document.pdf', 'english') + +# Access the OCR-processed PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('ocr-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### watermark_text(file, text, options?) +Adds a text watermark to a document. + +**Parameters**: +- `file: FileInput` - The input file to watermark +- `text: str` - The watermark text +- `options: dict[str, Any] | None` - Watermark options (optional) + +**Returns**: `BufferOutput` - The watermarked document + +```python +result = await client.watermark_text('document.pdf', 'CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 24 +}) + +# Access the watermarked PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('watermarked-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### watermark_image(file, image, options?) +Adds an image watermark to a document. 
+ +**Parameters**: +- `file: FileInput` - The input file to watermark +- `image: FileInput` - The watermark image +- `options: ImageWatermarkActionOptions | None` - Watermark options (optional) + +**Returns**: `BufferOutput` - The watermarked document + +```python +result = await client.watermark_image('document.pdf', 'watermark.jpg', { + 'opacity': 0.5, + 'width': {'value': 50, 'unit': "%"}, + 'height': {'value': 50, 'unit': "%"} +}) + +# Access the watermarked PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('image-watermarked-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### convert(file, target_format) +Converts a document to a different format. + +**Parameters**: +- `file: FileInput` - The input file to convert +- `target_format: OutputFormat` - The target format to convert to + +**Returns**: `BufferOutput | ContentOutput | JsonContentOutput` - The specific output type based on the target format + +```python +# Convert DOCX to PDF +pdf_result = await client.convert('document.docx', 'pdf') +# Supports formats: pdf, pdfa, pdfua, docx, xlsx, pptx, png, jpeg, jpg, webp, html, markdown + +# Access the PDF buffer +pdf_buffer = pdf_result['buffer'] +print(pdf_result['mimeType']) # 'application/pdf' + +# Save the PDF +with open('converted-document.pdf', 'wb') as f: + f.write(pdf_buffer) + +# Convert PDF to image +image_result = await client.convert('document.pdf', 'png') + +# Access the PNG buffer +png_buffer = image_result['buffer'] +print(image_result['mimeType']) # 'image/png' + +# Save the image +with open('document-page.png', 'wb') as f: + f.write(png_buffer) +``` + +#### merge(files) +Merges multiple documents into one. 
+ +**Parameters**: +- `files: list[FileInput]` - The files to merge + +**Returns**: `BufferOutput` - The merged document + +```python +result = await client.merge([ + 'doc1.pdf', + 'doc2.pdf', + 'doc3.pdf' +]) + +# Access the merged PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('merged-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### extract_text(file, pages?) +Extracts text content from a document. + +**Parameters**: +- `file: FileInput` - The file to extract text from +- `pages: PageRange | None` - Optional page range to extract text from + +**Returns**: `JsonContentOutput` - The extracted text data + +```python +result = await client.extract_text('document.pdf') + +# Extract text from specific pages +result = await client.extract_text('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract text from the last page +result = await client.extract_text('document.pdf', {'end': -1}) # Last page + +# Extract text from the second-to-last page to the end +result = await client.extract_text('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted text content +text_content = result['data']['pages'][0]['plainText'] + +# Process the extracted text +word_count = len(text_content.split()) +print(f'Document contains {word_count} words') + +# Search for specific content +if 'confidential' in text_content: + print('Document contains confidential information') +``` + +#### extract_table(file, pages?) +Extracts table content from a document. 
+ +**Parameters**: +- `file: FileInput` - The file to extract tables from +- `pages: PageRange | None` - Optional page range to extract tables from + +**Returns**: `JsonContentOutput` - The extracted table data + +```python +result = await client.extract_table('document.pdf') + +# Extract tables from specific pages +result = await client.extract_table('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract tables from the last page +result = await client.extract_table('document.pdf', {'end': -1}) # Last page + +# Extract tables from the second-to-last page to the end +result = await client.extract_table('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted tables +tables = result['data']['pages'][0]['tables'] + +# Process the first table if available +if tables and len(tables) > 0: + first_table = tables[0] + + # Get table dimensions + print(f"Table has {len(first_table['rows'])} rows and {len(first_table['columns'])} columns") + + # Access table cells + for i in range(len(first_table['rows'])): + for j in range(len(first_table['columns'])): + cell = next((cell for cell in first_table['cells'] + if cell['rowIndex'] == i and cell['columnIndex'] == j), None) + cell_content = cell['text'] if cell else '' + print(f"Cell [{i}][{j}]: {cell_content}") + + # Convert table to CSV + csv_content = '' + for i in range(len(first_table['rows'])): + row_data = [] + for j in range(len(first_table['columns'])): + cell = next((cell for cell in first_table['cells'] + if cell['rowIndex'] == i and cell['columnIndex'] == j), None) + row_data.append(cell['text'] if cell else '') + csv_content += ','.join(row_data) + '\n' + print(csv_content) +``` + +#### extract_key_value_pairs(file, pages?) +Extracts key value pair content from a document. 
+ +**Parameters**: +- `file: FileInput` - The file to extract KVPs from +- `pages: PageRange | None` - Optional page range to extract KVPs from + +**Returns**: `JsonContentOutput` - The extracted KVPs data + +```python +result = await client.extract_key_value_pairs('document.pdf') + +# Extract KVPs from specific pages +result = await client.extract_key_value_pairs('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract KVPs from the last page +result = await client.extract_key_value_pairs('document.pdf', {'end': -1}) # Last page + +# Extract KVPs from the second-to-last page to the end +result = await client.extract_key_value_pairs('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted key-value pairs +kvps = result['data']['pages'][0]['keyValuePairs'] + +# Process the key-value pairs +if kvps and len(kvps) > 0: + # Iterate through all key-value pairs + for index, kvp in enumerate(kvps): + print(f'KVP {index + 1}:') + print(f' Key: {kvp["key"]}') + print(f' Value: {kvp["value"]}') + print(f' Confidence: {kvp["confidence"]}') + + # Create a dictionary from the key-value pairs + dictionary = {} + for kvp in kvps: + dictionary[kvp['key']] = kvp['value'] + + # Look up specific values + print(f'Invoice Number: {dictionary.get("Invoice Number")}') + print(f'Date: {dictionary.get("Date")}') + print(f'Total Amount: {dictionary.get("Total")}') +``` + +#### flatten(file, annotation_ids?) +Flattens annotations in a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to flatten +- `annotation_ids: list[str | int] | None` - Optional specific annotation IDs to flatten + +**Returns**: `BufferOutput` - The flattened document + +```python +# Flatten all annotations +result = await client.flatten('annotated-document.pdf') + +# Flatten specific annotations by ID +result = await client.flatten('annotated-document.pdf', ['annotation1', 'annotation2']) +``` + +#### password_protect(file, user_password, owner_password, permissions?) +Password protects a PDF document. + +**Parameters**: +- `file: FileInput` - The file to protect +- `user_password: str` - Password required to open the document +- `owner_password: str` - Password required to modify the document +- `permissions: list[PDFUserPermission] | None` - Optional list of permissions granted when opened with user password + +**Returns**: `BufferOutput` - The password-protected document + +```python +result = await client.password_protect('document.pdf', 'user123', 'owner456') + +# Or with specific permissions: +result = await client.password_protect('document.pdf', 'user123', 'owner456', + ['printing', 'extract_accessibility']) + +# Access the password-protected PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('protected-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### set_metadata(file, metadata) +Sets metadata for a PDF document. + +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `metadata: Metadata` - The metadata to set (title and/or author) + +**Returns**: `BufferOutput` - The document with updated metadata + +```python +result = await client.set_metadata('document.pdf', { + 'title': 'My Document', + 'author': 'John Doe' +}) +``` + +#### set_page_labels(file, labels) +Sets page labels for a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `labels: list[Label]` - List of label objects with pages and label properties + +**Returns**: `BufferOutput` - The document with updated page labels + +```python +result = await client.set_page_labels('document.pdf', [ + {'pages': [0, 1, 2], 'label': 'Cover'}, + {'pages': [3, 4, 5], 'label': 'Chapter 1'} +]) + +# Access the updated PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('labeled-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### apply_instant_json(file, instant_json_file) +Applies Instant JSON to a document. + +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `instant_json_file: FileInput` - The Instant JSON file to apply + +**Returns**: `BufferOutput` - The modified document + +```python +result = await client.apply_instant_json('document.pdf', 'annotations.json') + +# Access the modified PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('annotated-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### apply_xfdf(file, xfdf_file, options?) +Applies XFDF to a document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `xfdf_file: FileInput` - The XFDF file to apply +- `options: ApplyXfdfActionOptions | None` - Optional settings for applying XFDF + +**Returns**: `BufferOutput` - The modified document + +```python +result = await client.apply_xfdf('document.pdf', 'annotations.xfdf') + +# Or with options: +result = await client.apply_xfdf( + 'document.pdf', 'annotations.xfdf', + {'ignorePageRotation': True, 'richTextEnabled': False} +) + +# Access the modified PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('xfdf-applied-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### create_redactions_preset(file, preset, redaction_state?, pages?, preset_options?, options?) +Creates redaction annotations based on a preset pattern. + +**Parameters**: +- `file: FileInput` - The PDF file to create redactions in +- `preset: SearchPreset` - The preset pattern to search for (e.g., 'email-address', 'social-security-number') +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional page range to create redactions in +- `preset_options: CreateRedactionsStrategyOptionsPreset | None` - Optional settings for the preset strategy +- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions + +**Returns**: `BufferOutput` - The document with redaction annotations + +```python +result = await client.create_redactions_preset('document.pdf', 'email-address') + +# With specific pages +result = await client.create_redactions_preset( + 'document.pdf', + 'email-address', + 'stage', + {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4 +) + +# With the last 3 pages +result = await client.create_redactions_preset( + 'document.pdf', + 'email-address', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) + 
+# Access the document with redactions +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### create_redactions_regex(file, regex, redaction_state?, pages?, regex_options?, options?) +Creates redaction annotations based on a regular expression. + +**Parameters**: +- `file: FileInput` - The PDF file to create redactions in +- `regex: str` - The regular expression to search for +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional page range to create redactions in +- `regex_options: CreateRedactionsStrategyOptionsRegex | None` - Optional settings for the regex strategy +- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions + +**Returns**: `BufferOutput` - The document with redaction annotations + +```python +result = await client.create_redactions_regex('document.pdf', r'Account:\s*\d{8,12}') + +# With specific pages +result = await client.create_redactions_regex( + 'document.pdf', + r'Account:\s*\d{8,12}', + 'stage', + {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4 +) + +# With the last 3 pages +result = await client.create_redactions_regex( + 'document.pdf', + r'Account:\s*\d{8,12}', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) + +# Access the document with redactions +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('regex-redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### create_redactions_text(file, text, redaction_state?, pages?, text_options?, options?) +Creates redaction annotations based on text. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to create redactions in +- `text: str` - The text to search for +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional page range to create redactions in +- `text_options: CreateRedactionsStrategyOptionsText | None` - Optional settings for the text strategy +- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions + +**Returns**: `BufferOutput` - The document with redaction annotations + +```python +result = await client.create_redactions_text('document.pdf', 'email@example.com') + +# With specific pages and options +result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage', + {'start': 0, 'end': 4}, # Pages 0, 1, 2, 3, 4 + {'caseSensitive': False, 'includeAnnotations': True} +) + +# Create redactions on the last 3 pages +result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) + +# Access the document with redactions +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('text-redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### apply_redactions(file) +Applies redaction annotations in a document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file with redaction annotations to apply + +**Returns**: `BufferOutput` - The document with applied redactions + +```python +# Stage redactions from a createRedaction Method: +staged_result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage' +) + +result = await client.apply_redactions(staged_result['buffer']) + +# Access the final redacted document +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('final-redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### rotate(file, angle, pages?) +Rotates pages in a document. + +**Parameters**: +- `file: FileInput` - The PDF file to rotate +- `angle: Literal[90, 180, 270]` - Rotation angle (90, 180, or 270 degrees) +- `pages: PageRange | None` - Optional page range to rotate + +**Returns**: `BufferOutput` - The entire document with specified pages rotated + +```python +result = await client.rotate('document.pdf', 90) + +# Rotate specific pages: +result = await client.rotate('document.pdf', 90, {'start': 1, 'end': 3}) # Pages 1, 2, 3 + +# Rotate the last page: +result = await client.rotate('document.pdf', 90, {'end': -1}) # Last page + +# Rotate from page 2 to the second-to-last page: +result = await client.rotate('document.pdf', 90, {'start': 2, 'end': -2}) + +# Access the rotated PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('rotated-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### add_page(file, count?, index?) +Adds blank pages to a document. + +**Parameters**: +- `file: FileInput` - The PDF file to add pages to +- `count: int` - The number of blank pages to add (default: 1) +- `index: int | None` - Optional index where to add the blank pages (0-based). 
If not provided, pages are added at the end. + +**Returns**: `BufferOutput` - The document with added pages + +```python +# Add 2 blank pages at the end +result = await client.add_page('document.pdf', 2) + +# Add 1 blank page after the first page (at index 1) +result = await client.add_page('document.pdf', 1, 1) + +# Access the document with added pages +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('document-with-pages.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### optimize(file, options?) +Optimizes a PDF document for size reduction. + +**Parameters**: +- `file: FileInput` - The PDF file to optimize +- `options: OptimizePdf | None` - Optimization options + +**Returns**: `BufferOutput` - The optimized document + +```python +result = await client.optimize('large-document.pdf', { + 'grayscaleImages': True, + 'mrcCompression': True, + 'imageOptimizationQuality': 2 +}) + +# Access the optimized PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('optimized-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### split(file, page_ranges) +Splits a PDF document into multiple parts based on page ranges. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to split +- `page_ranges: list[PageRange]` - List of page ranges to extract + +**Returns**: `list[BufferOutput]` - A list of PDF documents, one for each page range + +```python +results = await client.split('document.pdf', [ + {'start': 0, 'end': 2}, # Pages 0, 1, 2 + {'start': 3, 'end': 5} # Pages 3, 4, 5 +]) + +# Split using negative indices +results = await client.split('document.pdf', [ + {'start': 0, 'end': 2}, # First three pages + {'start': 3, 'end': -3}, # Middle pages + {'start': -2, 'end': -1} # Last two pages +]) + +# Process each resulting PDF +for i, result in enumerate(results): + # Access the PDF buffer + pdf_buffer = result['buffer'] + + # Get the MIME type of the output + print(result['mimeType']) # 'application/pdf' + + # Save the buffer to a file + with open(f'split-part-{i}.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### duplicate_pages(file, page_indices) +Creates a new PDF containing only the specified pages in the order provided. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to extract pages from +- `page_indices: list[int]` - List of page indices to include in the new PDF (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + +**Returns**: `BufferOutput` - A new document with only the specified pages + +```python +# Create a new PDF with only the first and third pages +result = await client.duplicate_pages('document.pdf', [0, 2]) + +# Create a new PDF with pages in a different order +result = await client.duplicate_pages('document.pdf', [2, 0, 1]) + +# Create a new PDF with duplicated pages +result = await client.duplicate_pages('document.pdf', [0, 0, 1, 1, 0]) + +# Create a new PDF with the first and last pages +result = await client.duplicate_pages('document.pdf', [0, -1]) + +# Create a new PDF with the last three pages in reverse order +result = await client.duplicate_pages('document.pdf', [-1, -2, -3]) + +# Access the PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('duplicated-pages.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### delete_pages(file, page_indices) +Deletes pages from a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `page_indices: list[int]` - List of page indices to delete (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + +**Returns**: `BufferOutput` - The document with deleted pages + +```python +# Delete second and fourth pages +result = await client.delete_pages('document.pdf', [1, 3]) + +# Delete the last page +result = await client.delete_pages('document.pdf', [-1]) + +# Delete the first and last two pages +result = await client.delete_pages('document.pdf', [0, -1, -2]) + +# Access the modified PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('modified-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +### Error Handling + +The library provides a comprehensive error hierarchy: + +```python +from nutrient_dws import ( + NutrientError, + ValidationError, + APIError, + AuthenticationError, + NetworkError +) + +try: + result = await client.convert('file.docx', 'pdf') +except ValidationError as error: + # Invalid input parameters + print(f'Invalid input: {error.message} - Details: {error.details}') +except AuthenticationError as error: + # Authentication failed + print(f'Auth error: {error.message} - Status: {error.status_code}') +except APIError as error: + # API returned an error + print(f'API error: {error.message} - Status: {error.status_code} - Details: {error.details}') +except NetworkError as error: + # Network request failed + print(f'Network error: {error.message} - Details: {error.details}') +``` + +## Workflow Methods + +The Nutrient DWS Python Client uses a fluent builder pattern with staged interfaces to create document processing workflows. This architecture provides several benefits: + +1. **Type Safety**: The staged interface ensures that methods are only available at appropriate stages +2. 
**Readability**: Method chaining creates readable, declarative code +3. **Discoverability**: IDE auto-completion guides you through the workflow stages +4. **Flexibility**: Complex workflows can be built with simple, composable pieces + +### Stage 0: Create Workflow + +You have several ways of creating a workflow + +```python +# Creating Workflow from a client +workflow = client.workflow() + +# Override the client timeout +workflow = client.workflow(60000) + +# Create a workflow without a client +from nutrient_dws.builder.builder import StagedWorkflowBuilder +workflow = StagedWorkflowBuilder({ + 'apiKey': 'your-api-key' +}) +``` + +### Stage 1: Add Parts + +In this stage, you add document parts to the workflow: + +```python +workflow = (client.workflow() + .add_file_part('document.pdf') + .add_file_part('appendix.pdf')) +``` + +Available methods: + +#### `add_file_part(file, options?, actions?)` +Adds a file part to the workflow. + +**Parameters:** +- `file: FileInput` - The file to add to the workflow. Can be a local file path, bytes, or file-like object. +- `options: FilePartOptions | None` - Additional options for the file part (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the file part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Add a PDF file from a local path +workflow.add_file_part('/path/to/document.pdf') + +# Add a file with options and actions +workflow.add_file_part( + '/path/to/document.pdf', + {'pages': {'start': 1, 'end': 3}}, + [BuildActions.watermark_text('CONFIDENTIAL')] +) +``` + +#### `add_html_part(html, assets?, options?, actions?)` +Adds an HTML part to the workflow. + +**Parameters:** +- `html: FileInput` - The HTML content to add. Can be a file path, bytes, or file-like object. +- `assets: list[FileInput] | None` - Optional list of assets (CSS, images, etc.) to include with the HTML. 
Only local files or bytes are supported (optional) +- `options: HTMLPartOptions | None` - Additional options for the HTML part (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the HTML part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Add HTML content from a file +workflow.add_html_part('/path/to/content.html') + +# Add HTML with assets and options +workflow.add_html_part( + '/path/to/content.html', + ['/path/to/style.css', '/path/to/image.png'], + {'layout': {'size': 'A4'}} +) +``` + +#### `add_new_page(options?, actions?)` +Adds a new blank page to the workflow. + +**Parameters:** +- `options: NewPagePartOptions | None` - Additional options for the new page, such as page size, orientation, etc. (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the new page (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Add a simple blank page +workflow.add_new_page() + +# Add a new page with specific options +workflow.add_new_page({ + 'layout': {'size': 'A4', 'orientation': 'portrait'} +}) +``` + +#### `add_document_part(document_id, options?, actions?)` +Adds a document part to the workflow by referencing an existing document by ID. + +**Parameters:** +- `document_id: str` - The ID of the document to add to the workflow. +- `options: DocumentPartOptions | None` - Additional options for the document part (optional) + - `options['layer']: str` - Optional layer name to select a specific layer from the document. +- `actions: list[BuildAction] | None` - Actions to apply to the document part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. 
+ +**Example:** +```python +# Add a document by ID +workflow.add_document_part('doc_12345abcde') + +# Add a document with a specific layer and options +workflow.add_document_part( + 'doc_12345abcde', + { + 'layer': 'content', + 'pages': {'start': 0, 'end': 3} + } +) +``` + +### Stage 2: Apply Actions (Optional) + +In this stage, you can apply actions to the document: + +```python +workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 48 +})) +``` + +Available methods: + +#### `apply_action(action)` +Applies a single action to the workflow. + +**Parameters:** +- `action: BuildAction` - The action to apply to the workflow. + +**Returns:** `WorkflowWithActionsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Apply a watermark action +workflow.apply_action( + BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.3, + 'rotation': 45 + }) +) + +# Apply an OCR action +workflow.apply_action(BuildActions.ocr('english')) +``` + +#### `apply_actions(actions)` +Applies multiple actions to the workflow. + +**Parameters:** +- `actions: list[BuildAction]` - A list of actions to apply to the workflow. + +**Returns:** `WorkflowWithActionsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Apply multiple actions to the workflow +workflow.apply_actions([ + BuildActions.watermark_text('DRAFT', {'opacity': 0.5}), + BuildActions.ocr('english'), + BuildActions.flatten() +]) +``` + +#### Action Types: + +#### Document Processing + +##### `BuildActions.ocr(language)` +Creates an OCR (Optical Character Recognition) action to extract text from images or scanned documents. + +**Parameters:** +- `language: str | list[str]` - Language(s) for OCR. Can be a single language or a list of languages. 
+ +**Example:** +```python +# Basic OCR with English language +workflow.apply_action(BuildActions.ocr('english')) + +# OCR with multiple languages +workflow.apply_action(BuildActions.ocr(['english', 'french', 'german'])) + +# OCR with options (via dict syntax) +workflow.apply_action(BuildActions.ocr({ + 'language': 'english', + 'enhanceResolution': True +})) +``` + +##### `BuildActions.rotate(rotate_by)` +Creates an action to rotate pages in the document. + +**Parameters:** +- `rotate_by: Literal[90, 180, 270]` - Rotation angle in degrees (must be 90, 180, or 270). + +**Example:** +```python +# Rotate pages by 90 degrees +workflow.apply_action(BuildActions.rotate(90)) + +# Rotate pages by 180 degrees +workflow.apply_action(BuildActions.rotate(180)) +``` + +##### `BuildActions.flatten(annotation_ids?)` +Creates an action to flatten annotations into the document content, making them non-interactive but permanently visible. + +**Parameters:** +- `annotation_ids: list[str | int] | None` - Optional list of annotation IDs to flatten. If not specified, all annotations will be flattened (optional) + +**Example:** +```python +# Flatten all annotations +workflow.apply_action(BuildActions.flatten()) + +# Flatten specific annotations +workflow.apply_action(BuildActions.flatten(['annotation1', 'annotation2'])) +``` + +#### Watermarking + +##### `BuildActions.watermark_text(text, options?)` +Creates an action to add a text watermark to the document. + +**Parameters:** +- `text: str` - Watermark text content. +- `options: TextWatermarkActionOptions | None` - Watermark options (optional): + - `width`: Width dimension of the watermark (dict with 'value' and 'unit', e.g. 
`{'value': 100, 'unit': '%'}`) + - `height`: Height dimension of the watermark (dict with 'value' and 'unit') + - `top`, `right`, `bottom`, `left`: Position of the watermark (dict with 'value' and 'unit') + - `rotation`: Rotation of the watermark in counterclockwise degrees (default: 0) + - `opacity`: Watermark opacity (0 is fully transparent, 1 is fully opaque) + - `fontFamily`: Font family for the text (e.g. 'Helvetica') + - `fontSize`: Size of the text in points + - `fontColor`: Foreground color of the text (e.g. '#ffffff') + - `fontStyle`: Text style list (['bold'], ['italic'], or ['bold', 'italic']) + +**Example:** + +```python +# Simple text watermark +workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL')) + +# Customized text watermark +workflow.apply_action(BuildActions.watermark_text('DRAFT', { + 'opacity': 0.5, + 'rotation': 45, + 'fontSize': 36, + 'fontColor': '#FF0000', + 'fontStyle': ['bold', 'italic'] +})) +``` + +##### `BuildActions.watermark_image(image, options?)` +Creates an action to add an image watermark to the document. + +**Parameters:** +- `image: FileInput` - Watermark image (file path, bytes, or file-like object). +- `options: ImageWatermarkActionOptions | None` - Watermark options (optional): + - `width`: Width dimension of the watermark (dict with 'value' and 'unit', e.g. 
`{'value': 100, 'unit': '%'}`) + - `height`: Height dimension of the watermark (dict with 'value' and 'unit') + - `top`, `right`, `bottom`, `left`: Position of the watermark (dict with 'value' and 'unit') + - `rotation`: Rotation of the watermark in counterclockwise degrees (default: 0) + - `opacity`: Watermark opacity (0 is fully transparent, 1 is fully opaque) + +**Example:** + +```python +# Simple image watermark +workflow.apply_action(BuildActions.watermark_image('/path/to/logo.png')) + +# Customized image watermark +workflow.apply_action(BuildActions.watermark_image('/path/to/logo.png', { + 'opacity': 0.3, + 'width': {'value': 50, 'unit': '%'}, + 'height': {'value': 50, 'unit': '%'}, + 'top': {'value': 10, 'unit': 'px'}, + 'left': {'value': 10, 'unit': 'px'}, + 'rotation': 0 +})) +``` + +#### Annotations + +##### `BuildActions.apply_instant_json(file)` +Creates an action to apply annotations from an Instant JSON file to the document. + +**Parameters:** +- `file: FileInput` - Instant JSON file input (file path, bytes, or file-like object). + +**Example:** + +```python +# Apply annotations from Instant JSON file +workflow.apply_action(BuildActions.apply_instant_json('/path/to/annotations.json')) +``` + +##### `BuildActions.apply_xfdf(file, options?)` +Creates an action to apply annotations from an XFDF file to the document. + +**Parameters:** +- `file: FileInput` - XFDF file input (file path, bytes, or file-like object). +- `options: ApplyXfdfActionOptions | None` - Apply XFDF options (optional): + - `ignorePageRotation: bool` - If True, ignores page rotation when applying XFDF data (default: False) + - `richTextEnabled: bool` - If True, plain text annotations will be converted to rich text annotations. 
If False, all text annotations will be plain text annotations (default: True) + +**Example:** + +```python +# Apply annotations from XFDF file with default options +workflow.apply_action(BuildActions.apply_xfdf('/path/to/annotations.xfdf')) + +# Apply annotations with specific options +workflow.apply_action(BuildActions.apply_xfdf('/path/to/annotations.xfdf', { + 'ignorePageRotation': True, + 'richTextEnabled': False +})) +``` + +#### Redactions + +##### `BuildActions.create_redactions_text(text, options?, strategy_options?)` +Creates an action to add redaction annotations based on text search. + +**Parameters:** +- `text: str` - Text to search and redact. +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) +- `strategy_options: CreateRedactionsStrategyOptionsText | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided text (default: True) + - `caseSensitive: bool` - If True, the search will be case sensitive (default: False) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for all occurrences of "Confidential" +workflow.apply_action(BuildActions.create_redactions_text('Confidential')) + +# Create redactions with custom appearance and search options +workflow.apply_action(BuildActions.create_redactions_text('Confidential', + { + 'content': { + 'backgroundColor': '#000000', + 'overlayText': 'REDACTED', + 'textColor': '#FFFFFF' + } + }, + { + 'caseSensitive': True, + 'start': 2, + 'limit': 5 + } + )) +``` + +##### `BuildActions.create_redactions_regex(regex, options?, strategy_options?)` +Creates an action to 
add redaction annotations based on regex pattern matching. + +**Parameters:** +- `regex: str` - Regex pattern to search and redact. +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) +- `strategy_options: CreateRedactionsStrategyOptionsRegex | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided regex (default: True) + - `caseSensitive: bool` - If True, the search will be case sensitive (default: True) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for email addresses +workflow.apply_action(BuildActions.create_redactions_regex(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')) + +# Create redactions with custom appearance and search options +workflow.apply_action(BuildActions.create_redactions_regex(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', + { + 'content': { + 'backgroundColor': '#FF0000', + 'overlayText': 'EMAIL REDACTED' + } + }, + { + 'caseSensitive': False, + 'start': 0, + 'limit': 10 + } + )) +``` + +##### `BuildActions.create_redactions_preset(preset, options?, strategy_options?)` +Creates an action to add redaction annotations based on a preset pattern. + +**Parameters:** +- `preset: str` - Preset pattern to search and redact (e.g. 'email-address', 'credit-card-number', 'social-security-number', etc.) +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) 
+- `strategy_options: CreateRedactionsStrategyOptionsPreset | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided preset (default: True) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for email addresses using preset +workflow.apply_action(BuildActions.create_redactions_preset('email-address')) + +# Create redactions for credit card numbers with custom appearance +workflow.apply_action(BuildActions.create_redactions_preset('credit-card-number', + { + 'content': { + 'backgroundColor': '#000000', + 'overlayText': 'FINANCIAL DATA' + } + }, + { + 'start': 0, + 'limit': 5 + } + )) +``` + +##### `BuildActions.apply_redactions()` +Creates an action to apply previously created redaction annotations, permanently removing the redacted content. + +**Example:** + +```python +# First create redactions +workflow.apply_action(BuildActions.create_redactions_preset('email-address')) + +# Then apply them +workflow.apply_action(BuildActions.apply_redactions()) +``` + +### Stage 3: Set Output Format + +In this stage, you specify the desired output format: + +```python +workflow.output_pdf({ + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 2 + } +}) +``` + +Available methods: + +#### `output_pdf(options?)` +Sets the output format to PDF. + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF output, such as compression, encryption, etc. (optional) + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. + - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. 
+ - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF with default options +workflow.output_pdf() + +# Set output format to PDF with specific options +workflow.output_pdf({ + 'user_password': 'secret', + 'user_permissions': ["printing"], + 'metadata': { + 'title': 'Important Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 3 + } +}) +``` + +#### `output_pdfa(options?)` +Sets the output format to PDF/A (archival PDF). + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF/A output (optional): + - `options['conformance']: str` - The PDF/A conformance level to target. Options include 'pdfa-1b', 'pdfa-1a', 'pdfa-2b', 'pdfa-2a', 'pdfa-3b', 'pdfa-3a'. + Different levels have different requirements for long-term archiving. + - `options['vectorization']: bool` - When True, attempts to convert raster content to vector graphics where possible, improving quality and reducing file size. 
+ - `options['rasterization']: bool` - When True, converts vector graphics to raster images, which can help with compatibility in some cases. + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. + - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. + - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF/A with default options +workflow.output_pdfa() + +# Set output format to PDF/A with specific options +workflow.output_pdfa({ + 'conformance': 'pdfa-2b', + 'vectorization': True, + 'metadata': { + 'title': 'Archive Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True + } +}) +``` + +#### `output_pdfua(options?)` +Sets the output format to PDF/UA (Universal Accessibility). + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF/UA output (optional): + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. 
+ - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. + - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF/UA with default options +workflow.output_pdfua() + +# Set output format to PDF/UA with specific options +workflow.output_pdfua({ + 'metadata': { + 'title': 'Accessible Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 3 + } +}) +``` + +#### `output_image(format, options?)` +Sets the output format to an image format (PNG, JPEG, WEBP). + +**Parameters:** +- `format: Literal['png', 'jpeg', 'jpg', 'webp']` - The image format to output. + - PNG: Lossless compression, supports transparency, best for graphics and screenshots + - JPEG/JPG: Lossy compression, smaller file size, best for photographs + - WEBP: Modern format with both lossy and lossless compression, good for web use +- `options: dict[str, Any] | None` - Additional options for image output, such as resolution, quality, etc. 
(optional) + **Note: At least one of options['width'], options['height'], or options['dpi'] must be specified.** + - `options['pages']: dict[str, int]` - Specifies which pages to convert to images. If omitted, all pages are converted. + - `options['pages']['start']: int` - The first page to convert (0-based index). + - `options['pages']['end']: int` - The last page to convert (0-based index). + - `options['width']: int` - The width of the output image in pixels. If specified without height, aspect ratio is maintained. + - `options['height']: int` - The height of the output image in pixels. If specified without width, aspect ratio is maintained. + - `options['dpi']: int` - The resolution in dots per inch. Higher values create larger, more detailed images. + Common values: 72 (web), 150 (standard), 300 (print quality), 600 (high quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PNG with dpi specified +workflow.output_image('png', {'dpi': 300}) + +# Set output format to JPEG with specific options +workflow.output_image('jpeg', { + 'dpi': 300, + 'pages': {'start': 1, 'end': 3} +}) + +# Set output format to WEBP with specific dimensions +workflow.output_image('webp', { + 'width': 1200, + 'height': 800, + 'dpi': 150 +}) +``` + +#### `output_office(format)` +Sets the output format to an Office document format (DOCX, XLSX, PPTX). + +**Parameters:** +- `format: Literal['docx', 'xlsx', 'pptx']` - The Office format to output ('docx' for Word, 'xlsx' for Excel, or 'pptx' for PowerPoint). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. 
+ +**Example:** +```python +# Set output format to Word document (DOCX) +workflow.output_office('docx') + +# Set output format to Excel spreadsheet (XLSX) +workflow.output_office('xlsx') + +# Set output format to PowerPoint presentation (PPTX) +workflow.output_office('pptx') +``` + +#### `output_html(layout)` +Sets the output format to HTML. + +**Parameters:** +- `layout: Literal['page', 'reflow']` - The layout type to use for conversion to HTML: + - 'page' layout keeps the original structure of the document, segmented by page. + - 'reflow' layout converts the document into a continuous flow of text, without page breaks. + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to HTML +workflow.output_html('page') +``` + +#### `output_markdown()` +Sets the output format to Markdown. + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to Markdown with default options +workflow.output_markdown() +``` + +#### `output_json(options?)` +Sets the output format to JSON content. + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for JSON output (optional): + - `options['plainText']: bool` - When True, extracts plain text content from the document and includes it in the JSON output. + This provides the raw text without structural information. + - `options['structuredText']: bool` - When True, extracts text with structural information (paragraphs, headings, etc.) + and includes it in the JSON output. + - `options['keyValuePairs']: bool` - When True, attempts to identify and extract key-value pairs from the document + (like form fields, labeled data, etc.) and includes them in the JSON output. + - `options['tables']: bool` - When True, attempts to identify and extract tabular data from the document + and includes it in the JSON output as structured table objects. 
+
+    - `options['language']: str | list[str]` - Specifies the language(s) of the document content for better text extraction.
+      Can be a single language code or a list of language codes for multi-language documents.
+      Examples: "english", "french", "german", or ["english", "spanish"].
+
+**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining.
+
+**Example:**
+```python
+# Set output format to JSON with default options
+workflow.output_json()
+
+# Set output format to JSON with specific options
+workflow.output_json({
+    'plainText': True,
+    'structuredText': True,
+    'keyValuePairs': True,
+    'tables': True,
+    'language': "english"
+})
+
+# Set output format to JSON with multiple languages
+workflow.output_json({
+    'plainText': True,
+    'tables': True,
+    'language': ["english", "french", "german"]
+})
+```
+
+### Stage 4: Execute or Dry Run
+
+In this final stage, you execute the workflow or perform a dry run:
+
+```python
+result = await workflow.execute()
+```
+
+Available methods:
+
+#### `execute(on_progress?)`
+Executes the workflow and returns the result.
+
+**Parameters:**
+- `on_progress: Callable[[int, int], None] | None` - Callback for progress updates (optional).
+
+**Returns:** `TypedWorkflowResult` - The workflow result.
+
+**Example:**
+```python
+# Execute the workflow with default options
+result = await workflow.execute()
+
+# Execute with progress tracking
+def progress_callback(current: int, total: int) -> None:
+    print(f'Processing step {current} of {total}')
+
+result = await workflow.execute(on_progress=progress_callback)
+```
+
+#### `dry_run()`
+Performs a dry run of the workflow without generating the final output. This is useful for validating the workflow configuration and estimating processing time.
+
+**Returns:** `WorkflowDryRunResult` - The dry run result, containing validation information and estimated processing time. 
+
+**Example:**
+```python
+# Perform a dry run with default options
+dry_run_result = await (workflow
+    .add_file_part('/path/to/document.pdf')
+    .output_pdf()
+    .dry_run())
+```
+
+### Workflow Examples
+
+#### Basic Document Conversion
+
+```python
+result = await (client
+    .workflow()
+    .add_file_part('document.docx')
+    .output_pdf()
+    .execute())
+```
+
+#### Document Merging with Watermark
+
+```python
+result = await (client
+    .workflow()
+    .add_file_part('document1.pdf')
+    .add_file_part('document2.pdf')
+    .apply_action(BuildActions.watermark_text('CONFIDENTIAL', {
+    'opacity': 0.5,
+    'fontSize': 48
+}))
+    .output_pdf()
+    .execute())
+```
+
+#### OCR with Language Selection
+
+```python
+result = await (client
+    .workflow()
+    .add_file_part('scanned-document.pdf')
+    .apply_action(BuildActions.ocr({
+        'language': 'english',
+        'enhanceResolution': True
+    }))
+    .output_pdf()
+    .execute())
+```
+
+#### HTML to PDF Conversion
+
+```python
+result = await (client
+    .workflow()
+    .add_html_part('index.html', None, {
+        'layout': {
+            'size': 'A4',
+            'margin': {
+                'top': 50,
+                'bottom': 50,
+                'left': 50,
+                'right': 50
+            }
+        }
+    })
+    .output_pdf()
+    .execute())
+```
+
+#### Complex Multi-step Workflow
+
+```python
+def progress_callback(current: int, total: int) -> None:
+    print(f'Processing step {current} of {total}')
+
+
+result = await (client
+    .workflow()
+    .add_file_part('document.pdf', {'pages': {'start': 0, 'end': 5}})
+    .add_file_part('appendix.pdf')
+    .apply_actions([
+    BuildActions.ocr({'language': 'english'}),
+    BuildActions.watermark_text('CONFIDENTIAL'),
+    BuildActions.create_redactions_preset('email-address'),
+    BuildActions.apply_redactions()
+])
+    .output_pdfa({
+    'conformance': 'pdfa-2b',
+    'optimize': {
+        'mrcCompression': True
+    }
+})
+    .execute(on_progress=progress_callback))
+```
+
+### Staged Workflow Builder
+
+For more complex scenarios where you need to build workflows dynamically, you can use the staged workflow builder:
+
+```python
+# Create a staged workflow
+workflow = 
client.workflow() + +# Add parts +workflow.add_file_part('document.pdf') + +# Conditionally add more parts +if include_appendix: + workflow.add_file_part('appendix.pdf') + +# Conditionally apply actions +if needs_watermark: + workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL')) + +# Set output format based on user preference +if output_format == 'pdf': + workflow.output_pdf() +elif output_format == 'docx': + workflow.output_office('docx') +else: + workflow.output_image('png') + +# Execute the workflow +result = await workflow.execute() +``` + +### Error Handling in Workflows + +Workflows provide detailed error information: + +```python +try: + result = await (client + .workflow() + .add_file_part('document.pdf') + .output_pdf() + .execute()) + + if not result['success']: + # Handle workflow errors + for error in result.get('errors', []): + print(f"Step {error['step']}: {error['error']['message']}") +except Exception as error: + # Handle unexpected errors + print(f'Workflow execution failed: {error}') +``` + +### Workflow Result Structure + +The result of a workflow execution includes: + +```python +from typing import TypedDict, Any, List, Optional, Union + +class WorkflowError(TypedDict): + step: str + error: dict[str, Any] + +class BufferOutput(TypedDict): + mimeType: str + filename: str + buffer: bytes + +class ContentOutput(TypedDict): + mimeType: str + filename: str + content: str + +class JsonContentOutput(TypedDict): + mimeType: str + filename: str + data: Any + +class WorkflowResult(TypedDict): + # Overall success status + success: bool + + # Output data (if successful) + output: Optional[Union[BufferOutput, ContentOutput, JsonContentOutput]] + + # Error information (if failed) + errors: Optional[List[WorkflowError]] +``` + +### Performance Considerations + +For optimal performance with workflows: + +1. **Minimize the number of parts**: Combine related files when possible +2. 
**Use appropriate output formats**: Choose formats based on your needs +3. **Consider dry runs**: Use `dry_run()` to estimate resource usage +4. **Monitor progress**: Use the `on_progress` callback for long-running workflows +5. **Handle large files**: For very large files, consider splitting into smaller workflows diff --git a/README.md b/README.md index 3bf020a..b12a086 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,26 @@ # Nutrient DWS Python Client -[](https://www.python.org/downloads/) -[](https://github.com/jdrhyne/nutrient-dws-client-python/actions) +[](https://badge.fury.io/py/nutrient-dws) +[](https://github.com/PSPDFKit/nutrient-dws-client-python/actions/workflows/ci.yml) +[](https://github.com/PSPDFKit/nutrient-dws-client-python/actions/workflows/integration-tests.yml) [](https://opensource.org/licenses/MIT) -[](https://github.com/psf/black) -[](https://pypi.org/project/nutrient-dws/) -A Python client library for the [Nutrient Document Web Services (DWS) API](https://www.nutrient.io/). This library provides a Pythonic interface to interact with Nutrient's document processing services, supporting both Direct API calls and Builder API workflows. +A Python client library for [Nutrient Document Web Services (DWS) API](https://nutrient.io/). This library provides a fully async, type-safe, and ergonomic interface for document processing operations including conversion, merging, compression, watermarking, OCR, and text extraction. + +> **Note**: This package is published as `nutrient-dws` on PyPI. The package provides full type support and is designed for async Python environments (Python 3.10+). 
## Features -- 🚀 **Two API styles**: Direct API for single operations, Builder API for complex workflows -- 📄 **Comprehensive document tools**: Convert, merge, rotate, OCR, watermark, and more -- 🔄 **Automatic retries**: Built-in retry logic for transient failures -- 📁 **Flexible file handling**: Support for file paths, bytes, and file-like objects -- 🔒 **Type-safe**: Full type hints for better IDE support -- ⚡ **Streaming support**: Memory-efficient processing of large files -- 🧪 **Well-tested**: Comprehensive test suite with high coverage +- 📄 **Powerful document processing**: Convert, OCR, edit, compress, watermark, redact, and digitally sign documents +- 🤖 **LLM friendly**: Built-in support for popular Coding Agents (Claude Code, GitHub Copilot, JetBrains Junie, Cursor, Windsurf) with auto-generated rules +- 🔄 **100% mapping with DWS Processor API**: Complete coverage of all Nutrient DWS Processor API capabilities +- 🛠️ **Convenient functions with sane defaults**: Simple interfaces for common operations with smart default settings +- ⛓️ **Chainable operations**: Build complex document workflows with intuitive method chaining +- 🚀 **Fully async**: Built from the ground up with async/await support for optimal performance +- 🔐 **Flexible authentication and security**: Support for API keys and async token providers with secure handling +- ✅ **Highly tested**: Comprehensive test suite ensuring reliability and stability +- 🔒 **Type-safe**: Full type annotations with comprehensive type definitions +- 🐍 **Pythonic**: Follows Python conventions and best practices ## Installation @@ -24,305 +28,219 @@ A Python client library for the [Nutrient Document Web Services (DWS) API](https pip install nutrient-dws ``` -## Quick Start - -```python -from nutrient_dws import NutrientClient - -# Initialize the client -client = NutrientClient(api_key="your-api-key") -# Direct API - Flatten PDF annotations -client.flatten_annotations( - input_file="document.pdf", - 
output_path="flattened.pdf" -) +## Integration with Coding Agents -# Builder API - Chain multiple operations -client.build(input_file="document.pdf") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .add_step("ocr-pdf", {"language": "en"}) \ - .add_step("watermark-pdf", {"text": "CONFIDENTIAL"}) \ - .execute(output_path="processed.pdf") -``` +This package has built-in support with popular coding agents like Claude Code, GitHub Copilot, Cursor, and Windsurf by exposing scripts that will inject rules instructing the coding agents on how to use the package. This ensures that the coding agent doesn't hallucinate documentation, as well as making full use of all the features offered in Nutrient DWS Python Client. -## Authentication +```bash +# Adding code rule to Claude Code +dws-add-claude-code-rule -The client supports API key authentication through multiple methods: +# Adding code rule to GitHub Copilot +dws-add-github-copilot-rule -```python -# 1. Pass directly to client -client = NutrientClient(api_key="your-api-key") +# Adding code rule to Junie (Jetbrains) +dws-add-junie-rule -# 2. Set environment variable -# export NUTRIENT_API_KEY=your-api-key -client = NutrientClient() # Will use env variable +# Adding code rule to Cursor +dws-add-cursor-rule -# 3. 
Use context manager for automatic cleanup -with NutrientClient(api_key="your-api-key") as client: - client.convert_to_pdf("document.docx") +# Adding code rule to Windsurf +dws-add-windsurf-rule ``` -## Direct API Examples +The documentation for Nutrient DWS Python Client is also available on [Context7](https://context7.com/pspdfkit/nutrient-dws-client-python) -### Flatten Annotations +## Quick Start ```python -# Flatten all annotations and form fields -client.flatten_annotations( - input_file="form.pdf", - output_path="flattened.pdf" -) -``` - -### Merge PDFs +from nutrient_dws import NutrientClient -```python -# Merge multiple PDFs -client.merge_pdfs( - input_files=["doc1.pdf", "doc2.pdf", "doc3.pdf"], - output_path="merged.pdf" -) +client = NutrientClient(api_key='your_api_key') ``` -### OCR PDF - -```python -# Add OCR layer to scanned PDF -client.ocr_pdf( - input_file="scanned.pdf", - output_path="searchable.pdf", - language="en" -) -``` +## Direct Methods -### Rotate Pages +The client provides numerous async methods for document processing: ```python -# Rotate all pages -client.rotate_pages( - input_file="document.pdf", - output_path="rotated.pdf", - degrees=180 -) - -# Rotate specific pages -client.rotate_pages( - input_file="document.pdf", - output_path="rotated.pdf", - degrees=90, - page_indexes=[0, 2, 4] # Pages 1, 3, and 5 -) -``` - -### Watermark PDF +import asyncio +from nutrient_dws import NutrientClient -```python -# Add text watermark (width/height required) -client.watermark_pdf( - input_file="document.pdf", - output_path="watermarked.pdf", - text="DRAFT", - width=200, - height=100, - opacity=0.5, - position="center" -) +async def main(): + client = NutrientClient(api_key='your_api_key') -# Add image watermark from URL -client.watermark_pdf( - input_file="document.pdf", - output_path="watermarked.pdf", - image_url="https://example.com/logo.png", - width=150, - height=75, - opacity=0.8, - position="bottom-right" -) + # Convert a document + pdf_result 
= await client.convert('document.docx', 'pdf') -# Add image watermark from local file (NEW!) -client.watermark_pdf( - input_file="document.pdf", - output_path="watermarked.pdf", - image_file="logo.png", # Can be path, bytes, or file-like object - width=150, - height=75, - opacity=0.8, - position="bottom-right" -) -``` + # Extract text + text_result = await client.extract_text('document.pdf') -## Builder API Examples + # Add a watermark + watermarked_doc = await client.watermark_text('document.pdf', 'CONFIDENTIAL') -The Builder API allows you to chain multiple operations in a single workflow: + # Merge multiple documents + merged_pdf = await client.merge(['doc1.pdf', 'doc2.pdf', 'doc3.pdf']) -```python -# Complex document processing pipeline -result = client.build(input_file="raw-scan.pdf") \ - .add_step("ocr-pdf", {"language": "en"}) \ - .add_step("rotate-pages", {"degrees": -90, "page_indexes": [0]}) \ - .add_step("watermark-pdf", { - "text": "PROCESSED", - "opacity": 0.3, - "position": "top-right" - }) \ - .add_step("flatten-annotations") \ - .set_output_options( - metadata={"title": "Processed Document", "author": "DWS Client"}, - optimize=True - ) \ - .execute(output_path="final.pdf") - -# Using image file in builder API -result = client.build(input_file="document.pdf") \ - .add_step("watermark-pdf", { - "image_file": "company-logo.png", # Local file - "width": 100, - "height": 50, - "opacity": 0.5, - "position": "bottom-left" - }) \ - .execute() +asyncio.run(main()) ``` -## File Input Options - -The library supports multiple ways to provide input files: - -```python -# File path (string or Path object) -client.convert_to_pdf("document.docx") -client.convert_to_pdf(Path("document.docx")) +For a complete list of available methods with examples, see the [Methods Documentation](docs/METHODS.md). 
-# Bytes -with open("document.docx", "rb") as f: - file_bytes = f.read() -client.convert_to_pdf(file_bytes) +## Workflow System -# File-like object -with open("document.docx", "rb") as f: - client.convert_to_pdf(f) +The client also provides a fluent builder pattern with staged interfaces to create document processing workflows: -# URL (for supported operations) -client.import_from_url("https://example.com/document.pdf") -``` +```python +from nutrient_dws.builder.constant import BuildActions + +async def main(): + client = NutrientClient(api_key='your_api_key') + + result = await (client + .workflow() + .add_file_part('document.pdf') + .add_file_part('appendix.pdf') + .apply_action(BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 48 + })) + .output_pdf({ + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 2 + } + }) + .execute()) + +asyncio.run(main()) +``` + +The workflow system follows a staged approach: +1. Add document parts (files, HTML, pages) +2. Apply actions (optional) +3. Set output format +4. Execute or perform a dry run + +For detailed information about the workflow system, including examples and best practices, see the [Workflow Documentation](docs/WORKFLOW.md). 
## Error Handling -The library provides specific exceptions for different error scenarios: +The library provides a comprehensive error hierarchy: ```python from nutrient_dws import ( + NutrientClient, NutrientError, - AuthenticationError, - APIError, ValidationError, - TimeoutError, - FileProcessingError + APIError, + AuthenticationError, + NetworkError ) -try: - client.convert_to_pdf("document.docx") -except AuthenticationError: - print("Invalid API key") -except ValidationError as e: - print(f"Invalid parameters: {e.errors}") -except APIError as e: - print(f"API error: {e.status_code} - {e.message}") -except TimeoutError: - print("Request timed out") -except FileProcessingError as e: - print(f"File processing failed: {e}") -``` - -## Advanced Configuration +async def main(): + client = NutrientClient(api_key='your_api_key') -### Custom Timeout + try: + result = await client.convert('file.docx', 'pdf') + except ValidationError as error: + # Invalid input parameters + print(f'Invalid input: {error.message} - Details: {error.details}') + except AuthenticationError as error: + # Authentication failed + print(f'Auth error: {error.message} - Status: {error.status_code}') + except APIError as error: + # API returned an error + print(f'API error: {error.message} - Status: {error.status_code} - Details: {error.details}') + except NetworkError as error: + # Network request failed + print(f'Network error: {error.message} - Details: {error.details}') -```python -# Set timeout to 10 minutes for large files -client = NutrientClient(api_key="your-api-key", timeout=600) +asyncio.run(main()) ``` -### Streaming Large Files - -Files larger than 10MB are automatically streamed to avoid memory issues: - -```python -# This will stream the file instead of loading it into memory -client.flatten_annotations("large-document.pdf") -``` +## Testing -## Available Operations +The library includes comprehensive unit and integration tests: -### PDF Manipulation -- `merge_pdfs` - Merge multiple 
PDFs into one -- `rotate_pages` - Rotate PDF pages (all or specific pages) -- `flatten_annotations` - Flatten form fields and annotations +```bash +# Run all tests +python -m pytest -### PDF Enhancement -- `ocr_pdf` - Add searchable text layer (English and German) -- `watermark_pdf` - Add text or image watermarks +# Run with coverage report +python -m pytest --cov=nutrient_dws --cov-report=html -### PDF Security -- `apply_redactions` - Apply existing redaction annotations +# Run only unit tests +python -m pytest tests/unit/ -### Builder API -The Builder API allows chaining multiple operations: -```python -client.build(input_file="document.pdf") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .add_step("ocr-pdf", {"language": "english"}) \ - .add_step("watermark-pdf", {"text": "DRAFT", "width": 200, "height": 100}) \ - .execute(output_path="processed.pdf") +# Run integration tests (requires API key) +NUTRIENT_API_KEY=your_key python -m pytest tests/test_integration.py ``` -Note: See [SUPPORTED_OPERATIONS.md](SUPPORTED_OPERATIONS.md) for detailed documentation of all supported operations and their parameters. +The library maintains high test coverage across all API methods, including: +- Unit tests for all public methods +- Integration tests for real API interactions +- Type checking with mypy ## Development -### Setup +For development, install the package in development mode: ```bash # Clone the repository -git clone https://github.com/jdrhyne/nutrient-dws-client-python.git +git clone https://github.com/PSPDFKit/nutrient-dws-client-python.git cd nutrient-dws-client-python # Install in development mode pip install -e ".[dev]" -# Run tests -pytest +# Run type checking +mypy src/ # Run linting -ruff check . +ruff check src/ -# Run type checking -mypy src tests +# Run formatting +ruff format src/ ``` -### Running Tests +## Contributing -```bash -# Run all tests -pytest +We welcome contributions to improve the library! 
Please follow our development standards to ensure code quality and maintainability. -# Run with coverage -pytest --cov=nutrient --cov-report=html +Quick start for contributors: -# Run specific test file -pytest tests/unit/test_client.py -``` +1. Clone and setup the repository +2. Make changes following atomic commit practices +3. Use conventional commits for clear change history +4. Include appropriate tests for new features +5. Ensure type checking passes with mypy +6. Follow Python code style with ruff -## Contributing +For detailed contribution guidelines, see the [Contributing Guide](docs/CONTRIBUTING.md). + +## Project Structure -Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change. +``` +src/ +├── nutrient_dws/ +│ ├── builder/ # Builder classes and constants +│ ├── generated/ # Generated type definitions +│ ├── types/ # Type definitions +│ ├── client.py # Main NutrientClient class +│ ├── errors.py # Error classes +│ ├── http.py # HTTP layer +│ ├── inputs.py # Input handling +│ ├── workflow.py # Workflow factory +│ └── __init__.py # Public exports +├── scripts/ # CLI scripts for coding agents +└── tests/ # Test files +``` -1. Fork the repository -2. Create your feature branch (`git checkout -b feature/amazing-feature`) -3. Commit your changes (`git commit -m 'Add some amazing feature'`) -4. Push to the branch (`git push origin feature/amazing-feature`) -5. Open a Pull Request +## Python Version Support + +This library supports Python 3.10 and higher. The async-first design requires modern Python features for optimal performance and type safety. 
## License @@ -330,6 +248,6 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file ## Support -- 📧 Email: support@nutrient.io -- 📚 Documentation: https://www.nutrient.io/docs/ -- 🐛 Issues: https://github.com/jdrhyne/nutrient-dws-client-python/issues \ No newline at end of file +For issues and feature requests, please use the [GitHub issue tracker](https://github.com/PSPDFKit/nutrient-dws-client-python/issues). + +For questions about the Nutrient DWS Processor API, refer to the [official documentation](https://nutrient.io/docs/). diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md deleted file mode 100644 index b916875..0000000 --- a/RELEASE_NOTES.md +++ /dev/null @@ -1,118 +0,0 @@ -# Release Notes - v1.0.0 - -**Release Date**: June 17, 2024 - -We are excited to announce the first release of the official Python client library for Nutrient Document Web Services (DWS) API! This library provides a comprehensive, Pythonic interface for document processing operations including PDF manipulation, OCR, watermarking, and more. - -## 🎉 Highlights - -### Dual API Design -The library offers two complementary ways to interact with the Nutrient API: - -1. **Direct API** - Simple method calls for single operations -2. **Builder API** - Fluent interface for complex, multi-step workflows - -### Automatic Office Document Conversion -A major discovery during development: the Nutrient API automatically converts Office documents (DOCX, XLSX, PPTX) to PDF when processing them. 
This means you can: -- Apply any PDF operation directly to Office documents -- Mix PDFs and Office documents in merge operations -- Skip explicit conversion steps in your workflows - -### Enterprise-Ready Features -- **Robust Error Handling**: Comprehensive exception hierarchy for different error scenarios -- **Automatic Retries**: Built-in retry logic for transient failures -- **Connection Pooling**: Optimized performance for multiple requests -- **Large File Support**: Automatic streaming for files over 10MB -- **Type Safety**: Full type hints for better IDE support - -## 📦 Installation - -```bash -pip install nutrient-dws -``` - -## 🚀 Quick Start - -```python -from nutrient_dws import NutrientClient - -# Initialize client -client = NutrientClient(api_key="your-api-key") - -# Direct API - Single operation -client.rotate_pages("document.pdf", output_path="rotated.pdf", degrees=90) - -# Convert Office document to PDF (automatic!) -client.convert_to_pdf("report.docx", output_path="report.pdf") - -# Builder API - Complex workflow -client.build(input_file="scan.pdf") \ - .add_step("ocr-pdf", {"language": "english"}) \ - .add_step("watermark-pdf", {"text": "CONFIDENTIAL"}) \ - .add_step("flatten-annotations") \ - .execute(output_path="processed.pdf") - -# Merge PDFs and Office documents together -client.merge_pdfs([ - "chapter1.pdf", - "chapter2.docx", - "appendix.xlsx" -], output_path="complete_document.pdf") -``` - -## 🔧 Supported Operations - -- **convert_to_pdf** - Convert Office documents to PDF -- **flatten_annotations** - Flatten form fields and annotations -- **rotate_pages** - Rotate specific or all pages -- **ocr_pdf** - Make scanned PDFs searchable (English & German) -- **watermark_pdf** - Add text or image watermarks -- **apply_redactions** - Apply redaction annotations -- **merge_pdfs** - Combine multiple documents - -## 🛡️ Error Handling - -The library provides specific exceptions for different error scenarios: - -```python -from nutrient_dws import 
NutrientClient, AuthenticationError, ValidationError - -try: - client = NutrientClient(api_key="your-api-key") - result = client.ocr_pdf("scan.pdf") -except AuthenticationError: - print("Invalid API key") -except ValidationError as e: - print(f"Invalid parameters: {e.errors}") -``` - -## 📚 Documentation - -- [README](https://github.com/jdrhyne/nutrient-dws-client-python/blob/main/README.md) - Getting started guide -- [SUPPORTED_OPERATIONS](https://github.com/jdrhyne/nutrient-dws-client-python/blob/main/SUPPORTED_OPERATIONS.md) - Detailed operation documentation -- [API Reference](https://nutrient-dws-client-python.readthedocs.io) - Coming soon! - -## 🧪 Quality Assurance - -- **Test Coverage**: 92.46% with 82 unit tests -- **Type Checking**: Full mypy compliance -- **Code Quality**: Enforced with ruff and pre-commit hooks -- **CI/CD**: Automated testing on Python 3.8-3.12 - -## 🤝 Contributing - -We welcome contributions! Please see our [Contributing Guidelines](https://github.com/jdrhyne/nutrient-dws-client-python/blob/main/CONTRIBUTING.md) for details. - -## 📝 License - -This project is licensed under the MIT License. - -## 🙏 Acknowledgments - -Special thanks to the Nutrient team for their excellent API and documentation. - ---- - -**Note**: This is the initial release. We're actively working on additional features including more language support for OCR, additional file format support, and performance optimizations. Stay tuned! - -For questions or support, please [open an issue](https://github.com/jdrhyne/nutrient-dws-client-python/issues). \ No newline at end of file diff --git a/RELEASE_PROCESS.md b/RELEASE_PROCESS.md deleted file mode 100644 index a977ce5..0000000 --- a/RELEASE_PROCESS.md +++ /dev/null @@ -1,66 +0,0 @@ -# Release Process - -This document describes how to release a new version of nutrient-dws to PyPI using GitHub's trusted publishing. - -## Prerequisites - -1. PyPI account with maintainer access to nutrient-dws -2. 
GitHub repository configured as a trusted publisher on PyPI -3. Write access to the GitHub repository - -## Automatic Release Process (Recommended) - -### For New Releases - -1. Update version in `pyproject.toml` -2. Update `CHANGELOG.md` with release notes -3. Commit changes: `git commit -m "chore: prepare release v1.0.x"` -4. Create and push tag: `git tag v1.0.x && git push origin v1.0.x` -5. Create GitHub release: - - Go to https://github.com/PSPDFKit/nutrient-dws-client-python/releases/new - - Select the tag you just created - - Add release notes - - Click "Publish release" -6. The `Release` workflow will automatically trigger and upload to PyPI - -### For Existing Tags (like v1.0.2) - -1. Go to Actions tab in GitHub -2. Select "Publish Existing Tag to PyPI" workflow -3. Click "Run workflow" -4. Enter the tag name (e.g., `v1.0.2`) -5. Click "Run workflow" -6. Monitor the workflow progress - -## Manual Trigger - -You can also manually trigger the release workflow: -1. Go to Actions tab -2. Select "Release" workflow -3. Click "Run workflow" -4. Select branch/tag and run - -## Verification - -After publishing: -1. Check PyPI: https://pypi.org/project/nutrient-dws/ -2. Test installation: `pip install nutrient-dws==1.0.x` -3. 
Verify the GitHub release page shows the release - -## Troubleshooting - -### Trusted Publisher Issues -- Ensure the GitHub repository is configured as a trusted publisher on PyPI -- Check that the workflow has `id-token: write` permission -- Verify the PyPI project name matches exactly - -### Build Issues -- Ensure `pyproject.toml` is valid -- Check that all required files are present -- Verify Python version compatibility - -## Security Notes - -- No API tokens or passwords are needed with trusted publishing -- GitHub Actions uses OIDC to authenticate with PyPI -- This is more secure than storing PyPI tokens as secrets \ No newline at end of file diff --git a/SPECIFICATION.md b/SPECIFICATION.md deleted file mode 100644 index b67cb13..0000000 --- a/SPECIFICATION.md +++ /dev/null @@ -1,170 +0,0 @@ -# Software Design Specification: Nutrient DWS Python Client -Version: 1.2 -Date: December 19, 2024 - -## 1. Introduction -### 1.1. Project Overview -This document outlines the software design specification for a new Python client library for the Nutrient Document Web Services (DWS) API. The goal of this project is to create a high-quality, lightweight, and intuitive Python package that simplifies interaction with the Nutrient DWS API for developers. - -The library will provide two primary modes of interaction: -- A **Direct API** for executing single, discrete document processing tasks (e.g., converting a single file, rotating a page). -- A **Builder API** that offers a fluent, chainable interface for composing and executing complex, multi-step document processing workflows, abstracting the `POST /build` endpoint of the Nutrient API. - -The final product will be a distributable package suitable for publishing on PyPI, with comprehensive documentation. The design prioritizes ease of use, adherence to Python best practices, and clear documentation consumable by both humans and LLMs. - -### 1.2. 
Scope -This specification covers the implemented Python client library: -- Client authentication and configuration -- Direct API methods for common document operations -- Builder API for multi-step workflows -- Comprehensive error handling with custom exceptions -- Optimized file input/output handling -- Standard Python package structure - -Out of scope: -- Command-line interface (CLI) -- Asynchronous operations (all calls are synchronous) -- Non-Python implementations - -### 1.3. References -- **Nutrient DWS OpenAPI Specification**: https://dashboard.nutrient.io/assets/specs/public@1.9.0-dfc6ec1c1d008be3dcb81a72be6346b5.yml -- **Nutrient DWS API Documentation**: https://www.nutrient.io/api/reference/public/ -- **Nutrient DWS List of Tools**: https://www.nutrient.io/api/tools-overview/ -- **Target API Endpoint**: https://api.pspdfkit.com - -## 2. Goals and Objectives -- **Simplicity**: Clean, Pythonic interface abstracting HTTP requests, authentication, and file handling -- **Flexibility**: Direct API for single operations and Builder API for complex workflows -- **Lightweight**: Single external dependency on `requests` library -- **Performance**: Optimized file handling with streaming for large files (>10MB) -- **Distribution-Ready**: Standard Python package structure with `pyproject.toml` - -## 3. High-Level Architecture -The library is architected around a central `NutrientClient` class, which is the main entry point for all interactions. - -### 3.1. Core Components -**NutrientClient (The Main Client):** -- Handles initialization and configuration, including a configurable timeout for API calls. -- Manages the API key for authentication. All outgoing requests will include the `X-Api-Key` header. -- Contains static methods for the Direct API (e.g., `client.rotate_pages(...)`), which are derived from the OpenAPI specification. -- Acts as a factory for the Builder API via the `client.build()` method. 
- -**Direct API (Static Methods):** -- A collection of methods attached directly to the `NutrientClient` object. -- Each method corresponds to a specific tool available in the OpenAPI specification (e.g., `ocr_pdf`, `rotate_pages`). -- These methods abstract the `POST /process/{tool}` endpoint. They handle file preparation, making the request, and returning the processed file. - -**BuildAPIWrapper (Builder API):** -- A separate class, instantiated via `client.build()`. -- Implements the Builder design pattern with a fluent, chainable interface. -- The `execute()` method compiles the workflow into a `multipart/form-data` request for the `POST /build` endpoint, including a JSON part for actions and the necessary file parts. - -### 3.2. Data Flow -**Direct API Call:** -1. User calls method like `client.rotate_pages(input_file='path/to/doc.pdf', degrees=90)` -2. Method internally uses Builder API with single step -3. File is processed via `/build` endpoint -4. Returns processed file bytes or saves to `output_path` - -**Builder API Call:** -1. User chains operations: `client.build(input_file='doc.docx').add_step(tool='rotate-pages', options={'degrees': 90})` -2. `execute()` sends `multipart/form-data` request to `/build` endpoint -3. Returns processed file bytes or saves to `output_path` - -## 4. API Design -### 4.1. Client Initialization -```python -from nutrient_dws import NutrientClient, AuthenticationError - -# API key from parameter (takes precedence) or NUTRIENT_API_KEY env var -client = NutrientClient(api_key="YOUR_DWS_API_KEY", timeout=300) - -# Context manager support -with NutrientClient() as client: - result = client.convert_to_pdf("document.docx") -``` - -- **API Key**: Parameter takes precedence over `NUTRIENT_API_KEY` environment variable -- **Timeout**: Default 300 seconds, configurable per client -- **Error Handling**: `AuthenticationError` raised on first API call if key invalid - -### 4.2. 
File Handling -**Input Types**: -- `str` or `Path` for local file paths -- `bytes` objects -- File-like objects (`io.IOBase`) - -**Output Behavior**: -- Returns `bytes` by default -- Saves to `output_path` and returns `None` when path provided -- Large files (>10MB) use streaming to optimize memory usage - -### 4.3. Direct API Design -Method names are snake_case versions of operations. Tool-specific parameters are keyword-only arguments. - -**Example Usage:** -```python -# User Story: Convert a DOCX to PDF and rotate it. - -# Step 1: Convert DOCX to PDF -pdf_bytes = client.convert_to_pdf( - input_file="path/to/document.docx" -) - -# Step 2: Rotate the newly created PDF from memory -client.rotate_pages( - input_file=pdf_bytes, - output_path="path/to/rotated_document.pdf", - degrees=90 # keyword-only argument -) - -print("File saved to path/to/rotated_document.pdf") -``` - -### 4.4. Builder API Design -Fluent interface for multi-step workflows with single API call: - -- `client.build(input_file)`: Starts workflow -- `.add_step(tool, options=None)`: Adds processing step -- `.execute(output_path=None)`: Executes workflow -- `.set_output_options(**options)`: Sets output metadata/optimization - -**Example Usage:** -```python -from nutrient_dws import APIError - -# User Story: Convert a DOCX to PDF and rotate it (Builder version) -try: - client.build(input_file="path/to/document.docx") \ - .add_step(tool="rotate-pages", options={"degrees": 90}) \ - .execute(output_path="path/to/final_document.pdf") - - print("Workflow complete. File saved to path/to/final_document.pdf") - -except APIError as e: - print(f"An API error occurred: Status {e.status_code}, Response: {e.response_body}") -``` - -### 4.5. Error Handling -The library provides a comprehensive set of custom exceptions for clear error feedback: - -- `NutrientError(Exception)`: The base exception for all library-specific errors. 
-- `AuthenticationError(NutrientError)`: Raised on 401/403 HTTP errors, indicating an invalid or missing API key. -- `APIError(NutrientError)`: Raised for general API errors (e.g., 400, 422, 5xx status codes). Contains `status_code`, `response_body`, and optional `request_id` attributes. -- `ValidationError(NutrientError)`: Raised when request validation fails, with optional `errors` dictionary. -- `NutrientTimeoutError(NutrientError)`: Raised when requests timeout. -- `FileProcessingError(NutrientError)`: Raised when file processing operations fail. -- `FileNotFoundError` (Built-in): Standard Python exception for missing file paths. - -## 5. Implementation Details - -### 5.1. Package Structure -- **Layout**: Standard `src` layout with `nutrient_dws` package -- **Configuration**: `pyproject.toml` for project metadata and dependencies -- **Dependencies**: `requests` as sole runtime dependency -- **Versioning**: Semantic versioning starting at `1.0.0` - -### 5.2. File Handling Optimizations -- **Large Files**: Files >10MB are streamed rather than loaded into memory -- **Input Types**: Support for `str` paths, `bytes`, `Path` objects, and file-like objects -- **Output**: Returns `bytes` by default, or saves to `output_path` when provided diff --git a/SUPPORTED_OPERATIONS.md b/SUPPORTED_OPERATIONS.md deleted file mode 100644 index a86395c..0000000 --- a/SUPPORTED_OPERATIONS.md +++ /dev/null @@ -1,386 +0,0 @@ -# Supported Operations - -This document lists all operations currently supported by the Nutrient DWS API through this Python client. - -## 🎯 Important Discovery: Implicit Document Conversion - -The Nutrient DWS API automatically converts Office documents (DOCX, XLSX, PPTX) to PDF when processing them. This means: - -- **No explicit conversion needed** - Just pass your Office documents to any method -- **All methods accept Office documents** - `rotate_pages()`, `ocr_pdf()`, etc. 
work with DOCX files -- **Seamless operation chaining** - Convert and process in one API call - -### Example: -```python -# This automatically converts DOCX to PDF and rotates it! -client.rotate_pages("document.docx", degrees=90) - -# Merge PDFs and Office documents together -client.merge_pdfs(["file1.pdf", "file2.docx", "spreadsheet.xlsx"]) -``` - -## Direct API Methods - -The following methods are available on the `NutrientClient` instance: - -### 1. `convert_to_pdf(input_file, output_path=None)` -Converts Office documents to PDF format using implicit conversion. - -**Parameters:** -- `input_file`: Office document (DOCX, XLSX, PPTX) -- `output_path`: Optional path to save output - -**Example:** -```python -# Convert DOCX to PDF -client.convert_to_pdf("document.docx", "document.pdf") - -# Convert and get bytes -pdf_bytes = client.convert_to_pdf("spreadsheet.xlsx") -``` - -**Note:** HTML files are not currently supported. - -### 2. `flatten_annotations(input_file, output_path=None)` -Flattens all annotations and form fields in a PDF, converting them to static page content. - -**Parameters:** -- `input_file`: PDF or Office document -- `output_path`: Optional path to save output - -**Example:** -```python -client.flatten_annotations("document.pdf", "flattened.pdf") -# Works with Office docs too! -client.flatten_annotations("form.docx", "flattened.pdf") -``` - -### 3. `rotate_pages(input_file, output_path=None, degrees=0, page_indexes=None)` -Rotates pages in a PDF or converts Office document to PDF and rotates. - -**Parameters:** -- `input_file`: PDF or Office document -- `output_path`: Optional output path -- `degrees`: Rotation angle (90, 180, 270, or -90) -- `page_indexes`: Optional list of page indexes to rotate (0-based) - -**Example:** -```python -# Rotate all pages 90 degrees -client.rotate_pages("document.pdf", "rotated.pdf", degrees=90) - -# Works with Office documents too! 
-client.rotate_pages("presentation.pptx", "rotated.pdf", degrees=180) - -# Rotate specific pages -client.rotate_pages("document.pdf", "rotated.pdf", degrees=180, page_indexes=[0, 2]) -``` - -### 4. `ocr_pdf(input_file, output_path=None, language="english")` -Applies OCR to make a PDF searchable. Converts Office documents to PDF first if needed. - -**Parameters:** -- `input_file`: PDF or Office document -- `output_path`: Optional output path -- `language`: OCR language - supported values: - - `"english"` or `"eng"` - English - - `"deu"` or `"german"` - German - -**Example:** -```python -client.ocr_pdf("scanned.pdf", "searchable.pdf", language="english") -# Convert DOCX to searchable PDF -client.ocr_pdf("document.docx", "searchable.pdf", language="eng") -``` - -### 5. `watermark_pdf(input_file, output_path=None, text=None, image_url=None, width=200, height=100, opacity=1.0, position="center")` -Adds a watermark to all pages of a PDF. Converts Office documents to PDF first if needed. - -**Parameters:** -- `input_file`: PDF or Office document -- `output_path`: Optional output path -- `text`: Text for watermark (either text or image_url required) -- `image_url`: URL of image for watermark -- `width`: Width in points (required) -- `height`: Height in points (required) -- `opacity`: Opacity from 0.0 to 1.0 -- `position`: One of: "top-left", "top-center", "top-right", "center", "bottom-left", "bottom-center", "bottom-right" - -**Example:** -```python -# Text watermark -client.watermark_pdf( - "document.pdf", - "watermarked.pdf", - text="CONFIDENTIAL", - width=300, - height=150, - opacity=0.5, - position="center" -) -``` - -### 6. `apply_redactions(input_file, output_path=None)` -Applies redaction annotations to permanently remove content. Converts Office documents to PDF first if needed. 
- -**Parameters:** -- `input_file`: PDF or Office document with redaction annotations -- `output_path`: Optional output path - -**Example:** -```python -client.apply_redactions("document_with_redactions.pdf", "redacted.pdf") -``` - -### 7. `merge_pdfs(input_files, output_path=None)` -Merges multiple files into one PDF. Automatically converts Office documents to PDF before merging. - -**Parameters:** -- `input_files`: List of files to merge (PDFs and/or Office documents) -- `output_path`: Optional output path - -**Example:** -```python -# Merge PDFs only -client.merge_pdfs( - ["document1.pdf", "document2.pdf", "document3.pdf"], - "merged.pdf" -) - -# Mix PDFs and Office documents - they'll be converted automatically! -client.merge_pdfs( - ["report.pdf", "spreadsheet.xlsx", "presentation.pptx"], - "combined.pdf" -) -``` - -### 8. `split_pdf(input_file, page_ranges=None, output_paths=None)` -Splits a PDF into multiple documents by page ranges. - -**Parameters:** -- `input_file`: PDF file to split -- `page_ranges`: List of page range dictionaries with `start`/`end` keys (0-based indexing) -- `output_paths`: Optional list of paths to save output files - -**Returns:** -- List of PDF bytes for each split, or empty list if `output_paths` provided - -**Example:** -```python -# Split into custom ranges -parts = client.split_pdf( - "document.pdf", - page_ranges=[ - {"start": 0, "end": 4}, # Pages 1-5 - {"start": 5, "end": 9}, # Pages 6-10 - {"start": 10} # Pages 11 to end - ] -) - -# Save to specific files -client.split_pdf( - "document.pdf", - page_ranges=[{"start": 0, "end": 1}, {"start": 2}], - output_paths=["part1.pdf", "part2.pdf"] -) - -# Default behavior (extracts first page) -pages = client.split_pdf("document.pdf") -``` - -### 9. `duplicate_pdf_pages(input_file, page_indexes, output_path=None)` -Duplicates specific pages within a PDF document. - -**Parameters:** -- `input_file`: PDF file to process -- `page_indexes`: List of page indexes to include (0-based). 
Pages can be repeated for duplication. Negative indexes supported (-1 for last page) -- `output_path`: Optional path to save the output file - -**Returns:** -- Processed PDF as bytes, or None if `output_path` provided - -**Example:** -```python -# Duplicate first page twice, then include second page -result = client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[0, 0, 1] # Page 1, Page 1, Page 2 -) - -# Include last page at beginning and end -result = client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[-1, 0, 1, 2, -1] # Last, First, Second, Third, Last -) - -# Save to specific file -client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[0, 2, 1], # Reorder: Page 1, Page 3, Page 2 - output_path="reordered.pdf" -) -``` - -### 10. `delete_pdf_pages(input_file, page_indexes, output_path=None)` -Deletes specific pages from a PDF document. - -**Parameters:** -- `input_file`: PDF file to process -- `page_indexes`: List of page indexes to delete (0-based). Duplicates are automatically removed. -- `output_path`: Optional path to save the output file - -**Returns:** -- Processed PDF as bytes, or None if `output_path` provided - -**Note:** Negative page indexes are not currently supported. - -**Example:** -```python -# Delete first and third pages -result = client.delete_pdf_pages( - "document.pdf", - page_indexes=[0, 2] # Delete pages 1 and 3 (0-based indexing) -) - -# Delete specific pages with duplicates (duplicates ignored) -result = client.delete_pdf_pages( - "document.pdf", - page_indexes=[1, 3, 1, 5] # Effectively deletes pages 2, 4, and 6 -) - -# Save to specific file -client.delete_pdf_pages( - "document.pdf", - page_indexes=[0, 1], # Delete first two pages - output_path="trimmed_document.pdf" -) -``` - -### 11. `set_page_label(input_file, labels, output_path=None)` -Sets custom labels/numbering for specific page ranges in a PDF. - -**Parameters:** -- `input_file`: PDF file to process -- `labels`: List of label configurations. 
Each dict must contain: - - `pages`: Page range dict with `start` (required) and optionally `end` - - `label`: String label to apply to those pages - - Page ranges use 0-based indexing where `end` is inclusive. -- `output_path`: Optional path to save the output file - -**Returns:** -- Processed PDF as bytes, or None if `output_path` provided - -**Example:** -```python -# Set labels for different page ranges -client.set_page_label( - "document.pdf", - labels=[ - {"pages": {"start": 0, "end": 2}, "label": "Introduction"}, - {"pages": {"start": 3, "end": 9}, "label": "Chapter 1"}, - {"pages": {"start": 10}, "label": "Appendix"} - ], - output_path="labeled_document.pdf" -) - -# Set label for single page -client.set_page_label( - "document.pdf", - labels=[{"pages": {"start": 0, "end": 0}, "label": "Cover Page"}] -) -``` - -## Builder API - -The Builder API allows chaining multiple operations. Like the Direct API, it automatically converts Office documents to PDF when needed: - -```python -# Works with PDFs -client.build(input_file="document.pdf") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .add_step("ocr-pdf", {"language": "english"}) \ - .add_step("watermark-pdf", { - "text": "DRAFT", - "width": 200, - "height": 100, - "opacity": 0.3 - }) \ - .add_step("flatten-annotations") \ - .execute(output_path="processed.pdf") - -# Also works with Office documents! -client.build(input_file="report.docx") \ - .add_step("watermark-pdf", {"text": "CONFIDENTIAL", "width": 300, "height": 150}) \ - .add_step("flatten-annotations") \ - .execute(output_path="watermarked_report.pdf") - -# Setting page labels with Builder API -client.build(input_file="document.pdf") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .set_page_labels([ - {"pages": {"start": 0, "end": 2}, "label": "Introduction"}, - {"pages": {"start": 3}, "label": "Content"} - ]) \ - .execute(output_path="labeled_document.pdf") -``` - -### Supported Builder Actions - -1. 
**flatten-annotations** - No parameters required -2. **rotate-pages** - Parameters: `degrees`, `page_indexes` (optional) -3. **ocr-pdf** - Parameters: `language` -4. **watermark-pdf** - Parameters: `text` or `image_url`, `width`, `height`, `opacity`, `position` -5. **apply-redactions** - No parameters required - -### Builder Output Options - -The Builder API also supports setting output options: - -- **set_output_options()** - General output configuration (metadata, optimization, etc.) -- **set_page_labels()** - Set page labels for specific page ranges - -Example: -```python -client.build("document.pdf") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .set_output_options(metadata={"title": "My Document"}) \ - .set_page_labels([{"pages": {"start": 0}, "label": "Chapter 1"}]) \ - .execute("output.pdf") -``` - -## API Limitations - -The following operations are **NOT** currently supported by the API: - -- HTML to PDF conversion (only Office documents are supported) -- PDF to image export -- Form filling -- Digital signatures -- Compression/optimization -- Linearization -- Creating redactions (only applying existing ones) -- Instant JSON annotations -- XFDF annotations - -## Language Support - -OCR currently supports: -- English (`"english"` or `"eng"`) -- German (`"deu"` or `"german"`) - -## File Input Types - -All methods accept files as: -- String paths: `"document.pdf"` -- Path objects: `Path("document.pdf")` -- Bytes: `b"...pdf content..."` -- File-like objects: `open("document.pdf", "rb")` - -## Error Handling - -Common exceptions: -- `AuthenticationError` - Invalid or missing API key -- `APIError` - General API errors with status code -- `ValidationError` - Invalid parameters -- `FileNotFoundError` - File not found -- `ValueError` - Invalid input values diff --git a/conftest.py b/conftest.py deleted file mode 100644 index 3fbb1b6..0000000 --- a/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Root conftest.py to ensure proper test configuration.""" - -import sys 
-from pathlib import Path - -# Add src to Python path for test discovery -src_path = Path(__file__).parent / "src" -if str(src_path) not in sys.path: - sys.path.insert(0, str(src_path)) diff --git a/context7.json b/context7.json new file mode 100644 index 0000000..20e8d2e --- /dev/null +++ b/context7.json @@ -0,0 +1,7 @@ +{ + "$schema": "https://context7.com/schema/context7.json", + "projectTitle": "Nutrient DWS Python Client", + "description": "Python client library for Nutrient Document Web Services (DWS) API.\n", + "excludeFolders": ["src", "docs", "example", ".github"], + "excludeFiles": ["README.md"] +} diff --git a/CONTRIBUTING.md b/docs/CONTRIBUTING.md similarity index 97% rename from CONTRIBUTING.md rename to docs/CONTRIBUTING.md index 4d98b11..2784f47 100644 --- a/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -70,7 +70,7 @@ By participating in this project, you agree to abide by our Code of Conduct: be 5. Run linting: ```bash - ruff check src/ tests/ + ruff check src/ ``` 6. Commit your changes: @@ -120,7 +120,7 @@ Example test: ```python def test_new_feature(): """Test description.""" - client = NutrientClient(api_key="test-key") + client = NutrientClient(api_key='your_api_key') result = client.new_feature() assert result == expected_value ``` @@ -146,4 +146,4 @@ def test_new_feature(): - Check existing documentation - Review closed issues and PRs -Thank you for contributing! \ No newline at end of file +Thank you for contributing! diff --git a/docs/METHODS.md b/docs/METHODS.md new file mode 100644 index 0000000..5873a9b --- /dev/null +++ b/docs/METHODS.md @@ -0,0 +1,882 @@ +# Nutrient DWS Python Client Methods + +This document provides detailed information about all the methods available in the Nutrient DWS Python Client. + +## Client Methods + +### NutrientClient + +The main client for interacting with the Nutrient DWS Processor API. 
+ +#### Constructor + +```python +NutrientClient(api_key: str | Callable[[], Awaitable[str] | str], base_url: str | None = None, timeout: int | None = None) +``` + +Parameters: +- `api_key` (required): Your API key string or async function returning a token +- `base_url` (optional): Custom API base URL (defaults to `https://api.nutrient.io`) +- `timeout` (optional): Request timeout in milliseconds + +#### Authentication + +Provide your API key directly: + +```python +from nutrient_dws import NutrientClient + +client = NutrientClient(api_key='your_api_key') +``` + +Or use an async token provider to fetch tokens from a secure source: + +```python +import httpx +from nutrient_dws import NutrientClient + +async def get_token(): + async with httpx.AsyncClient() as http_client: + response = await http_client.get('/api/get-nutrient-token') + data = response.json() + return data['token'] + +client = NutrientClient(api_key=get_token) +``` + + +#### Account Methods + +##### get_account_info() +Gets account information for the current API key. + +**Returns**: `AccountInfo` - Account information dictionary + +```python +account_info = await client.get_account_info() + +# Access subscription information +print(account_info['subscriptionType']) +``` + +##### create_token(params) +Creates a new authentication token. + +**Parameters**: +- `params: CreateAuthTokenParameters` - Parameters for creating the token + +**Returns**: `CreateAuthTokenResponse` - The created token information + +```python +token = await client.create_token({ + 'expirationTime': 3600 +}) +print(token['id']) + +# Store the token for future use +token_id = token['id'] +token_value = token['accessToken'] +``` + +##### delete_token(id) +Deletes an authentication token. 
+ +**Parameters**: +- `id: str` - ID of the token to delete + +**Returns**: `None` + +```python +await client.delete_token('token-id-123') + +# Example in a token management function +async def revoke_user_token(token_id: str) -> bool: + try: + await client.delete_token(token_id) + print(f'Token {token_id} successfully revoked') + return True + except Exception as error: + print(f'Failed to revoke token: {error}') + return False +``` + +#### Document Processing Methods + +##### sign(file, data?, options?) +Signs a PDF document. + +**Parameters**: +- `file: FileInput` - The PDF file to sign +- `data: CreateDigitalSignature | None` - Signature data (optional) +- `options: SignRequestOptions | None` - Additional options (image, graphicImage) (optional) + +**Returns**: `BufferOutput` - The signed PDF file output + +```python +result = await client.sign('document.pdf', { + 'signatureType': 'cms', + 'flatten': False, + 'cadesLevel': 'b-lt' +}) + +# Access the signed PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('signed-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### create_redactions_ai(file, criteria, redaction_state?, pages?, options?) +Uses AI to redact sensitive information in a document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to redact +- `criteria: str` - AI redaction criteria +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional pages to redact +- `options: RedactOptions | None` - Optional redaction options + +**Returns**: `BufferOutput` - The redacted document + +```python +# Stage redactions +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all emails' +) + +# Apply redactions immediately +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all PII', + 'apply' +) + +# Redact only specific pages +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all emails', + 'stage', + {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4 +) + +# Redact only the last 3 pages +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all PII', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) + +# Access the redacted PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### ocr(file, language) +Performs OCR (Optical Character Recognition) on a document. + +**Parameters**: +- `file: FileInput` - The input file to perform OCR on +- `language: OcrLanguage | list[OcrLanguage]` - The language(s) to use for OCR + +**Returns**: `BufferOutput` - The OCR result + +```python +result = await client.ocr('scanned-document.pdf', 'english') + +# Access the OCR-processed PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('ocr-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### watermark_text(file, text, options?) +Adds a text watermark to a document. 
+ +**Parameters**: +- `file: FileInput` - The input file to watermark +- `text: str` - The watermark text +- `options: dict[str, Any] | None` - Watermark options (optional) + +**Returns**: `BufferOutput` - The watermarked document + +```python +result = await client.watermark_text('document.pdf', 'CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 24 +}) + +# Access the watermarked PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('watermarked-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### watermark_image(file, image, options?) +Adds an image watermark to a document. + +**Parameters**: +- `file: FileInput` - The input file to watermark +- `image: FileInput` - The watermark image +- `options: ImageWatermarkActionOptions | None` - Watermark options (optional) + +**Returns**: `BufferOutput` - The watermarked document + +```python +result = await client.watermark_image('document.pdf', 'watermark.jpg', { + 'opacity': 0.5, + 'width': {'value': 50, 'unit': "%"}, + 'height': {'value': 50, 'unit': "%"} +}) + +# Access the watermarked PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('image-watermarked-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### convert(file, target_format) +Converts a document to a different format. 
+ +**Parameters**: +- `file: FileInput` - The input file to convert +- `target_format: OutputFormat` - The target format to convert to + +**Returns**: `BufferOutput | ContentOutput | JsonContentOutput` - The specific output type based on the target format + +```python +# Convert DOCX to PDF +pdf_result = await client.convert('document.docx', 'pdf') +# Supports formats: pdf, pdfa, pdfua, docx, xlsx, pptx, png, jpeg, jpg, webp, html, markdown + +# Access the PDF buffer +pdf_buffer = pdf_result['buffer'] +print(pdf_result['mimeType']) # 'application/pdf' + +# Save the PDF +with open('converted-document.pdf', 'wb') as f: + f.write(pdf_buffer) + +# Convert PDF to image +image_result = await client.convert('document.pdf', 'png') + +# Access the PNG buffer +png_buffer = image_result['buffer'] +print(image_result['mimeType']) # 'image/png' + +# Save the image +with open('document-page.png', 'wb') as f: + f.write(png_buffer) +``` + +##### merge(files) +Merges multiple documents into one. + +**Parameters**: +- `files: list[FileInput]` - The files to merge + +**Returns**: `BufferOutput` - The merged document + +```python +result = await client.merge([ + 'doc1.pdf', + 'doc2.pdf', + 'doc3.pdf' +]) + +# Access the merged PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('merged-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### extract_text(file, pages?) +Extracts text content from a document. 
+ +**Parameters**: +- `file: FileInput` - The file to extract text from +- `pages: PageRange | None` - Optional page range to extract text from + +**Returns**: `JsonContentOutput` - The extracted text data + +```python +result = await client.extract_text('document.pdf') + +# Extract text from specific pages +result = await client.extract_text('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract text from the last page +result = await client.extract_text('document.pdf', {'end': -1}) # Last page + +# Extract text from the second-to-last page to the end +result = await client.extract_text('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted text content +text_content = result['data']['pages'][0]['plainText'] + +# Process the extracted text +word_count = len(text_content.split()) +print(f'Document contains {word_count} words') + +# Search for specific content +if 'confidential' in text_content: + print('Document contains confidential information') +``` + +##### extract_table(file, pages?) +Extracts table content from a document. 
+ +**Parameters**: +- `file: FileInput` - The file to extract tables from +- `pages: PageRange | None` - Optional page range to extract tables from + +**Returns**: `JsonContentOutput` - The extracted table data + +```python +result = await client.extract_table('document.pdf') + +# Extract tables from specific pages +result = await client.extract_table('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract tables from the last page +result = await client.extract_table('document.pdf', {'end': -1}) # Last page + +# Extract tables from the second-to-last page to the end +result = await client.extract_table('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted tables +tables = result['data']['pages'][0]['tables'] + +# Process the first table if available +if tables and len(tables) > 0: + first_table = tables[0] + + # Get table dimensions + print(f"Table has {len(first_table['rows'])} rows and {len(first_table['columns'])} columns") + + # Access table cells + for i in range(len(first_table['rows'])): + for j in range(len(first_table['columns'])): + cell = next((cell for cell in first_table['cells'] + if cell['rowIndex'] == i and cell['columnIndex'] == j), None) + cell_content = cell['text'] if cell else '' + print(f"Cell [{i}][{j}]: {cell_content}") + + # Convert table to CSV + csv_content = '' + for i in range(len(first_table['rows'])): + row_data = [] + for j in range(len(first_table['columns'])): + cell = next((cell for cell in first_table['cells'] + if cell['rowIndex'] == i and cell['columnIndex'] == j), None) + row_data.append(cell['text'] if cell else '') + csv_content += ','.join(row_data) + '\n' + print(csv_content) +``` + +##### extract_key_value_pairs(file, pages?) +Extracts key value pair content from a document. 
+ +**Parameters**: +- `file: FileInput` - The file to extract KVPs from +- `pages: PageRange | None` - Optional page range to extract KVPs from + +**Returns**: `JsonContentOutput` - The extracted KVPs data + +```python +result = await client.extract_key_value_pairs('document.pdf') + +# Extract KVPs from specific pages +result = await client.extract_key_value_pairs('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract KVPs from the last page +result = await client.extract_key_value_pairs('document.pdf', {'end': -1}) # Last page + +# Extract KVPs from the second-to-last page to the end +result = await client.extract_key_value_pairs('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted key-value pairs +kvps = result['data']['pages'][0]['keyValuePairs'] + +# Process the key-value pairs +if kvps and len(kvps) > 0: + # Iterate through all key-value pairs + for index, kvp in enumerate(kvps): + print(f'KVP {index + 1}:') + print(f' Key: {kvp["key"]}') + print(f' Value: {kvp["value"]}') + print(f' Confidence: {kvp["confidence"]}') + + # Create a dictionary from the key-value pairs + dictionary = {} + for kvp in kvps: + dictionary[kvp['key']] = kvp['value'] + + # Look up specific values + print(f'Invoice Number: {dictionary.get("Invoice Number")}') + print(f'Date: {dictionary.get("Date")}') + print(f'Total Amount: {dictionary.get("Total")}') +``` + +##### flatten(file, annotation_ids?) +Flattens annotations in a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to flatten +- `annotation_ids: list[str | int] | None` - Optional specific annotation IDs to flatten + +**Returns**: `BufferOutput` - The flattened document + +```python +# Flatten all annotations +result = await client.flatten('annotated-document.pdf') + +# Flatten specific annotations by ID +result = await client.flatten('annotated-document.pdf', ['annotation1', 'annotation2']) +``` + +##### password_protect(file, user_password, owner_password, permissions?) +Password protects a PDF document. + +**Parameters**: +- `file: FileInput` - The file to protect +- `user_password: str` - Password required to open the document +- `owner_password: str` - Password required to modify the document +- `permissions: list[PDFUserPermission] | None` - Optional array of permissions granted when opened with user password + +**Returns**: `BufferOutput` - The password-protected document + +```python +result = await client.password_protect('document.pdf', 'user123', 'owner456') + +# Or with specific permissions: +result = await client.password_protect('document.pdf', 'user123', 'owner456', + ['printing', 'extract_accessibility']) + +# Access the password-protected PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('protected-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### set_metadata(file, metadata) +Sets metadata for a PDF document. + +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `metadata: Metadata` - The metadata to set (title and/or author) + +**Returns**: `BufferOutput` - The document with updated metadata + +```python +result = await client.set_metadata('document.pdf', { + 'title': 'My Document', + 'author': 'John Doe' +}) +``` + +##### set_page_labels(file, labels) +Sets page labels for a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `labels: list[Label]` - Array of label objects with pages and label properties + +**Returns**: `BufferOutput` - The document with updated page labels + +```python +result = await client.set_page_labels('document.pdf', [ + {'pages': [0, 1, 2], 'label': 'Cover'}, + {'pages': [3, 4, 5], 'label': 'Chapter 1'} +]) +``` + +##### apply_instant_json(file, instant_json_file) +Applies Instant JSON to a document. + +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `instant_json_file: FileInput` - The Instant JSON file to apply + +**Returns**: `BufferOutput` - The modified document + +```python +result = await client.apply_instant_json('document.pdf', 'annotations.json') +``` + +##### apply_xfdf(file, xfdf_file, options?) +Applies XFDF to a document. + +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `xfdf_file: FileInput` - The XFDF file to apply +- `options: ApplyXfdfActionOptions | None` - Optional settings for applying XFDF + +**Returns**: `BufferOutput` - The modified document + +```python +result = await client.apply_xfdf('document.pdf', 'annotations.xfdf') +# Or with options: +result = await client.apply_xfdf( + 'document.pdf', 'annotations.xfdf', + {'ignorePageRotation': True, 'richTextEnabled': False} +) +``` + +##### create_redactions_preset(file, preset, redaction_state?, pages?, preset_options?, options?) +Creates redaction annotations based on a preset pattern. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to create redactions in +- `preset: SearchPreset` - The preset pattern to search for (e.g., 'email-address', 'social-security-number') +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional page range to create redactions in +- `preset_options: CreateRedactionsStrategyOptionsPreset | None` - Optional settings for the preset strategy +- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions + +**Returns**: `BufferOutput` - The document with redaction annotations + +```python +result = await client.create_redactions_preset('document.pdf', 'email-address') + +# With specific pages +result = await client.create_redactions_preset( + 'document.pdf', + 'email-address', + 'stage', + {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4 +) + +# With the last 3 pages +result = await client.create_redactions_preset( + 'document.pdf', + 'email-address', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) +``` + +##### create_redactions_regex(file, regex, redaction_state?, pages?, regex_options?, options?) +Creates redaction annotations based on a regular expression. 
+
+**Parameters**:
+- `file: FileInput` - The PDF file to create redactions in
+- `regex: str` - The regular expression to search for
+- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage')
+- `pages: PageRange | None` - Optional page range to create redactions in
+- `regex_options: CreateRedactionsStrategyOptionsRegex | None` - Optional settings for the regex strategy
+- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions
+
+**Returns**: `BufferOutput` - The document with redaction annotations
+
+```python
+result = await client.create_redactions_regex('document.pdf', r'Account:\s*\d{8,12}')
+
+# With specific pages
+result = await client.create_redactions_regex(
+    'document.pdf',
+    r'Account:\s*\d{8,12}',
+    'stage',
+    {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4
+)
+
+# With the last 3 pages
+result = await client.create_redactions_regex(
+    'document.pdf',
+    r'Account:\s*\d{8,12}',
+    'stage',
+    {'start': -3, 'end': -1} # Last three pages
+)
+```
+
+##### create_redactions_text(file, text, redaction_state?, pages?, text_options?, options?)
+Creates redaction annotations based on text.
+ +**Parameters**: +- `file: FileInput` - The PDF file to create redactions in +- `text: str` - The text to search for +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional page range to create redactions in +- `text_options: CreateRedactionsStrategyOptionsText | None` - Optional settings for the text strategy +- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions + +**Returns**: `BufferOutput` - The document with redaction annotations + +```python +result = await client.create_redactions_text('document.pdf', 'email@example.com') + +# With specific pages and options +result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage', + {'start': 0, 'end': 4}, # Pages 0, 1, 2, 3, 4 + {'caseSensitive': False, 'includeAnnotations': True} +) + +# Create redactions on the last 3 pages +result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) +``` + +##### apply_redactions(file) +Applies redaction annotations in a document. + +**Parameters**: +- `file: FileInput` - The PDF file with redaction annotations to apply + +**Returns**: `BufferOutput` - The document with applied redactions + +```python +# Stage redactions from a createRedaction Method: +staged_result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage' +) + +result = await client.apply_redactions(staged_result['buffer']) +``` + +##### rotate(file, angle, pages?) +Rotates pages in a document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to rotate +- `angle: Literal[90, 180, 270]` - Rotation angle (90, 180, or 270 degrees) +- `pages: PageRange | None` - Optional page range to rotate + +**Returns**: `BufferOutput` - The entire document with specified pages rotated + +```python +result = await client.rotate('document.pdf', 90) + +# Rotate specific pages: +result = await client.rotate('document.pdf', 90, {'start': 1, 'end': 3}) # Pages 1, 2, 3 + +# Rotate the last page: +result = await client.rotate('document.pdf', 90, {'end': -1}) # Last page + +# Rotate from page 2 to the second-to-last page: +result = await client.rotate('document.pdf', 90, {'start': 2, 'end': -2}) +``` + +##### add_page(file, count?, index?) +Adds blank pages to a document. + +**Parameters**: +- `file: FileInput` - The PDF file to add pages to +- `count: int` - The number of blank pages to add (default: 1) +- `index: int | None` - Optional index where to add the blank pages (0-based). If not provided, pages are added at the end. + +**Returns**: `BufferOutput` - The document with added pages + +```python +# Add 2 blank pages at the end +result = await client.add_page('document.pdf', 2) + +# Add 1 blank page after the first page (at index 1) +result = await client.add_page('document.pdf', 1, 1) +``` + +##### optimize(file, options?) +Optimizes a PDF document for size reduction. + +**Parameters**: +- `file: FileInput` - The PDF file to optimize +- `options: OptimizePdf | None` - Optimization options + +**Returns**: `BufferOutput` - The optimized document + +```python +result = await client.optimize('large-document.pdf', { + 'grayscaleImages': True, + 'mrcCompression': True, + 'imageOptimizationQuality': 2 +}) +``` + +##### split(file, page_ranges) +Splits a PDF document into multiple parts based on page ranges. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to split +- `page_ranges: list[PageRange]` - Array of page ranges to extract + +**Returns**: `list[BufferOutput]` - An array of PDF documents, one for each page range + +```python +results = await client.split('document.pdf', [ + {'start': 0, 'end': 2}, # Pages 0, 1, 2 + {'start': 3, 'end': 5} # Pages 3, 4, 5 +]) + +# Split using negative indices +results = await client.split('document.pdf', [ + {'start': 0, 'end': 2}, # First three pages + {'start': 3, 'end': -3}, # Middle pages + {'start': -2, 'end': -1} # Last two pages +]) + +# Process each resulting PDF +for i, result in enumerate(results): + # Access the PDF buffer + pdf_buffer = result['buffer'] + + # Get the MIME type of the output + print(result['mimeType']) # 'application/pdf' + + # Save the buffer to a file + with open(f'split-part-{i}.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### duplicate_pages(file, page_indices) +Creates a new PDF containing only the specified pages in the order provided. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to extract pages from +- `page_indices: list[int]` - Array of page indices to include in the new PDF (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + +**Returns**: `BufferOutput` - A new document with only the specified pages + +```python +# Create a new PDF with only the first and third pages +result = await client.duplicate_pages('document.pdf', [0, 2]) + +# Create a new PDF with pages in a different order +result = await client.duplicate_pages('document.pdf', [2, 0, 1]) + +# Create a new PDF with duplicated pages +result = await client.duplicate_pages('document.pdf', [0, 0, 1, 1, 0]) + +# Create a new PDF with the first and last pages +result = await client.duplicate_pages('document.pdf', [0, -1]) + +# Create a new PDF with the last three pages in reverse order +result = await client.duplicate_pages('document.pdf', [-1, -2, -3]) + +# Access the PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('duplicated-pages.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### delete_pages(file, page_indices) +Deletes pages from a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `page_indices: list[int]` - Array of page indices to delete (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + +**Returns**: `BufferOutput` - The document with deleted pages + +```python +# Delete second and fourth pages +result = await client.delete_pages('document.pdf', [1, 3]) + +# Delete the last page +result = await client.delete_pages('document.pdf', [-1]) + +# Delete the first and last two pages +result = await client.delete_pages('document.pdf', [0, -1, -2]) + +# Access the modified PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('modified-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +## Workflow Builder Methods + +The workflow builder provides a fluent interface for chaining multiple operations. See [WORKFLOW.md](WORKFLOW.md) for detailed information about workflow methods including: + +- `workflow()` - Create a new workflow builder +- `add_file_part()` - Add file parts to the workflow +- `add_html_part()` - Add HTML content +- `apply_action()` - Apply processing actions +- `output_pdf()`, `output_image()`, `output_json()` - Set output formats +- `execute()` - Execute the workflow + +## Error Handling + +All methods can raise the following exceptions: + +- `ValidationError` - Invalid input parameters +- `AuthenticationError` - Authentication failed +- `APIError` - API returned an error +- `NetworkError` - Network request failed +- `NutrientError` - Base error class + +```python +from nutrient_dws import ( + NutrientError, + ValidationError, + APIError, + AuthenticationError, + NetworkError +) + +try: + result = await client.convert('file.docx', 'pdf') +except ValidationError as error: + print(f'Invalid input: {error.message} - Details: {error.details}') +except AuthenticationError as error: + print(f'Auth error: 
{error.message} - Status: {error.status_code}') +except APIError as error: + print(f'API error: {error.message} - Status: {error.status_code} - Details: {error.details}') +except NetworkError as error: + print(f'Network error: {error.message} - Details: {error.details}') +``` diff --git a/docs/WORKFLOW.md b/docs/WORKFLOW.md new file mode 100644 index 0000000..303cc51 --- /dev/null +++ b/docs/WORKFLOW.md @@ -0,0 +1,952 @@ +# Nutrient DWS Python Client Workflow + +This document provides detailed information about the workflow system in the Nutrient DWS Python Client. + +## Workflow Architecture + +The Nutrient DWS Python Client uses a fluent builder pattern with staged interfaces to create document processing workflows. This architecture provides several benefits: + +1. **Type Safety**: The staged interface ensures that methods are only available at appropriate stages +2. **Readability**: Method chaining creates readable, declarative code +3. **Discoverability**: IDE auto-completion guides you through the workflow stages +4. **Flexibility**: Complex workflows can be built with simple, composable pieces + +## Workflow Stages + +The workflow builder follows a staged approach: + +### Stage 0: Create Workflow + +You have several ways of creating a workflow + +```python +# Creating Workflow from a client +workflow = client.workflow() + +# Override the client timeout +workflow = client.workflow(60000) + +# Create a workflow without a client +from nutrient_dws.builder.builder import StagedWorkflowBuilder +workflow = StagedWorkflowBuilder({ + 'apiKey': 'your-api-key' +}) +``` + +### Stage 1: Add Parts + +In this stage, you add document parts to the workflow: + +```python +workflow = (client.workflow() + .add_file_part('document.pdf') + .add_file_part('appendix.pdf')) +``` + +Available methods: + +#### `add_file_part(file, options?, actions?)` +Adds a file part to the workflow. + +**Parameters:** +- `file: FileInput` - The file to add to the workflow. 
Can be a local file path, bytes, or file-like object. +- `options: FilePartOptions | None` - Additional options for the file part (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the file part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Add a PDF file from a local path +workflow.add_file_part('/path/to/document.pdf') + +# Add a file with options and actions +workflow.add_file_part( + '/path/to/document.pdf', + {'pages': {'start': 1, 'end': 3}}, + [BuildActions.watermark_text('CONFIDENTIAL')] +) +``` + +#### `add_html_part(html, assets?, options?, actions?)` +Adds an HTML part to the workflow. + +**Parameters:** +- `html: FileInput` - The HTML content to add. Can be a file path, bytes, or file-like object. +- `assets: list[FileInput] | None` - Optional list of assets (CSS, images, etc.) to include with the HTML. Only local files or bytes are supported (optional) +- `options: HTMLPartOptions | None` - Additional options for the HTML part (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the HTML part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Add HTML content from a file +workflow.add_html_part('/path/to/content.html') + +# Add HTML with assets and options +workflow.add_html_part( + '/path/to/content.html', + ['/path/to/style.css', '/path/to/image.png'], + {'layout': {'size': 'A4'}} +) +``` + +#### `add_new_page(options?, actions?)` +Adds a new blank page to the workflow. + +**Parameters:** +- `options: NewPagePartOptions | None` - Additional options for the new page, such as page size, orientation, etc. (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the new page (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. 
+ +**Example:** +```python +# Add a simple blank page +workflow.add_new_page() + +# Add a new page with specific options +workflow.add_new_page({ + 'layout': {'size': 'A4', 'orientation': 'portrait'} +}) +``` + +#### `add_document_part(document_id, options?, actions?)` +Adds a document part to the workflow by referencing an existing document by ID. + +**Parameters:** +- `document_id: str` - The ID of the document to add to the workflow. +- `options: DocumentPartOptions | None` - Additional options for the document part (optional) + - `options['layer']: str` - Optional layer name to select a specific layer from the document. +- `actions: list[BuildAction] | None` - Actions to apply to the document part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Add a document by ID +workflow.add_document_part('doc_12345abcde') + +# Add a document with a specific layer and options +workflow.add_document_part( + 'doc_12345abcde', + { + 'layer': 'content', + 'pages': {'start': 0, 'end': 3} + } +) +``` + +### Stage 2: Apply Actions (Optional) + +In this stage, you can apply actions to the document: + +```python +workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 48 +})) +``` + +Available methods: + +#### `apply_action(action)` +Applies a single action to the workflow. + +**Parameters:** +- `action: BuildAction` - The action to apply to the workflow. + +**Returns:** `WorkflowWithActionsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Apply a watermark action +workflow.apply_action( + BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.3, + 'rotation': 45 + }) +) + +# Apply an OCR action +workflow.apply_action(BuildActions.ocr('english')) +``` + +#### `apply_actions(actions)` +Applies multiple actions to the workflow. 
+ +**Parameters:** +- `actions: list[BuildAction]` - A list of actions to apply to the workflow. + +**Returns:** `WorkflowWithActionsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Apply multiple actions to the workflow +workflow.apply_actions([ + BuildActions.watermark_text('DRAFT', {'opacity': 0.5}), + BuildActions.ocr('english'), + BuildActions.flatten() +]) +``` + +#### Action Types: + +#### Document Processing + +##### `BuildActions.ocr(language)` +Creates an OCR (Optical Character Recognition) action to extract text from images or scanned documents. + +**Parameters:** +- `language: str | list[str]` - Language(s) for OCR. Can be a single language or a list of languages. + +**Example:** +```python +# Basic OCR with English language +workflow.apply_action(BuildActions.ocr('english')) + +# OCR with multiple languages +workflow.apply_action(BuildActions.ocr(['english', 'french', 'german'])) + +# OCR with options (via dict syntax) +workflow.apply_action(BuildActions.ocr({ + 'language': 'english', + 'enhanceResolution': True +})) +``` + +##### `BuildActions.rotate(rotate_by)` +Creates an action to rotate pages in the document. + +**Parameters:** +- `rotate_by: Literal[90, 180, 270]` - Rotation angle in degrees (must be 90, 180, or 270). + +**Example:** +```python +# Rotate pages by 90 degrees +workflow.apply_action(BuildActions.rotate(90)) + +# Rotate pages by 180 degrees +workflow.apply_action(BuildActions.rotate(180)) +``` + +##### `BuildActions.flatten(annotation_ids?)` +Creates an action to flatten annotations into the document content, making them non-interactive but permanently visible. + +**Parameters:** +- `annotation_ids: list[str | int] | None` - Optional list of annotation IDs to flatten. 
If not specified, all annotations will be flattened (optional) + +**Example:** +```python +# Flatten all annotations +workflow.apply_action(BuildActions.flatten()) + +# Flatten specific annotations +workflow.apply_action(BuildActions.flatten(['annotation1', 'annotation2'])) +``` + +#### Watermarking + +##### `BuildActions.watermark_text(text, options?)` +Creates an action to add a text watermark to the document. + +**Parameters:** +- `text: str` - Watermark text content. +- `options: TextWatermarkActionOptions | None` - Watermark options (optional): + - `width`: Width dimension of the watermark (dict with 'value' and 'unit', e.g. `{'value': 100, 'unit': '%'}`) + - `height`: Height dimension of the watermark (dict with 'value' and 'unit') + - `top`, `right`, `bottom`, `left`: Position of the watermark (dict with 'value' and 'unit') + - `rotation`: Rotation of the watermark in counterclockwise degrees (default: 0) + - `opacity`: Watermark opacity (0 is fully transparent, 1 is fully opaque) + - `fontFamily`: Font family for the text (e.g. 'Helvetica') + - `fontSize`: Size of the text in points + - `fontColor`: Foreground color of the text (e.g. '#ffffff') + - `fontStyle`: Text style list (['bold'], ['italic'], or ['bold', 'italic']) + +**Example:** + +```python +# Simple text watermark +workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL')) + +# Customized text watermark +workflow.apply_action(BuildActions.watermark_text('DRAFT', { + 'opacity': 0.5, + 'rotation': 45, + 'fontSize': 36, + 'fontColor': '#FF0000', + 'fontStyle': ['bold', 'italic'] +})) +``` + +##### `BuildActions.watermark_image(image, options?)` +Creates an action to add an image watermark to the document. + +**Parameters:** +- `image: FileInput` - Watermark image (file path, bytes, or file-like object). +- `options: ImageWatermarkActionOptions | None` - Watermark options (optional): + - `width`: Width dimension of the watermark (dict with 'value' and 'unit', e.g. 
`{'value': 100, 'unit': '%'}`) + - `height`: Height dimension of the watermark (dict with 'value' and 'unit') + - `top`, `right`, `bottom`, `left`: Position of the watermark (dict with 'value' and 'unit') + - `rotation`: Rotation of the watermark in counterclockwise degrees (default: 0) + - `opacity`: Watermark opacity (0 is fully transparent, 1 is fully opaque) + +**Example:** + +```python +# Simple image watermark +workflow.apply_action(BuildActions.watermark_image('/path/to/logo.png')) + +# Customized image watermark +workflow.apply_action(BuildActions.watermark_image('/path/to/logo.png', { + 'opacity': 0.3, + 'width': {'value': 50, 'unit': '%'}, + 'height': {'value': 50, 'unit': '%'}, + 'top': {'value': 10, 'unit': 'px'}, + 'left': {'value': 10, 'unit': 'px'}, + 'rotation': 0 +})) +``` + +#### Annotations + +##### `BuildActions.apply_instant_json(file)` +Creates an action to apply annotations from an Instant JSON file to the document. + +**Parameters:** +- `file: FileInput` - Instant JSON file input (file path, bytes, or file-like object). + +**Example:** + +```python +# Apply annotations from Instant JSON file +workflow.apply_action(BuildActions.apply_instant_json('/path/to/annotations.json')) +``` + +##### `BuildActions.apply_xfdf(file, options?)` +Creates an action to apply annotations from an XFDF file to the document. + +**Parameters:** +- `file: FileInput` - XFDF file input (file path, bytes, or file-like object). +- `options: ApplyXfdfActionOptions | None` - Apply XFDF options (optional): + - `ignorePageRotation: bool` - If True, ignores page rotation when applying XFDF data (default: False) + - `richTextEnabled: bool` - If True, plain text annotations will be converted to rich text annotations. 
If False, all text annotations will be plain text annotations (default: True) + +**Example:** + +```python +# Apply annotations from XFDF file with default options +workflow.apply_action(BuildActions.apply_xfdf('/path/to/annotations.xfdf')) + +# Apply annotations with specific options +workflow.apply_action(BuildActions.apply_xfdf('/path/to/annotations.xfdf', { + 'ignorePageRotation': True, + 'richTextEnabled': False +})) +``` + +#### Redactions + +##### `BuildActions.create_redactions_text(text, options?, strategy_options?)` +Creates an action to add redaction annotations based on text search. + +**Parameters:** +- `text: str` - Text to search and redact. +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) +- `strategy_options: CreateRedactionsStrategyOptionsText | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided text (default: True) + - `caseSensitive: bool` - If True, the search will be case sensitive (default: False) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for all occurrences of "Confidential" +workflow.apply_action(BuildActions.create_redactions_text('Confidential')) + +# Create redactions with custom appearance and search options +workflow.apply_action(BuildActions.create_redactions_text('Confidential', + { + 'content': { + 'backgroundColor': '#000000', + 'overlayText': 'REDACTED', + 'textColor': '#FFFFFF' + } + }, + { + 'caseSensitive': True, + 'start': 2, + 'limit': 5 + } + )) +``` + +##### `BuildActions.create_redactions_regex(regex, options?, strategy_options?)` +Creates an action to 
add redaction annotations based on regex pattern matching. + +**Parameters:** +- `regex: str` - Regex pattern to search and redact. +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) +- `strategy_options: CreateRedactionsStrategyOptionsRegex | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided regex (default: True) + - `caseSensitive: bool` - If True, the search will be case sensitive (default: True) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for email addresses +workflow.apply_action(BuildActions.create_redactions_regex(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')) + +# Create redactions with custom appearance and search options +workflow.apply_action(BuildActions.create_redactions_regex(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', + { + 'content': { + 'backgroundColor': '#FF0000', + 'overlayText': 'EMAIL REDACTED' + } + }, + { + 'caseSensitive': False, + 'start': 0, + 'limit': 10 + } + )) +``` + +##### `BuildActions.create_redactions_preset(preset, options?, strategy_options?)` +Creates an action to add redaction annotations based on a preset pattern. + +**Parameters:** +- `preset: str` - Preset pattern to search and redact (e.g. 'email-address', 'credit-card-number', 'social-security-number', etc.) +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) 
+- `strategy_options: CreateRedactionsStrategyOptionsPreset | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided preset (default: True) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for email addresses using preset +workflow.apply_action(BuildActions.create_redactions_preset('email-address')) + +# Create redactions for credit card numbers with custom appearance +workflow.apply_action(BuildActions.create_redactions_preset('credit-card-number', + { + 'content': { + 'backgroundColor': '#000000', + 'overlayText': 'FINANCIAL DATA' + } + }, + { + 'start': 0, + 'limit': 5 + } + )) +``` + +##### `BuildActions.apply_redactions()` +Creates an action to apply previously created redaction annotations, permanently removing the redacted content. + +**Example:** + +```python +# First create redactions +workflow.apply_action(BuildActions.create_redactions_preset('email-address')) + +# Then apply them +workflow.apply_action(BuildActions.apply_redactions()) +``` + +### Stage 3: Set Output Format + +In this stage, you specify the desired output format: + +```python +workflow.output_pdf({ + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 2 + } +}) +``` + +Available methods: + +#### `output_pdf(options?)` +Sets the output format to PDF. + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF output, such as compression, encryption, etc. (optional) + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. + - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. 
+ - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF with default options +workflow.output_pdf() + +# Set output format to PDF with specific options +workflow.output_pdf({ + 'user_password': 'secret', + 'user_permissions': ["printing"], + 'metadata': { + 'title': 'Important Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 3 + } +}) +``` + +#### `output_pdfa(options?)` +Sets the output format to PDF/A (archival PDF). + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF/A output (optional): + - `options['conformance']: str` - The PDF/A conformance level to target. Options include 'pdfa-1b', 'pdfa-1a', 'pdfa-2b', 'pdfa-2a', 'pdfa-3b', 'pdfa-3a'. + Different levels have different requirements for long-term archiving. + - `options['vectorization']: bool` - When True, attempts to convert raster content to vector graphics where possible, improving quality and reducing file size. 
+ - `options['rasterization']: bool` - When True, converts vector graphics to raster images, which can help with compatibility in some cases. + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. + - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. + - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF/A with default options +workflow.output_pdfa() + +# Set output format to PDF/A with specific options +workflow.output_pdfa({ + 'conformance': 'pdfa-2b', + 'vectorization': True, + 'metadata': { + 'title': 'Archive Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True + } +}) +``` + +#### `output_pdfua(options?)` +Sets the output format to PDF/UA (Universal Accessibility). + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF/UA output (optional): + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. 
+ - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. + - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF/UA with default options +workflow.output_pdfua() + +# Set output format to PDF/UA with specific options +workflow.output_pdfua({ + 'metadata': { + 'title': 'Accessible Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 3 + } +}) +``` + +#### `output_image(format, options?)` +Sets the output format to an image format (PNG, JPEG, WEBP). + +**Parameters:** +- `format: Literal['png', 'jpeg', 'jpg', 'webp']` - The image format to output. + - PNG: Lossless compression, supports transparency, best for graphics and screenshots + - JPEG/JPG: Lossy compression, smaller file size, best for photographs + - WEBP: Modern format with both lossy and lossless compression, good for web use +- `options: dict[str, Any] | None` - Additional options for image output, such as resolution, quality, etc. 
(optional) + **Note: At least one of options['width'], options['height'], or options['dpi'] must be specified.** + - `options['pages']: dict[str, int]` - Specifies which pages to convert to images. If omitted, all pages are converted. + - `options['pages']['start']: int` - The first page to convert (0-based index). + - `options['pages']['end']: int` - The last page to convert (0-based index). + - `options['width']: int` - The width of the output image in pixels. If specified without height, aspect ratio is maintained. + - `options['height']: int` - The height of the output image in pixels. If specified without width, aspect ratio is maintained. + - `options['dpi']: int` - The resolution in dots per inch. Higher values create larger, more detailed images. + Common values: 72 (web), 150 (standard), 300 (print quality), 600 (high quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PNG with dpi specified +workflow.output_image('png', {'dpi': 300}) + +# Set output format to JPEG with specific options +workflow.output_image('jpeg', { + 'dpi': 300, + 'pages': {'start': 1, 'end': 3} +}) + +# Set output format to WEBP with specific dimensions +workflow.output_image('webp', { + 'width': 1200, + 'height': 800, + 'dpi': 150 +}) +``` + +#### `output_office(format)` +Sets the output format to an Office document format (DOCX, XLSX, PPTX). + +**Parameters:** +- `format: Literal['docx', 'xlsx', 'pptx']` - The Office format to output ('docx' for Word, 'xlsx' for Excel, or 'pptx' for PowerPoint). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. 
+ +**Example:** +```python +# Set output format to Word document (DOCX) +workflow.output_office('docx') + +# Set output format to Excel spreadsheet (XLSX) +workflow.output_office('xlsx') + +# Set output format to PowerPoint presentation (PPTX) +workflow.output_office('pptx') +``` + +#### `output_html(layout)` +Sets the output format to HTML. + +**Parameters:** +- `layout: Literal['page', 'reflow']` - The layout type to use for conversion to HTML: + - 'page' layout keeps the original structure of the document, segmented by page. + - 'reflow' layout converts the document into a continuous flow of text, without page breaks. + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to HTML +workflow.output_html('page') +``` + +#### `output_markdown()` +Sets the output format to Markdown. + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to Markdown with default options +workflow.output_markdown() +``` + +#### `output_json(options?)` +Sets the output format to JSON content. + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for JSON output (optional): + - `options['plainText']: bool` - When True, extracts plain text content from the document and includes it in the JSON output. + This provides the raw text without structural information. + - `options['structuredText']: bool` - When True, extracts text with structural information (paragraphs, headings, etc.) + and includes it in the JSON output. + - `options['keyValuePairs']: bool` - When True, attempts to identify and extract key-value pairs from the document + (like form fields, labeled data, etc.) and includes them in the JSON output. + - `options['tables']: bool` - When True, attempts to identify and extract tabular data from the document + and includes it in the JSON output as structured table objects. 
+    - `options['language']: str | list[str]` - Specifies the language(s) of the document content for better text extraction.
+      Can be a single language code or a list of language codes for multi-language documents.
+      Examples: "english", "french", "german", or ["english", "spanish"].
+
+**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining.
+
+**Example:**
+```python
+# Set output format to JSON with default options
+workflow.output_json()
+
+# Set output format to JSON with specific options
+workflow.output_json({
+    'plainText': True,
+    'structuredText': True,
+    'keyValuePairs': True,
+    'tables': True,
+    'language': "english"
+})
+
+# Set output format to JSON with multiple languages
+workflow.output_json({
+    'plainText': True,
+    'tables': True,
+    'language': ["english", "french", "german"]
+})
+```
+
+### Stage 4: Execute or Dry Run
+
+In this final stage, you execute the workflow or perform a dry run:
+
+```python
+result = await workflow.execute()
+```
+
+Available methods:
+
+#### `execute(on_progress?)`
+Executes the workflow and returns the result.
+
+**Parameters:**
+- `on_progress: Callable[[int, int], None] | None` - Callback for progress updates (optional).
+
+**Returns:** `TypedWorkflowResult` - The workflow result.
+
+**Example:**
+```python
+# Execute the workflow with default options
+result = await workflow.execute()
+
+# Execute with progress tracking
+def progress_callback(current: int, total: int) -> None:
+    print(f'Processing step {current} of {total}')
+
+result = await workflow.execute(on_progress=progress_callback)
+```
+
+#### `dry_run()`
+Performs a dry run of the workflow without generating the final output. This is useful for validating the workflow configuration and estimating processing time.
+
+**Returns:** `WorkflowDryRunResult` - The dry run result, containing validation information and estimated processing time.
+
+**Example:**
+```python
+# Perform a dry run with default options
+dry_run_result = await (workflow
+    .add_file_part('/path/to/document.pdf')
+    .output_pdf()
+    .dry_run())
+```
+
+## Workflow Examples
+
+### Basic Document Conversion
+
+```python
+result = await (client
+    .workflow()
+    .add_file_part('document.docx')
+    .output_pdf()
+    .execute())
+```
+
+### Document Merging with Watermark
+
+```python
+result = await (client
+    .workflow()
+    .add_file_part('document1.pdf')
+    .add_file_part('document2.pdf')
+    .apply_action(BuildActions.watermark_text('CONFIDENTIAL', {
+    'opacity': 0.5,
+    'fontSize': 48
+}))
+    .output_pdf()
+    .execute())
+```
+
+### OCR with Language Selection
+
+```python
+result = await (client
+    .workflow()
+    .add_file_part('scanned-document.pdf')
+    .apply_action(BuildActions.ocr({
+        'language': 'english',
+        'enhanceResolution': True
+    }))
+    .output_pdf()
+    .execute())
+```
+
+### HTML to PDF Conversion
+
+```python
+result = await (client
+    .workflow()
+    .add_html_part('index.html', None, {
+        'layout': {
+            'size': 'A4',
+            'margin': {
+                'top': 50,
+                'bottom': 50,
+                'left': 50,
+                'right': 50
+            }
+        }
+    })
+    .output_pdf()
+    .execute())
+```
+
+### Complex Multi-step Workflow
+
+```python
+def progress_callback(current: int, total: int) -> None:
+    print(f'Processing step {current} of {total}')
+
+
+result = await (client
+    .workflow()
+    .add_file_part('document.pdf', {'pages': {'start': 0, 'end': 5}})
+    .add_file_part('appendix.pdf')
+    .apply_actions([
+    BuildActions.ocr({'language': 'english'}),
+    BuildActions.watermark_text('CONFIDENTIAL'),
+    BuildActions.create_redactions_preset('email-address'),
+    BuildActions.apply_redactions()
+])
+    .output_pdfa({
+    'conformance': 'pdfa-2b',
+    'optimize': {
+        'mrcCompression': True
+    }
+})
+    .execute(on_progress=progress_callback))
+```
+
+## Staged Workflow Builder
+
+For more complex scenarios where you need to build workflows dynamically, you can use the staged workflow builder:
+
+```python
+# Create a staged workflow
+workflow = 
client.workflow() + +# Add parts +workflow.add_file_part('document.pdf') + +# Conditionally add more parts +if include_appendix: + workflow.add_file_part('appendix.pdf') + +# Conditionally apply actions +if needs_watermark: + workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL')) + +# Set output format based on user preference +if output_format == 'pdf': + workflow.output_pdf() +elif output_format == 'docx': + workflow.output_office('docx') +else: + workflow.output_image('png') + +# Execute the workflow +result = await workflow.execute() +``` + +## Error Handling in Workflows + +Workflows provide detailed error information: + +```python +try: + result = await (client + .workflow() + .add_file_part('document.pdf') + .output_pdf() + .execute()) + + if not result['success']: + # Handle workflow errors + for error in result.get('errors', []): + print(f"Step {error['step']}: {error['error']['message']}") +except Exception as error: + # Handle unexpected errors + print(f'Workflow execution failed: {error}') +``` + +## Workflow Result Structure + +The result of a workflow execution includes: + +```python +from typing import TypedDict, Any, List, Optional, Union + +class WorkflowError(TypedDict): + step: str + error: dict[str, Any] + +class BufferOutput(TypedDict): + mimeType: str + filename: str + buffer: bytes + +class ContentOutput(TypedDict): + mimeType: str + filename: str + content: str + +class JsonContentOutput(TypedDict): + mimeType: str + filename: str + data: Any + +class WorkflowResult(TypedDict): + # Overall success status + success: bool + + # Output data (if successful) + output: Optional[Union[BufferOutput, ContentOutput, JsonContentOutput]] + + # Error information (if failed) + errors: Optional[List[WorkflowError]] +``` + +## Performance Considerations + +For optimal performance with workflows: + +1. **Minimize the number of parts**: Combine related files when possible +2. 
**Use appropriate output formats**: Choose formats based on your needs +3. **Consider dry runs**: Use `dry_run()` to estimate resource usage +4. **Monitor progress**: Use the `on_progress` callback for long-running workflows +5. **Handle large files**: For very large files, consider splitting into smaller workflows diff --git a/examples/.env.example b/examples/.env.example new file mode 100644 index 0000000..4a0ee74 --- /dev/null +++ b/examples/.env.example @@ -0,0 +1,2 @@ +NUTRIENT_API_KEY=your_api_key_here +PYTHON_ENV=development diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..31f96f8 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,135 @@ +# Nutrient DWS Python Client Examples + +This example project demonstrates how to use the Nutrient DWS Python Client for document processing operations. + +## Project Structure + +- `assets/` - Contains sample files for processing (PDF, DOCX, PNG) +- `src/` - Contains Python source files + - `direct_method.py` - Examples using direct method calls + - `workflow.py` - Examples using the workflow builder pattern +- `output/` - Directory where processed files will be saved +- `.env.example` - Example environment variables file + +## Prerequisites + +- Python 3.10 or higher +- pip + +## Setup + +### Option 1: Virtual Environment Setup + +1. Clone the repository: + ```bash + git clone https://github.com/pspdfkit-labs/nutrient-dws-client-python.git + cd nutrient-dws-client-python + ``` + +2. Build the package from source: + ```bash + python -m build + ``` + +3. Navigate to the examples directory: + ```bash + cd examples + ``` + +4. Set up and activate the virtual environment: + ```bash + # Set up the virtual environment and install dependencies + python setup_venv.py + + # Activate the virtual environment + # On macOS/Linux: + source example_venv/bin/activate + + # On Windows: + example_venv\Scripts\activate + ``` + +5. 
Create a `.env` file from the example: + ```bash + cp .env.example .env + ``` + +6. Edit the `.env` file and add your Nutrient DWS Processor API key. You can sign up for a free API key by visiting [Nutrient](https://www.nutrient.io/api/): + ``` + NUTRIENT_API_KEY=your_api_key_here + ``` + +### Option 2: Development Mode Setup + +1. Clone the repository: + ```bash + git clone https://github.com/pspdfkit-labs/nutrient-dws-client-python.git + cd nutrient-dws-client-python + ``` + +2. Install the main package in development mode: + ```bash + pip install -e . + ``` + +3. Navigate to the examples directory: + ```bash + cd examples + ``` + +4. Install dependencies for the example project: + ```bash + pip install -r requirements.txt + ``` + +5. Create a `.env` file from the example: + ```bash + cp .env.example .env + ``` + +6. Edit the `.env` file and add your Nutrient DWS Processor API key. You can sign up for a free API key by visiting [Nutrient](https://www.nutrient.io/api/): + ``` + NUTRIENT_API_KEY=your_api_key_here + ``` + +## Running the Examples + +### Direct Method Examples + +To run the direct method examples: + +```bash +python src/direct_method.py +``` + +This will: +1. Convert a DOCX file to PDF +2. Extract text from the PDF +3. Add a watermark to the PDF +4. Merge multiple documents + +### Workflow Examples + +To run the workflow examples: + +```bash +python src/workflow.py +``` + +This will: +1. Perform a basic document conversion workflow +2. Create a document merging with watermark workflow +3. Extract text with JSON output +4. Execute a complex multi-step workflow + +## Output + +All processed files will be saved to the `output/` directory. You can examine these files to see the results of the document processing operations. 
+ +## Documentation + +For more information about the Nutrient DWS Python Client, refer to: + +- [README.md](../README.md) - Main documentation +- [METHODS.md](../docs/METHODS.md) - Direct methods documentation +- [WORKFLOW.md](../docs/WORKFLOW.md) - Workflow system documentation diff --git a/examples/assets/sample.docx b/examples/assets/sample.docx new file mode 100644 index 0000000..22e2341 Binary files /dev/null and b/examples/assets/sample.docx differ diff --git a/examples/assets/sample.pdf b/examples/assets/sample.pdf new file mode 100644 index 0000000..ada2bdd Binary files /dev/null and b/examples/assets/sample.pdf differ diff --git a/examples/assets/sample.png b/examples/assets/sample.png new file mode 100644 index 0000000..32ee307 Binary files /dev/null and b/examples/assets/sample.png differ diff --git a/examples/requirements.txt b/examples/requirements.txt new file mode 100644 index 0000000..1e2f2f3 --- /dev/null +++ b/examples/requirements.txt @@ -0,0 +1,2 @@ +python-dotenv>=1.0.0 +pathlib diff --git a/examples/setup_venv.py b/examples/setup_venv.py new file mode 100644 index 0000000..63ba273 --- /dev/null +++ b/examples/setup_venv.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +""" +Setup script for creating and configuring the examples virtual environment. +This script creates a virtual environment and installs the nutrient-dws package +from the built distribution files. 
+""" + +import os +import subprocess +import sys +from pathlib import Path + +def run_command(cmd, cwd=None, check=True): + """Run a command and return the result.""" + print(f"Running: {' '.join(cmd) if isinstance(cmd, list) else cmd}") + try: + result = subprocess.run( + cmd, + shell=isinstance(cmd, str), + cwd=cwd, + check=check, + capture_output=True, + text=True + ) + if result.stdout: + print(result.stdout) + return result + except subprocess.CalledProcessError as e: + print(f"Error: {e}") + if e.stderr: + print(f"Error output: {e.stderr}") + raise + +def main(): + # Get the current directory (examples folder) + examples_dir = Path(__file__).parent + project_root = examples_dir.parent + dist_dir = project_root / "dist" + + print(f"Setting up virtual environment in: {examples_dir}") + + # Create virtual environment + venv_path = examples_dir / "example_venv" + if venv_path.exists(): + print("Virtual environment already exists. Removing...") + import shutil + shutil.rmtree(venv_path) + + print("Creating virtual environment...") + run_command([sys.executable, "-m", "venv", "example_venv"], cwd=examples_dir) + + # Determine the python executable in the venv + if sys.platform == "win32": + python_exe = venv_path / "Scripts" / "python.exe" + pip_exe = venv_path / "Scripts" / "pip.exe" + else: + python_exe = venv_path / "bin" / "python" + pip_exe = venv_path / "bin" / "pip" + + # Upgrade pip + print("Upgrading pip...") + run_command([str(pip_exe), "install", "--upgrade", "pip"]) + + # Install the wheel and tar.gz files + wheel_file = dist_dir / "nutrient_dws-2.0.0-py3-none-any.whl" + tar_file = dist_dir / "nutrient_dws-2.0.0.tar.gz" + + if wheel_file.exists(): + print("Installing nutrient-dws from wheel...") + run_command([str(pip_exe), "install", str(wheel_file)]) + elif tar_file.exists(): + print("Installing nutrient-dws from tar.gz...") + run_command([str(pip_exe), "install", str(tar_file)]) + else: + print("Error: Neither wheel nor tar.gz file found in dist 
directory") + print("Please build the package first using: python -m build") + sys.exit(1) + + # Install example requirements + requirements_file = examples_dir / "requirements.txt" + if requirements_file.exists(): + print("Installing example requirements...") + run_command([str(pip_exe), "install", "-r", str(requirements_file)]) + + print("\n" + "="*50) + print("Virtual environment setup complete!") + print(f"Virtual environment location: {venv_path}") + print("\nTo activate the virtual environment:") + if sys.platform == "win32": + print(f" {venv_path / 'Scripts' / 'activate.bat'}") + else: + print(f" source {venv_path / 'bin' / 'activate'}") + + print("\nTo run examples:") + print(" python src/direct_method.py") + print(" python src/workflow.py") + +if __name__ == "__main__": + main() diff --git a/examples/src/direct_method.py b/examples/src/direct_method.py new file mode 100644 index 0000000..ae5e781 --- /dev/null +++ b/examples/src/direct_method.py @@ -0,0 +1,149 @@ +""" +Direct Method Example + +This example demonstrates how to use the Nutrient DWS Python Client +with direct method calls for document processing operations. 
+""" + +import asyncio +import os +import json +from pathlib import Path +from dotenv import load_dotenv + +from nutrient_dws import NutrientClient + +# Load environment variables from .env file +load_dotenv() + +# Check if API key is provided +if not os.getenv('NUTRIENT_API_KEY'): + print('Error: NUTRIENT_API_KEY is not set in .env file') + exit(1) + +# Initialize the client with API key +client = NutrientClient(api_key=os.getenv('NUTRIENT_API_KEY')) + +# Define paths +assets_dir = Path(__file__).parent.parent / 'assets' +output_dir = Path(__file__).parent.parent / 'output' + +# Ensure output directory exists +output_dir.mkdir(parents=True, exist_ok=True) + + +# Example 1: Convert a document +async def convert_document(): + print('Example 1: Converting DOCX to PDF') + + try: + docx_path = assets_dir / 'sample.docx' + result = await client.convert(docx_path, 'pdf') + + # Save the result to the output directory + output_path = output_dir / 'converted-document.pdf' + with open(output_path, 'wb') as f: + f.write(result['buffer']) + + print(f'Conversion successful. Output saved to: {output_path}') + print(f'MIME type: {result["mimeType"]}') + return output_path + except Exception as error: + print(f'Conversion failed: {error}') + raise error + + +# Example 2: Extract text from a document +async def extract_text(file_path: Path): + print('\nExample 2: Extracting text from PDF') + + try: + result = await client.extract_text(file_path) + + # Save the extracted text to the output directory + output_path = output_dir / 'extracted-text.json' + with open(output_path, 'w') as f: + json.dump(result['data'], f, indent=2, default=str) + + # Display a sample of the extracted text + text_sample = result['data']['pages'][0]['plainText'][:100] + '...' + print(f'Text extraction successful. 
Output saved to: {output_path}') + print(f'Text sample: {text_sample}') + return output_path + except Exception as error: + print(f'Text extraction failed: {error}') + raise error + + +# Example 3: Add a watermark to a document +async def add_watermark(file_path: Path): + print('\nExample 3: Adding watermark to PDF') + + try: + result = await client.watermark_text(file_path, 'CONFIDENTIAL', { + 'opacity': 0.5, + 'font_color': '#FF0000', + 'rotation': 45, + 'width': {'value': 50, 'unit': '%'} + }) + + # Save the watermarked document to the output directory + output_path = output_dir / 'watermarked-document.pdf' + with open(output_path, 'wb') as f: + f.write(result['buffer']) + + print(f'Watermarking successful. Output saved to: {output_path}') + return output_path + except Exception as error: + print(f'Watermarking failed: {error}') + raise error + + +# Example 4: Merge multiple documents +async def merge_documents(): + print('\nExample 4: Merging documents') + + try: + # Create a second PDF + pdf_path = assets_dir / 'sample.pdf' + + # Get the converted PDF from Example 1 + converted_pdf_path = output_dir / 'converted-document.pdf' + + # Merge the documents + result = await client.merge([converted_pdf_path, pdf_path]) + + # Save the merged document to the output directory + output_path = output_dir / 'merged-document.pdf' + with open(output_path, 'wb') as f: + f.write(result['buffer']) + + print(f'Merging successful. 
Output saved to: {output_path}') + return output_path + except Exception as error: + print(f'Merging failed: {error}') + raise error + + + + + +# Run all examples +async def run_examples(): + try: + print('Starting direct method examples...\n') + + # Run the examples in sequence + converted_pdf_path = await convert_document() + await extract_text(converted_pdf_path) + await add_watermark(converted_pdf_path) + await merge_documents() + + print('\nAll examples completed successfully!') + except Exception as error: + print(f'\nExamples failed: {error}') + + +# Execute the examples +if __name__ == '__main__': + asyncio.run(run_examples()) diff --git a/examples/src/workflow.py b/examples/src/workflow.py new file mode 100644 index 0000000..c6106d3 --- /dev/null +++ b/examples/src/workflow.py @@ -0,0 +1,176 @@ +""" +Workflow Example + +This example demonstrates how to use the Nutrient DWS Python Client +with the workflow builder pattern for document processing operations. +""" + +import asyncio +import os +import json +from pathlib import Path +from dotenv import load_dotenv + +from nutrient_dws import NutrientClient +from nutrient_dws.builder.constant import BuildActions + +# Load environment variables from .env file +load_dotenv() + +# Check if API key is provided +if not os.getenv('NUTRIENT_API_KEY'): + print('Error: NUTRIENT_API_KEY is not set in .env file') + exit(1) + +# Initialize the client with API key +client = NutrientClient(api_key=os.getenv('NUTRIENT_API_KEY')) + +# Define paths +assets_dir = Path(__file__).parent.parent / 'assets' +output_dir = Path(__file__).parent.parent / 'output' + +# Ensure output directory exists +output_dir.mkdir(parents=True, exist_ok=True) + + +# Example 1: Basic document conversion workflow +async def basic_conversion_workflow(): + print('Example 1: Basic document conversion workflow') + + try: + docx_path = assets_dir / 'sample.docx' + + result = await client.workflow() \ + .add_file_part(docx_path) \ + .output_pdf() \ + 
.execute() + + # Save the result to the output directory + output_path = output_dir / 'workflow-converted-document.pdf' + with open(output_path, 'wb') as f: + f.write(result['output']['buffer']) + + print(f'Conversion workflow successful. Output saved to: {output_path}') + print(f'MIME type: {result["output"]["mimeType"]}') + return output_path + except Exception as error: + print(f'Conversion workflow failed: {error}') + raise error + + +# Example 2: Document merging with watermark +async def merge_with_watermark_workflow(): + print('\nExample 2: Document merging with watermark workflow') + + try: + pdf_path = output_dir / 'workflow-converted-document.pdf' + png_path = assets_dir / 'sample.png' + + result = await client.workflow() \ + .add_file_part(pdf_path) \ + .add_file_part(png_path) \ + .apply_action(BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 48, + 'fontColor': '#FF0000' + })) \ + .output_pdf() \ + .execute() + + # Save the result to the output directory + output_path = output_dir / 'workflow-merged-watermarked.pdf' + with open(output_path, 'wb') as f: + f.write(result['output']['buffer']) + + print(f'Merge with watermark workflow successful. Output saved to: {output_path}') + return output_path + except Exception as error: + print(f'Merge with watermark workflow failed: {error}') + raise error + + +# Example 3: Extract text with JSON output +async def extract_text_workflow(file_path: Path): + print('\nExample 3: Extract text workflow with JSON output') + + try: + result = await client.workflow() \ + .add_file_part(file_path) \ + .output_json({ + 'plainText': True, + 'structuredText': True, + 'keyValuePairs': True, + 'tables': True + }) \ + .execute() + + # Save the result to the output directory + output_path = output_dir / 'workflow-extracted-text.json' + with open(output_path, 'w') as f: + json.dump(result['output']['data'], f, indent=2, default=str) + + print(f'Text extraction workflow successful. 
Output saved to: {output_path}') + return output_path + except Exception as error: + print(f'Text extraction workflow failed: {error}') + raise error + + +# Example 4: Complex multi-step workflow +async def complex_workflow(): + print('\nExample 4: Complex multi-step workflow') + + try: + pdf_path = output_dir / 'workflow-converted-document.pdf' + png_path = assets_dir / 'sample.png' + + result = await client.workflow() \ + .add_file_part(pdf_path) \ + .add_file_part(png_path) \ + .apply_actions([ + BuildActions.watermark_text('DRAFT', { + 'opacity': 0.3, + 'fontSize': 36, + 'fontColor': '#0000FF' + }), + BuildActions.rotate(90) + ]) \ + .output_pdfua({ + 'metadata': { + 'title': 'Complex Workflow Example', + 'author': 'Nutrient DWS Python Client' + } + }) \ + .execute(on_progress= lambda current, total: print(f'Processing step {current} of {total}')) + + # Save the result to the output directory + output_path = output_dir / 'workflow-complex-result.pdf' + with open(output_path, 'wb') as f: + f.write(result['output']['buffer']) + + print(f'Complex workflow successful. 
Output saved to: {output_path}') + return output_path + except Exception as error: + print(f'Complex workflow failed: {error}') + raise error + + +# Run all examples +async def run_examples(): + try: + print('Starting workflow examples...\n') + + # Run the examples in sequence + converted_pdf_path = await basic_conversion_workflow() + await merge_with_watermark_workflow() + await extract_text_workflow(converted_pdf_path) + await complex_workflow() + + print('\nAll workflow examples completed successfully!') + except Exception as error: + print(f'\nWorkflow examples failed: {error}') + + +# Execute the examples +if __name__ == '__main__': + asyncio.run(run_examples()) diff --git a/pixi.toml b/pixi.toml deleted file mode 100644 index 7f04cf1..0000000 --- a/pixi.toml +++ /dev/null @@ -1,30 +0,0 @@ -[project] -name = "nutrient-dws-client-python" -channels = ["conda-forge"] -platforms = ["osx-arm64", "osx-64", "linux-64", "win-64"] - -[dependencies] -python = ">=3.10,<3.13" -requests = ">=2.25.0,<3.0.0" - -[feature.dev.dependencies] -pytest = ">=7.0.0" -pytest-cov = ">=4.0.0" -mypy = ">=1.0.0" -ruff = ">=0.1.0" -types-requests = ">=2.25.0" - -[environments] -default = {features = ["dev"], solve-group = "default"} -dev = {features = ["dev"], solve-group = "default"} - -[tasks] -test = "pytest" -lint = "ruff check ." -format = "ruff format ." -typecheck = "mypy src/" -dev = "python -m pip install -e ." 
- -[pypi-dependencies] -build = ">=1.2.2.post1, <2" -twine = ">=6.1.0, <7" diff --git a/pyproject.toml b/pyproject.toml index 49f452b..fe461e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,9 +8,17 @@ package-dir = {"" = "src"} [tool.setuptools.packages.find] where = ["src"] +[tool.setuptools.package-data] +nutrient_dws = [ + "py.typed", + "../../docs/WORKFLOW.md", + "../../docs/METHODS.md", + "../../LLM_DOC.md", +] + [project] name = "nutrient-dws" -version = "1.0.2" +version = "2.0.0" description = "Python client library for Nutrient Document Web Services API" readme = "README.md" requires-python = ">=3.10" @@ -32,37 +40,40 @@ classifiers = [ "Topic :: Multimedia :: Graphics :: Graphics Conversion", ] dependencies = [ - "requests>=2.25.0,<3.0.0", + "httpx>=0.24.0,<1.0.0", + "aiofiles>=23.0.0,<25.0.0", + "typing_extensions>=4.9.0" ] [project.optional-dependencies] dev = [ "pytest>=7.0.0", + "pytest-asyncio>=0.21.0", "pytest-cov>=4.0.0", "mypy>=1.0.0", "ruff>=0.1.0", - "types-requests>=2.25.0", - "build>=1.0.0", - "twine>=4.0.0", -] -docs = [ - "sphinx>=5.0.0", - "sphinx-rtd-theme>=1.2.0", - "sphinx-autodoc-typehints>=1.22.0", + "types-aiofiles>=24.1.0", + "build>=1.2.2.post1,<2", + "twine>=6.1.0,<7", + "python-dotenv>=1.1.1" ] [project.urls] Homepage = "https://github.com/PSPDFKit/nutrient-dws-client-python" -Documentation = "https://nutrient-dws-client-python.readthedocs.io" +Documentation = "https://github.com/PSPDFKit/nutrient-dws-client-python/blob/main/README.md" Repository = "https://github.com/PSPDFKit/nutrient-dws-client-python" "Bug Tracker" = "https://github.com/PSPDFKit/nutrient-dws-client-python/issues" -[tool.setuptools.package-data] -nutrient_dws = ["py.typed"] +[project.scripts] +dws-add-claude-code-rule = "scripts.add_claude_code_rule:main" +dws-add-cursor-rule = "scripts.add_cursor_rule:main" +dws-add-github-copilot-rule = "scripts.add_github_copilot_rule:main" +dws-add-junie-rule = "scripts.add_junie_rule:main" +dws-add-windsurf-rule = 
"scripts.add_windsurf_rule:main" [tool.ruff] target-version = "py310" -line-length = 100 + [tool.ruff.lint] select = [ @@ -80,17 +91,28 @@ select = [ "RUF", # Ruff-specific rules ] ignore = [ + "E501", # Line too long + "B904", # Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling "D100", # Missing docstring in public module + "D101", # Missing docstring in public class + "D103", # Missing docstring in public function "D104", # Missing docstring in public package "D107", # Missing docstring in __init__ + "D205", # 1 blank line required between summary line and description + "UP007", # Use `X | Y` for type annotations "UP038", # Use `X | Y` in `isinstance` call instead of `(X, Y)` - not supported in Python 3.10 runtime + "UP045", # Use `X | None` for type annotations + "N802", # Function name should be lowercase + "N803", # Argument name should be lowercase + "N815", # Variable in class scope should not be mixedCase + "N811", # Constant imported as non-constant ] [tool.ruff.lint.pydocstyle] convention = "google" [tool.ruff.lint.per-file-ignores] -"tests/*" = ["D", "S101"] # Don't require docstrings in tests, allow asserts +"tests/*" = [] # Don't require docstrings in tests, allow asserts [tool.mypy] python_version = "3.10" @@ -130,4 +152,4 @@ exclude_lines = [ ] [tool.coverage.html] -directory = "htmlcov" \ No newline at end of file +directory = "htmlcov" diff --git a/pytest.ini b/pytest.ini index 0b63f27..9a9cab7 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,4 +3,5 @@ testpaths = tests python_files = test_*.py python_classes = Test* python_functions = test_* -addopts = -ra \ No newline at end of file +addopts = -ra --tb=short +asyncio_mode = auto diff --git a/scripts/build_package.py b/scripts/build_package.py deleted file mode 100755 index c2e7a23..0000000 --- a/scripts/build_package.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -"""Build the package for 
distribution.""" - -import subprocess -import sys -from pathlib import Path - - -def main(): - """Build the package.""" - root_dir = Path(__file__).parent.parent - - # Clean previous builds - print("Cleaning previous builds...") - for dir_name in ["dist", "build", "*.egg-info"]: - subprocess.run(["rm", "-rf", str(root_dir / dir_name)]) - - # Build the package - print("Building package...") - result = subprocess.run( - [sys.executable, "-m", "build"], cwd=root_dir, capture_output=True, text=True - ) - - if result.returncode != 0: - print(f"Build failed:\n{result.stderr}") - return 1 - - print("Build successful!") - print("\nBuilt files:") - dist_dir = root_dir / "dist" - for file in dist_dir.iterdir(): - print(f" - {file.name}") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/generate_api_methods.py b/scripts/generate_api_methods.py deleted file mode 100755 index 68401b3..0000000 --- a/scripts/generate_api_methods.py +++ /dev/null @@ -1,376 +0,0 @@ -#!/usr/bin/env python3 -"""Generate Direct API methods from OpenAPI specification.""" - -import re -from pathlib import Path -from typing import Any - - -def to_snake_case(name: str) -> str: - """Convert string to snake_case.""" - # Handle common patterns - name = name.replace("-", "_") - # Insert underscore before uppercase letters - name = re.sub(r"(? str: - """Convert OpenAPI schema type to Python type hint.""" - if not schema: - return "Any" - - type_mapping = { - "string": "str", - "integer": "int", - "number": "float", - "boolean": "bool", - "array": "List[Any]", - "object": "Dict[str, Any]", - } - - schema_type = schema.get("type", "string") - return type_mapping.get(schema_type, "Any") - - -def create_manual_tools() -> list[dict[str, Any]]: - """Create tool definitions based on the specification documentation. - - Since the Nutrient API uses a build endpoint with actions rather than - individual tool endpoints, we'll create convenience methods that wrap - the build API. 
- """ - tools = [ - { - "tool_name": "convert-to-pdf", - "method_name": "convert_to_pdf", - "summary": "Convert a document to PDF", - "description": "Convert various document formats (DOCX, XLSX, PPTX, etc.) to PDF.", - "parameters": {}, - }, - { - "tool_name": "convert-to-pdfa", - "method_name": "convert_to_pdfa", - "summary": "Convert a document to PDF/A", - "description": "Convert documents to PDF/A format for long-term archiving.", - "parameters": { - "conformance_level": { - "type": "str", - "required": False, - "description": "PDF/A conformance level (e.g., '2b', '3b')", - "default": "2b", - }, - }, - }, - { - "tool_name": "ocr-pdf", - "method_name": "ocr_pdf", - "summary": "Perform OCR on a PDF", - "description": "Apply optical character recognition to make scanned PDFs searchable.", - "parameters": { - "language": { - "type": "str", - "required": False, - "description": "OCR language code (e.g., 'en', 'de', 'fr')", - "default": "en", - }, - }, - }, - { - "tool_name": "rotate-pages", - "method_name": "rotate_pages", - "summary": "Rotate PDF pages", - "description": "Rotate pages in a PDF document.", - "parameters": { - "degrees": { - "type": "int", - "required": True, - "description": "Rotation angle in degrees (90, 180, 270)", - }, - "page_indexes": { - "type": "List[int]", - "required": False, - "description": ( - "List of page indexes to rotate (0-based). " - "If not specified, all pages are rotated." 
- ), - }, - }, - }, - { - "tool_name": "flatten-annotations", - "method_name": "flatten_annotations", - "summary": "Flatten PDF annotations", - "description": "Flatten annotations and form fields in a PDF.", - "parameters": {}, - }, - { - "tool_name": "watermark-pdf", - "method_name": "watermark_pdf", - "summary": "Add watermark to PDF", - "description": "Add text or image watermark to PDF pages.", - "parameters": { - "text": { - "type": "str", - "required": False, - "description": "Watermark text", - }, - "image_url": { - "type": "str", - "required": False, - "description": "URL of watermark image", - }, - "opacity": { - "type": "float", - "required": False, - "description": "Watermark opacity (0.0 to 1.0)", - "default": 0.5, - }, - "position": { - "type": "str", - "required": False, - "description": "Watermark position", - "default": "center", - }, - }, - }, - { - "tool_name": "sign-pdf", - "method_name": "sign_pdf", - "summary": "Digitally sign a PDF", - "description": "Add a digital signature to a PDF document.", - "parameters": { - "certificate_file": { - "type": "FileInput", - "required": True, - "description": "Digital certificate file (P12/PFX format)", - }, - "certificate_password": { - "type": "str", - "required": True, - "description": "Certificate password", - }, - "reason": { - "type": "str", - "required": False, - "description": "Reason for signing", - }, - "location": { - "type": "str", - "required": False, - "description": "Location of signing", - }, - }, - }, - { - "tool_name": "redact-pdf", - "method_name": "redact_pdf", - "summary": "Redact sensitive information from PDF", - "description": "Use AI to automatically redact sensitive information from a PDF.", - "parameters": { - "types": { - "type": "List[str]", - "required": False, - "description": "Types of information to redact (e.g., 'email', 'phone', 'ssn')", - }, - }, - }, - { - "tool_name": "export-pdf-to-office", - "method_name": "export_pdf_to_office", - "summary": "Export PDF to Office 
format", - "description": "Convert PDF to Microsoft Office formats (DOCX, XLSX, PPTX).", - "parameters": { - "format": { - "type": "str", - "required": True, - "description": "Output format ('docx', 'xlsx', 'pptx')", - }, - }, - }, - { - "tool_name": "export-pdf-to-images", - "method_name": "export_pdf_to_images", - "summary": "Export PDF pages as images", - "description": "Convert PDF pages to image files.", - "parameters": { - "format": { - "type": "str", - "required": False, - "description": "Image format ('png', 'jpeg', 'webp')", - "default": "png", - }, - "dpi": { - "type": "int", - "required": False, - "description": "Image resolution in DPI", - "default": 150, - }, - "page_indexes": { - "type": "List[int]", - "required": False, - "description": "List of page indexes to export (0-based)", - }, - }, - }, - ] - - return tools - - -def generate_method_code(tool_info: dict[str, Any]) -> str: - """Generate Python method code for a tool.""" - method_name = tool_info["method_name"] - tool_name = tool_info["tool_name"] - summary = tool_info["summary"] - description = tool_info["description"] - parameters = tool_info["parameters"] - - # Build parameter list - param_list = ["self", "input_file: FileInput"] - param_docs = [] - - # Add required parameters first - for param_name, param_info in parameters.items(): - if param_info["required"]: - param_type = param_info["type"] - # Handle imports for complex types - if param_type == "FileInput": - param_type = "'FileInput'" # Forward reference - param_list.append(f"{param_name}: {param_type}") - param_docs.append(f" {param_name}: {param_info['description']}") - - # Always add output_path - param_list.append("output_path: Optional[str] = None") - - # Add optional parameters - for param_name, param_info in parameters.items(): - if not param_info["required"]: - param_type = param_info["type"] - # Handle List types - base_type = param_type - - default = param_info.get("default") - if default is None: - 
param_list.append(f"{param_name}: Optional[{base_type}] = None") - else: - if isinstance(default, str): - param_list.append(f'{param_name}: {base_type} = "{default}"') - else: - param_list.append(f"{param_name}: {base_type} = {default}") - param_docs.append(f" {param_name}: {param_info['description']}") - - # Build method signature - if len(param_list) > 3: # Multiple parameters - params_str = ",\n ".join(param_list) - method_signature = ( - f" def {method_name}(\n {params_str},\n ) -> Optional[bytes]:" - ) - else: - params_str = ", ".join(param_list) - method_signature = f" def {method_name}({params_str}) -> Optional[bytes]:" - - # Build docstring - docstring_lines = [f' """{summary}'] - if description and description != summary: - docstring_lines.append("") - docstring_lines.append(f" {description}") - - docstring_lines.extend( - [ - "", - " Args:", - " input_file: Input file (path, bytes, or file-like object).", - ] - ) - - if param_docs: - docstring_lines.extend(param_docs) - - docstring_lines.extend( - [ - " output_path: Optional path to save the output file.", - "", - " Returns:", - " Processed file as bytes, or None if output_path is provided.", - "", - " Raises:", - " AuthenticationError: If API key is missing or invalid.", - " APIError: For other API errors.", - ' """', - ] - ) - - # Build method body - method_body = [] - - # Collect kwargs - kwargs_params = [f"{name}={name}" for name in parameters] - - if kwargs_params: - kwargs_str = ", ".join(kwargs_params) - method_body.append( - f' return self._process_file("{tool_name}", input_file, ' - f"output_path, {kwargs_str})" - ) - else: - method_body.append( - f' return self._process_file("{tool_name}", input_file, output_path)' - ) - - # Combine all parts - return "\n".join( - [ - method_signature, - "\n".join(docstring_lines), - "\n".join(method_body), - ] - ) - - -def generate_api_methods(spec_path: Path, output_path: Path) -> None: - """Generate API methods from OpenAPI specification.""" - # For Nutrient 
API, we'll use manually defined tools since they use - # a build endpoint with actions rather than individual endpoints - tools = create_manual_tools() - - # Sort tools by method name - tools.sort(key=lambda t: t["method_name"]) - - # Generate code - code_lines = [ - '"""Direct API methods for individual document processing tools.', - "", - "This file provides convenient methods that wrap the Nutrient Build API", - "for common document processing operations.", - '"""', - "", - "from typing import List, Optional", - "", - "from nutrient_dws.file_handler import FileInput", - "", - "", - "class DirectAPIMixin:", - ' """Mixin class containing Direct API methods.', - " ", - " These methods provide a simplified interface to common document", - " processing operations. They internally use the Build API.", - ' """', - "", - ] - - # Add methods - for tool in tools: - code_lines.append(generate_method_code(tool)) - code_lines.append("") # Empty line between methods - - # Write to file - output_path.write_text("\n".join(code_lines)) - print(f"Generated {len(tools)} API methods in {output_path}") - - -if __name__ == "__main__": - spec_path = Path("openapi_spec.yml") - output_path = Path("src/nutrient/api/direct.py") - - generate_api_methods(spec_path, output_path) diff --git a/src/nutrient_dws/__init__.py b/src/nutrient_dws/__init__.py index 2e260e0..0cab185 100644 --- a/src/nutrient_dws/__init__.py +++ b/src/nutrient_dws/__init__.py @@ -4,22 +4,32 @@ """ from nutrient_dws.client import NutrientClient -from nutrient_dws.exceptions import ( +from nutrient_dws.errors import ( APIError, AuthenticationError, - FileProcessingError, + NetworkError, NutrientError, - NutrientTimeoutError, ValidationError, ) +from nutrient_dws.inputs import ( + is_remote_file_input, + process_file_input, + process_remote_file_input, + validate_file_input, +) +from nutrient_dws.utils import get_library_version, get_user_agent -__version__ = "1.0.2" __all__ = [ "APIError", "AuthenticationError", - 
"FileProcessingError", + "NetworkError", "NutrientClient", "NutrientError", - "NutrientTimeoutError", "ValidationError", + "get_library_version", + "get_user_agent", + "is_remote_file_input", + "process_file_input", + "process_remote_file_input", + "validate_file_input", ] diff --git a/src/nutrient_dws/api/__init__.py b/src/nutrient_dws/api/__init__.py deleted file mode 100644 index 72b9fda..0000000 --- a/src/nutrient_dws/api/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""API module for Nutrient DWS client.""" diff --git a/src/nutrient_dws/api/direct.py b/src/nutrient_dws/api/direct.py deleted file mode 100644 index 690289c..0000000 --- a/src/nutrient_dws/api/direct.py +++ /dev/null @@ -1,1507 +0,0 @@ -"""Direct API methods for supported document processing tools. - -This file provides convenient methods that wrap the Nutrient Build API -for supported document processing operations. -""" - -from typing import TYPE_CHECKING, Any, Protocol - -from nutrient_dws.file_handler import FileInput - -if TYPE_CHECKING: - from nutrient_dws.builder import BuildAPIWrapper - from nutrient_dws.http_client import HTTPClient - - -class HasBuildMethod(Protocol): - """Protocol for objects that have a build method.""" - - def build(self, input_file: FileInput) -> "BuildAPIWrapper": - """Build method signature.""" - ... - - @property - def _http_client(self) -> "HTTPClient": - """HTTP client property.""" - ... - - -class DirectAPIMixin: - """Mixin class containing Direct API methods. - - These methods provide a simplified interface to common document - processing operations. They internally use the Build API. - - Note: The API automatically converts supported document formats - (DOCX, XLSX, PPTX) to PDF when processing. 
- """ - - def _process_file( - self, - tool: str, - input_file: FileInput, - output_path: str | None = None, - **options: Any, - ) -> bytes | None: - """Process file method that will be provided by NutrientClient.""" - raise NotImplementedError("This method is provided by NutrientClient") - - def convert_to_pdf( - self, - input_file: FileInput, - output_path: str | None = None, - ) -> bytes | None: - """Convert a document to PDF. - - Converts Office documents (DOCX, XLSX, PPTX) to PDF format. - This uses the API's implicit conversion - simply uploading a - non-PDF document returns it as a PDF. - - Args: - input_file: Input document (DOCX, XLSX, PPTX, etc). - output_path: Optional path to save the output PDF. - - Returns: - Converted PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors (e.g., unsupported format). - - Note: - HTML files are not currently supported by the API. - """ - # Use builder with no actions - implicit conversion happens - # Type checking: at runtime, self is NutrientClient which has these methods - return self.build(input_file).execute(output_path) # type: ignore[attr-defined,no-any-return] - - def flatten_annotations( - self, input_file: FileInput, output_path: str | None = None - ) -> bytes | None: - """Flatten annotations and form fields in a PDF. - - Converts all annotations and form fields into static page content. - If input is an Office document, it will be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - - Returns: - Processed file as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. 
- """ - return self._process_file("flatten-annotations", input_file, output_path) - - def rotate_pages( - self, - input_file: FileInput, - output_path: str | None = None, - degrees: int = 0, - page_indexes: list[int] | None = None, - ) -> bytes | None: - """Rotate pages in a PDF. - - Rotate all pages or specific pages by the specified degrees. - If input is an Office document, it will be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - degrees: Rotation angle (90, 180, 270, or -90). - page_indexes: Optional list of page indexes to rotate (0-based). - - Returns: - Processed file as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - """ - options = {"degrees": degrees} - if page_indexes is not None: - options["page_indexes"] = page_indexes # type: ignore - return self._process_file("rotate-pages", input_file, output_path, **options) - - def ocr_pdf( - self, - input_file: FileInput, - output_path: str | None = None, - language: str = "english", - ) -> bytes | None: - """Apply OCR to a PDF to make it searchable. - - Performs optical character recognition on the PDF to extract text - and make it searchable. If input is an Office document, it will - be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - language: OCR language. Supported: "english", "eng", "deu", "german". - Default is "english". - - Returns: - Processed file as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. 
- """ - return self._process_file("ocr-pdf", input_file, output_path, language=language) - - def watermark_pdf( - self, - input_file: FileInput, - output_path: str | None = None, - text: str | None = None, - image_url: str | None = None, - image_file: FileInput | None = None, - width: int = 200, - height: int = 100, - opacity: float = 1.0, - position: str = "center", - ) -> bytes | None: - """Add a watermark to a PDF. - - Adds a text or image watermark to all pages of the PDF. - If input is an Office document, it will be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - text: Text to use as watermark. One of text, image_url, or image_file required. - image_url: URL of image to use as watermark. - image_file: Local image file to use as watermark (path, bytes, or file-like object). - Supported formats: PNG, JPEG, TIFF. - width: Width of the watermark in points (required). - height: Height of the watermark in points (required). - opacity: Opacity of the watermark (0.0 to 1.0). - position: Position of watermark. One of: "top-left", "top-center", - "top-right", "center", "bottom-left", "bottom-center", - "bottom-right". - - Returns: - Processed file as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If none of text, image_url, or image_file is provided. 
- """ - if not text and not image_url and not image_file: - raise ValueError("Either text, image_url, or image_file must be provided") - - # For image file uploads, we need to use the builder directly - if image_file: - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output - - # Prepare files for upload - files = {} - - # Main PDF file - file_field, file_data = prepare_file_for_upload(input_file, "file") - files[file_field] = file_data - - # Watermark image file - image_field, image_data = prepare_file_for_upload(image_file, "watermark") - files[image_field] = image_data - - # Build instructions with watermark action - action = { - "type": "watermark", - "width": width, - "height": height, - "opacity": opacity, - "position": position, - "image": "watermark", # Reference to the uploaded image file - } - - instructions = {"parts": [{"file": "file"}], "actions": [action]} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - # For text and URL watermarks, use the existing _process_file approach - options = { - "width": width, - "height": height, - "opacity": opacity, - "position": position, - } - - if text: - options["text"] = text - else: - options["image_url"] = image_url - - return self._process_file("watermark-pdf", input_file, output_path, **options) - - def apply_redactions( - self, - input_file: FileInput, - output_path: str | None = None, - ) -> bytes | None: - """Apply redaction annotations to permanently remove content. - - Applies any redaction annotations in the PDF to permanently remove - the underlying content. If input is an Office document, it will - be converted to PDF first. 
- - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - - Returns: - Processed file as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - """ - return self._process_file("apply-redactions", input_file, output_path) - - def create_redactions_preset( - self, - input_file: FileInput, - preset: str, - output_path: str | None = None, - include_annotations: bool = False, - appearance_fill_color: str | None = None, - appearance_stroke_color: str | None = None, - ) -> bytes | None: - """Create redaction annotations using a preset pattern. - - Creates redaction annotations for common sensitive data patterns - like social security numbers, credit card numbers, etc. - - Args: - input_file: Input PDF file. - preset: Preset pattern to use. Valid options: - - "social-security-number": US Social Security Number - - "credit-card-number": Credit card numbers - - "international-phone-number": International phone numbers - - "north-american-phone-number": North America phone numbers - - "date": Date patterns - - "time": Time patterns - - "us-zip-code": US Zip Code patterns - - "email-address": Email addresses - output_path: Optional path to save the output file. - include_annotations: Include text in annotations (default: False). - appearance_fill_color: Fill color for redaction boxes (hex format). - appearance_stroke_color: Stroke color for redaction boxes (hex format). - - Returns: - PDF with redaction annotations as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - - Note: - This creates redaction annotations but does not apply them. - Use apply_redactions() to permanently remove the content. 
- """ - options = { - "strategy": "preset", - "strategy_options": { - "preset": preset, - "includeAnnotations": include_annotations, - }, - } - - # Add appearance options if provided - content = {} - if appearance_fill_color: - content["fillColor"] = appearance_fill_color - if appearance_stroke_color: - content["outlineColor"] = appearance_stroke_color - - if content: - options["content"] = content - - return self._process_file("create-redactions", input_file, output_path, **options) - - def create_redactions_regex( - self, - input_file: FileInput, - pattern: str, - output_path: str | None = None, - case_sensitive: bool = False, - include_annotations: bool = False, - appearance_fill_color: str | None = None, - appearance_stroke_color: str | None = None, - ) -> bytes | None: - """Create redaction annotations using a regex pattern. - - Creates redaction annotations for text matching a regular expression. - - Args: - input_file: Input PDF file. - pattern: Regular expression pattern to match. - output_path: Optional path to save the output file. - case_sensitive: Whether pattern matching is case-sensitive (default: False). - include_annotations: Include text in annotations (default: False). - include_text: Include regular text content (default: True). - appearance_fill_color: Fill color for redaction boxes (hex format). - appearance_stroke_color: Stroke color for redaction boxes (hex format). - - Returns: - PDF with redaction annotations as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - - Note: - This creates redaction annotations but does not apply them. - Use apply_redactions() to permanently remove the content. 
- """ - options = { - "strategy": "regex", - "strategy_options": { - "regex": pattern, - "caseSensitive": case_sensitive, - "includeAnnotations": include_annotations, - }, - } - - # Add appearance options if provided - content = {} - if appearance_fill_color: - content["fillColor"] = appearance_fill_color - if appearance_stroke_color: - content["outlineColor"] = appearance_stroke_color - - if content: - options["content"] = content - - return self._process_file("create-redactions", input_file, output_path, **options) - - def create_redactions_text( - self, - input_file: FileInput, - text: str, - output_path: str | None = None, - case_sensitive: bool = True, - include_annotations: bool = False, - appearance_fill_color: str | None = None, - appearance_stroke_color: str | None = None, - ) -> bytes | None: - """Create redaction annotations for exact text matches. - - Creates redaction annotations for all occurrences of specific text. - - Args: - input_file: Input PDF file. - text: Exact text to redact. - output_path: Optional path to save the output file. - case_sensitive: Whether text matching is case-sensitive (default: True). - include_annotations: Include text in annotations (default: False). - appearance_fill_color: Fill color for redaction boxes (hex format). - appearance_stroke_color: Stroke color for redaction boxes (hex format). - - Returns: - PDF with redaction annotations as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - - Note: - This creates redaction annotations but does not apply them. - Use apply_redactions() to permanently remove the content. 
- """ - options = { - "strategy": "text", - "strategy_options": { - "text": text, - "caseSensitive": case_sensitive, - "includeAnnotations": include_annotations, - }, - } - - # Add appearance options if provided - content = {} - if appearance_fill_color: - content["fillColor"] = appearance_fill_color - if appearance_stroke_color: - content["outlineColor"] = appearance_stroke_color - - if content: - options["content"] = content - - return self._process_file("create-redactions", input_file, output_path, **options) - - def optimize_pdf( - self, - input_file: FileInput, - output_path: str | None = None, - grayscale_text: bool = False, - grayscale_graphics: bool = False, - grayscale_images: bool = False, - grayscale_form_fields: bool = False, - grayscale_annotations: bool = False, - disable_images: bool = False, - mrc_compression: bool = False, - image_optimization_quality: int | None = 2, - linearize: bool = False, - ) -> bytes | None: - """Optimize a PDF to reduce file size. - - Applies various optimization techniques to reduce the file size of a PDF - while maintaining readability. If input is an Office document, it will - be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - grayscale_text: Convert text to grayscale (default: False). - grayscale_graphics: Convert graphics to grayscale (default: False). - grayscale_images: Convert images to grayscale (default: False). - grayscale_form_fields: Convert form_fields to grayscale (default: False). - grayscale_annotations: Convert annotations to grayscale (default: False). - disable_images: Remove all images from the PDF (default: False). - mrc_compression: MCR compression (default: False). - image_optimization_quality: Image optimization quality from 1 (least optimized) - to 4 (most optimized) (default: 2). - linearize: Linearize (optimize for web viewing) the PDF (default: False). 
- - Returns: - Optimized PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If image_optimization_quality is not between 1-4 - or no optimization is enabled - - Example: - # Aggressive optimization for minimum file size - client.optimize_pdf( - "large_document.pdf", - grayscale_images=True, - image_optimization_quality=4, - output_path="optimized.pdf" - ) - """ - options: dict[str, Any] = {} - - # Add grayscale options - if grayscale_text: - options["grayscale_text"] = True - if grayscale_graphics: - options["grayscale_graphics"] = True - if grayscale_images: - options["grayscale_images"] = True - if grayscale_form_fields: - options["grayscale_form_fields"] = True - if grayscale_annotations: - options["grayscale_annotations"] = True - - # Add MCR compression - if mrc_compression: - options["mrc_compression"] = True - - # Add image options - if disable_images: - options["disable_images"] = True - if image_optimization_quality is not None: - if not 1 <= image_optimization_quality <= 4: - raise ValueError("image_optimization_quality must be between 1 and 4") - options["image_optimization_quality"] = image_optimization_quality - - # Add linearization - if linearize: - options["linearize"] = True - - # Build using the Builder API with output options - builder = self.build(input_file) # type: ignore[attr-defined] - - # Apply optimization via output options - if options: - # If there are specific options, set optimize to the options dict - builder.set_output_options(optimize=options) - else: - # If no options, raise error - raise ValueError("No optimization is enabled") - return builder.execute(output_path) # type: ignore[no-any-return] - - def password_protect_pdf( - self, - input_file: FileInput, - output_path: str | None = None, - user_password: str | None = None, - owner_password: str | None = None, - permissions: list[str] | None = None, - ) -> bytes | 
None: - """Add password protection and permissions to a PDF. - - Secures a PDF with password protection and optional permission restrictions. - If input is an Office document, it will be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - user_password: Password required to open the document. - owner_password: Password required to change permissions/security settings. - If not provided, uses user_password. - permissions: Array of permission strings. Available permissions: - - "printing": Allow printing - - "modification": Allow document modification - - "extract": Allow content extraction - - "annotations_and_forms": Allow adding annotations - - "fill_forms": Allow filling forms - - "extract_accessibility": Allow accessibility features - - "assemble": Allow document assembly - - "print_high_quality": Allow high-quality printing - - Returns: - Protected PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If neither user_password nor owner_password is provided. 
- - Example: - # Protect with view-only permissions (only allowing extract_accessibility) - client.password_protect_pdf( - "sensitive.pdf", - user_password="view123", - owner_password="admin456", - permissions=["extract_accessibility"], - output_path="protected.pdf" - ) - """ - if not user_password and not owner_password: - raise ValueError("At least one of user_password or owner_password must be provided") - - # Build using the Builder API with output options - builder = self.build(input_file) # type: ignore[attr-defined] - - # Set up password options with camelCase for API - password_options: dict[str, Any] = {} - if user_password: - password_options["userPassword"] = user_password - if owner_password: - password_options["ownerPassword"] = owner_password - else: - # If no owner password provided, use user password - password_options["ownerPassword"] = user_password - - # Set up permissions if provided - if permissions: - password_options["permissions"] = permissions - - # Apply password protection via output options - builder.set_output_options(**password_options) - return builder.execute(output_path) # type: ignore[no-any-return] - - def set_pdf_metadata( - self, - input_file: FileInput, - output_path: str | None = None, - title: str | None = None, - author: str | None = None, - ) -> bytes | None: - """Set metadata properties of a PDF. - - Updates the metadata/document properties of a PDF file. - If input is an Office document, it will be converted to PDF first. - Only title and author metadata fields are supported. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - title: Document title. - author: Document author. - - Returns: - PDF with updated metadata as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If no metadata fields are provided. 
- - Example: - client.set_pdf_metadata( - "document.pdf", - title="Annual Report 2024", - author="John Doe", - output_path="document_with_metadata.pdf" - ) - """ - metadata = {} - if title is not None: - metadata["title"] = title - if author is not None: - metadata["author"] = author - - if not metadata: - raise ValueError("At least one metadata field must be provided") - - # Build using the Builder API with output options - builder = self.build(input_file) # type: ignore[attr-defined] - builder.set_output_options(metadata=metadata) - return builder.execute(output_path) # type: ignore[no-any-return] - - def split_pdf( - self, - input_file: FileInput, - page_ranges: list[dict[str, int]] | None = None, - output_paths: list[str] | None = None, - ) -> list[bytes]: - """Split a PDF into multiple documents by page ranges. - - Splits a PDF into multiple files based on specified page ranges. - Each range creates a separate output file. - - Args: - input_file: Input PDF file. - page_ranges: List of page range dictionaries. Each dict can contain: - - 'start': Starting page index (0-based, inclusive) - - 'end': Ending page index (0-based, inclusive) - - If not provided, splits into individual pages - output_paths: Optional list of paths to save output files. - Must match length of page_ranges if provided. - - Returns: - List of PDF bytes for each split, or empty list if output_paths provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If page_ranges and output_paths length mismatch. 
- - Examples: - # Split into individual pages - pages = client.split_pdf("document.pdf") - - # Split by custom ranges - parts = client.split_pdf( - "document.pdf", - page_ranges=[ - {"start": 0, "end": 4}, # Pages 1-5 - {"start": 5, "end": 9}, # Pages 6-10 - {"start": 10} # Pages 11 to end - ] - ) - - # Save to specific files - client.split_pdf( - "document.pdf", - page_ranges=[{"start": 0, "end": 1}, {"start": 2}], - output_paths=["part1.pdf", "part2.pdf"] - ) - """ - from nutrient_dws.file_handler import ( - get_pdf_page_count, - prepare_file_for_upload, - save_file_output, - ) - - # Validate inputs - if not page_ranges: - # Default behavior: extract first page only - page_ranges = [{"start": 0, "end": 0}] - - if len(page_ranges) > 50: - raise ValueError("Maximum 50 page ranges allowed") - - if output_paths and len(output_paths) != len(page_ranges): - raise ValueError("output_paths length must match page_ranges length") - - # Get total number of pages to validate ranges - num_of_pages = get_pdf_page_count(input_file) - - # Validate and adjust page ranges - for i, page_range in enumerate(page_ranges): - start = page_range.get("start", 0) - - # Validate start is within document bounds - if start < 0 or start >= num_of_pages: - raise ValueError( - f"Page range {i}: start index {start} is out of bounds (0-{num_of_pages - 1})" - ) - - # If end is specified, validate it's within document bounds - if "end" in page_range: - end = page_range["end"] - if end < 0 or end >= num_of_pages: - raise ValueError( - f"Page range {i}: end index {end} is out of bounds (0-{num_of_pages - 1})" - ) - if end < start: - raise ValueError( - f"Page range {i}: end index {end} cannot be less than start index {start}" - ) - - results = [] - - # Process each page range as a separate API call - for i, page_range in enumerate(page_ranges): - # Prepare file for upload - file_field, file_data = prepare_file_for_upload(input_file, "file") - files = {file_field: file_data} - - # Build instructions 
for page extraction - instructions = {"parts": [{"file": "file", "pages": page_range}], "actions": []} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_paths and i < len(output_paths): - save_file_output(result, output_paths[i]) - else: - results.append(result) # type: ignore[arg-type] - - return results if not output_paths else [] - - def duplicate_pdf_pages( - self, - input_file: FileInput, - page_indexes: list[int], - output_path: str | None = None, - ) -> bytes | None: - """Duplicate specific pages within a PDF document. - - Creates a new PDF containing the specified pages in the order provided. - Pages can be duplicated multiple times by including their index multiple times. - - Args: - input_file: Input PDF file. - page_indexes: List of page indexes to include (0-based). - Pages can be repeated to create duplicates. - Negative indexes are supported (-1 for last page). - output_path: Optional path to save the output file. - - Returns: - Processed PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If page_indexes is empty. 
- - Examples: - # Duplicate first page twice, then include second page - result = client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[0, 0, 1] # Page 1, Page 1, Page 2 - ) - - # Include last page at beginning and end - result = client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[-1, 0, 1, 2, -1] # Last, First, Second, Third, Last - ) - - # Save to specific file - client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[0, 2, 1], # Reorder: Page 1, Page 3, Page 2 - output_path="reordered.pdf" - ) - """ - from nutrient_dws.file_handler import ( - get_pdf_page_count, - prepare_file_for_upload, - save_file_output, - ) - - # Validate inputs - if not page_indexes: - raise ValueError("page_indexes cannot be empty") - - # Prepare file for upload - file_field, file_data = prepare_file_for_upload(input_file, "file") - files = {file_field: file_data} - - # Get total number of pages to validate indexes - num_of_pages = get_pdf_page_count(input_file) - - # Build parts for each page index - parts = [] - for page_index in page_indexes: - if page_index < 0: - # For negative indexes, use the index directly (API supports negative indexes) - # No validation for negative indexes as they're handled by the API - parts.append({"file": "file", "pages": {"start": page_index, "end": page_index}}) - else: - # Validate positive indexes are within bounds - if page_index >= num_of_pages: - raise ValueError( - f"Page index {page_index} is out of bounds (0-{num_of_pages - 1})" - ) - # For positive indexes, create single-page range - parts.append({"file": "file", "pages": {"start": page_index, "end": page_index}}) - - # Build instructions for duplication - instructions = {"parts": parts, "actions": []} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - 
save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def delete_pdf_pages( - self, - input_file: FileInput, - page_indexes: list[int], - output_path: str | None = None, - ) -> bytes | None: - """Delete specific pages from a PDF document. - - Creates a new PDF with the specified pages removed. The API approach - works by selecting all pages except those to be deleted. - - Args: - input_file: Input PDF file. - page_indexes: List of page indexes to delete (0-based). 0 = first page. - Must be unique, sorted in ascending order. - Negative indexes are NOT supported. - output_path: Optional path to save the output file. - - Returns: - Processed PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If page_indexes is empty or contains negative indexes. - - Examples: - # Delete first and last pages (Note: negative indexes not supported) - result = client.delete_pdf_pages( - "document.pdf", - page_indexes=[0, 2] # Delete pages 1 and 3 - ) - - # Delete specific pages (2nd and 4th pages) - result = client.delete_pdf_pages( - "document.pdf", - page_indexes=[1, 3] # 0-based indexing - ) - - # Save to specific file - client.delete_pdf_pages( - "document.pdf", - page_indexes=[2, 4, 5], - output_path="pages_deleted.pdf" - ) - """ - from nutrient_dws.file_handler import ( - get_pdf_page_count, - prepare_file_for_upload, - save_file_output, - ) - - # Validate inputs - if not page_indexes: - raise ValueError("page_indexes cannot be empty") - - # Check for negative indexes - if any(idx < 0 for idx in page_indexes): - negative_indexes = [idx for idx in page_indexes if idx < 0] - raise ValueError( - f"Negative page indexes not yet supported for deletion: {negative_indexes}" - ) - - # Get total number of pages to validate indexes - num_of_pages = get_pdf_page_count(input_file) - - # Validate page indexes are within bounds 
- for idx in page_indexes: - if idx >= num_of_pages: - raise ValueError(f"Page index {idx} is out of bounds (0-{num_of_pages - 1})") - - # Prepare file for upload - file_field, file_data = prepare_file_for_upload(input_file, "file") - files = {file_field: file_data} - - # Sort page indexes to handle ranges efficiently - sorted_indexes = sorted(set(page_indexes)) # Remove duplicates and sort - - # Build parts for pages to keep (excluding the ones to delete) - # We need to create ranges that exclude the deleted pages - parts = [] - - # Start from page 0 - current_page = 0 - - for delete_index in sorted_indexes: - # Add range from current_page to delete_index-1 (inclusive) - if current_page < delete_index: - parts.append( - {"file": "file", "pages": {"start": current_page, "end": delete_index - 1}} - ) - - # Skip the deleted page - current_page = delete_index + 1 - - # Add remaining pages after the last deleted page - num_of_pages = get_pdf_page_count(input_file) - if ( - current_page > 0 or (current_page == 0 and len(sorted_indexes) == 0) - ) and current_page < num_of_pages: - # Add all remaining pages from current_page onwards - parts.append({"file": "file", "pages": {"start": current_page}}) - - # If no parts, it means we're trying to delete all pages - if not parts: - raise ValueError("Cannot delete all pages from document") - - # Build instructions for deletion (keeping non-deleted pages) - instructions = {"parts": parts, "actions": []} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def merge_pdfs( - self, - input_files: list[FileInput], - output_path: str | None = None, - ) -> bytes | None: - """Merge multiple PDF files into one. 
- - Combines multiple files into a single PDF in the order provided. - Office documents (DOCX, XLSX, PPTX) will be automatically converted - to PDF before merging. - - Args: - input_files: List of input files (PDFs or Office documents). - output_path: Optional path to save the output file. - - Returns: - Merged PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If less than 2 files provided. - - Example: - # Merge PDFs and Office documents - client.merge_pdfs([ - "document1.pdf", - "document2.docx", - "spreadsheet.xlsx" - ], "merged.pdf") - """ - if len(input_files) < 2: - raise ValueError("At least 2 files required for merge") - - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output - - # Prepare files for upload - files = {} - parts = [] - - for i, file in enumerate(input_files): - field_name = f"file{i}" - file_field, file_data = prepare_file_for_upload(file, field_name) - files[file_field] = file_data - parts.append({"file": field_name}) - - # Build instructions for merge (no actions needed) - instructions = {"parts": parts, "actions": []} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def add_page( - self, - input_file: FileInput, - insert_index: int, - page_count: int = 1, - page_size: str = "A4", - orientation: str = "portrait", - output_path: str | None = None, - ) -> bytes | None: - """Add blank pages to a PDF document. - - Inserts blank pages at the specified insertion index in the document. - - Args: - input_file: Input PDF file. - insert_index: Position to insert pages (0-based insertion index). 
- 0 = insert before first page (at beginning) - 1 = insert before second page (after first page) - -1 = insert after last page (at end) - page_count: Number of blank pages to add (default: 1). - page_size: Page size for new pages. Common values: "A4", "Letter", - "Legal", "A3", "A5" (default: "A4"). - orientation: Page orientation. Either "portrait" or "landscape" - (default: "portrait"). - output_path: Optional path to save the output file. - - Returns: - Processed PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If page_count is less than 1 or if insert_index is - a negative number other than -1. - - Examples: - # Add a single blank page at the beginning - result = client.add_page("document.pdf", insert_index=0) - - # Add multiple pages at the end - result = client.add_page( - "document.pdf", - insert_index=-1, # Insert at end - page_count=3, - page_size="Letter", - orientation="landscape" - ) - - # Add pages before third page and save to file - client.add_page( - "document.pdf", - insert_index=2, # Insert before third page - page_count=2, - output_path="with_blank_pages.pdf" - ) - """ - from nutrient_dws.file_handler import ( - get_pdf_page_count, - prepare_file_for_upload, - save_file_output, - ) - - # Validate inputs - if page_count < 1: - raise ValueError("page_count must be at least 1") - if page_count > 100: - raise ValueError("page_count cannot exceed 100 pages") - if insert_index < -1: - raise ValueError("insert_index must be -1 (for end) or a non-negative insertion index") - - # Get total number of pages to validate insert_index - if insert_index >= 0: # Skip validation for -1 (end) - num_of_pages = get_pdf_page_count(input_file) - if insert_index > num_of_pages: - raise ValueError(f"insert_index {insert_index} is out of bounds (0-{num_of_pages})") - - # Prepare file for upload - file_field, file_data = prepare_file_for_upload(input_file, 
"file") - files = {file_field: file_data} - - # Build parts array - parts: list[dict[str, Any]] = [] - - # Create new page part - new_page_part = { - "page": "new", - "pageCount": page_count, - "layout": { - "size": page_size, - "orientation": orientation, - }, - } - - if insert_index == -1: - # Insert at end: add all original pages first, then new pages - parts.append({"file": "file"}) - parts.append(new_page_part) - elif insert_index == 0: - # Insert at beginning: add new pages first, then all original pages - parts.append(new_page_part) - parts.append({"file": "file"}) - else: - # Insert at specific position: split original document - # Add pages from start up to insertion point (0 to insert_index-1) - parts.append({"file": "file", "pages": {"start": 0, "end": insert_index - 1}}) - - # Add new blank pages - parts.append(new_page_part) - - # Add remaining pages from insertion point to end - parts.append({"file": "file", "pages": {"start": insert_index}}) - - # Build instructions for adding pages - instructions = {"parts": parts, "actions": []} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def apply_instant_json( - self, - input_file: FileInput, - instant_json: FileInput | str, - output_path: str | None = None, - ) -> bytes | None: - """Apply Nutrient Instant JSON annotations to a PDF. - - Applies annotations from a Nutrient Instant JSON file or URL to a PDF. - This allows importing annotations exported from Nutrient SDK or other - compatible sources. - - Args: - input_file: Input PDF file. - instant_json: Instant JSON data as file path, bytes, file object, or URL. - output_path: Optional path to save the output file. 
- - Returns: - PDF with applied annotations as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - - Example: - # Apply annotations from file - client.apply_instant_json( - "document.pdf", - "annotations.json", - output_path="annotated.pdf" - ) - - # Apply annotations from URL - client.apply_instant_json( - "document.pdf", - "https://example.com/annotations.json", - output_path="annotated.pdf" - ) - """ - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output - - # Check if instant_json is a URL - if isinstance(instant_json, str) and ( - instant_json.startswith("http://") or instant_json.startswith("https://") - ): - # Use URL approach - action = { - "type": "applyInstantJson", - "file": {"url": instant_json}, - } - - # Prepare the PDF file - files = {} - file_field, file_data = prepare_file_for_upload(input_file, "file") - files[file_field] = file_data - - instructions = {"parts": [{"file": file_field}], "actions": [action]} - else: - # It's a file input - need to upload both files - files = {} - - # Main PDF file - file_field, file_data = prepare_file_for_upload(input_file, "file") - files[file_field] = file_data - - # Instant JSON file - json_field, json_data = prepare_file_for_upload(instant_json, "instant_json") - files[json_field] = json_data - - # Build instructions with applyInstantJson action - action = { - "type": "applyInstantJson", - "file": json_field, # Reference to the uploaded file - } - - instructions = {"parts": [{"file": file_field}], "actions": [action]} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def 
apply_xfdf( - self, - input_file: FileInput, - xfdf: FileInput | str, - output_path: str | None = None, - ) -> bytes | None: - """Apply XFDF annotations to a PDF. - - Applies annotations from an XFDF (XML Forms Data Format) file or URL - to a PDF. XFDF is a standard format for exchanging PDF annotations. - - Args: - input_file: Input PDF file. - xfdf: XFDF data as file path, bytes, file object, or URL. - output_path: Optional path to save the output file. - - Returns: - PDF with applied annotations as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - - Example: - # Apply annotations from file - client.apply_xfdf( - "document.pdf", - "annotations.xfdf", - output_path="annotated.pdf" - ) - - # Apply annotations from URL - client.apply_xfdf( - "document.pdf", - "https://example.com/annotations.xfdf", - output_path="annotated.pdf" - ) - """ - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output - - # Check if xfdf is a URL - if isinstance(xfdf, str) and (xfdf.startswith("http://") or xfdf.startswith("https://")): - # Use URL approach - action = { - "type": "applyXfdf", - "file": {"url": xfdf}, - } - - # Prepare the PDF file - files = {} - file_field, file_data = prepare_file_for_upload(input_file, "file") - files[file_field] = file_data - - instructions = {"parts": [{"file": file_field}], "actions": [action]} - else: - # It's a file input - need to upload both files - files = {} - - # Main PDF file - file_field, file_data = prepare_file_for_upload(input_file, "file") - files[file_field] = file_data - - # XFDF file - xfdf_field, xfdf_data = prepare_file_for_upload(xfdf, "xfdf") - files[xfdf_field] = xfdf_data - - # Build instructions with applyXfdf action - action = { - "type": "applyXfdf", - "file": xfdf_field, # Reference to the uploaded file - } - - instructions = {"parts": [{"file": file_field}], "actions": [action]} - - # Make API request - # 
Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def set_page_label( - self, - input_file: FileInput, - labels: list[dict[str, Any]], - output_path: str | None = None, - ) -> bytes | None: - """Set labels for specific pages in a PDF. - - Assigns custom labels/numbering to specific page ranges in a PDF document. - Each label configuration specifies a page range and the label text to apply. - - Args: - input_file: Input PDF file. - labels: List of label configurations. Each dict must contain: - - 'pages': Page range dict with 'start' (required) and optionally 'end' - - 'label': String label to apply to those pages - Page ranges use 0-based indexing where 'end' is inclusive. - output_path: Optional path to save the output file. - - Returns: - Processed PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If labels list is empty or contains invalid configurations. 
- - Examples: - # Set labels for different page ranges - client.set_page_label( - "document.pdf", - labels=[ - {"pages": {"start": 0, "end": 2}, "label": "Introduction"}, - {"pages": {"start": 3, "end": 9}, "label": "Chapter 1"}, - {"pages": {"start": 10}, "label": "Appendix"} - ], - output_path="labeled_document.pdf" - ) - - # Set label for single page - client.set_page_label( - "document.pdf", - labels=[{"pages": {"start": 0, "end": 0}, "label": "Cover Page"}] - ) - """ - from nutrient_dws.file_handler import ( - get_pdf_page_count, - prepare_file_for_upload, - save_file_output, - ) - - # Validate inputs - if not labels: - raise ValueError("labels list cannot be empty") - - # Get total number of pages to validate ranges - num_of_pages = get_pdf_page_count(input_file) - - # Normalize labels to ensure proper format - normalized_labels = [] - for i, label_config in enumerate(labels): - if not isinstance(label_config, dict): - raise ValueError(f"Label configuration {i} must be a dictionary") - - if "pages" not in label_config: - raise ValueError(f"Label configuration {i} missing required 'pages' key") - - if "label" not in label_config: - raise ValueError(f"Label configuration {i} missing required 'label' key") - - pages = label_config["pages"] - if not isinstance(pages, dict) or "start" not in pages: - raise ValueError(f"Label configuration {i} 'pages' must be a dict with 'start' key") - - # Validate start is within document bounds - start = pages["start"] - if start < 0 or start >= num_of_pages: - raise ValueError( - f"Label configuration {i}: start index {start}" - f" is out of bounds (0-{num_of_pages - 1})" - ) - - # Normalize pages - only include 'end' if explicitly provided - normalized_pages = {"start": start} - if "end" in pages: - end = pages["end"] - # Validate end is within document bounds - if end < 0 or end >= num_of_pages: - raise ValueError( - f"Label configuration {i}: end index {end}" - f" is out of bounds (0-{num_of_pages - 1})" - ) - # Validate end 
is not less than start - if end < start: - raise ValueError( - f"Label configuration {i}: end index {end}" - f" cannot be less than start index {start}" - ) - normalized_pages["end"] = end - # If no end is specified, leave it out (meaning "to end of document") - - normalized_labels.append({"pages": normalized_pages, "label": label_config["label"]}) - - # Prepare file for upload - file_field, file_data = prepare_file_for_upload(input_file, "file") - files = {file_field: file_data} - - # Build instructions with page labels in output configuration - instructions = { - "parts": [{"file": "file"}], - "actions": [], - "output": {"labels": normalized_labels}, - } - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] diff --git a/src/nutrient_dws/builder.py b/src/nutrient_dws/builder.py deleted file mode 100644 index bdada1f..0000000 --- a/src/nutrient_dws/builder.py +++ /dev/null @@ -1,268 +0,0 @@ -"""Builder API implementation for multi-step workflows.""" - -from typing import Any - -from nutrient_dws.file_handler import FileInput, prepare_file_for_upload, save_file_output - - -class BuildAPIWrapper: - r"""Builder pattern implementation for chaining document operations. - - This class provides a fluent interface for building complex document - processing workflows using the Nutrient Build API. - - Example: - >>> client.build(input_file="document.pdf") \\ - ... .add_step(tool="rotate-pages", options={"degrees": 90}) \\ - ... .add_step(tool="ocr-pdf", options={"language": "en"}) \\ - ... .add_step(tool="watermark-pdf", options={"text": "CONFIDENTIAL"}) \\ - ... 
.execute(output_path="processed.pdf") - """ - - def __init__(self, client: Any, input_file: FileInput) -> None: - """Initialize builder with client and input file. - - Args: - client: NutrientClient instance. - input_file: Input file to process. - """ - self._client = client - self._input_file = input_file - self._parts: list[dict[str, Any]] = [{"file": "file"}] # Main file - self._files: dict[str, FileInput] = {"file": input_file} # Track files - self._actions: list[dict[str, Any]] = [] - self._output_options: dict[str, Any] = {} - - def _add_file_part(self, file: FileInput, name: str) -> None: - """Add an additional file part for operations like merge. - - Args: - file: File to add. - name: Name for the file part. - """ - self._parts.append({"file": name}) - self._files[name] = file - - def add_step(self, tool: str, options: dict[str, Any] | None = None) -> "BuildAPIWrapper": - """Add a processing step to the workflow. - - Args: - tool: Tool identifier (e.g., 'rotate-pages', 'ocr-pdf'). - options: Optional parameters for the tool. - - Returns: - Self for method chaining. - - Example: - >>> builder.add_step(tool="rotate-pages", options={"degrees": 180}) - """ - action = self._map_tool_to_action(tool, options or {}) - self._actions.append(action) - return self - - def set_output_options(self, **options: Any) -> "BuildAPIWrapper": - """Set output options for the final document. - - Args: - **options: Output options (e.g., metadata, optimization). - - Returns: - Self for method chaining. - - Example: - >>> builder.set_output_options( - ... metadata={"title": "My Document", "author": "John Doe"}, - ... optimize=True - ... ) - """ - self._output_options.update(options) - return self - - def set_page_labels(self, labels: list[dict[str, Any]]) -> "BuildAPIWrapper": - """Set page labels for the final document. - - Assigns custom labels/numbering to specific page ranges in the output PDF. - - Args: - labels: List of label configurations. 
Each dict must contain: - - 'pages': Page range dict with 'start' (required) and optionally 'end' - - 'label': String label to apply to those pages - Page ranges use 0-based indexing where 'end' is inclusive. - - Returns: - Self for method chaining. - - Example: - >>> builder.set_page_labels([ - ... {"pages": {"start": 0, "end": 2}, "label": "Introduction"}, - ... {"pages": {"start": 3, "end": 9}, "label": "Chapter 1"}, - ... {"pages": {"start": 10}, "label": "Appendix"} - ... ]) - """ - self._output_options["labels"] = labels - return self - - def execute(self, output_path: str | None = None) -> bytes | None: - """Execute the workflow. - - Args: - output_path: Optional path to save the output file. - - Returns: - Processed file bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - """ - # Prepare the build instructions - instructions = self._build_instructions() - - # Prepare files for upload - files = {} - for name, file in self._files.items(): - file_field, file_data = prepare_file_for_upload(file, name) - files[file_field] = file_data - - # Make API request - result = self._client._http_client.post( - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def _build_instructions(self) -> dict[str, Any]: - """Build the instructions payload for the API. - - Returns: - Instructions dictionary for the Build API. - """ - instructions = { - "parts": self._parts, - "actions": self._actions, - } - - # Add output options if specified - if self._output_options: - instructions["output"] = self._output_options # type: ignore - - return instructions - - def _map_tool_to_action(self, tool: str, options: dict[str, Any]) -> dict[str, Any]: - """Map tool name and options to Build API action format. - - Args: - tool: Tool identifier. 
- options: Tool options. - - Returns: - Action dictionary for the Build API. - """ - # Map tool names to action types - tool_mapping = { - "rotate-pages": "rotate", - "ocr-pdf": "ocr", - "watermark-pdf": "watermark", - "flatten-annotations": "flatten", - "apply-instant-json": "applyInstantJson", - "apply-xfdf": "applyXfdf", - "create-redactions": "createRedactions", - "apply-redactions": "applyRedactions", - } - - action_type = tool_mapping.get(tool, tool) - - # Build action dictionary - action = {"type": action_type} - - # Handle special cases for different action types using pattern matching - match action_type: - case "rotate": - action["rotateBy"] = options.get("degrees", 0) - if "page_indexes" in options: - action["pageIndexes"] = options["page_indexes"] - - case "ocr": - if "language" in options: - # Map common language codes to API format - lang_map = { - "en": "english", - "de": "deu", - "eng": "eng", - "deu": "deu", - "german": "deu", - } - lang = options["language"] - action["language"] = lang_map.get(lang, lang) - - case "watermark": - # Watermark requires width/height - action["width"] = options.get("width", 200) # Default width - action["height"] = options.get("height", 100) # Default height - - if "text" in options: - action["text"] = options["text"] - elif "image_url" in options: - action["image"] = {"url": options["image_url"]} # type: ignore - elif "image_file" in options: - # Handle image file upload - image_file = options["image_file"] - # Add the image as a file part - watermark_name = f"watermark_{len(self._files)}" - self._files[watermark_name] = image_file - # Reference the uploaded file - action["image"] = watermark_name # type: ignore - else: - # Default to text watermark if neither specified - action["text"] = "WATERMARK" - - if "opacity" in options: - action["opacity"] = options["opacity"] - if "position" in options: - action["position"] = options["position"] - - case "createRedactions": - # Handle create redactions - pass through 
directly - # The direct.py already formats everything correctly - if "strategy" in options: - action["strategy"] = options["strategy"] - if "strategy_options" in options: - action["strategyOptions"] = options["strategy_options"] - if "content" in options: - action["content"] = options["content"] - - case "optimize": - # Handle optimize action with camelCase conversion - for key, value in options.items(): - # Convert snake_case to camelCase for API - camel_key = "".join( - word.capitalize() if i else word for i, word in enumerate(key.split("_")) - ) - action[camel_key] = value - - case _: - # For other actions, pass options directly - action.update(options) - - return action - - def __str__(self) -> str: - """String representation of the build workflow.""" - steps = [f"{action['type']}" for action in self._actions] - return f"BuildAPIWrapper(steps={steps})" - - def __repr__(self) -> str: - """Detailed representation of the build workflow.""" - return ( - f"BuildAPIWrapper(" - f"input_file={self._input_file!r}, " - f"actions={self._actions!r}, " - f"output_options={self._output_options!r})" - ) diff --git a/tests/integration/__init__.py b/src/nutrient_dws/builder/__init__.py similarity index 100% rename from tests/integration/__init__.py rename to src/nutrient_dws/builder/__init__.py diff --git a/src/nutrient_dws/builder/base_builder.py b/src/nutrient_dws/builder/base_builder.py new file mode 100644 index 0000000..f09ff96 --- /dev/null +++ b/src/nutrient_dws/builder/base_builder.py @@ -0,0 +1,64 @@ +"""Base builder class that all builders extend from.""" + +from abc import ABC, abstractmethod +from typing import Literal, Union, overload + +from nutrient_dws.builder.staged_builders import ( + TypedWorkflowResult, +) +from nutrient_dws.errors import ValidationError +from nutrient_dws.http import ( + AnalyzeBuildRequestData, + BuildRequestData, + NutrientClientOptions, + RequestConfig, + is_post_analyse_build_request_config, + is_post_build_request_config, + 
send_request, +) +from nutrient_dws.types.analyze_response import AnalyzeBuildResponse +from nutrient_dws.types.build_response_json import BuildResponseJsonContents + + +class BaseBuilder(ABC): + """Base builder class that all builders extend from. + Provides common functionality for API interaction. + """ + + def __init__(self, client_options: NutrientClientOptions) -> None: + self.client_options = client_options + + @overload + async def _send_request( + self, path: Literal["/build"], options: BuildRequestData + ) -> Union[BuildResponseJsonContents, bytes, str]: ... + + @overload + async def _send_request( + self, path: Literal["/analyze_build"], options: AnalyzeBuildRequestData + ) -> AnalyzeBuildResponse: ... + + async def _send_request( + self, + path: Literal["/build", "/analyze_build"], + options: BuildRequestData | AnalyzeBuildRequestData, + ) -> Union[BuildResponseJsonContents, bytes, str, AnalyzeBuildResponse]: + """Sends a request to the API.""" + config = RequestConfig(endpoint=path, method="POST", data=options, headers=None) + + if is_post_build_request_config(config): + response = await send_request(config, self.client_options) + return response["data"] + + if is_post_analyse_build_request_config(config): + analyze_response = await send_request(config, self.client_options) + return analyze_response["data"] + + raise ValidationError( + "Invalid _send_request args", {"path": path, "options": options} + ) + + @abstractmethod + async def execute(self) -> TypedWorkflowResult: + """Abstract method that child classes must implement for execution.""" + pass diff --git a/src/nutrient_dws/builder/builder.py b/src/nutrient_dws/builder/builder.py new file mode 100644 index 0000000..5497e88 --- /dev/null +++ b/src/nutrient_dws/builder/builder.py @@ -0,0 +1,616 @@ +"""Staged workflow builder that provides compile-time safety through Python's type system.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypeGuard, cast + +from 
nutrient_dws.builder.base_builder import BaseBuilder +from nutrient_dws.builder.constant import ActionWithFileInput, BuildOutputs +from nutrient_dws.builder.staged_builders import ( + ApplicableAction, + BufferOutput, + ContentOutput, + JsonContentOutput, + TypedWorkflowResult, + WorkflowDryRunResult, + WorkflowError, + WorkflowExecuteCallback, + WorkflowWithActionsStage, + WorkflowWithOutputStage, + WorkflowWithPartsStage, +) +from nutrient_dws.errors import ValidationError +from nutrient_dws.http import ( + AnalyzeBuildRequestData, + BuildRequestData, + NutrientClientOptions, +) +from nutrient_dws.inputs import ( + FileInput, + NormalizedFileData, + is_remote_file_input, + process_file_input, + validate_file_input, +) +from nutrient_dws.types.file_handle import FileHandle, RemoteFileHandle + +if TYPE_CHECKING: + from nutrient_dws.types.build_actions import BuildAction + from nutrient_dws.types.build_instruction import BuildInstructions + from nutrient_dws.types.build_output import ( + BuildOutput, + ImageOutputOptions, + JSONContentOutputOptions, + PDFAOutputOptions, + PDFOutput, + PDFOutputOptions, + PDFUAOutputOptions, + ) + from nutrient_dws.types.build_response_json import BuildResponseJsonContents + from nutrient_dws.types.input_parts import ( + DocumentPart, + DocumentPartOptions, + FilePart, + FilePartOptions, + HTMLPart, + HTMLPartOptions, + NewPagePart, + NewPagePartOptions, + ) + + +class StagedWorkflowBuilder( + BaseBuilder, + WorkflowWithPartsStage, + WorkflowWithOutputStage, +): + """Staged workflow builder that provides compile-time safety through Python's type system. + This builder ensures methods are only available at appropriate stages of the workflow. + """ + + def __init__(self, client_options: NutrientClientOptions) -> None: + """Initialize the staged workflow builder. 
+ + Args: + client_options: Client configuration options + """ + super().__init__(client_options) + self.build_instructions: BuildInstructions = {"parts": []} + self.assets: dict[str, FileInput] = {} + self.asset_index = 0 + self.current_step = 0 + self.is_executed = False + + def _register_asset(self, asset: FileInput) -> str: + """Register an asset in the workflow and return its key for use in actions. + + Args: + asset: The asset to register + + Returns: + The asset key that can be used in BuildActions + """ + if not validate_file_input(asset): + raise ValidationError( + "Invalid file input provided to workflow", {"asset": asset} + ) + + if is_remote_file_input(asset): + raise ValidationError( + "Remote file input doesn't need to be registered", {"asset": asset} + ) + + asset_key = f"asset_{self.asset_index}" + self.asset_index += 1 + self.assets[asset_key] = asset + return asset_key + + def _ensure_not_executed(self) -> None: + """Ensure the workflow hasn't been executed yet.""" + if self.is_executed: + raise ValidationError( + "This workflow has already been executed. Create a new workflow builder for additional operations." + ) + + def _validate(self) -> None: + """Validate the workflow before execution.""" + if not self.build_instructions["parts"]: + raise ValidationError("Workflow has no parts to execute") + + if "output" not in self.build_instructions: + self.build_instructions["output"] = cast("PDFOutput", {"type": "pdf"}) + + def _process_action(self, action: ApplicableAction) -> BuildAction: + """Process an action, registering files if needed. 
+ + Args: + action: The action to process + + Returns: + The processed action + """ + if self._is_action_with_file_input(action): + # Register the file and create the actual action + if is_remote_file_input(action.fileInput): + file_handle: FileHandle = RemoteFileHandle(url=action.fileInput) + else: + file_handle = self._register_asset(action.fileInput) + return action.createAction(file_handle) + else: + return cast("BuildAction", action) + + def _is_action_with_file_input( + self, action: ApplicableAction + ) -> TypeGuard[ActionWithFileInput]: + """Type guard to check if action needs file registration. + + Args: + action: The action to check + + Returns: + True if action needs file registration + """ + return hasattr(action, "createAction") + + async def _prepare_files(self) -> dict[str, NormalizedFileData]: + """Prepare files for the request concurrently. + + Returns: + Dictionary mapping asset keys to normalized file data + """ + import asyncio + + # Process all files concurrently + tasks = [] + keys = [] + for key, file_input in self.assets.items(): + tasks.append(process_file_input(file_input)) + keys.append(key) + + # Wait for all file processing to complete + normalized_files = await asyncio.gather(*tasks) + + # Build the result dictionary + request_files = {} + for key, normalized_data in zip(keys, normalized_files, strict=False): + request_files[key] = normalized_data + + return request_files + + def _cleanup(self) -> None: + """Clean up resources after execution.""" + self.assets.clear() + self.asset_index = 0 + self.current_step = 0 + self.is_executed = True + + # Part methods (WorkflowInitialStage) + + def add_file_part( + self, + file: FileInput, + options: FilePartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add a file part to the workflow. + + Args: + file: The file to add to the workflow. Can be a local file path, bytes, or URL. + options: Additional options for the file part. 
+ actions: Actions to apply to the file part. + + Returns: + The workflow builder instance for method chaining. + """ + self._ensure_not_executed() + + # Handle file field + file_field: FileHandle + if is_remote_file_input(file): + file_field = RemoteFileHandle(url=file) + else: + file_field = self._register_asset(file) + + # Process actions + processed_actions = None + if actions: + processed_actions = [self._process_action(action) for action in actions] + + file_part: FilePart = { + "file": file_field, + **(options or {}), + } + + if processed_actions: + file_part["actions"] = processed_actions + + self.build_instructions["parts"].append(file_part) + return self + + def add_html_part( + self, + html: FileInput, + assets: list[FileInput] | None = None, + options: HTMLPartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add an HTML part to the workflow. + + Args: + html: The HTML content to add. Can be a file path, bytes, or URL. + assets: Optional array of assets (CSS, images, etc.) to include with the HTML. + options: Additional options for the HTML part. + actions: Actions to apply to the HTML part. + + Returns: + The workflow builder instance for method chaining. 
+ """ + self._ensure_not_executed() + + # Handle HTML field + html_field: FileHandle + if is_remote_file_input(html): + html_field = RemoteFileHandle(url=html) + else: + html_field = self._register_asset(html) + + # Handle assets + assets_field = None + if assets: + assets_field = [] + for asset in assets: + if is_remote_file_input(asset): + raise ValidationError( + "Assets file input cannot be a URL", {"input": asset} + ) + asset_key = self._register_asset(asset) + assets_field.append(asset_key) + + # Process actions + processed_actions = None + if actions: + processed_actions = [self._process_action(action) for action in actions] + + html_part: HTMLPart = { + "html": html_field, + } + + if options is not None and "layout" in options: + html_part["layout"] = options["layout"] + + if assets_field: + html_part["assets"] = assets_field + + if processed_actions: + html_part["actions"] = processed_actions + + self.build_instructions["parts"].append(html_part) + return self + + def add_new_page( + self, + options: NewPagePartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add a new blank page to the workflow. + + Args: + options: Additional options for the new page, such as page size, orientation, etc. + actions: Actions to apply to the new page. + + Returns: + The workflow builder instance for method chaining. 
+ """ + self._ensure_not_executed() + + # Process actions + processed_actions = None + if actions: + processed_actions = [self._process_action(action) for action in actions] + + new_page_part: NewPagePart = { + "page": "new", + } + + if options is not None: + if "pageCount" in options: + new_page_part["pageCount"] = options["pageCount"] + + if "layout" in options: + new_page_part["layout"] = options["layout"] + + if processed_actions: + new_page_part["actions"] = processed_actions + + self.build_instructions["parts"].append(new_page_part) + return self + + def add_document_part( + self, + document_id: str, + options: DocumentPartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add a document part to the workflow by referencing an existing document by ID. + + Args: + document_id: The ID of the document to add to the workflow. + options: Additional options for the document part. + actions: Actions to apply to the document part. + + Returns: + The workflow builder instance for method chaining. 
+ """ + self._ensure_not_executed() + + # Extract layer from options + layer = None + document_options = options or {} + if "layer" in document_options: + layer = document_options.pop("layer") + + # Process actions + processed_actions = None + if actions: + processed_actions = [self._process_action(action) for action in actions] + + document_part: DocumentPart = { + "document": {"id": document_id}, + } + + if "password" in document_options: + document_part["password"] = document_options["password"] + + if "pages" in document_options: + document_part["pages"] = document_options["pages"] + + if layer: + document_part["document"]["layer"] = layer + + if processed_actions: + document_part["actions"] = processed_actions + + self.build_instructions["parts"].append(document_part) + return self + + # Action methods (WorkflowWithPartsStage) + + def apply_actions( + self, actions: list[ApplicableAction] + ) -> WorkflowWithActionsStage: + """Apply multiple actions to the workflow. + + Args: + actions: An array of actions to apply to the workflow. + + Returns: + The workflow builder instance for method chaining. + """ + self._ensure_not_executed() + + if "actions" not in self.build_instructions: + self.build_instructions["actions"] = [] + + processed_actions = [self._process_action(action) for action in actions] + self.build_instructions["actions"].extend(processed_actions) + return cast("WorkflowWithActionsStage", self) + + def apply_action(self, action: ApplicableAction) -> WorkflowWithActionsStage: + """Apply a single action to the workflow. + + Args: + action: The action to apply to the workflow. + + Returns: + The workflow builder instance for method chaining. 
+ """ + return self.apply_actions([action]) + + # Output methods (WorkflowWithPartsStage) + + def _output(self, output: BuildOutput) -> StagedWorkflowBuilder: + """Set the output configuration.""" + self._ensure_not_executed() + self.build_instructions["output"] = output + return self + + def output_pdf( + self, + options: PDFOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set the output format to PDF.""" + self._output(BuildOutputs.pdf(options)) + return cast("WorkflowWithOutputStage", self) + + def output_pdfa( + self, + options: PDFAOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set the output format to PDF/A.""" + self._output(BuildOutputs.pdfa(options)) + return cast("WorkflowWithOutputStage", self) + + def output_pdfua( + self, + options: PDFUAOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set the output format to PDF/UA.""" + self._output(BuildOutputs.pdfua(options)) + return cast("WorkflowWithOutputStage", self) + + def output_image( + self, + format: Literal["png", "jpeg", "jpg", "webp"], + options: ImageOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set the output format to an image format.""" + if not options or not any(k in options for k in ["dpi", "width", "height"]): + raise ValidationError( + "Image output requires at least one of the following options: dpi, height, width" + ) + self._output(BuildOutputs.image(format, options)) + return cast("WorkflowWithOutputStage", self) + + def output_office( + self, + format: Literal["docx", "xlsx", "pptx"], + ) -> WorkflowWithOutputStage: + """Set the output format to an Office document format.""" + self._output(BuildOutputs.office(format)) + return cast("WorkflowWithOutputStage", self) + + def output_html( + self, layout: Literal["page", "reflow"] | None = None + ) -> WorkflowWithOutputStage: + """Set the output format to HTML.""" + casted_layout: Literal["page", "reflow"] = "page" + if layout is not None: + casted_layout = layout + 
self._output(BuildOutputs.html(casted_layout)) + return cast("WorkflowWithOutputStage", self) + + def output_markdown( + self, + ) -> WorkflowWithOutputStage: + """Set the output format to Markdown.""" + self._output(BuildOutputs.markdown()) + return cast("WorkflowWithOutputStage", self) + + def output_json( + self, + options: JSONContentOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set the output format to JSON content.""" + self._output(BuildOutputs.jsonContent(options)) + return cast("WorkflowWithOutputStage", self) + + # Execution methods (WorkflowWithOutputStage) + + async def execute( + self, + on_progress: WorkflowExecuteCallback | None = None, + ) -> TypedWorkflowResult: + """Execute the workflow and return the result. + + Args: + on_progress: Optional progress callback. + + Returns: + The workflow execution result. + """ + self._ensure_not_executed() + self.current_step = 0 + + result: TypedWorkflowResult = { + "success": False, + "errors": [], + "output": None, + } + + try: + # Step 1: Validate + self.current_step = 1 + if on_progress: + on_progress(self.current_step, 3) + self._validate() + + # Step 2: Prepare files + self.current_step = 2 + if on_progress: + on_progress(self.current_step, 3) + + output_config = self.build_instructions.get("output") + if not output_config: + raise ValueError("Output configuration is required") + + files = await self._prepare_files() + + # Make the request + response = await self._send_request( + "/build", + BuildRequestData(instructions=self.build_instructions, files=files), + ) + + # Step 3: Process response + self.current_step = 3 + if on_progress: + on_progress(self.current_step, 3) + + if output_config["type"] == "json-content": + result["success"] = True + result["output"] = JsonContentOutput( + data=cast("BuildResponseJsonContents", response) + ) + elif output_config["type"] in ["html", "markdown"]: + mime_info = BuildOutputs.getMimeTypeForOutput(output_config) + result["success"] = True + 
result["output"] = ContentOutput( + content=cast("bytes", response).decode("utf-8"), + mimeType=mime_info["mimeType"], + filename=mime_info.get("filename"), + ) + else: + mime_info = BuildOutputs.getMimeTypeForOutput(output_config) + result["success"] = True + result["output"] = BufferOutput( + buffer=cast("bytes", response), + mimeType=mime_info["mimeType"], + filename=mime_info.get("filename"), + ) + + except Exception as error: + if result["errors"] is None: + result["errors"] = [] + + workflow_error: WorkflowError = { + "step": self.current_step, + "error": error + if isinstance(error, Exception) + else Exception(str(error)), + } + cast("list[WorkflowError]", result["errors"]).append(workflow_error) + + finally: + self._cleanup() + + return result + + async def dry_run(self) -> WorkflowDryRunResult: + """Perform a dry run of the workflow without generating the final output. + This is useful for validating the workflow configuration and estimating processing time. + + Returns: + A dry run result containing validation information and estimated processing time. 
+ """ + self._ensure_not_executed() + + result: WorkflowDryRunResult = { + "success": False, + "errors": [], + "analysis": None, + } + + try: + self._validate() + + response = await self._send_request( + "/analyze_build", + AnalyzeBuildRequestData(instructions=self.build_instructions), + ) + + result["success"] = True + result["analysis"] = response + + except Exception as error: + if result["errors"] is None: + result["errors"] = [] + + workflow_error: WorkflowError = { + "step": 0, + "error": error + if isinstance(error, Exception) + else Exception(str(error)), + } + cast("list[WorkflowError]", result["errors"]).append(workflow_error) + + return result diff --git a/src/nutrient_dws/builder/constant.py b/src/nutrient_dws/builder/constant.py new file mode 100644 index 0000000..b61ea5c --- /dev/null +++ b/src/nutrient_dws/builder/constant.py @@ -0,0 +1,640 @@ +from collections.abc import Callable +from typing import Any, Literal, Protocol, TypeVar, cast + +from nutrient_dws.inputs import FileInput +from nutrient_dws.types.build_actions import ( + ApplyInstantJsonAction, + ApplyRedactionsAction, + ApplyXfdfAction, + ApplyXfdfActionOptions, + BaseCreateRedactionsOptions, + BuildAction, + CreateRedactionsActionPreset, + CreateRedactionsActionRegex, + CreateRedactionsActionText, + CreateRedactionsStrategyOptionsPreset, + CreateRedactionsStrategyOptionsRegex, + CreateRedactionsStrategyOptionsText, + FlattenAction, + ImageWatermarkAction, + ImageWatermarkActionOptions, + OcrAction, + RotateAction, + SearchPreset, + TextWatermarkAction, + TextWatermarkActionOptions, +) +from nutrient_dws.types.build_output import ( + HTMLOutput, + ImageOutput, + ImageOutputOptions, + JSONContentOutput, + JSONContentOutputOptions, + MarkdownOutput, + OfficeOutput, + PDFAOutput, + PDFAOutputOptions, + PDFOutput, + PDFOutputOptions, + PDFUAOutput, + PDFUAOutputOptions, +) +from nutrient_dws.types.file_handle import FileHandle +from nutrient_dws.types.misc import OcrLanguage, 
# Dimension used for watermarks whenever the caller supplies none.
DEFAULT_DIMENSION: WatermarkDimension = {"value": 100, "unit": "%"}


T = TypeVar("T")


class ActionWithFileInput(Protocol):
    """Internal action type that holds FileInput for deferred registration."""

    __needsFileRegistration: bool
    fileInput: FileInput
    createAction: Callable[[FileHandle], BuildAction]


class BuildActions:
    """Factory functions for creating common build actions."""

    @staticmethod
    def ocr(language: OcrLanguage | list[OcrLanguage]) -> OcrAction:
        """Build an OCR action.

        Args:
            language: Language(s) for OCR.

        Returns:
            OcrAction object.
        """
        return {"type": "ocr", "language": language}

    @staticmethod
    def rotate(rotateBy: Literal[90, 180, 270]) -> RotateAction:
        """Build a rotation action.

        Args:
            rotateBy: Rotation angle (90, 180, or 270).

        Returns:
            RotateAction object.
        """
        return {"type": "rotate", "rotateBy": rotateBy}

    @staticmethod
    def watermark_text(
        text: str, options: TextWatermarkActionOptions | None = None
    ) -> TextWatermarkAction:
        """Build a text watermark action.

        Args:
            text: Watermark text.
            options: Appearance/placement settings — width, height,
                top/right/bottom/left (dimension value + unit), rotation
                (counterclockwise degrees, default 0), opacity (0..1),
                fontFamily, fontSize (points), fontColor (e.g. '#ffffff'),
                fontStyle ('bold', 'italic', or both).

        Returns:
            TextWatermarkAction object.
        """
        merged = dict(options) if options else {}
        # Explicit keys below win over the spread, guaranteeing defaults for
        # rotation/width/height even when the caller omitted them.
        return {
            "type": "watermark",
            "text": text,
            **merged,
            "rotation": merged.get("rotation", 0),
            "width": merged.get("width", DEFAULT_DIMENSION),
            "height": merged.get("height", DEFAULT_DIMENSION),
        }

    @staticmethod
    def watermark_image(
        image: FileInput, options: ImageWatermarkActionOptions | None = None
    ) -> ActionWithFileInput:
        """Build a deferred image watermark action.

        The returned object carries the raw image input; the workflow later
        registers it and calls ``createAction`` with the resulting handle.

        Args:
            image: Watermark image.
            options: Appearance/placement settings — width, height,
                top/right/bottom/left, rotation (default 0), opacity (0..1).

        Returns:
            ActionWithFileInput object.
        """
        resolved: ImageWatermarkActionOptions = (
            options
            if options is not None
            else {
                "width": DEFAULT_DIMENSION,
                "height": DEFAULT_DIMENSION,
                "rotation": 0,
            }
        )

        class ImageWatermarkActionWithFileInput(ActionWithFileInput):
            __needsFileRegistration = True

            def __init__(
                self, file_input: FileInput, opts: ImageWatermarkActionOptions
            ):
                self.fileInput = file_input
                self.options = opts

            def createAction(self, fileHandle: FileHandle) -> ImageWatermarkAction:
                opts = self.options
                return {
                    "type": "watermark",
                    "image": fileHandle,
                    **opts,
                    "rotation": opts.get("rotation", 0),
                    "width": opts.get("width", DEFAULT_DIMENSION),
                    "height": opts.get("height", DEFAULT_DIMENSION),
                }

        return ImageWatermarkActionWithFileInput(image, resolved)

    @staticmethod
    def flatten(annotation_ids: list[str | int] | None = None) -> FlattenAction:
        """Build a flatten action.

        Args:
            annotation_ids: Optional annotation IDs to flatten (all if not specified).

        Returns:
            FlattenAction object.
        """
        action: FlattenAction = {"type": "flatten"}
        if annotation_ids is not None:
            action["annotationIds"] = annotation_ids
        return action

    @staticmethod
    def apply_instant_json(file: FileInput) -> ActionWithFileInput:
        """Build a deferred apply-Instant-JSON action.

        Args:
            file: Instant JSON file input.

        Returns:
            ActionWithFileInput object.
        """

        class ApplyInstantJsonActionWithFileInput(ActionWithFileInput):
            __needsFileRegistration = True

            def __init__(self, file_input: FileInput):
                self.fileInput = file_input

            def createAction(self, fileHandle: FileHandle) -> ApplyInstantJsonAction:
                return {"type": "applyInstantJson", "file": fileHandle}

        return ApplyInstantJsonActionWithFileInput(file)

    @staticmethod
    def apply_xfdf(
        file: FileInput, options: ApplyXfdfActionOptions | None = None
    ) -> ActionWithFileInput:
        """Build a deferred apply-XFDF action.

        Args:
            file: XFDF file input.
            options: Apply XFDF options —
                ignorePageRotation: ignore page rotation when applying XFDF
                data (default: false);
                richTextEnabled: convert plain text annotations to rich text
                (default: true).

        Returns:
            ActionWithFileInput object.
        """

        class ApplyXfdfActionWithFileInput(ActionWithFileInput):
            __needsFileRegistration = True

            def __init__(
                self, file_input: FileInput, opts: ApplyXfdfActionOptions | None
            ):
                self.fileInput = file_input
                self.options = opts or {}

            def createAction(self, fileHandle: FileHandle) -> ApplyXfdfAction:
                return {"type": "applyXfdf", "file": fileHandle, **self.options}

        return ApplyXfdfActionWithFileInput(file, options)

    @staticmethod
    def create_redactions_text(
        text: str,
        options: BaseCreateRedactionsOptions | None = None,
        strategy_options: CreateRedactionsStrategyOptionsText | None = None,
    ) -> CreateRedactionsActionText:
        """Build a create-redactions action using a text search strategy.

        Args:
            text: Text to search and redact.
            options: Redaction options (visual aspects of the redaction
                annotation: background color, overlay text, etc.).
            strategy_options: Strategy options — includeAnnotations (default
                true), caseSensitive (default false), start page (default 0),
                limit (default: to end of document).

        Returns:
            CreateRedactionsAction object.
        """
        strategy: dict[str, Any] = {"text": text}
        strategy.update(strategy_options or {})
        action: dict[str, Any] = {
            "type": "createRedactions",
            "strategy": "text",
            "strategyOptions": strategy,
        }
        action.update(options or {})
        return cast("CreateRedactionsActionText", action)

    @staticmethod
    def create_redactions_regex(
        regex: str,
        options: BaseCreateRedactionsOptions | None = None,
        strategy_options: CreateRedactionsStrategyOptionsRegex | None = None,
    ) -> CreateRedactionsActionRegex:
        """Build a create-redactions action using a regex strategy.

        Args:
            regex: Regex pattern to search and redact.
            options: Redaction options (visual aspects of the redaction
                annotation: background color, overlay text, etc.).
            strategy_options: Strategy options — includeAnnotations (default
                true), caseSensitive (default true), start page (default 0),
                limit (default: to end of document).

        Returns:
            CreateRedactionsAction object.
        """
        strategy: dict[str, Any] = {"regex": regex}
        strategy.update(strategy_options or {})
        action: dict[str, Any] = {
            "type": "createRedactions",
            "strategy": "regex",
            "strategyOptions": strategy,
        }
        action.update(options or {})
        return cast("CreateRedactionsActionRegex", action)

    @staticmethod
    def create_redactions_preset(
        preset: SearchPreset,
        options: BaseCreateRedactionsOptions | None = None,
        strategy_options: CreateRedactionsStrategyOptionsPreset | None = None,
    ) -> CreateRedactionsActionPreset:
        """Build a create-redactions action using a preset strategy.

        Args:
            preset: Preset pattern to search and redact (e.g. 'email-address',
                'credit-card-number', 'social-security-number', etc.).
            options: Redaction options (visual aspects of the redaction
                annotation: background color, overlay text, etc.).
            strategy_options: Strategy options — includeAnnotations (default
                true), start page (default 0), limit (default: to end of
                document).

        Returns:
            CreateRedactionsAction object.
        """
        strategy: dict[str, Any] = {"preset": preset}
        strategy.update(strategy_options or {})
        action: dict[str, Any] = {
            "type": "createRedactions",
            "strategy": "preset",
            "strategyOptions": strategy,
        }
        action.update(options or {})
        return cast("CreateRedactionsActionPreset", action)

    @staticmethod
    def apply_redactions() -> ApplyRedactionsAction:
        """Build an action that applies previously created redactions.

        Returns:
            ApplyRedactionsAction object.
        """
        return {"type": "applyRedactions"}


class BuildOutputs:
    """Factory functions for creating output configurations."""

    @staticmethod
    def _with_options(
        base: dict[str, Any], options: Any, keys: tuple[str, ...]
    ) -> dict[str, Any]:
        """Copy each key present in *options* into *base*, preserving key order."""
        if options:
            for key in keys:
                if key in options:
                    base[key] = options[key]
        return base

    # Keys shared by all PDF-family outputs.
    _PDF_KEYS = (
        "metadata",
        "labels",
        "user_password",
        "owner_password",
        "user_permissions",
        "optimize",
    )

    @staticmethod
    def pdf(options: PDFOutputOptions | None = None) -> PDFOutput:
        """PDF output configuration.

        Args:
            options: PDF output options — metadata, labels, user_password,
                owner_password, user_permissions, optimize.

        Returns:
            PDFOutput object.
        """
        config = BuildOutputs._with_options(
            {"type": "pdf"}, options, BuildOutputs._PDF_KEYS
        )
        return cast("PDFOutput", config)

    @staticmethod
    def pdfa(options: PDFAOutputOptions | None = None) -> PDFAOutput:
        """PDF/A output configuration.

        Args:
            options: PDF/A output options — conformance, vectorization,
                rasterization, plus the common PDF options (metadata, labels,
                passwords, permissions, optimize).

        Returns:
            PDFAOutput object.
        """
        config = BuildOutputs._with_options(
            {"type": "pdfa"},
            options,
            ("conformance", "vectorization", "rasterization") + BuildOutputs._PDF_KEYS,
        )
        return cast("PDFAOutput", config)

    @staticmethod
    def pdfua(options: PDFUAOutputOptions | None = None) -> PDFUAOutput:
        """PDF/UA output configuration.

        Args:
            options: PDF/UA output options — metadata, labels, user_password,
                owner_password, user_permissions, optimize.

        Returns:
            PDFUAOutput object.
        """
        config = BuildOutputs._with_options(
            {"type": "pdfua"}, options, BuildOutputs._PDF_KEYS
        )
        return cast("PDFUAOutput", config)

    @staticmethod
    def image(
        format: Literal["png", "jpeg", "jpg", "webp"],
        options: ImageOutputOptions | None = None,
    ) -> ImageOutput:
        """Image output configuration.

        Args:
            format: Image format type.
            options: Image output options — pages, width, height, dpi.

        Returns:
            ImageOutput object.
        """
        config = BuildOutputs._with_options(
            {"type": "image", "format": format},
            options,
            ("pages", "width", "height", "dpi"),
        )
        return cast("ImageOutput", config)

    @staticmethod
    def jsonContent(
        options: JSONContentOutputOptions | None = None,
    ) -> JSONContentOutput:
        """JSON content output configuration.

        Args:
            options: Extraction options — plainText, structuredText,
                keyValuePairs, tables, language.

        Returns:
            JSONContentOutput object.
        """
        config = BuildOutputs._with_options(
            {"type": "json-content"},
            options,
            ("plainText", "structuredText", "keyValuePairs", "tables", "language"),
        )
        return cast("JSONContentOutput", config)

    @staticmethod
    def office(type: Literal["docx", "xlsx", "pptx"]) -> OfficeOutput:
        """Office document output configuration.

        Args:
            type: Office document type.

        Returns:
            OfficeOutput object.
        """
        return {"type": type}

    @staticmethod
    def html(layout: Literal["page", "reflow"]) -> HTMLOutput:
        """HTML output configuration.

        Args:
            layout: The layout type to use for conversion to HTML.

        Returns:
            HTMLOutput object.
        """
        return {"type": "html", "layout": layout}

    @staticmethod
    def markdown() -> MarkdownOutput:
        """Markdown output configuration.

        Returns:
            MarkdownOutput object.
        """
        return {"type": "markdown"}

    @staticmethod
    def getMimeTypeForOutput(
        output: PDFOutput
        | PDFAOutput
        | PDFUAOutput
        | ImageOutput
        | OfficeOutput
        | HTMLOutput
        | MarkdownOutput,
    ) -> dict[str, str]:
        """Get MIME type and filename for a given output configuration.

        Args:
            output: The output configuration.

        Returns:
            Dictionary with mimeType and filename (falls back to
            application/octet-stream for unknown types).
        """
        output_type = output.get("type", "pdf")

        # All PDF-family outputs share one MIME type.
        if output_type in ("pdf", "pdfa", "pdfua"):
            return {"mimeType": "application/pdf", "filename": "output.pdf"}

        if output_type == "image":
            format = output.get("format", "png")
            if format == "jpg":
                # "jpg" keeps its extension but maps to the canonical MIME subtype.
                return {"mimeType": "image/jpeg", "filename": "output.jpg"}
            return {"mimeType": f"image/{format}", "filename": f"output.{format}"}

        fixed_types = {
            "docx": {
                "mimeType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "filename": "output.docx",
            },
            "xlsx": {
                "mimeType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                "filename": "output.xlsx",
            },
            "pptx": {
                "mimeType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                "filename": "output.pptx",
            },
            "html": {"mimeType": "text/html", "filename": "output.html"},
            "markdown": {"mimeType": "text/markdown", "filename": "output.md"},
        }
        return fixed_types.get(
            output_type,
            {"mimeType": "application/octet-stream", "filename": "output"},
        )
0000000..bfd0743 --- /dev/null +++ b/src/nutrient_dws/builder/staged_builders.py @@ -0,0 +1,253 @@ +"""Staged builder interfaces for workflow pattern implementation.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from collections.abc import Callable +from typing import TYPE_CHECKING, Literal, TypedDict + +from nutrient_dws.builder.constant import ActionWithFileInput +from nutrient_dws.types.build_actions import BuildAction + +if TYPE_CHECKING: + from nutrient_dws.inputs import FileInput + from nutrient_dws.types.analyze_response import AnalyzeBuildResponse + from nutrient_dws.types.build_output import ( + ImageOutputOptions, + JSONContentOutputOptions, + PDFAOutputOptions, + PDFOutputOptions, + PDFUAOutputOptions, + ) + from nutrient_dws.types.build_response_json import BuildResponseJsonContents + from nutrient_dws.types.input_parts import ( + DocumentPartOptions, + FilePartOptions, + HTMLPartOptions, + NewPagePartOptions, + ) + +# Type aliases for output types +OutputFormat = Literal[ + "pdf", + "pdfa", + "pdfua", + "png", + "jpeg", + "jpg", + "webp", + "docx", + "xlsx", + "pptx", + "html", + "markdown", + "json-content", +] + + +# Output type mappings +class BufferOutput(TypedDict): + buffer: bytes + mimeType: str + filename: str | None + + +class ContentOutput(TypedDict): + content: str + mimeType: str + filename: str | None + + +class JsonContentOutput(TypedDict): + data: BuildResponseJsonContents + + +# Applicable actions type - actions that can be applied to workflows +ApplicableAction = BuildAction | ActionWithFileInput + + +class WorkflowError(TypedDict): + """Workflow execution error details.""" + + step: int + error: Exception + + +class WorkflowOutput(TypedDict): + """Represents an output file with its content and metadata.""" + + buffer: bytes + mimeType: str + filename: str | None + + +class WorkflowResult(TypedDict): + """Result of a workflow execution.""" + + success: bool + output: WorkflowOutput | None + errors: 
list[WorkflowError] | None + + +class TypedWorkflowResult(TypedDict): + """Typed result of a workflow execution based on output configuration.""" + + success: bool + output: BufferOutput | ContentOutput | JsonContentOutput | None + errors: list[WorkflowError] | None + + +class WorkflowDryRunResult(TypedDict): + """Result of a workflow dry run.""" + + success: bool + analysis: AnalyzeBuildResponse | None + errors: list[WorkflowError] | None + + +WorkflowExecuteCallback = Callable[[int, int], None] + + +class WorkflowInitialStage(ABC): + """Stage 1: Initial workflow - only part methods available.""" + + @abstractmethod + def add_file_part( + self, + file: FileInput, + options: FilePartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add a file part to the workflow.""" + pass + + @abstractmethod + def add_html_part( + self, + html: FileInput, + assets: list[FileInput] | None = None, + options: HTMLPartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add an HTML part to the workflow.""" + pass + + @abstractmethod + def add_new_page( + self, + options: NewPagePartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add a new page part to the workflow.""" + pass + + @abstractmethod + def add_document_part( + self, + document_id: str, + options: DocumentPartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add a document part to the workflow.""" + pass + + +class WorkflowWithPartsStage(WorkflowInitialStage): + """Stage 2: After parts added - parts, actions, and output methods available.""" + + # Action methods + @abstractmethod + def apply_actions(self, actions: list[ApplicableAction]) -> WorkflowWithPartsStage: + """Apply multiple actions to the workflow.""" + pass + + @abstractmethod + def apply_action(self, action: ApplicableAction) -> 
WorkflowWithPartsStage: + """Apply a single action to the workflow.""" + pass + + # Output methods + @abstractmethod + def output_pdf( + self, + options: PDFOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set PDF output for the workflow.""" + pass + + @abstractmethod + def output_pdfa( + self, + options: PDFAOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set PDF/A output for the workflow.""" + pass + + @abstractmethod + def output_pdfua( + self, + options: PDFUAOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set PDF/UA output for the workflow.""" + pass + + @abstractmethod + def output_image( + self, + format: Literal["png", "jpeg", "jpg", "webp"], + options: ImageOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set image output for the workflow.""" + pass + + @abstractmethod + def output_office( + self, + format: Literal["docx", "xlsx", "pptx"], + ) -> WorkflowWithOutputStage: + """Set Office format output for the workflow.""" + pass + + @abstractmethod + def output_html( + self, + layout: Literal["page", "reflow"] | None = None, + ) -> WorkflowWithOutputStage: + """Set HTML output for the workflow.""" + pass + + @abstractmethod + def output_markdown( + self, + ) -> WorkflowWithOutputStage: + """Set Markdown output for the workflow.""" + pass + + @abstractmethod + def output_json( + self, + options: JSONContentOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set JSON content output for the workflow.""" + pass + + +# Stage 3: After actions added - type alias since functionality is the same +WorkflowWithActionsStage = WorkflowWithPartsStage + + +class WorkflowWithOutputStage(ABC): + """Stage 4: After output set - only execute and dryRun available.""" + + @abstractmethod + async def execute( + self, + on_progress: WorkflowExecuteCallback | None = None, + ) -> TypedWorkflowResult: + """Execute the workflow and return the result.""" + pass + + @abstractmethod + async def 
dry_run(self) -> WorkflowDryRunResult: + """Perform a dry run of the workflow without executing.""" + pass diff --git a/src/nutrient_dws/client.py b/src/nutrient_dws/client.py index 02b5894..4c6cd85 100644 --- a/src/nutrient_dws/client.py +++ b/src/nutrient_dws/client.py @@ -1,110 +1,1751 @@ -"""Main client module for Nutrient DWS API.""" +"""Main client for interacting with the Nutrient Document Web Services API.""" -import os -from typing import Any +from collections.abc import Awaitable, Callable +from typing import TYPE_CHECKING, Any, Literal, cast -from nutrient_dws.api.direct import DirectAPIMixin -from nutrient_dws.builder import BuildAPIWrapper -from nutrient_dws.file_handler import FileInput -from nutrient_dws.http_client import HTTPClient +from nutrient_dws.builder.builder import StagedWorkflowBuilder +from nutrient_dws.builder.constant import BuildActions +from nutrient_dws.builder.staged_builders import ( + ApplicableAction, + BufferOutput, + ContentOutput, + JsonContentOutput, + OutputFormat, + TypedWorkflowResult, + WorkflowInitialStage, + WorkflowWithPartsStage, +) +from nutrient_dws.errors import NutrientError, ValidationError +from nutrient_dws.http import ( + NutrientClientOptions, + RedactRequestData, + RequestConfig, + SignRequestData, + SignRequestOptions, + send_request, +) +from nutrient_dws.inputs import ( + FileInput, + get_pdf_page_count, + is_remote_file_input, + is_valid_pdf, + process_file_input, + process_remote_file_input, +) +from nutrient_dws.types.account_info import AccountInfo +from nutrient_dws.types.build_actions import ( + ApplyXfdfActionOptions, + BaseCreateRedactionsOptions, + CreateRedactionsStrategyOptionsPreset, + CreateRedactionsStrategyOptionsRegex, + CreateRedactionsStrategyOptionsText, + ImageWatermarkActionOptions, + SearchPreset, + TextWatermarkActionOptions, +) +from nutrient_dws.types.build_output import ( + JSONContentOutputOptions, + Label, + Metadata, + OptimizePdf, + PDFOutputOptions, + PDFUserPermission, +) 
+from nutrient_dws.types.create_auth_token import ( + CreateAuthTokenParameters, + CreateAuthTokenResponse, +) +from nutrient_dws.types.misc import OcrLanguage, PageRange, Pages +from nutrient_dws.types.redact_data import RedactOptions +from nutrient_dws.types.sign_request import CreateDigitalSignature +if TYPE_CHECKING: + from nutrient_dws.types.input_parts import FilePartOptions -class NutrientClient(DirectAPIMixin): - r"""Main client for interacting with Nutrient DWS API. - This client provides two ways to interact with the API: +def normalize_page_params( + pages: PageRange | None = None, + page_count: int | None = None, +) -> Pages: + """Normalize page parameters according to the requirements: + - start and end are inclusive + - start defaults to 0 (first page) + - end defaults to -1 (last page) + - negative end values loop from the end of the document. - 1. Direct API: Individual method calls for single operations - Example: client.convert_to_pdf(input_file="document.docx") + Args: + pages: The page parameters to normalize + page_count: The total number of pages in the document (required for negative indices) + + Returns: + Normalized page parameters + """ + start = pages.get("start", 0) if pages else 0 + end = pages.get("end", -1) if pages else -1 - 2. Builder API: Fluent interface for chaining multiple operations - Example: client.build(input_file="doc.docx").add_step("convert-to-pdf").execute() + # Handle negative end values if page_count is provided + if page_count is not None and start < 0: + start = page_count + start - Args: - api_key: API key for authentication. If not provided, will look for - NUTRIENT_API_KEY environment variable. - timeout: Request timeout in seconds. Defaults to 300. + if page_count is not None and end < 0: + end = page_count + end - Raises: - AuthenticationError: When making API calls without a valid API key. 
+ return {"start": start, "end": end} + + +class NutrientClient: + """Main client for interacting with the Nutrient Document Web Services API. Example: - >>> from nutrient_dws import NutrientClient - >>> client = NutrientClient(api_key="your-api-key") - >>> # Direct API - >>> pdf = client.convert_to_pdf(input_file="document.docx") - >>> # Builder API - >>> client.build(input_file="document.docx") \\ - ... .add_step(tool="convert-to-pdf") \\ - ... .add_step(tool="ocr-pdf") \\ - ... .execute(output_path="output.pdf") + Server-side usage with an API key: + + ```python + client = NutrientClient(api_key='your_api_key') + ``` + + Client-side usage with token provider: + + ```python + async def get_token(): + # Your token retrieval logic here + return 'your-token' + + client = NutrientClient(api_key=get_token) + ``` """ - def __init__(self, api_key: str | None = None, timeout: int = 300) -> None: - """Initialize the Nutrient client.""" - # Get API key from parameter or environment - self._api_key = api_key or os.environ.get("NUTRIENT_API_KEY") - self._timeout = timeout + def __init__( + self, + api_key: str | Callable[[], str | Awaitable[str]], + base_url: str | None = None, + timeout: int | None = None, + ) -> None: + """Create a new NutrientClient instance. - # Initialize HTTP client - self._http_client = HTTPClient(api_key=self._api_key, timeout=timeout) + Args: + api_key: API key or API key getter + base_url: DWS Base url + timeout: DWS request timeout - # Direct API methods will be added dynamically + Raises: + ValidationError: If options are invalid + """ + options = NutrientClientOptions( + apiKey=api_key, baseUrl=base_url, timeout=timeout + ) + self._validate_options(options) + self.options = options - def build(self, input_file: FileInput) -> BuildAPIWrapper: - """Start a Builder API workflow. + def _validate_options(self, options: NutrientClientOptions) -> None: + """Validate client options. Args: - input_file: Input file (path, bytes, or file-like object). 
+ options: Configuration options to validate + + Raises: + ValidationError: If options are invalid + """ + if not options: + raise ValidationError("Client options are required") + + if not options.get("apiKey"): + raise ValidationError("API key is required") + + api_key = options["apiKey"] + if not (isinstance(api_key, str) or callable(api_key)): + raise ValidationError( + "API key must be a string or a function that returns a string" + ) + + base_url = options.get("baseUrl") + if base_url is not None and not isinstance(base_url, str): + raise ValidationError("Base URL must be a string") + + async def get_account_info(self) -> AccountInfo: + """Get account information for the current API key. Returns: - BuildAPIWrapper instance for chaining operations. + Account information Example: - >>> builder = client.build(input_file="document.pdf") - >>> builder.add_step(tool="rotate-pages", options={"degrees": 90}) - >>> result = builder.execute() + ```python + account_info = await client.get_account_info() + print(account_info['subscriptionType']) + ``` """ - return BuildAPIWrapper(client=self, input_file=input_file) + response: Any = await send_request( + { + "method": "GET", + "endpoint": "/account/info", + "data": None, + "headers": None, + }, + self.options, + ) - def _process_file( - self, - tool: str, - input_file: FileInput, - output_path: str | None = None, - **options: Any, - ) -> bytes | None: - """Process a file using the Direct API. + return cast("AccountInfo", response["data"]) + + async def create_token( + self, params: CreateAuthTokenParameters + ) -> CreateAuthTokenResponse: + """Create a new authentication token. 
+ + Args: + params: Parameters for creating the token + + Returns: + The created token information + + Example: + ```python + token = await client.create_token({ + 'allowedOperations': ['annotations_api'], + 'expirationTime': 3600 # 1 hour + }) + print(token['id']) + ``` + """ + response: Any = await send_request( + { + "method": "POST", + "endpoint": "/tokens", + "data": params, + "headers": None, + }, + self.options, + ) + + return cast("CreateAuthTokenResponse", response["data"]) + + async def delete_token(self, token_id: str) -> None: + """Delete an authentication token. + + Args: + token_id: ID of the token to delete + + Example: + ```python + await client.delete_token('token-id-123') + ``` + """ + await send_request( + { + "method": "DELETE", + "endpoint": "/tokens", + "data": cast("Any", {"id": token_id}), + "headers": None, + }, + self.options, + ) + + def workflow(self, override_timeout: int | None = None) -> WorkflowInitialStage: + r"""Create a new WorkflowBuilder for chaining multiple operations. + + Args: + override_timeout: Set a custom timeout for the workflow (in milliseconds) + + Returns: + A new WorkflowBuilder instance - This is the internal method used by all Direct API methods. - It internally uses the Build API with a single action. + Example: + ```python + result = await client.workflow() \\ + .add_file_part('document.docx') \\ + .apply_action(BuildActions.ocr('english')) \\ + .output_pdf() \\ + .execute() + ``` + """ + options = self.options.copy() + if override_timeout is not None: + options["timeout"] = override_timeout + + return StagedWorkflowBuilder(options) + + def _process_typed_workflow_result( + self, result: TypedWorkflowResult + ) -> BufferOutput | ContentOutput | JsonContentOutput: + """Helper function that takes a TypedWorkflowResult, throws any errors, and returns the specific output type. Args: - tool: The tool identifier from the API. - input_file: Input file to process. - output_path: Optional path to save the output. 
- **options: Tool-specific options. + result: The TypedWorkflowResult to process Returns: - Processed file as bytes, or None if output_path is provided. + The specific output type from the result Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. + NutrientError: If the workflow was not successful or if output is missing + """ + if not result["success"]: + # If there are errors, throw the first one + errors = result.get("errors") + if errors and len(errors) > 0: + raise errors[0]["error"] + # If no specific errors but operation failed + raise NutrientError( + "Workflow operation failed without specific error details", + "WORKFLOW_ERROR", + ) + + # Check if output exists + output = result.get("output") + if not output: + raise NutrientError( + "Workflow completed successfully but no output was returned", + "MISSING_OUTPUT", + ) + + return output + + async def sign( + self, + pdf: FileInput, + data: CreateDigitalSignature | None = None, + options: SignRequestOptions | None = None, + ) -> BufferOutput: + """Sign a PDF document. 
+ + Args: + pdf: The PDF file to sign + data: Signature data + options: Additional options (image, graphicImage) + + Returns: + The signed PDF file output + + Example: + ```python + result = await client.sign('document.pdf', { + 'signatureType': 'cms', + 'flatten': False, + 'cadesLevel': 'b-lt' + }) + + # Access the signed PDF buffer + pdf_buffer = result['buffer'] + + # Get the MIME type of the output + print(result['mimeType']) # 'application/pdf' + + # Save the buffer to a file + with open('signed-document.pdf', 'wb') as f: + f.write(pdf_buffer) + ``` + """ + # Normalize the file input + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Prepare optional files + normalized_image = None + normalized_graphic_image = None + + if options: + if "image" in options: + image = options["image"] + if is_remote_file_input(image): + normalized_image = await process_remote_file_input(str(image)) + else: + normalized_image = await process_file_input(image) + + if "graphicImage" in options: + graphic_image = options["graphicImage"] + if is_remote_file_input(graphic_image): + normalized_graphic_image = await process_remote_file_input( + str(graphic_image) + ) + else: + normalized_graphic_image = await process_file_input(graphic_image) + + request_data = { + "file": normalized_file, + "data": data, + } + + if normalized_image: + request_data["image"] = normalized_image + if normalized_graphic_image: + request_data["graphicImage"] = normalized_graphic_image + + response: Any = await send_request( + { + "method": "POST", + "endpoint": "/sign", + "data": cast("SignRequestData", request_data), + "headers": None, + }, + self.options, + ) + + buffer = response["data"] + + return { + "mimeType": "application/pdf", + "filename": "output.pdf", + "buffer": buffer, + } + + async def 
watermark_text( + self, + file: FileInput, + text: str, + options: TextWatermarkActionOptions | None = None, + ) -> BufferOutput: + """Add a text watermark to a document. + This is a convenience method that uses the workflow builder. + + Args: + file: The input file to watermark + text: The watermark text + options: Watermark options + + Returns: + The watermarked document + + Example: + ```python + result = await client.watermark_text('document.pdf', 'CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 24 + }) + + # Access the watermarked PDF buffer + pdf_buffer = result['buffer'] + + # Save the buffer to a file + with open('watermarked-document.pdf', 'wb') as f: + f.write(pdf_buffer) + ``` + """ + watermark_action = BuildActions.watermark_text(text, options) + + builder = self.workflow().add_file_part(file, None, [watermark_action]) + + result = await builder.output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def watermark_image( + self, + file: FileInput, + image: FileInput, + options: ImageWatermarkActionOptions | None = None, + ) -> BufferOutput: + """Add an image watermark to a document. + This is a convenience method that uses the workflow builder. 
+ + Args: + file: The input file to watermark + image: The watermark image + options: Watermark options + + Returns: + The watermarked document + + Example: + ```python + result = await client.watermark_image('document.pdf', 'watermark.jpg', { + 'opacity': 0.5 + }) + + # Access the watermarked PDF buffer + pdf_buffer = result['buffer'] + ``` + """ + watermark_action = BuildActions.watermark_image(image, options) + + builder = self.workflow().add_file_part(file, None, [watermark_action]) + + result = await builder.output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def convert( + self, + file: FileInput, + target_format: OutputFormat, + ) -> BufferOutput | ContentOutput | JsonContentOutput: + """Convert a document to a different format. + This is a convenience method that uses the workflow builder. + + Args: + file: The input file to convert + target_format: The target format to convert to + + Returns: + The specific output type based on the target format + + Example: + ```python + # Convert DOCX to PDF + pdf_result = await client.convert('document.docx', 'pdf') + pdf_buffer = pdf_result['buffer'] + + # Convert PDF to image + image_result = await client.convert('document.pdf', 'png') + png_buffer = image_result['buffer'] + + # Convert to HTML + html_result = await client.convert('document.pdf', 'html') + html_content = html_result['content'] + ``` + """ + builder = self.workflow().add_file_part(file) + + if target_format == "pdf": + result = await builder.output_pdf().execute() + elif target_format == "pdfa": + result = await builder.output_pdfa().execute() + elif target_format == "pdfua": + result = await builder.output_pdfua().execute() + elif target_format == "docx": + result = await builder.output_office("docx").execute() + elif target_format == "xlsx": + result = await builder.output_office("xlsx").execute() + elif target_format == "pptx": + result = await builder.output_office("pptx").execute() + elif 
target_format == "html": + result = await builder.output_html("page").execute() + elif target_format == "markdown": + result = await builder.output_markdown().execute() + elif target_format in ["png", "jpeg", "jpg", "webp"]: + result = await builder.output_image( + cast("Literal['png', 'jpeg', 'jpg', 'webp']", target_format), + {"dpi": 300}, + ).execute() + else: + raise ValidationError(f"Unsupported target format: {target_format}") + + return self._process_typed_workflow_result(result) + + async def ocr( + self, + file: FileInput, + language: OcrLanguage | list[OcrLanguage], + ) -> BufferOutput: + """Perform OCR (Optical Character Recognition) on a document. + This is a convenience method that uses the workflow builder. + + Args: + file: The input file to perform OCR on + language: The language(s) to use for OCR + + Returns: + The OCR result + + Example: + ```python + result = await client.ocr('scanned-document.pdf', 'english') + + # Access the OCR-processed PDF buffer + pdf_buffer = result['buffer'] + ``` + """ + ocr_action = BuildActions.ocr(language) + + builder = self.workflow().add_file_part(file, None, [ocr_action]) + + result = await builder.output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def extract_text( + self, + file: FileInput, + pages: PageRange | None = None, + ) -> JsonContentOutput: + """Extract text content from a document. + This is a convenience method that uses the workflow builder. 
+ + Args: + file: The file to extract text from + pages: Optional page range to extract text from + + Returns: + The extracted text data + + Example: + ```python + result = await client.extract_text('document.pdf') + print(result['data']) + + # Extract text from specific pages + result = await client.extract_text('document.pdf', {'start': 0, 'end': 2}) + + # Access the extracted text content + text_content = result['data']['pages'][0]['plainText'] + ``` + """ + normalized_pages = normalize_page_params(pages) if pages else None + + part_options = ( + cast("FilePartOptions", {"pages": normalized_pages}) + if normalized_pages + else None + ) + + result = ( + await self.workflow() + .add_file_part(file, part_options) + .output_json( + cast("JSONContentOutputOptions", {"plainText": True, "tables": False}) + ) + .execute() + ) + + return cast("JsonContentOutput", self._process_typed_workflow_result(result)) + + async def extract_table( + self, + file: FileInput, + pages: PageRange | None = None, + ) -> JsonContentOutput: + """Extract table content from a document. + This is a convenience method that uses the workflow builder. 
+ + Args: + file: The file to extract table from + pages: Optional page range to extract tables from + + Returns: + The extracted table data + + Example: + ```python + result = await client.extract_table('document.pdf') + + # Access the extracted tables + tables = result['data']['pages'][0]['tables'] + + # Process the first table if available + if tables and len(tables) > 0: + first_table = tables[0] + print(f"Table has {len(first_table['rows'])} rows") + ``` + """ + normalized_pages = normalize_page_params(pages) if pages else None + + part_options = ( + cast("FilePartOptions", {"pages": normalized_pages}) + if normalized_pages + else None + ) + + result = ( + await self.workflow() + .add_file_part(file, part_options) + .output_json( + cast("JSONContentOutputOptions", {"plainText": False, "tables": True}) + ) + .execute() + ) + + return cast("JsonContentOutput", self._process_typed_workflow_result(result)) + + async def extract_key_value_pairs( + self, + file: FileInput, + pages: PageRange | None = None, + ) -> JsonContentOutput: + """Extract key value pair content from a document. + This is a convenience method that uses the workflow builder. 
+ + Args: + file: The file to extract KVPs from + pages: Optional page range to extract KVPs from + + Returns: + The extracted KVPs data + + Example: + ```python + result = await client.extract_key_value_pairs('document.pdf') + + # Access the extracted key-value pairs + kvps = result['data']['pages'][0]['keyValuePairs'] + + # Process the key-value pairs + if kvps and len(kvps) > 0: + for kvp in kvps: + print(f"Key: {kvp['key']}, Value: {kvp['value']}") + ``` + """ + normalized_pages = normalize_page_params(pages) if pages else None + + part_options = ( + cast("FilePartOptions", {"pages": normalized_pages}) + if normalized_pages + else None + ) + + result = ( + await self.workflow() + .add_file_part(file, part_options) + .output_json( + cast( + "JSONContentOutputOptions", + {"plainText": False, "tables": False, "keyValuePairs": True}, + ) + ) + .execute() + ) + + return cast("JsonContentOutput", self._process_typed_workflow_result(result)) + + async def set_page_labels( + self, + pdf: FileInput, + labels: list[Label], + ) -> BufferOutput: + """Set page labels for a PDF document. + This is a convenience method that uses the workflow builder. 
+ + Args: + pdf: The PDF file to modify + labels: Array of label objects with pages and label properties + + Returns: + The document with updated page labels + + Example: + ```python + result = await client.set_page_labels('document.pdf', [ + {'pages': [0, 1, 2], 'label': 'Cover'}, + {'pages': [3, 4, 5], 'label': 'Chapter 1'} + ]) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + result = ( + await self.workflow() + .add_file_part(pdf) + .output_pdf(cast("PDFOutputOptions", {"labels": labels})) + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def password_protect( + self, + file: FileInput, + user_password: str, + owner_password: str, + permissions: list[PDFUserPermission] | None = None, + ) -> BufferOutput: + """Password protect a PDF document. + This is a convenience method that uses the workflow builder. 
+ + Args: + file: The file to protect + user_password: Password required to open the document + owner_password: Password required to modify the document + permissions: Optional array of permissions granted when opened with user password + + Returns: + The password-protected document + + Example: + ```python + result = await client.password_protect('document.pdf', 'user123', 'owner456') + + # Or with specific permissions: + result = await client.password_protect( + 'document.pdf', + 'user123', + 'owner456', + ['printing', 'extract_accessibility'] + ) + ``` + """ + pdf_options: PDFOutputOptions = { + "user_password": user_password, + "owner_password": owner_password, + } + + if permissions: + pdf_options["user_permissions"] = permissions + + result = ( + await self.workflow().add_file_part(file).output_pdf(pdf_options).execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def set_metadata( + self, + pdf: FileInput, + metadata: Metadata, + ) -> BufferOutput: + """Set metadata for a PDF document. + This is a convenience method that uses the workflow builder. 
+ + Args: + pdf: The PDF file to modify + metadata: The metadata to set (title and/or author) + + Returns: + The document with updated metadata + + Example: + ```python + result = await client.set_metadata('document.pdf', { + 'title': 'My Document', + 'author': 'John Doe' + }) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + result = ( + await self.workflow() + .add_file_part(pdf) + .output_pdf(cast("PDFOutputOptions", {"metadata": metadata})) + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def apply_instant_json( + self, + pdf: FileInput, + instant_json_file: FileInput, + ) -> BufferOutput: + """Apply Instant JSON to a document. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to modify + instant_json_file: The Instant JSON file to apply + + Returns: + The modified document + + Example: + ```python + result = await client.apply_instant_json('document.pdf', 'annotations.json') + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + apply_json_action = BuildActions.apply_instant_json(instant_json_file) + + result = ( + await self.workflow() + .add_file_part(pdf, None, [apply_json_action]) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def apply_xfdf( + self, + pdf: FileInput, + xfdf_file: FileInput, + options: ApplyXfdfActionOptions | None = None, + ) -> BufferOutput: + """Apply XFDF to a document. 
+ This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to modify + xfdf_file: The XFDF file to apply + options: Optional settings for applying XFDF + + Returns: + The modified document + + Example: + ```python + result = await client.apply_xfdf('document.pdf', 'annotations.xfdf') + # Or with options: + result = await client.apply_xfdf( + 'document.pdf', 'annotations.xfdf', + {'ignorePageRotation': True, 'richTextEnabled': False} + ) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + apply_xfdf_action = BuildActions.apply_xfdf(xfdf_file, options) + + result = ( + await self.workflow() + .add_file_part(pdf, None, [apply_xfdf_action]) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def merge(self, files: list[FileInput]) -> BufferOutput: + """Merge multiple documents into a single document. + This is a convenience method that uses the workflow builder. 
+ + Args: + files: The files to merge + + Returns: + The merged document + + Example: + ```python + result = await client.merge(['doc1.pdf', 'doc2.pdf', 'doc3.pdf']) + + # Access the merged PDF buffer + pdf_buffer = result['buffer'] + ``` + """ + if not files or len(files) < 2: + raise ValidationError("At least 2 files are required for merge operation") + + builder = self.workflow() + + # Add first file + workflow_builder = builder.add_file_part(files[0]) + + # Add remaining files + for file in files[1:]: + workflow_builder = workflow_builder.add_file_part(file) + + result = await workflow_builder.output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def flatten( + self, + pdf: FileInput, + annotation_ids: list[str | int] | None = None, + ) -> BufferOutput: + """Flatten annotations in a PDF document. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to flatten + annotation_ids: Optional specific annotation IDs to flatten + + Returns: + The flattened document + + Example: + ```python + # Flatten all annotations + result = await client.flatten('annotated-document.pdf') + + # Flatten specific annotations by ID + result = await client.flatten('annotated-document.pdf', ['annotation1', 'annotation2']) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + flatten_action = BuildActions.flatten(annotation_ids) + + result = ( + await self.workflow() + .add_file_part(pdf, None, [flatten_action]) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def create_redactions_ai( + self, + pdf: FileInput, + criteria: str, + redaction_state: Literal["stage", "apply"] = "stage", + pages: 
PageRange | None = None, + options: RedactOptions | None = None, + ) -> BufferOutput: + """Use AI to redact sensitive information in a document. + + Args: + pdf: The PDF file to redact + criteria: AI redaction criteria + redaction_state: Whether to stage or apply redactions (default: 'stage') + pages: Optional pages to redact + options: Optional redaction options + + Returns: + The redacted document + + Example: + ```python + # Stage redactions + result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all emails' + ) + + # Apply redactions immediately + result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all PII', + 'apply' + ) + ``` """ - # Use the builder API with a single step - builder = self.build(input_file) - builder.add_step(tool, options) - return builder.execute(output_path) + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + page_count = get_pdf_page_count(normalized_file[0]) + normalized_pages = normalize_page_params(pages, page_count) if pages else None + + document_data: dict[str, Any] = { + "file": "file", + } + + if normalized_pages: + document_data["pages"] = normalized_pages + + documents = [document_data] + + request_data = { + "data": { + "documents": documents, + "criteria": criteria, + "redaction_state": redaction_state, + }, + "file": normalized_file, + "fileKey": "file", + } + + if options: + request_data["data"]["options"] = options # type: ignore + + config = RequestConfig( + method="POST", + data=cast("RedactRequestData", request_data), + endpoint="/ai/redact", + headers=None, + ) + + response: Any = await send_request( + config, + self.options, + ) + + buffer = response["data"] + + return { + "mimeType": "application/pdf", + "filename": "output.pdf", + "buffer": buffer, + } + + 
async def create_redactions_preset( + self, + pdf: FileInput, + preset: SearchPreset, + redaction_state: Literal["stage", "apply"] = "stage", + pages: PageRange | None = None, + preset_options: CreateRedactionsStrategyOptionsPreset | None = None, + options: BaseCreateRedactionsOptions | None = None, + ) -> BufferOutput: + """Create redaction annotations based on a preset pattern. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to create redactions in + preset: The preset pattern to search for (e.g., 'email-address', 'social-security-number') + redaction_state: Whether to stage or apply redactions (default: 'stage') + pages: Optional page range to create redactions in + preset_options: Optional settings for the preset strategy + options: Optional settings for creating redactions + + Returns: + The document with redaction annotations + + Example: + ```python + result = await client.create_redactions_preset('document.pdf', 'email-address') + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Get page count for handling negative indices + page_count = get_pdf_page_count(normalized_file[0]) + normalized_pages = normalize_page_params(pages, page_count) if pages else None + + # Prepare strategy options with pages + strategy_options = preset_options.copy() if preset_options else {} + if normalized_pages: + strategy_options["start"] = normalized_pages["start"] + if normalized_pages["end"] >= 0: + strategy_options["limit"] = ( + normalized_pages["end"] - normalized_pages["start"] + 1 + ) + + create_redactions_action = BuildActions.create_redactions_preset( + preset, options, strategy_options + ) + actions: list[ApplicableAction] = [create_redactions_action] + + if redaction_state == "apply": + 
actions.append(BuildActions.apply_redactions()) + + result = ( + await self.workflow() + .add_file_part(pdf, None, actions) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def create_redactions_regex( + self, + pdf: FileInput, + regex: str, + redaction_state: Literal["stage", "apply"] = "stage", + pages: PageRange | None = None, + regex_options: CreateRedactionsStrategyOptionsRegex | None = None, + options: BaseCreateRedactionsOptions | None = None, + ) -> BufferOutput: + r"""Create redaction annotations based on a regular expression. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to create redactions in + regex: The regular expression to search for + redaction_state: Whether to stage or apply redactions (default: 'stage') + pages: Optional page range to create redactions in + regex_options: Optional settings for the regex strategy + options: Optional settings for creating redactions + + Returns: + The document with redaction annotations + + Example: + ```python + result = await client.create_redactions_regex('document.pdf', r'Account:\s*\d{8,12}') + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Get page count for handling negative indices + page_count = get_pdf_page_count(normalized_file[0]) + normalized_pages = normalize_page_params(pages, page_count) if pages else None + + # Prepare strategy options with pages + strategy_options = regex_options.copy() if regex_options else {} + if normalized_pages: + strategy_options["start"] = normalized_pages["start"] + if normalized_pages["end"] >= 0: + strategy_options["limit"] = ( + normalized_pages["end"] - normalized_pages["start"] + 1 + ) + + 
create_redactions_action = BuildActions.create_redactions_regex( + regex, options, strategy_options + ) + actions: list[ApplicableAction] = [create_redactions_action] + + if redaction_state == "apply": + actions.append(BuildActions.apply_redactions()) + + result = ( + await self.workflow() + .add_file_part(pdf, None, actions) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def create_redactions_text( + self, + pdf: FileInput, + text: str, + redaction_state: Literal["stage", "apply"] = "stage", + pages: PageRange | None = None, + text_options: CreateRedactionsStrategyOptionsText | None = None, + options: BaseCreateRedactionsOptions | None = None, + ) -> BufferOutput: + """Create redaction annotations based on text. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to create redactions in + text: The text to search for + redaction_state: Whether to stage or apply redactions (default: 'stage') + pages: Optional page range to create redactions in + text_options: Optional settings for the text strategy + options: Optional settings for creating redactions + + Returns: + The document with redaction annotations + + Example: + ```python + result = await client.create_redactions_text('document.pdf', 'email@example.com') + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Get page count for handling negative indices + page_count = get_pdf_page_count(normalized_file[0]) + normalized_pages = normalize_page_params(pages, page_count) if pages else None + + # Prepare strategy options with pages + strategy_options = text_options.copy() if text_options else {} + if normalized_pages: + strategy_options["start"] = 
normalized_pages["start"] + if normalized_pages["end"] >= 0: + strategy_options["limit"] = ( + normalized_pages["end"] - normalized_pages["start"] + 1 + ) + + create_redactions_action = BuildActions.create_redactions_text( + text, options, strategy_options + ) + actions: list[ApplicableAction] = [create_redactions_action] + + if redaction_state == "apply": + actions.append(BuildActions.apply_redactions()) + + result = ( + await self.workflow() + .add_file_part(pdf, None, actions) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def apply_redactions(self, pdf: FileInput) -> BufferOutput: + """Apply staged redaction into the PDF. + + Args: + pdf: The PDF file with redaction annotations to apply + + Returns: + The document with applied redactions + + Example: + ```python + # Stage redactions from a createRedaction Method: + staged_result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage' + ) + + result = await client.apply_redactions(staged_result['buffer']) + ``` + """ + apply_redactions_action = BuildActions.apply_redactions() + + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + result = ( + await self.workflow() + .add_file_part(pdf, None, [apply_redactions_action]) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def rotate( + self, + pdf: FileInput, + angle: Literal[90, 180, 270], + pages: PageRange | None = None, + ) -> BufferOutput: + """Rotate pages in a document. + This is a convenience method that uses the workflow builder. 
+ + Args: + pdf: The PDF file to rotate + angle: Rotation angle (90, 180, or 270 degrees) + pages: Optional page range to rotate + + Returns: + The entire document with specified pages rotated + + Example: + ```python + result = await client.rotate('document.pdf', 90) + + # Rotate specific pages: + result = await client.rotate('document.pdf', 90, {'start': 1, 'end': 3}) + ``` + """ + rotate_action = BuildActions.rotate(angle) + + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + workflow = self.workflow() + + if pages: + page_count = get_pdf_page_count(normalized_file[0]) + normalized_pages = normalize_page_params(pages, page_count) + + # Add pages before the range to rotate + if normalized_pages["start"] > 0: + part_options = cast( + "FilePartOptions", + {"pages": {"start": 0, "end": normalized_pages["start"] - 1}}, + ) + workflow = workflow.add_file_part(pdf, part_options) + + # Add the specific pages with rotation action + part_options = cast("FilePartOptions", {"pages": normalized_pages}) + workflow = workflow.add_file_part(pdf, part_options, [rotate_action]) + + # Add pages after the range to rotate + if normalized_pages["end"] < page_count - 1: + part_options = cast( + "FilePartOptions", + { + "pages": { + "start": normalized_pages["end"] + 1, + "end": page_count - 1, + } + }, + ) + workflow = workflow.add_file_part(pdf, part_options) + else: + # If no pages specified, rotate the entire document + workflow = workflow.add_file_part(pdf, None, [rotate_action]) + + result = await workflow.output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def add_page( + self, pdf: FileInput, count: int = 1, index: int | None = None + ) -> BufferOutput: + """Add blank pages to a document. 
+ This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to add pages to + count: The number of blank pages to add + index: Optional index where to add the blank pages (0-based). If not provided, pages are added at the end. + + Returns: + The document with added pages + + Example: + ```python + # Add 2 blank pages at the end + result = await client.add_page('document.pdf', 2) + + # Add 1 blank page after the first page (at index 1) + result = await client.add_page('document.pdf', 1, 1) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # If no index is provided or it's the end of the document, simply add pages at the end + if index is None: + builder = self.workflow() + + builder.add_file_part(pdf) + + # Add the specified number of blank pages + builder = builder.add_new_page({"pageCount": count}) + + result = await builder.output_pdf().execute() + else: + # Get the actual page count of the PDF + page_count = get_pdf_page_count(normalized_file[0]) + + # Validate that the index is within range + if index < 0 or index > page_count: + raise ValidationError( + f"Index {index} is out of range (document has {page_count} pages)" + ) + + builder = self.workflow() + + # Add pages before the specified index + if index > 0: + before_pages = normalize_page_params( + {"start": 0, "end": index - 1}, page_count + ) + part_options = cast("FilePartOptions", {"pages": before_pages}) + builder = builder.add_file_part(pdf, part_options) + + # Add the blank pages + builder = builder.add_new_page({"pageCount": count}) + + # Add pages after the specified index + if index < page_count: + after_pages = normalize_page_params( + {"start": index, "end": page_count - 1}, page_count + ) + part_options = 
cast("FilePartOptions", {"pages": after_pages}) + builder = builder.add_file_part(pdf, part_options) + + result = await builder.output_pdf().execute() + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def split( + self, pdf: FileInput, page_ranges: list[PageRange] + ) -> list[BufferOutput]: + """Split a PDF document into multiple parts based on page ranges. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to split + page_ranges: Array of page ranges to extract + + Returns: + An array of PDF documents, one for each page range + + Example: + ```python + results = await client.split('document.pdf', [ + {'start': 0, 'end': 2}, # Pages 0, 1, 2 + {'start': 3, 'end': 5} # Pages 3, 4, 5 + ]) + ``` + """ + if not page_ranges or len(page_ranges) == 0: + raise ValidationError("At least one page range is required for splitting") + + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Get the actual page count of the PDF + page_count = get_pdf_page_count(normalized_file[0]) + + # Normalize and validate all page ranges + normalized_ranges = [ + normalize_page_params(page_range, page_count) for page_range in page_ranges + ] + + # Validate that all page ranges are within bounds + for page_range in normalized_ranges: + if page_range["start"] > page_range["end"]: + raise ValidationError( + f"Page range {page_range} is invalid (start > end)" + ) + + # Create a separate workflow for each page range + import asyncio + from typing import cast as typing_cast + + async def create_split_pdf(page_range: Pages) -> BufferOutput: + builder = self.workflow() + part_options = cast("FilePartOptions", {"pages": page_range}) + builder = builder.add_file_part(pdf, part_options) + result = await 
builder.output_pdf().execute() + return typing_cast( + "BufferOutput", self._process_typed_workflow_result(result) + ) + + # Execute all workflows in parallel and process the results + tasks = [create_split_pdf(page_range) for page_range in normalized_ranges] + results = await asyncio.gather(*tasks) + + return results + + async def duplicate_pages( + self, pdf: FileInput, page_indices: list[int] + ) -> BufferOutput: + """Create a new PDF containing only the specified pages in the order provided. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to extract pages from + page_indices: Array of page indices to include in the new PDF (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + + Returns: + A new document with only the specified pages + + Example: + ```python + # Create a new PDF with only the first and third pages + result = await client.duplicate_pages('document.pdf', [0, 2]) + + # Create a new PDF with pages in a different order + result = await client.duplicate_pages('document.pdf', [2, 0, 1]) + + # Create a new PDF with duplicated pages + result = await client.duplicate_pages('document.pdf', [0, 0, 1, 1, 0]) + + # Create a new PDF with the first and last pages + result = await client.duplicate_pages('document.pdf', [0, -1]) + ``` + """ + if not page_indices or len(page_indices) == 0: + raise ValidationError("At least one page index is required for duplication") + + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Get the actual page count of the PDF + page_count = get_pdf_page_count(normalized_file[0]) + + # Normalize negative indices + normalized_indices = [] + for index in page_indices: + if index < 0: + # Handle negative indices (e.g., -1 is the 
last page) + normalized_indices.append(page_count + index) + else: + normalized_indices.append(index) + + # Validate that all page indices are within range + for i, original_index in enumerate(page_indices): + normalized_index = normalized_indices[i] + if normalized_index < 0 or normalized_index >= page_count: + raise ValidationError( + f"Page index {original_index} is out of range (document has {page_count} pages)" + ) + + builder = self.workflow() + + # Add each page in the order specified + for page_index in normalized_indices: + # Use normalize_page_params to ensure consistent handling + page_range = normalize_page_params({"start": page_index, "end": page_index}) + part_options = cast("FilePartOptions", {"pages": page_range}) + builder = builder.add_file_part(pdf, part_options) + + result = await cast("WorkflowWithPartsStage", builder).output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def delete_pages( + self, pdf: FileInput, page_indices: list[int] + ) -> BufferOutput: + """Delete pages from a PDF document. + This is a convenience method that uses the workflow builder. 
+ + Args: + pdf: The PDF file to modify + page_indices: Array of page indices to delete (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + + Returns: + The document with deleted pages + + Example: + ```python + # Delete second and fourth pages + result = await client.delete_pages('document.pdf', [1, 3]) + + # Delete the last page + result = await client.delete_pages('document.pdf', [-1]) + + # Delete the first and last two pages + result = await client.delete_pages('document.pdf', [0, -1, -2]) + ``` + """ + if not page_indices or len(page_indices) == 0: + raise ValidationError("At least one page index is required for deletion") + + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Get the actual page count of the PDF + page_count = get_pdf_page_count(normalized_file[0]) + + # Normalize negative indices + normalized_indices = [] + for index in page_indices: + if index < 0: + # Handle negative indices (e.g., -1 is the last page) + normalized_indices.append(page_count + index) + else: + normalized_indices.append(index) + + # Remove duplicates and sort the deleteIndices + delete_indices = sorted(set(normalized_indices)) + + # Validate that all page indices are within range + for original_index in page_indices: + if original_index >= 0: + normalized_index = original_index + else: + normalized_index = page_count + original_index + + if normalized_index < 0 or normalized_index >= page_count: + raise ValidationError( + f"Page index {original_index} is out of range (document has {page_count} pages)" + ) + + builder = self.workflow() + + # Group consecutive pages that should be kept into ranges + current_page = 0 + page_ranges = [] + + for delete_index in delete_indices: + if current_page < delete_index: + 
page_ranges.append( + normalize_page_params( + {"start": current_page, "end": delete_index - 1} + ) + ) + current_page = delete_index + 1 + + if ( + current_page > 0 or (current_page == 0 and len(delete_indices) == 0) + ) and current_page < page_count: + page_ranges.append( + normalize_page_params({"start": current_page, "end": page_count - 1}) + ) + + if len(page_ranges) == 0: + raise ValidationError("You cannot delete all pages from a document") + + for page_range in page_ranges: + part_options = cast("FilePartOptions", {"pages": page_range}) + builder = builder.add_file_part(pdf, part_options) + + result = await cast("WorkflowWithPartsStage", builder).output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def optimize( + self, + pdf: FileInput, + options: OptimizePdf | None = None, + ) -> BufferOutput: + """Optimize a PDF document for size reduction. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to optimize + options: Optimization options + + Returns: + The optimized document + + Example: + ```python + result = await client.optimize('large-document.pdf', { + 'grayscaleImages': True, + 'mrcCompression': True, + 'imageOptimizationQuality': 2 + }) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) - def close(self) -> None: - """Close the HTTP client session.""" - self._http_client.close() + if options is None: + options = {"imageOptimizationQuality": 2} - def __enter__(self) -> "NutrientClient": - """Context manager entry.""" - return self + result = ( + await self.workflow() + .add_file_part(pdf) + .output_pdf(cast("PDFOutputOptions", {"optimize": options})) + .execute() + ) - def __exit__(self, *args: Any) -> None: - """Context 
manager exit.""" - self.close() + return cast("BufferOutput", self._process_typed_workflow_result(result)) diff --git a/src/nutrient_dws/errors.py b/src/nutrient_dws/errors.py new file mode 100644 index 0000000..2886487 --- /dev/null +++ b/src/nutrient_dws/errors.py @@ -0,0 +1,175 @@ +"""Error classes for Nutrient DWS client. +Provides consistent error handling across the library. +""" + +from typing import Any + + +class NutrientError(Exception): + """Base error class for all Nutrient DWS client errors. + Provides consistent error handling across the library. + """ + + def __init__( + self, + message: str, + code: str = "NUTRIENT_ERROR", + details: dict[str, Any] | None = None, + status_code: int | None = None, + ) -> None: + """Initialize a NutrientError. + + Args: + message: Error message + code: Error code for programmatic error handling + details: Additional error details + status_code: HTTP status code if the error originated from an HTTP response + """ + super().__init__(message) + self.name = "NutrientError" + self.message = message + self.code = code + self.details = details + self.status_code = status_code + + # Python doesn't have direct equivalent to Error.captureStackTrace, + # but the stack trace is automatically captured + + def to_json(self) -> dict[str, Any]: + """Returns a JSON representation of the error. + + Returns: + Dict containing error details + """ + return { + "name": self.name, + "message": str(self), + "code": self.code, + "details": self.details, + "status_code": self.status_code, + "stack": self.__traceback__, + } + + def __str__(self) -> str: + """Returns a string representation of the error. 
+ + Returns: + Formatted error string + """ + result = f"{self.name}: {super().__str__()}" + if self.code != "NUTRIENT_ERROR": + result += f" ({self.code})" + if self.status_code: + result += f" [HTTP {self.status_code}]" + return result + + @classmethod + def wrap(cls, error: Any, message: str | None = None) -> "NutrientError": + """Wraps an unknown error into a NutrientError. + + Args: + error: The error to wrap + message: Optional message to prepend + + Returns: + A NutrientError instance + """ + if isinstance(error, NutrientError): + return error + + if isinstance(error, Exception): + wrapped_message = f"{message}: {error!s}" if message else str(error) + return NutrientError( + wrapped_message, + "WRAPPED_ERROR", + { + "originalError": error.__class__.__name__, + "originalMessage": str(error), + "stack": error.__traceback__, + }, + ) + + error_message = message or "An unknown error occurred" + return NutrientError( + error_message, "UNKNOWN_ERROR", {"originalError": str(error)} + ) + + +class ValidationError(NutrientError): + """Error thrown when input validation fails.""" + + def __init__( + self, + message: str, + details: dict[str, Any] | None = None, + status_code: int | None = None, + ) -> None: + """Initialize a ValidationError. + + Args: + message: Error message + details: Additional error details + status_code: HTTP status code if applicable + """ + super().__init__(message, "VALIDATION_ERROR", details, status_code) + self.name = "ValidationError" + + +class APIError(NutrientError): + """Error thrown when API requests fail.""" + + def __init__( + self, + message: str, + status_code: int, + details: dict[str, Any] | None = None, + ) -> None: + """Initialize an APIError. 
+ + Args: + message: Error message + status_code: HTTP status code + details: Additional error details + """ + super().__init__(message, "API_ERROR", details, status_code) + self.name = "APIError" + + +class AuthenticationError(NutrientError): + """Error thrown when authentication fails.""" + + def __init__( + self, + message: str, + details: dict[str, Any] | None = None, + status_code: int = 401, + ) -> None: + """Initialize an AuthenticationError. + + Args: + message: Error message + details: Additional error details + status_code: HTTP status code, defaults to 401 + """ + super().__init__(message, "AUTHENTICATION_ERROR", details, status_code) + self.name = "AuthenticationError" + + +class NetworkError(NutrientError): + """Error thrown when network requests fail.""" + + def __init__( + self, + message: str, + details: dict[str, Any] | None = None, + status_code: int | None = None, + ) -> None: + """Initialize a NetworkError. + + Args: + message: Error message + details: Additional error details + status_code: HTTP status code if applicable + """ + super().__init__(message, "NETWORK_ERROR", details, status_code) + self.name = "NetworkError" diff --git a/src/nutrient_dws/exceptions.py b/src/nutrient_dws/exceptions.py deleted file mode 100644 index 413e2e9..0000000 --- a/src/nutrient_dws/exceptions.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Custom exceptions for Nutrient DWS client.""" - -from typing import Any - - -class NutrientError(Exception): - """Base exception for all Nutrient client errors.""" - - pass - - -class AuthenticationError(NutrientError): - """Raised when authentication fails (401/403 errors). - - This typically indicates: - - Missing API key - - Invalid API key - - Expired API key - - Insufficient permissions - """ - - def __init__(self, message: str = "Authentication failed") -> None: - """Initialize AuthenticationError.""" - super().__init__(message) - - -class APIError(NutrientError): - """Raised for general API errors. 
- - Attributes: - status_code: HTTP status code from the API. - response_body: Raw response body from the API for debugging. - request_id: Request ID for tracking (if available). - """ - - def __init__( - self, - message: str, - status_code: int | None = None, - response_body: str | None = None, - request_id: str | None = None, - ) -> None: - """Initialize APIError with status code and response body.""" - super().__init__(message) - self.status_code = status_code - self.response_body = response_body - self.request_id = request_id - - def __str__(self) -> str: - """String representation with all available error details.""" - parts = [str(self.args[0]) if self.args else "API Error"] - - if self.status_code: - parts.append(f"Status: {self.status_code}") - - if self.request_id: - parts.append(f"Request ID: {self.request_id}") - - if self.response_body: - parts.append(f"Response: {self.response_body}") - - return " | ".join(parts) - - -class ValidationError(NutrientError): - """Raised when request validation fails.""" - - def __init__(self, message: str, errors: dict[str, Any] | None = None) -> None: - """Initialize ValidationError with validation details.""" - super().__init__(message) - self.errors = errors or {} - - -class NutrientTimeoutError(NutrientError): - """Raised when a request times out.""" - - pass - - -class FileProcessingError(NutrientError): - """Raised when file processing fails.""" - - pass diff --git a/src/nutrient_dws/file_handler.py b/src/nutrient_dws/file_handler.py deleted file mode 100644 index f79cfde..0000000 --- a/src/nutrient_dws/file_handler.py +++ /dev/null @@ -1,263 +0,0 @@ -"""File handling utilities for input/output operations.""" - -import contextlib -import io -import os -import re -from collections.abc import Generator -from pathlib import Path -from typing import BinaryIO - -FileInput = str | Path | bytes | BinaryIO - -# Default chunk size for streaming operations (1MB) -DEFAULT_CHUNK_SIZE = 1024 * 1024 - - -def 
prepare_file_input(file_input: FileInput) -> tuple[bytes, str]: - """Convert various file input types to bytes. - - Args: - file_input: File path, bytes, or file-like object. - - Returns: - tuple of (file_bytes, filename). - - Raises: - FileNotFoundError: If file path doesn't exist. - ValueError: If input type is not supported. - """ - # Handle different file input types using pattern matching - match file_input: - case Path() if not file_input.exists(): - raise FileNotFoundError(f"File not found: {file_input}") - case Path(): - return file_input.read_bytes(), file_input.name - case str(): - path = Path(file_input) - if not path.exists(): - raise FileNotFoundError(f"File not found: {file_input}") - return path.read_bytes(), path.name - case bytes(): - return file_input, "document" - case _ if hasattr(file_input, "read"): - # Handle file-like objects - # Save current position if seekable - current_pos = None - if hasattr(file_input, "seek") and hasattr(file_input, "tell"): - try: - current_pos = file_input.tell() - file_input.seek(0) # Read from beginning - except (OSError, io.UnsupportedOperation): - pass - - content = file_input.read() - if isinstance(content, str): - content = content.encode() - - # Restore position if we saved it - if current_pos is not None: - with contextlib.suppress(OSError, io.UnsupportedOperation): - file_input.seek(current_pos) - - filename = getattr(file_input, "name", "document") - if hasattr(filename, "__fspath__"): - filename = os.path.basename(os.fspath(filename)) - elif isinstance(filename, bytes): - filename = os.path.basename(filename.decode()) - elif isinstance(filename, str): - filename = os.path.basename(filename) - return content, str(filename) - case _: - raise ValueError(f"Unsupported file input type: {type(file_input)}") - - -def prepare_file_for_upload( - file_input: FileInput, - field_name: str = "file", -) -> tuple[str, tuple[str, bytes | BinaryIO, str]]: - """Prepare file for multipart upload. 
- - Args: - file_input: File path, bytes, or file-like object. - field_name: Form field name for the file. - - Returns: - tuple of (field_name, (filename, file_content_or_stream, content_type)). - - Raises: - FileNotFoundError: If file path doesn't exist. - ValueError: If input type is not supported. - """ - content_type = "application/octet-stream" - - # Handle different file input types using pattern matching - path: Path | None - match file_input: - case Path(): - path = file_input - case str(): - path = Path(file_input) - case _: - path = None - - # Handle path-based inputs - if path is not None: - if not path.exists(): - raise FileNotFoundError(f"File not found: {path}") - - # For large files, return file handle instead of reading into memory - file_size = path.stat().st_size - if file_size > 10 * 1024 * 1024: # 10MB threshold - # Note: File handle is intentionally not using context manager - # as it needs to remain open for streaming upload by HTTP client - file_handle = open(path, "rb") # noqa: SIM115 - return field_name, (path.name, file_handle, content_type) - else: - return field_name, (path.name, path.read_bytes(), content_type) - - # Handle non-path inputs - match file_input: - case bytes(): - return field_name, ("document", file_input, content_type) - case _ if hasattr(file_input, "read"): - filename = getattr(file_input, "name", "document") - if hasattr(filename, "__fspath__"): - filename = os.path.basename(os.fspath(filename)) - elif isinstance(filename, bytes): - filename = os.path.basename(filename.decode()) - elif isinstance(filename, str): - filename = os.path.basename(filename) - return field_name, (str(filename), file_input, content_type) # type: ignore[return-value] - case _: - raise ValueError(f"Unsupported file input type: {type(file_input)}") - - -def save_file_output(content: bytes, output_path: str) -> None: - """Save file content to disk. - - Args: - content: File bytes to save. - output_path: Path where to save the file. 
- - Raises: - OSError: If file cannot be written. - """ - path = Path(output_path) - # Create parent directories if they don't exist - path.parent.mkdir(parents=True, exist_ok=True) - path.write_bytes(content) - - -def stream_file_content( - file_path: str, - chunk_size: int = DEFAULT_CHUNK_SIZE, -) -> Generator[bytes, None, None]: - """Stream file content in chunks. - - Args: - file_path: Path to the file to stream. - chunk_size: Size of each chunk in bytes. - - Yields: - Chunks of file content. - - Raises: - FileNotFoundError: If file doesn't exist. - """ - path = Path(file_path) - if not path.exists(): - raise FileNotFoundError(f"File not found: {file_path}") - - with open(path, "rb") as f: - while chunk := f.read(chunk_size): - yield chunk - - -def get_file_size(file_input: FileInput) -> int | None: - """Get size of file input if available. - - Args: - file_input: File path, bytes, or file-like object. - - Returns: - File size in bytes, or None if size cannot be determined. - """ - if isinstance(file_input, Path): - if file_input.exists(): - return file_input.stat().st_size - elif isinstance(file_input, str): - path = Path(file_input) - if path.exists(): - return path.stat().st_size - elif isinstance(file_input, bytes): - return len(file_input) - elif hasattr(file_input, "seek") and hasattr(file_input, "tell"): - # For seekable file-like objects - try: - current_pos = file_input.tell() - file_input.seek(0, 2) # Seek to end - size = file_input.tell() - file_input.seek(current_pos) # Restore position - return size - except (OSError, io.UnsupportedOperation): - pass - - return None - - -def get_pdf_page_count(pdf_input: FileInput) -> int: - """Zero dependency way to get the number of pages in a PDF. - - Args: - pdf_input: File path, bytes, or file-like object. Has to be of a PDF file - - Returns: - Number of pages in a PDF. 
- """ - if isinstance(pdf_input, (str, Path)): - with open(pdf_input, "rb") as f: - pdf_bytes = f.read() - elif isinstance(pdf_input, bytes): - pdf_bytes = pdf_input - elif hasattr(pdf_input, "read") and hasattr(pdf_input, "seek") and hasattr(pdf_input, "tell"): - pos = pdf_input.tell() - pdf_input.seek(0) - pdf_bytes = pdf_input.read() - pdf_input.seek(pos) - else: - raise TypeError("Unsupported input type. Expected str, Path, bytes, or seekable BinaryIO.") - - # Find all PDF objects - objects = re.findall(rb"(\d+)\s+(\d+)\s+obj(.*?)endobj", pdf_bytes, re.DOTALL) - - # Get the Catalog Object - catalog_obj = None - for _obj_num, _gen_num, obj_data in objects: - if b"/Type" in obj_data and b"/Catalog" in obj_data: - catalog_obj = obj_data - break - - if not catalog_obj: - raise ValueError("Could not find /Catalog object in PDF.") - - # Extract /Pages reference (e.g. 3 0 R) - pages_ref_match = re.search(rb"/Pages\s+(\d+)\s+(\d+)\s+R", catalog_obj) - if not pages_ref_match: - raise ValueError("Could not find /Pages reference in /Catalog.") - pages_obj_num = pages_ref_match.group(1).decode() - pages_obj_gen = pages_ref_match.group(2).decode() - - # Step 3: Find the referenced /Pages object - pages_obj_pattern = rf"{pages_obj_num}\s+{pages_obj_gen}\s+obj(.*?)endobj".encode() - pages_obj_match = re.search(pages_obj_pattern, pdf_bytes, re.DOTALL) - if not pages_obj_match: - raise ValueError("Could not find root /Pages object.") - pages_obj_data = pages_obj_match.group(1) - - # Step 4: Extract /Count - count_match = re.search(rb"/Count\s+(\d+)", pages_obj_data) - if not count_match: - raise ValueError("Could not find /Count in root /Pages object.") - - return int(count_match.group(1)) diff --git a/src/nutrient_dws/http.py b/src/nutrient_dws/http.py new file mode 100644 index 0000000..6aae822 --- /dev/null +++ b/src/nutrient_dws/http.py @@ -0,0 +1,607 @@ +"""HTTP request and response type definitions for API communication.""" + +import json +from collections.abc import 
Awaitable, Callable +from typing import Any, Generic, Literal, TypeGuard, TypeVar, Union, overload + +import httpx +from typing_extensions import NotRequired, TypedDict + +from nutrient_dws.errors import ( + APIError, + AuthenticationError, + NetworkError, + NutrientError, + ValidationError, +) +from nutrient_dws.inputs import FileInput, NormalizedFileData +from nutrient_dws.types.account_info import AccountInfo +from nutrient_dws.types.analyze_response import AnalyzeBuildResponse +from nutrient_dws.types.build_instruction import BuildInstructions +from nutrient_dws.types.build_response_json import BuildResponseJsonContents +from nutrient_dws.types.create_auth_token import ( + CreateAuthTokenParameters, + CreateAuthTokenResponse, +) +from nutrient_dws.types.redact_data import RedactData +from nutrient_dws.types.sign_request import CreateDigitalSignature +from nutrient_dws.utils import get_user_agent + + +class BuildRequestData(TypedDict): + instructions: BuildInstructions + files: NotRequired[dict[str, NormalizedFileData]] + + +class AnalyzeBuildRequestData(TypedDict): + instructions: BuildInstructions + + +class SignRequestOptions(TypedDict): + image: NotRequired[FileInput] + graphicImage: NotRequired[FileInput] + + +class SignRequestData(TypedDict): + file: NormalizedFileData + data: NotRequired[CreateDigitalSignature] + image: NotRequired[NormalizedFileData] + graphicImage: NotRequired[NormalizedFileData] + + +class RedactRequestData(TypedDict): + data: RedactData + fileKey: NotRequired[str] + file: NotRequired[NormalizedFileData] + + +class DeleteTokenRequestData(TypedDict): + id: str + + +# Methods and Endpoints types +Method = TypeVar("Method", bound=Literal["GET", "POST", "DELETE"]) +Endpoint = TypeVar( + "Endpoint", + bound=Literal[ + "/account/info", "/build", "/analyze_build", "/sign", "/ai/redact", "/tokens" + ], +) + +# Type variables for generic types +Input = TypeVar( + "Input", + bound=CreateAuthTokenParameters + | BuildRequestData + | 
AnalyzeBuildRequestData + | SignRequestData + | RedactRequestData + | DeleteTokenRequestData + | None, +) +Output = TypeVar( + "Output", + bound=CreateAuthTokenResponse + | str + | bytes + | BuildResponseJsonContents + | AnalyzeBuildResponse + | AccountInfo + | None, +) + + +# Request configuration +class RequestConfig(TypedDict, Generic[Method, Endpoint, Input]): + """HTTP request configuration for API calls.""" + + method: Method + endpoint: Endpoint + data: Input # The actual type depends on the method and endpoint + headers: dict[str, str] | None + + +def is_get_account_info_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[RequestConfig[Literal["GET"], Literal["/account/info"], None]]: + return request["method"] == "GET" and request["endpoint"] == "/account/info" + + +def is_post_build_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[RequestConfig[Literal["POST"], Literal["/build"], BuildRequestData]]: + return request["method"] == "POST" and request["endpoint"] == "/build" + + +def is_post_analyse_build_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[ + RequestConfig[Literal["POST"], Literal["/analyze_build"], AnalyzeBuildRequestData] +]: + return request["method"] == "POST" and request["endpoint"] == "/analyze_build" + + +def is_post_sign_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[RequestConfig[Literal["POST"], Literal["/sign"], SignRequestData]]: + return request["method"] == "POST" and request["endpoint"] == "/sign" + + +def is_post_ai_redact_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[ + RequestConfig[Literal["POST"], Literal["/ai/redact"], RedactRequestData] +]: + return request["method"] == "POST" and request["endpoint"] == "/ai/redact" + + +def is_post_tokens_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[ + RequestConfig[Literal["POST"], 
Literal["/tokens"], CreateAuthTokenParameters] +]: + return request["method"] == "POST" and request["endpoint"] == "/tokens" + + +def is_delete_tokens_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[ + RequestConfig[Literal["DELETE"], Literal["/tokens"], DeleteTokenRequestData] +]: + return request["method"] == "DELETE" and request["endpoint"] == "/tokens" + + +# API response +class ApiResponse(TypedDict, Generic[Output]): + """Response from API call.""" + + data: Output # The actual type depends on the method and endpoint + status: int + statusText: str + headers: dict[str, Any] + + +# Client options +class NutrientClientOptions(TypedDict): + """Client options for Nutrient DWS API.""" + + apiKey: str | Callable[[], str | Awaitable[str]] + baseUrl: str | None + timeout: int | None + + +async def resolve_api_key(api_key: str | Callable[[], str | Awaitable[str]]) -> str: + """Resolves API key from string or function. + + Args: + api_key: API key as string or function that returns a string + + Returns: + Resolved API key as string + + Raises: + AuthenticationError: If API key function returns invalid value + """ + if isinstance(api_key, str): + return api_key + + try: + resolved_key = api_key() + if isinstance(resolved_key, Awaitable): + return await resolved_key + if not isinstance(resolved_key, str) or len(resolved_key) == 0: + raise AuthenticationError( + "API key function must return a non-empty string", + {"resolvedType": type(resolved_key).__name__}, + ) + return resolved_key + except Exception as error: + if isinstance(error, AuthenticationError): + raise error + raise AuthenticationError( + "Failed to resolve API key from function", {"error": str(error)} + ) + + +def append_file_to_form_data( + form_data: dict[str, Any], key: str, file: NormalizedFileData +) -> None: + """Appends file to form data with proper format. 
+ + Args: + form_data: Form data dictionary + key: Key for the file + file: File data + + Raises: + ValidationError: If file data is not in expected format + """ + file_content, filename = file + + if not isinstance(file_content, bytes): + raise ValidationError( + "Expected bytes for file data", {"dataType": type(file_content).__name__} + ) + + form_data[key] = (filename, file_content) + + +def prepare_request_body( + request_config: dict[str, Any], config: RequestConfig[Method, Endpoint, Input] +) -> dict[str, Any]: + """Prepares request body with files and data. + + Args: + request_config: Request configuration dictionary + config: Request configuration + + Returns: + Updated request configuration + """ + if is_post_build_request_config(config): + # Use multipart/form-data for file uploads + files: dict[str, Any] = {} + for key, value in config["data"]["files"].items(): + append_file_to_form_data(files, key, value) + + request_config["files"] = files + request_config["data"] = { + "instructions": json.dumps(config["data"]["instructions"]) + } + + return request_config + + if is_post_analyse_build_request_config(config): + # JSON only request + request_config["json"] = config["data"]["instructions"] + + return request_config + + if is_post_sign_request_config(config): + files = {} + append_file_to_form_data(files, "file", config["data"]["file"]) + + if "image" in config["data"]: + append_file_to_form_data(files, "image", config["data"]["image"]) + + if "graphicImage" in config["data"]: + append_file_to_form_data( + files, "graphicImage", config["data"]["graphicImage"] + ) + + request_config["files"] = files + + data = {} + if "data" in config["data"] and config["data"]["data"] is not None: + data["data"] = json.dumps(config["data"]["data"]) + else: + data["data"] = json.dumps( + { + "signatureType": "cades", + "cadesLevel": "b-lt", + } + ) + + request_config["data"] = data + + return request_config + + if is_post_ai_redact_request_config(config): + if "file" in 
config["data"] and "fileKey" in config["data"]: + files = {} + append_file_to_form_data( + files, config["data"]["fileKey"], config["data"]["file"] + ) + + request_config["files"] = files + request_config["data"] = {"data": json.dumps(config["data"]["data"])} + else: + # JSON only request + request_config["json"] = config["data"]["data"] + + return request_config + + # Fallback, passing data as JSON + if "data" in config: + request_config["json"] = config["data"] + + return request_config + + +def extract_error_message(data: Any) -> str | None: + """Extracts error message from response data with comprehensive DWS error handling. + + Args: + data: Response data + + Returns: + Extracted error message or None if not found + """ + if isinstance(data, dict): + error_data = data + + # DWS-specific error fields (prioritized) + if "error_description" in error_data and isinstance( + error_data["error_description"], str + ): + return error_data["error_description"] + + if "error_message" in error_data and isinstance( + error_data["error_message"], str + ): + return error_data["error_message"] + + # Common error message fields + if "message" in error_data and isinstance(error_data["message"], str): + return error_data["message"] + + if "error" in error_data and isinstance(error_data["error"], str): + return error_data["error"] + + if "detail" in error_data and isinstance(error_data["detail"], str): + return error_data["detail"] + + if "details" in error_data and isinstance(error_data["details"], str): + return error_data["details"] + + # Handle nested error objects + if "error" in error_data and isinstance(error_data["error"], dict): + nested_error = error_data["error"] + + if "message" in nested_error and isinstance(nested_error["message"], str): + return nested_error["message"] + + if "description" in nested_error and isinstance( + nested_error["description"], str + ): + return nested_error["description"] + + # Handle errors array (common in validation responses) + if ( + 
"errors" in error_data + and isinstance(error_data["errors"], list) + and error_data["errors"] + ): + first_error = error_data["errors"][0] + + if isinstance(first_error, str): + return first_error + + if isinstance(first_error, dict): + error_obj = first_error + + if "message" in error_obj and isinstance(error_obj["message"], str): + return error_obj["message"] + + return None + + +def create_http_error(status: int, status_text: str, data: Any) -> NutrientError: + """Creates appropriate error for HTTP status codes. + + Args: + status: HTTP status code + status_text: HTTP status text + data: Response data + + Returns: + Appropriate NutrientError subclass + """ + message = extract_error_message(data) or f"HTTP {status}: {status_text}" + details = data if isinstance(data, dict) else {"response": data} + + if status in (401, 403): + return AuthenticationError(message, details, status) + + if 400 <= status < 500: + return ValidationError(message, details, status) + + return APIError(message, status, details) + + +def handle_response(response: httpx.Response) -> ApiResponse[Output]: + """Handles HTTP response and converts to standardized format. + + Args: + response: Response from the API + + Returns: + Standardized API response + + Raises: + NutrientError: For error responses + """ + status = response.status_code + status_text = response.reason_phrase + headers: dict[str, Any] = dict(response.headers) + + try: + data = response.json() + except (ValueError, json.JSONDecodeError): + data = response.content + + # Check for error status codes + if status >= 400: + raise create_http_error(status, status_text, data) + + return { + "data": data, + "status": status, + "statusText": status_text, + "headers": headers, + } + + +def convert_error( + error: Any, config: RequestConfig[Method, Endpoint, Input] +) -> NutrientError: + """Converts various error types to NutrientError. 
+ + Args: + error: The error to convert + config: Request configuration + + Returns: + Converted NutrientError + """ + if isinstance(error, NutrientError): + return error + + if isinstance(error, (httpx.RequestError, httpx.HTTPStatusError)): + response = getattr(error, "response", None) + request = getattr(error, "request", None) + message = str(error) + + if response is not None: + # HTTP error response + try: + response_data = response.json() + except (ValueError, json.JSONDecodeError): + response_data = response.text + return create_http_error( + response.status_code, response.reason_phrase, response_data + ) + + if request is not None: + # Network error (request made but no response) + sanitized_headers = (config.get("headers") or {}).copy() + if "Authorization" in sanitized_headers: + del sanitized_headers["Authorization"] + + return NetworkError( + "Network request failed", + { + "message": message, + "endpoint": config["endpoint"], + "method": config["method"], + "headers": sanitized_headers, + }, + ) + + # Request setup error + return ValidationError( + "Request configuration error", + { + "message": message, + "endpoint": config["endpoint"], + "method": config["method"], + "data": config.get("data"), + }, + ) + + # Unknown error + return NutrientError( + "Unexpected error occurred", + "UNKNOWN_ERROR", + { + "error": str(error), + "endpoint": config["endpoint"], + "method": config["method"], + "data": config.get("data"), + }, + ) + + +@overload +async def send_request( + config: RequestConfig[Literal["GET"], Literal["/account/info"], None], + client_options: NutrientClientOptions, +) -> ApiResponse[AccountInfo]: ... + + +@overload +async def send_request( + config: RequestConfig[ + Literal["POST"], Literal["/tokens"], CreateAuthTokenParameters + ], + client_options: NutrientClientOptions, +) -> ApiResponse[CreateAuthTokenResponse]: ... 
+ + +@overload +async def send_request( + config: RequestConfig[Literal["POST"], Literal["/build"], BuildRequestData], + client_options: NutrientClientOptions, +) -> ApiResponse[Union[BuildResponseJsonContents, bytes, str]]: ... + + +@overload +async def send_request( + config: RequestConfig[ + Literal["POST"], Literal["/analyze_build"], AnalyzeBuildRequestData + ], + client_options: NutrientClientOptions, +) -> ApiResponse[AnalyzeBuildResponse]: ... + + +@overload +async def send_request( + config: RequestConfig[Literal["POST"], Literal["/sign"], SignRequestData], + client_options: NutrientClientOptions, +) -> ApiResponse[bytes]: ... + + +@overload +async def send_request( + config: RequestConfig[Literal["POST"], Literal["/ai/redact"], RedactRequestData], + client_options: NutrientClientOptions, +) -> ApiResponse[bytes]: ... + + +@overload +async def send_request( + config: RequestConfig[ + Literal["DELETE"], Literal["/tokens"], DeleteTokenRequestData + ], + client_options: NutrientClientOptions, +) -> ApiResponse[None]: ... + + +async def send_request( + config: RequestConfig[Method, Endpoint, Input], + client_options: NutrientClientOptions, +) -> ApiResponse[Output]: + """Sends HTTP request to Nutrient DWS Processor API. + Handles authentication, file uploads, and error conversion. 
+ + Args: + config: Request configuration + client_options: Client options + + Returns: + API response + + Raises: + NutrientError: For various error conditions + """ + try: + # Resolve API key (string or function) + api_key = await resolve_api_key(client_options["apiKey"]) + + # Build full URL + base_url: str = client_options.get("baseUrl") or "https://api.nutrient.io" + url = f"{base_url.rstrip('/')}{config['endpoint']}" + + headers = config.get("headers") or {} + headers["Authorization"] = f"Bearer {api_key}" + headers["User-Agent"] = get_user_agent() + + # Prepare request configuration + request_config: dict[str, Any] = { + "method": config["method"], + "url": url, + "headers": headers, + "timeout": client_options.get("timeout", None), + } + + # Prepare request body + request_config = prepare_request_body(request_config, config) + + # Make request using httpx async client + async with httpx.AsyncClient() as client: + response = await client.request(**request_config) + + # Handle response + return handle_response(response) + except Exception as error: + raise convert_error(error, config) diff --git a/src/nutrient_dws/http_client.py b/src/nutrient_dws/http_client.py deleted file mode 100644 index 8483428..0000000 --- a/src/nutrient_dws/http_client.py +++ /dev/null @@ -1,192 +0,0 @@ -"""HTTP client abstraction for API communication.""" - -import json -import logging -from typing import Any - -import requests -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -from nutrient_dws.exceptions import ( - APIError, - AuthenticationError, - NutrientTimeoutError, - ValidationError, -) - -logger = logging.getLogger(__name__) - - -class HTTPClient: - """HTTP client with connection pooling and retry logic.""" - - def __init__(self, api_key: str | None, timeout: int = 300) -> None: - """Initialize HTTP client with authentication. - - Args: - api_key: API key for authentication. - timeout: Request timeout in seconds. 
- """ - self._api_key = api_key - self._timeout = timeout - self._session = self._create_session() - self._base_url = "https://api.pspdfkit.com" - - def _create_session(self) -> requests.Session: - """Create requests session with retry logic.""" - session = requests.Session() - - # Configure retries with exponential backoff - retry_strategy = Retry( - total=3, - backoff_factor=1, - status_forcelist=[429, 500, 502, 503, 504], - allowed_methods=["GET", "POST"], - raise_on_status=False, # We'll handle status codes ourselves - ) - adapter = HTTPAdapter( - max_retries=retry_strategy, - pool_connections=10, - pool_maxsize=10, - ) - session.mount("http://", adapter) - session.mount("https://", adapter) - - # Set default headers - headers = { - "User-Agent": "nutrient-dws-python-client/0.1.0", - } - if self._api_key: - headers["Authorization"] = f"Bearer {self._api_key}" - - session.headers.update(headers) - - return session - - def _handle_response(self, response: requests.Response) -> bytes: - """Handle API response and raise appropriate exceptions. - - Args: - response: Response from the API. - - Returns: - Response content as bytes. - - Raises: - AuthenticationError: For 401/403 responses. - ValidationError: For 422 responses. - APIError: For other error responses. 
- """ - # Extract request ID if available - request_id = response.headers.get("X-Request-Id") - - try: - response.raise_for_status() - except requests.exceptions.HTTPError: - # Try to parse error message from response - error_message = f"HTTP {response.status_code}" - error_details = None - - try: - error_data = response.json() - error_message = error_data.get("message", error_message) - error_details = error_data.get("errors", error_data.get("details")) - except (json.JSONDecodeError, requests.exceptions.JSONDecodeError): - # If response is not JSON, use text content - if response.text: - error_message = f"{error_message}: {response.text[:200]}" - - # Handle specific status codes - if response.status_code in (401, 403): - raise AuthenticationError( - error_message or "Authentication failed. Check your API key." - ) from None - elif response.status_code == 422: - raise ValidationError( - error_message or "Request validation failed", - errors=error_details, - ) from None - else: - raise APIError( - error_message, - status_code=response.status_code, - response_body=response.text, - request_id=request_id, - ) from None - - return response.content - - def post( - self, - endpoint: str, - files: dict[str, Any] | None = None, - data: dict[str, Any] | None = None, - json_data: dict[str, Any] | None = None, - ) -> bytes: - """Make POST request to API. - - Args: - endpoint: API endpoint path. - files: Files to upload. - data: Form data. - json_data: JSON data (for multipart requests). - - Returns: - Response content as bytes. - - Raises: - AuthenticationError: If API key is missing or invalid. - TimeoutError: If request times out. - APIError: For other API errors. 
- """ - if not self._api_key: - raise AuthenticationError("API key is required but not provided") - - url = f"{self._base_url}{endpoint}" - logger.debug(f"POST {url}") - - # Prepare multipart data if json_data is provided - prepared_data = data or {} - if json_data is not None: - prepared_data["instructions"] = json.dumps(json_data) - - try: - response = self._session.post( - url, - files=files, - data=prepared_data, - timeout=self._timeout, - ) - except requests.exceptions.Timeout as e: - raise NutrientTimeoutError(f"Request timed out after {self._timeout} seconds") from e - except requests.exceptions.ConnectionError as e: - raise APIError(f"Connection error: {e!s}") from e - except requests.exceptions.RequestException as e: - raise APIError(f"Request failed: {e!s}") from e - - logger.debug(f"Response: {response.status_code}") - - # Clean up file handles after request - if files: - for _, file_data in files.items(): - if hasattr(file_data, "close"): - file_data.close() - elif isinstance(file_data, tuple) and len(file_data) > 1: - file_obj = file_data[1] - if hasattr(file_obj, "close"): - file_obj.close() - - return self._handle_response(response) - - def close(self) -> None: - """Close the session.""" - self._session.close() - - def __enter__(self) -> "HTTPClient": - """Context manager entry.""" - return self - - def __exit__(self, *args: Any) -> None: - """Context manager exit.""" - self.close() diff --git a/src/nutrient_dws/inputs.py b/src/nutrient_dws/inputs.py new file mode 100644 index 0000000..5acb5cf --- /dev/null +++ b/src/nutrient_dws/inputs.py @@ -0,0 +1,223 @@ +import contextlib +import io +import os +import re +from pathlib import Path +from typing import BinaryIO, TypeGuard +from urllib.parse import urlparse + +import aiofiles +import httpx + +FileInput = str | Path | bytes | BinaryIO + +NormalizedFileData = tuple[bytes, str] + + +def is_url(string: str) -> bool: + """Checks if a given string is a valid URL. + + Args: + string: The string to validate. 
+ + Returns: + True if the string is a valid URL, False otherwise. + """ + try: + result = urlparse(string) + # A valid URL must have a scheme (e.g., 'http') and a network location (e.g., 'www.google.com') + return all([result.scheme, result.netloc]) + except ValueError: + return False + + +def is_valid_pdf(file_bytes: bytes) -> bool: + """Check if a file is a valid PDF.""" + return file_bytes.startswith(b"%PDF-") + + +def is_remote_file_input(file_input: FileInput) -> TypeGuard[str]: + """Check if the file input is a remote URL. + + Args: + file_input: The file input to check + + Returns: + True if the file input is a remote URL + """ + return isinstance(file_input, str) and is_url(file_input) + + +async def process_file_input(file_input: FileInput) -> NormalizedFileData: + """Convert various file input types to bytes. + + Args: + file_input: File path, bytes, or file-like object. + + Returns: + tuple of (file_bytes, filename). + + Raises: + FileNotFoundError: If file path doesn't exist. + ValueError: If input type is not supported. 
+ """ + # Handle different file input types using pattern matching + match file_input: + case Path() if not file_input.exists(): + raise FileNotFoundError(f"File not found: {file_input}") + case Path(): + async with aiofiles.open(file_input, "rb") as f: + content = await f.read() + return content, file_input.name + case str(): + path = Path(file_input) + if not path.exists(): + raise FileNotFoundError(f"File not found: {file_input}") + async with aiofiles.open(path, "rb") as f: + content = await f.read() + return content, path.name + case bytes(): + return file_input, "document" + case _ if hasattr(file_input, "read"): + # Handle file-like objects (both sync and async) + if hasattr(file_input, "aread"): + # Async file-like object + current_pos = None + if hasattr(file_input, "seek") and hasattr(file_input, "tell"): + try: + current_pos = ( + await file_input.atell() + if hasattr(file_input, "atell") + else file_input.tell() + ) + if hasattr(file_input, "aseek"): + await file_input.aseek(0) + else: + file_input.seek(0) + except (OSError, io.UnsupportedOperation): + pass + + content = await file_input.aread() + if isinstance(content, str): + content = content.encode() + + # Restore position if we saved it + if current_pos is not None: + with contextlib.suppress(OSError, io.UnsupportedOperation): + if hasattr(file_input, "aseek"): + await file_input.aseek(current_pos) + else: + file_input.seek(current_pos) + else: + # Synchronous file-like object + # Save current position if seekable + current_pos = None + if hasattr(file_input, "seek") and hasattr(file_input, "tell"): + try: + current_pos = file_input.tell() + file_input.seek(0) # Read from beginning + except (OSError, io.UnsupportedOperation): + pass + + content = file_input.read() + if isinstance(content, str): + content = content.encode() + + # Restore position if we saved it + if current_pos is not None: + with contextlib.suppress(OSError, io.UnsupportedOperation): + file_input.seek(current_pos) + + filename = 
getattr(file_input, "name", "document") + if hasattr(filename, "__fspath__"): + filename = os.path.basename(os.fspath(filename)) + elif isinstance(filename, bytes): + filename = os.path.basename(filename.decode()) + elif isinstance(filename, str): + filename = os.path.basename(filename) + return content, str(filename) + case _: + raise ValueError(f"Unsupported file input type: {type(file_input)}") + + +async def process_remote_file_input(url: str) -> NormalizedFileData: + """Convert various file input types to bytes.""" + async with httpx.AsyncClient() as client: + response = await client.get(url) + # This will raise an exception for bad responses (4xx or 5xx status codes) + response.raise_for_status() + # The .content attribute holds the raw bytes of the response + file_bytes = response.content + + filename = "downloaded_file" + # Try to get filename from 'Content-Disposition' header first + header = response.headers.get("content-disposition") + if header: + # Use regex to find a filename in the header + match = re.search(r'filename="?([^"]+)"?', header) + if match: + filename = match.group(1) + + return file_bytes, filename + + +def validate_file_input(file_input: FileInput) -> bool: + """Validate that the file input is in a supported format. + + Args: + file_input: The file input to validate + + Returns: + True if the file input is valid + """ + if isinstance(file_input, (bytes, str)): + return True + elif isinstance(file_input, Path): + return file_input.exists() and file_input.is_file() + elif hasattr(file_input, "read"): + return True + return False + + +def get_pdf_page_count(pdf_bytes: bytes) -> int: + """Zero dependency way to get the number of pages in a PDF. + + Args: + pdf_bytes: PDF file bytes + + Returns: + Number of pages in a PDF. 
+ """ + # Find all PDF objects + objects = re.findall(rb"(\d+)\s+(\d+)\s+obj(.*?)endobj", pdf_bytes, re.DOTALL) + + # Get the Catalog Object + catalog_obj = None + for _obj_num, _gen_num, obj_data in objects: + if b"/Type" in obj_data and b"/Catalog" in obj_data: + catalog_obj = obj_data + break + + if not catalog_obj: + raise ValueError("Could not find /Catalog object in PDF.") + + # Extract /Pages reference (e.g. 3 0 R) + pages_ref_match = re.search(rb"/Pages\s+(\d+)\s+(\d+)\s+R", catalog_obj) + if not pages_ref_match: + raise ValueError("Could not find /Pages reference in /Catalog.") + pages_obj_num = pages_ref_match.group(1).decode() + pages_obj_gen = pages_ref_match.group(2).decode() + + # Step 3: Find the referenced /Pages object + pages_obj_pattern = rf"{pages_obj_num}\s+{pages_obj_gen}\s+obj(.*?)endobj".encode() + pages_obj_match = re.search(pages_obj_pattern, pdf_bytes, re.DOTALL) + if not pages_obj_match: + raise ValueError("Could not find root /Pages object.") + pages_obj_data = pages_obj_match.group(1) + + # Step 4: Extract /Count + count_match = re.search(rb"/Count\s+(\d+)", pages_obj_data) + if not count_match: + raise ValueError("Could not find /Count in root /Pages object.") + + return int(count_match.group(1)) diff --git a/tests/integration/py.typed b/src/nutrient_dws/types/__init__.py similarity index 100% rename from tests/integration/py.typed rename to src/nutrient_dws/types/__init__.py diff --git a/src/nutrient_dws/types/account_info.py b/src/nutrient_dws/types/account_info.py new file mode 100644 index 0000000..52a8e64 --- /dev/null +++ b/src/nutrient_dws/types/account_info.py @@ -0,0 +1,22 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + + +class APIKeys(TypedDict): + live: NotRequired[str] + + +SubscriptionType = Literal["free", "paid", "enterprise"] + + +class Usage(TypedDict): + totalCredits: NotRequired[int] + usedCredits: NotRequired[int] + + +class AccountInfo(TypedDict): + apiKeys: 
NotRequired[APIKeys] + signedIn: NotRequired[bool] + subscriptionType: NotRequired[SubscriptionType] + usage: NotRequired[Usage] diff --git a/src/nutrient_dws/types/analyze_response.py b/src/nutrient_dws/types/analyze_response.py new file mode 100644 index 0000000..45ef137 --- /dev/null +++ b/src/nutrient_dws/types/analyze_response.py @@ -0,0 +1,16 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + + +class RequiredFeatures(TypedDict): + unit_cost: NotRequired[float] + unit_type: NotRequired[Literal["per_use", "per_output_page"]] + units: NotRequired[int] + cost: NotRequired[float] + usage: NotRequired[list[str]] + + +class AnalyzeBuildResponse(TypedDict): + cost: NotRequired[float] + required_features: NotRequired[dict[str, RequiredFeatures]] diff --git a/src/nutrient_dws/types/annotation/__init__.py b/src/nutrient_dws/types/annotation/__init__.py new file mode 100644 index 0000000..5bd7176 --- /dev/null +++ b/src/nutrient_dws/types/annotation/__init__.py @@ -0,0 +1,33 @@ +from typing import Union + +from nutrient_dws.types.annotation.ellipse import EllipseAnnotation +from nutrient_dws.types.annotation.image import ImageAnnotation +from nutrient_dws.types.annotation.ink import InkAnnotation +from nutrient_dws.types.annotation.line import LineAnnotation +from nutrient_dws.types.annotation.link import LinkAnnotation +from nutrient_dws.types.annotation.markup import MarkupAnnotation +from nutrient_dws.types.annotation.note import NoteAnnotation +from nutrient_dws.types.annotation.polygon import PolygonAnnotation +from nutrient_dws.types.annotation.polyline import PolylineAnnotation +from nutrient_dws.types.annotation.rectangle import RectangleAnnotation +from nutrient_dws.types.annotation.redaction import RedactionAnnotation +from nutrient_dws.types.annotation.stamp import StampAnnotation +from nutrient_dws.types.annotation.text import TextAnnotation +from nutrient_dws.types.annotation.widget import WidgetAnnotation + +Annotation 
= Union[ + MarkupAnnotation, + RedactionAnnotation, + TextAnnotation, + InkAnnotation, + LinkAnnotation, + NoteAnnotation, + EllipseAnnotation, + RectangleAnnotation, + LineAnnotation, + PolylineAnnotation, + PolygonAnnotation, + ImageAnnotation, + StampAnnotation, + WidgetAnnotation, +] diff --git a/src/nutrient_dws/types/annotation/base.py b/src/nutrient_dws/types/annotation/base.py new file mode 100644 index 0000000..0e80b69 --- /dev/null +++ b/src/nutrient_dws/types/annotation/base.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +if TYPE_CHECKING: + from nutrient_dws.types.instant_json.actions import Action + from nutrient_dws.types.misc import ( + AnnotationBbox, + AnnotationCustomData, + AnnotationNote, + AnnotationOpacity, + MeasurementPrecision, + MeasurementScale, + PageIndex, + PdfObjectId, + ) + + +class V1(TypedDict): + v: Literal[1] + pageIndex: int + bbox: AnnotationBbox + action: NotRequired[Action] + opacity: NotRequired[float] + pdfObjectId: NotRequired[int] + id: NotRequired[str] + flags: NotRequired[ + list[ + Literal[ + "noPrint", + "noZoom", + "noRotate", + "noView", + "hidden", + "invisible", + "readOnly", + "locked", + "toggleNoView", + "lockedContents", + ] + ] + ] + createdAt: NotRequired[str] + updatedAt: NotRequired[str] + name: NotRequired[str] + creatorName: NotRequired[str] + customData: NotRequired[AnnotationCustomData | None] + + +class V2(TypedDict): + v: Literal[2] + pageIndex: PageIndex + bbox: AnnotationBbox + action: NotRequired[Action] + opacity: NotRequired[AnnotationOpacity] + pdfObjectId: NotRequired[PdfObjectId] + id: NotRequired[str] + flags: NotRequired[ + list[ + Literal[ + "noPrint", + "noZoom", + "noRotate", + "noView", + "hidden", + "invisible", + "readOnly", + "locked", + "toggleNoView", + "lockedContents", + ] + ] + ] + createdAt: NotRequired[str] + updatedAt: NotRequired[str] + name: 
NotRequired[str] + creatorName: NotRequired[str] + customData: NotRequired[AnnotationCustomData | None] + + +class BaseShapeAnnotation(TypedDict): + strokeDashArray: NotRequired[list[float]] + strokeWidth: NotRequired[float] + strokeColor: NotRequired[str] + note: NotRequired[AnnotationNote] + measurementScale: NotRequired[MeasurementScale] + measurementPrecision: NotRequired[MeasurementPrecision] + + +BaseAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/ellipse.py b/src/nutrient_dws/types/annotation/ellipse.py new file mode 100644 index 0000000..516ddea --- /dev/null +++ b/src/nutrient_dws/types/annotation/ellipse.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 +from nutrient_dws.types.annotation.base import BaseShapeAnnotation + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + CloudyBorderInset, + CloudyBorderIntensity, + FillColor, + ) + + +class EllipseBase(TypedDict): + type: Literal["pspdfkit/shape/ellipse"] + fillColor: NotRequired[FillColor] + cloudyBorderIntensity: NotRequired[CloudyBorderIntensity] + cloudyBorderInset: NotRequired[CloudyBorderInset] + + +class V1(BaseV1, BaseShapeAnnotation, EllipseBase): ... + + +class V2(BaseV2, BaseShapeAnnotation, EllipseBase): ... 
+ + +EllipseAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/image.py b/src/nutrient_dws/types/annotation/image.py new file mode 100644 index 0000000..f6ca973 --- /dev/null +++ b/src/nutrient_dws/types/annotation/image.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import AnnotationNote, AnnotationRotation + + +class ImageBase(TypedDict): + type: Literal["pspdfkit/image"] + description: NotRequired[str] + fileName: NotRequired[str] + contentType: NotRequired[Literal["image/jpeg", "image/png", "application/pdf"]] + imageAttachmentId: NotRequired[str] + rotation: NotRequired[AnnotationRotation] + isSignature: NotRequired[bool] + note: NotRequired[AnnotationNote] + + +class V1(BaseV1, ImageBase): + pass + + +class V2(BaseV2, ImageBase): + pass + + +ImageAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/ink.py b/src/nutrient_dws/types/annotation/ink.py new file mode 100644 index 0000000..c1811b9 --- /dev/null +++ b/src/nutrient_dws/types/annotation/ink.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + AnnotationNote, + BackgroundColor, + BlendMode, + Lines, + ) + + +class InkBase(TypedDict): + type: Literal["pspdfkit/ink"] + lines: Lines + lineWidth: int + isDrawnNaturally: NotRequired[bool] + isSignature: NotRequired[bool] + strokeColor: NotRequired[str] + backgroundColor: NotRequired[BackgroundColor] + blendMode: NotRequired[BlendMode] 
+ note: NotRequired[AnnotationNote] + + +class V1(BaseV1, InkBase): + pass + + +class V2(BaseV2, InkBase): + pass + + +InkAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/line.py b/src/nutrient_dws/types/annotation/line.py new file mode 100644 index 0000000..5afabf2 --- /dev/null +++ b/src/nutrient_dws/types/annotation/line.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 +from nutrient_dws.types.annotation.base import BaseShapeAnnotation + +if TYPE_CHECKING: + from nutrient_dws.types.misc import FillColor, LineCaps, Point + + +class LineBase(TypedDict): + type: Literal["pspdfkit/shape/line"] + startPoint: Point + endPoint: Point + fillColor: NotRequired[FillColor] + lineCaps: NotRequired[LineCaps] + + +class V1(BaseV1, BaseShapeAnnotation, LineBase): ... + + +class V2(BaseV2, BaseShapeAnnotation, LineBase): ... 
+ + +LineAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/link.py b/src/nutrient_dws/types/annotation/link.py new file mode 100644 index 0000000..60411bb --- /dev/null +++ b/src/nutrient_dws/types/annotation/link.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import AnnotationNote, BorderStyle + + +class LinkBase(TypedDict): + type: Literal["pspdfkit/link"] + borderColor: NotRequired[str] + borderStyle: NotRequired[BorderStyle] + borderWidth: NotRequired[int] + note: NotRequired[AnnotationNote] + + +class V1(BaseV1, LinkBase): + pass + + +class V2(BaseV2, LinkBase): + pass + + +LinkAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/markup.py b/src/nutrient_dws/types/annotation/markup.py new file mode 100644 index 0000000..aca6f4c --- /dev/null +++ b/src/nutrient_dws/types/annotation/markup.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + AnnotationNote, + BlendMode, + IsCommentThreadRoot, + Rect, + ) + + +class MarkupBase(TypedDict): + type: Literal[ + "pspdfkit/markup/highlight", + "pspdfkit/markup/squiggly", + "pspdfkit/markup/strikeout", + "pspdfkit/markup/underline", + ] + rects: list[Rect] + blendMode: NotRequired[BlendMode] + color: str + note: NotRequired[AnnotationNote] + isCommentThreadRoot: NotRequired[IsCommentThreadRoot] + + +class V1(BaseV1, MarkupBase): ... + + +class V2(BaseV2, MarkupBase): ... 
+ + +MarkupAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/note.py b/src/nutrient_dws/types/annotation/note.py new file mode 100644 index 0000000..bd0b9ae --- /dev/null +++ b/src/nutrient_dws/types/annotation/note.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import AnnotationPlainText, IsCommentThreadRoot + +NoteIcon = Literal[ + "comment", + "rightPointer", + "rightArrow", + "check", + "circle", + "cross", + "insert", + "newParagraph", + "note", + "paragraph", + "help", + "star", + "key", +] + + +class NoteBase(TypedDict): + text: AnnotationPlainText + icon: NoteIcon + color: NotRequired[str] + isCommentThreadRoot: NotRequired[IsCommentThreadRoot] + + +class V1(BaseV1, NoteBase): + pass + + +class V2(BaseV2, NoteBase): + pass + + +NoteAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/polygon.py b/src/nutrient_dws/types/annotation/polygon.py new file mode 100644 index 0000000..7dade5d --- /dev/null +++ b/src/nutrient_dws/types/annotation/polygon.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 +from nutrient_dws.types.annotation.base import BaseShapeAnnotation + +if TYPE_CHECKING: + from nutrient_dws.types.misc import CloudyBorderIntensity, FillColor, Point + + +class PolygonBase(TypedDict): + type: Literal["pspdfkit/shape/polygon"] + fillColor: NotRequired[FillColor] + points: list[Point] + cloudyBorderIntensity: NotRequired[CloudyBorderIntensity] + + +class V1(BaseV1, BaseShapeAnnotation, 
PolygonBase): ... + + +class V2(BaseV2, BaseShapeAnnotation, PolygonBase): ... + + +PolygonAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/polyline.py b/src/nutrient_dws/types/annotation/polyline.py new file mode 100644 index 0000000..53d9259 --- /dev/null +++ b/src/nutrient_dws/types/annotation/polyline.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 +from nutrient_dws.types.annotation.base import BaseShapeAnnotation + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + CloudyBorderInset, + CloudyBorderIntensity, + FillColor, + LineCaps, + Point, + ) + + +class PolylineBase(TypedDict): + type: Literal["pspdfkit/shape/polyline"] + fillColor: NotRequired[FillColor] + points: list[Point] + lineCaps: NotRequired[LineCaps] + cloudyBorderIntensity: NotRequired[CloudyBorderIntensity] + cloudyBorderInset: NotRequired[CloudyBorderInset] + + +class V1(BaseV1, BaseShapeAnnotation, PolylineBase): ... + + +class V2(BaseV2, BaseShapeAnnotation, PolylineBase): ... 
+ + +PolylineAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/rectangle.py b/src/nutrient_dws/types/annotation/rectangle.py new file mode 100644 index 0000000..f37897b --- /dev/null +++ b/src/nutrient_dws/types/annotation/rectangle.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 +from nutrient_dws.types.annotation.base import BaseShapeAnnotation + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + CloudyBorderInset, + CloudyBorderIntensity, + FillColor, + ) + + +class RectangleBase(TypedDict): + type: Literal["pspdfkit/shape/rectangle"] + fillColor: NotRequired[FillColor] + cloudyBorderIntensity: NotRequired[CloudyBorderIntensity] + cloudyBorderInset: NotRequired[CloudyBorderInset] + + +class V1(BaseV1, BaseShapeAnnotation, RectangleBase): ... + + +class V2(BaseV2, BaseShapeAnnotation, RectangleBase): ... 
+ + +RectangleAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/redaction.py b/src/nutrient_dws/types/annotation/redaction.py new file mode 100644 index 0000000..1195af7 --- /dev/null +++ b/src/nutrient_dws/types/annotation/redaction.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import AnnotationNote, AnnotationRotation, Rect + + +class RedactionBase(TypedDict): + type: Literal["pspdfkit/markup/redaction"] + rects: NotRequired[list[Rect]] + outlineColor: NotRequired[str] + fillColor: NotRequired[str] + overlayText: NotRequired[str] + repeatOverlayText: NotRequired[bool] + color: NotRequired[str] + rotation: NotRequired[AnnotationRotation] + note: NotRequired[AnnotationNote] + + +class V1(BaseV1, RedactionBase): ... + + +class V2(BaseV2, RedactionBase): ... 
+ + +RedactionAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/stamp.py b/src/nutrient_dws/types/annotation/stamp.py new file mode 100644 index 0000000..f190b8b --- /dev/null +++ b/src/nutrient_dws/types/annotation/stamp.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import AnnotationNote, AnnotationRotation + + +class StampBase(TypedDict): + type: Literal["pspdfkit/stamp"] + stampType: Literal[ + "Accepted", + "Approved", + "AsIs", + "Completed", + "Confidential", + "Departmental", + "Draft", + "Experimental", + "Expired", + "Final", + "ForComment", + "ForPublicRelease", + "InformationOnly", + "InitialHere", + "NotApproved", + "NotForPublicRelease", + "PreliminaryResults", + "Rejected", + "Revised", + "SignHere", + "Sold", + "TopSecret", + "Void", + "Witness", + "Custom", + ] + title: NotRequired[str] + subtitle: NotRequired[str] + color: NotRequired[str] + rotation: NotRequired[AnnotationRotation] + note: NotRequired[AnnotationNote] + + +class V1(BaseV1, StampBase): + pass + + +class V2(BaseV2, StampBase): + pass + + +StampAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/text.py b/src/nutrient_dws/types/annotation/text.py new file mode 100644 index 0000000..73fc21b --- /dev/null +++ b/src/nutrient_dws/types/annotation/text.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + AnnotationPlainText, + AnnotationRotation, + BorderStyle, + 
CloudyBorderInset, + CloudyBorderIntensity, + Font, + FontColor, + FontSizeInt, + HorizontalAlign, + LineCap, + Point, + VerticalAlign, + ) + + +class Callout(TypedDict): + start: Point + end: Point + innerRectInset: list[float] + cap: NotRequired[LineCap] + knee: NotRequired[Point] + + +class TextBase(TypedDict): + type: Literal["pspdfkit/text"] + text: AnnotationPlainText + fontSize: FontSizeInt + fontStyle: NotRequired[list[Literal["bold", "italic"]]] + fontColor: NotRequired[FontColor] + font: NotRequired[Font] + backgroundColor: NotRequired[str] + horizontalAlign: NotRequired[HorizontalAlign] + verticalAlign: NotRequired[VerticalAlign] + rotation: NotRequired[AnnotationRotation] + isFitting: NotRequired[bool] + callout: NotRequired[Callout] + borderStyle: NotRequired[BorderStyle] + borderWidth: NotRequired[int] + cloudyBorderIntensity: NotRequired[CloudyBorderIntensity] + cloudyBorderInset: NotRequired[CloudyBorderInset] + + +class V1(BaseV1, TextBase): + pass + + +class V2(BaseV2, TextBase): + pass + + +TextAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/widget.py b/src/nutrient_dws/types/annotation/widget.py new file mode 100644 index 0000000..366a449 --- /dev/null +++ b/src/nutrient_dws/types/annotation/widget.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + AnnotationRotation, + BackgroundColor, + BorderStyle, + Font, + FontColor, + FontSizeAuto, + FontSizeInt, + HorizontalAlign, + VerticalAlign, + ) + + +class WidgetBase(TypedDict): + type: Literal["pspdfkit/widget"] + formFieldName: NotRequired[str] + borderColor: NotRequired[str] + borderStyle: NotRequired[BorderStyle] + borderWidth: NotRequired[int] + font: NotRequired[Font] + 
fontSize: NotRequired[FontSizeInt | FontSizeAuto] + fontColor: NotRequired[FontColor] + horizontalAlign: NotRequired[HorizontalAlign] + verticalAlign: NotRequired[VerticalAlign] + rotation: NotRequired[AnnotationRotation] + backgroundColor: NotRequired[BackgroundColor] + + +class V1(BaseV1, WidgetBase): + pass + + +class V2(BaseV2, WidgetBase): + pass + + +WidgetAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/build_actions.py b/src/nutrient_dws/types/build_actions.py new file mode 100644 index 0000000..4d18112 --- /dev/null +++ b/src/nutrient_dws/types/build_actions.py @@ -0,0 +1,161 @@ +from typing import Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.redaction import RedactionAnnotation +from nutrient_dws.types.file_handle import FileHandle +from nutrient_dws.types.misc import OcrLanguage, WatermarkDimension + + +class ApplyInstantJsonAction(TypedDict): + type: Literal["applyInstantJson"] + file: FileHandle + + +class ApplyXfdfActionOptions(TypedDict, total=False): + ignorePageRotation: NotRequired[bool] + richTextEnabled: NotRequired[bool] + + +class ApplyXfdfAction(TypedDict): + type: Literal["applyXfdf"] + file: FileHandle + ignorePageRotation: NotRequired[bool] + richTextEnabled: NotRequired[bool] + + +class FlattenAction(TypedDict): + type: Literal["flatten"] + annotationIds: NotRequired[list[str | int]] + + +class OcrAction(TypedDict): + type: Literal["ocr"] + language: OcrLanguage | list[OcrLanguage] + + +class RotateAction(TypedDict): + type: Literal["rotate"] + rotateBy: Literal[90, 180, 270] + + +class BaseWatermarkActionOptions(TypedDict): + width: WatermarkDimension + height: WatermarkDimension + top: NotRequired[WatermarkDimension] + right: NotRequired[WatermarkDimension] + bottom: NotRequired[WatermarkDimension] + left: NotRequired[WatermarkDimension] + rotation: NotRequired[float] + opacity: NotRequired[float] + + +class BaseWatermarkAction(BaseWatermarkActionOptions): + 
type: Literal["watermark"] + + +class TextWatermarkActionOptions(BaseWatermarkActionOptions, total=False): + fontFamily: NotRequired[str] + fontSize: NotRequired[int] + fontColor: NotRequired[str] + fontStyle: NotRequired[list[Literal["bold", "italic"]]] + + +class TextWatermarkAction(BaseWatermarkAction): + text: str + fontFamily: NotRequired[str] + fontSize: NotRequired[int] + fontColor: NotRequired[str] + fontStyle: NotRequired[list[Literal["bold", "italic"]]] + + +class ImageWatermarkActionOptions(BaseWatermarkActionOptions, total=False): ... + + +class ImageWatermarkAction(BaseWatermarkAction): + image: FileHandle + + +WatermarkAction = Union[TextWatermarkAction, ImageWatermarkAction] + +SearchPreset = Literal[ + "credit-card-number", + "date", + "email-address", + "international-phone-number", + "ipv4", + "ipv6", + "mac-address", + "north-american-phone-number", + "social-security-number", + "time", + "url", + "us-zip-code", + "vin", +] + + +class CreateRedactionsStrategyOptionsPreset(TypedDict): + includeAnnotations: NotRequired[bool] + start: NotRequired[int] + limit: NotRequired[int] + + +class CreateRedactionsStrategyOptionsRegex(TypedDict): + includeAnnotations: NotRequired[bool] + caseSensitive: NotRequired[bool] + start: NotRequired[int] + limit: NotRequired[int] + + +class CreateRedactionsStrategyOptionsText(TypedDict): + includeAnnotations: NotRequired[bool] + caseSensitive: NotRequired[bool] + start: NotRequired[int] + limit: NotRequired[int] + + +class BaseCreateRedactionsOptions(TypedDict): + content: NotRequired[RedactionAnnotation] + + +class BaseCreateRedactionsAction(BaseCreateRedactionsOptions): + type: Literal["createRedactions"] + + +class CreateRedactionsActionPreset(TypedDict, BaseCreateRedactionsAction): + strategy: Literal["preset"] + strategyOptions: CreateRedactionsStrategyOptionsPreset + + +class CreateRedactionsActionRegex(TypedDict, BaseCreateRedactionsAction): + strategy: Literal["regex"] + strategyOptions: 
CreateRedactionsStrategyOptionsRegex + + +class CreateRedactionsActionText(TypedDict, BaseCreateRedactionsAction): + strategy: Literal["text"] + strategyOptions: CreateRedactionsStrategyOptionsText + + +CreateRedactionsAction = Union[ + CreateRedactionsActionPreset, + CreateRedactionsActionRegex, + CreateRedactionsActionText, +] + + +class ApplyRedactionsAction(TypedDict): + type: Literal["applyRedactions"] + + +BuildAction = Union[ + ApplyInstantJsonAction, + ApplyXfdfAction, + FlattenAction, + OcrAction, + RotateAction, + WatermarkAction, + CreateRedactionsAction, + ApplyRedactionsAction, +] diff --git a/src/nutrient_dws/types/build_instruction.py b/src/nutrient_dws/types/build_instruction.py new file mode 100644 index 0000000..8d8ce9d --- /dev/null +++ b/src/nutrient_dws/types/build_instruction.py @@ -0,0 +1,13 @@ +from typing import TypedDict + +from typing_extensions import NotRequired + +from nutrient_dws.types.build_actions import BuildAction +from nutrient_dws.types.build_output import BuildOutput +from nutrient_dws.types.input_parts import Part + + +class BuildInstructions(TypedDict): + parts: list[Part] + actions: NotRequired[list[BuildAction]] + output: NotRequired[BuildOutput] diff --git a/src/nutrient_dws/types/build_output.py b/src/nutrient_dws/types/build_output.py new file mode 100644 index 0000000..e314b6f --- /dev/null +++ b/src/nutrient_dws/types/build_output.py @@ -0,0 +1,125 @@ +from typing import Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.misc import OcrLanguage, PageRange + + +class Metadata(TypedDict): + title: NotRequired[str] + author: NotRequired[str] + + +class Label(TypedDict): + pages: list[int] + label: str + + +PDFUserPermission = Literal[ + "printing", + "modification", + "extract", + "annotations_and_forms", + "fill_forms", + "extract_accessibility", + "assemble", + "print_high_quality", +] + + +class OptimizePdf(TypedDict): + grayscaleText: NotRequired[bool] + 
grayscaleGraphics: NotRequired[bool] + grayscaleImages: NotRequired[bool] + grayscaleFormFields: NotRequired[bool] + grayscaleAnnotations: NotRequired[bool] + disableImages: NotRequired[bool] + mrcCompression: NotRequired[bool] + imageOptimizationQuality: NotRequired[int] + linearize: NotRequired[bool] + + +class BasePDFOutput(TypedDict): + metadata: NotRequired[Metadata] + labels: NotRequired[list[Label]] + user_password: NotRequired[str] + owner_password: NotRequired[str] + user_permissions: NotRequired[list[PDFUserPermission]] + optimize: NotRequired[OptimizePdf] + + +PDFOutputOptions = BasePDFOutput + + +class PDFOutput(BasePDFOutput): + type: NotRequired[Literal["pdf"]] + + +class PDFAOutputOptions(PDFOutputOptions): + conformance: NotRequired[ + Literal[ + "pdfa-1a", "pdfa-1b", "pdfa-2a", "pdfa-2u", "pdfa-2b", "pdfa-3a", "pdfa-3u" + ] + ] + vectorization: NotRequired[bool] + rasterization: NotRequired[bool] + + +class PDFAOutput(PDFAOutputOptions): + type: Literal["pdfa"] + + +PDFUAOutputOptions = BasePDFOutput + + +class PDFUAOutput(PDFUAOutputOptions): + type: Literal["pdfua"] + + +class ImageOutputOptions(TypedDict): + format: NotRequired[Literal["png", "jpeg", "jpg", "webp"]] + pages: NotRequired[PageRange] + width: NotRequired[float] + height: NotRequired[float] + dpi: NotRequired[float] + + +class ImageOutput(ImageOutputOptions): + type: Literal["image"] + + +class JSONContentOutputOptions(TypedDict): + plainText: NotRequired[bool] + structuredText: NotRequired[bool] + keyValuePairs: NotRequired[bool] + tables: NotRequired[bool] + language: NotRequired[OcrLanguage | list[OcrLanguage]] + + +class JSONContentOutput(JSONContentOutputOptions): + type: Literal["json-content"] + + +class OfficeOutput(TypedDict): + type: Literal["docx", "xlsx", "pptx"] + + +class HTMLOutput(TypedDict): + type: Literal["html"] + layout: NotRequired[Literal["page", "reflow"]] + + +class MarkdownOutput(TypedDict): + type: Literal["markdown"] + + +BuildOutput = Union[ + PDFOutput, 
+ PDFAOutput, + PDFUAOutput, + ImageOutput, + JSONContentOutput, + OfficeOutput, + HTMLOutput, + MarkdownOutput, +] diff --git a/src/nutrient_dws/types/build_response_json.py b/src/nutrient_dws/types/build_response_json.py new file mode 100644 index 0000000..06fd943 --- /dev/null +++ b/src/nutrient_dws/types/build_response_json.py @@ -0,0 +1,129 @@ +from typing import TypedDict + +from typing_extensions import NotRequired + +PlainText = str + + +class JsonContentsBbox(TypedDict): + """Represents a rectangular region on the page.""" + + left: float + top: float + width: float + height: float + + +class Character(TypedDict): + """Character in structured text.""" + + bbox: JsonContentsBbox + char: str + + +class Line(TypedDict): + """Line in structured text.""" + + bbox: JsonContentsBbox + text: str + + +class Word(TypedDict): + """Word in structured text.""" + + bbox: JsonContentsBbox + text: str + + +class Paragraph(TypedDict): + """Paragraph in structured text.""" + + bbox: JsonContentsBbox + text: str + + +class StructuredText(TypedDict): + """Structured text content.""" + + characters: NotRequired[list[Character]] + lines: NotRequired[list[Line]] + paragraphs: NotRequired[list[Paragraph]] + words: NotRequired[list[Word]] + + +class KVPKey(TypedDict): + """Key-value pair key.""" + + bbox: JsonContentsBbox + confidence: float + text: str + + +class KVPValue(TypedDict): + """Key-value pair value.""" + + bbox: JsonContentsBbox + confidence: float + text: str + + +class KeyValuePair(TypedDict): + """Detected key-value pair.""" + + confidence: float + key: KVPKey + value: KVPValue + + +class TableCell(TypedDict): + """Table cell.""" + + bbox: JsonContentsBbox + rowIndex: int + colIndex: int + text: str + + +class TableColumn(TypedDict): + """Table column.""" + + bbox: JsonContentsBbox + + +class TableLine(TypedDict): + """Table line.""" + + bbox: JsonContentsBbox + + +class TableRow(TypedDict): + """Table row.""" + + bbox: JsonContentsBbox + + +class Table(TypedDict): 
+ """Detected table.""" + + confidence: float + bbox: JsonContentsBbox + cells: list[TableCell] + columns: list[TableColumn] + lines: list[TableLine] + rows: list[TableRow] + + +class PageJsonContents(TypedDict): + """JSON content for a single page.""" + + pageIndex: int + plainText: NotRequired[PlainText] + structuredText: NotRequired[StructuredText] + keyValuePairs: NotRequired[list[KeyValuePair]] + tables: NotRequired[list[Table]] + + +class BuildResponseJsonContents(TypedDict): + """Build response JSON contents.""" + + pages: NotRequired[list[PageJsonContents]] diff --git a/src/nutrient_dws/types/create_auth_token.py b/src/nutrient_dws/types/create_auth_token.py new file mode 100644 index 0000000..f288af2 --- /dev/null +++ b/src/nutrient_dws/types/create_auth_token.py @@ -0,0 +1,34 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + + +class CreateAuthTokenParameters(TypedDict): + allowedOperations: NotRequired[ + list[ + Literal[ + "annotations_api", + "compression_api", + "data_extraction_api", + "digital_signatures_api", + "document_editor_api", + "html_conversion_api", + "image_conversion_api", + "image_rendering_api", + "email_conversion_api", + "linearization_api", + "ocr_api", + "office_conversion_api", + "pdfa_api", + "pdf_to_office_conversion_api", + "redaction_api", + ] + ] + ] + allowedOrigins: NotRequired[list[str]] + expirationTime: NotRequired[int] + + +class CreateAuthTokenResponse(TypedDict): + id: NotRequired[str] + accessToken: NotRequired[str] diff --git a/src/nutrient_dws/types/error_response.py b/src/nutrient_dws/types/error_response.py new file mode 100644 index 0000000..9aa254c --- /dev/null +++ b/src/nutrient_dws/types/error_response.py @@ -0,0 +1,15 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + + +class FailingPath(TypedDict): + path: NotRequired[str] + details: NotRequired[str] + + +class HostedErrorResponse(TypedDict): + details: NotRequired[str] + status: 
NotRequired[Literal[400, 402, 408, 413, 422, 500]] + requestId: NotRequired[str] + failingPaths: NotRequired[list[FailingPath]] diff --git a/src/nutrient_dws/types/file_handle.py b/src/nutrient_dws/types/file_handle.py new file mode 100644 index 0000000..ba99ff2 --- /dev/null +++ b/src/nutrient_dws/types/file_handle.py @@ -0,0 +1,11 @@ +from typing import TypedDict, Union + +from typing_extensions import NotRequired + + +class RemoteFileHandle(TypedDict): + url: str + sha256: NotRequired[str] + + +FileHandle = Union[RemoteFileHandle, str] diff --git a/src/nutrient_dws/types/input_parts.py b/src/nutrient_dws/types/input_parts.py new file mode 100644 index 0000000..4dfe86a --- /dev/null +++ b/src/nutrient_dws/types/input_parts.py @@ -0,0 +1,63 @@ +from typing import Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.build_actions import BuildAction +from nutrient_dws.types.file_handle import FileHandle +from nutrient_dws.types.misc import PageLayout, PageRange + + +class FilePartOptions(TypedDict): + password: NotRequired[str] + pages: NotRequired[PageRange] + layout: NotRequired[PageLayout] + content_type: NotRequired[str] + actions: NotRequired[list[BuildAction]] + + +class FilePart(FilePartOptions): + file: FileHandle + + +class HTMLPartOptions(TypedDict): + layout: NotRequired[PageLayout] + + +class HTMLPart(HTMLPartOptions): + html: FileHandle + assets: NotRequired[list[str]] + actions: NotRequired[list[BuildAction]] + + +class NewPagePartOptions(TypedDict): + pageCount: NotRequired[int] + layout: NotRequired[PageLayout] + + +class NewPagePart(NewPagePartOptions): + page: Literal["new"] + actions: NotRequired[list[BuildAction]] + + +DocumentId = str + + +class DocumentEngineID(TypedDict): + id: DocumentId | Literal["#self"] + layer: NotRequired[str] + + +class DocumentPartOptions(TypedDict): + password: NotRequired[str] + pages: NotRequired[PageRange] + layer: NotRequired[str] + + +class DocumentPart(TypedDict): + 
document: DocumentEngineID + password: NotRequired[str] + pages: NotRequired[PageRange] + actions: NotRequired[list[BuildAction]] + + +Part = Union[FilePart, HTMLPart, NewPagePart, DocumentPart] diff --git a/src/nutrient_dws/types/instant_json/__init__.py b/src/nutrient_dws/types/instant_json/__init__.py new file mode 100644 index 0000000..36c5d7d --- /dev/null +++ b/src/nutrient_dws/types/instant_json/__init__.py @@ -0,0 +1,27 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation import Annotation +from nutrient_dws.types.instant_json.attachments import Attachments +from nutrient_dws.types.instant_json.bookmark import Bookmark +from nutrient_dws.types.instant_json.comment import CommentContent +from nutrient_dws.types.instant_json.form_field import FormField +from nutrient_dws.types.instant_json.form_field_value import FormFieldValue + + +class PdfId(TypedDict): + permanent: NotRequired[str] + changing: NotRequired[str] + + +class InstantJson(TypedDict): + format: Literal["https://pspdfkit.com/instant-json/v1"] + annotations: NotRequired[list[Annotation]] + attachments: NotRequired[Attachments] + formFields: NotRequired[list[FormField]] + formFieldValues: NotRequired[list[FormFieldValue]] + bookmarks: NotRequired[list[Bookmark]] + comments: NotRequired[list[CommentContent]] + skippedPdfObjectIds: NotRequired[list[int]] + pdfId: NotRequired[PdfId] diff --git a/src/nutrient_dws/types/instant_json/actions.py b/src/nutrient_dws/types/instant_json/actions.py new file mode 100644 index 0000000..49e2fee --- /dev/null +++ b/src/nutrient_dws/types/instant_json/actions.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +from typing import Literal, TypedDict, Union + +from typing_extensions import NotRequired + + +class BaseAction(TypedDict): + subAction: NotRequired[Action] + + +class GoToAction(BaseAction): + type: Literal["goTo"] + pageIndex: int + + +class GoToRemoteAction(BaseAction): + type: 
Literal["goToRemote"] + relativePath: str + namedDestination: NotRequired[str] + + +class GoToEmbeddedAction(BaseAction): + type: Literal["goToEmbedded"] + relativePath: str + newWindow: NotRequired[bool] + targetType: NotRequired[Literal["parent", "child"]] + + +class LaunchAction(BaseAction): + type: Literal["launch"] + filePath: str + + +class URIAction(BaseAction): + type: Literal["uri"] + uri: str + + +class AnnotationReference(TypedDict): + fieldName: NotRequired[str] + pdfObjectId: NotRequired[int] + + +class HideAction(BaseAction): + type: Literal["hide"] + hide: bool + annotationReferences: list[AnnotationReference] + + +class JavaScriptAction(BaseAction): + type: Literal["javascript"] + script: str + + +class SubmitFormAction(BaseAction): + type: Literal["submitForm"] + uri: str + flags: list[ + Literal[ + "includeExclude", + "includeNoValueFields", + "exportFormat", + "getMethod", + "submitCoordinated", + "xfdf", + "includeAppendSaves", + "includeAnnotations", + "submitPDF", + "canonicalFormat", + "excludeNonUserAnnotations", + "excludeFKey", + "embedForm", + ] + ] + fields: NotRequired[list[AnnotationReference]] + + +class ResetFormAction(BaseAction): + type: Literal["resetForm"] + flags: NotRequired[Literal["includeExclude"]] + fields: NotRequired[list[AnnotationReference]] + + +class NamedAction(BaseAction): + type: Literal["named"] + action: Literal[ + "nextPage", + "prevPage", + "firstPage", + "lastPage", + "goBack", + "goForward", + "goToPage", + "find", + "print", + "outline", + "search", + "brightness", + "zoomIn", + "zoomOut", + "saveAs", + "info", + ] + + +Action = Union[ + GoToAction, + GoToRemoteAction, + GoToEmbeddedAction, + LaunchAction, + URIAction, + HideAction, + JavaScriptAction, + SubmitFormAction, + ResetFormAction, + NamedAction, +] diff --git a/src/nutrient_dws/types/instant_json/attachments.py b/src/nutrient_dws/types/instant_json/attachments.py new file mode 100644 index 0000000..d84f0be --- /dev/null +++ 
b/src/nutrient_dws/types/instant_json/attachments.py @@ -0,0 +1,11 @@ +from typing import Optional, TypedDict + +from typing_extensions import NotRequired + + +class Attachment(TypedDict): + binary: NotRequired[str] + contentType: NotRequired[str] + + +Attachments = Optional[dict[str, Attachment]] diff --git a/src/nutrient_dws/types/instant_json/bookmark.py b/src/nutrient_dws/types/instant_json/bookmark.py new file mode 100644 index 0000000..ad98210 --- /dev/null +++ b/src/nutrient_dws/types/instant_json/bookmark.py @@ -0,0 +1,13 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + +from nutrient_dws.types.instant_json.actions import Action + + +class Bookmark(TypedDict): + name: NotRequired[str] + type: Literal["pspdfkit/bookmark"] + v: Literal[1] + action: Action + pdfBookmarkId: NotRequired[str] diff --git a/src/nutrient_dws/types/instant_json/comment.py b/src/nutrient_dws/types/instant_json/comment.py new file mode 100644 index 0000000..dd38546 --- /dev/null +++ b/src/nutrient_dws/types/instant_json/comment.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Literal, Optional, TypedDict, Union + +from typing_extensions import NotRequired + +if TYPE_CHECKING: + from nutrient_dws.types.misc import PageIndex, PdfObjectId + + +class AnnotationText(TypedDict): + format: NotRequired[Literal["xhtml", "plain"]] + value: NotRequired[str] + + +IsoDateTime = str + + +CustomData = Optional[dict[str, Any]] + + +class V2(TypedDict): + type: Literal["pspdfkit/comment"] + pageIndex: PageIndex + rootId: str + text: AnnotationText + v: Literal[2] + createdAt: NotRequired[IsoDateTime] + creatorName: NotRequired[str] + customData: NotRequired[CustomData | None] + pdfObjectId: NotRequired[PdfObjectId] + updatedAt: NotRequired[IsoDateTime] + + +class V1(TypedDict): + type: Literal["pspdfkit/comment"] + pageIndex: PageIndex + rootId: str + text: str + v: Literal[1] + createdAt: NotRequired[IsoDateTime] 
+ creatorName: NotRequired[str] + customData: NotRequired[CustomData | None] + pdfObjectId: NotRequired[PdfObjectId] + updatedAt: NotRequired[IsoDateTime] + + +CommentContent = Union[V2, V1] diff --git a/src/nutrient_dws/types/instant_json/form_field.py b/src/nutrient_dws/types/instant_json/form_field.py new file mode 100644 index 0000000..e8d0b19 --- /dev/null +++ b/src/nutrient_dws/types/instant_json/form_field.py @@ -0,0 +1,117 @@ +from typing import Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.instant_json.actions import Action + + +class BaseFormField(TypedDict): + v: Literal[1] + id: NotRequired[str] + name: str + label: str + annotationIds: list[str] + pdfObjectId: NotRequired[int] + flags: NotRequired[list[Literal["readOnly", "required", "noExport"]]] + + +class ButtonFormField(BaseFormField): + type: Literal["pspdfkit/form-field/button"] + buttonLabel: str + + +class FormFieldOption(TypedDict): + label: str + value: str + + +FormFieldOptions = list[FormFieldOption] + + +FormFieldDefaultValues = list[str] + + +class FormFieldAdditionalActionsEvent(TypedDict): + onChange: NotRequired[Action] + onCalculate: NotRequired[Action] + + +class ChoiceFormField(TypedDict): + options: FormFieldOptions + multiSelect: NotRequired[bool] + commitOnChange: NotRequired[bool] + defaultValues: NotRequired[FormFieldDefaultValues] + additionalActions: NotRequired[FormFieldAdditionalActionsEvent] + + +class FormFieldAdditionalActionsInput(TypedDict): + onInput: NotRequired[Action] + onFormat: NotRequired[Action] + + +class AdditionalActions( + FormFieldAdditionalActionsEvent, FormFieldAdditionalActionsInput +): + pass + + +class ListBoxFormField(BaseFormField): + type: NotRequired[Literal["pspdfkit/form-field/listbox"]] + additionalActions: NotRequired[AdditionalActions] + options: FormFieldOptions + multiSelect: NotRequired[bool] + commitOnChange: NotRequired[bool] + defaultValues: NotRequired[FormFieldDefaultValues] + + 
+class ComboBoxFormField(BaseFormField, ChoiceFormField): + type: NotRequired[Literal["pspdfkit/form-field/combobox"]] + edit: bool + doNotSpellCheck: bool + + +class CheckboxFormField(BaseFormField): + type: Literal["pspdfkit/form-field/checkbox"] + options: FormFieldOptions + defaultValues: FormFieldDefaultValues + additionalActions: NotRequired[FormFieldAdditionalActionsEvent] + + +FormFieldDefaultValue = str + + +class RadioButtonFormField(BaseFormField): + type: Literal["pspdfkit/form-field/radio"] + options: FormFieldOptions + defaultValue: NotRequired[FormFieldDefaultValue] + noToggleToOff: NotRequired[bool] + radiosInUnison: NotRequired[bool] + + +class TextFormField(BaseFormField): + type: Literal["pspdfkit/form-field/text"] + password: NotRequired[bool] + maxLength: NotRequired[int] + doNotSpellCheck: bool + doNotScroll: bool + multiLine: bool + comb: bool + defaultValue: FormFieldDefaultValue + richText: NotRequired[bool] + richTextValue: NotRequired[str] + additionalActions: NotRequired[AdditionalActions] + + +class SignatureFormField(BaseFormField): + type: NotRequired[Literal["pspdfkit/form-field/signature"]] + + +FormField = Union[ + ButtonFormField, + ListBoxFormField, + ComboBoxFormField, + CheckboxFormField, + RadioButtonFormField, + TextFormField, + SignatureFormField, +] diff --git a/src/nutrient_dws/types/instant_json/form_field_value.py b/src/nutrient_dws/types/instant_json/form_field_value.py new file mode 100644 index 0000000..29e90f0 --- /dev/null +++ b/src/nutrient_dws/types/instant_json/form_field_value.py @@ -0,0 +1,12 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + + +class FormFieldValue(TypedDict): + name: str + value: NotRequired[str | None | list[str]] + type: Literal["pspdfkit/form-field-value"] + v: Literal[1] + optionIndexes: NotRequired[list[int]] + isFitting: NotRequired[bool] diff --git a/src/nutrient_dws/types/misc.py b/src/nutrient_dws/types/misc.py new file mode 100644 index 
0000000..37756d3 --- /dev/null +++ b/src/nutrient_dws/types/misc.py @@ -0,0 +1,317 @@ +from typing import Any, Literal, Optional, TypedDict + +from typing_extensions import NotRequired + + +class PageRange(TypedDict): + start: NotRequired[int] + end: NotRequired[int] + + +class Pages(TypedDict): + start: int + end: int + + +class Size(TypedDict): + width: NotRequired[float] + height: NotRequired[float] + + +class Margin(TypedDict): + left: NotRequired[float] + top: NotRequired[float] + right: NotRequired[float] + bottom: NotRequired[float] + + +class PageLayout(TypedDict): + orientation: NotRequired[Literal["portrait", "landscape"]] + size: NotRequired[ + Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "Letter", "Legal"] + | Size + ] + margin: NotRequired[Margin] + + +OcrLanguage = Literal[ + "afrikaans", + "albanian", + "arabic", + "armenian", + "azerbaijani", + "basque", + "belarusian", + "bengali", + "bosnian", + "bulgarian", + "catalan", + "chinese", + "croatian", + "czech", + "danish", + "dutch", + "english", + "finnish", + "french", + "german", + "indonesian", + "italian", + "malay", + "norwegian", + "polish", + "portuguese", + "serbian", + "slovak", + "slovenian", + "spanish", + "swedish", + "turkish", + "welsh", + "afr", + "amh", + "ara", + "asm", + "aze", + "bel", + "ben", + "bod", + "bos", + "bre", + "bul", + "cat", + "ceb", + "ces", + "chr", + "cos", + "cym", + "dan", + "deu", + "div", + "dzo", + "ell", + "eng", + "enm", + "epo", + "equ", + "est", + "eus", + "fao", + "fas", + "fil", + "fin", + "fra", + "frk", + "frm", + "fry", + "gla", + "gle", + "glg", + "grc", + "guj", + "hat", + "heb", + "hin", + "hrv", + "hun", + "hye", + "iku", + "ind", + "isl", + "ita", + "jav", + "jpn", + "kan", + "kat", + "kaz", + "khm", + "kir", + "kmr", + "kor", + "kur", + "lao", + "lat", + "lav", + "lit", + "ltz", + "mal", + "mar", + "mkd", + "mlt", + "mon", + "mri", + "msa", + "mya", + "nep", + "nld", + "nor", + "oci", + "ori", + "osd", + "pan", + "pol", + 
"por", + "pus", + "que", + "ron", + "rus", + "san", + "sin", + "slk", + "slv", + "snd", + "sp1", + "spa", + "sqi", + "srp", + "sun", + "swa", + "swe", + "syr", + "tam", + "tat", + "tel", + "tgk", + "tgl", + "tha", + "tir", + "ton", + "tur", + "uig", + "ukr", + "urd", + "uzb", + "vie", + "yid", + "yor", +] + + +class WatermarkDimension(TypedDict): + value: float + unit: Literal["pt", "%"] + + +PageIndex = int + + +AnnotationBbox = list[float] + + +AnnotationOpacity = float + + +PdfObjectId = int + + +AnnotationCustomData = Optional[dict[str, Any]] + + +Rect = list[float] + + +AnnotationRotation = Literal[0, 90, 180, 270] + + +AnnotationNote = str + + +BlendMode = Literal[ + "normal", + "multiply", + "screen", + "overlay", + "darken", + "lighten", + "colorDodge", + "colorBurn", + "hardLight", + "softLight", + "difference", + "exclusion", +] + + +IsCommentThreadRoot = bool + + +CloudyBorderIntensity = float + + +CloudyBorderInset = list[float] + + +FillColor = str + + +MeasurementScale = TypedDict( + "MeasurementScale", + { + "unitFrom": NotRequired[Literal["in", "mm", "cm", "pt"]], + "unitTo": NotRequired[ + Literal["in", "mm", "cm", "pt", "ft", "m", "yd", "km", "mi"] + ], + "from": NotRequired[float], + "to": NotRequired[float], + }, +) + + +MeasurementPrecision = Literal["whole", "oneDp", "twoDp", "threeDp", "fourDp"] + + +FontSizeInt = int + + +FontStyle = list[Literal["bold", "italic"]] + + +FontColor = str + + +Font = str + + +HorizontalAlign = Literal["left", "center", "right"] + + +VerticalAlign = Literal["top", "center", "bottom"] + + +Point = list[float] + + +LineCap = Literal[ + "square", + "circle", + "diamond", + "openArrow", + "closedArrow", + "butt", + "reverseOpenArrow", + "reverseClosedArrow", + "slash", +] + + +BorderStyle = Literal["solid", "dashed", "beveled", "inset", "underline"] + + +class LineCaps(TypedDict): + start: NotRequired[LineCap] + end: NotRequired[LineCap] + + +AnnotationPlainText = str + +BackgroundColor = str + +FontSizeAuto = 
Literal["auto"] + + +Intensity = float + + +class Lines(TypedDict): + intensities: NotRequired[list[list[Intensity]]] + points: NotRequired[list[list[Point]]] diff --git a/src/nutrient_dws/types/redact_data.py b/src/nutrient_dws/types/redact_data.py new file mode 100644 index 0000000..2496883 --- /dev/null +++ b/src/nutrient_dws/types/redact_data.py @@ -0,0 +1,29 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + +from nutrient_dws.types.misc import Pages + + +class RemoteFile(TypedDict): + url: str + + +class Document(TypedDict): + file: NotRequired[str | RemoteFile] + pages: NotRequired[list[int] | Pages] + + +class Confidence(TypedDict): + threshold: float + + +class RedactOptions(TypedDict): + confidence: NotRequired[Confidence] + + +class RedactData(TypedDict): + documents: list[Document] + criteria: str + redaction_state: NotRequired[Literal["stage", "apply"]] + options: NotRequired[RedactOptions] diff --git a/src/nutrient_dws/types/sign_request.py b/src/nutrient_dws/types/sign_request.py new file mode 100644 index 0000000..1d16181 --- /dev/null +++ b/src/nutrient_dws/types/sign_request.py @@ -0,0 +1,27 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + + +class Appearance(TypedDict): + mode: NotRequired[ + Literal["signatureOnly", "signatureAndDescription", "descriptionOnly"] + ] + contentType: NotRequired[str] + showWatermark: NotRequired[bool] + showSignDate: NotRequired[bool] + showDateTimezone: NotRequired[bool] + + +class Position(TypedDict): + pageIndex: int + rect: list[float] + + +class CreateDigitalSignature(TypedDict): + signatureType: Literal["cms", "cades"] + flatten: NotRequired[bool] + formFieldName: NotRequired[str] + appearance: NotRequired[Appearance] + position: NotRequired[Position] + cadesLevel: NotRequired[Literal["b-lt", "b-t", "b-b"]] diff --git a/src/nutrient_dws/utils/__init__.py b/src/nutrient_dws/utils/__init__.py new file mode 100644 index 0000000..d51fdc5 
--- /dev/null +++ b/src/nutrient_dws/utils/__init__.py @@ -0,0 +1,3 @@ +from .version import get_library_version, get_user_agent + +__all__ = ["get_library_version", "get_user_agent"] diff --git a/src/nutrient_dws/utils/version.py b/src/nutrient_dws/utils/version.py new file mode 100644 index 0000000..5877cbb --- /dev/null +++ b/src/nutrient_dws/utils/version.py @@ -0,0 +1,21 @@ +import os +from importlib.metadata import version as pkg_version + + +def get_library_version() -> str: + """Gets the current version of the Nutrient DWS Python Client library. + + Strategy: Try importlib.metadata.version("nutrient-dws"); on any failure, return "0.0.0-dev". + """ + if os.getenv("PYTHON_ENV") == "development": + return "0.0.0-dev" + try: + return pkg_version("nutrient-dws") + except Exception: + return "0.0.0-dev" + + +def get_user_agent() -> str: + """Creates a User-Agent string for HTTP requests.""" + package_version = get_library_version() + return f"nutrient-dws/{package_version}" diff --git a/src/nutrient_dws/workflow.py b/src/nutrient_dws/workflow.py new file mode 100644 index 0000000..9e430d6 --- /dev/null +++ b/src/nutrient_dws/workflow.py @@ -0,0 +1,42 @@ +"""Factory function to create a new workflow builder with staged interface.""" + +from collections.abc import Callable + +from nutrient_dws.builder.builder import StagedWorkflowBuilder +from nutrient_dws.builder.staged_builders import WorkflowInitialStage +from nutrient_dws.http import NutrientClientOptions + + +def workflow( + api_key: str | Callable[[], str], + base_url: str | None = None, + timeout: int | None = None, +) -> WorkflowInitialStage: + r"""Factory function to create a new workflow builder with staged interface. 
+ + Args: + api_key: API key or API key getter + base_url: DWS Base url + timeout: DWS request timeout + + Returns: + A new staged workflow builder instance + + Example: + ```python + from nutrient_dws import workflow + + # Create a workflow + result = await workflow({ + 'apiKey': 'your-api-key' + }) \\ + .add_file_part('document.pdf') \\ + .apply_action(BuildActions.ocr('english')) \\ + .output_pdf() \\ + .execute() + ``` + """ + client_options = NutrientClientOptions( + apiKey=api_key, baseUrl=base_url, timeout=timeout + ) + return StagedWorkflowBuilder(client_options) diff --git a/src/scripts/__init__.py b/src/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scripts/add_claude_code_rule.py b/src/scripts/add_claude_code_rule.py new file mode 100644 index 0000000..830322f --- /dev/null +++ b/src/scripts/add_claude_code_rule.py @@ -0,0 +1,28 @@ +import os +import sys + + +def main() -> None: + script_dir = os.path.dirname(os.path.abspath(__file__)) + doc_path = os.path.join(script_dir, "..", "..", "LLM_DOC.md") + + user_project_root = os.getcwd() + output_file = os.path.join(user_project_root, "CLAUDE.md") + + relative_doc_path = os.path.relpath(doc_path, user_project_root) + + try: + rule = f""" +# Nutrient DWS Python Client Usage +- Use the `nutrient-dws` package for operations with document processing operations including conversion, merging, compression, watermarking, signage, and text extraction. +- Package Documentation and Examples can be found at: {relative_doc_path} +""" + with open(output_file, "a", encoding="utf-8") as f: + f.write(rule) + + print( + f"📄 Updated Claude Code Rules to point to Nutrient DWS documentation at {relative_doc_path}." 
+ ) + except Exception as err: + print(f"Failed to update CLAUDE.md file: {err}", file=sys.stderr) + sys.exit(1) diff --git a/src/scripts/add_cursor_rule.py b/src/scripts/add_cursor_rule.py new file mode 100644 index 0000000..3e0001b --- /dev/null +++ b/src/scripts/add_cursor_rule.py @@ -0,0 +1,35 @@ +import os +import sys + + +def main() -> None: + script_dir = os.path.dirname(os.path.abspath(__file__)) + doc_path = os.path.join(script_dir, "..", "..", "LLM_DOC.md") + with open(doc_path, encoding="utf-8") as file: + documentation = file.read() + + user_project_root = os.getcwd() + output_directory = os.path.join(user_project_root, ".cursor/rules") + output_file = os.path.join(output_directory, "nutrient-dws-doc.mdc") + + try: + rule = f""" +--- +description: This rule explains how to use the Nutrient DWS Python Client (`nutrient-dws`) for operations with document processing operations including conversion, merging, compression, watermarking, signage, and text extraction. +globs: +alwaysApply: false +--- + +{documentation} +""" + if not os.path.exists(output_directory): + os.makedirs(output_directory) + with open(output_file, "a", encoding="utf-8") as f: + f.write(rule) + + print( + f"📄 Updated Cursor Rules to point to Nutrient DWS documentation at {output_file}." 
+ ) + except Exception as err: + print(f"Failed to update Cursor Rule: {err}", file=sys.stderr) + sys.exit(1) diff --git a/src/scripts/add_github_copilot_rule.py b/src/scripts/add_github_copilot_rule.py new file mode 100644 index 0000000..c5060a1 --- /dev/null +++ b/src/scripts/add_github_copilot_rule.py @@ -0,0 +1,34 @@ +import os +import sys + + +def main() -> None: + script_dir = os.path.dirname(os.path.abspath(__file__)) + doc_path = os.path.join(script_dir, "..", "..", "LLM_DOC.md") + + user_project_root = os.getcwd() + output_directory = os.path.join(user_project_root, ".github") + output_file = os.path.join(output_directory, "copilot-instructions.md") + + relative_doc_path = os.path.relpath(doc_path, user_project_root) + + try: + rule = f""" +# Nutrient DWS Python Client Usage +- Use the `nutrient-dws` package for operations with document processing operations including conversion, merging, compression, watermarking, signage, and text extraction. +- Package Documentation and Examples can be found at: {relative_doc_path} +""" + if not os.path.exists(output_directory): + os.makedirs(output_directory) + with open(output_file, "a", encoding="utf-8") as f: + f.write(rule) + + print( + f"📄 Updated GitHub Copilot Rules to point to Nutrient DWS documentation at {relative_doc_path}." 
+ ) + except Exception as err: + print( + f"Failed to update .github/copilot-instructions.md file: {err}", + file=sys.stderr, + ) + sys.exit(1) diff --git a/src/scripts/add_junie_rule.py b/src/scripts/add_junie_rule.py new file mode 100644 index 0000000..cfeaa8a --- /dev/null +++ b/src/scripts/add_junie_rule.py @@ -0,0 +1,31 @@ +import os +import sys + + +def main() -> None: + script_dir = os.path.dirname(os.path.abspath(__file__)) + doc_path = os.path.join(script_dir, "..", "..", "LLM_DOC.md") + + user_project_root = os.getcwd() + output_directory = os.path.join(user_project_root, ".junie") + output_file = os.path.join(output_directory, "guidelines.md") + + relative_doc_path = os.path.relpath(doc_path, user_project_root) + + try: + rule = f""" +# Nutrient DWS Python Client Usage +- Use the `nutrient-dws` package for operations with document processing operations including conversion, merging, compression, watermarking, signage, and text extraction. +- Package Documentation and Examples can be found at: {relative_doc_path} +""" + if not os.path.exists(output_directory): + os.makedirs(output_directory) + with open(output_file, "a", encoding="utf-8") as f: + f.write(rule) + + print( + f"📄 Updated Junie Code Rules to point to Nutrient DWS documentation at {relative_doc_path}." 
+ ) + except Exception as err: + print(f"Failed to update .junie/guidelines.md file: {err}", file=sys.stderr) + sys.exit(1) diff --git a/src/scripts/add_windsurf_rule.py b/src/scripts/add_windsurf_rule.py new file mode 100644 index 0000000..5cd4554 --- /dev/null +++ b/src/scripts/add_windsurf_rule.py @@ -0,0 +1,34 @@ +import os +import sys + + +def main() -> None: + script_dir = os.path.dirname(os.path.abspath(__file__)) + doc_path = os.path.join(script_dir, "..", "..", "LLM_DOC.md") + with open(doc_path, encoding="utf-8") as file: + documentation = file.read() + + user_project_root = os.getcwd() + output_directory = os.path.join(user_project_root, ".windsurf/rules") + output_file = os.path.join(output_directory, "nutrient-dws-doc.mdc") + + try: + rule = f""" +--- +description: This rule explains how to use the Nutrient DWS Python Client (`nutrient-dws`) for operations with document processing operations including conversion, merging, compression, watermarking, signage, and text extraction. +trigger: model_decision +--- + +{documentation} +""" + if not os.path.exists(output_directory): + os.makedirs(output_directory) + with open(output_file, "a", encoding="utf-8") as f: + f.write(rule) + + print( + f"📄 Updated Windsurf Rules to point to Nutrient DWS documentation at {output_file}." 
+ ) + except Exception as err: + print(f"Failed to update Windsurf Rule: {err}", file=sys.stderr) + sys.exit(1) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..9c8f10f --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,72 @@ +from unittest.mock import AsyncMock + +import pytest +from nutrient_dws import NutrientClient +from tests.helpers import TestDocumentGenerator + +@pytest.fixture +def mock_workflow_instance(): + """Create a mock workflow instance for testing.""" + mock_output_stage = AsyncMock() + mock_output_stage.execute.return_value = { + "success": True, + "output": { + "buffer": b"test-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + mock_output_stage.dry_run.return_value = {"success": True} + + mock_workflow = AsyncMock() + mock_workflow.add_file_part.return_value = mock_workflow + mock_workflow.add_html_part.return_value = mock_workflow + mock_workflow.add_new_page.return_value = mock_workflow + mock_workflow.add_document_part.return_value = mock_workflow + mock_workflow.apply_actions.return_value = mock_workflow + mock_workflow.apply_action.return_value = mock_workflow + mock_workflow.output_pdf.return_value = mock_output_stage + mock_workflow.output_pdfa.return_value = mock_output_stage + mock_workflow.output_pdfua.return_value = mock_output_stage + mock_workflow.output_image.return_value = mock_output_stage + mock_workflow.output_office.return_value = mock_output_stage + mock_workflow.output_html.return_value = mock_output_stage + mock_workflow.output_markdown.return_value = mock_output_stage + mock_workflow.output_json.return_value = mock_output_stage + + return mock_workflow + + +@pytest.fixture +def valid_client_options(): + """Valid client options for testing.""" + return {"apiKey": "test-api-key", "baseUrl": "https://api.test.com/v1", "timeout": None} + +@pytest.fixture +def unit_client(): + return NutrientClient(api_key="test-api-key", base_url="https://api.test.com/v1") + 
+@pytest.fixture +def test_table_pdf(): + """Generate PDF with table for annotation tests.""" + return TestDocumentGenerator.generate_pdf_with_table() + +@pytest.fixture +def test_xfdf_content(): + """Generate XFDF content for testing.""" + return TestDocumentGenerator.generate_xfdf_content() + +@pytest.fixture +def test_instant_json_content(): + """Generate Instant JSON content for testing.""" + return TestDocumentGenerator.generate_instant_json_content() + +@pytest.fixture +def test_sensitive_pdf(): + """Generate PDF with sensitive data for redaction testing.""" + return TestDocumentGenerator.generate_pdf_with_sensitive_data() + +@pytest.fixture +def test_html_content(): + """Generate HTML content for testing.""" + return TestDocumentGenerator.generate_html_content() diff --git a/tests/data/sample.png b/tests/data/sample.png new file mode 100644 index 0000000..32ee307 Binary files /dev/null and b/tests/data/sample.png differ diff --git a/tests/data/sample_multipage.pdf b/tests/data/sample_multipage.pdf deleted file mode 100644 index 95556b3..0000000 Binary files a/tests/data/sample_multipage.pdf and /dev/null differ diff --git a/tests/helpers.py b/tests/helpers.py new file mode 100644 index 0000000..7661924 --- /dev/null +++ b/tests/helpers.py @@ -0,0 +1,423 @@ +"""Test utilities and helpers for Nutrient DWS Python Client tests.""" + +from datetime import datetime, timezone +import json +from typing import Any, Optional, TypedDict, Literal, List +from pathlib import Path + + +class XfdfAnnotation(TypedDict): + type: Literal["highlight", "text", "square", "circle"] + page: int + rect: List[int] + content: Optional[str] + color: Optional[str] + + +class TestDocumentGenerator: + """Generate test documents and content for testing purposes.""" + + @staticmethod + def generate_simple_pdf_content(content: str = "Test PDF Document") -> bytes: + """Generate a simple PDF-like content for testing. 
+ + Note: This is not a real PDF, just bytes that can be used for testing file handling. + """ + pdf = f"""%PDF-1.4 +1 0 obj<>endobj +2 0 obj<>endobj +3 0 obj<>>>>>>/Contents 4 0 R>>endobj +4 0 obj<>stream +BT /F1 12 Tf 100 700 Td ({content}) Tj ET +endstream +endobj +xref +0 5 +0000000000 65535 f +0000000009 00000 n +0000000058 00000 n +0000000115 00000 n +0000000262 00000 n +trailer<> +startxref +356 +%%EOF""" + return pdf.encode("utf-8") + + @staticmethod + def generate_pdf_with_sensitive_data() -> bytes: + """Generate PDF-like content with sensitive data patterns for redaction testing.""" + content = f"""Personal Information: +Name: John Doe +SSN: 123-45-6789 +Email: john.doe@example.com +Phone: (555) 123-4567 +Credit Card: 4111-1111-1111-1111 +Medical Record: MR-2024-12345 +License: DL-ABC-123456""" + return TestDocumentGenerator.generate_simple_pdf_content(content) + + @staticmethod + def generate_pdf_with_table() -> bytes: + """Generate PDF-like content with table data patterns""" + content = f"""Sales Report 2024 +Product | Q1 | Q2 | Q3 | Q4 +Widget A | 100 | 120 | 140 | 160 +Widget B | 80 | 90 | 100 | 110 +Widget C | 60 | 70 | 80 | 90""" + return TestDocumentGenerator.generate_simple_pdf_content(content) + + @staticmethod + def generate_html_content( + title: str = "Test Document", + include_styles: bool = True, + include_table: bool = False, + include_images: bool = False, + include_form: bool = False, + ) -> bytes: + """Generate HTML content for testing.""" + + styles = ( + """""" + if include_styles + else "" + ) + tables = ( + """
Product | +Price | +Quantity | +Total | +
---|---|---|---|
Widget A | +$10.00 | +5 | +$50.00 | +
Widget B | +$15.00 | +3 | +$45.00 | +
Widget C | +$20.00 | +2 | +$40.00 | +
Below is a placeholder for image content:
+This is a test document with highlighted text for PDF conversion testing.
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+ {tables}{images}{form} + +""" + return html.encode("utf-8") + + @staticmethod + def generate_xfdf_content( + annotations: Optional[list[XfdfAnnotation]] = None, + ) -> bytes: + """Generate XFDF annotation content.""" + + if annotations is None: + annotations = [ + { + "type": "highlight", + "page": 0, + "rect": [100, 100, 200, 150], + "color": "#FFFF00", + "content": "Important text", + }, + ] + + inner_xfdf = "" + + for annot in annotations: + rectStr = ",".join([str(x) for x in annot["rect"]]) + color = annot["color"] or "#FFFF00" + if annot["type"] == "highlight": + inner_xfdf = f"""