diff --git a/.claude/project.json b/.claude/project.json deleted file mode 100644 index c906f02..0000000 --- a/.claude/project.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "name": "nutrient-dws-client-python", - "description": "Python client for Nutrient DWS", - "workspace": "/Users/admin/Projects/nutrient-dws-client-python", - "mcpServers": { - "claude-code-mcp": { - "enabled": true, - "workspace": "/Users/admin/Projects/nutrient-dws-client-python" - }, - "claude-code-github": { - "enabled": true, - "repository": "nutrient-dws-client-python" - } - } -} \ No newline at end of file diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..f7e1872 --- /dev/null +++ b/.env.example @@ -0,0 +1,7 @@ +# Nutrient DWS Processor API Configuration for Testing +NUTRIENT_API_KEY=your_api_key_here +NUTRIENT_BASE_URL=https://api.nutrient.io + +# Development Settings +DEBUG=true +PYTHON_ENV=development diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1eee5eb..d6e2225 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,81 +1,44 @@ name: CI -# Integration Test Strategy: -# - Fork PRs: Cannot access secrets, so integration tests are skipped with informative feedback -# - Same-repo PRs: Have access to secrets, integration tests run normally -# - Push to main/develop: Integration tests always run to catch any issues after merge -# - Manual trigger: Allows maintainers to run integration tests on demand -# -# This ensures security while still validating integration tests before release - on: push: - branches: [ main, develop ] + branches: [ main ] pull_request: - branches: [ main, develop ] - # Run integration tests after PR is merged - workflow_dispatch: # Allow manual trigger for integration tests + branches: [ main ] jobs: - test: + lint-and-type-check: runs-on: ubuntu-latest - strategy: - matrix: - python-version: ['3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} 
+ - name: Set up Python uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} - - - name: Cache pip dependencies - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }} - restore-keys: | - ${{ runner.os }}-pip- + python-version: '3.12' + cache: 'pip' - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" + pip install ".[dev]" - - name: Run linting with ruff - if: matrix.python-version == '3.10' + - name: Run linting run: | - python -m ruff check . - python -m ruff format --check . - - - name: Run type checking with mypy - run: python -m mypy --python-version=${{ matrix.python-version }} src tests + python -m ruff check src/ - - name: Run unit tests with pytest - run: python -m pytest tests/unit/ -v --cov=nutrient_dws --cov-report=xml --cov-report=term + - name: Run type checking + run: python -m mypy src/ - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - files: ./coverage.xml - flags: unittests - name: codecov-umbrella - fail_ci_if_error: false + unit-tests: + runs-on: ${{ matrix.os }} + needs: lint-and-type-check - integration-test: - runs-on: ubuntu-latest - # Run on: pushes to main/develop, PRs from same repo, and manual triggers - if: | - github.event_name == 'push' || - github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) strategy: matrix: python-version: ['3.10', '3.11', '3.12'] - + os: [ubuntu-latest, windows-latest, macos-latest] + steps: - uses: actions/checkout@v4 @@ -83,108 +46,26 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - - name: Cache pip dependencies - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }} - restore-keys: | - ${{ runner.os 
}}-pip- + cache: 'pip' - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" + pip install ".[dev]" - - name: Check for API key availability - run: | - if [ -z "${{ secrets.NUTRIENT_DWS_API_KEY }}" ]; then - echo "::warning::NUTRIENT_DWS_API_KEY secret not found, skipping integration tests" - echo "skip_tests=true" >> $GITHUB_ENV - - # Provide context about why this might be happening - if [ "${{ github.event_name }}" == "pull_request" ]; then - if [ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]; then - echo "::notice::This appears to be a PR from a fork. Secrets are not available for security reasons." - else - echo "::error::This is a PR from the same repository but the API key is missing. Please check repository secrets configuration." - fi - else - echo "::error::Running on ${{ github.event_name }} event but API key is missing. Please configure NUTRIENT_DWS_API_KEY secret." - fi - else - echo "::notice::API key found, integration tests will run" - echo "skip_tests=false" >> $GITHUB_ENV - fi - - - name: Create integration config with API key - if: env.skip_tests != 'true' - run: | - python -c " - import os - with open('tests/integration/integration_config.py', 'w') as f: - f.write(f'API_KEY = \"{os.environ[\"NUTRIENT_DWS_API_KEY\"]}\"\n') - " - env: - NUTRIENT_DWS_API_KEY: ${{ secrets.NUTRIENT_DWS_API_KEY }} - - - name: Run integration tests - if: env.skip_tests != 'true' - run: python -m pytest tests/integration/ -v - - - name: Cleanup integration config - if: always() - run: rm -f tests/integration/integration_config.py - - # Provide feedback for fork PRs where integration tests can't run - integration-test-fork-feedback: - runs-on: ubuntu-latest - if: | - github.event_name == 'pull_request' && - github.event.pull_request.head.repo.full_name != github.repository - steps: - - name: Comment on PR about integration tests - uses: actions/github-script@v7 + - name: Run unit tests 
with coverage + run: python -m pytest tests/unit/ -v --cov=nutrient_dws --cov-report=xml --cov-report=term + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12' with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const issue_number = context.issue.number; - const owner = context.repo.owner; - const repo = context.repo.repo; - - // Check if we've already commented - const comments = await github.rest.issues.listComments({ - owner, - repo, - issue_number, - }); - - const botComment = comments.data.find(comment => - comment.user.type === 'Bot' && - comment.body.includes('Integration tests are skipped for pull requests from forks') - ); - - if (!botComment) { - await github.rest.issues.createComment({ - owner, - repo, - issue_number, - body: `## Integration Tests Status\n\n` + - `Integration tests are skipped for pull requests from forks due to security restrictions. ` + - `These tests will run automatically after the PR is merged.\n\n` + - `**What this means:**\n` + - `- Unit tests, linting, and type checking have passed ✅\n` + - `- Integration tests require API credentials that aren't available to fork PRs\n` + - `- A maintainer will review your changes and merge if appropriate\n` + - `- Integration tests will run on the main branch after merge\n\n` + - `Thank you for your contribution! 
🙏` - }); - } + files: ./coverage.xml + flags: unittests + name: codecov-umbrella build: runs-on: ubuntu-latest - needs: test + needs: [lint-and-type-check, unit-tests] steps: - uses: actions/checkout@v4 @@ -193,20 +74,14 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.12' + cache: 'pip' - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" + pip install ".[dev]" - name: Build package run: python -m build - - name: Check package with twine + - name: Verify build outputs run: twine check dist/* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - name: dist - path: dist/ diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 0000000..15fd475 --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,54 @@ +name: Integration Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + integration-tests: + runs-on: ubuntu-latest + needs: [] # Run in parallel with other workflows + + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + + - name: Install dependencies + run: | + pip install ".[dev]" + + - name: Check for API key + id: check-api-key + env: + NUTRIENT_API_KEY: ${{ secrets.NUTRIENT_API_KEY }} + run: | + if [ -n "$NUTRIENT_API_KEY" ] && [ "$NUTRIENT_API_KEY" != "fake_key" ] && [ ${#NUTRIENT_API_KEY} -gt 10 ]; then + echo "has_api_key=true" >> $GITHUB_OUTPUT + echo "✅ Valid API key detected" + else + echo "has_api_key=false" >> $GITHUB_OUTPUT + echo "⏭️ No valid API key - Integration tests will be skipped" + fi + + - name: Run integration tests + if: steps.check-api-key.outputs.has_api_key == 'true' + env: + NUTRIENT_API_KEY: ${{ secrets.NUTRIENT_API_KEY }} + run: 
python -m pytest tests/test_integration.py -v + + - name: Skip integration tests (no API key) + if: steps.check-api-key.outputs.has_api_key == 'false' + run: | + echo "✅ Integration tests skipped - no valid API key available" + echo "This is expected for forks and external PRs" diff --git a/.github/workflows/publish-existing-tag.yml b/.github/workflows/publish-existing-tag.yml deleted file mode 100644 index 44babb1..0000000 --- a/.github/workflows/publish-existing-tag.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Publish Existing Tag to PyPI - -on: - workflow_dispatch: - inputs: - tag: - description: 'Tag to publish (e.g., v1.0.2)' - required: true - default: 'v1.0.2' - -jobs: - build-and-publish: - name: Build and Publish to PyPI - runs-on: ubuntu-latest - - # IMPORTANT: This permission is required for trusted publishing - permissions: - id-token: write - - steps: - - name: Checkout specific tag - uses: actions/checkout@v4 - with: - ref: ${{ github.event.inputs.tag }} - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install build dependencies - run: | - python -m pip install --upgrade pip - python -m pip install build - - - name: Build distribution - run: python -m build - - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - # No need for username/password with trusted publishing! 
\ No newline at end of file diff --git a/.github/workflows/publish-manual.yml b/.github/workflows/publish.yml similarity index 87% rename from .github/workflows/publish-manual.yml rename to .github/workflows/publish.yml index fc0a83d..db74c79 100644 --- a/.github/workflows/publish-manual.yml +++ b/.github/workflows/publish.yml @@ -7,27 +7,27 @@ jobs: publish: name: Publish to PyPI runs-on: ubuntu-latest - + permissions: id-token: write contents: read - + steps: # Use current branch/tag - uses: actions/checkout@v4 - + - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.10' - + - name: Install build dependencies run: | python -m pip install --upgrade pip python -m pip install build - + - name: Build distribution run: python -m build - + - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index 99cd5f6..0000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: Release - -on: - release: - types: [published] # Changed from 'created' to 'published' for better control - # Allow manual trigger - workflow_dispatch: - -jobs: - deploy: - runs-on: ubuntu-latest - - # IMPORTANT: Required for trusted publishing - permissions: - id-token: write - contents: read - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install build dependencies - run: | - python -m pip install --upgrade pip - python -m pip install build - - - name: Build package - run: python -m build - - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - # No API token needed with trusted publishing! 
\ No newline at end of file diff --git a/.github/workflows/scheduled-integration-tests.yml b/.github/workflows/scheduled-integration-tests.yml new file mode 100644 index 0000000..cd6e679 --- /dev/null +++ b/.github/workflows/scheduled-integration-tests.yml @@ -0,0 +1,154 @@ +name: Scheduled Integration Tests + +on: + schedule: + # Run every day at 2 AM UTC + - cron: '0 2 * * *' + workflow_dispatch: # Allow manual triggering + +jobs: + scheduled-integration-tests: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install dependencies + run: | + pip install ".[dev]" + + - name: Run all integration tests + if: secrets.NUTRIENT_API_KEY != '' + env: + NUTRIENT_API_KEY: ${{ secrets.NUTRIENT_API_KEY }} + run: | + echo "Running scheduled integration tests to detect API changes..." + python -m pytest tests/test_integration.py -v --tb=short + timeout-minutes: 20 + continue-on-error: true + id: test-run + + - name: Skip scheduled tests (no API key) + if: secrets.NUTRIENT_API_KEY == '' + run: | + echo "⏭️ Skipping scheduled integration tests - NUTRIENT_API_KEY not available" + echo "Configure NUTRIENT_API_KEY secret to enable scheduled API validation" + + - name: Generate detailed test report + if: always() + run: | + python -m pytest tests/test_integration.py -v --tb=short --junit-xml=scheduled-test-results.xml || true + + # Create summary + echo "## Integration Test Summary" > test-summary.md + echo "Date: $(date)" >> test-summary.md + echo "Status: ${{ steps.test-run.outcome }}" >> test-summary.md + + # Extract test counts if possible + if [ -f scheduled-test-results.xml ]; then + echo "### Test Results" >> test-summary.md + python -c " + import xml.etree.ElementTree as ET + import os + if os.path.exists('scheduled-test-results.xml'): + tree = ET.parse('scheduled-test-results.xml') + root = tree.getroot() + tests = root.get('tests', '0') + 
failures = root.get('failures', '0') + errors = root.get('errors', '0') + skipped = root.get('skipped', '0') + passed = str(int(tests) - int(failures) - int(errors) - int(skipped)) + print(f'- Total Tests: {tests}') + print(f'- Passed: {passed}') + print(f'- Failed: {failures}') + print(f'- Errors: {errors}') + print(f'- Skipped: {skipped}') + " >> test-summary.md + fi + + - name: Create issue if tests fail + if: failure() && steps.test-run.outcome == 'failure' + uses: actions/github-script@v7 + with: + script: | + const date = new Date().toISOString().split('T')[0]; + const title = `🚨 Integration Tests Failed - ${date}`; + + // Check if issue already exists + const issues = await github.rest.issues.listForRepo({ + owner: context.repo.owner, + repo: context.repo.repo, + labels: ['integration-failure', 'automated'], + state: 'open' + }); + + const existingIssue = issues.data.find(issue => issue.title.includes(date)); + + if (!existingIssue) { + await github.rest.issues.create({ + owner: context.repo.owner, + repo: context.repo.repo, + title: title, + body: `## Scheduled Integration tests failed + + The scheduled integration test run has detected failures. This could indicate: + - API changes that need to be addressed + - Service degradation + - Test flakiness + + ### Action Required + 1. Check the [workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for details + 2. Investigate any API changes + 3. Update tests if needed + 4. 
Close this issue once resolved + + ### Test Summary + See the workflow artifacts for detailed test results.`, + labels: ['integration-failure', 'automated', 'high-priority'] + }); + } + + - name: Upload test artifacts + uses: actions/upload-artifact@v4 + if: always() + with: + name: scheduled-integration-results-${{ github.run_number }} + path: | + scheduled-test-results.xml + test-summary.md + retention-days: 30 + + - name: Notify on success after previous failure + if: success() && steps.test-run.outcome == 'success' + uses: actions/github-script@v7 + with: + script: | + // Close any open integration failure issues + const issues = await github.rest.issues.listForRepo({ + owner: context.repo.owner, + repo: context.repo.repo, + labels: ['integration-failure', 'automated'], + state: 'open' + }); + + for (const issue of issues.data) { + await github.rest.issues.update({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issue.number, + state: 'closed' + }); + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issue.number, + body: '✅ **Resolved**: Integration tests are now passing.' + }); + } diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml new file mode 100644 index 0000000..a47e70d --- /dev/null +++ b/.github/workflows/security.yml @@ -0,0 +1,128 @@ +name: Security Checks + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + schedule: + - cron: '0 0 * * 0' # Weekly on Sunday + +jobs: + secret-scanning: + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Run Gitleaks + uses: gitleaks/gitleaks-action@v2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITLEAKS_LICENSE: ${{ secrets.GITLEAKS_LICENSE }} + continue-on-error: true + + - name: Check for hardcoded secrets + run: | + echo "🔍 Scanning for hardcoded secrets..." 
+ + # Check for potential API keys + if grep -r "pdf_live_" --include="*.py" --include="*.json" --exclude-dir=.venv --exclude-dir=__pycache__ --exclude-dir=.pytest_cache . 2>/dev/null; then + echo "❌ Found hardcoded API keys!" + exit 1 + fi + + # Check for base64 encoded secrets (common Nutrient patterns) + if grep -r "cGRmX2xpdmVf" --include="*.py" --include="*.json" --exclude-dir=.venv --exclude-dir=__pycache__ --exclude-dir=.pytest_cache . 2>/dev/null; then + echo "❌ Found base64 encoded API keys!" + exit 1 + fi + + # Check for other common secret patterns + if grep -rE "(sk_|pk_|nutr_sk_)" --include="*.py" --include="*.json" --exclude-dir=.venv --exclude-dir=__pycache__ --exclude-dir=.pytest_cache . 2>/dev/null; then + echo "❌ Found potential secret keys!" + exit 1 + fi + + echo "✅ No hardcoded secrets found" + + dependency-check: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install dependencies + run: | + pip install ".[dev]" + pip install safety bandit + + - name: Run Safety check + run: | + echo "🔍 Running Safety security scan..." + safety check --json --output safety-report.json || echo "⚠️ Safety found issues but continuing..." + + # Display summary if report exists + if [ -f safety-report.json ]; then + echo "Safety report generated - check artifacts for details" + fi + continue-on-error: true + + - name: Run Bandit security linter + run: | + echo "🔍 Running Bandit security linter..." + bandit -r src/ -f json -o bandit-report.json || echo "⚠️ Bandit found issues but continuing..." 
+ + # Display summary + bandit -r src/ --severity-level medium || echo "⚠️ Medium+ severity issues found" + continue-on-error: true + + - name: Upload security scan results + uses: actions/upload-artifact@v4 + if: always() + with: + name: security-reports-${{ github.run_number }} + path: | + safety-report.json + bandit-report.json + retention-days: 30 + + - name: Run pip audit (if available) + run: | + echo "🔍 Running pip audit..." + pip install pip-audit || echo "pip-audit not available" + pip-audit --format=json --output=pip-audit-report.json || echo "⚠️ pip-audit found issues but continuing..." + continue-on-error: true + + code-quality: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install dependencies + run: | + pip install ".[dev]" + + - name: Run additional security checks with ruff + run: | + echo "🔍 Running security-focused linting..." + python -m ruff check . 
--select=S # Security rules + continue-on-error: true diff --git a/.gitignore b/.gitignore index d62bed8..0e553f9 100644 --- a/.gitignore +++ b/.gitignore @@ -154,5 +154,5 @@ openapi_spec.yml .pixi .claude/settings.local.json -# Integration test configuration -tests/integration/integration_config.py +# Example output +examples/output diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4464cc4..6e18f1e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,17 +12,19 @@ repos: - id: debug-statements - id: mixed-line-ending + - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.11 + rev: v0.12.8 hooks: - id: ruff args: [--fix] + files: ^src - id: ruff-format + files: ^src - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.8.0 + rev: v1.17.1 hooks: - id: mypy - additional_dependencies: [types-requests] - args: [--strict, --no-implicit-reexport] - files: ^src/ \ No newline at end of file + additional_dependencies: [types-aiofiles, httpx] + files: ^src diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 216ead5..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,136 +0,0 @@ -# Changelog - -All notable changes to the nutrient-dws Python client library will be documented in this file. - -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - -## [1.0.2] - 2025-01-03 - -### Added - -#### Direct API Methods -- `create_redactions_preset()` - Create redactions using predefined patterns (SSN, email, phone, etc.) 
-- `create_redactions_regex()` - Create redactions using custom regex patterns -- `create_redactions_text()` - Create redactions for specific text strings -- `optimize_pdf()` - Optimize PDF file size and performance -- `password_protect_pdf()` - Add password protection to PDFs -- `set_pdf_metadata()` - Update PDF metadata (title, author, subject, keywords) -- `split_pdf()` - Split PDFs into multiple files based on page ranges -- `duplicate_pdf_pages()` - Duplicate specific pages within a PDF -- `delete_pdf_pages()` - Remove specific pages from a PDF -- `add_page()` - Insert blank pages at specific positions -- `apply_instant_json()` - Apply PSPDFKit Instant JSON annotations -- `apply_xfdf()` - Apply XFDF annotations to PDFs -- `set_page_label()` - Set custom page labels (Roman numerals, letters, etc.) - -#### Enhancements -- Image file support for `watermark_pdf()` method - now accepts PNG/JPEG images as watermarks -- Improved CI/CD integration test strategy with better error reporting -- Enhanced test coverage for all new Direct API methods - -### Fixed -- Critical API compatibility issues in Direct API integration -- Python 3.9 and 3.10 syntax compatibility across the codebase -- Comprehensive CI failure resolution based on multi-model analysis -- Integration test fixes to match actual API behavior patterns -- Ruff linting and formatting issues throughout the project -- MyPy type checking errors and improved type annotations -- Removed unsupported parameters (stroke_width, base_url) from API calls -- Corrected API parameter formats for various operations -- Fixed page range handling in split_pdf with proper defaults -- Resolved runtime errors with isinstance union syntax -- Updated test fixtures to use valid PNG images - -### Changed -- Minimum Python version maintained at 3.10+ as per project design -- Improved error messages for better debugging experience -- Standardized code formatting with ruff across entire codebase - -## [1.0.1] - 2025-06-20 - -### Fixed - 
-#### Critical Bug Fixes -- Fix README.md documentation to use `NutrientTimeoutError` instead of `TimeoutError` -- Resolve inconsistency where code exported `NutrientTimeoutError` but docs referenced `TimeoutError` - -#### Testing Improvements -- Added comprehensive unit tests (31 tests total) -- Added integration test framework for CI -- Improved test stability and coverage - -## [1.0.0] - 2024-06-17 - -### Added - -#### Core Features -- **NutrientClient**: Main client class with support for both Direct API and Builder API patterns -- **Direct API Methods**: Convenient methods for single operations: - - `convert_to_pdf()` - Convert Office documents to PDF (uses implicit conversion) - - `flatten_annotations()` - Flatten PDF annotations and form fields - - `rotate_pages()` - Rotate specific or all pages - - `ocr_pdf()` - Apply OCR to make PDFs searchable - - `watermark_pdf()` - Add text or image watermarks - - `apply_redactions()` - Apply existing redaction annotations - - `merge_pdfs()` - Merge multiple PDFs and Office documents - -- **Builder API**: Fluent interface for chaining multiple operations: - ```python - client.build(input_file="document.docx") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .add_step("ocr-pdf", {"language": "english"}) \ - .execute(output_path="processed.pdf") - ``` - -#### Infrastructure -- **HTTP Client**: - - Connection pooling for performance - - Automatic retry logic with exponential backoff - - Bearer token authentication - - Comprehensive error handling - -- **File Handling**: - - Support for multiple input types (paths, Path objects, bytes, file-like objects) - - Automatic streaming for large files (>10MB) - - Memory-efficient processing - -- **Exception Hierarchy**: - - `NutrientError` - Base exception - - `AuthenticationError` - API key issues - - `APIError` - General API errors with status codes - - `ValidationError` - Request validation failures - - `TimeoutError` - Request timeouts - - `FileProcessingError` - File 
operation failures - -#### Development Tools -- **Testing**: 82 unit tests with 92.46% code coverage -- **Type Safety**: Full mypy type checking support -- **Linting**: Configured with ruff -- **Pre-commit Hooks**: Automated code quality checks -- **CI/CD**: GitHub Actions for testing, linting, and releases -- **Documentation**: Comprehensive README with examples - -### Changed -- Package name updated from `nutrient` to `nutrient-dws` for PyPI -- Source directory renamed from `src/nutrient` to `src/nutrient_dws` -- API endpoint updated to https://api.pspdfkit.com -- Authentication changed from X-Api-Key header to Bearer token - -### Discovered -- **Implicit Document Conversion**: The API automatically converts Office documents (DOCX, XLSX, PPTX) to PDF when processing, eliminating the need for explicit conversion steps - -### Fixed -- Watermark operation now correctly requires width/height parameters -- OCR language codes properly mapped (e.g., "en" → "english") -- All API operations updated to use the Build API endpoint -- Type annotations corrected throughout the codebase - -### Security -- API keys are never logged or exposed -- Support for environment variable configuration -- Secure handling of authentication tokens - -[1.0.2]: https://github.com/PSPDFKit/nutrient-dws-client-python/releases/tag/v1.0.2 -[1.0.1]: https://github.com/PSPDFKit/nutrient-dws-client-python/releases/tag/v1.0.1 -[1.0.0]: https://github.com/PSPDFKit/nutrient-dws-client-python/releases/tag/v1.0.0 \ No newline at end of file diff --git a/DEVELOPMENT_ROADMAP.md b/DEVELOPMENT_ROADMAP.md deleted file mode 100644 index aef5356..0000000 --- a/DEVELOPMENT_ROADMAP.md +++ /dev/null @@ -1,100 +0,0 @@ -# Development Roadmap - Nutrient DWS Python Client - -## 📊 Issue Review & Recommendations - -After reviewing all open issues and analyzing the codebase, here are my recommendations for what to tackle next: - -### 🥇 **Top Priority: Quick Wins (1-2 days each)** - -#### 1. 
**Issue #11: Image Watermark Support** ⭐⭐⭐⭐⭐ -- **Why**: 80% already implemented! Just needs file upload support -- **Current**: Supports `image_url` parameter -- **Add**: `image_file` parameter for local image files -- **Effort**: Very Low - mostly parameter handling -- **Value**: High - common user request - -#### 2. **Issue #10: Multi-Language OCR Support** ⭐⭐⭐⭐ -- **Why**: Small change with big impact -- **Current**: Single language string -- **Add**: Accept `List[str]` for multiple languages -- **Effort**: Low - update parameter handling and validation -- **Value**: High - enables multi-lingual document processing - -### 🥈 **Second Priority: Core Features (3-5 days each)** - -#### 3. **Issue #13: Create Redactions Method** ⭐⭐⭐⭐ -- **Why**: Complements existing `apply_redactions()` -- **Value**: Complete redaction workflow -- **Complexity**: Medium - new API patterns for search strategies -- **Use cases**: Compliance, privacy, legal docs - -#### 4. **Issue #12: Selective Annotation Flattening** ⭐⭐⭐ -- **Why**: Enhancement to existing `flatten_annotations()` -- **Add**: `annotation_ids` parameter -- **Effort**: Low-Medium -- **Value**: More control over flattening - -### 🥉 **Third Priority: High-Value Features (1 week each)** - -#### 5. **Issue #16: Convert to PDF/A** ⭐⭐⭐⭐ -- **Why**: Critical for archival/compliance -- **Value**: Legal requirement for many organizations -- **Complexity**: Medium - new output format handling - -#### 6. 
**Issue #17: Convert PDF to Images** ⭐⭐⭐⭐ -- **Why**: Very common use case -- **Value**: Thumbnails, previews, web display -- **Complexity**: Medium - handle multiple output files - -### 📋 **Issues to Defer** - -- **Issue #20: AI-Powered Redaction** - Requires AI endpoint investigation -- **Issue #21: Digital Signatures** - Complex, needs certificate handling -- **Issue #22: Batch Processing** - Client-side enhancement, do after core features -- **Issue #19: Office Formats** - Lower priority, complex format handling - -### 🎯 **Recommended Implementation Order** - -**Sprint 1 (Week 1):** -1. Image Watermark Support (1 day) -2. Multi-Language OCR (1 day) -3. Selective Annotation Flattening (2 days) - -**Sprint 2 (Week 2):** -4. Create Redactions Method (4 days) - -**Sprint 3 (Week 3):** -5. Convert to PDF/A (3 days) -6. Convert PDF to Images (3 days) - -### 💡 **Why This Order?** - -1. **Quick Wins First**: Build momentum with easy enhancements -2. **Complete Workflows**: Redaction creation completes the redaction workflow -3. **High User Value**: PDF/A and image conversion are frequently requested -4. **Incremental Complexity**: Start simple, build up to more complex features -5. 
**API Coverage**: These 6 features would increase API coverage significantly - -### 📈 **Expected Outcomes** - -After implementing these 6 features: -- **Methods**: 18 total (up from 12) -- **API Coverage**: ~50% (up from ~30%) -- **User Satisfaction**: Address most common feature requests -- **Time**: ~3 weeks of development - -## 🚀 Current Status - -As of the last update: -- **PR #7 (Direct API Methods)**: ✅ Merged - Added 5 new methods -- **PR #23 (OpenAPI Compliance)**: ✅ Merged - Added comprehensive documentation -- **Current Methods**: 12 Direct API methods -- **Test Coverage**: 94% -- **Python Support**: 3.8 - 3.12 - -## 📝 Notes - -- All features should maintain backward compatibility -- Each feature should include comprehensive tests -- Documentation should reference OpenAPI spec where applicable -- Integration tests should be added for each new method \ No newline at end of file diff --git a/GITHUB_ABOUT.md b/GITHUB_ABOUT.md deleted file mode 100644 index e54d040..0000000 --- a/GITHUB_ABOUT.md +++ /dev/null @@ -1,132 +0,0 @@ -# GitHub Repository Settings - -## About Section - -### Description -Official Python client library for Nutrient Document Web Services API - PDF processing, OCR, watermarking, and document manipulation with automatic Office format conversion - -### Website -https://www.nutrient.io/ - -### Topics (Tags) -Add these topics to make your repository more discoverable: - -**Core Technologies:** -- `python` -- `python3` -- `api-client` -- `sdk` -- `rest-api` - -**PDF & Document Processing:** -- `pdf` -- `pdf-processing` -- `pdf-manipulation` -- `pdf-generation` -- `document-processing` -- `document-conversion` -- `document-automation` - -**Features:** -- `ocr` -- `optical-character-recognition` -- `watermark` -- `pdf-merge` -- `pdf-rotation` -- `office-conversion` -- `docx-to-pdf` - -**Brand/Product:** -- `nutrient` -- `pspdfkit` -- `nutrient-api` -- `dws` - -**Development:** -- `type-hints` -- `async-ready` -- `well-tested` -- 
`developer-tools` - -## Recommended Repository Settings - -### ✅ Features to Enable: -- **Issues** - For bug reports and feature requests -- **Discussions** - For Q&A and community support -- **Wiki** - For additional documentation (optional) -- **Projects** - For tracking development roadmap - -### 🔧 Settings: -- **Allow forking** - Enable community contributions -- **Sponsorships** - If you want to accept sponsorships -- **Preserve this repository** - For long-term stability - -### 📌 Pinned Issues: -Consider pinning: -1. "Getting Started Guide" -2. "API Key Request" -3. "Roadmap & Feature Requests" - -### 🏷️ Issue Labels: -Add these custom labels: -- `api-question` - Questions about API usage -- `office-conversion` - Related to DOCX/XLSX/PPTX conversion -- `performance` - Performance-related issues -- `security` - Security-related issues - -### 📋 Issue Templates: -Consider adding templates for: -1. Bug Report -2. Feature Request -3. API Question -4. Documentation Issue - -## Social Preview - -Consider adding a social preview image that shows: -- Nutrient DWS logo -- "Python Client Library" -- Key features (PDF, OCR, Watermark, etc.) -- Code snippet example - -## Repository Insights to Highlight - -### In your README badges: -```markdown -![Python](https://img.shields.io/badge/python-3.8+-blue.svg) -![Coverage](https://img.shields.io/badge/coverage-92%25-brightgreen.svg) -![License](https://img.shields.io/badge/license-MIT-green.svg) -![PyPI](https://img.shields.io/pypi/v/nutrient-dws.svg) -``` - -### Quick Stats: -- **Language**: Python 100% -- **Test Coverage**: 92.46% -- **Dependencies**: Minimal (just `requests`) -- **Python Support**: 3.8, 3.9, 3.10, 3.11, 3.12 -- **API Operations**: 7 supported operations -- **Development Time**: Rapid implementation with comprehensive testing - -## Suggested Bio/Tagline Options - -1. "🚀 Transform documents at scale with Nutrient's Python SDK - PDF processing made simple" - -2. 
"📄 Enterprise-ready Python client for Nutrient DWS - Convert, OCR, watermark, and manipulate PDFs with ease" - -3. "🔧 The official Python SDK for Nutrient Document Web Services - Your toolkit for PDF automation" - -4. "⚡ Fast, reliable document processing in Python - Powered by Nutrient's cloud API" - -## SEO Keywords for Better Discovery - -Include these naturally in your README: -- Python PDF library -- Document automation API -- PDF OCR Python -- Office to PDF conversion -- PDF watermarking Python -- Document processing SDK -- Nutrient API Python -- Cloud PDF API -- PDF manipulation library -- Enterprise document processing \ No newline at end of file diff --git a/LLM_DOC.md b/LLM_DOC.md new file mode 100644 index 0000000..53f18cb --- /dev/null +++ b/LLM_DOC.md @@ -0,0 +1,1920 @@ +# Nutrient DWS Python Client Documentation + +> Nutrient DWS is a document processing service which provides document processing operations including conversion, merging, compression, watermarking, signage, and text extraction. + +## Authentication + +### Direct API Key + +Provide your API key directly: + +```python +from nutrient_dws import NutrientClient + +client = NutrientClient(api_key='your_api_key') +``` + +### Token Provider + +Use an async token provider to fetch tokens from a secure source: + +```python +import httpx +from nutrient_dws import NutrientClient + +async def get_token(): + async with httpx.AsyncClient() as http_client: + response = await http_client.get('/api/get-nutrient-token') + data = response.json() + return data['token'] + +client = NutrientClient(api_key=get_token) +``` + +## NutrientClient + +The main client for interacting with the Nutrient DWS Processor API. 
+ +### Constructor + +```python +NutrientClient(api_key: str | Callable[[], Awaitable[str] | str], base_url: str | None = None, timeout: int | None = None) +``` + +Parameters: +- `api_key` (required): Your API key string or async function returning a token +- `base_url` (optional): Custom API base URL (defaults to `https://api.nutrient.io`) +- `timeout` (optional): Request timeout in milliseconds + +## Direct Methods + +The client provides numerous async methods for document processing: + +### Account Methods + +#### get_account_info() +Gets account information for the current API key. + +**Returns**: `AccountInfo` - Account information dictionary + +```python +account_info = await client.get_account_info() + +# Access subscription information +print(account_info['subscriptionType']) +``` + +#### create_token(params) +Creates a new authentication token. + +**Parameters**: +- `params: CreateAuthTokenParameters` - Parameters for creating the token + +**Returns**: `CreateAuthTokenResponse` - The created token information + +```python +token = await client.create_token({ + 'expirationTime': 3600 +}) +print(token['id']) + +# Store the token for future use +token_id = token['id'] +token_value = token['accessToken'] +``` + +#### delete_token(id) +Deletes an authentication token. + +**Parameters**: +- `id: str` - ID of the token to delete + +**Returns**: `None` + +```python +await client.delete_token('token-id-123') + +# Example in a token management function +async def revoke_user_token(token_id: str) -> bool: + try: + await client.delete_token(token_id) + print(f'Token {token_id} successfully revoked') + return True + except Exception as error: + print(f'Failed to revoke token: {error}') + return False +``` + +### Document Processing Methods + +#### sign(file, data?, options?) +Signs a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to sign +- `data: CreateDigitalSignature | None` - Signature data (optional) +- `options: SignRequestOptions | None` - Additional options (image, graphicImage) (optional) + +**Returns**: `BufferOutput` - The signed PDF file output + +```python +result = await client.sign('document.pdf', { + 'signatureType': 'cms', + 'flatten': False, + 'cadesLevel': 'b-lt' +}) + +# Access the signed PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('signed-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### create_redactions_ai(file, criteria, redaction_state?, pages?, options?) +Uses AI to redact sensitive information in a document. + +**Parameters**: +- `file: FileInput` - The PDF file to redact +- `criteria: str` - AI redaction criteria +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional pages to redact +- `options: RedactOptions | None` - Optional redaction options + +**Returns**: `BufferOutput` - The redacted document + +```python +# Stage redactions +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all emails' +) + +# Apply redactions immediately +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all PII', + 'apply' +) + +# Redact only specific pages +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all emails', + 'stage', + {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4 +) + +# Redact only the last 3 pages +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all PII', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) + +# Access the redacted PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with 
open('redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### ocr(file, language) +Performs OCR (Optical Character Recognition) on a document. + +**Parameters**: +- `file: FileInput` - The input file to perform OCR on +- `language: OcrLanguage | list[OcrLanguage]` - The language(s) to use for OCR + +**Returns**: `BufferOutput` - The OCR result + +```python +result = await client.ocr('scanned-document.pdf', 'english') + +# Access the OCR-processed PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('ocr-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### watermark_text(file, text, options?) +Adds a text watermark to a document. + +**Parameters**: +- `file: FileInput` - The input file to watermark +- `text: str` - The watermark text +- `options: dict[str, Any] | None` - Watermark options (optional) + +**Returns**: `BufferOutput` - The watermarked document + +```python +result = await client.watermark_text('document.pdf', 'CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 24 +}) + +# Access the watermarked PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('watermarked-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### watermark_image(file, image, options?) +Adds an image watermark to a document. 
+ +**Parameters**: +- `file: FileInput` - The input file to watermark +- `image: FileInput` - The watermark image +- `options: ImageWatermarkActionOptions | None` - Watermark options (optional) + +**Returns**: `BufferOutput` - The watermarked document + +```python +result = await client.watermark_image('document.pdf', 'watermark.jpg', { + 'opacity': 0.5, + 'width': {'value': 50, 'unit': "%"}, + 'height': {'value': 50, 'unit': "%"} +}) + +# Access the watermarked PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('image-watermarked-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### convert(file, target_format) +Converts a document to a different format. + +**Parameters**: +- `file: FileInput` - The input file to convert +- `target_format: OutputFormat` - The target format to convert to + +**Returns**: `BufferOutput | ContentOutput | JsonContentOutput` - The specific output type based on the target format + +```python +# Convert DOCX to PDF +pdf_result = await client.convert('document.docx', 'pdf') +# Supports formats: pdf, pdfa, pdfua, docx, xlsx, pptx, png, jpeg, jpg, webp, html, markdown + +# Access the PDF buffer +pdf_buffer = pdf_result['buffer'] +print(pdf_result['mimeType']) # 'application/pdf' + +# Save the PDF +with open('converted-document.pdf', 'wb') as f: + f.write(pdf_buffer) + +# Convert PDF to image +image_result = await client.convert('document.pdf', 'png') + +# Access the PNG buffer +png_buffer = image_result['buffer'] +print(image_result['mimeType']) # 'image/png' + +# Save the image +with open('document-page.png', 'wb') as f: + f.write(png_buffer) +``` + +#### merge(files) +Merges multiple documents into one. 
+ +**Parameters**: +- `files: list[FileInput]` - The files to merge + +**Returns**: `BufferOutput` - The merged document + +```python +result = await client.merge([ + 'doc1.pdf', + 'doc2.pdf', + 'doc3.pdf' +]) + +# Access the merged PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('merged-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### extract_text(file, pages?) +Extracts text content from a document. + +**Parameters**: +- `file: FileInput` - The file to extract text from +- `pages: PageRange | None` - Optional page range to extract text from + +**Returns**: `JsonContentOutput` - The extracted text data + +```python +result = await client.extract_text('document.pdf') + +# Extract text from specific pages +result = await client.extract_text('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract text from the last page +result = await client.extract_text('document.pdf', {'end': -1}) # Last page + +# Extract text from the second-to-last page to the end +result = await client.extract_text('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted text content +text_content = result['data']['pages'][0]['plainText'] + +# Process the extracted text +word_count = len(text_content.split()) +print(f'Document contains {word_count} words') + +# Search for specific content +if 'confidential' in text_content: + print('Document contains confidential information') +``` + +#### extract_table(file, pages?) +Extracts table content from a document. 
+ +**Parameters**: +- `file: FileInput` - The file to extract tables from +- `pages: PageRange | None` - Optional page range to extract tables from + +**Returns**: `JsonContentOutput` - The extracted table data + +```python +result = await client.extract_table('document.pdf') + +# Extract tables from specific pages +result = await client.extract_table('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract tables from the last page +result = await client.extract_table('document.pdf', {'end': -1}) # Last page + +# Extract tables from the second-to-last page to the end +result = await client.extract_table('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted tables +tables = result['data']['pages'][0]['tables'] + +# Process the first table if available +if tables and len(tables) > 0: + first_table = tables[0] + + # Get table dimensions + print(f"Table has {len(first_table['rows'])} rows and {len(first_table['columns'])} columns") + + # Access table cells + for i in range(len(first_table['rows'])): + for j in range(len(first_table['columns'])): + cell = next((cell for cell in first_table['cells'] + if cell['rowIndex'] == i and cell['columnIndex'] == j), None) + cell_content = cell['text'] if cell else '' + print(f"Cell [{i}][{j}]: {cell_content}") + + # Convert table to CSV + csv_content = '' + for i in range(len(first_table['rows'])): + row_data = [] + for j in range(len(first_table['columns'])): + cell = next((cell for cell in first_table['cells'] + if cell['rowIndex'] == i and cell['columnIndex'] == j), None) + row_data.append(cell['text'] if cell else '') + csv_content += ','.join(row_data) + '\n' + print(csv_content) +``` + +#### extract_key_value_pairs(file, pages?) +Extracts key value pair content from a document. 
+ +**Parameters**: +- `file: FileInput` - The file to extract KVPs from +- `pages: PageRange | None` - Optional page range to extract KVPs from + +**Returns**: `JsonContentOutput` - The extracted KVPs data + +```python +result = await client.extract_key_value_pairs('document.pdf') + +# Extract KVPs from specific pages +result = await client.extract_key_value_pairs('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract KVPs from the last page +result = await client.extract_key_value_pairs('document.pdf', {'end': -1}) # Last page + +# Extract KVPs from the second-to-last page to the end +result = await client.extract_key_value_pairs('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted key-value pairs +kvps = result['data']['pages'][0]['keyValuePairs'] + +# Process the key-value pairs +if kvps and len(kvps) > 0: + # Iterate through all key-value pairs + for index, kvp in enumerate(kvps): + print(f'KVP {index + 1}:') + print(f' Key: {kvp["key"]}') + print(f' Value: {kvp["value"]}') + print(f' Confidence: {kvp["confidence"]}') + + # Create a dictionary from the key-value pairs + dictionary = {} + for kvp in kvps: + dictionary[kvp['key']] = kvp['value'] + + # Look up specific values + print(f'Invoice Number: {dictionary.get("Invoice Number")}') + print(f'Date: {dictionary.get("Date")}') + print(f'Total Amount: {dictionary.get("Total")}') +``` + +#### flatten(file, annotation_ids?) +Flattens annotations in a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to flatten +- `annotation_ids: list[str | int] | None` - Optional specific annotation IDs to flatten + +**Returns**: `BufferOutput` - The flattened document + +```python +# Flatten all annotations +result = await client.flatten('annotated-document.pdf') + +# Flatten specific annotations by ID +result = await client.flatten('annotated-document.pdf', ['annotation1', 'annotation2']) +``` + +#### password_protect(file, user_password, owner_password, permissions?) +Password protects a PDF document. + +**Parameters**: +- `file: FileInput` - The file to protect +- `user_password: str` - Password required to open the document +- `owner_password: str` - Password required to modify the document +- `permissions: list[PDFUserPermission] | None` - Optional list of permissions granted when opened with user password + +**Returns**: `BufferOutput` - The password-protected document + +```python +result = await client.password_protect('document.pdf', 'user123', 'owner456') + +# Or with specific permissions: +result = await client.password_protect('document.pdf', 'user123', 'owner456', + ['printing', 'extract_accessibility']) + +# Access the password-protected PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('protected-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### set_metadata(file, metadata) +Sets metadata for a PDF document. + +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `metadata: Metadata` - The metadata to set (title and/or author) + +**Returns**: `BufferOutput` - The document with updated metadata + +```python +result = await client.set_metadata('document.pdf', { + 'title': 'My Document', + 'author': 'John Doe' +}) +``` + +#### set_page_labels(file, labels) +Sets page labels for a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `labels: list[Label]` - List of label objects with pages and label properties + +**Returns**: `BufferOutput` - The document with updated page labels + +```python +result = await client.set_page_labels('document.pdf', [ + {'pages': [0, 1, 2], 'label': 'Cover'}, + {'pages': [3, 4, 5], 'label': 'Chapter 1'} +]) + +# Access the updated PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('labeled-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### apply_instant_json(file, instant_json_file) +Applies Instant JSON to a document. + +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `instant_json_file: FileInput` - The Instant JSON file to apply + +**Returns**: `BufferOutput` - The modified document + +```python +result = await client.apply_instant_json('document.pdf', 'annotations.json') + +# Access the modified PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('annotated-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### apply_xfdf(file, xfdf_file, options?) +Applies XFDF to a document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `xfdf_file: FileInput` - The XFDF file to apply +- `options: ApplyXfdfActionOptions | None` - Optional settings for applying XFDF + +**Returns**: `BufferOutput` - The modified document + +```python +result = await client.apply_xfdf('document.pdf', 'annotations.xfdf') + +# Or with options: +result = await client.apply_xfdf( + 'document.pdf', 'annotations.xfdf', + {'ignorePageRotation': True, 'richTextEnabled': False} +) + +# Access the modified PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('xfdf-applied-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### create_redactions_preset(file, preset, redaction_state?, pages?, preset_options?, options?) +Creates redaction annotations based on a preset pattern. + +**Parameters**: +- `file: FileInput` - The PDF file to create redactions in +- `preset: SearchPreset` - The preset pattern to search for (e.g., 'email-address', 'social-security-number') +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional page range to create redactions in +- `preset_options: CreateRedactionsStrategyOptionsPreset | None` - Optional settings for the preset strategy +- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions + +**Returns**: `BufferOutput` - The document with redaction annotations + +```python +result = await client.create_redactions_preset('document.pdf', 'email-address') + +# With specific pages +result = await client.create_redactions_preset( + 'document.pdf', + 'email-address', + 'stage', + {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4 +) + +# With the last 3 pages +result = await client.create_redactions_preset( + 'document.pdf', + 'email-address', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) + 
+# Access the document with redactions +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### create_redactions_regex(file, regex, redaction_state?, pages?, regex_options?, options?) +Creates redaction annotations based on a regular expression. + +**Parameters**: +- `file: FileInput` - The PDF file to create redactions in +- `regex: str` - The regular expression to search for +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional page range to create redactions in +- `regex_options: CreateRedactionsStrategyOptionsRegex | None` - Optional settings for the regex strategy +- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions + +**Returns**: `BufferOutput` - The document with redaction annotations + +```python +result = await client.create_redactions_regex('document.pdf', r'Account:\s*\d{8,12}') + +# With specific pages +result = await client.create_redactions_regex( + 'document.pdf', + r'Account:\s*\d{8,12}', + 'stage', + {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4 +) + +# With the last 3 pages +result = await client.create_redactions_regex( + 'document.pdf', + r'Account:\s*\d{8,12}', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) + +# Access the document with redactions +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('regex-redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### create_redactions_text(file, text, redaction_state?, pages?, text_options?, options?) +Creates redaction annotations based on text. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to create redactions in +- `text: str` - The text to search for +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional page range to create redactions in +- `text_options: CreateRedactionsStrategyOptionsText | None` - Optional settings for the text strategy +- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions + +**Returns**: `BufferOutput` - The document with redaction annotations + +```python +result = await client.create_redactions_text('document.pdf', 'email@example.com') + +# With specific pages and options +result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage', + {'start': 0, 'end': 4}, # Pages 0, 1, 2, 3, 4 + {'caseSensitive': False, 'includeAnnotations': True} +) + +# Create redactions on the last 3 pages +result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) + +# Access the document with redactions +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('text-redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### apply_redactions(file) +Applies redaction annotations in a document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file with redaction annotations to apply + +**Returns**: `BufferOutput` - The document with applied redactions + +```python +# Stage redactions from a createRedaction Method: +staged_result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage' +) + +result = await client.apply_redactions(staged_result['buffer']) + +# Access the final redacted document +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('final-redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### rotate(file, angle, pages?) +Rotates pages in a document. + +**Parameters**: +- `file: FileInput` - The PDF file to rotate +- `angle: Literal[90, 180, 270]` - Rotation angle (90, 180, or 270 degrees) +- `pages: PageRange | None` - Optional page range to rotate + +**Returns**: `BufferOutput` - The entire document with specified pages rotated + +```python +result = await client.rotate('document.pdf', 90) + +# Rotate specific pages: +result = await client.rotate('document.pdf', 90, {'start': 1, 'end': 3}) # Pages 1, 2, 3 + +# Rotate the last page: +result = await client.rotate('document.pdf', 90, {'end': -1}) # Last page + +# Rotate from page 2 to the second-to-last page: +result = await client.rotate('document.pdf', 90, {'start': 2, 'end': -2}) + +# Access the rotated PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('rotated-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### add_page(file, count?, index?) +Adds blank pages to a document. + +**Parameters**: +- `file: FileInput` - The PDF file to add pages to +- `count: int` - The number of blank pages to add (default: 1) +- `index: int | None` - Optional index where to add the blank pages (0-based). 
If not provided, pages are added at the end. + +**Returns**: `BufferOutput` - The document with added pages + +```python +# Add 2 blank pages at the end +result = await client.add_page('document.pdf', 2) + +# Add 1 blank page after the first page (at index 1) +result = await client.add_page('document.pdf', 1, 1) + +# Access the document with added pages +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('document-with-pages.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### optimize(file, options?) +Optimizes a PDF document for size reduction. + +**Parameters**: +- `file: FileInput` - The PDF file to optimize +- `options: OptimizePdf | None` - Optimization options + +**Returns**: `BufferOutput` - The optimized document + +```python +result = await client.optimize('large-document.pdf', { + 'grayscaleImages': True, + 'mrcCompression': True, + 'imageOptimizationQuality': 2 +}) + +# Access the optimized PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('optimized-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### split(file, page_ranges) +Splits a PDF document into multiple parts based on page ranges. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to split +- `page_ranges: list[PageRange]` - List of page ranges to extract + +**Returns**: `list[BufferOutput]` - A list of PDF documents, one for each page range + +```python +results = await client.split('document.pdf', [ + {'start': 0, 'end': 2}, # Pages 0, 1, 2 + {'start': 3, 'end': 5} # Pages 3, 4, 5 +]) + +# Split using negative indices +results = await client.split('document.pdf', [ + {'start': 0, 'end': 2}, # First three pages + {'start': 3, 'end': -3}, # Middle pages + {'start': -2, 'end': -1} # Last two pages +]) + +# Process each resulting PDF +for i, result in enumerate(results): + # Access the PDF buffer + pdf_buffer = result['buffer'] + + # Get the MIME type of the output + print(result['mimeType']) # 'application/pdf' + + # Save the buffer to a file + with open(f'split-part-{i}.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### duplicate_pages(file, page_indices) +Creates a new PDF containing only the specified pages in the order provided. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to extract pages from +- `page_indices: list[int]` - List of page indices to include in the new PDF (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + +**Returns**: `BufferOutput` - A new document with only the specified pages + +```python +# Create a new PDF with only the first and third pages +result = await client.duplicate_pages('document.pdf', [0, 2]) + +# Create a new PDF with pages in a different order +result = await client.duplicate_pages('document.pdf', [2, 0, 1]) + +# Create a new PDF with duplicated pages +result = await client.duplicate_pages('document.pdf', [0, 0, 1, 1, 0]) + +# Create a new PDF with the first and last pages +result = await client.duplicate_pages('document.pdf', [0, -1]) + +# Create a new PDF with the last three pages in reverse order +result = await client.duplicate_pages('document.pdf', [-1, -2, -3]) + +# Access the PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('duplicated-pages.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +#### delete_pages(file, page_indices) +Deletes pages from a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `page_indices: list[int]` - List of page indices to delete (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + +**Returns**: `BufferOutput` - The document with deleted pages + +```python +# Delete second and fourth pages +result = await client.delete_pages('document.pdf', [1, 3]) + +# Delete the last page +result = await client.delete_pages('document.pdf', [-1]) + +# Delete the first and last two pages +result = await client.delete_pages('document.pdf', [0, -1, -2]) + +# Access the modified PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('modified-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +### Error Handling + +The library provides a comprehensive error hierarchy: + +```python +from nutrient_dws import ( + NutrientError, + ValidationError, + APIError, + AuthenticationError, + NetworkError +) + +try: + result = await client.convert('file.docx', 'pdf') +except ValidationError as error: + # Invalid input parameters + print(f'Invalid input: {error.message} - Details: {error.details}') +except AuthenticationError as error: + # Authentication failed + print(f'Auth error: {error.message} - Status: {error.status_code}') +except APIError as error: + # API returned an error + print(f'API error: {error.message} - Status: {error.status_code} - Details: {error.details}') +except NetworkError as error: + # Network request failed + print(f'Network error: {error.message} - Details: {error.details}') +``` + +## Workflow Methods + +The Nutrient DWS Python Client uses a fluent builder pattern with staged interfaces to create document processing workflows. This architecture provides several benefits: + +1. **Type Safety**: The staged interface ensures that methods are only available at appropriate stages +2. 
**Readability**: Method chaining creates readable, declarative code +3. **Discoverability**: IDE auto-completion guides you through the workflow stages +4. **Flexibility**: Complex workflows can be built with simple, composable pieces + +### Stage 0: Create Workflow + +You have several ways of creating a workflow + +```python +# Creating Workflow from a client +workflow = client.workflow() + +# Override the client timeout +workflow = client.workflow(60000) + +# Create a workflow without a client +from nutrient_dws.builder.builder import StagedWorkflowBuilder +workflow = StagedWorkflowBuilder({ + 'apiKey': 'your-api-key' +}) +``` + +### Stage 1: Add Parts + +In this stage, you add document parts to the workflow: + +```python +workflow = (client.workflow() + .add_file_part('document.pdf') + .add_file_part('appendix.pdf')) +``` + +Available methods: + +#### `add_file_part(file, options?, actions?)` +Adds a file part to the workflow. + +**Parameters:** +- `file: FileInput` - The file to add to the workflow. Can be a local file path, bytes, or file-like object. +- `options: FilePartOptions | None` - Additional options for the file part (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the file part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Add a PDF file from a local path +workflow.add_file_part('/path/to/document.pdf') + +# Add a file with options and actions +workflow.add_file_part( + '/path/to/document.pdf', + {'pages': {'start': 1, 'end': 3}}, + [BuildActions.watermark_text('CONFIDENTIAL')] +) +``` + +#### `add_html_part(html, assets?, options?, actions?)` +Adds an HTML part to the workflow. + +**Parameters:** +- `html: FileInput` - The HTML content to add. Can be a file path, bytes, or file-like object. +- `assets: list[FileInput] | None` - Optional list of assets (CSS, images, etc.) to include with the HTML. 
Only local files or bytes are supported (optional) +- `options: HTMLPartOptions | None` - Additional options for the HTML part (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the HTML part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Add HTML content from a file +workflow.add_html_part('/path/to/content.html') + +# Add HTML with assets and options +workflow.add_html_part( + '/path/to/content.html', + ['/path/to/style.css', '/path/to/image.png'], + {'layout': {'size': 'A4'}} +) +``` + +#### `add_new_page(options?, actions?)` +Adds a new blank page to the workflow. + +**Parameters:** +- `options: NewPagePartOptions | None` - Additional options for the new page, such as page size, orientation, etc. (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the new page (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Add a simple blank page +workflow.add_new_page() + +# Add a new page with specific options +workflow.add_new_page({ + 'layout': {'size': 'A4', 'orientation': 'portrait'} +}) +``` + +#### `add_document_part(document_id, options?, actions?)` +Adds a document part to the workflow by referencing an existing document by ID. + +**Parameters:** +- `document_id: str` - The ID of the document to add to the workflow. +- `options: DocumentPartOptions | None` - Additional options for the document part (optional) + - `options['layer']: str` - Optional layer name to select a specific layer from the document. +- `actions: list[BuildAction] | None` - Actions to apply to the document part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. 
+ +**Example:** +```python +# Add a document by ID +workflow.add_document_part('doc_12345abcde') + +# Add a document with a specific layer and options +workflow.add_document_part( + 'doc_12345abcde', + { + 'layer': 'content', + 'pages': {'start': 0, 'end': 3} + } +) +``` + +### Stage 2: Apply Actions (Optional) + +In this stage, you can apply actions to the document: + +```python +workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 48 +})) +``` + +Available methods: + +#### `apply_action(action)` +Applies a single action to the workflow. + +**Parameters:** +- `action: BuildAction` - The action to apply to the workflow. + +**Returns:** `WorkflowWithActionsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Apply a watermark action +workflow.apply_action( + BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.3, + 'rotation': 45 + }) +) + +# Apply an OCR action +workflow.apply_action(BuildActions.ocr('english')) +``` + +#### `apply_actions(actions)` +Applies multiple actions to the workflow. + +**Parameters:** +- `actions: list[BuildAction]` - A list of actions to apply to the workflow. + +**Returns:** `WorkflowWithActionsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Apply multiple actions to the workflow +workflow.apply_actions([ + BuildActions.watermark_text('DRAFT', {'opacity': 0.5}), + BuildActions.ocr('english'), + BuildActions.flatten() +]) +``` + +#### Action Types: + +#### Document Processing + +##### `BuildActions.ocr(language)` +Creates an OCR (Optical Character Recognition) action to extract text from images or scanned documents. + +**Parameters:** +- `language: str | list[str]` - Language(s) for OCR. Can be a single language or a list of languages. 
+ +**Example:** +```python +# Basic OCR with English language +workflow.apply_action(BuildActions.ocr('english')) + +# OCR with multiple languages +workflow.apply_action(BuildActions.ocr(['english', 'french', 'german'])) + +# OCR with options (via dict syntax) +workflow.apply_action(BuildActions.ocr({ + 'language': 'english', + 'enhanceResolution': True +})) +``` + +##### `BuildActions.rotate(rotate_by)` +Creates an action to rotate pages in the document. + +**Parameters:** +- `rotate_by: Literal[90, 180, 270]` - Rotation angle in degrees (must be 90, 180, or 270). + +**Example:** +```python +# Rotate pages by 90 degrees +workflow.apply_action(BuildActions.rotate(90)) + +# Rotate pages by 180 degrees +workflow.apply_action(BuildActions.rotate(180)) +``` + +##### `BuildActions.flatten(annotation_ids?)` +Creates an action to flatten annotations into the document content, making them non-interactive but permanently visible. + +**Parameters:** +- `annotation_ids: list[str | int] | None` - Optional list of annotation IDs to flatten. If not specified, all annotations will be flattened (optional) + +**Example:** +```python +# Flatten all annotations +workflow.apply_action(BuildActions.flatten()) + +# Flatten specific annotations +workflow.apply_action(BuildActions.flatten(['annotation1', 'annotation2'])) +``` + +#### Watermarking + +##### `BuildActions.watermark_text(text, options?)` +Creates an action to add a text watermark to the document. + +**Parameters:** +- `text: str` - Watermark text content. +- `options: TextWatermarkActionOptions | None` - Watermark options (optional): + - `width`: Width dimension of the watermark (dict with 'value' and 'unit', e.g. 
`{'value': 100, 'unit': '%'}`) + - `height`: Height dimension of the watermark (dict with 'value' and 'unit') + - `top`, `right`, `bottom`, `left`: Position of the watermark (dict with 'value' and 'unit') + - `rotation`: Rotation of the watermark in counterclockwise degrees (default: 0) + - `opacity`: Watermark opacity (0 is fully transparent, 1 is fully opaque) + - `fontFamily`: Font family for the text (e.g. 'Helvetica') + - `fontSize`: Size of the text in points + - `fontColor`: Foreground color of the text (e.g. '#ffffff') + - `fontStyle`: Text style list (['bold'], ['italic'], or ['bold', 'italic']) + +**Example:** + +```python +# Simple text watermark +workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL')) + +# Customized text watermark +workflow.apply_action(BuildActions.watermark_text('DRAFT', { + 'opacity': 0.5, + 'rotation': 45, + 'fontSize': 36, + 'fontColor': '#FF0000', + 'fontStyle': ['bold', 'italic'] +})) +``` + +##### `BuildActions.watermark_image(image, options?)` +Creates an action to add an image watermark to the document. + +**Parameters:** +- `image: FileInput` - Watermark image (file path, bytes, or file-like object). +- `options: ImageWatermarkActionOptions | None` - Watermark options (optional): + - `width`: Width dimension of the watermark (dict with 'value' and 'unit', e.g. 
`{'value': 100, 'unit': '%'}`) + - `height`: Height dimension of the watermark (dict with 'value' and 'unit') + - `top`, `right`, `bottom`, `left`: Position of the watermark (dict with 'value' and 'unit') + - `rotation`: Rotation of the watermark in counterclockwise degrees (default: 0) + - `opacity`: Watermark opacity (0 is fully transparent, 1 is fully opaque) + +**Example:** + +```python +# Simple image watermark +workflow.apply_action(BuildActions.watermark_image('/path/to/logo.png')) + +# Customized image watermark +workflow.apply_action(BuildActions.watermark_image('/path/to/logo.png', { + 'opacity': 0.3, + 'width': {'value': 50, 'unit': '%'}, + 'height': {'value': 50, 'unit': '%'}, + 'top': {'value': 10, 'unit': 'px'}, + 'left': {'value': 10, 'unit': 'px'}, + 'rotation': 0 +})) +``` + +#### Annotations + +##### `BuildActions.apply_instant_json(file)` +Creates an action to apply annotations from an Instant JSON file to the document. + +**Parameters:** +- `file: FileInput` - Instant JSON file input (file path, bytes, or file-like object). + +**Example:** + +```python +# Apply annotations from Instant JSON file +workflow.apply_action(BuildActions.apply_instant_json('/path/to/annotations.json')) +``` + +##### `BuildActions.apply_xfdf(file, options?)` +Creates an action to apply annotations from an XFDF file to the document. + +**Parameters:** +- `file: FileInput` - XFDF file input (file path, bytes, or file-like object). +- `options: ApplyXfdfActionOptions | None` - Apply XFDF options (optional): + - `ignorePageRotation: bool` - If True, ignores page rotation when applying XFDF data (default: False) + - `richTextEnabled: bool` - If True, plain text annotations will be converted to rich text annotations. 
If False, all text annotations will be plain text annotations (default: True) + +**Example:** + +```python +# Apply annotations from XFDF file with default options +workflow.apply_action(BuildActions.apply_xfdf('/path/to/annotations.xfdf')) + +# Apply annotations with specific options +workflow.apply_action(BuildActions.apply_xfdf('/path/to/annotations.xfdf', { + 'ignorePageRotation': True, + 'richTextEnabled': False +})) +``` + +#### Redactions + +##### `BuildActions.create_redactions_text(text, options?, strategy_options?)` +Creates an action to add redaction annotations based on text search. + +**Parameters:** +- `text: str` - Text to search and redact. +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) +- `strategy_options: CreateRedactionsStrategyOptionsText | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided text (default: True) + - `caseSensitive: bool` - If True, the search will be case sensitive (default: False) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for all occurrences of "Confidential" +workflow.apply_action(BuildActions.create_redactions_text('Confidential')) + +# Create redactions with custom appearance and search options +workflow.apply_action(BuildActions.create_redactions_text('Confidential', + { + 'content': { + 'backgroundColor': '#000000', + 'overlayText': 'REDACTED', + 'textColor': '#FFFFFF' + } + }, + { + 'caseSensitive': True, + 'start': 2, + 'limit': 5 + } + )) +``` + +##### `BuildActions.create_redactions_regex(regex, options?, strategy_options?)` +Creates an action to 
add redaction annotations based on regex pattern matching. + +**Parameters:** +- `regex: str` - Regex pattern to search and redact. +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) +- `strategy_options: CreateRedactionsStrategyOptionsRegex | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided regex (default: True) + - `caseSensitive: bool` - If True, the search will be case sensitive (default: True) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for email addresses +workflow.apply_action(BuildActions.create_redactions_regex(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')) + +# Create redactions with custom appearance and search options +workflow.apply_action(BuildActions.create_redactions_regex(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', + { + 'content': { + 'backgroundColor': '#FF0000', + 'overlayText': 'EMAIL REDACTED' + } + }, + { + 'caseSensitive': False, + 'start': 0, + 'limit': 10 + } + )) +``` + +##### `BuildActions.create_redactions_preset(preset, options?, strategy_options?)` +Creates an action to add redaction annotations based on a preset pattern. + +**Parameters:** +- `preset: str` - Preset pattern to search and redact (e.g. 'email-address', 'credit-card-number', 'social-security-number', etc.) +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) 
+- `strategy_options: CreateRedactionsStrategyOptionsPreset | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided preset (default: True) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for email addresses using preset +workflow.apply_action(BuildActions.create_redactions_preset('email-address')) + +# Create redactions for credit card numbers with custom appearance +workflow.apply_action(BuildActions.create_redactions_preset('credit-card-number', + { + 'content': { + 'backgroundColor': '#000000', + 'overlayText': 'FINANCIAL DATA' + } + }, + { + 'start': 0, + 'limit': 5 + } + )) +``` + +##### `BuildActions.apply_redactions()` +Creates an action to apply previously created redaction annotations, permanently removing the redacted content. + +**Example:** + +```python +# First create redactions +workflow.apply_action(BuildActions.create_redactions_preset('email-address')) + +# Then apply them +workflow.apply_action(BuildActions.apply_redactions()) +``` + +### Stage 3: Set Output Format + +In this stage, you specify the desired output format: + +```python +workflow.output_pdf({ + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 2 + } +}) +``` + +Available methods: + +#### `output_pdf(options?)` +Sets the output format to PDF. + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF output, such as compression, encryption, etc. (optional) + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. + - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. 
+ - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF with default options +workflow.output_pdf() + +# Set output format to PDF with specific options +workflow.output_pdf({ + 'user_password': 'secret', + 'user_permissions': ["printing"], + 'metadata': { + 'title': 'Important Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 3 + } +}) +``` + +#### `output_pdfa(options?)` +Sets the output format to PDF/A (archival PDF). + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF/A output (optional): + - `options['conformance']: str` - The PDF/A conformance level to target. Options include 'pdfa-1b', 'pdfa-1a', 'pdfa-2b', 'pdfa-2a', 'pdfa-3b', 'pdfa-3a'. + Different levels have different requirements for long-term archiving. + - `options['vectorization']: bool` - When True, attempts to convert raster content to vector graphics where possible, improving quality and reducing file size. 
+ - `options['rasterization']: bool` - When True, converts vector graphics to raster images, which can help with compatibility in some cases. + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. + - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. + - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF/A with default options +workflow.output_pdfa() + +# Set output format to PDF/A with specific options +workflow.output_pdfa({ + 'conformance': 'pdfa-2b', + 'vectorization': True, + 'metadata': { + 'title': 'Archive Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True + } +}) +``` + +#### `output_pdfua(options?)` +Sets the output format to PDF/UA (Universal Accessibility). + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF/UA output (optional): + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. 
+ - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. + - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF/UA with default options +workflow.output_pdfua() + +# Set output format to PDF/UA with specific options +workflow.output_pdfua({ + 'metadata': { + 'title': 'Accessible Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 3 + } +}) +``` + +#### `output_image(format, options?)` +Sets the output format to an image format (PNG, JPEG, WEBP). + +**Parameters:** +- `format: Literal['png', 'jpeg', 'jpg', 'webp']` - The image format to output. + - PNG: Lossless compression, supports transparency, best for graphics and screenshots + - JPEG/JPG: Lossy compression, smaller file size, best for photographs + - WEBP: Modern format with both lossy and lossless compression, good for web use +- `options: dict[str, Any] | None` - Additional options for image output, such as resolution, quality, etc. 
(optional) + **Note: At least one of options['width'], options['height'], or options['dpi'] must be specified.** + - `options['pages']: dict[str, int]` - Specifies which pages to convert to images. If omitted, all pages are converted. + - `options['pages']['start']: int` - The first page to convert (0-based index). + - `options['pages']['end']: int` - The last page to convert (0-based index). + - `options['width']: int` - The width of the output image in pixels. If specified without height, aspect ratio is maintained. + - `options['height']: int` - The height of the output image in pixels. If specified without width, aspect ratio is maintained. + - `options['dpi']: int` - The resolution in dots per inch. Higher values create larger, more detailed images. + Common values: 72 (web), 150 (standard), 300 (print quality), 600 (high quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PNG with dpi specified +workflow.output_image('png', {'dpi': 300}) + +# Set output format to JPEG with specific options +workflow.output_image('jpeg', { + 'dpi': 300, + 'pages': {'start': 1, 'end': 3} +}) + +# Set output format to WEBP with specific dimensions +workflow.output_image('webp', { + 'width': 1200, + 'height': 800, + 'dpi': 150 +}) +``` + +#### `output_office(format)` +Sets the output format to an Office document format (DOCX, XLSX, PPTX). + +**Parameters:** +- `format: Literal['docx', 'xlsx', 'pptx']` - The Office format to output ('docx' for Word, 'xlsx' for Excel, or 'pptx' for PowerPoint). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. 
+ +**Example:** +```python +# Set output format to Word document (DOCX) +workflow.output_office('docx') + +# Set output format to Excel spreadsheet (XLSX) +workflow.output_office('xlsx') + +# Set output format to PowerPoint presentation (PPTX) +workflow.output_office('pptx') +``` + +#### `output_html(layout)` +Sets the output format to HTML. + +**Parameters:** +- `layout: Literal['page', 'reflow']` - The layout type to use for conversion to HTML: + - 'page' layout keeps the original structure of the document, segmented by page. + - 'reflow' layout converts the document into a continuous flow of text, without page breaks. + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to HTML +workflow.output_html('page') +``` + +#### `output_markdown()` +Sets the output format to Markdown. + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to Markdown with default options +workflow.output_markdown() +``` + +#### `output_json(options?)` +Sets the output format to JSON content. + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for JSON output (optional): + - `options['plainText']: bool` - When True, extracts plain text content from the document and includes it in the JSON output. + This provides the raw text without structural information. + - `options['structuredText']: bool` - When True, extracts text with structural information (paragraphs, headings, etc.) + and includes it in the JSON output. + - `options['keyValuePairs']: bool` - When True, attempts to identify and extract key-value pairs from the document + (like form fields, labeled data, etc.) and includes them in the JSON output. + - `options['tables']: bool` - When True, attempts to identify and extract tabular data from the document + and includes it in the JSON output as structured table objects. 
+
+  - `options['language']: str | list[str]` - Specifies the language(s) of the document content for better text extraction.
+    Can be a single language code or a list of language codes for multi-language documents.
+    Examples: "english", "french", "german", or ["english", "spanish"].
+
+**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining.
+
+**Example:**
+```python
+# Set output format to JSON with default options
+workflow.output_json()
+
+# Set output format to JSON with specific options
+workflow.output_json({
+    'plainText': True,
+    'structuredText': True,
+    'keyValuePairs': True,
+    'tables': True,
+    'language': "english"
+})
+
+# Set output format to JSON with multiple languages
+workflow.output_json({
+    'plainText': True,
+    'tables': True,
+    'language': ["english", "french", "german"]
+})
+```
+
+### Stage 4: Execute or Dry Run
+
+In this final stage, you execute the workflow or perform a dry run:
+
+```python
+result = await workflow.execute()
+```
+
+Available methods:
+
+#### `execute(on_progress?)`
+Executes the workflow and returns the result.
+
+**Parameters:**
+- `on_progress: Callable[[int, int], None] | None` - Callback for progress updates (optional).
+
+**Returns:** `TypedWorkflowResult` - The workflow result.
+
+**Example:**
+```python
+# Execute the workflow with default options
+result = await workflow.execute()
+
+# Execute with progress tracking
+def progress_callback(current: int, total: int) -> None:
+    print(f'Processing step {current} of {total}')
+
+result = await workflow.execute(on_progress=progress_callback)
+```
+
+#### `dry_run()`
+Performs a dry run of the workflow without generating the final output. This is useful for validating the workflow configuration and estimating processing time.
+
+**Returns:** `WorkflowDryRunResult` - The dry run result, containing validation information and estimated processing time.
+ +**Example:** +```python +# Perform a dry run with default options +dry_run_result = await (workflow + .add_file_part('/path/to/document.pdf') + .output_pdf() + .dry_run()) +``` + +### Workflow Examples + +#### Basic Document Conversion + +```python +result = await (client + .workflow() + .add_file_part('document.docx') + .output_pdf() + .execute()) +``` + +#### Document Merging with Watermark + +```python +result = await (client + .workflow() + .add_file_part('document1.pdf') + .add_file_part('document2.pdf') + .apply_action(BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 48 +})) + .output_pdf() + .execute()) +``` + +#### OCR with Language Selection + +```python +result = await (client + .workflow() + .add_file_part('scanned-document.pdf') + .apply_action(BuildActions.ocr({ + 'language': 'english', + 'enhanceResolution': True + })) + .output_pdf() + .execute()) +``` + +#### HTML to PDF Conversion + +```python +result = await (client + .workflow() + .add_html_part('index.html', None, { + 'layout': { + 'size': 'A4', + 'margin': { + 'top': 50, + 'bottom': 50, + 'left': 50, + 'right': 50 + } + } + }) + .output_pdf() + .execute()) +``` + +#### Complex Multi-step Workflow + +```python +def progress_callback(current: int, total: int) -> None: + print(f'Processing step {current} of {total}') + + +result = await (client + .workflow() + .add_file_part('document.pdf', {'pages': {'start': 0, 'end': 5}}) + .add_file_part('appendix.pdf') + .apply_actions([ + BuildActions.ocr({'language': 'english'}), + BuildActions.watermark_text('CONFIDENTIAL'), + BuildActions.create_redactions_preset('email-address', 'apply') +]) + .output_pdfa({ + 'level': 'pdfa-2b', + 'optimize': { + 'mrcCompression': True + } +}) + .execute(on_progress=progress_callback)) +``` + +### Staged Workflow Builder + +For more complex scenarios where you need to build workflows dynamically, you can use the staged workflow builder: + +```python +# Create a staged workflow +workflow = 
client.workflow() + +# Add parts +workflow.add_file_part('document.pdf') + +# Conditionally add more parts +if include_appendix: + workflow.add_file_part('appendix.pdf') + +# Conditionally apply actions +if needs_watermark: + workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL')) + +# Set output format based on user preference +if output_format == 'pdf': + workflow.output_pdf() +elif output_format == 'docx': + workflow.output_office('docx') +else: + workflow.output_image('png') + +# Execute the workflow +result = await workflow.execute() +``` + +### Error Handling in Workflows + +Workflows provide detailed error information: + +```python +try: + result = await (client + .workflow() + .add_file_part('document.pdf') + .output_pdf() + .execute()) + + if not result['success']: + # Handle workflow errors + for error in result.get('errors', []): + print(f"Step {error['step']}: {error['error']['message']}") +except Exception as error: + # Handle unexpected errors + print(f'Workflow execution failed: {error}') +``` + +### Workflow Result Structure + +The result of a workflow execution includes: + +```python +from typing import TypedDict, Any, List, Optional, Union + +class WorkflowError(TypedDict): + step: str + error: dict[str, Any] + +class BufferOutput(TypedDict): + mimeType: str + filename: str + buffer: bytes + +class ContentOutput(TypedDict): + mimeType: str + filename: str + content: str + +class JsonContentOutput(TypedDict): + mimeType: str + filename: str + data: Any + +class WorkflowResult(TypedDict): + # Overall success status + success: bool + + # Output data (if successful) + output: Optional[Union[BufferOutput, ContentOutput, JsonContentOutput]] + + # Error information (if failed) + errors: Optional[List[WorkflowError]] +``` + +### Performance Considerations + +For optimal performance with workflows: + +1. **Minimize the number of parts**: Combine related files when possible +2. 
**Use appropriate output formats**: Choose formats based on your needs +3. **Consider dry runs**: Use `dry_run()` to estimate resource usage +4. **Monitor progress**: Use the `on_progress` callback for long-running workflows +5. **Handle large files**: For very large files, consider splitting into smaller workflows diff --git a/README.md b/README.md index 3bf020a..b12a086 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,26 @@ # Nutrient DWS Python Client -[![Python](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) -[![Coverage](https://img.shields.io/badge/coverage-92%25-brightgreen.svg)](https://github.com/jdrhyne/nutrient-dws-client-python/actions) +[![PyPI version](https://badge.fury.io/py/nutrient-dws.svg)](https://badge.fury.io/py/nutrient-dws) +[![CI](https://github.com/PSPDFKit/nutrient-dws-client-python/actions/workflows/ci.yml/badge.svg)](https://github.com/PSPDFKit/nutrient-dws-client-python/actions/workflows/ci.yml) +[![Integration Tests](https://github.com/PSPDFKit/nutrient-dws-client-python/actions/workflows/integration-tests.yml/badge.svg)](https://github.com/PSPDFKit/nutrient-dws-client-python/actions/workflows/integration-tests.yml) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -[![PyPI version](https://img.shields.io/pypi/v/nutrient-dws.svg)](https://pypi.org/project/nutrient-dws/) -A Python client library for the [Nutrient Document Web Services (DWS) API](https://www.nutrient.io/). This library provides a Pythonic interface to interact with Nutrient's document processing services, supporting both Direct API calls and Builder API workflows. +A Python client library for [Nutrient Document Web Services (DWS) API](https://nutrient.io/). 
This library provides a fully async, type-safe, and ergonomic interface for document processing operations including conversion, merging, compression, watermarking, OCR, and text extraction. + +> **Note**: This package is published as `nutrient-dws` on PyPI. The package provides full type support and is designed for async Python environments (Python 3.10+). ## Features -- 🚀 **Two API styles**: Direct API for single operations, Builder API for complex workflows -- 📄 **Comprehensive document tools**: Convert, merge, rotate, OCR, watermark, and more -- 🔄 **Automatic retries**: Built-in retry logic for transient failures -- 📁 **Flexible file handling**: Support for file paths, bytes, and file-like objects -- 🔒 **Type-safe**: Full type hints for better IDE support -- ⚡ **Streaming support**: Memory-efficient processing of large files -- 🧪 **Well-tested**: Comprehensive test suite with high coverage +- 📄 **Powerful document processing**: Convert, OCR, edit, compress, watermark, redact, and digitally sign documents +- 🤖 **LLM friendly**: Built-in support for popular Coding Agents (Claude Code, GitHub Copilot, JetBrains Junie, Cursor, Windsurf) with auto-generated rules +- 🔄 **100% mapping with DWS Processor API**: Complete coverage of all Nutrient DWS Processor API capabilities +- 🛠️ **Convenient functions with sane defaults**: Simple interfaces for common operations with smart default settings +- ⛓️ **Chainable operations**: Build complex document workflows with intuitive method chaining +- 🚀 **Fully async**: Built from the ground up with async/await support for optimal performance +- 🔐 **Flexible authentication and security**: Support for API keys and async token providers with secure handling +- ✅ **Highly tested**: Comprehensive test suite ensuring reliability and stability +- 🔒 **Type-safe**: Full type annotations with comprehensive type definitions +- 🐍 **Pythonic**: Follows Python conventions and best practices ## Installation @@ -24,305 +28,219 @@ A Python client 
library for the [Nutrient Document Web Services (DWS) API](https pip install nutrient-dws ``` -## Quick Start - -```python -from nutrient_dws import NutrientClient - -# Initialize the client -client = NutrientClient(api_key="your-api-key") -# Direct API - Flatten PDF annotations -client.flatten_annotations( - input_file="document.pdf", - output_path="flattened.pdf" -) +## Integration with Coding Agents -# Builder API - Chain multiple operations -client.build(input_file="document.pdf") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .add_step("ocr-pdf", {"language": "en"}) \ - .add_step("watermark-pdf", {"text": "CONFIDENTIAL"}) \ - .execute(output_path="processed.pdf") -``` +This package has built-in support for popular coding agents like Claude Code, GitHub Copilot, Cursor, and Windsurf by exposing scripts that will inject rules instructing the coding agents on how to use the package. This ensures that the coding agent doesn't hallucinate documentation, as well as making full use of all the features offered by the Nutrient DWS Python Client. -## Authentication +```bash +# Adding code rule to Claude Code +dws-add-claude-code-rule -The client supports API key authentication through multiple methods: +# Adding code rule to GitHub Copilot +dws-add-github-copilot-rule -```python -# 1. Pass directly to client -client = NutrientClient(api_key="your-api-key") +# Adding code rule to Junie (Jetbrains) +dws-add-junie-rule -# 2. Set environment variable -# export NUTRIENT_API_KEY=your-api-key -client = NutrientClient() # Will use env variable +# Adding code rule to Cursor +dws-add-cursor-rule -# 3.
Use context manager for automatic cleanup -with NutrientClient(api_key="your-api-key") as client: - client.convert_to_pdf("document.docx") +# Adding code rule to Windsurf +dws-add-windsurf-rule ``` -## Direct API Examples +The documentation for Nutrient DWS Python Client is also available on [Context7](https://context7.com/pspdfkit/nutrient-dws-client-python) -### Flatten Annotations +## Quick Start ```python -# Flatten all annotations and form fields -client.flatten_annotations( - input_file="form.pdf", - output_path="flattened.pdf" -) -``` - -### Merge PDFs +from nutrient_dws import NutrientClient -```python -# Merge multiple PDFs -client.merge_pdfs( - input_files=["doc1.pdf", "doc2.pdf", "doc3.pdf"], - output_path="merged.pdf" -) +client = NutrientClient(api_key='your_api_key') ``` -### OCR PDF - -```python -# Add OCR layer to scanned PDF -client.ocr_pdf( - input_file="scanned.pdf", - output_path="searchable.pdf", - language="en" -) -``` +## Direct Methods -### Rotate Pages +The client provides numerous async methods for document processing: ```python -# Rotate all pages -client.rotate_pages( - input_file="document.pdf", - output_path="rotated.pdf", - degrees=180 -) - -# Rotate specific pages -client.rotate_pages( - input_file="document.pdf", - output_path="rotated.pdf", - degrees=90, - page_indexes=[0, 2, 4] # Pages 1, 3, and 5 -) -``` - -### Watermark PDF +import asyncio +from nutrient_dws import NutrientClient -```python -# Add text watermark (width/height required) -client.watermark_pdf( - input_file="document.pdf", - output_path="watermarked.pdf", - text="DRAFT", - width=200, - height=100, - opacity=0.5, - position="center" -) +async def main(): + client = NutrientClient(api_key='your_api_key') -# Add image watermark from URL -client.watermark_pdf( - input_file="document.pdf", - output_path="watermarked.pdf", - image_url="https://example.com/logo.png", - width=150, - height=75, - opacity=0.8, - position="bottom-right" -) + # Convert a document + pdf_result 
= await client.convert('document.docx', 'pdf') -# Add image watermark from local file (NEW!) -client.watermark_pdf( - input_file="document.pdf", - output_path="watermarked.pdf", - image_file="logo.png", # Can be path, bytes, or file-like object - width=150, - height=75, - opacity=0.8, - position="bottom-right" -) -``` + # Extract text + text_result = await client.extract_text('document.pdf') -## Builder API Examples + # Add a watermark + watermarked_doc = await client.watermark_text('document.pdf', 'CONFIDENTIAL') -The Builder API allows you to chain multiple operations in a single workflow: + # Merge multiple documents + merged_pdf = await client.merge(['doc1.pdf', 'doc2.pdf', 'doc3.pdf']) -```python -# Complex document processing pipeline -result = client.build(input_file="raw-scan.pdf") \ - .add_step("ocr-pdf", {"language": "en"}) \ - .add_step("rotate-pages", {"degrees": -90, "page_indexes": [0]}) \ - .add_step("watermark-pdf", { - "text": "PROCESSED", - "opacity": 0.3, - "position": "top-right" - }) \ - .add_step("flatten-annotations") \ - .set_output_options( - metadata={"title": "Processed Document", "author": "DWS Client"}, - optimize=True - ) \ - .execute(output_path="final.pdf") - -# Using image file in builder API -result = client.build(input_file="document.pdf") \ - .add_step("watermark-pdf", { - "image_file": "company-logo.png", # Local file - "width": 100, - "height": 50, - "opacity": 0.5, - "position": "bottom-left" - }) \ - .execute() +asyncio.run(main()) ``` -## File Input Options - -The library supports multiple ways to provide input files: - -```python -# File path (string or Path object) -client.convert_to_pdf("document.docx") -client.convert_to_pdf(Path("document.docx")) +For a complete list of available methods with examples, see the [Methods Documentation](docs/METHODS.md). 
-# Bytes -with open("document.docx", "rb") as f: - file_bytes = f.read() -client.convert_to_pdf(file_bytes) +## Workflow System -# File-like object -with open("document.docx", "rb") as f: - client.convert_to_pdf(f) +The client also provides a fluent builder pattern with staged interfaces to create document processing workflows: -# URL (for supported operations) -client.import_from_url("https://example.com/document.pdf") -``` +```python +from nutrient_dws.builder.constant import BuildActions + +async def main(): + client = NutrientClient(api_key='your_api_key') + + result = await (client + .workflow() + .add_file_part('document.pdf') + .add_file_part('appendix.pdf') + .apply_action(BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 48 + })) + .output_pdf({ + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 2 + } + }) + .execute()) + +asyncio.run(main()) +``` + +The workflow system follows a staged approach: +1. Add document parts (files, HTML, pages) +2. Apply actions (optional) +3. Set output format +4. Execute or perform a dry run + +For detailed information about the workflow system, including examples and best practices, see the [Workflow Documentation](docs/WORKFLOW.md). 
## Error Handling -The library provides specific exceptions for different error scenarios: +The library provides a comprehensive error hierarchy: ```python from nutrient_dws import ( + NutrientClient, NutrientError, - AuthenticationError, - APIError, ValidationError, - TimeoutError, - FileProcessingError + APIError, + AuthenticationError, + NetworkError ) -try: - client.convert_to_pdf("document.docx") -except AuthenticationError: - print("Invalid API key") -except ValidationError as e: - print(f"Invalid parameters: {e.errors}") -except APIError as e: - print(f"API error: {e.status_code} - {e.message}") -except TimeoutError: - print("Request timed out") -except FileProcessingError as e: - print(f"File processing failed: {e}") -``` - -## Advanced Configuration +async def main(): + client = NutrientClient(api_key='your_api_key') -### Custom Timeout + try: + result = await client.convert('file.docx', 'pdf') + except ValidationError as error: + # Invalid input parameters + print(f'Invalid input: {error.message} - Details: {error.details}') + except AuthenticationError as error: + # Authentication failed + print(f'Auth error: {error.message} - Status: {error.status_code}') + except APIError as error: + # API returned an error + print(f'API error: {error.message} - Status: {error.status_code} - Details: {error.details}') + except NetworkError as error: + # Network request failed + print(f'Network error: {error.message} - Details: {error.details}') -```python -# Set timeout to 10 minutes for large files -client = NutrientClient(api_key="your-api-key", timeout=600) +asyncio.run(main()) ``` -### Streaming Large Files - -Files larger than 10MB are automatically streamed to avoid memory issues: - -```python -# This will stream the file instead of loading it into memory -client.flatten_annotations("large-document.pdf") -``` +## Testing -## Available Operations +The library includes comprehensive unit and integration tests: -### PDF Manipulation -- `merge_pdfs` - Merge multiple 
PDFs into one -- `rotate_pages` - Rotate PDF pages (all or specific pages) -- `flatten_annotations` - Flatten form fields and annotations +```bash +# Run all tests +python -m pytest -### PDF Enhancement -- `ocr_pdf` - Add searchable text layer (English and German) -- `watermark_pdf` - Add text or image watermarks +# Run with coverage report +python -m pytest --cov=nutrient_dws --cov-report=html -### PDF Security -- `apply_redactions` - Apply existing redaction annotations +# Run only unit tests +python -m pytest tests/unit/ -### Builder API -The Builder API allows chaining multiple operations: -```python -client.build(input_file="document.pdf") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .add_step("ocr-pdf", {"language": "english"}) \ - .add_step("watermark-pdf", {"text": "DRAFT", "width": 200, "height": 100}) \ - .execute(output_path="processed.pdf") +# Run integration tests (requires API key) +NUTRIENT_API_KEY=your_key python -m pytest tests/test_integration.py ``` -Note: See [SUPPORTED_OPERATIONS.md](SUPPORTED_OPERATIONS.md) for detailed documentation of all supported operations and their parameters. +The library maintains high test coverage across all API methods, including: +- Unit tests for all public methods +- Integration tests for real API interactions +- Type checking with mypy ## Development -### Setup +For development, install the package in development mode: ```bash # Clone the repository -git clone https://github.com/jdrhyne/nutrient-dws-client-python.git +git clone https://github.com/PSPDFKit/nutrient-dws-client-python.git cd nutrient-dws-client-python # Install in development mode pip install -e ".[dev]" -# Run tests -pytest +# Run type checking +mypy src/ # Run linting -ruff check . +ruff check src/ -# Run type checking -mypy src tests +# Run formatting +ruff format src/ ``` -### Running Tests +## Contributing -```bash -# Run all tests -pytest +We welcome contributions to improve the library! 
Please follow our development standards to ensure code quality and maintainability. -# Run with coverage -pytest --cov=nutrient --cov-report=html +Quick start for contributors: -# Run specific test file -pytest tests/unit/test_client.py -``` +1. Clone and setup the repository +2. Make changes following atomic commit practices +3. Use conventional commits for clear change history +4. Include appropriate tests for new features +5. Ensure type checking passes with mypy +6. Follow Python code style with ruff -## Contributing +For detailed contribution guidelines, see the [Contributing Guide](docs/CONTRIBUTING.md). + +## Project Structure -Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change. +``` +src/ +├── nutrient_dws/ +│ ├── builder/ # Builder classes and constants +│ ├── generated/ # Generated type definitions +│ ├── types/ # Type definitions +│ ├── client.py # Main NutrientClient class +│ ├── errors.py # Error classes +│ ├── http.py # HTTP layer +│ ├── inputs.py # Input handling +│ ├── workflow.py # Workflow factory +│ └── __init__.py # Public exports +├── scripts/ # CLI scripts for coding agents +└── tests/ # Test files +``` -1. Fork the repository -2. Create your feature branch (`git checkout -b feature/amazing-feature`) -3. Commit your changes (`git commit -m 'Add some amazing feature'`) -4. Push to the branch (`git push origin feature/amazing-feature`) -5. Open a Pull Request +## Python Version Support + +This library supports Python 3.10 and higher. The async-first design requires modern Python features for optimal performance and type safety. 
## License @@ -330,6 +248,6 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file ## Support -- 📧 Email: support@nutrient.io -- 📚 Documentation: https://www.nutrient.io/docs/ -- 🐛 Issues: https://github.com/jdrhyne/nutrient-dws-client-python/issues \ No newline at end of file +For issues and feature requests, please use the [GitHub issue tracker](https://github.com/PSPDFKit/nutrient-dws-client-python/issues). + +For questions about the Nutrient DWS Processor API, refer to the [official documentation](https://nutrient.io/docs/). diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md deleted file mode 100644 index b916875..0000000 --- a/RELEASE_NOTES.md +++ /dev/null @@ -1,118 +0,0 @@ -# Release Notes - v1.0.0 - -**Release Date**: June 17, 2024 - -We are excited to announce the first release of the official Python client library for Nutrient Document Web Services (DWS) API! This library provides a comprehensive, Pythonic interface for document processing operations including PDF manipulation, OCR, watermarking, and more. - -## 🎉 Highlights - -### Dual API Design -The library offers two complementary ways to interact with the Nutrient API: - -1. **Direct API** - Simple method calls for single operations -2. **Builder API** - Fluent interface for complex, multi-step workflows - -### Automatic Office Document Conversion -A major discovery during development: the Nutrient API automatically converts Office documents (DOCX, XLSX, PPTX) to PDF when processing them. 
This means you can: -- Apply any PDF operation directly to Office documents -- Mix PDFs and Office documents in merge operations -- Skip explicit conversion steps in your workflows - -### Enterprise-Ready Features -- **Robust Error Handling**: Comprehensive exception hierarchy for different error scenarios -- **Automatic Retries**: Built-in retry logic for transient failures -- **Connection Pooling**: Optimized performance for multiple requests -- **Large File Support**: Automatic streaming for files over 10MB -- **Type Safety**: Full type hints for better IDE support - -## 📦 Installation - -```bash -pip install nutrient-dws -``` - -## 🚀 Quick Start - -```python -from nutrient_dws import NutrientClient - -# Initialize client -client = NutrientClient(api_key="your-api-key") - -# Direct API - Single operation -client.rotate_pages("document.pdf", output_path="rotated.pdf", degrees=90) - -# Convert Office document to PDF (automatic!) -client.convert_to_pdf("report.docx", output_path="report.pdf") - -# Builder API - Complex workflow -client.build(input_file="scan.pdf") \ - .add_step("ocr-pdf", {"language": "english"}) \ - .add_step("watermark-pdf", {"text": "CONFIDENTIAL"}) \ - .add_step("flatten-annotations") \ - .execute(output_path="processed.pdf") - -# Merge PDFs and Office documents together -client.merge_pdfs([ - "chapter1.pdf", - "chapter2.docx", - "appendix.xlsx" -], output_path="complete_document.pdf") -``` - -## 🔧 Supported Operations - -- **convert_to_pdf** - Convert Office documents to PDF -- **flatten_annotations** - Flatten form fields and annotations -- **rotate_pages** - Rotate specific or all pages -- **ocr_pdf** - Make scanned PDFs searchable (English & German) -- **watermark_pdf** - Add text or image watermarks -- **apply_redactions** - Apply redaction annotations -- **merge_pdfs** - Combine multiple documents - -## 🛡️ Error Handling - -The library provides specific exceptions for different error scenarios: - -```python -from nutrient_dws import 
NutrientClient, AuthenticationError, ValidationError - -try: - client = NutrientClient(api_key="your-api-key") - result = client.ocr_pdf("scan.pdf") -except AuthenticationError: - print("Invalid API key") -except ValidationError as e: - print(f"Invalid parameters: {e.errors}") -``` - -## 📚 Documentation - -- [README](https://github.com/jdrhyne/nutrient-dws-client-python/blob/main/README.md) - Getting started guide -- [SUPPORTED_OPERATIONS](https://github.com/jdrhyne/nutrient-dws-client-python/blob/main/SUPPORTED_OPERATIONS.md) - Detailed operation documentation -- [API Reference](https://nutrient-dws-client-python.readthedocs.io) - Coming soon! - -## 🧪 Quality Assurance - -- **Test Coverage**: 92.46% with 82 unit tests -- **Type Checking**: Full mypy compliance -- **Code Quality**: Enforced with ruff and pre-commit hooks -- **CI/CD**: Automated testing on Python 3.8-3.12 - -## 🤝 Contributing - -We welcome contributions! Please see our [Contributing Guidelines](https://github.com/jdrhyne/nutrient-dws-client-python/blob/main/CONTRIBUTING.md) for details. - -## 📝 License - -This project is licensed under the MIT License. - -## 🙏 Acknowledgments - -Special thanks to the Nutrient team for their excellent API and documentation. - ---- - -**Note**: This is the initial release. We're actively working on additional features including more language support for OCR, additional file format support, and performance optimizations. Stay tuned! - -For questions or support, please [open an issue](https://github.com/jdrhyne/nutrient-dws-client-python/issues). \ No newline at end of file diff --git a/RELEASE_PROCESS.md b/RELEASE_PROCESS.md deleted file mode 100644 index a977ce5..0000000 --- a/RELEASE_PROCESS.md +++ /dev/null @@ -1,66 +0,0 @@ -# Release Process - -This document describes how to release a new version of nutrient-dws to PyPI using GitHub's trusted publishing. - -## Prerequisites - -1. PyPI account with maintainer access to nutrient-dws -2. 
GitHub repository configured as a trusted publisher on PyPI -3. Write access to the GitHub repository - -## Automatic Release Process (Recommended) - -### For New Releases - -1. Update version in `pyproject.toml` -2. Update `CHANGELOG.md` with release notes -3. Commit changes: `git commit -m "chore: prepare release v1.0.x"` -4. Create and push tag: `git tag v1.0.x && git push origin v1.0.x` -5. Create GitHub release: - - Go to https://github.com/PSPDFKit/nutrient-dws-client-python/releases/new - - Select the tag you just created - - Add release notes - - Click "Publish release" -6. The `Release` workflow will automatically trigger and upload to PyPI - -### For Existing Tags (like v1.0.2) - -1. Go to Actions tab in GitHub -2. Select "Publish Existing Tag to PyPI" workflow -3. Click "Run workflow" -4. Enter the tag name (e.g., `v1.0.2`) -5. Click "Run workflow" -6. Monitor the workflow progress - -## Manual Trigger - -You can also manually trigger the release workflow: -1. Go to Actions tab -2. Select "Release" workflow -3. Click "Run workflow" -4. Select branch/tag and run - -## Verification - -After publishing: -1. Check PyPI: https://pypi.org/project/nutrient-dws/ -2. Test installation: `pip install nutrient-dws==1.0.x` -3. 
Verify the GitHub release page shows the release - -## Troubleshooting - -### Trusted Publisher Issues -- Ensure the GitHub repository is configured as a trusted publisher on PyPI -- Check that the workflow has `id-token: write` permission -- Verify the PyPI project name matches exactly - -### Build Issues -- Ensure `pyproject.toml` is valid -- Check that all required files are present -- Verify Python version compatibility - -## Security Notes - -- No API tokens or passwords are needed with trusted publishing -- GitHub Actions uses OIDC to authenticate with PyPI -- This is more secure than storing PyPI tokens as secrets \ No newline at end of file diff --git a/SPECIFICATION.md b/SPECIFICATION.md deleted file mode 100644 index b67cb13..0000000 --- a/SPECIFICATION.md +++ /dev/null @@ -1,170 +0,0 @@ -# Software Design Specification: Nutrient DWS Python Client -Version: 1.2 -Date: December 19, 2024 - -## 1. Introduction -### 1.1. Project Overview -This document outlines the software design specification for a new Python client library for the Nutrient Document Web Services (DWS) API. The goal of this project is to create a high-quality, lightweight, and intuitive Python package that simplifies interaction with the Nutrient DWS API for developers. - -The library will provide two primary modes of interaction: -- A **Direct API** for executing single, discrete document processing tasks (e.g., converting a single file, rotating a page). -- A **Builder API** that offers a fluent, chainable interface for composing and executing complex, multi-step document processing workflows, abstracting the `POST /build` endpoint of the Nutrient API. - -The final product will be a distributable package suitable for publishing on PyPI, with comprehensive documentation. The design prioritizes ease of use, adherence to Python best practices, and clear documentation consumable by both humans and LLMs. - -### 1.2. 
Scope -This specification covers the implemented Python client library: -- Client authentication and configuration -- Direct API methods for common document operations -- Builder API for multi-step workflows -- Comprehensive error handling with custom exceptions -- Optimized file input/output handling -- Standard Python package structure - -Out of scope: -- Command-line interface (CLI) -- Asynchronous operations (all calls are synchronous) -- Non-Python implementations - -### 1.3. References -- **Nutrient DWS OpenAPI Specification**: https://dashboard.nutrient.io/assets/specs/public@1.9.0-dfc6ec1c1d008be3dcb81a72be6346b5.yml -- **Nutrient DWS API Documentation**: https://www.nutrient.io/api/reference/public/ -- **Nutrient DWS List of Tools**: https://www.nutrient.io/api/tools-overview/ -- **Target API Endpoint**: https://api.pspdfkit.com - -## 2. Goals and Objectives -- **Simplicity**: Clean, Pythonic interface abstracting HTTP requests, authentication, and file handling -- **Flexibility**: Direct API for single operations and Builder API for complex workflows -- **Lightweight**: Single external dependency on `requests` library -- **Performance**: Optimized file handling with streaming for large files (>10MB) -- **Distribution-Ready**: Standard Python package structure with `pyproject.toml` - -## 3. High-Level Architecture -The library is architected around a central `NutrientClient` class, which is the main entry point for all interactions. - -### 3.1. Core Components -**NutrientClient (The Main Client):** -- Handles initialization and configuration, including a configurable timeout for API calls. -- Manages the API key for authentication. All outgoing requests will include the `X-Api-Key` header. -- Contains static methods for the Direct API (e.g., `client.rotate_pages(...)`), which are derived from the OpenAPI specification. -- Acts as a factory for the Builder API via the `client.build()` method. 
- -**Direct API (Static Methods):** -- A collection of methods attached directly to the `NutrientClient` object. -- Each method corresponds to a specific tool available in the OpenAPI specification (e.g., `ocr_pdf`, `rotate_pages`). -- These methods abstract the `POST /process/{tool}` endpoint. They handle file preparation, making the request, and returning the processed file. - -**BuildAPIWrapper (Builder API):** -- A separate class, instantiated via `client.build()`. -- Implements the Builder design pattern with a fluent, chainable interface. -- The `execute()` method compiles the workflow into a `multipart/form-data` request for the `POST /build` endpoint, including a JSON part for actions and the necessary file parts. - -### 3.2. Data Flow -**Direct API Call:** -1. User calls method like `client.rotate_pages(input_file='path/to/doc.pdf', degrees=90)` -2. Method internally uses Builder API with single step -3. File is processed via `/build` endpoint -4. Returns processed file bytes or saves to `output_path` - -**Builder API Call:** -1. User chains operations: `client.build(input_file='doc.docx').add_step(tool='rotate-pages', options={'degrees': 90})` -2. `execute()` sends `multipart/form-data` request to `/build` endpoint -3. Returns processed file bytes or saves to `output_path` - -## 4. API Design -### 4.1. Client Initialization -```python -from nutrient_dws import NutrientClient, AuthenticationError - -# API key from parameter (takes precedence) or NUTRIENT_API_KEY env var -client = NutrientClient(api_key="YOUR_DWS_API_KEY", timeout=300) - -# Context manager support -with NutrientClient() as client: - result = client.convert_to_pdf("document.docx") -``` - -- **API Key**: Parameter takes precedence over `NUTRIENT_API_KEY` environment variable -- **Timeout**: Default 300 seconds, configurable per client -- **Error Handling**: `AuthenticationError` raised on first API call if key invalid - -### 4.2. 
File Handling -**Input Types**: -- `str` or `Path` for local file paths -- `bytes` objects -- File-like objects (`io.IOBase`) - -**Output Behavior**: -- Returns `bytes` by default -- Saves to `output_path` and returns `None` when path provided -- Large files (>10MB) use streaming to optimize memory usage - -### 4.3. Direct API Design -Method names are snake_case versions of operations. Tool-specific parameters are keyword-only arguments. - -**Example Usage:** -```python -# User Story: Convert a DOCX to PDF and rotate it. - -# Step 1: Convert DOCX to PDF -pdf_bytes = client.convert_to_pdf( - input_file="path/to/document.docx" -) - -# Step 2: Rotate the newly created PDF from memory -client.rotate_pages( - input_file=pdf_bytes, - output_path="path/to/rotated_document.pdf", - degrees=90 # keyword-only argument -) - -print("File saved to path/to/rotated_document.pdf") -``` - -### 4.4. Builder API Design -Fluent interface for multi-step workflows with single API call: - -- `client.build(input_file)`: Starts workflow -- `.add_step(tool, options=None)`: Adds processing step -- `.execute(output_path=None)`: Executes workflow -- `.set_output_options(**options)`: Sets output metadata/optimization - -**Example Usage:** -```python -from nutrient_dws import APIError - -# User Story: Convert a DOCX to PDF and rotate it (Builder version) -try: - client.build(input_file="path/to/document.docx") \ - .add_step(tool="rotate-pages", options={"degrees": 90}) \ - .execute(output_path="path/to/final_document.pdf") - - print("Workflow complete. File saved to path/to/final_document.pdf") - -except APIError as e: - print(f"An API error occurred: Status {e.status_code}, Response: {e.response_body}") -``` - -### 4.5. Error Handling -The library provides a comprehensive set of custom exceptions for clear error feedback: - -- `NutrientError(Exception)`: The base exception for all library-specific errors. 
-- `AuthenticationError(NutrientError)`: Raised on 401/403 HTTP errors, indicating an invalid or missing API key. -- `APIError(NutrientError)`: Raised for general API errors (e.g., 400, 422, 5xx status codes). Contains `status_code`, `response_body`, and optional `request_id` attributes. -- `ValidationError(NutrientError)`: Raised when request validation fails, with optional `errors` dictionary. -- `NutrientTimeoutError(NutrientError)`: Raised when requests timeout. -- `FileProcessingError(NutrientError)`: Raised when file processing operations fail. -- `FileNotFoundError` (Built-in): Standard Python exception for missing file paths. - -## 5. Implementation Details - -### 5.1. Package Structure -- **Layout**: Standard `src` layout with `nutrient_dws` package -- **Configuration**: `pyproject.toml` for project metadata and dependencies -- **Dependencies**: `requests` as sole runtime dependency -- **Versioning**: Semantic versioning starting at `1.0.0` - -### 5.2. File Handling Optimizations -- **Large Files**: Files >10MB are streamed rather than loaded into memory -- **Input Types**: Support for `str` paths, `bytes`, `Path` objects, and file-like objects -- **Output**: Returns `bytes` by default, or saves to `output_path` when provided diff --git a/SUPPORTED_OPERATIONS.md b/SUPPORTED_OPERATIONS.md deleted file mode 100644 index a86395c..0000000 --- a/SUPPORTED_OPERATIONS.md +++ /dev/null @@ -1,386 +0,0 @@ -# Supported Operations - -This document lists all operations currently supported by the Nutrient DWS API through this Python client. - -## 🎯 Important Discovery: Implicit Document Conversion - -The Nutrient DWS API automatically converts Office documents (DOCX, XLSX, PPTX) to PDF when processing them. This means: - -- **No explicit conversion needed** - Just pass your Office documents to any method -- **All methods accept Office documents** - `rotate_pages()`, `ocr_pdf()`, etc. 
work with DOCX files -- **Seamless operation chaining** - Convert and process in one API call - -### Example: -```python -# This automatically converts DOCX to PDF and rotates it! -client.rotate_pages("document.docx", degrees=90) - -# Merge PDFs and Office documents together -client.merge_pdfs(["file1.pdf", "file2.docx", "spreadsheet.xlsx"]) -``` - -## Direct API Methods - -The following methods are available on the `NutrientClient` instance: - -### 1. `convert_to_pdf(input_file, output_path=None)` -Converts Office documents to PDF format using implicit conversion. - -**Parameters:** -- `input_file`: Office document (DOCX, XLSX, PPTX) -- `output_path`: Optional path to save output - -**Example:** -```python -# Convert DOCX to PDF -client.convert_to_pdf("document.docx", "document.pdf") - -# Convert and get bytes -pdf_bytes = client.convert_to_pdf("spreadsheet.xlsx") -``` - -**Note:** HTML files are not currently supported. - -### 2. `flatten_annotations(input_file, output_path=None)` -Flattens all annotations and form fields in a PDF, converting them to static page content. - -**Parameters:** -- `input_file`: PDF or Office document -- `output_path`: Optional path to save output - -**Example:** -```python -client.flatten_annotations("document.pdf", "flattened.pdf") -# Works with Office docs too! -client.flatten_annotations("form.docx", "flattened.pdf") -``` - -### 3. `rotate_pages(input_file, output_path=None, degrees=0, page_indexes=None)` -Rotates pages in a PDF or converts Office document to PDF and rotates. - -**Parameters:** -- `input_file`: PDF or Office document -- `output_path`: Optional output path -- `degrees`: Rotation angle (90, 180, 270, or -90) -- `page_indexes`: Optional list of page indexes to rotate (0-based) - -**Example:** -```python -# Rotate all pages 90 degrees -client.rotate_pages("document.pdf", "rotated.pdf", degrees=90) - -# Works with Office documents too! 
-client.rotate_pages("presentation.pptx", "rotated.pdf", degrees=180) - -# Rotate specific pages -client.rotate_pages("document.pdf", "rotated.pdf", degrees=180, page_indexes=[0, 2]) -``` - -### 4. `ocr_pdf(input_file, output_path=None, language="english")` -Applies OCR to make a PDF searchable. Converts Office documents to PDF first if needed. - -**Parameters:** -- `input_file`: PDF or Office document -- `output_path`: Optional output path -- `language`: OCR language - supported values: - - `"english"` or `"eng"` - English - - `"deu"` or `"german"` - German - -**Example:** -```python -client.ocr_pdf("scanned.pdf", "searchable.pdf", language="english") -# Convert DOCX to searchable PDF -client.ocr_pdf("document.docx", "searchable.pdf", language="eng") -``` - -### 5. `watermark_pdf(input_file, output_path=None, text=None, image_url=None, width=200, height=100, opacity=1.0, position="center")` -Adds a watermark to all pages of a PDF. Converts Office documents to PDF first if needed. - -**Parameters:** -- `input_file`: PDF or Office document -- `output_path`: Optional output path -- `text`: Text for watermark (either text or image_url required) -- `image_url`: URL of image for watermark -- `width`: Width in points (required) -- `height`: Height in points (required) -- `opacity`: Opacity from 0.0 to 1.0 -- `position`: One of: "top-left", "top-center", "top-right", "center", "bottom-left", "bottom-center", "bottom-right" - -**Example:** -```python -# Text watermark -client.watermark_pdf( - "document.pdf", - "watermarked.pdf", - text="CONFIDENTIAL", - width=300, - height=150, - opacity=0.5, - position="center" -) -``` - -### 6. `apply_redactions(input_file, output_path=None)` -Applies redaction annotations to permanently remove content. Converts Office documents to PDF first if needed. 
- -**Parameters:** -- `input_file`: PDF or Office document with redaction annotations -- `output_path`: Optional output path - -**Example:** -```python -client.apply_redactions("document_with_redactions.pdf", "redacted.pdf") -``` - -### 7. `merge_pdfs(input_files, output_path=None)` -Merges multiple files into one PDF. Automatically converts Office documents to PDF before merging. - -**Parameters:** -- `input_files`: List of files to merge (PDFs and/or Office documents) -- `output_path`: Optional output path - -**Example:** -```python -# Merge PDFs only -client.merge_pdfs( - ["document1.pdf", "document2.pdf", "document3.pdf"], - "merged.pdf" -) - -# Mix PDFs and Office documents - they'll be converted automatically! -client.merge_pdfs( - ["report.pdf", "spreadsheet.xlsx", "presentation.pptx"], - "combined.pdf" -) -``` - -### 8. `split_pdf(input_file, page_ranges=None, output_paths=None)` -Splits a PDF into multiple documents by page ranges. - -**Parameters:** -- `input_file`: PDF file to split -- `page_ranges`: List of page range dictionaries with `start`/`end` keys (0-based indexing) -- `output_paths`: Optional list of paths to save output files - -**Returns:** -- List of PDF bytes for each split, or empty list if `output_paths` provided - -**Example:** -```python -# Split into custom ranges -parts = client.split_pdf( - "document.pdf", - page_ranges=[ - {"start": 0, "end": 4}, # Pages 1-5 - {"start": 5, "end": 9}, # Pages 6-10 - {"start": 10} # Pages 11 to end - ] -) - -# Save to specific files -client.split_pdf( - "document.pdf", - page_ranges=[{"start": 0, "end": 1}, {"start": 2}], - output_paths=["part1.pdf", "part2.pdf"] -) - -# Default behavior (extracts first page) -pages = client.split_pdf("document.pdf") -``` - -### 9. `duplicate_pdf_pages(input_file, page_indexes, output_path=None)` -Duplicates specific pages within a PDF document. - -**Parameters:** -- `input_file`: PDF file to process -- `page_indexes`: List of page indexes to include (0-based). 
Pages can be repeated for duplication. Negative indexes supported (-1 for last page) -- `output_path`: Optional path to save the output file - -**Returns:** -- Processed PDF as bytes, or None if `output_path` provided - -**Example:** -```python -# Duplicate first page twice, then include second page -result = client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[0, 0, 1] # Page 1, Page 1, Page 2 -) - -# Include last page at beginning and end -result = client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[-1, 0, 1, 2, -1] # Last, First, Second, Third, Last -) - -# Save to specific file -client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[0, 2, 1], # Reorder: Page 1, Page 3, Page 2 - output_path="reordered.pdf" -) -``` - -### 10. `delete_pdf_pages(input_file, page_indexes, output_path=None)` -Deletes specific pages from a PDF document. - -**Parameters:** -- `input_file`: PDF file to process -- `page_indexes`: List of page indexes to delete (0-based). Duplicates are automatically removed. -- `output_path`: Optional path to save the output file - -**Returns:** -- Processed PDF as bytes, or None if `output_path` provided - -**Note:** Negative page indexes are not currently supported. - -**Example:** -```python -# Delete first and third pages -result = client.delete_pdf_pages( - "document.pdf", - page_indexes=[0, 2] # Delete pages 1 and 3 (0-based indexing) -) - -# Delete specific pages with duplicates (duplicates ignored) -result = client.delete_pdf_pages( - "document.pdf", - page_indexes=[1, 3, 1, 5] # Effectively deletes pages 2, 4, and 6 -) - -# Save to specific file -client.delete_pdf_pages( - "document.pdf", - page_indexes=[0, 1], # Delete first two pages - output_path="trimmed_document.pdf" -) -``` - -### 11. `set_page_label(input_file, labels, output_path=None)` -Sets custom labels/numbering for specific page ranges in a PDF. - -**Parameters:** -- `input_file`: PDF file to process -- `labels`: List of label configurations. 
Each dict must contain: - - `pages`: Page range dict with `start` (required) and optionally `end` - - `label`: String label to apply to those pages - - Page ranges use 0-based indexing where `end` is inclusive. -- `output_path`: Optional path to save the output file - -**Returns:** -- Processed PDF as bytes, or None if `output_path` provided - -**Example:** -```python -# Set labels for different page ranges -client.set_page_label( - "document.pdf", - labels=[ - {"pages": {"start": 0, "end": 2}, "label": "Introduction"}, - {"pages": {"start": 3, "end": 9}, "label": "Chapter 1"}, - {"pages": {"start": 10}, "label": "Appendix"} - ], - output_path="labeled_document.pdf" -) - -# Set label for single page -client.set_page_label( - "document.pdf", - labels=[{"pages": {"start": 0, "end": 0}, "label": "Cover Page"}] -) -``` - -## Builder API - -The Builder API allows chaining multiple operations. Like the Direct API, it automatically converts Office documents to PDF when needed: - -```python -# Works with PDFs -client.build(input_file="document.pdf") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .add_step("ocr-pdf", {"language": "english"}) \ - .add_step("watermark-pdf", { - "text": "DRAFT", - "width": 200, - "height": 100, - "opacity": 0.3 - }) \ - .add_step("flatten-annotations") \ - .execute(output_path="processed.pdf") - -# Also works with Office documents! -client.build(input_file="report.docx") \ - .add_step("watermark-pdf", {"text": "CONFIDENTIAL", "width": 300, "height": 150}) \ - .add_step("flatten-annotations") \ - .execute(output_path="watermarked_report.pdf") - -# Setting page labels with Builder API -client.build(input_file="document.pdf") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .set_page_labels([ - {"pages": {"start": 0, "end": 2}, "label": "Introduction"}, - {"pages": {"start": 3}, "label": "Content"} - ]) \ - .execute(output_path="labeled_document.pdf") -``` - -### Supported Builder Actions - -1. 
**flatten-annotations** - No parameters required -2. **rotate-pages** - Parameters: `degrees`, `page_indexes` (optional) -3. **ocr-pdf** - Parameters: `language` -4. **watermark-pdf** - Parameters: `text` or `image_url`, `width`, `height`, `opacity`, `position` -5. **apply-redactions** - No parameters required - -### Builder Output Options - -The Builder API also supports setting output options: - -- **set_output_options()** - General output configuration (metadata, optimization, etc.) -- **set_page_labels()** - Set page labels for specific page ranges - -Example: -```python -client.build("document.pdf") \ - .add_step("rotate-pages", {"degrees": 90}) \ - .set_output_options(metadata={"title": "My Document"}) \ - .set_page_labels([{"pages": {"start": 0}, "label": "Chapter 1"}]) \ - .execute("output.pdf") -``` - -## API Limitations - -The following operations are **NOT** currently supported by the API: - -- HTML to PDF conversion (only Office documents are supported) -- PDF to image export -- Form filling -- Digital signatures -- Compression/optimization -- Linearization -- Creating redactions (only applying existing ones) -- Instant JSON annotations -- XFDF annotations - -## Language Support - -OCR currently supports: -- English (`"english"` or `"eng"`) -- German (`"deu"` or `"german"`) - -## File Input Types - -All methods accept files as: -- String paths: `"document.pdf"` -- Path objects: `Path("document.pdf")` -- Bytes: `b"...pdf content..."` -- File-like objects: `open("document.pdf", "rb")` - -## Error Handling - -Common exceptions: -- `AuthenticationError` - Invalid or missing API key -- `APIError` - General API errors with status code -- `ValidationError` - Invalid parameters -- `FileNotFoundError` - File not found -- `ValueError` - Invalid input values diff --git a/conftest.py b/conftest.py deleted file mode 100644 index 3fbb1b6..0000000 --- a/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Root conftest.py to ensure proper test configuration.""" - -import sys 
-from pathlib import Path - -# Add src to Python path for test discovery -src_path = Path(__file__).parent / "src" -if str(src_path) not in sys.path: - sys.path.insert(0, str(src_path)) diff --git a/context7.json b/context7.json new file mode 100644 index 0000000..20e8d2e --- /dev/null +++ b/context7.json @@ -0,0 +1,7 @@ +{ + "$schema": "https://context7.com/schema/context7.json", + "projectTitle": "Nutrient DWS Python Client", + "description": "Python client library for Nutrient Document Web Services (DWS) API.\n", + "excludeFolders": ["src", "docs", "example", ".github"], + "excludeFiles": ["README.md"] +} diff --git a/CONTRIBUTING.md b/docs/CONTRIBUTING.md similarity index 97% rename from CONTRIBUTING.md rename to docs/CONTRIBUTING.md index 4d98b11..2784f47 100644 --- a/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -70,7 +70,7 @@ By participating in this project, you agree to abide by our Code of Conduct: be 5. Run linting: ```bash - ruff check src/ tests/ + ruff check src/ ``` 6. Commit your changes: @@ -120,7 +120,7 @@ Example test: ```python def test_new_feature(): """Test description.""" - client = NutrientClient(api_key="test-key") + client = NutrientClient(api_key='your_api_key') result = client.new_feature() assert result == expected_value ``` @@ -146,4 +146,4 @@ def test_new_feature(): - Check existing documentation - Review closed issues and PRs -Thank you for contributing! \ No newline at end of file +Thank you for contributing! diff --git a/docs/METHODS.md b/docs/METHODS.md new file mode 100644 index 0000000..5873a9b --- /dev/null +++ b/docs/METHODS.md @@ -0,0 +1,882 @@ +# Nutrient DWS Python Client Methods + +This document provides detailed information about all the methods available in the Nutrient DWS Python Client. + +## Client Methods + +### NutrientClient + +The main client for interacting with the Nutrient DWS Processor API. 
+ +#### Constructor + +```python +NutrientClient(api_key: str | Callable[[], Awaitable[str] | str], base_url: str | None = None, timeout: int | None = None) +``` + +Parameters: +- `api_key` (required): Your API key string or async function returning a token +- `base_url` (optional): Custom API base URL (defaults to `https://api.nutrient.io`) +- `timeout` (optional): Request timeout in milliseconds + +#### Authentication + +Provide your API key directly: + +```python +from nutrient_dws import NutrientClient + +client = NutrientClient(api_key='your_api_key') +``` + +Or use an async token provider to fetch tokens from a secure source: + +```python +import httpx +from nutrient_dws import NutrientClient + +async def get_token(): + async with httpx.AsyncClient() as http_client: + response = await http_client.get('/api/get-nutrient-token') + data = response.json() + return data['token'] + +client = NutrientClient(api_key=get_token) +``` + + +#### Account Methods + +##### get_account_info() +Gets account information for the current API key. + +**Returns**: `AccountInfo` - Account information dictionary + +```python +account_info = await client.get_account_info() + +# Access subscription information +print(account_info['subscriptionType']) +``` + +##### create_token(params) +Creates a new authentication token. + +**Parameters**: +- `params: CreateAuthTokenParameters` - Parameters for creating the token + +**Returns**: `CreateAuthTokenResponse` - The created token information + +```python +token = await client.create_token({ + 'expirationTime': 3600 +}) +print(token['id']) + +# Store the token for future use +token_id = token['id'] +token_value = token['accessToken'] +``` + +##### delete_token(id) +Deletes an authentication token. 
+ +**Parameters**: +- `id: str` - ID of the token to delete + +**Returns**: `None` + +```python +await client.delete_token('token-id-123') + +# Example in a token management function +async def revoke_user_token(token_id: str) -> bool: + try: + await client.delete_token(token_id) + print(f'Token {token_id} successfully revoked') + return True + except Exception as error: + print(f'Failed to revoke token: {error}') + return False +``` + +#### Document Processing Methods + +##### sign(file, data?, options?) +Signs a PDF document. + +**Parameters**: +- `file: FileInput` - The PDF file to sign +- `data: CreateDigitalSignature | None` - Signature data (optional) +- `options: SignRequestOptions | None` - Additional options (image, graphicImage) (optional) + +**Returns**: `BufferOutput` - The signed PDF file output + +```python +result = await client.sign('document.pdf', { + 'signatureType': 'cms', + 'flatten': False, + 'cadesLevel': 'b-lt' +}) + +# Access the signed PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('signed-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### create_redactions_ai(file, criteria, redaction_state?, pages?, options?) +Uses AI to redact sensitive information in a document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to redact +- `criteria: str` - AI redaction criteria +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional pages to redact +- `options: RedactOptions | None` - Optional redaction options + +**Returns**: `BufferOutput` - The redacted document + +```python +# Stage redactions +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all emails' +) + +# Apply redactions immediately +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all PII', + 'apply' +) + +# Redact only specific pages +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all emails', + 'stage', + {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4 +) + +# Redact only the last 3 pages +result = await client.create_redactions_ai( + 'document.pdf', + 'Remove all PII', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) + +# Access the redacted PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('redacted-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### ocr(file, language) +Performs OCR (Optical Character Recognition) on a document. + +**Parameters**: +- `file: FileInput` - The input file to perform OCR on +- `language: OcrLanguage | list[OcrLanguage]` - The language(s) to use for OCR + +**Returns**: `BufferOutput` - The OCR result + +```python +result = await client.ocr('scanned-document.pdf', 'english') + +# Access the OCR-processed PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('ocr-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### watermark_text(file, text, options?) +Adds a text watermark to a document. 
+ +**Parameters**: +- `file: FileInput` - The input file to watermark +- `text: str` - The watermark text +- `options: dict[str, Any] | None` - Watermark options (optional) + +**Returns**: `BufferOutput` - The watermarked document + +```python +result = await client.watermark_text('document.pdf', 'CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 24 +}) + +# Access the watermarked PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('watermarked-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### watermark_image(file, image, options?) +Adds an image watermark to a document. + +**Parameters**: +- `file: FileInput` - The input file to watermark +- `image: FileInput` - The watermark image +- `options: ImageWatermarkActionOptions | None` - Watermark options (optional) + +**Returns**: `BufferOutput` - The watermarked document + +```python +result = await client.watermark_image('document.pdf', 'watermark.jpg', { + 'opacity': 0.5, + 'width': {'value': 50, 'unit': "%"}, + 'height': {'value': 50, 'unit': "%"} +}) + +# Access the watermarked PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('image-watermarked-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### convert(file, target_format) +Converts a document to a different format. 
+ +**Parameters**: +- `file: FileInput` - The input file to convert +- `target_format: OutputFormat` - The target format to convert to + +**Returns**: `BufferOutput | ContentOutput | JsonContentOutput` - The specific output type based on the target format + +```python +# Convert DOCX to PDF +pdf_result = await client.convert('document.docx', 'pdf') +# Supports formats: pdf, pdfa, pdfua, docx, xlsx, pptx, png, jpeg, jpg, webp, html, markdown + +# Access the PDF buffer +pdf_buffer = pdf_result['buffer'] +print(pdf_result['mimeType']) # 'application/pdf' + +# Save the PDF +with open('converted-document.pdf', 'wb') as f: + f.write(pdf_buffer) + +# Convert PDF to image +image_result = await client.convert('document.pdf', 'png') + +# Access the PNG buffer +png_buffer = image_result['buffer'] +print(image_result['mimeType']) # 'image/png' + +# Save the image +with open('document-page.png', 'wb') as f: + f.write(png_buffer) +``` + +##### merge(files) +Merges multiple documents into one. + +**Parameters**: +- `files: list[FileInput]` - The files to merge + +**Returns**: `BufferOutput` - The merged document + +```python +result = await client.merge([ + 'doc1.pdf', + 'doc2.pdf', + 'doc3.pdf' +]) + +# Access the merged PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('merged-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### extract_text(file, pages?) +Extracts text content from a document. 
+ +**Parameters**: +- `file: FileInput` - The file to extract text from +- `pages: PageRange | None` - Optional page range to extract text from + +**Returns**: `JsonContentOutput` - The extracted text data + +```python +result = await client.extract_text('document.pdf') + +# Extract text from specific pages +result = await client.extract_text('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract text from the last page +result = await client.extract_text('document.pdf', {'end': -1}) # Last page + +# Extract text from the second-to-last page to the end +result = await client.extract_text('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted text content +text_content = result['data']['pages'][0]['plainText'] + +# Process the extracted text +word_count = len(text_content.split()) +print(f'Document contains {word_count} words') + +# Search for specific content +if 'confidential' in text_content: + print('Document contains confidential information') +``` + +##### extract_table(file, pages?) +Extracts table content from a document. 
+ +**Parameters**: +- `file: FileInput` - The file to extract tables from +- `pages: PageRange | None` - Optional page range to extract tables from + +**Returns**: `JsonContentOutput` - The extracted table data + +```python +result = await client.extract_table('document.pdf') + +# Extract tables from specific pages +result = await client.extract_table('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract tables from the last page +result = await client.extract_table('document.pdf', {'end': -1}) # Last page + +# Extract tables from the second-to-last page to the end +result = await client.extract_table('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted tables +tables = result['data']['pages'][0]['tables'] + +# Process the first table if available +if tables and len(tables) > 0: + first_table = tables[0] + + # Get table dimensions + print(f"Table has {len(first_table['rows'])} rows and {len(first_table['columns'])} columns") + + # Access table cells + for i in range(len(first_table['rows'])): + for j in range(len(first_table['columns'])): + cell = next((cell for cell in first_table['cells'] + if cell['rowIndex'] == i and cell['columnIndex'] == j), None) + cell_content = cell['text'] if cell else '' + print(f"Cell [{i}][{j}]: {cell_content}") + + # Convert table to CSV + csv_content = '' + for i in range(len(first_table['rows'])): + row_data = [] + for j in range(len(first_table['columns'])): + cell = next((cell for cell in first_table['cells'] + if cell['rowIndex'] == i and cell['columnIndex'] == j), None) + row_data.append(cell['text'] if cell else '') + csv_content += ','.join(row_data) + '\n' + print(csv_content) +``` + +##### extract_key_value_pairs(file, pages?) +Extracts key value pair content from a document. 
+ +**Parameters**: +- `file: FileInput` - The file to extract KVPs from +- `pages: PageRange | None` - Optional page range to extract KVPs from + +**Returns**: `JsonContentOutput` - The extracted KVPs data + +```python +result = await client.extract_key_value_pairs('document.pdf') + +# Extract KVPs from specific pages +result = await client.extract_key_value_pairs('document.pdf', {'start': 0, 'end': 2}) # Pages 0, 1, 2 + +# Extract KVPs from the last page +result = await client.extract_key_value_pairs('document.pdf', {'end': -1}) # Last page + +# Extract KVPs from the second-to-last page to the end +result = await client.extract_key_value_pairs('document.pdf', {'start': -2}) # Second-to-last and last page + +# Access the extracted key-value pairs +kvps = result['data']['pages'][0]['keyValuePairs'] + +# Process the key-value pairs +if kvps and len(kvps) > 0: + # Iterate through all key-value pairs + for index, kvp in enumerate(kvps): + print(f'KVP {index + 1}:') + print(f' Key: {kvp["key"]}') + print(f' Value: {kvp["value"]}') + print(f' Confidence: {kvp["confidence"]}') + + # Create a dictionary from the key-value pairs + dictionary = {} + for kvp in kvps: + dictionary[kvp['key']] = kvp['value'] + + # Look up specific values + print(f'Invoice Number: {dictionary.get("Invoice Number")}') + print(f'Date: {dictionary.get("Date")}') + print(f'Total Amount: {dictionary.get("Total")}') +``` + +##### flatten(file, annotation_ids?) +Flattens annotations in a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to flatten +- `annotation_ids: list[str | int] | None` - Optional specific annotation IDs to flatten + +**Returns**: `BufferOutput` - The flattened document + +```python +# Flatten all annotations +result = await client.flatten('annotated-document.pdf') + +# Flatten specific annotations by ID +result = await client.flatten('annotated-document.pdf', ['annotation1', 'annotation2']) +``` + +##### password_protect(file, user_password, owner_password, permissions?) +Password protects a PDF document. + +**Parameters**: +- `file: FileInput` - The file to protect +- `user_password: str` - Password required to open the document +- `owner_password: str` - Password required to modify the document +- `permissions: list[PDFUserPermission] | None` - Optional array of permissions granted when opened with user password + +**Returns**: `BufferOutput` - The password-protected document + +```python +result = await client.password_protect('document.pdf', 'user123', 'owner456') + +# Or with specific permissions: +result = await client.password_protect('document.pdf', 'user123', 'owner456', + ['printing', 'extract_accessibility']) + +# Access the password-protected PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('protected-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### set_metadata(file, metadata) +Sets metadata for a PDF document. + +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `metadata: Metadata` - The metadata to set (title and/or author) + +**Returns**: `BufferOutput` - The document with updated metadata + +```python +result = await client.set_metadata('document.pdf', { + 'title': 'My Document', + 'author': 'John Doe' +}) +``` + +##### set_page_labels(file, labels) +Sets page labels for a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `labels: list[Label]` - Array of label objects with pages and label properties + +**Returns**: `BufferOutput` - The document with updated page labels + +```python +result = await client.set_page_labels('document.pdf', [ + {'pages': [0, 1, 2], 'label': 'Cover'}, + {'pages': [3, 4, 5], 'label': 'Chapter 1'} +]) +``` + +##### apply_instant_json(file, instant_json_file) +Applies Instant JSON to a document. + +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `instant_json_file: FileInput` - The Instant JSON file to apply + +**Returns**: `BufferOutput` - The modified document + +```python +result = await client.apply_instant_json('document.pdf', 'annotations.json') +``` + +##### apply_xfdf(file, xfdf_file, options?) +Applies XFDF to a document. + +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `xfdf_file: FileInput` - The XFDF file to apply +- `options: ApplyXfdfActionOptions | None` - Optional settings for applying XFDF + +**Returns**: `BufferOutput` - The modified document + +```python +result = await client.apply_xfdf('document.pdf', 'annotations.xfdf') +# Or with options: +result = await client.apply_xfdf( + 'document.pdf', 'annotations.xfdf', + {'ignorePageRotation': True, 'richTextEnabled': False} +) +``` + +##### create_redactions_preset(file, preset, redaction_state?, pages?, preset_options?, options?) +Creates redaction annotations based on a preset pattern. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to create redactions in +- `preset: SearchPreset` - The preset pattern to search for (e.g., 'email-address', 'social-security-number') +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional page range to create redactions in +- `preset_options: CreateRedactionsStrategyOptionsPreset | None` - Optional settings for the preset strategy +- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions + +**Returns**: `BufferOutput` - The document with redaction annotations + +```python +result = await client.create_redactions_preset('document.pdf', 'email-address') + +# With specific pages +result = await client.create_redactions_preset( + 'document.pdf', + 'email-address', + 'stage', + {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4 +) + +# With the last 3 pages +result = await client.create_redactions_preset( + 'document.pdf', + 'email-address', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) +``` + +##### create_redactions_regex(file, regex, redaction_state?, pages?, regex_options?, options?) +Creates redaction annotations based on a regular expression. 
+
+**Parameters**:
+- `file: FileInput` - The PDF file to create redactions in
+- `regex: str` - The regular expression to search for
+- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage')
+- `pages: PageRange | None` - Optional page range to create redactions in
+- `regex_options: CreateRedactionsStrategyOptionsRegex | None` - Optional settings for the regex strategy
+- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions
+
+**Returns**: `BufferOutput` - The document with redaction annotations
+
+```python
+result = await client.create_redactions_regex('document.pdf', r'Account:\s*\d{8,12}')
+
+# With specific pages
+result = await client.create_redactions_regex(
+    'document.pdf',
+    r'Account:\s*\d{8,12}',
+    'stage',
+    {'start': 0, 'end': 4} # Pages 0, 1, 2, 3, 4
+)
+
+# With the last 3 pages
+result = await client.create_redactions_regex(
+    'document.pdf',
+    r'Account:\s*\d{8,12}',
+    'stage',
+    {'start': -3, 'end': -1} # Last three pages
+)
+```
+
+##### create_redactions_text(file, text, redaction_state?, pages?, text_options?, options?)
+Creates redaction annotations based on text.
+ +**Parameters**: +- `file: FileInput` - The PDF file to create redactions in +- `text: str` - The text to search for +- `redaction_state: Literal['stage', 'apply']` - Whether to stage or apply redactions (default: 'stage') +- `pages: PageRange | None` - Optional page range to create redactions in +- `text_options: CreateRedactionsStrategyOptionsText | None` - Optional settings for the text strategy +- `options: BaseCreateRedactionsOptions | None` - Optional settings for creating redactions + +**Returns**: `BufferOutput` - The document with redaction annotations + +```python +result = await client.create_redactions_text('document.pdf', 'email@example.com') + +# With specific pages and options +result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage', + {'start': 0, 'end': 4}, # Pages 0, 1, 2, 3, 4 + {'caseSensitive': False, 'includeAnnotations': True} +) + +# Create redactions on the last 3 pages +result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage', + {'start': -3, 'end': -1} # Last three pages +) +``` + +##### apply_redactions(file) +Applies redaction annotations in a document. + +**Parameters**: +- `file: FileInput` - The PDF file with redaction annotations to apply + +**Returns**: `BufferOutput` - The document with applied redactions + +```python +# Stage redactions from a createRedaction Method: +staged_result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage' +) + +result = await client.apply_redactions(staged_result['buffer']) +``` + +##### rotate(file, angle, pages?) +Rotates pages in a document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to rotate +- `angle: Literal[90, 180, 270]` - Rotation angle (90, 180, or 270 degrees) +- `pages: PageRange | None` - Optional page range to rotate + +**Returns**: `BufferOutput` - The entire document with specified pages rotated + +```python +result = await client.rotate('document.pdf', 90) + +# Rotate specific pages: +result = await client.rotate('document.pdf', 90, {'start': 1, 'end': 3}) # Pages 1, 2, 3 + +# Rotate the last page: +result = await client.rotate('document.pdf', 90, {'end': -1}) # Last page + +# Rotate from page 2 to the second-to-last page: +result = await client.rotate('document.pdf', 90, {'start': 2, 'end': -2}) +``` + +##### add_page(file, count?, index?) +Adds blank pages to a document. + +**Parameters**: +- `file: FileInput` - The PDF file to add pages to +- `count: int` - The number of blank pages to add (default: 1) +- `index: int | None` - Optional index where to add the blank pages (0-based). If not provided, pages are added at the end. + +**Returns**: `BufferOutput` - The document with added pages + +```python +# Add 2 blank pages at the end +result = await client.add_page('document.pdf', 2) + +# Add 1 blank page after the first page (at index 1) +result = await client.add_page('document.pdf', 1, 1) +``` + +##### optimize(file, options?) +Optimizes a PDF document for size reduction. + +**Parameters**: +- `file: FileInput` - The PDF file to optimize +- `options: OptimizePdf | None` - Optimization options + +**Returns**: `BufferOutput` - The optimized document + +```python +result = await client.optimize('large-document.pdf', { + 'grayscaleImages': True, + 'mrcCompression': True, + 'imageOptimizationQuality': 2 +}) +``` + +##### split(file, page_ranges) +Splits a PDF document into multiple parts based on page ranges. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to split +- `page_ranges: list[PageRange]` - Array of page ranges to extract + +**Returns**: `list[BufferOutput]` - An array of PDF documents, one for each page range + +```python +results = await client.split('document.pdf', [ + {'start': 0, 'end': 2}, # Pages 0, 1, 2 + {'start': 3, 'end': 5} # Pages 3, 4, 5 +]) + +# Split using negative indices +results = await client.split('document.pdf', [ + {'start': 0, 'end': 2}, # First three pages + {'start': 3, 'end': -3}, # Middle pages + {'start': -2, 'end': -1} # Last two pages +]) + +# Process each resulting PDF +for i, result in enumerate(results): + # Access the PDF buffer + pdf_buffer = result['buffer'] + + # Get the MIME type of the output + print(result['mimeType']) # 'application/pdf' + + # Save the buffer to a file + with open(f'split-part-{i}.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### duplicate_pages(file, page_indices) +Creates a new PDF containing only the specified pages in the order provided. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to extract pages from +- `page_indices: list[int]` - Array of page indices to include in the new PDF (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + +**Returns**: `BufferOutput` - A new document with only the specified pages + +```python +# Create a new PDF with only the first and third pages +result = await client.duplicate_pages('document.pdf', [0, 2]) + +# Create a new PDF with pages in a different order +result = await client.duplicate_pages('document.pdf', [2, 0, 1]) + +# Create a new PDF with duplicated pages +result = await client.duplicate_pages('document.pdf', [0, 0, 1, 1, 0]) + +# Create a new PDF with the first and last pages +result = await client.duplicate_pages('document.pdf', [0, -1]) + +# Create a new PDF with the last three pages in reverse order +result = await client.duplicate_pages('document.pdf', [-1, -2, -3]) + +# Access the PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('duplicated-pages.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +##### delete_pages(file, page_indices) +Deletes pages from a PDF document. 
+ +**Parameters**: +- `file: FileInput` - The PDF file to modify +- `page_indices: list[int]` - Array of page indices to delete (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + +**Returns**: `BufferOutput` - The document with deleted pages + +```python +# Delete second and fourth pages +result = await client.delete_pages('document.pdf', [1, 3]) + +# Delete the last page +result = await client.delete_pages('document.pdf', [-1]) + +# Delete the first and last two pages +result = await client.delete_pages('document.pdf', [0, -1, -2]) + +# Access the modified PDF buffer +pdf_buffer = result['buffer'] + +# Get the MIME type of the output +print(result['mimeType']) # 'application/pdf' + +# Save the buffer to a file +with open('modified-document.pdf', 'wb') as f: + f.write(pdf_buffer) +``` + +## Workflow Builder Methods + +The workflow builder provides a fluent interface for chaining multiple operations. See [WORKFLOW.md](WORKFLOW.md) for detailed information about workflow methods including: + +- `workflow()` - Create a new workflow builder +- `add_file_part()` - Add file parts to the workflow +- `add_html_part()` - Add HTML content +- `apply_action()` - Apply processing actions +- `output_pdf()`, `output_image()`, `output_json()` - Set output formats +- `execute()` - Execute the workflow + +## Error Handling + +All methods can raise the following exceptions: + +- `ValidationError` - Invalid input parameters +- `AuthenticationError` - Authentication failed +- `APIError` - API returned an error +- `NetworkError` - Network request failed +- `NutrientError` - Base error class + +```python +from nutrient_dws import ( + NutrientError, + ValidationError, + APIError, + AuthenticationError, + NetworkError +) + +try: + result = await client.convert('file.docx', 'pdf') +except ValidationError as error: + print(f'Invalid input: {error.message} - Details: {error.details}') +except AuthenticationError as error: + print(f'Auth error: 
{error.message} - Status: {error.status_code}') +except APIError as error: + print(f'API error: {error.message} - Status: {error.status_code} - Details: {error.details}') +except NetworkError as error: + print(f'Network error: {error.message} - Details: {error.details}') +``` diff --git a/docs/WORKFLOW.md b/docs/WORKFLOW.md new file mode 100644 index 0000000..303cc51 --- /dev/null +++ b/docs/WORKFLOW.md @@ -0,0 +1,952 @@ +# Nutrient DWS Python Client Workflow + +This document provides detailed information about the workflow system in the Nutrient DWS Python Client. + +## Workflow Architecture + +The Nutrient DWS Python Client uses a fluent builder pattern with staged interfaces to create document processing workflows. This architecture provides several benefits: + +1. **Type Safety**: The staged interface ensures that methods are only available at appropriate stages +2. **Readability**: Method chaining creates readable, declarative code +3. **Discoverability**: IDE auto-completion guides you through the workflow stages +4. **Flexibility**: Complex workflows can be built with simple, composable pieces + +## Workflow Stages + +The workflow builder follows a staged approach: + +### Stage 0: Create Workflow + +You have several ways of creating a workflow + +```python +# Creating Workflow from a client +workflow = client.workflow() + +# Override the client timeout +workflow = client.workflow(60000) + +# Create a workflow without a client +from nutrient_dws.builder.builder import StagedWorkflowBuilder +workflow = StagedWorkflowBuilder({ + 'apiKey': 'your-api-key' +}) +``` + +### Stage 1: Add Parts + +In this stage, you add document parts to the workflow: + +```python +workflow = (client.workflow() + .add_file_part('document.pdf') + .add_file_part('appendix.pdf')) +``` + +Available methods: + +#### `add_file_part(file, options?, actions?)` +Adds a file part to the workflow. + +**Parameters:** +- `file: FileInput` - The file to add to the workflow. 
Can be a local file path, bytes, or file-like object. +- `options: FilePartOptions | None` - Additional options for the file part (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the file part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Add a PDF file from a local path +workflow.add_file_part('/path/to/document.pdf') + +# Add a file with options and actions +workflow.add_file_part( + '/path/to/document.pdf', + {'pages': {'start': 1, 'end': 3}}, + [BuildActions.watermark_text('CONFIDENTIAL')] +) +``` + +#### `add_html_part(html, assets?, options?, actions?)` +Adds an HTML part to the workflow. + +**Parameters:** +- `html: FileInput` - The HTML content to add. Can be a file path, bytes, or file-like object. +- `assets: list[FileInput] | None` - Optional list of assets (CSS, images, etc.) to include with the HTML. Only local files or bytes are supported (optional) +- `options: HTMLPartOptions | None` - Additional options for the HTML part (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the HTML part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Add HTML content from a file +workflow.add_html_part('/path/to/content.html') + +# Add HTML with assets and options +workflow.add_html_part( + '/path/to/content.html', + ['/path/to/style.css', '/path/to/image.png'], + {'layout': {'size': 'A4'}} +) +``` + +#### `add_new_page(options?, actions?)` +Adds a new blank page to the workflow. + +**Parameters:** +- `options: NewPagePartOptions | None` - Additional options for the new page, such as page size, orientation, etc. (optional) +- `actions: list[BuildAction] | None` - Actions to apply to the new page (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. 
+ +**Example:** +```python +# Add a simple blank page +workflow.add_new_page() + +# Add a new page with specific options +workflow.add_new_page({ + 'layout': {'size': 'A4', 'orientation': 'portrait'} +}) +``` + +#### `add_document_part(document_id, options?, actions?)` +Adds a document part to the workflow by referencing an existing document by ID. + +**Parameters:** +- `document_id: str` - The ID of the document to add to the workflow. +- `options: DocumentPartOptions | None` - Additional options for the document part (optional) + - `options['layer']: str` - Optional layer name to select a specific layer from the document. +- `actions: list[BuildAction] | None` - Actions to apply to the document part (optional) + +**Returns:** `WorkflowWithPartsStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Add a document by ID +workflow.add_document_part('doc_12345abcde') + +# Add a document with a specific layer and options +workflow.add_document_part( + 'doc_12345abcde', + { + 'layer': 'content', + 'pages': {'start': 0, 'end': 3} + } +) +``` + +### Stage 2: Apply Actions (Optional) + +In this stage, you can apply actions to the document: + +```python +workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 48 +})) +``` + +Available methods: + +#### `apply_action(action)` +Applies a single action to the workflow. + +**Parameters:** +- `action: BuildAction` - The action to apply to the workflow. + +**Returns:** `WorkflowWithActionsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Apply a watermark action +workflow.apply_action( + BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.3, + 'rotation': 45 + }) +) + +# Apply an OCR action +workflow.apply_action(BuildActions.ocr('english')) +``` + +#### `apply_actions(actions)` +Applies multiple actions to the workflow. 
+ +**Parameters:** +- `actions: list[BuildAction]` - A list of actions to apply to the workflow. + +**Returns:** `WorkflowWithActionsStage` - The workflow builder instance for method chaining. + +**Example:** + +```python +# Apply multiple actions to the workflow +workflow.apply_actions([ + BuildActions.watermark_text('DRAFT', {'opacity': 0.5}), + BuildActions.ocr('english'), + BuildActions.flatten() +]) +``` + +#### Action Types: + +#### Document Processing + +##### `BuildActions.ocr(language)` +Creates an OCR (Optical Character Recognition) action to extract text from images or scanned documents. + +**Parameters:** +- `language: str | list[str]` - Language(s) for OCR. Can be a single language or a list of languages. + +**Example:** +```python +# Basic OCR with English language +workflow.apply_action(BuildActions.ocr('english')) + +# OCR with multiple languages +workflow.apply_action(BuildActions.ocr(['english', 'french', 'german'])) + +# OCR with options (via dict syntax) +workflow.apply_action(BuildActions.ocr({ + 'language': 'english', + 'enhanceResolution': True +})) +``` + +##### `BuildActions.rotate(rotate_by)` +Creates an action to rotate pages in the document. + +**Parameters:** +- `rotate_by: Literal[90, 180, 270]` - Rotation angle in degrees (must be 90, 180, or 270). + +**Example:** +```python +# Rotate pages by 90 degrees +workflow.apply_action(BuildActions.rotate(90)) + +# Rotate pages by 180 degrees +workflow.apply_action(BuildActions.rotate(180)) +``` + +##### `BuildActions.flatten(annotation_ids?)` +Creates an action to flatten annotations into the document content, making them non-interactive but permanently visible. + +**Parameters:** +- `annotation_ids: list[str | int] | None` - Optional list of annotation IDs to flatten. 
If not specified, all annotations will be flattened (optional) + +**Example:** +```python +# Flatten all annotations +workflow.apply_action(BuildActions.flatten()) + +# Flatten specific annotations +workflow.apply_action(BuildActions.flatten(['annotation1', 'annotation2'])) +``` + +#### Watermarking + +##### `BuildActions.watermark_text(text, options?)` +Creates an action to add a text watermark to the document. + +**Parameters:** +- `text: str` - Watermark text content. +- `options: TextWatermarkActionOptions | None` - Watermark options (optional): + - `width`: Width dimension of the watermark (dict with 'value' and 'unit', e.g. `{'value': 100, 'unit': '%'}`) + - `height`: Height dimension of the watermark (dict with 'value' and 'unit') + - `top`, `right`, `bottom`, `left`: Position of the watermark (dict with 'value' and 'unit') + - `rotation`: Rotation of the watermark in counterclockwise degrees (default: 0) + - `opacity`: Watermark opacity (0 is fully transparent, 1 is fully opaque) + - `fontFamily`: Font family for the text (e.g. 'Helvetica') + - `fontSize`: Size of the text in points + - `fontColor`: Foreground color of the text (e.g. '#ffffff') + - `fontStyle`: Text style list (['bold'], ['italic'], or ['bold', 'italic']) + +**Example:** + +```python +# Simple text watermark +workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL')) + +# Customized text watermark +workflow.apply_action(BuildActions.watermark_text('DRAFT', { + 'opacity': 0.5, + 'rotation': 45, + 'fontSize': 36, + 'fontColor': '#FF0000', + 'fontStyle': ['bold', 'italic'] +})) +``` + +##### `BuildActions.watermark_image(image, options?)` +Creates an action to add an image watermark to the document. + +**Parameters:** +- `image: FileInput` - Watermark image (file path, bytes, or file-like object). +- `options: ImageWatermarkActionOptions | None` - Watermark options (optional): + - `width`: Width dimension of the watermark (dict with 'value' and 'unit', e.g. 
`{'value': 100, 'unit': '%'}`) + - `height`: Height dimension of the watermark (dict with 'value' and 'unit') + - `top`, `right`, `bottom`, `left`: Position of the watermark (dict with 'value' and 'unit') + - `rotation`: Rotation of the watermark in counterclockwise degrees (default: 0) + - `opacity`: Watermark opacity (0 is fully transparent, 1 is fully opaque) + +**Example:** + +```python +# Simple image watermark +workflow.apply_action(BuildActions.watermark_image('/path/to/logo.png')) + +# Customized image watermark +workflow.apply_action(BuildActions.watermark_image('/path/to/logo.png', { + 'opacity': 0.3, + 'width': {'value': 50, 'unit': '%'}, + 'height': {'value': 50, 'unit': '%'}, + 'top': {'value': 10, 'unit': 'px'}, + 'left': {'value': 10, 'unit': 'px'}, + 'rotation': 0 +})) +``` + +#### Annotations + +##### `BuildActions.apply_instant_json(file)` +Creates an action to apply annotations from an Instant JSON file to the document. + +**Parameters:** +- `file: FileInput` - Instant JSON file input (file path, bytes, or file-like object). + +**Example:** + +```python +# Apply annotations from Instant JSON file +workflow.apply_action(BuildActions.apply_instant_json('/path/to/annotations.json')) +``` + +##### `BuildActions.apply_xfdf(file, options?)` +Creates an action to apply annotations from an XFDF file to the document. + +**Parameters:** +- `file: FileInput` - XFDF file input (file path, bytes, or file-like object). +- `options: ApplyXfdfActionOptions | None` - Apply XFDF options (optional): + - `ignorePageRotation: bool` - If True, ignores page rotation when applying XFDF data (default: False) + - `richTextEnabled: bool` - If True, plain text annotations will be converted to rich text annotations. 
If False, all text annotations will be plain text annotations (default: True) + +**Example:** + +```python +# Apply annotations from XFDF file with default options +workflow.apply_action(BuildActions.apply_xfdf('/path/to/annotations.xfdf')) + +# Apply annotations with specific options +workflow.apply_action(BuildActions.apply_xfdf('/path/to/annotations.xfdf', { + 'ignorePageRotation': True, + 'richTextEnabled': False +})) +``` + +#### Redactions + +##### `BuildActions.create_redactions_text(text, options?, strategy_options?)` +Creates an action to add redaction annotations based on text search. + +**Parameters:** +- `text: str` - Text to search and redact. +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) +- `strategy_options: CreateRedactionsStrategyOptionsText | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided text (default: True) + - `caseSensitive: bool` - If True, the search will be case sensitive (default: False) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for all occurrences of "Confidential" +workflow.apply_action(BuildActions.create_redactions_text('Confidential')) + +# Create redactions with custom appearance and search options +workflow.apply_action(BuildActions.create_redactions_text('Confidential', + { + 'content': { + 'backgroundColor': '#000000', + 'overlayText': 'REDACTED', + 'textColor': '#FFFFFF' + } + }, + { + 'caseSensitive': True, + 'start': 2, + 'limit': 5 + } + )) +``` + +##### `BuildActions.create_redactions_regex(regex, options?, strategy_options?)` +Creates an action to 
add redaction annotations based on regex pattern matching. + +**Parameters:** +- `regex: str` - Regex pattern to search and redact. +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) +- `strategy_options: CreateRedactionsStrategyOptionsRegex | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided regex (default: True) + - `caseSensitive: bool` - If True, the search will be case sensitive (default: True) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for email addresses +workflow.apply_action(BuildActions.create_redactions_regex(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')) + +# Create redactions with custom appearance and search options +workflow.apply_action(BuildActions.create_redactions_regex(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', + { + 'content': { + 'backgroundColor': '#FF0000', + 'overlayText': 'EMAIL REDACTED' + } + }, + { + 'caseSensitive': False, + 'start': 0, + 'limit': 10 + } + )) +``` + +##### `BuildActions.create_redactions_preset(preset, options?, strategy_options?)` +Creates an action to add redaction annotations based on a preset pattern. + +**Parameters:** +- `preset: str` - Preset pattern to search and redact (e.g. 'email-address', 'credit-card-number', 'social-security-number', etc.) +- `options: BaseCreateRedactionsOptions | None` - Redaction options (optional): + - `content: RedactionAnnotation` - Visual aspects of the redaction annotation (background color, overlay text, etc.) 
+- `strategy_options: CreateRedactionsStrategyOptionsPreset | None` - Redaction strategy options (optional): + - `includeAnnotations: bool` - If True, redaction annotations are created on top of annotations whose content match the provided preset (default: True) + - `start: int` - The index of the page from where to start the search (default: 0) + - `limit: int` - Starting from start, the number of pages to search (default: to the end of the document) + +**Example:** + +```python +# Create redactions for email addresses using preset +workflow.apply_action(BuildActions.create_redactions_preset('email-address')) + +# Create redactions for credit card numbers with custom appearance +workflow.apply_action(BuildActions.create_redactions_preset('credit-card-number', + { + 'content': { + 'backgroundColor': '#000000', + 'overlayText': 'FINANCIAL DATA' + } + }, + { + 'start': 0, + 'limit': 5 + } + )) +``` + +##### `BuildActions.apply_redactions()` +Creates an action to apply previously created redaction annotations, permanently removing the redacted content. + +**Example:** + +```python +# First create redactions +workflow.apply_action(BuildActions.create_redactions_preset('email-address')) + +# Then apply them +workflow.apply_action(BuildActions.apply_redactions()) +``` + +### Stage 3: Set Output Format + +In this stage, you specify the desired output format: + +```python +workflow.output_pdf({ + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 2 + } +}) +``` + +Available methods: + +#### `output_pdf(options?)` +Sets the output format to PDF. + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF output, such as compression, encryption, etc. (optional) + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. + - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. 
+ - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF with default options +workflow.output_pdf() + +# Set output format to PDF with specific options +workflow.output_pdf({ + 'user_password': 'secret', + 'user_permissions': ["printing"], + 'metadata': { + 'title': 'Important Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 3 + } +}) +``` + +#### `output_pdfa(options?)` +Sets the output format to PDF/A (archival PDF). + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF/A output (optional): + - `options['conformance']: str` - The PDF/A conformance level to target. Options include 'pdfa-1b', 'pdfa-1a', 'pdfa-2b', 'pdfa-2a', 'pdfa-3b', 'pdfa-3a'. + Different levels have different requirements for long-term archiving. + - `options['vectorization']: bool` - When True, attempts to convert raster content to vector graphics where possible, improving quality and reducing file size. 
+ - `options['rasterization']: bool` - When True, converts vector graphics to raster images, which can help with compatibility in some cases. + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. + - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. + - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF/A with default options +workflow.output_pdfa() + +# Set output format to PDF/A with specific options +workflow.output_pdfa({ + 'conformance': 'pdfa-2b', + 'vectorization': True, + 'metadata': { + 'title': 'Archive Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True + } +}) +``` + +#### `output_pdfua(options?)` +Sets the output format to PDF/UA (Universal Accessibility). + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for PDF/UA output (optional): + - `options['metadata']: dict[str, Any]` - Document metadata properties like title, author. 
+ - `options['labels']: list[dict[str, Any]]` - Custom labels to add to the document for organization and categorization. + - `options['user_password']: str` - Password required to open the document. When set, the PDF will be encrypted. + - `options['owner_password']: str` - Password required to modify the document. Provides additional security beyond the user password. + - `options['user_permissions']: list[str]` - List of permissions granted to users who open the document with the user password. + Options include: "printing", "modification", "content-copying", "annotation", "form-filling", etc. + - `options['optimize']: dict[str, Any]` - PDF optimization settings to reduce file size and improve performance. + - `options['optimize']['mrcCompression']: bool` - When True, applies Mixed Raster Content compression to reduce file size. + - `options['optimize']['imageOptimizationQuality']: int` - Controls the quality of image optimization (1-5, where 1 is highest quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PDF/UA with default options +workflow.output_pdfua() + +# Set output format to PDF/UA with specific options +workflow.output_pdfua({ + 'metadata': { + 'title': 'Accessible Document', + 'author': 'Document System' + }, + 'optimize': { + 'mrcCompression': True, + 'imageOptimizationQuality': 3 + } +}) +``` + +#### `output_image(format, options?)` +Sets the output format to an image format (PNG, JPEG, WEBP). + +**Parameters:** +- `format: Literal['png', 'jpeg', 'jpg', 'webp']` - The image format to output. + - PNG: Lossless compression, supports transparency, best for graphics and screenshots + - JPEG/JPG: Lossy compression, smaller file size, best for photographs + - WEBP: Modern format with both lossy and lossless compression, good for web use +- `options: dict[str, Any] | None` - Additional options for image output, such as resolution, quality, etc. 
(optional) + **Note: At least one of options['width'], options['height'], or options['dpi'] must be specified.** + - `options['pages']: dict[str, int]` - Specifies which pages to convert to images. If omitted, all pages are converted. + - `options['pages']['start']: int` - The first page to convert (0-based index). + - `options['pages']['end']: int` - The last page to convert (0-based index). + - `options['width']: int` - The width of the output image in pixels. If specified without height, aspect ratio is maintained. + - `options['height']: int` - The height of the output image in pixels. If specified without width, aspect ratio is maintained. + - `options['dpi']: int` - The resolution in dots per inch. Higher values create larger, more detailed images. + Common values: 72 (web), 150 (standard), 300 (print quality), 600 (high quality). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to PNG with dpi specified +workflow.output_image('png', {'dpi': 300}) + +# Set output format to JPEG with specific options +workflow.output_image('jpeg', { + 'dpi': 300, + 'pages': {'start': 1, 'end': 3} +}) + +# Set output format to WEBP with specific dimensions +workflow.output_image('webp', { + 'width': 1200, + 'height': 800, + 'dpi': 150 +}) +``` + +#### `output_office(format)` +Sets the output format to an Office document format (DOCX, XLSX, PPTX). + +**Parameters:** +- `format: Literal['docx', 'xlsx', 'pptx']` - The Office format to output ('docx' for Word, 'xlsx' for Excel, or 'pptx' for PowerPoint). + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. 
+ +**Example:** +```python +# Set output format to Word document (DOCX) +workflow.output_office('docx') + +# Set output format to Excel spreadsheet (XLSX) +workflow.output_office('xlsx') + +# Set output format to PowerPoint presentation (PPTX) +workflow.output_office('pptx') +``` + +#### `output_html(layout)` +Sets the output format to HTML. + +**Parameters:** +- `layout: Literal['page', 'reflow']` - The layout type to use for conversion to HTML: + - 'page' layout keeps the original structure of the document, segmented by page. + - 'reflow' layout converts the document into a continuous flow of text, without page breaks. + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to HTML +workflow.output_html('page') +``` + +#### `output_markdown()` +Sets the output format to Markdown. + +**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining. + +**Example:** +```python +# Set output format to Markdown with default options +workflow.output_markdown() +``` + +#### `output_json(options?)` +Sets the output format to JSON content. + +**Parameters:** +- `options: dict[str, Any] | None` - Additional options for JSON output (optional): + - `options['plainText']: bool` - When True, extracts plain text content from the document and includes it in the JSON output. + This provides the raw text without structural information. + - `options['structuredText']: bool` - When True, extracts text with structural information (paragraphs, headings, etc.) + and includes it in the JSON output. + - `options['keyValuePairs']: bool` - When True, attempts to identify and extract key-value pairs from the document + (like form fields, labeled data, etc.) and includes them in the JSON output. + - `options['tables']: bool` - When True, attempts to identify and extract tabular data from the document + and includes it in the JSON output as structured table objects. 
+  - `options['language']: str | list[str]` - Specifies the language(s) of the document content for better text extraction.
+    Can be a single language code or a list of language codes for multi-language documents.
+    Examples: "english", "french", "german", or ["english", "spanish"].
+
+**Returns:** `WorkflowWithOutputStage` - The workflow builder instance for method chaining.
+
+**Example:**
+```python
+# Set output format to JSON with default options
+workflow.output_json()
+
+# Set output format to JSON with specific options
+workflow.output_json({
+    'plainText': True,
+    'structuredText': True,
+    'keyValuePairs': True,
+    'tables': True,
+    'language': "english"
+})
+
+# Set output format to JSON with multiple languages
+workflow.output_json({
+    'plainText': True,
+    'tables': True,
+    'language': ["english", "french", "german"]
+})
+```
+
+### Stage 4: Execute or Dry Run
+
+In this final stage, you execute the workflow or perform a dry run:
+
+```python
+result = await workflow.execute()
+```
+
+Available methods:
+
+#### `execute(on_progress?)`
+Executes the workflow and returns the result.
+
+**Parameters:**
+- `on_progress: Callable[[int, int], None] | None` - Callback for progress updates (optional).
+
+**Returns:** `TypedWorkflowResult` - The workflow result.
+
+**Example:**
+```python
+# Execute the workflow
+result = await workflow.execute()
+
+# Execute with progress tracking
+def progress_callback(current: int, total: int) -> None:
+    print(f'Processing step {current} of {total}')
+
+result = await workflow.execute(on_progress=progress_callback)
+```
+
+#### `dry_run()`
+Performs a dry run of the workflow without generating the final output. This is useful for validating the workflow configuration and estimating processing time.
+
+**Returns:** `WorkflowDryRunResult` - The dry run result, containing validation information and estimated processing time. 
+
+**Example:**
+```python
+# Perform a dry run of the workflow
+dry_run_result = await (workflow
+    .add_file_part('/path/to/document.pdf')
+    .output_pdf()
+    .dry_run())
+```
+
+## Workflow Examples
+
+### Basic Document Conversion
+
+```python
+result = await (client
+    .workflow()
+    .add_file_part('document.docx')
+    .output_pdf()
+    .execute())
+```
+
+### Document Merging with Watermark
+
+```python
+result = await (client
+    .workflow()
+    .add_file_part('document1.pdf')
+    .add_file_part('document2.pdf')
+    .apply_action(BuildActions.watermark_text('CONFIDENTIAL', {
+        'opacity': 0.5,
+        'fontSize': 48
+    }))
+    .output_pdf()
+    .execute())
+```
+
+### OCR with Language Selection
+
+```python
+result = await (client
+    .workflow()
+    .add_file_part('scanned-document.pdf')
+    .apply_action(BuildActions.ocr({
+        'language': 'english',
+        'enhanceResolution': True
+    }))
+    .output_pdf()
+    .execute())
+```
+
+### HTML to PDF Conversion
+
+```python
+result = await (client
+    .workflow()
+    .add_html_part('index.html', None, {
+        'layout': {
+            'size': 'A4',
+            'margin': {
+                'top': 50,
+                'bottom': 50,
+                'left': 50,
+                'right': 50
+            }
+        }
+    })
+    .output_pdf()
+    .execute())
+```
+
+### Complex Multi-step Workflow
+
+```python
+def progress_callback(current: int, total: int) -> None:
+    print(f'Processing step {current} of {total}')
+
+
+result = await (client
+    .workflow()
+    .add_file_part('document.pdf', {'pages': {'start': 0, 'end': 5}})
+    .add_file_part('appendix.pdf')
+    .apply_actions([
+        BuildActions.ocr({'language': 'english'}),
+        BuildActions.watermark_text('CONFIDENTIAL'),
+        BuildActions.create_redactions_preset('email-address'),
+        BuildActions.apply_redactions()
+    ])
+    .output_pdfa({
+        'conformance': 'pdfa-2b',
+        'optimize': {
+            'mrcCompression': True
+        }
+    })
+    .execute(on_progress=progress_callback))
+```
+
+## Staged Workflow Builder
+
+For more complex scenarios where you need to build workflows dynamically, you can use the staged workflow builder:
+
+```python
+# Create a staged workflow
+workflow = 
client.workflow() + +# Add parts +workflow.add_file_part('document.pdf') + +# Conditionally add more parts +if include_appendix: + workflow.add_file_part('appendix.pdf') + +# Conditionally apply actions +if needs_watermark: + workflow.apply_action(BuildActions.watermark_text('CONFIDENTIAL')) + +# Set output format based on user preference +if output_format == 'pdf': + workflow.output_pdf() +elif output_format == 'docx': + workflow.output_office('docx') +else: + workflow.output_image('png') + +# Execute the workflow +result = await workflow.execute() +``` + +## Error Handling in Workflows + +Workflows provide detailed error information: + +```python +try: + result = await (client + .workflow() + .add_file_part('document.pdf') + .output_pdf() + .execute()) + + if not result['success']: + # Handle workflow errors + for error in result.get('errors', []): + print(f"Step {error['step']}: {error['error']['message']}") +except Exception as error: + # Handle unexpected errors + print(f'Workflow execution failed: {error}') +``` + +## Workflow Result Structure + +The result of a workflow execution includes: + +```python +from typing import TypedDict, Any, List, Optional, Union + +class WorkflowError(TypedDict): + step: str + error: dict[str, Any] + +class BufferOutput(TypedDict): + mimeType: str + filename: str + buffer: bytes + +class ContentOutput(TypedDict): + mimeType: str + filename: str + content: str + +class JsonContentOutput(TypedDict): + mimeType: str + filename: str + data: Any + +class WorkflowResult(TypedDict): + # Overall success status + success: bool + + # Output data (if successful) + output: Optional[Union[BufferOutput, ContentOutput, JsonContentOutput]] + + # Error information (if failed) + errors: Optional[List[WorkflowError]] +``` + +## Performance Considerations + +For optimal performance with workflows: + +1. **Minimize the number of parts**: Combine related files when possible +2. 
**Use appropriate output formats**: Choose formats based on your needs +3. **Consider dry runs**: Use `dry_run()` to estimate resource usage +4. **Monitor progress**: Use the `on_progress` callback for long-running workflows +5. **Handle large files**: For very large files, consider splitting into smaller workflows diff --git a/examples/.env.example b/examples/.env.example new file mode 100644 index 0000000..4a0ee74 --- /dev/null +++ b/examples/.env.example @@ -0,0 +1,2 @@ +NUTRIENT_API_KEY=your_api_key_here +PYTHON_ENV=development diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..31f96f8 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,135 @@ +# Nutrient DWS Python Client Examples + +This example project demonstrates how to use the Nutrient DWS Python Client for document processing operations. + +## Project Structure + +- `assets/` - Contains sample files for processing (PDF, DOCX, PNG) +- `src/` - Contains Python source files + - `direct_method.py` - Examples using direct method calls + - `workflow.py` - Examples using the workflow builder pattern +- `output/` - Directory where processed files will be saved +- `.env.example` - Example environment variables file + +## Prerequisites + +- Python 3.10 or higher +- pip + +## Setup + +### Option 1: Virtual Environment Setup + +1. Clone the repository: + ```bash + git clone https://github.com/pspdfkit-labs/nutrient-dws-client-python.git + cd nutrient-dws-client-python + ``` + +2. Build the package from source: + ```bash + python -m build + ``` + +3. Navigate to the examples directory: + ```bash + cd examples + ``` + +4. Set up and activate the virtual environment: + ```bash + # Set up the virtual environment and install dependencies + python setup_venv.py + + # Activate the virtual environment + # On macOS/Linux: + source example_venv/bin/activate + + # On Windows: + example_venv\Scripts\activate + ``` + +5. 
Create a `.env` file from the example: + ```bash + cp .env.example .env + ``` + +6. Edit the `.env` file and add your Nutrient DWS Processor API key. You can sign up for a free API key by visiting [Nutrient](https://www.nutrient.io/api/): + ``` + NUTRIENT_API_KEY=your_api_key_here + ``` + +### Option 2: Development Mode Setup + +1. Clone the repository: + ```bash + git clone https://github.com/pspdfkit-labs/nutrient-dws-client-python.git + cd nutrient-dws-client-python + ``` + +2. Install the main package in development mode: + ```bash + pip install -e . + ``` + +3. Navigate to the examples directory: + ```bash + cd examples + ``` + +4. Install dependencies for the example project: + ```bash + pip install -r requirements.txt + ``` + +5. Create a `.env` file from the example: + ```bash + cp .env.example .env + ``` + +6. Edit the `.env` file and add your Nutrient DWS Processor API key. You can sign up for a free API key by visiting [Nutrient](https://www.nutrient.io/api/): + ``` + NUTRIENT_API_KEY=your_api_key_here + ``` + +## Running the Examples + +### Direct Method Examples + +To run the direct method examples: + +```bash +python src/direct_method.py +``` + +This will: +1. Convert a DOCX file to PDF +2. Extract text from the PDF +3. Add a watermark to the PDF +4. Merge multiple documents + +### Workflow Examples + +To run the workflow examples: + +```bash +python src/workflow.py +``` + +This will: +1. Perform a basic document conversion workflow +2. Create a document merging with watermark workflow +3. Extract text with JSON output +4. Execute a complex multi-step workflow + +## Output + +All processed files will be saved to the `output/` directory. You can examine these files to see the results of the document processing operations. 
+ +## Documentation + +For more information about the Nutrient DWS Python Client, refer to: + +- [README.md](../README.md) - Main documentation +- [METHODS.md](../docs/METHODS.md) - Direct methods documentation +- [WORKFLOW.md](../docs/WORKFLOW.md) - Workflow system documentation diff --git a/examples/assets/sample.docx b/examples/assets/sample.docx new file mode 100644 index 0000000..22e2341 Binary files /dev/null and b/examples/assets/sample.docx differ diff --git a/examples/assets/sample.pdf b/examples/assets/sample.pdf new file mode 100644 index 0000000..ada2bdd Binary files /dev/null and b/examples/assets/sample.pdf differ diff --git a/examples/assets/sample.png b/examples/assets/sample.png new file mode 100644 index 0000000..32ee307 Binary files /dev/null and b/examples/assets/sample.png differ diff --git a/examples/requirements.txt b/examples/requirements.txt new file mode 100644 index 0000000..1e2f2f3 --- /dev/null +++ b/examples/requirements.txt @@ -0,0 +1,2 @@ +python-dotenv>=1.0.0 +# NOTE: pathlib is part of the Python 3 standard library; the PyPI "pathlib" package is an obsolete Python 2 backport and must not be installed diff --git a/examples/setup_venv.py b/examples/setup_venv.py new file mode 100644 index 0000000..63ba273 --- /dev/null +++ b/examples/setup_venv.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +""" +Setup script for creating and configuring the examples virtual environment. +This script creates a virtual environment and installs the nutrient-dws package +from the built distribution files.
+""" + +import os +import subprocess +import sys +from pathlib import Path + +def run_command(cmd, cwd=None, check=True): + """Run a command and return the result.""" + print(f"Running: {' '.join(cmd) if isinstance(cmd, list) else cmd}") + try: + result = subprocess.run( + cmd, + shell=isinstance(cmd, str), + cwd=cwd, + check=check, + capture_output=True, + text=True + ) + if result.stdout: + print(result.stdout) + return result + except subprocess.CalledProcessError as e: + print(f"Error: {e}") + if e.stderr: + print(f"Error output: {e.stderr}") + raise + +def main(): + # Get the current directory (examples folder) + examples_dir = Path(__file__).parent + project_root = examples_dir.parent + dist_dir = project_root / "dist" + + print(f"Setting up virtual environment in: {examples_dir}") + + # Create virtual environment + venv_path = examples_dir / "example_venv" + if venv_path.exists(): + print("Virtual environment already exists. Removing...") + import shutil + shutil.rmtree(venv_path) + + print("Creating virtual environment...") + run_command([sys.executable, "-m", "venv", "example_venv"], cwd=examples_dir) + + # Determine the python executable in the venv + if sys.platform == "win32": + python_exe = venv_path / "Scripts" / "python.exe" + pip_exe = venv_path / "Scripts" / "pip.exe" + else: + python_exe = venv_path / "bin" / "python" + pip_exe = venv_path / "bin" / "pip" + + # Upgrade pip + print("Upgrading pip...") + run_command([str(pip_exe), "install", "--upgrade", "pip"]) + + # Install the wheel and tar.gz files + wheel_file = dist_dir / "nutrient_dws-2.0.0-py3-none-any.whl" + tar_file = dist_dir / "nutrient_dws-2.0.0.tar.gz" + + if wheel_file.exists(): + print("Installing nutrient-dws from wheel...") + run_command([str(pip_exe), "install", str(wheel_file)]) + elif tar_file.exists(): + print("Installing nutrient-dws from tar.gz...") + run_command([str(pip_exe), "install", str(tar_file)]) + else: + print("Error: Neither wheel nor tar.gz file found in dist 
directory") + print("Please build the package first using: python -m build") + sys.exit(1) + + # Install example requirements + requirements_file = examples_dir / "requirements.txt" + if requirements_file.exists(): + print("Installing example requirements...") + run_command([str(pip_exe), "install", "-r", str(requirements_file)]) + + print("\n" + "="*50) + print("Virtual environment setup complete!") + print(f"Virtual environment location: {venv_path}") + print("\nTo activate the virtual environment:") + if sys.platform == "win32": + print(f" {venv_path / 'Scripts' / 'activate.bat'}") + else: + print(f" source {venv_path / 'bin' / 'activate'}") + + print("\nTo run examples:") + print(" python src/direct_method.py") + print(" python src/workflow.py") + +if __name__ == "__main__": + main() diff --git a/examples/src/direct_method.py b/examples/src/direct_method.py new file mode 100644 index 0000000..ae5e781 --- /dev/null +++ b/examples/src/direct_method.py @@ -0,0 +1,149 @@ +""" +Direct Method Example + +This example demonstrates how to use the Nutrient DWS Python Client +with direct method calls for document processing operations. 
+""" + +import asyncio +import os +import json +from pathlib import Path +from dotenv import load_dotenv + +from nutrient_dws import NutrientClient + +# Load environment variables from .env file +load_dotenv() + +# Check if API key is provided +if not os.getenv('NUTRIENT_API_KEY'): + print('Error: NUTRIENT_API_KEY is not set in .env file') + exit(1) + +# Initialize the client with API key +client = NutrientClient(api_key=os.getenv('NUTRIENT_API_KEY')) + +# Define paths +assets_dir = Path(__file__).parent.parent / 'assets' +output_dir = Path(__file__).parent.parent / 'output' + +# Ensure output directory exists +output_dir.mkdir(parents=True, exist_ok=True) + + +# Example 1: Convert a document +async def convert_document(): + print('Example 1: Converting DOCX to PDF') + + try: + docx_path = assets_dir / 'sample.docx' + result = await client.convert(docx_path, 'pdf') + + # Save the result to the output directory + output_path = output_dir / 'converted-document.pdf' + with open(output_path, 'wb') as f: + f.write(result['buffer']) + + print(f'Conversion successful. Output saved to: {output_path}') + print(f'MIME type: {result["mimeType"]}') + return output_path + except Exception as error: + print(f'Conversion failed: {error}') + raise error + + +# Example 2: Extract text from a document +async def extract_text(file_path: Path): + print('\nExample 2: Extracting text from PDF') + + try: + result = await client.extract_text(file_path) + + # Save the extracted text to the output directory + output_path = output_dir / 'extracted-text.json' + with open(output_path, 'w') as f: + json.dump(result['data'], f, indent=2, default=str) + + # Display a sample of the extracted text + text_sample = result['data']['pages'][0]['plainText'][:100] + '...' + print(f'Text extraction successful. 
Output saved to: {output_path}') + print(f'Text sample: {text_sample}') + return output_path + except Exception as error: + print(f'Text extraction failed: {error}') + raise error + + +# Example 3: Add a watermark to a document +async def add_watermark(file_path: Path): + print('\nExample 3: Adding watermark to PDF') + + try: + result = await client.watermark_text(file_path, 'CONFIDENTIAL', { + 'opacity': 0.5, + 'font_color': '#FF0000', + 'rotation': 45, + 'width': {'value': 50, 'unit': '%'} + }) + + # Save the watermarked document to the output directory + output_path = output_dir / 'watermarked-document.pdf' + with open(output_path, 'wb') as f: + f.write(result['buffer']) + + print(f'Watermarking successful. Output saved to: {output_path}') + return output_path + except Exception as error: + print(f'Watermarking failed: {error}') + raise error + + +# Example 4: Merge multiple documents +async def merge_documents(): + print('\nExample 4: Merging documents') + + try: + # Create a second PDF + pdf_path = assets_dir / 'sample.pdf' + + # Get the converted PDF from Example 1 + converted_pdf_path = output_dir / 'converted-document.pdf' + + # Merge the documents + result = await client.merge([converted_pdf_path, pdf_path]) + + # Save the merged document to the output directory + output_path = output_dir / 'merged-document.pdf' + with open(output_path, 'wb') as f: + f.write(result['buffer']) + + print(f'Merging successful. 
Output saved to: {output_path}') + return output_path + except Exception as error: + print(f'Merging failed: {error}') + raise error + + + + + +# Run all examples +async def run_examples(): + try: + print('Starting direct method examples...\n') + + # Run the examples in sequence + converted_pdf_path = await convert_document() + await extract_text(converted_pdf_path) + await add_watermark(converted_pdf_path) + await merge_documents() + + print('\nAll examples completed successfully!') + except Exception as error: + print(f'\nExamples failed: {error}') + + +# Execute the examples +if __name__ == '__main__': + asyncio.run(run_examples()) diff --git a/examples/src/workflow.py b/examples/src/workflow.py new file mode 100644 index 0000000..c6106d3 --- /dev/null +++ b/examples/src/workflow.py @@ -0,0 +1,176 @@ +""" +Workflow Example + +This example demonstrates how to use the Nutrient DWS Python Client +with the workflow builder pattern for document processing operations. +""" + +import asyncio +import os +import json +from pathlib import Path +from dotenv import load_dotenv + +from nutrient_dws import NutrientClient +from nutrient_dws.builder.constant import BuildActions + +# Load environment variables from .env file +load_dotenv() + +# Check if API key is provided +if not os.getenv('NUTRIENT_API_KEY'): + print('Error: NUTRIENT_API_KEY is not set in .env file') + exit(1) + +# Initialize the client with API key +client = NutrientClient(api_key=os.getenv('NUTRIENT_API_KEY')) + +# Define paths +assets_dir = Path(__file__).parent.parent / 'assets' +output_dir = Path(__file__).parent.parent / 'output' + +# Ensure output directory exists +output_dir.mkdir(parents=True, exist_ok=True) + + +# Example 1: Basic document conversion workflow +async def basic_conversion_workflow(): + print('Example 1: Basic document conversion workflow') + + try: + docx_path = assets_dir / 'sample.docx' + + result = await client.workflow() \ + .add_file_part(docx_path) \ + .output_pdf() \ + 
.execute() + + # Save the result to the output directory + output_path = output_dir / 'workflow-converted-document.pdf' + with open(output_path, 'wb') as f: + f.write(result['output']['buffer']) + + print(f'Conversion workflow successful. Output saved to: {output_path}') + print(f'MIME type: {result["output"]["mimeType"]}') + return output_path + except Exception as error: + print(f'Conversion workflow failed: {error}') + raise error + + +# Example 2: Document merging with watermark +async def merge_with_watermark_workflow(): + print('\nExample 2: Document merging with watermark workflow') + + try: + pdf_path = output_dir / 'workflow-converted-document.pdf' + png_path = assets_dir / 'sample.png' + + result = await client.workflow() \ + .add_file_part(pdf_path) \ + .add_file_part(png_path) \ + .apply_action(BuildActions.watermark_text('CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 48, + 'fontColor': '#FF0000' + })) \ + .output_pdf() \ + .execute() + + # Save the result to the output directory + output_path = output_dir / 'workflow-merged-watermarked.pdf' + with open(output_path, 'wb') as f: + f.write(result['output']['buffer']) + + print(f'Merge with watermark workflow successful. Output saved to: {output_path}') + return output_path + except Exception as error: + print(f'Merge with watermark workflow failed: {error}') + raise error + + +# Example 3: Extract text with JSON output +async def extract_text_workflow(file_path: Path): + print('\nExample 3: Extract text workflow with JSON output') + + try: + result = await client.workflow() \ + .add_file_part(file_path) \ + .output_json({ + 'plainText': True, + 'structuredText': True, + 'keyValuePairs': True, + 'tables': True + }) \ + .execute() + + # Save the result to the output directory + output_path = output_dir / 'workflow-extracted-text.json' + with open(output_path, 'w') as f: + json.dump(result['output']['data'], f, indent=2, default=str) + + print(f'Text extraction workflow successful. 
Output saved to: {output_path}') + return output_path + except Exception as error: + print(f'Text extraction workflow failed: {error}') + raise error + + +# Example 4: Complex multi-step workflow +async def complex_workflow(): + print('\nExample 4: Complex multi-step workflow') + + try: + pdf_path = output_dir / 'workflow-converted-document.pdf' + png_path = assets_dir / 'sample.png' + + result = await client.workflow() \ + .add_file_part(pdf_path) \ + .add_file_part(png_path) \ + .apply_actions([ + BuildActions.watermark_text('DRAFT', { + 'opacity': 0.3, + 'fontSize': 36, + 'fontColor': '#0000FF' + }), + BuildActions.rotate(90) + ]) \ + .output_pdfua({ + 'metadata': { + 'title': 'Complex Workflow Example', + 'author': 'Nutrient DWS Python Client' + } + }) \ + .execute(on_progress= lambda current, total: print(f'Processing step {current} of {total}')) + + # Save the result to the output directory + output_path = output_dir / 'workflow-complex-result.pdf' + with open(output_path, 'wb') as f: + f.write(result['output']['buffer']) + + print(f'Complex workflow successful. 
Output saved to: {output_path}') + return output_path + except Exception as error: + print(f'Complex workflow failed: {error}') + raise error + + +# Run all examples +async def run_examples(): + try: + print('Starting workflow examples...\n') + + # Run the examples in sequence + converted_pdf_path = await basic_conversion_workflow() + await merge_with_watermark_workflow() + await extract_text_workflow(converted_pdf_path) + await complex_workflow() + + print('\nAll workflow examples completed successfully!') + except Exception as error: + print(f'\nWorkflow examples failed: {error}') + + +# Execute the examples +if __name__ == '__main__': + asyncio.run(run_examples()) diff --git a/pixi.toml b/pixi.toml deleted file mode 100644 index 7f04cf1..0000000 --- a/pixi.toml +++ /dev/null @@ -1,30 +0,0 @@ -[project] -name = "nutrient-dws-client-python" -channels = ["conda-forge"] -platforms = ["osx-arm64", "osx-64", "linux-64", "win-64"] - -[dependencies] -python = ">=3.10,<3.13" -requests = ">=2.25.0,<3.0.0" - -[feature.dev.dependencies] -pytest = ">=7.0.0" -pytest-cov = ">=4.0.0" -mypy = ">=1.0.0" -ruff = ">=0.1.0" -types-requests = ">=2.25.0" - -[environments] -default = {features = ["dev"], solve-group = "default"} -dev = {features = ["dev"], solve-group = "default"} - -[tasks] -test = "pytest" -lint = "ruff check ." -format = "ruff format ." -typecheck = "mypy src/" -dev = "python -m pip install -e ." 
- -[pypi-dependencies] -build = ">=1.2.2.post1, <2" -twine = ">=6.1.0, <7" diff --git a/pyproject.toml b/pyproject.toml index 49f452b..fe461e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,9 +8,17 @@ package-dir = {"" = "src"} [tool.setuptools.packages.find] where = ["src"] +[tool.setuptools.package-data] +nutrient_dws = [ + "py.typed", + "../../docs/WORKFLOW.md", + "../../docs/METHODS.md", + "../../LLM_DOC.md", +] + [project] name = "nutrient-dws" -version = "1.0.2" +version = "2.0.0" description = "Python client library for Nutrient Document Web Services API" readme = "README.md" requires-python = ">=3.10" @@ -32,37 +40,40 @@ classifiers = [ "Topic :: Multimedia :: Graphics :: Graphics Conversion", ] dependencies = [ - "requests>=2.25.0,<3.0.0", + "httpx>=0.24.0,<1.0.0", + "aiofiles>=23.0.0,<25.0.0", + "typing_extensions>=4.9.0" ] [project.optional-dependencies] dev = [ "pytest>=7.0.0", + "pytest-asyncio>=0.21.0", "pytest-cov>=4.0.0", "mypy>=1.0.0", "ruff>=0.1.0", - "types-requests>=2.25.0", - "build>=1.0.0", - "twine>=4.0.0", -] -docs = [ - "sphinx>=5.0.0", - "sphinx-rtd-theme>=1.2.0", - "sphinx-autodoc-typehints>=1.22.0", + "types-aiofiles>=24.1.0", + "build>=1.2.2.post1,<2", + "twine>=6.1.0,<7", + "python-dotenv>=1.1.1" ] [project.urls] Homepage = "https://github.com/PSPDFKit/nutrient-dws-client-python" -Documentation = "https://nutrient-dws-client-python.readthedocs.io" +Documentation = "https://github.com/PSPDFKit/nutrient-dws-client-python/blob/main/README.md" Repository = "https://github.com/PSPDFKit/nutrient-dws-client-python" "Bug Tracker" = "https://github.com/PSPDFKit/nutrient-dws-client-python/issues" -[tool.setuptools.package-data] -nutrient_dws = ["py.typed"] +[project.scripts] +dws-add-claude-code-rule = "scripts.add_claude_code_rule:main" +dws-add-cursor-rule = "scripts.add_cursor_rule:main" +dws-add-github-copilot-rule = "scripts.add_github_copilot_rule:main" +dws-add-junie-rule = "scripts.add_junie_rule:main" +dws-add-windsurf-rule = 
"scripts.add_windsurf_rule:main" [tool.ruff] target-version = "py310" -line-length = 100 + [tool.ruff.lint] select = [ @@ -80,17 +91,28 @@ select = [ "RUF", # Ruff-specific rules ] ignore = [ + "E501", # Line too long + "B904", # Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling "D100", # Missing docstring in public module + "D101", # Missing docstring in public class + "D103", # Missing docstring in public function "D104", # Missing docstring in public package "D107", # Missing docstring in __init__ + "D205", # 1 blank line required between summary line and description + "UP007", # Use `X | Y` for type annotations "UP038", # Use `X | Y` in `isinstance` call instead of `(X, Y)` - not supported in Python 3.10 runtime + "UP045", # Use `X | None` for type annotations + "N802", # Function name should be lowercase + "N803", # Argument name should be lowercase + "N815", # Variable in class scope should not be mixedCase + "N811", # Constant imported as non-constant ] [tool.ruff.lint.pydocstyle] convention = "google" [tool.ruff.lint.per-file-ignores] -"tests/*" = ["D", "S101"] # Don't require docstrings in tests, allow asserts +"tests/*" = [] # Don't require docstrings in tests, allow asserts [tool.mypy] python_version = "3.10" @@ -130,4 +152,4 @@ exclude_lines = [ ] [tool.coverage.html] -directory = "htmlcov" \ No newline at end of file +directory = "htmlcov" diff --git a/pytest.ini b/pytest.ini index 0b63f27..9a9cab7 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,4 +3,5 @@ testpaths = tests python_files = test_*.py python_classes = Test* python_functions = test_* -addopts = -ra \ No newline at end of file +addopts = -ra --tb=short +asyncio_mode = auto diff --git a/scripts/build_package.py b/scripts/build_package.py deleted file mode 100755 index c2e7a23..0000000 --- a/scripts/build_package.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -"""Build the package for 
distribution.""" - -import subprocess -import sys -from pathlib import Path - - -def main(): - """Build the package.""" - root_dir = Path(__file__).parent.parent - - # Clean previous builds - print("Cleaning previous builds...") - for dir_name in ["dist", "build", "*.egg-info"]: - subprocess.run(["rm", "-rf", str(root_dir / dir_name)]) - - # Build the package - print("Building package...") - result = subprocess.run( - [sys.executable, "-m", "build"], cwd=root_dir, capture_output=True, text=True - ) - - if result.returncode != 0: - print(f"Build failed:\n{result.stderr}") - return 1 - - print("Build successful!") - print("\nBuilt files:") - dist_dir = root_dir / "dist" - for file in dist_dir.iterdir(): - print(f" - {file.name}") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/generate_api_methods.py b/scripts/generate_api_methods.py deleted file mode 100755 index 68401b3..0000000 --- a/scripts/generate_api_methods.py +++ /dev/null @@ -1,376 +0,0 @@ -#!/usr/bin/env python3 -"""Generate Direct API methods from OpenAPI specification.""" - -import re -from pathlib import Path -from typing import Any - - -def to_snake_case(name: str) -> str: - """Convert string to snake_case.""" - # Handle common patterns - name = name.replace("-", "_") - # Insert underscore before uppercase letters - name = re.sub(r"(? str: - """Convert OpenAPI schema type to Python type hint.""" - if not schema: - return "Any" - - type_mapping = { - "string": "str", - "integer": "int", - "number": "float", - "boolean": "bool", - "array": "List[Any]", - "object": "Dict[str, Any]", - } - - schema_type = schema.get("type", "string") - return type_mapping.get(schema_type, "Any") - - -def create_manual_tools() -> list[dict[str, Any]]: - """Create tool definitions based on the specification documentation. - - Since the Nutrient API uses a build endpoint with actions rather than - individual tool endpoints, we'll create convenience methods that wrap - the build API. 
- """ - tools = [ - { - "tool_name": "convert-to-pdf", - "method_name": "convert_to_pdf", - "summary": "Convert a document to PDF", - "description": "Convert various document formats (DOCX, XLSX, PPTX, etc.) to PDF.", - "parameters": {}, - }, - { - "tool_name": "convert-to-pdfa", - "method_name": "convert_to_pdfa", - "summary": "Convert a document to PDF/A", - "description": "Convert documents to PDF/A format for long-term archiving.", - "parameters": { - "conformance_level": { - "type": "str", - "required": False, - "description": "PDF/A conformance level (e.g., '2b', '3b')", - "default": "2b", - }, - }, - }, - { - "tool_name": "ocr-pdf", - "method_name": "ocr_pdf", - "summary": "Perform OCR on a PDF", - "description": "Apply optical character recognition to make scanned PDFs searchable.", - "parameters": { - "language": { - "type": "str", - "required": False, - "description": "OCR language code (e.g., 'en', 'de', 'fr')", - "default": "en", - }, - }, - }, - { - "tool_name": "rotate-pages", - "method_name": "rotate_pages", - "summary": "Rotate PDF pages", - "description": "Rotate pages in a PDF document.", - "parameters": { - "degrees": { - "type": "int", - "required": True, - "description": "Rotation angle in degrees (90, 180, 270)", - }, - "page_indexes": { - "type": "List[int]", - "required": False, - "description": ( - "List of page indexes to rotate (0-based). " - "If not specified, all pages are rotated." 
- ), - }, - }, - }, - { - "tool_name": "flatten-annotations", - "method_name": "flatten_annotations", - "summary": "Flatten PDF annotations", - "description": "Flatten annotations and form fields in a PDF.", - "parameters": {}, - }, - { - "tool_name": "watermark-pdf", - "method_name": "watermark_pdf", - "summary": "Add watermark to PDF", - "description": "Add text or image watermark to PDF pages.", - "parameters": { - "text": { - "type": "str", - "required": False, - "description": "Watermark text", - }, - "image_url": { - "type": "str", - "required": False, - "description": "URL of watermark image", - }, - "opacity": { - "type": "float", - "required": False, - "description": "Watermark opacity (0.0 to 1.0)", - "default": 0.5, - }, - "position": { - "type": "str", - "required": False, - "description": "Watermark position", - "default": "center", - }, - }, - }, - { - "tool_name": "sign-pdf", - "method_name": "sign_pdf", - "summary": "Digitally sign a PDF", - "description": "Add a digital signature to a PDF document.", - "parameters": { - "certificate_file": { - "type": "FileInput", - "required": True, - "description": "Digital certificate file (P12/PFX format)", - }, - "certificate_password": { - "type": "str", - "required": True, - "description": "Certificate password", - }, - "reason": { - "type": "str", - "required": False, - "description": "Reason for signing", - }, - "location": { - "type": "str", - "required": False, - "description": "Location of signing", - }, - }, - }, - { - "tool_name": "redact-pdf", - "method_name": "redact_pdf", - "summary": "Redact sensitive information from PDF", - "description": "Use AI to automatically redact sensitive information from a PDF.", - "parameters": { - "types": { - "type": "List[str]", - "required": False, - "description": "Types of information to redact (e.g., 'email', 'phone', 'ssn')", - }, - }, - }, - { - "tool_name": "export-pdf-to-office", - "method_name": "export_pdf_to_office", - "summary": "Export PDF to Office 
format", - "description": "Convert PDF to Microsoft Office formats (DOCX, XLSX, PPTX).", - "parameters": { - "format": { - "type": "str", - "required": True, - "description": "Output format ('docx', 'xlsx', 'pptx')", - }, - }, - }, - { - "tool_name": "export-pdf-to-images", - "method_name": "export_pdf_to_images", - "summary": "Export PDF pages as images", - "description": "Convert PDF pages to image files.", - "parameters": { - "format": { - "type": "str", - "required": False, - "description": "Image format ('png', 'jpeg', 'webp')", - "default": "png", - }, - "dpi": { - "type": "int", - "required": False, - "description": "Image resolution in DPI", - "default": 150, - }, - "page_indexes": { - "type": "List[int]", - "required": False, - "description": "List of page indexes to export (0-based)", - }, - }, - }, - ] - - return tools - - -def generate_method_code(tool_info: dict[str, Any]) -> str: - """Generate Python method code for a tool.""" - method_name = tool_info["method_name"] - tool_name = tool_info["tool_name"] - summary = tool_info["summary"] - description = tool_info["description"] - parameters = tool_info["parameters"] - - # Build parameter list - param_list = ["self", "input_file: FileInput"] - param_docs = [] - - # Add required parameters first - for param_name, param_info in parameters.items(): - if param_info["required"]: - param_type = param_info["type"] - # Handle imports for complex types - if param_type == "FileInput": - param_type = "'FileInput'" # Forward reference - param_list.append(f"{param_name}: {param_type}") - param_docs.append(f" {param_name}: {param_info['description']}") - - # Always add output_path - param_list.append("output_path: Optional[str] = None") - - # Add optional parameters - for param_name, param_info in parameters.items(): - if not param_info["required"]: - param_type = param_info["type"] - # Handle List types - base_type = param_type - - default = param_info.get("default") - if default is None: - 
param_list.append(f"{param_name}: Optional[{base_type}] = None") - else: - if isinstance(default, str): - param_list.append(f'{param_name}: {base_type} = "{default}"') - else: - param_list.append(f"{param_name}: {base_type} = {default}") - param_docs.append(f" {param_name}: {param_info['description']}") - - # Build method signature - if len(param_list) > 3: # Multiple parameters - params_str = ",\n ".join(param_list) - method_signature = ( - f" def {method_name}(\n {params_str},\n ) -> Optional[bytes]:" - ) - else: - params_str = ", ".join(param_list) - method_signature = f" def {method_name}({params_str}) -> Optional[bytes]:" - - # Build docstring - docstring_lines = [f' """{summary}'] - if description and description != summary: - docstring_lines.append("") - docstring_lines.append(f" {description}") - - docstring_lines.extend( - [ - "", - " Args:", - " input_file: Input file (path, bytes, or file-like object).", - ] - ) - - if param_docs: - docstring_lines.extend(param_docs) - - docstring_lines.extend( - [ - " output_path: Optional path to save the output file.", - "", - " Returns:", - " Processed file as bytes, or None if output_path is provided.", - "", - " Raises:", - " AuthenticationError: If API key is missing or invalid.", - " APIError: For other API errors.", - ' """', - ] - ) - - # Build method body - method_body = [] - - # Collect kwargs - kwargs_params = [f"{name}={name}" for name in parameters] - - if kwargs_params: - kwargs_str = ", ".join(kwargs_params) - method_body.append( - f' return self._process_file("{tool_name}", input_file, ' - f"output_path, {kwargs_str})" - ) - else: - method_body.append( - f' return self._process_file("{tool_name}", input_file, output_path)' - ) - - # Combine all parts - return "\n".join( - [ - method_signature, - "\n".join(docstring_lines), - "\n".join(method_body), - ] - ) - - -def generate_api_methods(spec_path: Path, output_path: Path) -> None: - """Generate API methods from OpenAPI specification.""" - # For Nutrient 
API, we'll use manually defined tools since they use - # a build endpoint with actions rather than individual endpoints - tools = create_manual_tools() - - # Sort tools by method name - tools.sort(key=lambda t: t["method_name"]) - - # Generate code - code_lines = [ - '"""Direct API methods for individual document processing tools.', - "", - "This file provides convenient methods that wrap the Nutrient Build API", - "for common document processing operations.", - '"""', - "", - "from typing import List, Optional", - "", - "from nutrient_dws.file_handler import FileInput", - "", - "", - "class DirectAPIMixin:", - ' """Mixin class containing Direct API methods.', - " ", - " These methods provide a simplified interface to common document", - " processing operations. They internally use the Build API.", - ' """', - "", - ] - - # Add methods - for tool in tools: - code_lines.append(generate_method_code(tool)) - code_lines.append("") # Empty line between methods - - # Write to file - output_path.write_text("\n".join(code_lines)) - print(f"Generated {len(tools)} API methods in {output_path}") - - -if __name__ == "__main__": - spec_path = Path("openapi_spec.yml") - output_path = Path("src/nutrient/api/direct.py") - - generate_api_methods(spec_path, output_path) diff --git a/src/nutrient_dws/__init__.py b/src/nutrient_dws/__init__.py index 2e260e0..0cab185 100644 --- a/src/nutrient_dws/__init__.py +++ b/src/nutrient_dws/__init__.py @@ -4,22 +4,32 @@ """ from nutrient_dws.client import NutrientClient -from nutrient_dws.exceptions import ( +from nutrient_dws.errors import ( APIError, AuthenticationError, - FileProcessingError, + NetworkError, NutrientError, - NutrientTimeoutError, ValidationError, ) +from nutrient_dws.inputs import ( + is_remote_file_input, + process_file_input, + process_remote_file_input, + validate_file_input, +) +from nutrient_dws.utils import get_library_version, get_user_agent -__version__ = "1.0.2" __all__ = [ "APIError", "AuthenticationError", - 
"FileProcessingError", + "NetworkError", "NutrientClient", "NutrientError", - "NutrientTimeoutError", "ValidationError", + "get_library_version", + "get_user_agent", + "is_remote_file_input", + "process_file_input", + "process_remote_file_input", + "validate_file_input", ] diff --git a/src/nutrient_dws/api/__init__.py b/src/nutrient_dws/api/__init__.py deleted file mode 100644 index 72b9fda..0000000 --- a/src/nutrient_dws/api/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""API module for Nutrient DWS client.""" diff --git a/src/nutrient_dws/api/direct.py b/src/nutrient_dws/api/direct.py deleted file mode 100644 index 690289c..0000000 --- a/src/nutrient_dws/api/direct.py +++ /dev/null @@ -1,1507 +0,0 @@ -"""Direct API methods for supported document processing tools. - -This file provides convenient methods that wrap the Nutrient Build API -for supported document processing operations. -""" - -from typing import TYPE_CHECKING, Any, Protocol - -from nutrient_dws.file_handler import FileInput - -if TYPE_CHECKING: - from nutrient_dws.builder import BuildAPIWrapper - from nutrient_dws.http_client import HTTPClient - - -class HasBuildMethod(Protocol): - """Protocol for objects that have a build method.""" - - def build(self, input_file: FileInput) -> "BuildAPIWrapper": - """Build method signature.""" - ... - - @property - def _http_client(self) -> "HTTPClient": - """HTTP client property.""" - ... - - -class DirectAPIMixin: - """Mixin class containing Direct API methods. - - These methods provide a simplified interface to common document - processing operations. They internally use the Build API. - - Note: The API automatically converts supported document formats - (DOCX, XLSX, PPTX) to PDF when processing. 
- """ - - def _process_file( - self, - tool: str, - input_file: FileInput, - output_path: str | None = None, - **options: Any, - ) -> bytes | None: - """Process file method that will be provided by NutrientClient.""" - raise NotImplementedError("This method is provided by NutrientClient") - - def convert_to_pdf( - self, - input_file: FileInput, - output_path: str | None = None, - ) -> bytes | None: - """Convert a document to PDF. - - Converts Office documents (DOCX, XLSX, PPTX) to PDF format. - This uses the API's implicit conversion - simply uploading a - non-PDF document returns it as a PDF. - - Args: - input_file: Input document (DOCX, XLSX, PPTX, etc). - output_path: Optional path to save the output PDF. - - Returns: - Converted PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors (e.g., unsupported format). - - Note: - HTML files are not currently supported by the API. - """ - # Use builder with no actions - implicit conversion happens - # Type checking: at runtime, self is NutrientClient which has these methods - return self.build(input_file).execute(output_path) # type: ignore[attr-defined,no-any-return] - - def flatten_annotations( - self, input_file: FileInput, output_path: str | None = None - ) -> bytes | None: - """Flatten annotations and form fields in a PDF. - - Converts all annotations and form fields into static page content. - If input is an Office document, it will be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - - Returns: - Processed file as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. 
- """ - return self._process_file("flatten-annotations", input_file, output_path) - - def rotate_pages( - self, - input_file: FileInput, - output_path: str | None = None, - degrees: int = 0, - page_indexes: list[int] | None = None, - ) -> bytes | None: - """Rotate pages in a PDF. - - Rotate all pages or specific pages by the specified degrees. - If input is an Office document, it will be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - degrees: Rotation angle (90, 180, 270, or -90). - page_indexes: Optional list of page indexes to rotate (0-based). - - Returns: - Processed file as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - """ - options = {"degrees": degrees} - if page_indexes is not None: - options["page_indexes"] = page_indexes # type: ignore - return self._process_file("rotate-pages", input_file, output_path, **options) - - def ocr_pdf( - self, - input_file: FileInput, - output_path: str | None = None, - language: str = "english", - ) -> bytes | None: - """Apply OCR to a PDF to make it searchable. - - Performs optical character recognition on the PDF to extract text - and make it searchable. If input is an Office document, it will - be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - language: OCR language. Supported: "english", "eng", "deu", "german". - Default is "english". - - Returns: - Processed file as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. 
- """ - return self._process_file("ocr-pdf", input_file, output_path, language=language) - - def watermark_pdf( - self, - input_file: FileInput, - output_path: str | None = None, - text: str | None = None, - image_url: str | None = None, - image_file: FileInput | None = None, - width: int = 200, - height: int = 100, - opacity: float = 1.0, - position: str = "center", - ) -> bytes | None: - """Add a watermark to a PDF. - - Adds a text or image watermark to all pages of the PDF. - If input is an Office document, it will be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - text: Text to use as watermark. One of text, image_url, or image_file required. - image_url: URL of image to use as watermark. - image_file: Local image file to use as watermark (path, bytes, or file-like object). - Supported formats: PNG, JPEG, TIFF. - width: Width of the watermark in points (required). - height: Height of the watermark in points (required). - opacity: Opacity of the watermark (0.0 to 1.0). - position: Position of watermark. One of: "top-left", "top-center", - "top-right", "center", "bottom-left", "bottom-center", - "bottom-right". - - Returns: - Processed file as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If none of text, image_url, or image_file is provided. 
- """ - if not text and not image_url and not image_file: - raise ValueError("Either text, image_url, or image_file must be provided") - - # For image file uploads, we need to use the builder directly - if image_file: - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output - - # Prepare files for upload - files = {} - - # Main PDF file - file_field, file_data = prepare_file_for_upload(input_file, "file") - files[file_field] = file_data - - # Watermark image file - image_field, image_data = prepare_file_for_upload(image_file, "watermark") - files[image_field] = image_data - - # Build instructions with watermark action - action = { - "type": "watermark", - "width": width, - "height": height, - "opacity": opacity, - "position": position, - "image": "watermark", # Reference to the uploaded image file - } - - instructions = {"parts": [{"file": "file"}], "actions": [action]} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - # For text and URL watermarks, use the existing _process_file approach - options = { - "width": width, - "height": height, - "opacity": opacity, - "position": position, - } - - if text: - options["text"] = text - else: - options["image_url"] = image_url - - return self._process_file("watermark-pdf", input_file, output_path, **options) - - def apply_redactions( - self, - input_file: FileInput, - output_path: str | None = None, - ) -> bytes | None: - """Apply redaction annotations to permanently remove content. - - Applies any redaction annotations in the PDF to permanently remove - the underlying content. If input is an Office document, it will - be converted to PDF first. 
- - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - - Returns: - Processed file as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - """ - return self._process_file("apply-redactions", input_file, output_path) - - def create_redactions_preset( - self, - input_file: FileInput, - preset: str, - output_path: str | None = None, - include_annotations: bool = False, - appearance_fill_color: str | None = None, - appearance_stroke_color: str | None = None, - ) -> bytes | None: - """Create redaction annotations using a preset pattern. - - Creates redaction annotations for common sensitive data patterns - like social security numbers, credit card numbers, etc. - - Args: - input_file: Input PDF file. - preset: Preset pattern to use. Valid options: - - "social-security-number": US Social Security Number - - "credit-card-number": Credit card numbers - - "international-phone-number": International phone numbers - - "north-american-phone-number": North America phone numbers - - "date": Date patterns - - "time": Time patterns - - "us-zip-code": US Zip Code patterns - - "email-address": Email addresses - output_path: Optional path to save the output file. - include_annotations: Include text in annotations (default: False). - appearance_fill_color: Fill color for redaction boxes (hex format). - appearance_stroke_color: Stroke color for redaction boxes (hex format). - - Returns: - PDF with redaction annotations as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - - Note: - This creates redaction annotations but does not apply them. - Use apply_redactions() to permanently remove the content. 
- """ - options = { - "strategy": "preset", - "strategy_options": { - "preset": preset, - "includeAnnotations": include_annotations, - }, - } - - # Add appearance options if provided - content = {} - if appearance_fill_color: - content["fillColor"] = appearance_fill_color - if appearance_stroke_color: - content["outlineColor"] = appearance_stroke_color - - if content: - options["content"] = content - - return self._process_file("create-redactions", input_file, output_path, **options) - - def create_redactions_regex( - self, - input_file: FileInput, - pattern: str, - output_path: str | None = None, - case_sensitive: bool = False, - include_annotations: bool = False, - appearance_fill_color: str | None = None, - appearance_stroke_color: str | None = None, - ) -> bytes | None: - """Create redaction annotations using a regex pattern. - - Creates redaction annotations for text matching a regular expression. - - Args: - input_file: Input PDF file. - pattern: Regular expression pattern to match. - output_path: Optional path to save the output file. - case_sensitive: Whether pattern matching is case-sensitive (default: False). - include_annotations: Include text in annotations (default: False). - include_text: Include regular text content (default: True). - appearance_fill_color: Fill color for redaction boxes (hex format). - appearance_stroke_color: Stroke color for redaction boxes (hex format). - - Returns: - PDF with redaction annotations as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - - Note: - This creates redaction annotations but does not apply them. - Use apply_redactions() to permanently remove the content. 
- """ - options = { - "strategy": "regex", - "strategy_options": { - "regex": pattern, - "caseSensitive": case_sensitive, - "includeAnnotations": include_annotations, - }, - } - - # Add appearance options if provided - content = {} - if appearance_fill_color: - content["fillColor"] = appearance_fill_color - if appearance_stroke_color: - content["outlineColor"] = appearance_stroke_color - - if content: - options["content"] = content - - return self._process_file("create-redactions", input_file, output_path, **options) - - def create_redactions_text( - self, - input_file: FileInput, - text: str, - output_path: str | None = None, - case_sensitive: bool = True, - include_annotations: bool = False, - appearance_fill_color: str | None = None, - appearance_stroke_color: str | None = None, - ) -> bytes | None: - """Create redaction annotations for exact text matches. - - Creates redaction annotations for all occurrences of specific text. - - Args: - input_file: Input PDF file. - text: Exact text to redact. - output_path: Optional path to save the output file. - case_sensitive: Whether text matching is case-sensitive (default: True). - include_annotations: Include text in annotations (default: False). - appearance_fill_color: Fill color for redaction boxes (hex format). - appearance_stroke_color: Stroke color for redaction boxes (hex format). - - Returns: - PDF with redaction annotations as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - - Note: - This creates redaction annotations but does not apply them. - Use apply_redactions() to permanently remove the content. 
- """ - options = { - "strategy": "text", - "strategy_options": { - "text": text, - "caseSensitive": case_sensitive, - "includeAnnotations": include_annotations, - }, - } - - # Add appearance options if provided - content = {} - if appearance_fill_color: - content["fillColor"] = appearance_fill_color - if appearance_stroke_color: - content["outlineColor"] = appearance_stroke_color - - if content: - options["content"] = content - - return self._process_file("create-redactions", input_file, output_path, **options) - - def optimize_pdf( - self, - input_file: FileInput, - output_path: str | None = None, - grayscale_text: bool = False, - grayscale_graphics: bool = False, - grayscale_images: bool = False, - grayscale_form_fields: bool = False, - grayscale_annotations: bool = False, - disable_images: bool = False, - mrc_compression: bool = False, - image_optimization_quality: int | None = 2, - linearize: bool = False, - ) -> bytes | None: - """Optimize a PDF to reduce file size. - - Applies various optimization techniques to reduce the file size of a PDF - while maintaining readability. If input is an Office document, it will - be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - grayscale_text: Convert text to grayscale (default: False). - grayscale_graphics: Convert graphics to grayscale (default: False). - grayscale_images: Convert images to grayscale (default: False). - grayscale_form_fields: Convert form_fields to grayscale (default: False). - grayscale_annotations: Convert annotations to grayscale (default: False). - disable_images: Remove all images from the PDF (default: False). - mrc_compression: MCR compression (default: False). - image_optimization_quality: Image optimization quality from 1 (least optimized) - to 4 (most optimized) (default: 2). - linearize: Linearize (optimize for web viewing) the PDF (default: False). 
- - Returns: - Optimized PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If image_optimization_quality is not between 1-4 - or no optimization is enabled - - Example: - # Aggressive optimization for minimum file size - client.optimize_pdf( - "large_document.pdf", - grayscale_images=True, - image_optimization_quality=4, - output_path="optimized.pdf" - ) - """ - options: dict[str, Any] = {} - - # Add grayscale options - if grayscale_text: - options["grayscale_text"] = True - if grayscale_graphics: - options["grayscale_graphics"] = True - if grayscale_images: - options["grayscale_images"] = True - if grayscale_form_fields: - options["grayscale_form_fields"] = True - if grayscale_annotations: - options["grayscale_annotations"] = True - - # Add MCR compression - if mrc_compression: - options["mrc_compression"] = True - - # Add image options - if disable_images: - options["disable_images"] = True - if image_optimization_quality is not None: - if not 1 <= image_optimization_quality <= 4: - raise ValueError("image_optimization_quality must be between 1 and 4") - options["image_optimization_quality"] = image_optimization_quality - - # Add linearization - if linearize: - options["linearize"] = True - - # Build using the Builder API with output options - builder = self.build(input_file) # type: ignore[attr-defined] - - # Apply optimization via output options - if options: - # If there are specific options, set optimize to the options dict - builder.set_output_options(optimize=options) - else: - # If no options, raise error - raise ValueError("No optimization is enabled") - return builder.execute(output_path) # type: ignore[no-any-return] - - def password_protect_pdf( - self, - input_file: FileInput, - output_path: str | None = None, - user_password: str | None = None, - owner_password: str | None = None, - permissions: list[str] | None = None, - ) -> bytes | 
None: - """Add password protection and permissions to a PDF. - - Secures a PDF with password protection and optional permission restrictions. - If input is an Office document, it will be converted to PDF first. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - user_password: Password required to open the document. - owner_password: Password required to change permissions/security settings. - If not provided, uses user_password. - permissions: Array of permission strings. Available permissions: - - "printing": Allow printing - - "modification": Allow document modification - - "extract": Allow content extraction - - "annotations_and_forms": Allow adding annotations - - "fill_forms": Allow filling forms - - "extract_accessibility": Allow accessibility features - - "assemble": Allow document assembly - - "print_high_quality": Allow high-quality printing - - Returns: - Protected PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If neither user_password nor owner_password is provided. 
- - Example: - # Protect with view-only permissions (only allowing extract_accessibility) - client.password_protect_pdf( - "sensitive.pdf", - user_password="view123", - owner_password="admin456", - permissions=["extract_accessibility"], - output_path="protected.pdf" - ) - """ - if not user_password and not owner_password: - raise ValueError("At least one of user_password or owner_password must be provided") - - # Build using the Builder API with output options - builder = self.build(input_file) # type: ignore[attr-defined] - - # Set up password options with camelCase for API - password_options: dict[str, Any] = {} - if user_password: - password_options["userPassword"] = user_password - if owner_password: - password_options["ownerPassword"] = owner_password - else: - # If no owner password provided, use user password - password_options["ownerPassword"] = user_password - - # Set up permissions if provided - if permissions: - password_options["permissions"] = permissions - - # Apply password protection via output options - builder.set_output_options(**password_options) - return builder.execute(output_path) # type: ignore[no-any-return] - - def set_pdf_metadata( - self, - input_file: FileInput, - output_path: str | None = None, - title: str | None = None, - author: str | None = None, - ) -> bytes | None: - """Set metadata properties of a PDF. - - Updates the metadata/document properties of a PDF file. - If input is an Office document, it will be converted to PDF first. - Only title and author metadata fields are supported. - - Args: - input_file: Input file (PDF or Office document). - output_path: Optional path to save the output file. - title: Document title. - author: Document author. - - Returns: - PDF with updated metadata as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If no metadata fields are provided. 
- - Example: - client.set_pdf_metadata( - "document.pdf", - title="Annual Report 2024", - author="John Doe", - output_path="document_with_metadata.pdf" - ) - """ - metadata = {} - if title is not None: - metadata["title"] = title - if author is not None: - metadata["author"] = author - - if not metadata: - raise ValueError("At least one metadata field must be provided") - - # Build using the Builder API with output options - builder = self.build(input_file) # type: ignore[attr-defined] - builder.set_output_options(metadata=metadata) - return builder.execute(output_path) # type: ignore[no-any-return] - - def split_pdf( - self, - input_file: FileInput, - page_ranges: list[dict[str, int]] | None = None, - output_paths: list[str] | None = None, - ) -> list[bytes]: - """Split a PDF into multiple documents by page ranges. - - Splits a PDF into multiple files based on specified page ranges. - Each range creates a separate output file. - - Args: - input_file: Input PDF file. - page_ranges: List of page range dictionaries. Each dict can contain: - - 'start': Starting page index (0-based, inclusive) - - 'end': Ending page index (0-based, inclusive) - - If not provided, splits into individual pages - output_paths: Optional list of paths to save output files. - Must match length of page_ranges if provided. - - Returns: - List of PDF bytes for each split, or empty list if output_paths provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If page_ranges and output_paths length mismatch. 
- - Examples: - # Split into individual pages - pages = client.split_pdf("document.pdf") - - # Split by custom ranges - parts = client.split_pdf( - "document.pdf", - page_ranges=[ - {"start": 0, "end": 4}, # Pages 1-5 - {"start": 5, "end": 9}, # Pages 6-10 - {"start": 10} # Pages 11 to end - ] - ) - - # Save to specific files - client.split_pdf( - "document.pdf", - page_ranges=[{"start": 0, "end": 1}, {"start": 2}], - output_paths=["part1.pdf", "part2.pdf"] - ) - """ - from nutrient_dws.file_handler import ( - get_pdf_page_count, - prepare_file_for_upload, - save_file_output, - ) - - # Validate inputs - if not page_ranges: - # Default behavior: extract first page only - page_ranges = [{"start": 0, "end": 0}] - - if len(page_ranges) > 50: - raise ValueError("Maximum 50 page ranges allowed") - - if output_paths and len(output_paths) != len(page_ranges): - raise ValueError("output_paths length must match page_ranges length") - - # Get total number of pages to validate ranges - num_of_pages = get_pdf_page_count(input_file) - - # Validate and adjust page ranges - for i, page_range in enumerate(page_ranges): - start = page_range.get("start", 0) - - # Validate start is within document bounds - if start < 0 or start >= num_of_pages: - raise ValueError( - f"Page range {i}: start index {start} is out of bounds (0-{num_of_pages - 1})" - ) - - # If end is specified, validate it's within document bounds - if "end" in page_range: - end = page_range["end"] - if end < 0 or end >= num_of_pages: - raise ValueError( - f"Page range {i}: end index {end} is out of bounds (0-{num_of_pages - 1})" - ) - if end < start: - raise ValueError( - f"Page range {i}: end index {end} cannot be less than start index {start}" - ) - - results = [] - - # Process each page range as a separate API call - for i, page_range in enumerate(page_ranges): - # Prepare file for upload - file_field, file_data = prepare_file_for_upload(input_file, "file") - files = {file_field: file_data} - - # Build instructions 
for page extraction - instructions = {"parts": [{"file": "file", "pages": page_range}], "actions": []} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_paths and i < len(output_paths): - save_file_output(result, output_paths[i]) - else: - results.append(result) # type: ignore[arg-type] - - return results if not output_paths else [] - - def duplicate_pdf_pages( - self, - input_file: FileInput, - page_indexes: list[int], - output_path: str | None = None, - ) -> bytes | None: - """Duplicate specific pages within a PDF document. - - Creates a new PDF containing the specified pages in the order provided. - Pages can be duplicated multiple times by including their index multiple times. - - Args: - input_file: Input PDF file. - page_indexes: List of page indexes to include (0-based). - Pages can be repeated to create duplicates. - Negative indexes are supported (-1 for last page). - output_path: Optional path to save the output file. - - Returns: - Processed PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If page_indexes is empty. 
- - Examples: - # Duplicate first page twice, then include second page - result = client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[0, 0, 1] # Page 1, Page 1, Page 2 - ) - - # Include last page at beginning and end - result = client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[-1, 0, 1, 2, -1] # Last, First, Second, Third, Last - ) - - # Save to specific file - client.duplicate_pdf_pages( - "document.pdf", - page_indexes=[0, 2, 1], # Reorder: Page 1, Page 3, Page 2 - output_path="reordered.pdf" - ) - """ - from nutrient_dws.file_handler import ( - get_pdf_page_count, - prepare_file_for_upload, - save_file_output, - ) - - # Validate inputs - if not page_indexes: - raise ValueError("page_indexes cannot be empty") - - # Prepare file for upload - file_field, file_data = prepare_file_for_upload(input_file, "file") - files = {file_field: file_data} - - # Get total number of pages to validate indexes - num_of_pages = get_pdf_page_count(input_file) - - # Build parts for each page index - parts = [] - for page_index in page_indexes: - if page_index < 0: - # For negative indexes, use the index directly (API supports negative indexes) - # No validation for negative indexes as they're handled by the API - parts.append({"file": "file", "pages": {"start": page_index, "end": page_index}}) - else: - # Validate positive indexes are within bounds - if page_index >= num_of_pages: - raise ValueError( - f"Page index {page_index} is out of bounds (0-{num_of_pages - 1})" - ) - # For positive indexes, create single-page range - parts.append({"file": "file", "pages": {"start": page_index, "end": page_index}}) - - # Build instructions for duplication - instructions = {"parts": parts, "actions": []} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - 
save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def delete_pdf_pages( - self, - input_file: FileInput, - page_indexes: list[int], - output_path: str | None = None, - ) -> bytes | None: - """Delete specific pages from a PDF document. - - Creates a new PDF with the specified pages removed. The API approach - works by selecting all pages except those to be deleted. - - Args: - input_file: Input PDF file. - page_indexes: List of page indexes to delete (0-based). 0 = first page. - Must be unique, sorted in ascending order. - Negative indexes are NOT supported. - output_path: Optional path to save the output file. - - Returns: - Processed PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If page_indexes is empty or contains negative indexes. - - Examples: - # Delete first and last pages (Note: negative indexes not supported) - result = client.delete_pdf_pages( - "document.pdf", - page_indexes=[0, 2] # Delete pages 1 and 3 - ) - - # Delete specific pages (2nd and 4th pages) - result = client.delete_pdf_pages( - "document.pdf", - page_indexes=[1, 3] # 0-based indexing - ) - - # Save to specific file - client.delete_pdf_pages( - "document.pdf", - page_indexes=[2, 4, 5], - output_path="pages_deleted.pdf" - ) - """ - from nutrient_dws.file_handler import ( - get_pdf_page_count, - prepare_file_for_upload, - save_file_output, - ) - - # Validate inputs - if not page_indexes: - raise ValueError("page_indexes cannot be empty") - - # Check for negative indexes - if any(idx < 0 for idx in page_indexes): - negative_indexes = [idx for idx in page_indexes if idx < 0] - raise ValueError( - f"Negative page indexes not yet supported for deletion: {negative_indexes}" - ) - - # Get total number of pages to validate indexes - num_of_pages = get_pdf_page_count(input_file) - - # Validate page indexes are within bounds 
- for idx in page_indexes: - if idx >= num_of_pages: - raise ValueError(f"Page index {idx} is out of bounds (0-{num_of_pages - 1})") - - # Prepare file for upload - file_field, file_data = prepare_file_for_upload(input_file, "file") - files = {file_field: file_data} - - # Sort page indexes to handle ranges efficiently - sorted_indexes = sorted(set(page_indexes)) # Remove duplicates and sort - - # Build parts for pages to keep (excluding the ones to delete) - # We need to create ranges that exclude the deleted pages - parts = [] - - # Start from page 0 - current_page = 0 - - for delete_index in sorted_indexes: - # Add range from current_page to delete_index-1 (inclusive) - if current_page < delete_index: - parts.append( - {"file": "file", "pages": {"start": current_page, "end": delete_index - 1}} - ) - - # Skip the deleted page - current_page = delete_index + 1 - - # Add remaining pages after the last deleted page - num_of_pages = get_pdf_page_count(input_file) - if ( - current_page > 0 or (current_page == 0 and len(sorted_indexes) == 0) - ) and current_page < num_of_pages: - # Add all remaining pages from current_page onwards - parts.append({"file": "file", "pages": {"start": current_page}}) - - # If no parts, it means we're trying to delete all pages - if not parts: - raise ValueError("Cannot delete all pages from document") - - # Build instructions for deletion (keeping non-deleted pages) - instructions = {"parts": parts, "actions": []} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def merge_pdfs( - self, - input_files: list[FileInput], - output_path: str | None = None, - ) -> bytes | None: - """Merge multiple PDF files into one. 
- - Combines multiple files into a single PDF in the order provided. - Office documents (DOCX, XLSX, PPTX) will be automatically converted - to PDF before merging. - - Args: - input_files: List of input files (PDFs or Office documents). - output_path: Optional path to save the output file. - - Returns: - Merged PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If less than 2 files provided. - - Example: - # Merge PDFs and Office documents - client.merge_pdfs([ - "document1.pdf", - "document2.docx", - "spreadsheet.xlsx" - ], "merged.pdf") - """ - if len(input_files) < 2: - raise ValueError("At least 2 files required for merge") - - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output - - # Prepare files for upload - files = {} - parts = [] - - for i, file in enumerate(input_files): - field_name = f"file{i}" - file_field, file_data = prepare_file_for_upload(file, field_name) - files[file_field] = file_data - parts.append({"file": field_name}) - - # Build instructions for merge (no actions needed) - instructions = {"parts": parts, "actions": []} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def add_page( - self, - input_file: FileInput, - insert_index: int, - page_count: int = 1, - page_size: str = "A4", - orientation: str = "portrait", - output_path: str | None = None, - ) -> bytes | None: - """Add blank pages to a PDF document. - - Inserts blank pages at the specified insertion index in the document. - - Args: - input_file: Input PDF file. - insert_index: Position to insert pages (0-based insertion index). 
- 0 = insert before first page (at beginning) - 1 = insert before second page (after first page) - -1 = insert after last page (at end) - page_count: Number of blank pages to add (default: 1). - page_size: Page size for new pages. Common values: "A4", "Letter", - "Legal", "A3", "A5" (default: "A4"). - orientation: Page orientation. Either "portrait" or "landscape" - (default: "portrait"). - output_path: Optional path to save the output file. - - Returns: - Processed PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If page_count is less than 1 or if insert_index is - a negative number other than -1. - - Examples: - # Add a single blank page at the beginning - result = client.add_page("document.pdf", insert_index=0) - - # Add multiple pages at the end - result = client.add_page( - "document.pdf", - insert_index=-1, # Insert at end - page_count=3, - page_size="Letter", - orientation="landscape" - ) - - # Add pages before third page and save to file - client.add_page( - "document.pdf", - insert_index=2, # Insert before third page - page_count=2, - output_path="with_blank_pages.pdf" - ) - """ - from nutrient_dws.file_handler import ( - get_pdf_page_count, - prepare_file_for_upload, - save_file_output, - ) - - # Validate inputs - if page_count < 1: - raise ValueError("page_count must be at least 1") - if page_count > 100: - raise ValueError("page_count cannot exceed 100 pages") - if insert_index < -1: - raise ValueError("insert_index must be -1 (for end) or a non-negative insertion index") - - # Get total number of pages to validate insert_index - if insert_index >= 0: # Skip validation for -1 (end) - num_of_pages = get_pdf_page_count(input_file) - if insert_index > num_of_pages: - raise ValueError(f"insert_index {insert_index} is out of bounds (0-{num_of_pages})") - - # Prepare file for upload - file_field, file_data = prepare_file_for_upload(input_file, 
"file") - files = {file_field: file_data} - - # Build parts array - parts: list[dict[str, Any]] = [] - - # Create new page part - new_page_part = { - "page": "new", - "pageCount": page_count, - "layout": { - "size": page_size, - "orientation": orientation, - }, - } - - if insert_index == -1: - # Insert at end: add all original pages first, then new pages - parts.append({"file": "file"}) - parts.append(new_page_part) - elif insert_index == 0: - # Insert at beginning: add new pages first, then all original pages - parts.append(new_page_part) - parts.append({"file": "file"}) - else: - # Insert at specific position: split original document - # Add pages from start up to insertion point (0 to insert_index-1) - parts.append({"file": "file", "pages": {"start": 0, "end": insert_index - 1}}) - - # Add new blank pages - parts.append(new_page_part) - - # Add remaining pages from insertion point to end - parts.append({"file": "file", "pages": {"start": insert_index}}) - - # Build instructions for adding pages - instructions = {"parts": parts, "actions": []} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def apply_instant_json( - self, - input_file: FileInput, - instant_json: FileInput | str, - output_path: str | None = None, - ) -> bytes | None: - """Apply Nutrient Instant JSON annotations to a PDF. - - Applies annotations from a Nutrient Instant JSON file or URL to a PDF. - This allows importing annotations exported from Nutrient SDK or other - compatible sources. - - Args: - input_file: Input PDF file. - instant_json: Instant JSON data as file path, bytes, file object, or URL. - output_path: Optional path to save the output file. 
- - Returns: - PDF with applied annotations as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - - Example: - # Apply annotations from file - client.apply_instant_json( - "document.pdf", - "annotations.json", - output_path="annotated.pdf" - ) - - # Apply annotations from URL - client.apply_instant_json( - "document.pdf", - "https://example.com/annotations.json", - output_path="annotated.pdf" - ) - """ - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output - - # Check if instant_json is a URL - if isinstance(instant_json, str) and ( - instant_json.startswith("http://") or instant_json.startswith("https://") - ): - # Use URL approach - action = { - "type": "applyInstantJson", - "file": {"url": instant_json}, - } - - # Prepare the PDF file - files = {} - file_field, file_data = prepare_file_for_upload(input_file, "file") - files[file_field] = file_data - - instructions = {"parts": [{"file": file_field}], "actions": [action]} - else: - # It's a file input - need to upload both files - files = {} - - # Main PDF file - file_field, file_data = prepare_file_for_upload(input_file, "file") - files[file_field] = file_data - - # Instant JSON file - json_field, json_data = prepare_file_for_upload(instant_json, "instant_json") - files[json_field] = json_data - - # Build instructions with applyInstantJson action - action = { - "type": "applyInstantJson", - "file": json_field, # Reference to the uploaded file - } - - instructions = {"parts": [{"file": file_field}], "actions": [action]} - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def 
apply_xfdf( - self, - input_file: FileInput, - xfdf: FileInput | str, - output_path: str | None = None, - ) -> bytes | None: - """Apply XFDF annotations to a PDF. - - Applies annotations from an XFDF (XML Forms Data Format) file or URL - to a PDF. XFDF is a standard format for exchanging PDF annotations. - - Args: - input_file: Input PDF file. - xfdf: XFDF data as file path, bytes, file object, or URL. - output_path: Optional path to save the output file. - - Returns: - PDF with applied annotations as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - - Example: - # Apply annotations from file - client.apply_xfdf( - "document.pdf", - "annotations.xfdf", - output_path="annotated.pdf" - ) - - # Apply annotations from URL - client.apply_xfdf( - "document.pdf", - "https://example.com/annotations.xfdf", - output_path="annotated.pdf" - ) - """ - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output - - # Check if xfdf is a URL - if isinstance(xfdf, str) and (xfdf.startswith("http://") or xfdf.startswith("https://")): - # Use URL approach - action = { - "type": "applyXfdf", - "file": {"url": xfdf}, - } - - # Prepare the PDF file - files = {} - file_field, file_data = prepare_file_for_upload(input_file, "file") - files[file_field] = file_data - - instructions = {"parts": [{"file": file_field}], "actions": [action]} - else: - # It's a file input - need to upload both files - files = {} - - # Main PDF file - file_field, file_data = prepare_file_for_upload(input_file, "file") - files[file_field] = file_data - - # XFDF file - xfdf_field, xfdf_data = prepare_file_for_upload(xfdf, "xfdf") - files[xfdf_field] = xfdf_data - - # Build instructions with applyXfdf action - action = { - "type": "applyXfdf", - "file": xfdf_field, # Reference to the uploaded file - } - - instructions = {"parts": [{"file": file_field}], "actions": [action]} - - # Make API request - # 
Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def set_page_label( - self, - input_file: FileInput, - labels: list[dict[str, Any]], - output_path: str | None = None, - ) -> bytes | None: - """Set labels for specific pages in a PDF. - - Assigns custom labels/numbering to specific page ranges in a PDF document. - Each label configuration specifies a page range and the label text to apply. - - Args: - input_file: Input PDF file. - labels: List of label configurations. Each dict must contain: - - 'pages': Page range dict with 'start' (required) and optionally 'end' - - 'label': String label to apply to those pages - Page ranges use 0-based indexing where 'end' is inclusive. - output_path: Optional path to save the output file. - - Returns: - Processed PDF as bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - ValueError: If labels list is empty or contains invalid configurations. 
- - Examples: - # Set labels for different page ranges - client.set_page_label( - "document.pdf", - labels=[ - {"pages": {"start": 0, "end": 2}, "label": "Introduction"}, - {"pages": {"start": 3, "end": 9}, "label": "Chapter 1"}, - {"pages": {"start": 10}, "label": "Appendix"} - ], - output_path="labeled_document.pdf" - ) - - # Set label for single page - client.set_page_label( - "document.pdf", - labels=[{"pages": {"start": 0, "end": 0}, "label": "Cover Page"}] - ) - """ - from nutrient_dws.file_handler import ( - get_pdf_page_count, - prepare_file_for_upload, - save_file_output, - ) - - # Validate inputs - if not labels: - raise ValueError("labels list cannot be empty") - - # Get total number of pages to validate ranges - num_of_pages = get_pdf_page_count(input_file) - - # Normalize labels to ensure proper format - normalized_labels = [] - for i, label_config in enumerate(labels): - if not isinstance(label_config, dict): - raise ValueError(f"Label configuration {i} must be a dictionary") - - if "pages" not in label_config: - raise ValueError(f"Label configuration {i} missing required 'pages' key") - - if "label" not in label_config: - raise ValueError(f"Label configuration {i} missing required 'label' key") - - pages = label_config["pages"] - if not isinstance(pages, dict) or "start" not in pages: - raise ValueError(f"Label configuration {i} 'pages' must be a dict with 'start' key") - - # Validate start is within document bounds - start = pages["start"] - if start < 0 or start >= num_of_pages: - raise ValueError( - f"Label configuration {i}: start index {start}" - f" is out of bounds (0-{num_of_pages - 1})" - ) - - # Normalize pages - only include 'end' if explicitly provided - normalized_pages = {"start": start} - if "end" in pages: - end = pages["end"] - # Validate end is within document bounds - if end < 0 or end >= num_of_pages: - raise ValueError( - f"Label configuration {i}: end index {end}" - f" is out of bounds (0-{num_of_pages - 1})" - ) - # Validate end 
is not less than start - if end < start: - raise ValueError( - f"Label configuration {i}: end index {end}" - f" cannot be less than start index {start}" - ) - normalized_pages["end"] = end - # If no end is specified, leave it out (meaning "to end of document") - - normalized_labels.append({"pages": normalized_pages, "label": label_config["label"]}) - - # Prepare file for upload - file_field, file_data = prepare_file_for_upload(input_file, "file") - files = {file_field: file_data} - - # Build instructions with page labels in output configuration - instructions = { - "parts": [{"file": "file"}], - "actions": [], - "output": {"labels": normalized_labels}, - } - - # Make API request - # Type checking: at runtime, self is NutrientClient which has _http_client - result = self._http_client.post( # type: ignore[attr-defined] - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] diff --git a/src/nutrient_dws/builder.py b/src/nutrient_dws/builder.py deleted file mode 100644 index bdada1f..0000000 --- a/src/nutrient_dws/builder.py +++ /dev/null @@ -1,268 +0,0 @@ -"""Builder API implementation for multi-step workflows.""" - -from typing import Any - -from nutrient_dws.file_handler import FileInput, prepare_file_for_upload, save_file_output - - -class BuildAPIWrapper: - r"""Builder pattern implementation for chaining document operations. - - This class provides a fluent interface for building complex document - processing workflows using the Nutrient Build API. - - Example: - >>> client.build(input_file="document.pdf") \\ - ... .add_step(tool="rotate-pages", options={"degrees": 90}) \\ - ... .add_step(tool="ocr-pdf", options={"language": "en"}) \\ - ... .add_step(tool="watermark-pdf", options={"text": "CONFIDENTIAL"}) \\ - ... 
.execute(output_path="processed.pdf") - """ - - def __init__(self, client: Any, input_file: FileInput) -> None: - """Initialize builder with client and input file. - - Args: - client: NutrientClient instance. - input_file: Input file to process. - """ - self._client = client - self._input_file = input_file - self._parts: list[dict[str, Any]] = [{"file": "file"}] # Main file - self._files: dict[str, FileInput] = {"file": input_file} # Track files - self._actions: list[dict[str, Any]] = [] - self._output_options: dict[str, Any] = {} - - def _add_file_part(self, file: FileInput, name: str) -> None: - """Add an additional file part for operations like merge. - - Args: - file: File to add. - name: Name for the file part. - """ - self._parts.append({"file": name}) - self._files[name] = file - - def add_step(self, tool: str, options: dict[str, Any] | None = None) -> "BuildAPIWrapper": - """Add a processing step to the workflow. - - Args: - tool: Tool identifier (e.g., 'rotate-pages', 'ocr-pdf'). - options: Optional parameters for the tool. - - Returns: - Self for method chaining. - - Example: - >>> builder.add_step(tool="rotate-pages", options={"degrees": 180}) - """ - action = self._map_tool_to_action(tool, options or {}) - self._actions.append(action) - return self - - def set_output_options(self, **options: Any) -> "BuildAPIWrapper": - """Set output options for the final document. - - Args: - **options: Output options (e.g., metadata, optimization). - - Returns: - Self for method chaining. - - Example: - >>> builder.set_output_options( - ... metadata={"title": "My Document", "author": "John Doe"}, - ... optimize=True - ... ) - """ - self._output_options.update(options) - return self - - def set_page_labels(self, labels: list[dict[str, Any]]) -> "BuildAPIWrapper": - """Set page labels for the final document. - - Assigns custom labels/numbering to specific page ranges in the output PDF. - - Args: - labels: List of label configurations. 
Each dict must contain: - - 'pages': Page range dict with 'start' (required) and optionally 'end' - - 'label': String label to apply to those pages - Page ranges use 0-based indexing where 'end' is inclusive. - - Returns: - Self for method chaining. - - Example: - >>> builder.set_page_labels([ - ... {"pages": {"start": 0, "end": 2}, "label": "Introduction"}, - ... {"pages": {"start": 3, "end": 9}, "label": "Chapter 1"}, - ... {"pages": {"start": 10}, "label": "Appendix"} - ... ]) - """ - self._output_options["labels"] = labels - return self - - def execute(self, output_path: str | None = None) -> bytes | None: - """Execute the workflow. - - Args: - output_path: Optional path to save the output file. - - Returns: - Processed file bytes, or None if output_path is provided. - - Raises: - AuthenticationError: If API key is missing or invalid. - APIError: For other API errors. - """ - # Prepare the build instructions - instructions = self._build_instructions() - - # Prepare files for upload - files = {} - for name, file in self._files.items(): - file_field, file_data = prepare_file_for_upload(file, name) - files[file_field] = file_data - - # Make API request - result = self._client._http_client.post( - "/build", - files=files, - json_data=instructions, - ) - - # Handle output - if output_path: - save_file_output(result, output_path) - return None - else: - return result # type: ignore[no-any-return] - - def _build_instructions(self) -> dict[str, Any]: - """Build the instructions payload for the API. - - Returns: - Instructions dictionary for the Build API. - """ - instructions = { - "parts": self._parts, - "actions": self._actions, - } - - # Add output options if specified - if self._output_options: - instructions["output"] = self._output_options # type: ignore - - return instructions - - def _map_tool_to_action(self, tool: str, options: dict[str, Any]) -> dict[str, Any]: - """Map tool name and options to Build API action format. - - Args: - tool: Tool identifier. 
- options: Tool options. - - Returns: - Action dictionary for the Build API. - """ - # Map tool names to action types - tool_mapping = { - "rotate-pages": "rotate", - "ocr-pdf": "ocr", - "watermark-pdf": "watermark", - "flatten-annotations": "flatten", - "apply-instant-json": "applyInstantJson", - "apply-xfdf": "applyXfdf", - "create-redactions": "createRedactions", - "apply-redactions": "applyRedactions", - } - - action_type = tool_mapping.get(tool, tool) - - # Build action dictionary - action = {"type": action_type} - - # Handle special cases for different action types using pattern matching - match action_type: - case "rotate": - action["rotateBy"] = options.get("degrees", 0) - if "page_indexes" in options: - action["pageIndexes"] = options["page_indexes"] - - case "ocr": - if "language" in options: - # Map common language codes to API format - lang_map = { - "en": "english", - "de": "deu", - "eng": "eng", - "deu": "deu", - "german": "deu", - } - lang = options["language"] - action["language"] = lang_map.get(lang, lang) - - case "watermark": - # Watermark requires width/height - action["width"] = options.get("width", 200) # Default width - action["height"] = options.get("height", 100) # Default height - - if "text" in options: - action["text"] = options["text"] - elif "image_url" in options: - action["image"] = {"url": options["image_url"]} # type: ignore - elif "image_file" in options: - # Handle image file upload - image_file = options["image_file"] - # Add the image as a file part - watermark_name = f"watermark_{len(self._files)}" - self._files[watermark_name] = image_file - # Reference the uploaded file - action["image"] = watermark_name # type: ignore - else: - # Default to text watermark if neither specified - action["text"] = "WATERMARK" - - if "opacity" in options: - action["opacity"] = options["opacity"] - if "position" in options: - action["position"] = options["position"] - - case "createRedactions": - # Handle create redactions - pass through 
directly - # The direct.py already formats everything correctly - if "strategy" in options: - action["strategy"] = options["strategy"] - if "strategy_options" in options: - action["strategyOptions"] = options["strategy_options"] - if "content" in options: - action["content"] = options["content"] - - case "optimize": - # Handle optimize action with camelCase conversion - for key, value in options.items(): - # Convert snake_case to camelCase for API - camel_key = "".join( - word.capitalize() if i else word for i, word in enumerate(key.split("_")) - ) - action[camel_key] = value - - case _: - # For other actions, pass options directly - action.update(options) - - return action - - def __str__(self) -> str: - """String representation of the build workflow.""" - steps = [f"{action['type']}" for action in self._actions] - return f"BuildAPIWrapper(steps={steps})" - - def __repr__(self) -> str: - """Detailed representation of the build workflow.""" - return ( - f"BuildAPIWrapper(" - f"input_file={self._input_file!r}, " - f"actions={self._actions!r}, " - f"output_options={self._output_options!r})" - ) diff --git a/tests/integration/__init__.py b/src/nutrient_dws/builder/__init__.py similarity index 100% rename from tests/integration/__init__.py rename to src/nutrient_dws/builder/__init__.py diff --git a/src/nutrient_dws/builder/base_builder.py b/src/nutrient_dws/builder/base_builder.py new file mode 100644 index 0000000..f09ff96 --- /dev/null +++ b/src/nutrient_dws/builder/base_builder.py @@ -0,0 +1,64 @@ +"""Base builder class that all builders extend from.""" + +from abc import ABC, abstractmethod +from typing import Literal, Union, overload + +from nutrient_dws.builder.staged_builders import ( + TypedWorkflowResult, +) +from nutrient_dws.errors import ValidationError +from nutrient_dws.http import ( + AnalyzeBuildRequestData, + BuildRequestData, + NutrientClientOptions, + RequestConfig, + is_post_analyse_build_request_config, + is_post_build_request_config, + 
send_request, +) +from nutrient_dws.types.analyze_response import AnalyzeBuildResponse +from nutrient_dws.types.build_response_json import BuildResponseJsonContents + + +class BaseBuilder(ABC): + """Base builder class that all builders extend from. + Provides common functionality for API interaction. + """ + + def __init__(self, client_options: NutrientClientOptions) -> None: + self.client_options = client_options + + @overload + async def _send_request( + self, path: Literal["/build"], options: BuildRequestData + ) -> Union[BuildResponseJsonContents, bytes, str]: ... + + @overload + async def _send_request( + self, path: Literal["/analyze_build"], options: AnalyzeBuildRequestData + ) -> AnalyzeBuildResponse: ... + + async def _send_request( + self, + path: Literal["/build", "/analyze_build"], + options: BuildRequestData | AnalyzeBuildRequestData, + ) -> Union[BuildResponseJsonContents, bytes, str, AnalyzeBuildResponse]: + """Sends a request to the API.""" + config = RequestConfig(endpoint=path, method="POST", data=options, headers=None) + + if is_post_build_request_config(config): + response = await send_request(config, self.client_options) + return response["data"] + + if is_post_analyse_build_request_config(config): + analyze_response = await send_request(config, self.client_options) + return analyze_response["data"] + + raise ValidationError( + "Invalid _send_request args", {"path": path, "options": options} + ) + + @abstractmethod + async def execute(self) -> TypedWorkflowResult: + """Abstract method that child classes must implement for execution.""" + pass diff --git a/src/nutrient_dws/builder/builder.py b/src/nutrient_dws/builder/builder.py new file mode 100644 index 0000000..5497e88 --- /dev/null +++ b/src/nutrient_dws/builder/builder.py @@ -0,0 +1,616 @@ +"""Staged workflow builder that provides compile-time safety through Python's type system.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypeGuard, cast + +from 
nutrient_dws.builder.base_builder import BaseBuilder +from nutrient_dws.builder.constant import ActionWithFileInput, BuildOutputs +from nutrient_dws.builder.staged_builders import ( + ApplicableAction, + BufferOutput, + ContentOutput, + JsonContentOutput, + TypedWorkflowResult, + WorkflowDryRunResult, + WorkflowError, + WorkflowExecuteCallback, + WorkflowWithActionsStage, + WorkflowWithOutputStage, + WorkflowWithPartsStage, +) +from nutrient_dws.errors import ValidationError +from nutrient_dws.http import ( + AnalyzeBuildRequestData, + BuildRequestData, + NutrientClientOptions, +) +from nutrient_dws.inputs import ( + FileInput, + NormalizedFileData, + is_remote_file_input, + process_file_input, + validate_file_input, +) +from nutrient_dws.types.file_handle import FileHandle, RemoteFileHandle + +if TYPE_CHECKING: + from nutrient_dws.types.build_actions import BuildAction + from nutrient_dws.types.build_instruction import BuildInstructions + from nutrient_dws.types.build_output import ( + BuildOutput, + ImageOutputOptions, + JSONContentOutputOptions, + PDFAOutputOptions, + PDFOutput, + PDFOutputOptions, + PDFUAOutputOptions, + ) + from nutrient_dws.types.build_response_json import BuildResponseJsonContents + from nutrient_dws.types.input_parts import ( + DocumentPart, + DocumentPartOptions, + FilePart, + FilePartOptions, + HTMLPart, + HTMLPartOptions, + NewPagePart, + NewPagePartOptions, + ) + + +class StagedWorkflowBuilder( + BaseBuilder, + WorkflowWithPartsStage, + WorkflowWithOutputStage, +): + """Staged workflow builder that provides compile-time safety through Python's type system. + This builder ensures methods are only available at appropriate stages of the workflow. + """ + + def __init__(self, client_options: NutrientClientOptions) -> None: + """Initialize the staged workflow builder. 
+ + Args: + client_options: Client configuration options + """ + super().__init__(client_options) + self.build_instructions: BuildInstructions = {"parts": []} + self.assets: dict[str, FileInput] = {} + self.asset_index = 0 + self.current_step = 0 + self.is_executed = False + + def _register_asset(self, asset: FileInput) -> str: + """Register an asset in the workflow and return its key for use in actions. + + Args: + asset: The asset to register + + Returns: + The asset key that can be used in BuildActions + """ + if not validate_file_input(asset): + raise ValidationError( + "Invalid file input provided to workflow", {"asset": asset} + ) + + if is_remote_file_input(asset): + raise ValidationError( + "Remote file input doesn't need to be registered", {"asset": asset} + ) + + asset_key = f"asset_{self.asset_index}" + self.asset_index += 1 + self.assets[asset_key] = asset + return asset_key + + def _ensure_not_executed(self) -> None: + """Ensure the workflow hasn't been executed yet.""" + if self.is_executed: + raise ValidationError( + "This workflow has already been executed. Create a new workflow builder for additional operations." + ) + + def _validate(self) -> None: + """Validate the workflow before execution.""" + if not self.build_instructions["parts"]: + raise ValidationError("Workflow has no parts to execute") + + if "output" not in self.build_instructions: + self.build_instructions["output"] = cast("PDFOutput", {"type": "pdf"}) + + def _process_action(self, action: ApplicableAction) -> BuildAction: + """Process an action, registering files if needed. 
+ + Args: + action: The action to process + + Returns: + The processed action + """ + if self._is_action_with_file_input(action): + # Register the file and create the actual action + if is_remote_file_input(action.fileInput): + file_handle: FileHandle = RemoteFileHandle(url=action.fileInput) + else: + file_handle = self._register_asset(action.fileInput) + return action.createAction(file_handle) + else: + return cast("BuildAction", action) + + def _is_action_with_file_input( + self, action: ApplicableAction + ) -> TypeGuard[ActionWithFileInput]: + """Type guard to check if action needs file registration. + + Args: + action: The action to check + + Returns: + True if action needs file registration + """ + return hasattr(action, "createAction") + + async def _prepare_files(self) -> dict[str, NormalizedFileData]: + """Prepare files for the request concurrently. + + Returns: + Dictionary mapping asset keys to normalized file data + """ + import asyncio + + # Process all files concurrently + tasks = [] + keys = [] + for key, file_input in self.assets.items(): + tasks.append(process_file_input(file_input)) + keys.append(key) + + # Wait for all file processing to complete + normalized_files = await asyncio.gather(*tasks) + + # Build the result dictionary + request_files = {} + for key, normalized_data in zip(keys, normalized_files, strict=False): + request_files[key] = normalized_data + + return request_files + + def _cleanup(self) -> None: + """Clean up resources after execution.""" + self.assets.clear() + self.asset_index = 0 + self.current_step = 0 + self.is_executed = True + + # Part methods (WorkflowInitialStage) + + def add_file_part( + self, + file: FileInput, + options: FilePartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add a file part to the workflow. + + Args: + file: The file to add to the workflow. Can be a local file path, bytes, or URL. + options: Additional options for the file part. 
+ actions: Actions to apply to the file part. + + Returns: + The workflow builder instance for method chaining. + """ + self._ensure_not_executed() + + # Handle file field + file_field: FileHandle + if is_remote_file_input(file): + file_field = RemoteFileHandle(url=file) + else: + file_field = self._register_asset(file) + + # Process actions + processed_actions = None + if actions: + processed_actions = [self._process_action(action) for action in actions] + + file_part: FilePart = { + "file": file_field, + **(options or {}), + } + + if processed_actions: + file_part["actions"] = processed_actions + + self.build_instructions["parts"].append(file_part) + return self + + def add_html_part( + self, + html: FileInput, + assets: list[FileInput] | None = None, + options: HTMLPartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add an HTML part to the workflow. + + Args: + html: The HTML content to add. Can be a file path, bytes, or URL. + assets: Optional array of assets (CSS, images, etc.) to include with the HTML. + options: Additional options for the HTML part. + actions: Actions to apply to the HTML part. + + Returns: + The workflow builder instance for method chaining. 
+ """ + self._ensure_not_executed() + + # Handle HTML field + html_field: FileHandle + if is_remote_file_input(html): + html_field = RemoteFileHandle(url=html) + else: + html_field = self._register_asset(html) + + # Handle assets + assets_field = None + if assets: + assets_field = [] + for asset in assets: + if is_remote_file_input(asset): + raise ValidationError( + "Assets file input cannot be a URL", {"input": asset} + ) + asset_key = self._register_asset(asset) + assets_field.append(asset_key) + + # Process actions + processed_actions = None + if actions: + processed_actions = [self._process_action(action) for action in actions] + + html_part: HTMLPart = { + "html": html_field, + } + + if options is not None and "layout" in options: + html_part["layout"] = options["layout"] + + if assets_field: + html_part["assets"] = assets_field + + if processed_actions: + html_part["actions"] = processed_actions + + self.build_instructions["parts"].append(html_part) + return self + + def add_new_page( + self, + options: NewPagePartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add a new blank page to the workflow. + + Args: + options: Additional options for the new page, such as page size, orientation, etc. + actions: Actions to apply to the new page. + + Returns: + The workflow builder instance for method chaining. 
+ """ + self._ensure_not_executed() + + # Process actions + processed_actions = None + if actions: + processed_actions = [self._process_action(action) for action in actions] + + new_page_part: NewPagePart = { + "page": "new", + } + + if options is not None: + if "pageCount" in options: + new_page_part["pageCount"] = options["pageCount"] + + if "layout" in options: + new_page_part["layout"] = options["layout"] + + if processed_actions: + new_page_part["actions"] = processed_actions + + self.build_instructions["parts"].append(new_page_part) + return self + + def add_document_part( + self, + document_id: str, + options: DocumentPartOptions | None = None, + actions: list[ApplicableAction] | None = None, + ) -> WorkflowWithPartsStage: + """Add a document part to the workflow by referencing an existing document by ID. + + Args: + document_id: The ID of the document to add to the workflow. + options: Additional options for the document part. + actions: Actions to apply to the document part. + + Returns: + The workflow builder instance for method chaining. 
+ """ + self._ensure_not_executed() + + # Extract layer from options + layer = None + document_options = options or {} + if "layer" in document_options: + layer = document_options.pop("layer") + + # Process actions + processed_actions = None + if actions: + processed_actions = [self._process_action(action) for action in actions] + + document_part: DocumentPart = { + "document": {"id": document_id}, + } + + if "password" in document_options: + document_part["password"] = document_options["password"] + + if "pages" in document_options: + document_part["pages"] = document_options["pages"] + + if layer: + document_part["document"]["layer"] = layer + + if processed_actions: + document_part["actions"] = processed_actions + + self.build_instructions["parts"].append(document_part) + return self + + # Action methods (WorkflowWithPartsStage) + + def apply_actions( + self, actions: list[ApplicableAction] + ) -> WorkflowWithActionsStage: + """Apply multiple actions to the workflow. + + Args: + actions: An array of actions to apply to the workflow. + + Returns: + The workflow builder instance for method chaining. + """ + self._ensure_not_executed() + + if "actions" not in self.build_instructions: + self.build_instructions["actions"] = [] + + processed_actions = [self._process_action(action) for action in actions] + self.build_instructions["actions"].extend(processed_actions) + return cast("WorkflowWithActionsStage", self) + + def apply_action(self, action: ApplicableAction) -> WorkflowWithActionsStage: + """Apply a single action to the workflow. + + Args: + action: The action to apply to the workflow. + + Returns: + The workflow builder instance for method chaining. 
+ """ + return self.apply_actions([action]) + + # Output methods (WorkflowWithPartsStage) + + def _output(self, output: BuildOutput) -> StagedWorkflowBuilder: + """Set the output configuration.""" + self._ensure_not_executed() + self.build_instructions["output"] = output + return self + + def output_pdf( + self, + options: PDFOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set the output format to PDF.""" + self._output(BuildOutputs.pdf(options)) + return cast("WorkflowWithOutputStage", self) + + def output_pdfa( + self, + options: PDFAOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set the output format to PDF/A.""" + self._output(BuildOutputs.pdfa(options)) + return cast("WorkflowWithOutputStage", self) + + def output_pdfua( + self, + options: PDFUAOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set the output format to PDF/UA.""" + self._output(BuildOutputs.pdfua(options)) + return cast("WorkflowWithOutputStage", self) + + def output_image( + self, + format: Literal["png", "jpeg", "jpg", "webp"], + options: ImageOutputOptions | None = None, + ) -> WorkflowWithOutputStage: + """Set the output format to an image format.""" + if not options or not any(k in options for k in ["dpi", "width", "height"]): + raise ValidationError( + "Image output requires at least one of the following options: dpi, height, width" + ) + self._output(BuildOutputs.image(format, options)) + return cast("WorkflowWithOutputStage", self) + + def output_office( + self, + format: Literal["docx", "xlsx", "pptx"], + ) -> WorkflowWithOutputStage: + """Set the output format to an Office document format.""" + self._output(BuildOutputs.office(format)) + return cast("WorkflowWithOutputStage", self) + + def output_html( + self, layout: Literal["page", "reflow"] | None = None + ) -> WorkflowWithOutputStage: + """Set the output format to HTML.""" + casted_layout: Literal["page", "reflow"] = "page" + if layout is not None: + casted_layout = layout + 
    def output_markdown(
        self,
    ) -> WorkflowWithOutputStage:
        """Set the output format to Markdown."""
        self._output(BuildOutputs.markdown())
        return cast("WorkflowWithOutputStage", self)

    def output_json(
        self,
        options: JSONContentOutputOptions | None = None,
    ) -> WorkflowWithOutputStage:
        """Set the output format to JSON content."""
        self._output(BuildOutputs.jsonContent(options))
        return cast("WorkflowWithOutputStage", self)

    # Execution methods (WorkflowWithOutputStage)

    async def execute(
        self,
        on_progress: WorkflowExecuteCallback | None = None,
    ) -> TypedWorkflowResult:
        """Execute the workflow and return the result.

        Runs three reported steps — validate, prepare/send, process response —
        and never raises: any exception is captured into ``result["errors"]``
        with the step number it occurred at. The builder is always consumed
        afterwards (see ``_cleanup``), even on failure.

        Args:
            on_progress: Optional progress callback, called as (step, total).

        Returns:
            The workflow execution result.
        """
        self._ensure_not_executed()
        self.current_step = 0

        result: TypedWorkflowResult = {
            "success": False,
            "errors": [],
            "output": None,
        }

        try:
            # Step 1: Validate
            self.current_step = 1
            if on_progress:
                on_progress(self.current_step, 3)
            self._validate()

            # Step 2: Prepare files
            self.current_step = 2
            if on_progress:
                on_progress(self.current_step, 3)

            # _validate() defaults the output to PDF, so this is a belt-and-braces check.
            output_config = self.build_instructions.get("output")
            if not output_config:
                raise ValueError("Output configuration is required")

            files = await self._prepare_files()

            # Make the request
            response = await self._send_request(
                "/build",
                BuildRequestData(instructions=self.build_instructions, files=files),
            )

            # Step 3: Process response
            self.current_step = 3
            if on_progress:
                on_progress(self.current_step, 3)

            # Shape the output per the configured type: structured JSON,
            # decoded text (html/markdown), or raw bytes for everything else.
            if output_config["type"] == "json-content":
                result["success"] = True
                result["output"] = JsonContentOutput(
                    data=cast("BuildResponseJsonContents", response)
                )
            elif output_config["type"] in ["html", "markdown"]:
                mime_info = BuildOutputs.getMimeTypeForOutput(output_config)
                result["success"] = True
                result["output"] = ContentOutput(
                    content=cast("bytes", response).decode("utf-8"),
                    mimeType=mime_info["mimeType"],
                    filename=mime_info.get("filename"),
                )
            else:
                mime_info = BuildOutputs.getMimeTypeForOutput(output_config)
                result["success"] = True
                result["output"] = BufferOutput(
                    buffer=cast("bytes", response),
                    mimeType=mime_info["mimeType"],
                    filename=mime_info.get("filename"),
                )

        except Exception as error:
            # Defensive: "errors" is initialised to [] above, so this guard
            # only matters if the result dict was tampered with.
            if result["errors"] is None:
                result["errors"] = []

            workflow_error: WorkflowError = {
                "step": self.current_step,
                "error": error
                if isinstance(error, Exception)
                else Exception(str(error)),
            }
            cast("list[WorkflowError]", result["errors"]).append(workflow_error)

        finally:
            self._cleanup()

        return result

    async def dry_run(self) -> WorkflowDryRunResult:
        """Perform a dry run of the workflow without generating the final output.
        This is useful for validating the workflow configuration and estimating processing time.

        Unlike execute(), this does NOT consume the builder, so the same
        builder can still be executed afterwards.

        Returns:
            A dry run result containing validation information and estimated processing time.
        """
        self._ensure_not_executed()

        result: WorkflowDryRunResult = {
            "success": False,
            "errors": [],
            "analysis": None,
        }

        try:
            self._validate()

            response = await self._send_request(
                "/analyze_build",
                AnalyzeBuildRequestData(instructions=self.build_instructions),
            )

            result["success"] = True
            result["analysis"] = response

        except Exception as error:
            if result["errors"] is None:
                result["errors"] = []

            # step 0: dry runs have no step progression to report.
            workflow_error: WorkflowError = {
                "step": 0,
                "error": error
                if isinstance(error, Exception)
                else Exception(str(error)),
            }
            cast("list[WorkflowError]", result["errors"]).append(workflow_error)

        return result
# Default dimension for watermarks: cover 100% of the page unless overridden.
DEFAULT_DIMENSION: "WatermarkDimension" = {"value": 100, "unit": "%"}


T = TypeVar("T")


class ActionWithFileInput(Protocol):
    """Internal action type that holds FileInput for deferred registration."""

    __needsFileRegistration: bool
    fileInput: "FileInput"
    createAction: "Callable[[FileHandle], BuildAction]"


class BuildActions:
    """Factory functions for creating common build actions."""

    @staticmethod
    def ocr(language: "OcrLanguage | list[OcrLanguage]") -> "OcrAction":
        """Create an OCR action.

        Args:
            language: Language(s) for OCR.

        Returns:
            OcrAction object.
        """
        return {"type": "ocr", "language": language}

    @staticmethod
    def rotate(rotateBy: Literal[90, 180, 270]) -> "RotateAction":
        """Create a rotation action.

        Args:
            rotateBy: Rotation angle (90, 180, or 270).

        Returns:
            RotateAction object.
        """
        return {"type": "rotate", "rotateBy": rotateBy}

    @staticmethod
    def _with_watermark_defaults(action: dict) -> dict:
        """Fill the rotation/width/height defaults a watermark must carry."""
        action.setdefault("rotation", 0)
        action.setdefault("width", DEFAULT_DIMENSION)
        action.setdefault("height", DEFAULT_DIMENSION)
        return action

    @staticmethod
    def watermark_text(
        text: str, options: "TextWatermarkActionOptions | None" = None
    ) -> "TextWatermarkAction":
        """Create a text watermark action.

        Args:
            text: Watermark text.
            options: Watermark options — dimensions/positions (width, height,
                top, right, bottom, left), rotation (counterclockwise degrees,
                default 0), opacity (0 transparent .. 1 opaque), and font
                settings (fontFamily, fontSize, fontColor, fontStyle).
                Width/height default to 100% and rotation to 0 when omitted.

        Returns:
            TextWatermarkAction object.
        """
        action: dict[str, Any] = {
            "type": "watermark",
            "text": text,
            **(options or {}),
        }
        return cast(
            "TextWatermarkAction", BuildActions._with_watermark_defaults(action)
        )

    @staticmethod
    def watermark_image(
        image: "FileInput", options: "ImageWatermarkActionOptions | None" = None
    ) -> "ActionWithFileInput":
        """Create an image watermark action.

        The image is a deferred file input: the returned object registers the
        file later and builds the concrete action via ``createAction``.

        Args:
            image: Watermark image.
            options: Watermark options — dimensions/positions (width, height,
                top, right, bottom, left), rotation (counterclockwise degrees,
                default 0) and opacity (0 transparent .. 1 opaque).
                Width/height default to 100% and rotation to 0 when omitted.

        Returns:
            ActionWithFileInput object.
        """
        resolved_options = options or {
            "width": DEFAULT_DIMENSION,
            "height": DEFAULT_DIMENSION,
            "rotation": 0,
        }

        class ImageWatermarkActionWithFileInput(ActionWithFileInput):
            __needsFileRegistration = True

            def __init__(
                self, file_input: "FileInput", opts: "ImageWatermarkActionOptions"
            ):
                self.fileInput = file_input
                self.options = opts

            def createAction(self, fileHandle: "FileHandle") -> "ImageWatermarkAction":
                action: dict[str, Any] = {
                    "type": "watermark",
                    "image": fileHandle,
                    **self.options,
                }
                return cast(
                    "ImageWatermarkAction",
                    BuildActions._with_watermark_defaults(action),
                )

        return ImageWatermarkActionWithFileInput(image, resolved_options)

    @staticmethod
    def flatten(annotation_ids: "list[str | int] | None" = None) -> "FlattenAction":
        """Create a flatten action.

        Args:
            annotation_ids: Optional annotation IDs to flatten (all if not specified).

        Returns:
            FlattenAction object.
        """
        action: dict[str, Any] = {"type": "flatten"}
        if annotation_ids is not None:
            action["annotationIds"] = annotation_ids
        return cast("FlattenAction", action)

    @staticmethod
    def apply_instant_json(file: "FileInput") -> "ActionWithFileInput":
        """Create an apply Instant JSON action (deferred file input).

        Args:
            file: Instant JSON file input.

        Returns:
            ActionWithFileInput object.
        """

        class ApplyInstantJsonActionWithFileInput(ActionWithFileInput):
            __needsFileRegistration = True

            def __init__(self, file_input: "FileInput"):
                self.fileInput = file_input

            def createAction(self, fileHandle: "FileHandle") -> "ApplyInstantJsonAction":
                return {"type": "applyInstantJson", "file": fileHandle}

        return ApplyInstantJsonActionWithFileInput(file)

    @staticmethod
    def apply_xfdf(
        file: "FileInput", options: "ApplyXfdfActionOptions | None" = None
    ) -> "ActionWithFileInput":
        """Create an apply XFDF action (deferred file input).

        Args:
            file: XFDF file input.
            options: ignorePageRotation (default False) and richTextEnabled
                (default True) flags forwarded to the action.

        Returns:
            ActionWithFileInput object.
        """

        class ApplyXfdfActionWithFileInput(ActionWithFileInput):
            __needsFileRegistration = True

            def __init__(
                self, file_input: "FileInput", opts: "ApplyXfdfActionOptions | None"
            ):
                self.fileInput = file_input
                self.options = opts or {}

            def createAction(self, fileHandle: "FileHandle") -> "ApplyXfdfAction":
                return {
                    "type": "applyXfdf",
                    "file": fileHandle,
                    **self.options,
                }

        return ApplyXfdfActionWithFileInput(file, options)

    @staticmethod
    def _redaction_action(
        strategy: str,
        strategy_options: dict,
        options: "BaseCreateRedactionsOptions | None",
    ) -> dict[str, Any]:
        """Assemble the shared createRedactions payload (internal helper)."""
        return {
            "type": "createRedactions",
            "strategy": strategy,
            "strategyOptions": strategy_options,
            **(options or {}),
        }

    @staticmethod
    def create_redactions_text(
        text: str,
        options: "BaseCreateRedactionsOptions | None" = None,
        strategy_options: "CreateRedactionsStrategyOptionsText | None" = None,
    ) -> "CreateRedactionsActionText":
        """Create redactions with text search.

        Args:
            text: Text to search and redact.
            options: Redaction options (visual aspects of the annotation).
            strategy_options: includeAnnotations (default True), caseSensitive
                (default False), start page index (default 0) and page limit.

        Returns:
            CreateRedactionsAction object.
        """
        return cast(
            "CreateRedactionsActionText",
            BuildActions._redaction_action(
                "text", {"text": text, **(strategy_options or {})}, options
            ),
        )

    @staticmethod
    def create_redactions_regex(
        regex: str,
        options: "BaseCreateRedactionsOptions | None" = None,
        strategy_options: "CreateRedactionsStrategyOptionsRegex | None" = None,
    ) -> "CreateRedactionsActionRegex":
        """Create redactions with regex pattern.

        Args:
            regex: Regex pattern to search and redact.
            options: Redaction options (visual aspects of the annotation).
            strategy_options: includeAnnotations (default True), caseSensitive
                (default True), start page index (default 0) and page limit.

        Returns:
            CreateRedactionsAction object.
        """
        return cast(
            "CreateRedactionsActionRegex",
            BuildActions._redaction_action(
                "regex", {"regex": regex, **(strategy_options or {})}, options
            ),
        )

    @staticmethod
    def create_redactions_preset(
        preset: "SearchPreset",
        options: "BaseCreateRedactionsOptions | None" = None,
        strategy_options: "CreateRedactionsStrategyOptionsPreset | None" = None,
    ) -> "CreateRedactionsActionPreset":
        """Create redactions with preset pattern.

        Args:
            preset: Preset pattern to search and redact (e.g. 'email-address',
                'credit-card-number', 'social-security-number', etc.).
            options: Redaction options (visual aspects of the annotation).
            strategy_options: includeAnnotations (default True), start page
                index (default 0) and page limit.

        Returns:
            CreateRedactionsAction object.
        """
        return cast(
            "CreateRedactionsActionPreset",
            BuildActions._redaction_action(
                "preset", {"preset": preset, **(strategy_options or {})}, options
            ),
        )

    @staticmethod
    def apply_redactions() -> "ApplyRedactionsAction":
        """Apply previously created redactions.

        Returns:
            ApplyRedactionsAction object.
        """
        return {"type": "applyRedactions"}
class BuildOutputs:
    """Factory functions for creating output configurations."""

    # Option keys shared by all PDF-family outputs.
    _PDF_OPTION_KEYS = (
        "metadata",
        "labels",
        "user_password",
        "owner_password",
        "user_permissions",
        "optimize",
    )

    @staticmethod
    def _apply_options(
        result: dict[str, Any], options: Any, keys: tuple[str, ...]
    ) -> None:
        """Copy each present key from ``options`` into ``result``.

        Replaces the six-way copy-paste of ``if key in options: ...`` blocks
        that was previously duplicated across pdf/pdfa/pdfua/image/jsonContent.
        """
        if options:
            for key in keys:
                if key in options:
                    result[key] = options[key]

    @staticmethod
    def pdf(options: "PDFOutputOptions | None" = None) -> "PDFOutput":
        """PDF output configuration.

        Args:
            options: PDF output options — metadata, labels, user_password,
                owner_password, user_permissions, optimize.

        Returns:
            PDFOutput object.
        """
        result: dict[str, Any] = {"type": "pdf"}
        BuildOutputs._apply_options(result, options, BuildOutputs._PDF_OPTION_KEYS)
        return cast("PDFOutput", result)

    @staticmethod
    def pdfa(options: "PDFAOutputOptions | None" = None) -> "PDFAOutput":
        """PDF/A output configuration.

        Args:
            options: PDF/A output options — conformance, vectorization,
                rasterization, plus the common PDF options (metadata, labels,
                user_password, owner_password, user_permissions, optimize).

        Returns:
            PDFAOutput object.
        """
        result: dict[str, Any] = {"type": "pdfa"}
        BuildOutputs._apply_options(
            result,
            options,
            ("conformance", "vectorization", "rasterization")
            + BuildOutputs._PDF_OPTION_KEYS,
        )
        return cast("PDFAOutput", result)

    @staticmethod
    def pdfua(options: "PDFUAOutputOptions | None" = None) -> "PDFUAOutput":
        """PDF/UA output configuration.

        Args:
            options: PDF/UA output options — the common PDF options (metadata,
                labels, user_password, owner_password, user_permissions,
                optimize).

        Returns:
            PDFUAOutput object.
        """
        result: dict[str, Any] = {"type": "pdfua"}
        BuildOutputs._apply_options(result, options, BuildOutputs._PDF_OPTION_KEYS)
        return cast("PDFUAOutput", result)

    @staticmethod
    def image(
        format: Literal["png", "jpeg", "jpg", "webp"],
        options: "ImageOutputOptions | None" = None,
    ) -> "ImageOutput":
        """Image output configuration.

        Args:
            format: Image format type.
            options: Image output options — pages, width, height, dpi.

        Returns:
            ImageOutput object.
        """
        result: dict[str, Any] = {
            "type": "image",
            "format": format,
        }
        BuildOutputs._apply_options(result, options, ("pages", "width", "height", "dpi"))
        return cast("ImageOutput", result)

    @staticmethod
    def jsonContent(
        options: "JSONContentOutputOptions | None" = None,
    ) -> "JSONContentOutput":
        """JSON content output configuration.

        Args:
            options: JSON content extraction options — plainText,
                structuredText, keyValuePairs, tables, language.

        Returns:
            JSONContentOutput object.
        """
        result: dict[str, Any] = {"type": "json-content"}
        BuildOutputs._apply_options(
            result,
            options,
            ("plainText", "structuredText", "keyValuePairs", "tables", "language"),
        )
        return cast("JSONContentOutput", result)

    @staticmethod
    def office(type: Literal["docx", "xlsx", "pptx"]) -> "OfficeOutput":
        """Office document output configuration.

        Args:
            type: Office document type.

        Returns:
            OfficeOutput object.
        """
        return {
            "type": type,
        }

    @staticmethod
    def html(layout: Literal["page", "reflow"]) -> "HTMLOutput":
        """HTML output configuration.

        Args:
            layout: The layout type to use for conversion to HTML.

        Returns:
            HTMLOutput object.
        """
        return {
            "type": "html",
            "layout": layout,
        }

    @staticmethod
    def markdown() -> "MarkdownOutput":
        """Markdown output configuration.

        Returns:
            MarkdownOutput object.
        """
        return {
            "type": "markdown",
        }

    @staticmethod
    def getMimeTypeForOutput(
        output: "PDFOutput | PDFAOutput | PDFUAOutput | ImageOutput | OfficeOutput | HTMLOutput | MarkdownOutput",
    ) -> dict[str, str]:
        """Get MIME type and filename for a given output configuration.

        Args:
            output: The output configuration (missing "type" defaults to pdf).

        Returns:
            Dictionary with mimeType and a suggested filename.
        """
        output_type = output.get("type", "pdf")

        if output_type in ["pdf", "pdfa", "pdfua"]:
            return {"mimeType": "application/pdf", "filename": "output.pdf"}
        elif output_type == "image":
            format = output.get("format", "png")
            # "jpg" and "jpeg" share the image/jpeg MIME type.
            if format == "jpg":
                return {"mimeType": "image/jpeg", "filename": "output.jpg"}
            else:
                return {"mimeType": f"image/{format}", "filename": f"output.{format}"}
        elif output_type == "docx":
            return {
                "mimeType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "filename": "output.docx",
            }
        elif output_type == "xlsx":
            return {
                "mimeType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                "filename": "output.xlsx",
            }
        elif output_type == "pptx":
            return {
                "mimeType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                "filename": "output.pptx",
            }
        elif output_type == "html":
            return {
                "mimeType": "text/html",
                "filename": "output.html",
            }
        elif output_type == "markdown":
            return {
                "mimeType": "text/markdown",
                "filename": "output.md",
            }
        else:
            return {"mimeType": "application/octet-stream", "filename": "output"}
"""Staged builder interfaces for workflow pattern implementation."""

from __future__ import annotations

from abc import ABC, abstractmethod
from collections.abc import Callable
from typing import TYPE_CHECKING, Literal, TypedDict

from nutrient_dws.builder.constant import ActionWithFileInput
from nutrient_dws.types.build_actions import BuildAction

if TYPE_CHECKING:
    from nutrient_dws.inputs import FileInput
    from nutrient_dws.types.analyze_response import AnalyzeBuildResponse
    from nutrient_dws.types.build_output import (
        ImageOutputOptions,
        JSONContentOutputOptions,
        PDFAOutputOptions,
        PDFOutputOptions,
        PDFUAOutputOptions,
    )
    from nutrient_dws.types.build_response_json import BuildResponseJsonContents
    from nutrient_dws.types.input_parts import (
        DocumentPartOptions,
        FilePartOptions,
        HTMLPartOptions,
        NewPagePartOptions,
    )

# Type aliases for output types
OutputFormat = Literal[
    "pdf",
    "pdfa",
    "pdfua",
    "png",
    "jpeg",
    "jpg",
    "webp",
    "docx",
    "xlsx",
    "pptx",
    "html",
    "markdown",
    "json-content",
]


# Output type mappings
class BufferOutput(TypedDict):
    """Binary output (PDF, images, Office documents)."""

    buffer: bytes
    mimeType: str
    filename: str | None


class ContentOutput(TypedDict):
    """Textual output (HTML, Markdown), decoded to str."""

    content: str
    mimeType: str
    filename: str | None


class JsonContentOutput(TypedDict):
    """Structured JSON-content extraction output."""

    data: BuildResponseJsonContents


# Applicable actions type - actions that can be applied to workflows
# (either a plain BuildAction dict or a deferred-file action object).
ApplicableAction = BuildAction | ActionWithFileInput


class WorkflowError(TypedDict):
    """Workflow execution error details."""

    # step: which numbered execution step the error occurred at (0 = dry run / pre-step)
    step: int
    error: Exception


class WorkflowOutput(TypedDict):
    """Represents an output file with its content and metadata."""

    buffer: bytes
    mimeType: str
    filename: str | None


class WorkflowResult(TypedDict):
    """Result of a workflow execution."""

    success: bool
    output: WorkflowOutput | None
    errors: list[WorkflowError] | None


class TypedWorkflowResult(TypedDict):
    """Typed result of a workflow execution based on output configuration."""

    success: bool
    output: BufferOutput | ContentOutput | JsonContentOutput | None
    errors: list[WorkflowError] | None


class WorkflowDryRunResult(TypedDict):
    """Result of a workflow dry run."""

    success: bool
    analysis: AnalyzeBuildResponse | None
    errors: list[WorkflowError] | None


# Progress callback: called as (current_step, total_steps).
WorkflowExecuteCallback = Callable[[int, int], None]


class WorkflowInitialStage(ABC):
    """Stage 1: Initial workflow - only part methods available."""

    @abstractmethod
    def add_file_part(
        self,
        file: FileInput,
        options: FilePartOptions | None = None,
        actions: list[ApplicableAction] | None = None,
    ) -> WorkflowWithPartsStage:
        """Add a file part to the workflow."""
        pass

    @abstractmethod
    def add_html_part(
        self,
        html: FileInput,
        assets: list[FileInput] | None = None,
        options: HTMLPartOptions | None = None,
        actions: list[ApplicableAction] | None = None,
    ) -> WorkflowWithPartsStage:
        """Add an HTML part to the workflow."""
        pass

    @abstractmethod
    def add_new_page(
        self,
        options: NewPagePartOptions | None = None,
        actions: list[ApplicableAction] | None = None,
    ) -> WorkflowWithPartsStage:
        """Add a new page part to the workflow."""
        pass

    @abstractmethod
    def add_document_part(
        self,
        document_id: str,
        options: DocumentPartOptions | None = None,
        actions: list[ApplicableAction] | None = None,
    ) -> WorkflowWithPartsStage:
        """Add a document part to the workflow."""
        pass


class WorkflowWithPartsStage(WorkflowInitialStage):
    """Stage 2: After parts added - parts, actions, and output methods available."""

    # Action methods
    @abstractmethod
    def apply_actions(self, actions: list[ApplicableAction]) -> WorkflowWithPartsStage:
        """Apply multiple actions to the workflow."""
        pass

    @abstractmethod
    def apply_action(self, action: ApplicableAction) -> WorkflowWithPartsStage:
        """Apply a single action to the workflow."""
        pass

    # Output methods
    @abstractmethod
    def output_pdf(
        self,
        options: PDFOutputOptions | None = None,
    ) -> WorkflowWithOutputStage:
        """Set PDF output for the workflow."""
        pass

    @abstractmethod
    def output_pdfa(
        self,
        options: PDFAOutputOptions | None = None,
    ) -> WorkflowWithOutputStage:
        """Set PDF/A output for the workflow."""
        pass

    @abstractmethod
    def output_pdfua(
        self,
        options: PDFUAOutputOptions | None = None,
    ) -> WorkflowWithOutputStage:
        """Set PDF/UA output for the workflow."""
        pass

    @abstractmethod
    def output_image(
        self,
        format: Literal["png", "jpeg", "jpg", "webp"],
        options: ImageOutputOptions | None = None,
    ) -> WorkflowWithOutputStage:
        """Set image output for the workflow."""
        pass

    @abstractmethod
    def output_office(
        self,
        format: Literal["docx", "xlsx", "pptx"],
    ) -> WorkflowWithOutputStage:
        """Set Office format output for the workflow."""
        pass

    @abstractmethod
    def output_html(
        self,
        layout: Literal["page", "reflow"] | None = None,
    ) -> WorkflowWithOutputStage:
        """Set HTML output for the workflow."""
        pass

    @abstractmethod
    def output_markdown(
        self,
    ) -> WorkflowWithOutputStage:
        """Set Markdown output for the workflow."""
        pass

    @abstractmethod
    def output_json(
        self,
        options: JSONContentOutputOptions | None = None,
    ) -> WorkflowWithOutputStage:
        """Set JSON content output for the workflow."""
        pass


# Stage 3: After actions added - type alias since functionality is the same
WorkflowWithActionsStage = WorkflowWithPartsStage


class WorkflowWithOutputStage(ABC):
    """Stage 4: After output set - only execute and dryRun available."""

    @abstractmethod
    async def execute(
        self,
        on_progress: WorkflowExecuteCallback | None = None,
    ) -> TypedWorkflowResult:
        """Execute the workflow and return the result."""
        pass

    @abstractmethod
    async def dry_run(self) -> WorkflowDryRunResult:
        """Perform a dry run of the workflow without executing."""
        pass
+from nutrient_dws.types.create_auth_token import ( + CreateAuthTokenParameters, + CreateAuthTokenResponse, +) +from nutrient_dws.types.misc import OcrLanguage, PageRange, Pages +from nutrient_dws.types.redact_data import RedactOptions +from nutrient_dws.types.sign_request import CreateDigitalSignature +if TYPE_CHECKING: + from nutrient_dws.types.input_parts import FilePartOptions -class NutrientClient(DirectAPIMixin): - r"""Main client for interacting with Nutrient DWS API. - This client provides two ways to interact with the API: +def normalize_page_params( + pages: PageRange | None = None, + page_count: int | None = None, +) -> Pages: + """Normalize page parameters according to the requirements: + - start and end are inclusive + - start defaults to 0 (first page) + - end defaults to -1 (last page) + - negative end values loop from the end of the document. - 1. Direct API: Individual method calls for single operations - Example: client.convert_to_pdf(input_file="document.docx") + Args: + pages: The page parameters to normalize + page_count: The total number of pages in the document (required for negative indices) + + Returns: + Normalized page parameters + """ + start = pages.get("start", 0) if pages else 0 + end = pages.get("end", -1) if pages else -1 - 2. Builder API: Fluent interface for chaining multiple operations - Example: client.build(input_file="doc.docx").add_step("convert-to-pdf").execute() + # Handle negative end values if page_count is provided + if page_count is not None and start < 0: + start = page_count + start - Args: - api_key: API key for authentication. If not provided, will look for - NUTRIENT_API_KEY environment variable. - timeout: Request timeout in seconds. Defaults to 300. + if page_count is not None and end < 0: + end = page_count + end - Raises: - AuthenticationError: When making API calls without a valid API key. 
+ return {"start": start, "end": end} + + +class NutrientClient: + """Main client for interacting with the Nutrient Document Web Services API. Example: - >>> from nutrient_dws import NutrientClient - >>> client = NutrientClient(api_key="your-api-key") - >>> # Direct API - >>> pdf = client.convert_to_pdf(input_file="document.docx") - >>> # Builder API - >>> client.build(input_file="document.docx") \\ - ... .add_step(tool="convert-to-pdf") \\ - ... .add_step(tool="ocr-pdf") \\ - ... .execute(output_path="output.pdf") + Server-side usage with an API key: + + ```python + client = NutrientClient(api_key='your_api_key') + ``` + + Client-side usage with token provider: + + ```python + async def get_token(): + # Your token retrieval logic here + return 'your-token' + + client = NutrientClient(api_key=get_token) + ``` """ - def __init__(self, api_key: str | None = None, timeout: int = 300) -> None: - """Initialize the Nutrient client.""" - # Get API key from parameter or environment - self._api_key = api_key or os.environ.get("NUTRIENT_API_KEY") - self._timeout = timeout + def __init__( + self, + api_key: str | Callable[[], str | Awaitable[str]], + base_url: str | None = None, + timeout: int | None = None, + ) -> None: + """Create a new NutrientClient instance. - # Initialize HTTP client - self._http_client = HTTPClient(api_key=self._api_key, timeout=timeout) + Args: + api_key: API key or API key getter + base_url: DWS Base url + timeout: DWS request timeout - # Direct API methods will be added dynamically + Raises: + ValidationError: If options are invalid + """ + options = NutrientClientOptions( + apiKey=api_key, baseUrl=base_url, timeout=timeout + ) + self._validate_options(options) + self.options = options - def build(self, input_file: FileInput) -> BuildAPIWrapper: - """Start a Builder API workflow. + def _validate_options(self, options: NutrientClientOptions) -> None: + """Validate client options. Args: - input_file: Input file (path, bytes, or file-like object). 
    async def get_account_info(self) -> AccountInfo:
        """Get account information for the current API key.

        Returns:
            Account information

        Example:
            ```python
            account_info = await client.get_account_info()
            print(account_info['subscriptionType'])
            ```
        """
        response: Any = await send_request(
            {
                "method": "GET",
                "endpoint": "/account/info",
                "data": None,
                "headers": None,
            },
            self.options,
        )

        # send_request wraps the parsed payload under "data".
        return cast("AccountInfo", response["data"])

    async def create_token(
        self, params: CreateAuthTokenParameters
    ) -> CreateAuthTokenResponse:
        """Create a new authentication token.

        Args:
            params: Parameters for creating the token

        Returns:
            The created token information

        Example:
            ```python
            token = await client.create_token({
                'allowedOperations': ['annotations_api'],
                'expirationTime': 3600  # 1 hour
            })
            print(token['id'])
            ```
        """
        response: Any = await send_request(
            {
                "method": "POST",
                "endpoint": "/tokens",
                "data": params,
                "headers": None,
            },
            self.options,
        )

        return cast("CreateAuthTokenResponse", response["data"])

    async def delete_token(self, token_id: str) -> None:
        """Delete an authentication token.

        Args:
            token_id: ID of the token to delete

        Example:
            ```python
            await client.delete_token('token-id-123')
            ```
        """
        # NOTE(review): the token id is sent in the DELETE request body
        # rather than in the URL path — confirm against the DWS API.
        await send_request(
            {
                "method": "DELETE",
                "endpoint": "/tokens",
                "data": cast("Any", {"id": token_id}),
                "headers": None,
            },
            self.options,
        )

    def workflow(self, override_timeout: int | None = None) -> WorkflowInitialStage:
        r"""Create a new WorkflowBuilder for chaining multiple operations.

        Args:
            override_timeout: Set a custom timeout for the workflow
                (in milliseconds per this docstring — NOTE(review): the
                client-level ``timeout`` unit is not stated; confirm they
                agree)

        Returns:
            A new WorkflowBuilder instance

        Example:
            ```python
            result = await client.workflow() \\
                .add_file_part('document.docx') \\
                .apply_action(BuildActions.ocr('english')) \\
                .output_pdf() \\
                .execute()
            ```
        """
        # Copy so a per-workflow timeout never mutates the client options.
        options = self.options.copy()
        if override_timeout is not None:
            options["timeout"] = override_timeout

        return StagedWorkflowBuilder(options)
    def _process_typed_workflow_result(
        self, result: TypedWorkflowResult
    ) -> BufferOutput | ContentOutput | JsonContentOutput:
        """Unwrap a TypedWorkflowResult: raise on failure, return the output.

        Args:
            result: The TypedWorkflowResult to process

        Returns:
            The specific output type from the result

        Raises:
            NutrientError: If the workflow was not successful or if output is missing
        """
        if not result["success"]:
            # If there are errors, throw the first one
            errors = result.get("errors")
            if errors and len(errors) > 0:
                raise errors[0]["error"]
            # If no specific errors but operation failed
            raise NutrientError(
                "Workflow operation failed without specific error details",
                "WORKFLOW_ERROR",
            )

        # A successful result must still carry an output payload.
        output = result.get("output")
        if not output:
            raise NutrientError(
                "Workflow completed successfully but no output was returned",
                "MISSING_OUTPUT",
            )

        return output

    async def sign(
        self,
        pdf: FileInput,
        data: CreateDigitalSignature | None = None,
        options: SignRequestOptions | None = None,
    ) -> BufferOutput:
        """Sign a PDF document.

        Args:
            pdf: The PDF file to sign
            data: Signature data
            options: Additional options (image, graphicImage)

        Returns:
            The signed PDF file output

        Example:
            ```python
            result = await client.sign('document.pdf', {
                'signatureType': 'cms',
                'flatten': False,
                'cadesLevel': 'b-lt'
            })

            # Access the signed PDF buffer
            pdf_buffer = result['buffer']

            # Get the MIME type of the output
            print(result['mimeType'])  # 'application/pdf'

            # Save the buffer to a file
            with open('signed-document.pdf', 'wb') as f:
                f.write(pdf_buffer)
            ```
        """
        # Normalize the file input (remote URLs are fetched first).
        if is_remote_file_input(pdf):
            normalized_file = await process_remote_file_input(str(pdf))
        else:
            normalized_file = await process_file_input(pdf)

        # normalized_file[0] holds the raw bytes used for validation.
        if not is_valid_pdf(normalized_file[0]):
            raise ValidationError("Invalid pdf file", {"input": pdf})

        # Prepare optional signature-appearance files.
        normalized_image = None
        normalized_graphic_image = None

        if options:
            if "image" in options:
                image = options["image"]
                if is_remote_file_input(image):
                    normalized_image = await process_remote_file_input(str(image))
                else:
                    normalized_image = await process_file_input(image)

            if "graphicImage" in options:
                graphic_image = options["graphicImage"]
                if is_remote_file_input(graphic_image):
                    normalized_graphic_image = await process_remote_file_input(
                        str(graphic_image)
                    )
                else:
                    normalized_graphic_image = await process_file_input(graphic_image)

        # "data" may be None; presumably the API applies signature
        # defaults in that case — TODO confirm.
        request_data = {
            "file": normalized_file,
            "data": data,
        }

        if normalized_image:
            request_data["image"] = normalized_image
        if normalized_graphic_image:
            request_data["graphicImage"] = normalized_graphic_image

        response: Any = await send_request(
            {
                "method": "POST",
                "endpoint": "/sign",
                "data": cast("SignRequestData", request_data),
                "headers": None,
            },
            self.options,
        )

        buffer = response["data"]

        return {
            "mimeType": "application/pdf",
            "filename": "output.pdf",
            "buffer": buffer,
        }
watermark_text( + self, + file: FileInput, + text: str, + options: TextWatermarkActionOptions | None = None, + ) -> BufferOutput: + """Add a text watermark to a document. + This is a convenience method that uses the workflow builder. + + Args: + file: The input file to watermark + text: The watermark text + options: Watermark options + + Returns: + The watermarked document + + Example: + ```python + result = await client.watermark_text('document.pdf', 'CONFIDENTIAL', { + 'opacity': 0.5, + 'fontSize': 24 + }) + + # Access the watermarked PDF buffer + pdf_buffer = result['buffer'] + + # Save the buffer to a file + with open('watermarked-document.pdf', 'wb') as f: + f.write(pdf_buffer) + ``` + """ + watermark_action = BuildActions.watermark_text(text, options) + + builder = self.workflow().add_file_part(file, None, [watermark_action]) + + result = await builder.output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def watermark_image( + self, + file: FileInput, + image: FileInput, + options: ImageWatermarkActionOptions | None = None, + ) -> BufferOutput: + """Add an image watermark to a document. + This is a convenience method that uses the workflow builder. 
+ + Args: + file: The input file to watermark + image: The watermark image + options: Watermark options + + Returns: + The watermarked document + + Example: + ```python + result = await client.watermark_image('document.pdf', 'watermark.jpg', { + 'opacity': 0.5 + }) + + # Access the watermarked PDF buffer + pdf_buffer = result['buffer'] + ``` + """ + watermark_action = BuildActions.watermark_image(image, options) + + builder = self.workflow().add_file_part(file, None, [watermark_action]) + + result = await builder.output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def convert( + self, + file: FileInput, + target_format: OutputFormat, + ) -> BufferOutput | ContentOutput | JsonContentOutput: + """Convert a document to a different format. + This is a convenience method that uses the workflow builder. + + Args: + file: The input file to convert + target_format: The target format to convert to + + Returns: + The specific output type based on the target format + + Example: + ```python + # Convert DOCX to PDF + pdf_result = await client.convert('document.docx', 'pdf') + pdf_buffer = pdf_result['buffer'] + + # Convert PDF to image + image_result = await client.convert('document.pdf', 'png') + png_buffer = image_result['buffer'] + + # Convert to HTML + html_result = await client.convert('document.pdf', 'html') + html_content = html_result['content'] + ``` + """ + builder = self.workflow().add_file_part(file) + + if target_format == "pdf": + result = await builder.output_pdf().execute() + elif target_format == "pdfa": + result = await builder.output_pdfa().execute() + elif target_format == "pdfua": + result = await builder.output_pdfua().execute() + elif target_format == "docx": + result = await builder.output_office("docx").execute() + elif target_format == "xlsx": + result = await builder.output_office("xlsx").execute() + elif target_format == "pptx": + result = await builder.output_office("pptx").execute() + elif 
target_format == "html": + result = await builder.output_html("page").execute() + elif target_format == "markdown": + result = await builder.output_markdown().execute() + elif target_format in ["png", "jpeg", "jpg", "webp"]: + result = await builder.output_image( + cast("Literal['png', 'jpeg', 'jpg', 'webp']", target_format), + {"dpi": 300}, + ).execute() + else: + raise ValidationError(f"Unsupported target format: {target_format}") + + return self._process_typed_workflow_result(result) + + async def ocr( + self, + file: FileInput, + language: OcrLanguage | list[OcrLanguage], + ) -> BufferOutput: + """Perform OCR (Optical Character Recognition) on a document. + This is a convenience method that uses the workflow builder. + + Args: + file: The input file to perform OCR on + language: The language(s) to use for OCR + + Returns: + The OCR result + + Example: + ```python + result = await client.ocr('scanned-document.pdf', 'english') + + # Access the OCR-processed PDF buffer + pdf_buffer = result['buffer'] + ``` + """ + ocr_action = BuildActions.ocr(language) + + builder = self.workflow().add_file_part(file, None, [ocr_action]) + + result = await builder.output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def extract_text( + self, + file: FileInput, + pages: PageRange | None = None, + ) -> JsonContentOutput: + """Extract text content from a document. + This is a convenience method that uses the workflow builder. 
+ + Args: + file: The file to extract text from + pages: Optional page range to extract text from + + Returns: + The extracted text data + + Example: + ```python + result = await client.extract_text('document.pdf') + print(result['data']) + + # Extract text from specific pages + result = await client.extract_text('document.pdf', {'start': 0, 'end': 2}) + + # Access the extracted text content + text_content = result['data']['pages'][0]['plainText'] + ``` + """ + normalized_pages = normalize_page_params(pages) if pages else None + + part_options = ( + cast("FilePartOptions", {"pages": normalized_pages}) + if normalized_pages + else None + ) + + result = ( + await self.workflow() + .add_file_part(file, part_options) + .output_json( + cast("JSONContentOutputOptions", {"plainText": True, "tables": False}) + ) + .execute() + ) + + return cast("JsonContentOutput", self._process_typed_workflow_result(result)) + + async def extract_table( + self, + file: FileInput, + pages: PageRange | None = None, + ) -> JsonContentOutput: + """Extract table content from a document. + This is a convenience method that uses the workflow builder. 
+ + Args: + file: The file to extract table from + pages: Optional page range to extract tables from + + Returns: + The extracted table data + + Example: + ```python + result = await client.extract_table('document.pdf') + + # Access the extracted tables + tables = result['data']['pages'][0]['tables'] + + # Process the first table if available + if tables and len(tables) > 0: + first_table = tables[0] + print(f"Table has {len(first_table['rows'])} rows") + ``` + """ + normalized_pages = normalize_page_params(pages) if pages else None + + part_options = ( + cast("FilePartOptions", {"pages": normalized_pages}) + if normalized_pages + else None + ) + + result = ( + await self.workflow() + .add_file_part(file, part_options) + .output_json( + cast("JSONContentOutputOptions", {"plainText": False, "tables": True}) + ) + .execute() + ) + + return cast("JsonContentOutput", self._process_typed_workflow_result(result)) + + async def extract_key_value_pairs( + self, + file: FileInput, + pages: PageRange | None = None, + ) -> JsonContentOutput: + """Extract key value pair content from a document. + This is a convenience method that uses the workflow builder. 
+ + Args: + file: The file to extract KVPs from + pages: Optional page range to extract KVPs from + + Returns: + The extracted KVPs data + + Example: + ```python + result = await client.extract_key_value_pairs('document.pdf') + + # Access the extracted key-value pairs + kvps = result['data']['pages'][0]['keyValuePairs'] + + # Process the key-value pairs + if kvps and len(kvps) > 0: + for kvp in kvps: + print(f"Key: {kvp['key']}, Value: {kvp['value']}") + ``` + """ + normalized_pages = normalize_page_params(pages) if pages else None + + part_options = ( + cast("FilePartOptions", {"pages": normalized_pages}) + if normalized_pages + else None + ) + + result = ( + await self.workflow() + .add_file_part(file, part_options) + .output_json( + cast( + "JSONContentOutputOptions", + {"plainText": False, "tables": False, "keyValuePairs": True}, + ) + ) + .execute() + ) + + return cast("JsonContentOutput", self._process_typed_workflow_result(result)) + + async def set_page_labels( + self, + pdf: FileInput, + labels: list[Label], + ) -> BufferOutput: + """Set page labels for a PDF document. + This is a convenience method that uses the workflow builder. 
+ + Args: + pdf: The PDF file to modify + labels: Array of label objects with pages and label properties + + Returns: + The document with updated page labels + + Example: + ```python + result = await client.set_page_labels('document.pdf', [ + {'pages': [0, 1, 2], 'label': 'Cover'}, + {'pages': [3, 4, 5], 'label': 'Chapter 1'} + ]) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + result = ( + await self.workflow() + .add_file_part(pdf) + .output_pdf(cast("PDFOutputOptions", {"labels": labels})) + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def password_protect( + self, + file: FileInput, + user_password: str, + owner_password: str, + permissions: list[PDFUserPermission] | None = None, + ) -> BufferOutput: + """Password protect a PDF document. + This is a convenience method that uses the workflow builder. 
+ + Args: + file: The file to protect + user_password: Password required to open the document + owner_password: Password required to modify the document + permissions: Optional array of permissions granted when opened with user password + + Returns: + The password-protected document + + Example: + ```python + result = await client.password_protect('document.pdf', 'user123', 'owner456') + + # Or with specific permissions: + result = await client.password_protect( + 'document.pdf', + 'user123', + 'owner456', + ['printing', 'extract_accessibility'] + ) + ``` + """ + pdf_options: PDFOutputOptions = { + "user_password": user_password, + "owner_password": owner_password, + } + + if permissions: + pdf_options["user_permissions"] = permissions + + result = ( + await self.workflow().add_file_part(file).output_pdf(pdf_options).execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def set_metadata( + self, + pdf: FileInput, + metadata: Metadata, + ) -> BufferOutput: + """Set metadata for a PDF document. + This is a convenience method that uses the workflow builder. 
+ + Args: + pdf: The PDF file to modify + metadata: The metadata to set (title and/or author) + + Returns: + The document with updated metadata + + Example: + ```python + result = await client.set_metadata('document.pdf', { + 'title': 'My Document', + 'author': 'John Doe' + }) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + result = ( + await self.workflow() + .add_file_part(pdf) + .output_pdf(cast("PDFOutputOptions", {"metadata": metadata})) + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def apply_instant_json( + self, + pdf: FileInput, + instant_json_file: FileInput, + ) -> BufferOutput: + """Apply Instant JSON to a document. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to modify + instant_json_file: The Instant JSON file to apply + + Returns: + The modified document + + Example: + ```python + result = await client.apply_instant_json('document.pdf', 'annotations.json') + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + apply_json_action = BuildActions.apply_instant_json(instant_json_file) + + result = ( + await self.workflow() + .add_file_part(pdf, None, [apply_json_action]) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def apply_xfdf( + self, + pdf: FileInput, + xfdf_file: FileInput, + options: ApplyXfdfActionOptions | None = None, + ) -> BufferOutput: + """Apply XFDF to a document. 
+ This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to modify + xfdf_file: The XFDF file to apply + options: Optional settings for applying XFDF + + Returns: + The modified document + + Example: + ```python + result = await client.apply_xfdf('document.pdf', 'annotations.xfdf') + # Or with options: + result = await client.apply_xfdf( + 'document.pdf', 'annotations.xfdf', + {'ignorePageRotation': True, 'richTextEnabled': False} + ) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + apply_xfdf_action = BuildActions.apply_xfdf(xfdf_file, options) + + result = ( + await self.workflow() + .add_file_part(pdf, None, [apply_xfdf_action]) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def merge(self, files: list[FileInput]) -> BufferOutput: + """Merge multiple documents into a single document. + This is a convenience method that uses the workflow builder. 
+ + Args: + files: The files to merge + + Returns: + The merged document + + Example: + ```python + result = await client.merge(['doc1.pdf', 'doc2.pdf', 'doc3.pdf']) + + # Access the merged PDF buffer + pdf_buffer = result['buffer'] + ``` + """ + if not files or len(files) < 2: + raise ValidationError("At least 2 files are required for merge operation") + + builder = self.workflow() + + # Add first file + workflow_builder = builder.add_file_part(files[0]) + + # Add remaining files + for file in files[1:]: + workflow_builder = workflow_builder.add_file_part(file) + + result = await workflow_builder.output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def flatten( + self, + pdf: FileInput, + annotation_ids: list[str | int] | None = None, + ) -> BufferOutput: + """Flatten annotations in a PDF document. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to flatten + annotation_ids: Optional specific annotation IDs to flatten + + Returns: + The flattened document + + Example: + ```python + # Flatten all annotations + result = await client.flatten('annotated-document.pdf') + + # Flatten specific annotations by ID + result = await client.flatten('annotated-document.pdf', ['annotation1', 'annotation2']) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + flatten_action = BuildActions.flatten(annotation_ids) + + result = ( + await self.workflow() + .add_file_part(pdf, None, [flatten_action]) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def create_redactions_ai( + self, + pdf: FileInput, + criteria: str, + redaction_state: Literal["stage", "apply"] = "stage", + pages: 
    async def create_redactions_ai(
        self,
        pdf: FileInput,
        criteria: str,
        redaction_state: Literal["stage", "apply"] = "stage",
        pages: PageRange | None = None,
        options: RedactOptions | None = None,
    ) -> BufferOutput:
        """Use AI to redact sensitive information in a document.

        Args:
            pdf: The PDF file to redact
            criteria: AI redaction criteria
            redaction_state: Whether to stage or apply redactions (default: 'stage')
            pages: Optional pages to redact
            options: Optional redaction options

        Returns:
            The redacted document

        Example:
            ```python
            # Stage redactions
            result = await client.create_redactions_ai(
                'document.pdf',
                'Remove all emails'
            )

            # Apply redactions immediately
            result = await client.create_redactions_ai(
                'document.pdf',
                'Remove all PII',
                'apply'
            )
            ```
        """
        # Validate PDF
        if is_remote_file_input(pdf):
            normalized_file = await process_remote_file_input(str(pdf))
        else:
            normalized_file = await process_file_input(pdf)

        if not is_valid_pdf(normalized_file[0]):
            raise ValidationError("Invalid pdf file", {"input": pdf})

        # Page count is needed so negative page indices can be resolved.
        page_count = get_pdf_page_count(normalized_file[0])
        normalized_pages = normalize_page_params(pages, page_count) if pages else None

        # The "file" value references the multipart upload key below
        # ("fileKey": "file"), not a path.
        document_data: dict[str, Any] = {
            "file": "file",
        }

        if normalized_pages:
            document_data["pages"] = normalized_pages

        documents = [document_data]

        request_data = {
            "data": {
                "documents": documents,
                "criteria": criteria,
                "redaction_state": redaction_state,
            },
            "file": normalized_file,
            "fileKey": "file",
        }

        if options:
            request_data["data"]["options"] = options  # type: ignore

        config = RequestConfig(
            method="POST",
            data=cast("RedactRequestData", request_data),
            endpoint="/ai/redact",
            headers=None,
        )

        response: Any = await send_request(
            config,
            self.options,
        )

        buffer = response["data"]

        return {
            "mimeType": "application/pdf",
            "filename": "output.pdf",
            "buffer": buffer,
        }

    async def create_redactions_preset(
        self,
        pdf: FileInput,
        preset: SearchPreset,
        redaction_state: Literal["stage", "apply"] = "stage",
        pages: PageRange | None = None,
        preset_options: CreateRedactionsStrategyOptionsPreset | None = None,
        options: BaseCreateRedactionsOptions | None = None,
    ) -> BufferOutput:
        """Create redaction annotations based on a preset pattern.
        This is a convenience method that uses the workflow builder.

        Args:
            pdf: The PDF file to create redactions in
            preset: The preset pattern to search for (e.g., 'email-address', 'social-security-number')
            redaction_state: Whether to stage or apply redactions (default: 'stage')
            pages: Optional page range to create redactions in
            preset_options: Optional settings for the preset strategy
            options: Optional settings for creating redactions

        Returns:
            The document with redaction annotations

        Example:
            ```python
            result = await client.create_redactions_preset('document.pdf', 'email-address')
            ```
        """
        # Validate PDF
        if is_remote_file_input(pdf):
            normalized_file = await process_remote_file_input(str(pdf))
        else:
            normalized_file = await process_file_input(pdf)

        if not is_valid_pdf(normalized_file[0]):
            raise ValidationError("Invalid pdf file", {"input": pdf})

        # Get page count for handling negative indices
        page_count = get_pdf_page_count(normalized_file[0])
        normalized_pages = normalize_page_params(pages, page_count) if pages else None

        # Translate the inclusive page range into the strategy's
        # start/limit form. NOTE(review): "limit" is assumed to mean the
        # number of pages from "start" — confirm against the Build API.
        strategy_options = preset_options.copy() if preset_options else {}
        if normalized_pages:
            strategy_options["start"] = normalized_pages["start"]
            if normalized_pages["end"] >= 0:
                strategy_options["limit"] = (
                    normalized_pages["end"] - normalized_pages["start"] + 1
                )

        create_redactions_action = BuildActions.create_redactions_preset(
            preset, options, strategy_options
        )
        actions: list[ApplicableAction] = [create_redactions_action]

        # Staged redactions become permanent only when an apply action
        # follows the create action.
        if redaction_state == "apply":
            actions.append(BuildActions.apply_redactions())

        result = (
            await self.workflow()
            .add_file_part(pdf, None, actions)
            .output_pdf()
            .execute()
        )

        return cast("BufferOutput", self._process_typed_workflow_result(result))
actions.append(BuildActions.apply_redactions()) + + result = ( + await self.workflow() + .add_file_part(pdf, None, actions) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def create_redactions_regex( + self, + pdf: FileInput, + regex: str, + redaction_state: Literal["stage", "apply"] = "stage", + pages: PageRange | None = None, + regex_options: CreateRedactionsStrategyOptionsRegex | None = None, + options: BaseCreateRedactionsOptions | None = None, + ) -> BufferOutput: + r"""Create redaction annotations based on a regular expression. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to create redactions in + regex: The regular expression to search for + redaction_state: Whether to stage or apply redactions (default: 'stage') + pages: Optional page range to create redactions in + regex_options: Optional settings for the regex strategy + options: Optional settings for creating redactions + + Returns: + The document with redaction annotations + + Example: + ```python + result = await client.create_redactions_regex('document.pdf', r'Account:\s*\d{8,12}') + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Get page count for handling negative indices + page_count = get_pdf_page_count(normalized_file[0]) + normalized_pages = normalize_page_params(pages, page_count) if pages else None + + # Prepare strategy options with pages + strategy_options = regex_options.copy() if regex_options else {} + if normalized_pages: + strategy_options["start"] = normalized_pages["start"] + if normalized_pages["end"] >= 0: + strategy_options["limit"] = ( + normalized_pages["end"] - normalized_pages["start"] + 1 + ) + + 
create_redactions_action = BuildActions.create_redactions_regex( + regex, options, strategy_options + ) + actions: list[ApplicableAction] = [create_redactions_action] + + if redaction_state == "apply": + actions.append(BuildActions.apply_redactions()) + + result = ( + await self.workflow() + .add_file_part(pdf, None, actions) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def create_redactions_text( + self, + pdf: FileInput, + text: str, + redaction_state: Literal["stage", "apply"] = "stage", + pages: PageRange | None = None, + text_options: CreateRedactionsStrategyOptionsText | None = None, + options: BaseCreateRedactionsOptions | None = None, + ) -> BufferOutput: + """Create redaction annotations based on text. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to create redactions in + text: The text to search for + redaction_state: Whether to stage or apply redactions (default: 'stage') + pages: Optional page range to create redactions in + text_options: Optional settings for the text strategy + options: Optional settings for creating redactions + + Returns: + The document with redaction annotations + + Example: + ```python + result = await client.create_redactions_text('document.pdf', 'email@example.com') + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Get page count for handling negative indices + page_count = get_pdf_page_count(normalized_file[0]) + normalized_pages = normalize_page_params(pages, page_count) if pages else None + + # Prepare strategy options with pages + strategy_options = text_options.copy() if text_options else {} + if normalized_pages: + strategy_options["start"] = 
normalized_pages["start"] + if normalized_pages["end"] >= 0: + strategy_options["limit"] = ( + normalized_pages["end"] - normalized_pages["start"] + 1 + ) + + create_redactions_action = BuildActions.create_redactions_text( + text, options, strategy_options + ) + actions: list[ApplicableAction] = [create_redactions_action] + + if redaction_state == "apply": + actions.append(BuildActions.apply_redactions()) + + result = ( + await self.workflow() + .add_file_part(pdf, None, actions) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def apply_redactions(self, pdf: FileInput) -> BufferOutput: + """Apply staged redaction into the PDF. + + Args: + pdf: The PDF file with redaction annotations to apply + + Returns: + The document with applied redactions + + Example: + ```python + # Stage redactions from a createRedaction Method: + staged_result = await client.create_redactions_text( + 'document.pdf', + 'email@example.com', + 'stage' + ) + + result = await client.apply_redactions(staged_result['buffer']) + ``` + """ + apply_redactions_action = BuildActions.apply_redactions() + + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + result = ( + await self.workflow() + .add_file_part(pdf, None, [apply_redactions_action]) + .output_pdf() + .execute() + ) + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def rotate( + self, + pdf: FileInput, + angle: Literal[90, 180, 270], + pages: PageRange | None = None, + ) -> BufferOutput: + """Rotate pages in a document. + This is a convenience method that uses the workflow builder. 
+ + Args: + pdf: The PDF file to rotate + angle: Rotation angle (90, 180, or 270 degrees) + pages: Optional page range to rotate + + Returns: + The entire document with specified pages rotated + + Example: + ```python + result = await client.rotate('document.pdf', 90) + + # Rotate specific pages: + result = await client.rotate('document.pdf', 90, {'start': 1, 'end': 3}) + ``` + """ + rotate_action = BuildActions.rotate(angle) + + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + workflow = self.workflow() + + if pages: + page_count = get_pdf_page_count(normalized_file[0]) + normalized_pages = normalize_page_params(pages, page_count) + + # Add pages before the range to rotate + if normalized_pages["start"] > 0: + part_options = cast( + "FilePartOptions", + {"pages": {"start": 0, "end": normalized_pages["start"] - 1}}, + ) + workflow = workflow.add_file_part(pdf, part_options) + + # Add the specific pages with rotation action + part_options = cast("FilePartOptions", {"pages": normalized_pages}) + workflow = workflow.add_file_part(pdf, part_options, [rotate_action]) + + # Add pages after the range to rotate + if normalized_pages["end"] < page_count - 1: + part_options = cast( + "FilePartOptions", + { + "pages": { + "start": normalized_pages["end"] + 1, + "end": page_count - 1, + } + }, + ) + workflow = workflow.add_file_part(pdf, part_options) + else: + # If no pages specified, rotate the entire document + workflow = workflow.add_file_part(pdf, None, [rotate_action]) + + result = await workflow.output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def add_page( + self, pdf: FileInput, count: int = 1, index: int | None = None + ) -> BufferOutput: + """Add blank pages to a document. 
+ This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to add pages to + count: The number of blank pages to add + index: Optional index where to add the blank pages (0-based). If not provided, pages are added at the end. + + Returns: + The document with added pages + + Example: + ```python + # Add 2 blank pages at the end + result = await client.add_page('document.pdf', 2) + + # Add 1 blank page after the first page (at index 1) + result = await client.add_page('document.pdf', 1, 1) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # If no index is provided or it's the end of the document, simply add pages at the end + if index is None: + builder = self.workflow() + + builder.add_file_part(pdf) + + # Add the specified number of blank pages + builder = builder.add_new_page({"pageCount": count}) + + result = await builder.output_pdf().execute() + else: + # Get the actual page count of the PDF + page_count = get_pdf_page_count(normalized_file[0]) + + # Validate that the index is within range + if index < 0 or index > page_count: + raise ValidationError( + f"Index {index} is out of range (document has {page_count} pages)" + ) + + builder = self.workflow() + + # Add pages before the specified index + if index > 0: + before_pages = normalize_page_params( + {"start": 0, "end": index - 1}, page_count + ) + part_options = cast("FilePartOptions", {"pages": before_pages}) + builder = builder.add_file_part(pdf, part_options) + + # Add the blank pages + builder = builder.add_new_page({"pageCount": count}) + + # Add pages after the specified index + if index < page_count: + after_pages = normalize_page_params( + {"start": index, "end": page_count - 1}, page_count + ) + part_options = 
cast("FilePartOptions", {"pages": after_pages}) + builder = builder.add_file_part(pdf, part_options) + + result = await builder.output_pdf().execute() + + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def split( + self, pdf: FileInput, page_ranges: list[PageRange] + ) -> list[BufferOutput]: + """Split a PDF document into multiple parts based on page ranges. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to split + page_ranges: Array of page ranges to extract + + Returns: + An array of PDF documents, one for each page range + + Example: + ```python + results = await client.split('document.pdf', [ + {'start': 0, 'end': 2}, # Pages 0, 1, 2 + {'start': 3, 'end': 5} # Pages 3, 4, 5 + ]) + ``` + """ + if not page_ranges or len(page_ranges) == 0: + raise ValidationError("At least one page range is required for splitting") + + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Get the actual page count of the PDF + page_count = get_pdf_page_count(normalized_file[0]) + + # Normalize and validate all page ranges + normalized_ranges = [ + normalize_page_params(page_range, page_count) for page_range in page_ranges + ] + + # Validate that all page ranges are within bounds + for page_range in normalized_ranges: + if page_range["start"] > page_range["end"]: + raise ValidationError( + f"Page range {page_range} is invalid (start > end)" + ) + + # Create a separate workflow for each page range + import asyncio + from typing import cast as typing_cast + + async def create_split_pdf(page_range: Pages) -> BufferOutput: + builder = self.workflow() + part_options = cast("FilePartOptions", {"pages": page_range}) + builder = builder.add_file_part(pdf, part_options) + result = await 
builder.output_pdf().execute() + return typing_cast( + "BufferOutput", self._process_typed_workflow_result(result) + ) + + # Execute all workflows in parallel and process the results + tasks = [create_split_pdf(page_range) for page_range in normalized_ranges] + results = await asyncio.gather(*tasks) + + return results + + async def duplicate_pages( + self, pdf: FileInput, page_indices: list[int] + ) -> BufferOutput: + """Create a new PDF containing only the specified pages in the order provided. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to extract pages from + page_indices: Array of page indices to include in the new PDF (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + + Returns: + A new document with only the specified pages + + Example: + ```python + # Create a new PDF with only the first and third pages + result = await client.duplicate_pages('document.pdf', [0, 2]) + + # Create a new PDF with pages in a different order + result = await client.duplicate_pages('document.pdf', [2, 0, 1]) + + # Create a new PDF with duplicated pages + result = await client.duplicate_pages('document.pdf', [0, 0, 1, 1, 0]) + + # Create a new PDF with the first and last pages + result = await client.duplicate_pages('document.pdf', [0, -1]) + ``` + """ + if not page_indices or len(page_indices) == 0: + raise ValidationError("At least one page index is required for duplication") + + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Get the actual page count of the PDF + page_count = get_pdf_page_count(normalized_file[0]) + + # Normalize negative indices + normalized_indices = [] + for index in page_indices: + if index < 0: + # Handle negative indices (e.g., -1 is the 
last page) + normalized_indices.append(page_count + index) + else: + normalized_indices.append(index) + + # Validate that all page indices are within range + for i, original_index in enumerate(page_indices): + normalized_index = normalized_indices[i] + if normalized_index < 0 or normalized_index >= page_count: + raise ValidationError( + f"Page index {original_index} is out of range (document has {page_count} pages)" + ) + + builder = self.workflow() + + # Add each page in the order specified + for page_index in normalized_indices: + # Use normalize_page_params to ensure consistent handling + page_range = normalize_page_params({"start": page_index, "end": page_index}) + part_options = cast("FilePartOptions", {"pages": page_range}) + builder = builder.add_file_part(pdf, part_options) + + result = await cast("WorkflowWithPartsStage", builder).output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def delete_pages( + self, pdf: FileInput, page_indices: list[int] + ) -> BufferOutput: + """Delete pages from a PDF document. + This is a convenience method that uses the workflow builder. 
+ + Args: + pdf: The PDF file to modify + page_indices: Array of page indices to delete (0-based) + Negative indices count from the end of the document (e.g., -1 is the last page) + + Returns: + The document with deleted pages + + Example: + ```python + # Delete second and fourth pages + result = await client.delete_pages('document.pdf', [1, 3]) + + # Delete the last page + result = await client.delete_pages('document.pdf', [-1]) + + # Delete the first and last two pages + result = await client.delete_pages('document.pdf', [0, -1, -2]) + ``` + """ + if not page_indices or len(page_indices) == 0: + raise ValidationError("At least one page index is required for deletion") + + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) + + # Get the actual page count of the PDF + page_count = get_pdf_page_count(normalized_file[0]) + + # Normalize negative indices + normalized_indices = [] + for index in page_indices: + if index < 0: + # Handle negative indices (e.g., -1 is the last page) + normalized_indices.append(page_count + index) + else: + normalized_indices.append(index) + + # Remove duplicates and sort the deleteIndices + delete_indices = sorted(set(normalized_indices)) + + # Validate that all page indices are within range + for original_index in page_indices: + if original_index >= 0: + normalized_index = original_index + else: + normalized_index = page_count + original_index + + if normalized_index < 0 or normalized_index >= page_count: + raise ValidationError( + f"Page index {original_index} is out of range (document has {page_count} pages)" + ) + + builder = self.workflow() + + # Group consecutive pages that should be kept into ranges + current_page = 0 + page_ranges = [] + + for delete_index in delete_indices: + if current_page < delete_index: + 
page_ranges.append( + normalize_page_params( + {"start": current_page, "end": delete_index - 1} + ) + ) + current_page = delete_index + 1 + + if ( + current_page > 0 or (current_page == 0 and len(delete_indices) == 0) + ) and current_page < page_count: + page_ranges.append( + normalize_page_params({"start": current_page, "end": page_count - 1}) + ) + + if len(page_ranges) == 0: + raise ValidationError("You cannot delete all pages from a document") + + for page_range in page_ranges: + part_options = cast("FilePartOptions", {"pages": page_range}) + builder = builder.add_file_part(pdf, part_options) + + result = await cast("WorkflowWithPartsStage", builder).output_pdf().execute() + return cast("BufferOutput", self._process_typed_workflow_result(result)) + + async def optimize( + self, + pdf: FileInput, + options: OptimizePdf | None = None, + ) -> BufferOutput: + """Optimize a PDF document for size reduction. + This is a convenience method that uses the workflow builder. + + Args: + pdf: The PDF file to optimize + options: Optimization options + + Returns: + The optimized document + + Example: + ```python + result = await client.optimize('large-document.pdf', { + 'grayscaleImages': True, + 'mrcCompression': True, + 'imageOptimizationQuality': 2 + }) + ``` + """ + # Validate PDF + if is_remote_file_input(pdf): + normalized_file = await process_remote_file_input(str(pdf)) + else: + normalized_file = await process_file_input(pdf) + + if not is_valid_pdf(normalized_file[0]): + raise ValidationError("Invalid pdf file", {"input": pdf}) - def close(self) -> None: - """Close the HTTP client session.""" - self._http_client.close() + if options is None: + options = {"imageOptimizationQuality": 2} - def __enter__(self) -> "NutrientClient": - """Context manager entry.""" - return self + result = ( + await self.workflow() + .add_file_part(pdf) + .output_pdf(cast("PDFOutputOptions", {"optimize": options})) + .execute() + ) - def __exit__(self, *args: Any) -> None: - """Context 
manager exit.""" - self.close() + return cast("BufferOutput", self._process_typed_workflow_result(result)) diff --git a/src/nutrient_dws/errors.py b/src/nutrient_dws/errors.py new file mode 100644 index 0000000..2886487 --- /dev/null +++ b/src/nutrient_dws/errors.py @@ -0,0 +1,175 @@ +"""Error classes for Nutrient DWS client. +Provides consistent error handling across the library. +""" + +from typing import Any + + +class NutrientError(Exception): + """Base error class for all Nutrient DWS client errors. + Provides consistent error handling across the library. + """ + + def __init__( + self, + message: str, + code: str = "NUTRIENT_ERROR", + details: dict[str, Any] | None = None, + status_code: int | None = None, + ) -> None: + """Initialize a NutrientError. + + Args: + message: Error message + code: Error code for programmatic error handling + details: Additional error details + status_code: HTTP status code if the error originated from an HTTP response + """ + super().__init__(message) + self.name = "NutrientError" + self.message = message + self.code = code + self.details = details + self.status_code = status_code + + # Python doesn't have direct equivalent to Error.captureStackTrace, + # but the stack trace is automatically captured + + def to_json(self) -> dict[str, Any]: + """Returns a JSON representation of the error. + + Returns: + Dict containing error details + """ + return { + "name": self.name, + "message": str(self), + "code": self.code, + "details": self.details, + "status_code": self.status_code, + "stack": self.__traceback__, + } + + def __str__(self) -> str: + """Returns a string representation of the error. 
+ + Returns: + Formatted error string + """ + result = f"{self.name}: {super().__str__()}" + if self.code != "NUTRIENT_ERROR": + result += f" ({self.code})" + if self.status_code: + result += f" [HTTP {self.status_code}]" + return result + + @classmethod + def wrap(cls, error: Any, message: str | None = None) -> "NutrientError": + """Wraps an unknown error into a NutrientError. + + Args: + error: The error to wrap + message: Optional message to prepend + + Returns: + A NutrientError instance + """ + if isinstance(error, NutrientError): + return error + + if isinstance(error, Exception): + wrapped_message = f"{message}: {error!s}" if message else str(error) + return NutrientError( + wrapped_message, + "WRAPPED_ERROR", + { + "originalError": error.__class__.__name__, + "originalMessage": str(error), + "stack": error.__traceback__, + }, + ) + + error_message = message or "An unknown error occurred" + return NutrientError( + error_message, "UNKNOWN_ERROR", {"originalError": str(error)} + ) + + +class ValidationError(NutrientError): + """Error thrown when input validation fails.""" + + def __init__( + self, + message: str, + details: dict[str, Any] | None = None, + status_code: int | None = None, + ) -> None: + """Initialize a ValidationError. + + Args: + message: Error message + details: Additional error details + status_code: HTTP status code if applicable + """ + super().__init__(message, "VALIDATION_ERROR", details, status_code) + self.name = "ValidationError" + + +class APIError(NutrientError): + """Error thrown when API requests fail.""" + + def __init__( + self, + message: str, + status_code: int, + details: dict[str, Any] | None = None, + ) -> None: + """Initialize an APIError. 
+ + Args: + message: Error message + status_code: HTTP status code + details: Additional error details + """ + super().__init__(message, "API_ERROR", details, status_code) + self.name = "APIError" + + +class AuthenticationError(NutrientError): + """Error thrown when authentication fails.""" + + def __init__( + self, + message: str, + details: dict[str, Any] | None = None, + status_code: int = 401, + ) -> None: + """Initialize an AuthenticationError. + + Args: + message: Error message + details: Additional error details + status_code: HTTP status code, defaults to 401 + """ + super().__init__(message, "AUTHENTICATION_ERROR", details, status_code) + self.name = "AuthenticationError" + + +class NetworkError(NutrientError): + """Error thrown when network requests fail.""" + + def __init__( + self, + message: str, + details: dict[str, Any] | None = None, + status_code: int | None = None, + ) -> None: + """Initialize a NetworkError. + + Args: + message: Error message + details: Additional error details + status_code: HTTP status code if applicable + """ + super().__init__(message, "NETWORK_ERROR", details, status_code) + self.name = "NetworkError" diff --git a/src/nutrient_dws/exceptions.py b/src/nutrient_dws/exceptions.py deleted file mode 100644 index 413e2e9..0000000 --- a/src/nutrient_dws/exceptions.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Custom exceptions for Nutrient DWS client.""" - -from typing import Any - - -class NutrientError(Exception): - """Base exception for all Nutrient client errors.""" - - pass - - -class AuthenticationError(NutrientError): - """Raised when authentication fails (401/403 errors). - - This typically indicates: - - Missing API key - - Invalid API key - - Expired API key - - Insufficient permissions - """ - - def __init__(self, message: str = "Authentication failed") -> None: - """Initialize AuthenticationError.""" - super().__init__(message) - - -class APIError(NutrientError): - """Raised for general API errors. 
- - Attributes: - status_code: HTTP status code from the API. - response_body: Raw response body from the API for debugging. - request_id: Request ID for tracking (if available). - """ - - def __init__( - self, - message: str, - status_code: int | None = None, - response_body: str | None = None, - request_id: str | None = None, - ) -> None: - """Initialize APIError with status code and response body.""" - super().__init__(message) - self.status_code = status_code - self.response_body = response_body - self.request_id = request_id - - def __str__(self) -> str: - """String representation with all available error details.""" - parts = [str(self.args[0]) if self.args else "API Error"] - - if self.status_code: - parts.append(f"Status: {self.status_code}") - - if self.request_id: - parts.append(f"Request ID: {self.request_id}") - - if self.response_body: - parts.append(f"Response: {self.response_body}") - - return " | ".join(parts) - - -class ValidationError(NutrientError): - """Raised when request validation fails.""" - - def __init__(self, message: str, errors: dict[str, Any] | None = None) -> None: - """Initialize ValidationError with validation details.""" - super().__init__(message) - self.errors = errors or {} - - -class NutrientTimeoutError(NutrientError): - """Raised when a request times out.""" - - pass - - -class FileProcessingError(NutrientError): - """Raised when file processing fails.""" - - pass diff --git a/src/nutrient_dws/file_handler.py b/src/nutrient_dws/file_handler.py deleted file mode 100644 index f79cfde..0000000 --- a/src/nutrient_dws/file_handler.py +++ /dev/null @@ -1,263 +0,0 @@ -"""File handling utilities for input/output operations.""" - -import contextlib -import io -import os -import re -from collections.abc import Generator -from pathlib import Path -from typing import BinaryIO - -FileInput = str | Path | bytes | BinaryIO - -# Default chunk size for streaming operations (1MB) -DEFAULT_CHUNK_SIZE = 1024 * 1024 - - -def 
prepare_file_input(file_input: FileInput) -> tuple[bytes, str]: - """Convert various file input types to bytes. - - Args: - file_input: File path, bytes, or file-like object. - - Returns: - tuple of (file_bytes, filename). - - Raises: - FileNotFoundError: If file path doesn't exist. - ValueError: If input type is not supported. - """ - # Handle different file input types using pattern matching - match file_input: - case Path() if not file_input.exists(): - raise FileNotFoundError(f"File not found: {file_input}") - case Path(): - return file_input.read_bytes(), file_input.name - case str(): - path = Path(file_input) - if not path.exists(): - raise FileNotFoundError(f"File not found: {file_input}") - return path.read_bytes(), path.name - case bytes(): - return file_input, "document" - case _ if hasattr(file_input, "read"): - # Handle file-like objects - # Save current position if seekable - current_pos = None - if hasattr(file_input, "seek") and hasattr(file_input, "tell"): - try: - current_pos = file_input.tell() - file_input.seek(0) # Read from beginning - except (OSError, io.UnsupportedOperation): - pass - - content = file_input.read() - if isinstance(content, str): - content = content.encode() - - # Restore position if we saved it - if current_pos is not None: - with contextlib.suppress(OSError, io.UnsupportedOperation): - file_input.seek(current_pos) - - filename = getattr(file_input, "name", "document") - if hasattr(filename, "__fspath__"): - filename = os.path.basename(os.fspath(filename)) - elif isinstance(filename, bytes): - filename = os.path.basename(filename.decode()) - elif isinstance(filename, str): - filename = os.path.basename(filename) - return content, str(filename) - case _: - raise ValueError(f"Unsupported file input type: {type(file_input)}") - - -def prepare_file_for_upload( - file_input: FileInput, - field_name: str = "file", -) -> tuple[str, tuple[str, bytes | BinaryIO, str]]: - """Prepare file for multipart upload. 
- - Args: - file_input: File path, bytes, or file-like object. - field_name: Form field name for the file. - - Returns: - tuple of (field_name, (filename, file_content_or_stream, content_type)). - - Raises: - FileNotFoundError: If file path doesn't exist. - ValueError: If input type is not supported. - """ - content_type = "application/octet-stream" - - # Handle different file input types using pattern matching - path: Path | None - match file_input: - case Path(): - path = file_input - case str(): - path = Path(file_input) - case _: - path = None - - # Handle path-based inputs - if path is not None: - if not path.exists(): - raise FileNotFoundError(f"File not found: {path}") - - # For large files, return file handle instead of reading into memory - file_size = path.stat().st_size - if file_size > 10 * 1024 * 1024: # 10MB threshold - # Note: File handle is intentionally not using context manager - # as it needs to remain open for streaming upload by HTTP client - file_handle = open(path, "rb") # noqa: SIM115 - return field_name, (path.name, file_handle, content_type) - else: - return field_name, (path.name, path.read_bytes(), content_type) - - # Handle non-path inputs - match file_input: - case bytes(): - return field_name, ("document", file_input, content_type) - case _ if hasattr(file_input, "read"): - filename = getattr(file_input, "name", "document") - if hasattr(filename, "__fspath__"): - filename = os.path.basename(os.fspath(filename)) - elif isinstance(filename, bytes): - filename = os.path.basename(filename.decode()) - elif isinstance(filename, str): - filename = os.path.basename(filename) - return field_name, (str(filename), file_input, content_type) # type: ignore[return-value] - case _: - raise ValueError(f"Unsupported file input type: {type(file_input)}") - - -def save_file_output(content: bytes, output_path: str) -> None: - """Save file content to disk. - - Args: - content: File bytes to save. - output_path: Path where to save the file. 
- - Raises: - OSError: If file cannot be written. - """ - path = Path(output_path) - # Create parent directories if they don't exist - path.parent.mkdir(parents=True, exist_ok=True) - path.write_bytes(content) - - -def stream_file_content( - file_path: str, - chunk_size: int = DEFAULT_CHUNK_SIZE, -) -> Generator[bytes, None, None]: - """Stream file content in chunks. - - Args: - file_path: Path to the file to stream. - chunk_size: Size of each chunk in bytes. - - Yields: - Chunks of file content. - - Raises: - FileNotFoundError: If file doesn't exist. - """ - path = Path(file_path) - if not path.exists(): - raise FileNotFoundError(f"File not found: {file_path}") - - with open(path, "rb") as f: - while chunk := f.read(chunk_size): - yield chunk - - -def get_file_size(file_input: FileInput) -> int | None: - """Get size of file input if available. - - Args: - file_input: File path, bytes, or file-like object. - - Returns: - File size in bytes, or None if size cannot be determined. - """ - if isinstance(file_input, Path): - if file_input.exists(): - return file_input.stat().st_size - elif isinstance(file_input, str): - path = Path(file_input) - if path.exists(): - return path.stat().st_size - elif isinstance(file_input, bytes): - return len(file_input) - elif hasattr(file_input, "seek") and hasattr(file_input, "tell"): - # For seekable file-like objects - try: - current_pos = file_input.tell() - file_input.seek(0, 2) # Seek to end - size = file_input.tell() - file_input.seek(current_pos) # Restore position - return size - except (OSError, io.UnsupportedOperation): - pass - - return None - - -def get_pdf_page_count(pdf_input: FileInput) -> int: - """Zero dependency way to get the number of pages in a PDF. - - Args: - pdf_input: File path, bytes, or file-like object. Has to be of a PDF file - - Returns: - Number of pages in a PDF. 
- """ - if isinstance(pdf_input, (str, Path)): - with open(pdf_input, "rb") as f: - pdf_bytes = f.read() - elif isinstance(pdf_input, bytes): - pdf_bytes = pdf_input - elif hasattr(pdf_input, "read") and hasattr(pdf_input, "seek") and hasattr(pdf_input, "tell"): - pos = pdf_input.tell() - pdf_input.seek(0) - pdf_bytes = pdf_input.read() - pdf_input.seek(pos) - else: - raise TypeError("Unsupported input type. Expected str, Path, bytes, or seekable BinaryIO.") - - # Find all PDF objects - objects = re.findall(rb"(\d+)\s+(\d+)\s+obj(.*?)endobj", pdf_bytes, re.DOTALL) - - # Get the Catalog Object - catalog_obj = None - for _obj_num, _gen_num, obj_data in objects: - if b"/Type" in obj_data and b"/Catalog" in obj_data: - catalog_obj = obj_data - break - - if not catalog_obj: - raise ValueError("Could not find /Catalog object in PDF.") - - # Extract /Pages reference (e.g. 3 0 R) - pages_ref_match = re.search(rb"/Pages\s+(\d+)\s+(\d+)\s+R", catalog_obj) - if not pages_ref_match: - raise ValueError("Could not find /Pages reference in /Catalog.") - pages_obj_num = pages_ref_match.group(1).decode() - pages_obj_gen = pages_ref_match.group(2).decode() - - # Step 3: Find the referenced /Pages object - pages_obj_pattern = rf"{pages_obj_num}\s+{pages_obj_gen}\s+obj(.*?)endobj".encode() - pages_obj_match = re.search(pages_obj_pattern, pdf_bytes, re.DOTALL) - if not pages_obj_match: - raise ValueError("Could not find root /Pages object.") - pages_obj_data = pages_obj_match.group(1) - - # Step 4: Extract /Count - count_match = re.search(rb"/Count\s+(\d+)", pages_obj_data) - if not count_match: - raise ValueError("Could not find /Count in root /Pages object.") - - return int(count_match.group(1)) diff --git a/src/nutrient_dws/http.py b/src/nutrient_dws/http.py new file mode 100644 index 0000000..6aae822 --- /dev/null +++ b/src/nutrient_dws/http.py @@ -0,0 +1,607 @@ +"""HTTP request and response type definitions for API communication.""" + +import json +from collections.abc import 
Awaitable, Callable +from typing import Any, Generic, Literal, TypeGuard, TypeVar, Union, overload + +import httpx +from typing_extensions import NotRequired, TypedDict + +from nutrient_dws.errors import ( + APIError, + AuthenticationError, + NetworkError, + NutrientError, + ValidationError, +) +from nutrient_dws.inputs import FileInput, NormalizedFileData +from nutrient_dws.types.account_info import AccountInfo +from nutrient_dws.types.analyze_response import AnalyzeBuildResponse +from nutrient_dws.types.build_instruction import BuildInstructions +from nutrient_dws.types.build_response_json import BuildResponseJsonContents +from nutrient_dws.types.create_auth_token import ( + CreateAuthTokenParameters, + CreateAuthTokenResponse, +) +from nutrient_dws.types.redact_data import RedactData +from nutrient_dws.types.sign_request import CreateDigitalSignature +from nutrient_dws.utils import get_user_agent + + +class BuildRequestData(TypedDict): + instructions: BuildInstructions + files: NotRequired[dict[str, NormalizedFileData]] + + +class AnalyzeBuildRequestData(TypedDict): + instructions: BuildInstructions + + +class SignRequestOptions(TypedDict): + image: NotRequired[FileInput] + graphicImage: NotRequired[FileInput] + + +class SignRequestData(TypedDict): + file: NormalizedFileData + data: NotRequired[CreateDigitalSignature] + image: NotRequired[NormalizedFileData] + graphicImage: NotRequired[NormalizedFileData] + + +class RedactRequestData(TypedDict): + data: RedactData + fileKey: NotRequired[str] + file: NotRequired[NormalizedFileData] + + +class DeleteTokenRequestData(TypedDict): + id: str + + +# Methods and Endpoints types +Method = TypeVar("Method", bound=Literal["GET", "POST", "DELETE"]) +Endpoint = TypeVar( + "Endpoint", + bound=Literal[ + "/account/info", "/build", "/analyze_build", "/sign", "/ai/redact", "/tokens" + ], +) + +# Type variables for generic types +Input = TypeVar( + "Input", + bound=CreateAuthTokenParameters + | BuildRequestData + | 
AnalyzeBuildRequestData + | SignRequestData + | RedactRequestData + | DeleteTokenRequestData + | None, +) +Output = TypeVar( + "Output", + bound=CreateAuthTokenResponse + | str + | bytes + | BuildResponseJsonContents + | AnalyzeBuildResponse + | AccountInfo + | None, +) + + +# Request configuration +class RequestConfig(TypedDict, Generic[Method, Endpoint, Input]): + """HTTP request configuration for API calls.""" + + method: Method + endpoint: Endpoint + data: Input # The actual type depends on the method and endpoint + headers: dict[str, str] | None + + +def is_get_account_info_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[RequestConfig[Literal["GET"], Literal["/account/info"], None]]: + return request["method"] == "GET" and request["endpoint"] == "/account/info" + + +def is_post_build_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[RequestConfig[Literal["POST"], Literal["/build"], BuildRequestData]]: + return request["method"] == "POST" and request["endpoint"] == "/build" + + +def is_post_analyse_build_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[ + RequestConfig[Literal["POST"], Literal["/analyze_build"], AnalyzeBuildRequestData] +]: + return request["method"] == "POST" and request["endpoint"] == "/analyze_build" + + +def is_post_sign_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[RequestConfig[Literal["POST"], Literal["/sign"], SignRequestData]]: + return request["method"] == "POST" and request["endpoint"] == "/sign" + + +def is_post_ai_redact_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[ + RequestConfig[Literal["POST"], Literal["/ai/redact"], RedactRequestData] +]: + return request["method"] == "POST" and request["endpoint"] == "/ai/redact" + + +def is_post_tokens_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[ + RequestConfig[Literal["POST"], 
Literal["/tokens"], CreateAuthTokenParameters] +]: + return request["method"] == "POST" and request["endpoint"] == "/tokens" + + +def is_delete_tokens_request_config( + request: RequestConfig[Method, Endpoint, Input], +) -> TypeGuard[ + RequestConfig[Literal["DELETE"], Literal["/tokens"], DeleteTokenRequestData] +]: + return request["method"] == "DELETE" and request["endpoint"] == "/tokens" + + +# API response +class ApiResponse(TypedDict, Generic[Output]): + """Response from API call.""" + + data: Output # The actual type depends on the method and endpoint + status: int + statusText: str + headers: dict[str, Any] + + +# Client options +class NutrientClientOptions(TypedDict): + """Client options for Nutrient DWS API.""" + + apiKey: str | Callable[[], str | Awaitable[str]] + baseUrl: str | None + timeout: int | None + + +async def resolve_api_key(api_key: str | Callable[[], str | Awaitable[str]]) -> str: + """Resolves API key from string or function. + + Args: + api_key: API key as string or function that returns a string + + Returns: + Resolved API key as string + + Raises: + AuthenticationError: If API key function returns invalid value + """ + if isinstance(api_key, str): + return api_key + + try: + resolved_key = api_key() + if isinstance(resolved_key, Awaitable): + return await resolved_key + if not isinstance(resolved_key, str) or len(resolved_key) == 0: + raise AuthenticationError( + "API key function must return a non-empty string", + {"resolvedType": type(resolved_key).__name__}, + ) + return resolved_key + except Exception as error: + if isinstance(error, AuthenticationError): + raise error + raise AuthenticationError( + "Failed to resolve API key from function", {"error": str(error)} + ) + + +def append_file_to_form_data( + form_data: dict[str, Any], key: str, file: NormalizedFileData +) -> None: + """Appends file to form data with proper format. 
+ + Args: + form_data: Form data dictionary + key: Key for the file + file: File data + + Raises: + ValidationError: If file data is not in expected format + """ + file_content, filename = file + + if not isinstance(file_content, bytes): + raise ValidationError( + "Expected bytes for file data", {"dataType": type(file_content).__name__} + ) + + form_data[key] = (filename, file_content) + + +def prepare_request_body( + request_config: dict[str, Any], config: RequestConfig[Method, Endpoint, Input] +) -> dict[str, Any]: + """Prepares request body with files and data. + + Args: + request_config: Request configuration dictionary + config: Request configuration + + Returns: + Updated request configuration + """ + if is_post_build_request_config(config): + # Use multipart/form-data for file uploads + files: dict[str, Any] = {} + for key, value in config["data"]["files"].items(): + append_file_to_form_data(files, key, value) + + request_config["files"] = files + request_config["data"] = { + "instructions": json.dumps(config["data"]["instructions"]) + } + + return request_config + + if is_post_analyse_build_request_config(config): + # JSON only request + request_config["json"] = config["data"]["instructions"] + + return request_config + + if is_post_sign_request_config(config): + files = {} + append_file_to_form_data(files, "file", config["data"]["file"]) + + if "image" in config["data"]: + append_file_to_form_data(files, "image", config["data"]["image"]) + + if "graphicImage" in config["data"]: + append_file_to_form_data( + files, "graphicImage", config["data"]["graphicImage"] + ) + + request_config["files"] = files + + data = {} + if "data" in config["data"] and config["data"]["data"] is not None: + data["data"] = json.dumps(config["data"]["data"]) + else: + data["data"] = json.dumps( + { + "signatureType": "cades", + "cadesLevel": "b-lt", + } + ) + + request_config["data"] = data + + return request_config + + if is_post_ai_redact_request_config(config): + if "file" in 
config["data"] and "fileKey" in config["data"]: + files = {} + append_file_to_form_data( + files, config["data"]["fileKey"], config["data"]["file"] + ) + + request_config["files"] = files + request_config["data"] = {"data": json.dumps(config["data"]["data"])} + else: + # JSON only request + request_config["json"] = config["data"]["data"] + + return request_config + + # Fallback, passing data as JSON + if "data" in config: + request_config["json"] = config["data"] + + return request_config + + +def extract_error_message(data: Any) -> str | None: + """Extracts error message from response data with comprehensive DWS error handling. + + Args: + data: Response data + + Returns: + Extracted error message or None if not found + """ + if isinstance(data, dict): + error_data = data + + # DWS-specific error fields (prioritized) + if "error_description" in error_data and isinstance( + error_data["error_description"], str + ): + return error_data["error_description"] + + if "error_message" in error_data and isinstance( + error_data["error_message"], str + ): + return error_data["error_message"] + + # Common error message fields + if "message" in error_data and isinstance(error_data["message"], str): + return error_data["message"] + + if "error" in error_data and isinstance(error_data["error"], str): + return error_data["error"] + + if "detail" in error_data and isinstance(error_data["detail"], str): + return error_data["detail"] + + if "details" in error_data and isinstance(error_data["details"], str): + return error_data["details"] + + # Handle nested error objects + if "error" in error_data and isinstance(error_data["error"], dict): + nested_error = error_data["error"] + + if "message" in nested_error and isinstance(nested_error["message"], str): + return nested_error["message"] + + if "description" in nested_error and isinstance( + nested_error["description"], str + ): + return nested_error["description"] + + # Handle errors array (common in validation responses) + if ( + 
"errors" in error_data + and isinstance(error_data["errors"], list) + and error_data["errors"] + ): + first_error = error_data["errors"][0] + + if isinstance(first_error, str): + return first_error + + if isinstance(first_error, dict): + error_obj = first_error + + if "message" in error_obj and isinstance(error_obj["message"], str): + return error_obj["message"] + + return None + + +def create_http_error(status: int, status_text: str, data: Any) -> NutrientError: + """Creates appropriate error for HTTP status codes. + + Args: + status: HTTP status code + status_text: HTTP status text + data: Response data + + Returns: + Appropriate NutrientError subclass + """ + message = extract_error_message(data) or f"HTTP {status}: {status_text}" + details = data if isinstance(data, dict) else {"response": data} + + if status in (401, 403): + return AuthenticationError(message, details, status) + + if 400 <= status < 500: + return ValidationError(message, details, status) + + return APIError(message, status, details) + + +def handle_response(response: httpx.Response) -> ApiResponse[Output]: + """Handles HTTP response and converts to standardized format. + + Args: + response: Response from the API + + Returns: + Standardized API response + + Raises: + NutrientError: For error responses + """ + status = response.status_code + status_text = response.reason_phrase + headers: dict[str, Any] = dict(response.headers) + + try: + data = response.json() + except (ValueError, json.JSONDecodeError): + data = response.content + + # Check for error status codes + if status >= 400: + raise create_http_error(status, status_text, data) + + return { + "data": data, + "status": status, + "statusText": status_text, + "headers": headers, + } + + +def convert_error( + error: Any, config: RequestConfig[Method, Endpoint, Input] +) -> NutrientError: + """Converts various error types to NutrientError. 
+ + Args: + error: The error to convert + config: Request configuration + + Returns: + Converted NutrientError + """ + if isinstance(error, NutrientError): + return error + + if isinstance(error, (httpx.RequestError, httpx.HTTPStatusError)): + response = getattr(error, "response", None) + request = getattr(error, "request", None) + message = str(error) + + if response is not None: + # HTTP error response + try: + response_data = response.json() + except (ValueError, json.JSONDecodeError): + response_data = response.text + return create_http_error( + response.status_code, response.reason_phrase, response_data + ) + + if request is not None: + # Network error (request made but no response) + sanitized_headers = (config.get("headers") or {}).copy() + if "Authorization" in sanitized_headers: + del sanitized_headers["Authorization"] + + return NetworkError( + "Network request failed", + { + "message": message, + "endpoint": config["endpoint"], + "method": config["method"], + "headers": sanitized_headers, + }, + ) + + # Request setup error + return ValidationError( + "Request configuration error", + { + "message": message, + "endpoint": config["endpoint"], + "method": config["method"], + "data": config.get("data"), + }, + ) + + # Unknown error + return NutrientError( + "Unexpected error occurred", + "UNKNOWN_ERROR", + { + "error": str(error), + "endpoint": config["endpoint"], + "method": config["method"], + "data": config.get("data"), + }, + ) + + +@overload +async def send_request( + config: RequestConfig[Literal["GET"], Literal["/account/info"], None], + client_options: NutrientClientOptions, +) -> ApiResponse[AccountInfo]: ... + + +@overload +async def send_request( + config: RequestConfig[ + Literal["POST"], Literal["/tokens"], CreateAuthTokenParameters + ], + client_options: NutrientClientOptions, +) -> ApiResponse[CreateAuthTokenResponse]: ... 
+ + +@overload +async def send_request( + config: RequestConfig[Literal["POST"], Literal["/build"], BuildRequestData], + client_options: NutrientClientOptions, +) -> ApiResponse[Union[BuildResponseJsonContents, bytes, str]]: ... + + +@overload +async def send_request( + config: RequestConfig[ + Literal["POST"], Literal["/analyze_build"], AnalyzeBuildRequestData + ], + client_options: NutrientClientOptions, +) -> ApiResponse[AnalyzeBuildResponse]: ... + + +@overload +async def send_request( + config: RequestConfig[Literal["POST"], Literal["/sign"], SignRequestData], + client_options: NutrientClientOptions, +) -> ApiResponse[bytes]: ... + + +@overload +async def send_request( + config: RequestConfig[Literal["POST"], Literal["/ai/redact"], RedactRequestData], + client_options: NutrientClientOptions, +) -> ApiResponse[bytes]: ... + + +@overload +async def send_request( + config: RequestConfig[ + Literal["DELETE"], Literal["/tokens"], DeleteTokenRequestData + ], + client_options: NutrientClientOptions, +) -> ApiResponse[None]: ... + + +async def send_request( + config: RequestConfig[Method, Endpoint, Input], + client_options: NutrientClientOptions, +) -> ApiResponse[Output]: + """Sends HTTP request to Nutrient DWS Processor API. + Handles authentication, file uploads, and error conversion. 
+ + Args: + config: Request configuration + client_options: Client options + + Returns: + API response + + Raises: + NutrientError: For various error conditions + """ + try: + # Resolve API key (string or function) + api_key = await resolve_api_key(client_options["apiKey"]) + + # Build full URL + base_url: str = client_options.get("baseUrl") or "https://api.nutrient.io" + url = f"{base_url.rstrip('/')}{config['endpoint']}" + + headers = config.get("headers") or {} + headers["Authorization"] = f"Bearer {api_key}" + headers["User-Agent"] = get_user_agent() + + # Prepare request configuration + request_config: dict[str, Any] = { + "method": config["method"], + "url": url, + "headers": headers, + "timeout": client_options.get("timeout", None), + } + + # Prepare request body + request_config = prepare_request_body(request_config, config) + + # Make request using httpx async client + async with httpx.AsyncClient() as client: + response = await client.request(**request_config) + + # Handle response + return handle_response(response) + except Exception as error: + raise convert_error(error, config) diff --git a/src/nutrient_dws/http_client.py b/src/nutrient_dws/http_client.py deleted file mode 100644 index 8483428..0000000 --- a/src/nutrient_dws/http_client.py +++ /dev/null @@ -1,192 +0,0 @@ -"""HTTP client abstraction for API communication.""" - -import json -import logging -from typing import Any - -import requests -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -from nutrient_dws.exceptions import ( - APIError, - AuthenticationError, - NutrientTimeoutError, - ValidationError, -) - -logger = logging.getLogger(__name__) - - -class HTTPClient: - """HTTP client with connection pooling and retry logic.""" - - def __init__(self, api_key: str | None, timeout: int = 300) -> None: - """Initialize HTTP client with authentication. - - Args: - api_key: API key for authentication. - timeout: Request timeout in seconds. 
- """ - self._api_key = api_key - self._timeout = timeout - self._session = self._create_session() - self._base_url = "https://api.pspdfkit.com" - - def _create_session(self) -> requests.Session: - """Create requests session with retry logic.""" - session = requests.Session() - - # Configure retries with exponential backoff - retry_strategy = Retry( - total=3, - backoff_factor=1, - status_forcelist=[429, 500, 502, 503, 504], - allowed_methods=["GET", "POST"], - raise_on_status=False, # We'll handle status codes ourselves - ) - adapter = HTTPAdapter( - max_retries=retry_strategy, - pool_connections=10, - pool_maxsize=10, - ) - session.mount("http://", adapter) - session.mount("https://", adapter) - - # Set default headers - headers = { - "User-Agent": "nutrient-dws-python-client/0.1.0", - } - if self._api_key: - headers["Authorization"] = f"Bearer {self._api_key}" - - session.headers.update(headers) - - return session - - def _handle_response(self, response: requests.Response) -> bytes: - """Handle API response and raise appropriate exceptions. - - Args: - response: Response from the API. - - Returns: - Response content as bytes. - - Raises: - AuthenticationError: For 401/403 responses. - ValidationError: For 422 responses. - APIError: For other error responses. 
- """ - # Extract request ID if available - request_id = response.headers.get("X-Request-Id") - - try: - response.raise_for_status() - except requests.exceptions.HTTPError: - # Try to parse error message from response - error_message = f"HTTP {response.status_code}" - error_details = None - - try: - error_data = response.json() - error_message = error_data.get("message", error_message) - error_details = error_data.get("errors", error_data.get("details")) - except (json.JSONDecodeError, requests.exceptions.JSONDecodeError): - # If response is not JSON, use text content - if response.text: - error_message = f"{error_message}: {response.text[:200]}" - - # Handle specific status codes - if response.status_code in (401, 403): - raise AuthenticationError( - error_message or "Authentication failed. Check your API key." - ) from None - elif response.status_code == 422: - raise ValidationError( - error_message or "Request validation failed", - errors=error_details, - ) from None - else: - raise APIError( - error_message, - status_code=response.status_code, - response_body=response.text, - request_id=request_id, - ) from None - - return response.content - - def post( - self, - endpoint: str, - files: dict[str, Any] | None = None, - data: dict[str, Any] | None = None, - json_data: dict[str, Any] | None = None, - ) -> bytes: - """Make POST request to API. - - Args: - endpoint: API endpoint path. - files: Files to upload. - data: Form data. - json_data: JSON data (for multipart requests). - - Returns: - Response content as bytes. - - Raises: - AuthenticationError: If API key is missing or invalid. - TimeoutError: If request times out. - APIError: For other API errors. 
- """ - if not self._api_key: - raise AuthenticationError("API key is required but not provided") - - url = f"{self._base_url}{endpoint}" - logger.debug(f"POST {url}") - - # Prepare multipart data if json_data is provided - prepared_data = data or {} - if json_data is not None: - prepared_data["instructions"] = json.dumps(json_data) - - try: - response = self._session.post( - url, - files=files, - data=prepared_data, - timeout=self._timeout, - ) - except requests.exceptions.Timeout as e: - raise NutrientTimeoutError(f"Request timed out after {self._timeout} seconds") from e - except requests.exceptions.ConnectionError as e: - raise APIError(f"Connection error: {e!s}") from e - except requests.exceptions.RequestException as e: - raise APIError(f"Request failed: {e!s}") from e - - logger.debug(f"Response: {response.status_code}") - - # Clean up file handles after request - if files: - for _, file_data in files.items(): - if hasattr(file_data, "close"): - file_data.close() - elif isinstance(file_data, tuple) and len(file_data) > 1: - file_obj = file_data[1] - if hasattr(file_obj, "close"): - file_obj.close() - - return self._handle_response(response) - - def close(self) -> None: - """Close the session.""" - self._session.close() - - def __enter__(self) -> "HTTPClient": - """Context manager entry.""" - return self - - def __exit__(self, *args: Any) -> None: - """Context manager exit.""" - self.close() diff --git a/src/nutrient_dws/inputs.py b/src/nutrient_dws/inputs.py new file mode 100644 index 0000000..5acb5cf --- /dev/null +++ b/src/nutrient_dws/inputs.py @@ -0,0 +1,223 @@ +import contextlib +import io +import os +import re +from pathlib import Path +from typing import BinaryIO, TypeGuard +from urllib.parse import urlparse + +import aiofiles +import httpx + +FileInput = str | Path | bytes | BinaryIO + +NormalizedFileData = tuple[bytes, str] + + +def is_url(string: str) -> bool: + """Checks if a given string is a valid URL. + + Args: + string: The string to validate. 
+ + Returns: + True if the string is a valid URL, False otherwise. + """ + try: + result = urlparse(string) + # A valid URL must have a scheme (e.g., 'http') and a network location (e.g., 'www.google.com') + return all([result.scheme, result.netloc]) + except ValueError: + return False + + +def is_valid_pdf(file_bytes: bytes) -> bool: + """Check if a file is a valid PDF.""" + return file_bytes.startswith(b"%PDF-") + + +def is_remote_file_input(file_input: FileInput) -> TypeGuard[str]: + """Check if the file input is a remote URL. + + Args: + file_input: The file input to check + + Returns: + True if the file input is a remote URL + """ + return isinstance(file_input, str) and is_url(file_input) + + +async def process_file_input(file_input: FileInput) -> NormalizedFileData: + """Convert various file input types to bytes. + + Args: + file_input: File path, bytes, or file-like object. + + Returns: + tuple of (file_bytes, filename). + + Raises: + FileNotFoundError: If file path doesn't exist. + ValueError: If input type is not supported. 
+ """ + # Handle different file input types using pattern matching + match file_input: + case Path() if not file_input.exists(): + raise FileNotFoundError(f"File not found: {file_input}") + case Path(): + async with aiofiles.open(file_input, "rb") as f: + content = await f.read() + return content, file_input.name + case str(): + path = Path(file_input) + if not path.exists(): + raise FileNotFoundError(f"File not found: {file_input}") + async with aiofiles.open(path, "rb") as f: + content = await f.read() + return content, path.name + case bytes(): + return file_input, "document" + case _ if hasattr(file_input, "read"): + # Handle file-like objects (both sync and async) + if hasattr(file_input, "aread"): + # Async file-like object + current_pos = None + if hasattr(file_input, "seek") and hasattr(file_input, "tell"): + try: + current_pos = ( + await file_input.atell() + if hasattr(file_input, "atell") + else file_input.tell() + ) + if hasattr(file_input, "aseek"): + await file_input.aseek(0) + else: + file_input.seek(0) + except (OSError, io.UnsupportedOperation): + pass + + content = await file_input.aread() + if isinstance(content, str): + content = content.encode() + + # Restore position if we saved it + if current_pos is not None: + with contextlib.suppress(OSError, io.UnsupportedOperation): + if hasattr(file_input, "aseek"): + await file_input.aseek(current_pos) + else: + file_input.seek(current_pos) + else: + # Synchronous file-like object + # Save current position if seekable + current_pos = None + if hasattr(file_input, "seek") and hasattr(file_input, "tell"): + try: + current_pos = file_input.tell() + file_input.seek(0) # Read from beginning + except (OSError, io.UnsupportedOperation): + pass + + content = file_input.read() + if isinstance(content, str): + content = content.encode() + + # Restore position if we saved it + if current_pos is not None: + with contextlib.suppress(OSError, io.UnsupportedOperation): + file_input.seek(current_pos) + + filename = 
getattr(file_input, "name", "document") + if hasattr(filename, "__fspath__"): + filename = os.path.basename(os.fspath(filename)) + elif isinstance(filename, bytes): + filename = os.path.basename(filename.decode()) + elif isinstance(filename, str): + filename = os.path.basename(filename) + return content, str(filename) + case _: + raise ValueError(f"Unsupported file input type: {type(file_input)}") + + +async def process_remote_file_input(url: str) -> NormalizedFileData: + """Convert various file input types to bytes.""" + async with httpx.AsyncClient() as client: + response = await client.get(url) + # This will raise an exception for bad responses (4xx or 5xx status codes) + response.raise_for_status() + # The .content attribute holds the raw bytes of the response + file_bytes = response.content + + filename = "downloaded_file" + # Try to get filename from 'Content-Disposition' header first + header = response.headers.get("content-disposition") + if header: + # Use regex to find a filename in the header + match = re.search(r'filename="?([^"]+)"?', header) + if match: + filename = match.group(1) + + return file_bytes, filename + + +def validate_file_input(file_input: FileInput) -> bool: + """Validate that the file input is in a supported format. + + Args: + file_input: The file input to validate + + Returns: + True if the file input is valid + """ + if isinstance(file_input, (bytes, str)): + return True + elif isinstance(file_input, Path): + return file_input.exists() and file_input.is_file() + elif hasattr(file_input, "read"): + return True + return False + + +def get_pdf_page_count(pdf_bytes: bytes) -> int: + """Zero dependency way to get the number of pages in a PDF. + + Args: + pdf_bytes: PDF file bytes + + Returns: + Number of pages in a PDF. 
+ """ + # Find all PDF objects + objects = re.findall(rb"(\d+)\s+(\d+)\s+obj(.*?)endobj", pdf_bytes, re.DOTALL) + + # Get the Catalog Object + catalog_obj = None + for _obj_num, _gen_num, obj_data in objects: + if b"/Type" in obj_data and b"/Catalog" in obj_data: + catalog_obj = obj_data + break + + if not catalog_obj: + raise ValueError("Could not find /Catalog object in PDF.") + + # Extract /Pages reference (e.g. 3 0 R) + pages_ref_match = re.search(rb"/Pages\s+(\d+)\s+(\d+)\s+R", catalog_obj) + if not pages_ref_match: + raise ValueError("Could not find /Pages reference in /Catalog.") + pages_obj_num = pages_ref_match.group(1).decode() + pages_obj_gen = pages_ref_match.group(2).decode() + + # Step 3: Find the referenced /Pages object + pages_obj_pattern = rf"{pages_obj_num}\s+{pages_obj_gen}\s+obj(.*?)endobj".encode() + pages_obj_match = re.search(pages_obj_pattern, pdf_bytes, re.DOTALL) + if not pages_obj_match: + raise ValueError("Could not find root /Pages object.") + pages_obj_data = pages_obj_match.group(1) + + # Step 4: Extract /Count + count_match = re.search(rb"/Count\s+(\d+)", pages_obj_data) + if not count_match: + raise ValueError("Could not find /Count in root /Pages object.") + + return int(count_match.group(1)) diff --git a/tests/integration/py.typed b/src/nutrient_dws/types/__init__.py similarity index 100% rename from tests/integration/py.typed rename to src/nutrient_dws/types/__init__.py diff --git a/src/nutrient_dws/types/account_info.py b/src/nutrient_dws/types/account_info.py new file mode 100644 index 0000000..52a8e64 --- /dev/null +++ b/src/nutrient_dws/types/account_info.py @@ -0,0 +1,22 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + + +class APIKeys(TypedDict): + live: NotRequired[str] + + +SubscriptionType = Literal["free", "paid", "enterprise"] + + +class Usage(TypedDict): + totalCredits: NotRequired[int] + usedCredits: NotRequired[int] + + +class AccountInfo(TypedDict): + apiKeys: 
NotRequired[APIKeys] + signedIn: NotRequired[bool] + subscriptionType: NotRequired[SubscriptionType] + usage: NotRequired[Usage] diff --git a/src/nutrient_dws/types/analyze_response.py b/src/nutrient_dws/types/analyze_response.py new file mode 100644 index 0000000..45ef137 --- /dev/null +++ b/src/nutrient_dws/types/analyze_response.py @@ -0,0 +1,16 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + + +class RequiredFeatures(TypedDict): + unit_cost: NotRequired[float] + unit_type: NotRequired[Literal["per_use", "per_output_page"]] + units: NotRequired[int] + cost: NotRequired[float] + usage: NotRequired[list[str]] + + +class AnalyzeBuildResponse(TypedDict): + cost: NotRequired[float] + required_features: NotRequired[dict[str, RequiredFeatures]] diff --git a/src/nutrient_dws/types/annotation/__init__.py b/src/nutrient_dws/types/annotation/__init__.py new file mode 100644 index 0000000..5bd7176 --- /dev/null +++ b/src/nutrient_dws/types/annotation/__init__.py @@ -0,0 +1,33 @@ +from typing import Union + +from nutrient_dws.types.annotation.ellipse import EllipseAnnotation +from nutrient_dws.types.annotation.image import ImageAnnotation +from nutrient_dws.types.annotation.ink import InkAnnotation +from nutrient_dws.types.annotation.line import LineAnnotation +from nutrient_dws.types.annotation.link import LinkAnnotation +from nutrient_dws.types.annotation.markup import MarkupAnnotation +from nutrient_dws.types.annotation.note import NoteAnnotation +from nutrient_dws.types.annotation.polygon import PolygonAnnotation +from nutrient_dws.types.annotation.polyline import PolylineAnnotation +from nutrient_dws.types.annotation.rectangle import RectangleAnnotation +from nutrient_dws.types.annotation.redaction import RedactionAnnotation +from nutrient_dws.types.annotation.stamp import StampAnnotation +from nutrient_dws.types.annotation.text import TextAnnotation +from nutrient_dws.types.annotation.widget import WidgetAnnotation + +Annotation 
= Union[ + MarkupAnnotation, + RedactionAnnotation, + TextAnnotation, + InkAnnotation, + LinkAnnotation, + NoteAnnotation, + EllipseAnnotation, + RectangleAnnotation, + LineAnnotation, + PolylineAnnotation, + PolygonAnnotation, + ImageAnnotation, + StampAnnotation, + WidgetAnnotation, +] diff --git a/src/nutrient_dws/types/annotation/base.py b/src/nutrient_dws/types/annotation/base.py new file mode 100644 index 0000000..0e80b69 --- /dev/null +++ b/src/nutrient_dws/types/annotation/base.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +if TYPE_CHECKING: + from nutrient_dws.types.instant_json.actions import Action + from nutrient_dws.types.misc import ( + AnnotationBbox, + AnnotationCustomData, + AnnotationNote, + AnnotationOpacity, + MeasurementPrecision, + MeasurementScale, + PageIndex, + PdfObjectId, + ) + + +class V1(TypedDict): + v: Literal[1] + pageIndex: int + bbox: AnnotationBbox + action: NotRequired[Action] + opacity: NotRequired[float] + pdfObjectId: NotRequired[int] + id: NotRequired[str] + flags: NotRequired[ + list[ + Literal[ + "noPrint", + "noZoom", + "noRotate", + "noView", + "hidden", + "invisible", + "readOnly", + "locked", + "toggleNoView", + "lockedContents", + ] + ] + ] + createdAt: NotRequired[str] + updatedAt: NotRequired[str] + name: NotRequired[str] + creatorName: NotRequired[str] + customData: NotRequired[AnnotationCustomData | None] + + +class V2(TypedDict): + v: Literal[2] + pageIndex: PageIndex + bbox: AnnotationBbox + action: NotRequired[Action] + opacity: NotRequired[AnnotationOpacity] + pdfObjectId: NotRequired[PdfObjectId] + id: NotRequired[str] + flags: NotRequired[ + list[ + Literal[ + "noPrint", + "noZoom", + "noRotate", + "noView", + "hidden", + "invisible", + "readOnly", + "locked", + "toggleNoView", + "lockedContents", + ] + ] + ] + createdAt: NotRequired[str] + updatedAt: NotRequired[str] + name: 
NotRequired[str] + creatorName: NotRequired[str] + customData: NotRequired[AnnotationCustomData | None] + + +class BaseShapeAnnotation(TypedDict): + strokeDashArray: NotRequired[list[float]] + strokeWidth: NotRequired[float] + strokeColor: NotRequired[str] + note: NotRequired[AnnotationNote] + measurementScale: NotRequired[MeasurementScale] + measurementPrecision: NotRequired[MeasurementPrecision] + + +BaseAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/ellipse.py b/src/nutrient_dws/types/annotation/ellipse.py new file mode 100644 index 0000000..516ddea --- /dev/null +++ b/src/nutrient_dws/types/annotation/ellipse.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 +from nutrient_dws.types.annotation.base import BaseShapeAnnotation + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + CloudyBorderInset, + CloudyBorderIntensity, + FillColor, + ) + + +class EllipseBase(TypedDict): + type: Literal["pspdfkit/shape/ellipse"] + fillColor: NotRequired[FillColor] + cloudyBorderIntensity: NotRequired[CloudyBorderIntensity] + cloudyBorderInset: NotRequired[CloudyBorderInset] + + +class V1(BaseV1, BaseShapeAnnotation, EllipseBase): ... + + +class V2(BaseV2, BaseShapeAnnotation, EllipseBase): ... 
+ + +EllipseAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/image.py b/src/nutrient_dws/types/annotation/image.py new file mode 100644 index 0000000..f6ca973 --- /dev/null +++ b/src/nutrient_dws/types/annotation/image.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import AnnotationNote, AnnotationRotation + + +class ImageBase(TypedDict): + type: Literal["pspdfkit/image"] + description: NotRequired[str] + fileName: NotRequired[str] + contentType: NotRequired[Literal["image/jpeg", "image/png", "application/pdf"]] + imageAttachmentId: NotRequired[str] + rotation: NotRequired[AnnotationRotation] + isSignature: NotRequired[bool] + note: NotRequired[AnnotationNote] + + +class V1(BaseV1, ImageBase): + pass + + +class V2(BaseV2, ImageBase): + pass + + +ImageAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/ink.py b/src/nutrient_dws/types/annotation/ink.py new file mode 100644 index 0000000..c1811b9 --- /dev/null +++ b/src/nutrient_dws/types/annotation/ink.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + AnnotationNote, + BackgroundColor, + BlendMode, + Lines, + ) + + +class InkBase(TypedDict): + type: Literal["pspdfkit/ink"] + lines: Lines + lineWidth: int + isDrawnNaturally: NotRequired[bool] + isSignature: NotRequired[bool] + strokeColor: NotRequired[str] + backgroundColor: NotRequired[BackgroundColor] + blendMode: NotRequired[BlendMode] 
+ note: NotRequired[AnnotationNote] + + +class V1(BaseV1, InkBase): + pass + + +class V2(BaseV2, InkBase): + pass + + +InkAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/line.py b/src/nutrient_dws/types/annotation/line.py new file mode 100644 index 0000000..5afabf2 --- /dev/null +++ b/src/nutrient_dws/types/annotation/line.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 +from nutrient_dws.types.annotation.base import BaseShapeAnnotation + +if TYPE_CHECKING: + from nutrient_dws.types.misc import FillColor, LineCaps, Point + + +class LineBase(TypedDict): + type: Literal["pspdfkit/shape/line"] + startPoint: Point + endPoint: Point + fillColor: NotRequired[FillColor] + lineCaps: NotRequired[LineCaps] + + +class V1(BaseV1, BaseShapeAnnotation, LineBase): ... + + +class V2(BaseV2, BaseShapeAnnotation, LineBase): ... 
+ + +LineAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/link.py b/src/nutrient_dws/types/annotation/link.py new file mode 100644 index 0000000..60411bb --- /dev/null +++ b/src/nutrient_dws/types/annotation/link.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import AnnotationNote, BorderStyle + + +class LinkBase(TypedDict): + type: Literal["pspdfkit/link"] + borderColor: NotRequired[str] + borderStyle: NotRequired[BorderStyle] + borderWidth: NotRequired[int] + note: NotRequired[AnnotationNote] + + +class V1(BaseV1, LinkBase): + pass + + +class V2(BaseV2, LinkBase): + pass + + +LinkAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/markup.py b/src/nutrient_dws/types/annotation/markup.py new file mode 100644 index 0000000..aca6f4c --- /dev/null +++ b/src/nutrient_dws/types/annotation/markup.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + AnnotationNote, + BlendMode, + IsCommentThreadRoot, + Rect, + ) + + +class MarkupBase(TypedDict): + type: Literal[ + "pspdfkit/markup/highlight", + "pspdfkit/markup/squiggly", + "pspdfkit/markup/strikeout", + "pspdfkit/markup/underline", + ] + rects: list[Rect] + blendMode: NotRequired[BlendMode] + color: str + note: NotRequired[AnnotationNote] + isCommentThreadRoot: NotRequired[IsCommentThreadRoot] + + +class V1(BaseV1, MarkupBase): ... + + +class V2(BaseV2, MarkupBase): ... 
+ + +MarkupAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/note.py b/src/nutrient_dws/types/annotation/note.py new file mode 100644 index 0000000..bd0b9ae --- /dev/null +++ b/src/nutrient_dws/types/annotation/note.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import AnnotationPlainText, IsCommentThreadRoot + +NoteIcon = Literal[ + "comment", + "rightPointer", + "rightArrow", + "check", + "circle", + "cross", + "insert", + "newParagraph", + "note", + "paragraph", + "help", + "star", + "key", +] + + +class NoteBase(TypedDict): + text: AnnotationPlainText + icon: NoteIcon + color: NotRequired[str] + isCommentThreadRoot: NotRequired[IsCommentThreadRoot] + + +class V1(BaseV1, NoteBase): + pass + + +class V2(BaseV2, NoteBase): + pass + + +NoteAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/polygon.py b/src/nutrient_dws/types/annotation/polygon.py new file mode 100644 index 0000000..7dade5d --- /dev/null +++ b/src/nutrient_dws/types/annotation/polygon.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 +from nutrient_dws.types.annotation.base import BaseShapeAnnotation + +if TYPE_CHECKING: + from nutrient_dws.types.misc import CloudyBorderIntensity, FillColor, Point + + +class PolygonBase(TypedDict): + type: Literal["pspdfkit/shape/polygon"] + fillColor: NotRequired[FillColor] + points: list[Point] + cloudyBorderIntensity: NotRequired[CloudyBorderIntensity] + + +class V1(BaseV1, BaseShapeAnnotation, 
PolygonBase): ... + + +class V2(BaseV2, BaseShapeAnnotation, PolygonBase): ... + + +PolygonAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/polyline.py b/src/nutrient_dws/types/annotation/polyline.py new file mode 100644 index 0000000..53d9259 --- /dev/null +++ b/src/nutrient_dws/types/annotation/polyline.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 +from nutrient_dws.types.annotation.base import BaseShapeAnnotation + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + CloudyBorderInset, + CloudyBorderIntensity, + FillColor, + LineCaps, + Point, + ) + + +class PolylineBase(TypedDict): + type: Literal["pspdfkit/shape/polyline"] + fillColor: NotRequired[FillColor] + points: list[Point] + lineCaps: NotRequired[LineCaps] + cloudyBorderIntensity: NotRequired[CloudyBorderIntensity] + cloudyBorderInset: NotRequired[CloudyBorderInset] + + +class V1(BaseV1, BaseShapeAnnotation, PolylineBase): ... + + +class V2(BaseV2, BaseShapeAnnotation, PolylineBase): ... 
+ + +PolylineAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/rectangle.py b/src/nutrient_dws/types/annotation/rectangle.py new file mode 100644 index 0000000..f37897b --- /dev/null +++ b/src/nutrient_dws/types/annotation/rectangle.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 +from nutrient_dws.types.annotation.base import BaseShapeAnnotation + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + CloudyBorderInset, + CloudyBorderIntensity, + FillColor, + ) + + +class RectangleBase(TypedDict): + type: Literal["pspdfkit/shape/rectangle"] + fillColor: NotRequired[FillColor] + cloudyBorderIntensity: NotRequired[CloudyBorderIntensity] + cloudyBorderInset: NotRequired[CloudyBorderInset] + + +class V1(BaseV1, BaseShapeAnnotation, RectangleBase): ... + + +class V2(BaseV2, BaseShapeAnnotation, RectangleBase): ... 
+ + +RectangleAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/redaction.py b/src/nutrient_dws/types/annotation/redaction.py new file mode 100644 index 0000000..1195af7 --- /dev/null +++ b/src/nutrient_dws/types/annotation/redaction.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import AnnotationNote, AnnotationRotation, Rect + + +class RedactionBase(TypedDict): + type: Literal["pspdfkit/markup/redaction"] + rects: NotRequired[list[Rect]] + outlineColor: NotRequired[str] + fillColor: NotRequired[str] + overlayText: NotRequired[str] + repeatOverlayText: NotRequired[bool] + color: NotRequired[str] + rotation: NotRequired[AnnotationRotation] + note: NotRequired[AnnotationNote] + + +class V1(BaseV1, RedactionBase): ... + + +class V2(BaseV2, RedactionBase): ... 
+ + +RedactionAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/stamp.py b/src/nutrient_dws/types/annotation/stamp.py new file mode 100644 index 0000000..f190b8b --- /dev/null +++ b/src/nutrient_dws/types/annotation/stamp.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import AnnotationNote, AnnotationRotation + + +class StampBase(TypedDict): + type: Literal["pspdfkit/stamp"] + stampType: Literal[ + "Accepted", + "Approved", + "AsIs", + "Completed", + "Confidential", + "Departmental", + "Draft", + "Experimental", + "Expired", + "Final", + "ForComment", + "ForPublicRelease", + "InformationOnly", + "InitialHere", + "NotApproved", + "NotForPublicRelease", + "PreliminaryResults", + "Rejected", + "Revised", + "SignHere", + "Sold", + "TopSecret", + "Void", + "Witness", + "Custom", + ] + title: NotRequired[str] + subtitle: NotRequired[str] + color: NotRequired[str] + rotation: NotRequired[AnnotationRotation] + note: NotRequired[AnnotationNote] + + +class V1(BaseV1, StampBase): + pass + + +class V2(BaseV2, StampBase): + pass + + +StampAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/text.py b/src/nutrient_dws/types/annotation/text.py new file mode 100644 index 0000000..73fc21b --- /dev/null +++ b/src/nutrient_dws/types/annotation/text.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + AnnotationPlainText, + AnnotationRotation, + BorderStyle, + 
CloudyBorderInset, + CloudyBorderIntensity, + Font, + FontColor, + FontSizeInt, + HorizontalAlign, + LineCap, + Point, + VerticalAlign, + ) + + +class Callout(TypedDict): + start: Point + end: Point + innerRectInset: list[float] + cap: NotRequired[LineCap] + knee: NotRequired[Point] + + +class TextBase(TypedDict): + type: Literal["pspdfkit/text"] + text: AnnotationPlainText + fontSize: FontSizeInt + fontStyle: NotRequired[list[Literal["bold", "italic"]]] + fontColor: NotRequired[FontColor] + font: NotRequired[Font] + backgroundColor: NotRequired[str] + horizontalAlign: NotRequired[HorizontalAlign] + verticalAlign: NotRequired[VerticalAlign] + rotation: NotRequired[AnnotationRotation] + isFitting: NotRequired[bool] + callout: NotRequired[Callout] + borderStyle: NotRequired[BorderStyle] + borderWidth: NotRequired[int] + cloudyBorderIntensity: NotRequired[CloudyBorderIntensity] + cloudyBorderInset: NotRequired[CloudyBorderInset] + + +class V1(BaseV1, TextBase): + pass + + +class V2(BaseV2, TextBase): + pass + + +TextAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/annotation/widget.py b/src/nutrient_dws/types/annotation/widget.py new file mode 100644 index 0000000..366a449 --- /dev/null +++ b/src/nutrient_dws/types/annotation/widget.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.base import V1 as BaseV1 +from nutrient_dws.types.annotation.base import V2 as BaseV2 + +if TYPE_CHECKING: + from nutrient_dws.types.misc import ( + AnnotationRotation, + BackgroundColor, + BorderStyle, + Font, + FontColor, + FontSizeAuto, + FontSizeInt, + HorizontalAlign, + VerticalAlign, + ) + + +class WidgetBase(TypedDict): + type: Literal["pspdfkit/widget"] + formFieldName: NotRequired[str] + borderColor: NotRequired[str] + borderStyle: NotRequired[BorderStyle] + borderWidth: NotRequired[int] + font: NotRequired[Font] + 
fontSize: NotRequired[FontSizeInt | FontSizeAuto] + fontColor: NotRequired[FontColor] + horizontalAlign: NotRequired[HorizontalAlign] + verticalAlign: NotRequired[VerticalAlign] + rotation: NotRequired[AnnotationRotation] + backgroundColor: NotRequired[BackgroundColor] + + +class V1(BaseV1, WidgetBase): + pass + + +class V2(BaseV2, WidgetBase): + pass + + +WidgetAnnotation = Union[V1, V2] diff --git a/src/nutrient_dws/types/build_actions.py b/src/nutrient_dws/types/build_actions.py new file mode 100644 index 0000000..4d18112 --- /dev/null +++ b/src/nutrient_dws/types/build_actions.py @@ -0,0 +1,161 @@ +from typing import Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.annotation.redaction import RedactionAnnotation +from nutrient_dws.types.file_handle import FileHandle +from nutrient_dws.types.misc import OcrLanguage, WatermarkDimension + + +class ApplyInstantJsonAction(TypedDict): + type: Literal["applyInstantJson"] + file: FileHandle + + +class ApplyXfdfActionOptions(TypedDict, total=False): + ignorePageRotation: NotRequired[bool] + richTextEnabled: NotRequired[bool] + + +class ApplyXfdfAction(TypedDict): + type: Literal["applyXfdf"] + file: FileHandle + ignorePageRotation: NotRequired[bool] + richTextEnabled: NotRequired[bool] + + +class FlattenAction(TypedDict): + type: Literal["flatten"] + annotationIds: NotRequired[list[str | int]] + + +class OcrAction(TypedDict): + type: Literal["ocr"] + language: OcrLanguage | list[OcrLanguage] + + +class RotateAction(TypedDict): + type: Literal["rotate"] + rotateBy: Literal[90, 180, 270] + + +class BaseWatermarkActionOptions(TypedDict): + width: WatermarkDimension + height: WatermarkDimension + top: NotRequired[WatermarkDimension] + right: NotRequired[WatermarkDimension] + bottom: NotRequired[WatermarkDimension] + left: NotRequired[WatermarkDimension] + rotation: NotRequired[float] + opacity: NotRequired[float] + + +class BaseWatermarkAction(BaseWatermarkActionOptions): + 
type: Literal["watermark"] + + +class TextWatermarkActionOptions(BaseWatermarkActionOptions, total=False): + fontFamily: NotRequired[str] + fontSize: NotRequired[int] + fontColor: NotRequired[str] + fontStyle: NotRequired[list[Literal["bold", "italic"]]] + + +class TextWatermarkAction(BaseWatermarkAction): + text: str + fontFamily: NotRequired[str] + fontSize: NotRequired[int] + fontColor: NotRequired[str] + fontStyle: NotRequired[list[Literal["bold", "italic"]]] + + +class ImageWatermarkActionOptions(BaseWatermarkActionOptions, total=False): ... + + +class ImageWatermarkAction(BaseWatermarkAction): + image: FileHandle + + +WatermarkAction = Union[TextWatermarkAction, ImageWatermarkAction] + +SearchPreset = Literal[ + "credit-card-number", + "date", + "email-address", + "international-phone-number", + "ipv4", + "ipv6", + "mac-address", + "north-american-phone-number", + "social-security-number", + "time", + "url", + "us-zip-code", + "vin", +] + + +class CreateRedactionsStrategyOptionsPreset(TypedDict): + includeAnnotations: NotRequired[bool] + start: NotRequired[int] + limit: NotRequired[int] + + +class CreateRedactionsStrategyOptionsRegex(TypedDict): + includeAnnotations: NotRequired[bool] + caseSensitive: NotRequired[bool] + start: NotRequired[int] + limit: NotRequired[int] + + +class CreateRedactionsStrategyOptionsText(TypedDict): + includeAnnotations: NotRequired[bool] + caseSensitive: NotRequired[bool] + start: NotRequired[int] + limit: NotRequired[int] + + +class BaseCreateRedactionsOptions(TypedDict): + content: NotRequired[RedactionAnnotation] + + +class BaseCreateRedactionsAction(BaseCreateRedactionsOptions): + type: Literal["createRedactions"] + + +class CreateRedactionsActionPreset(TypedDict, BaseCreateRedactionsAction): + strategy: Literal["preset"] + strategyOptions: CreateRedactionsStrategyOptionsPreset + + +class CreateRedactionsActionRegex(TypedDict, BaseCreateRedactionsAction): + strategy: Literal["regex"] + strategyOptions: 
CreateRedactionsStrategyOptionsRegex + + +class CreateRedactionsActionText(TypedDict, BaseCreateRedactionsAction): + strategy: Literal["text"] + strategyOptions: CreateRedactionsStrategyOptionsText + + +CreateRedactionsAction = Union[ + CreateRedactionsActionPreset, + CreateRedactionsActionRegex, + CreateRedactionsActionText, +] + + +class ApplyRedactionsAction(TypedDict): + type: Literal["applyRedactions"] + + +BuildAction = Union[ + ApplyInstantJsonAction, + ApplyXfdfAction, + FlattenAction, + OcrAction, + RotateAction, + WatermarkAction, + CreateRedactionsAction, + ApplyRedactionsAction, +] diff --git a/src/nutrient_dws/types/build_instruction.py b/src/nutrient_dws/types/build_instruction.py new file mode 100644 index 0000000..8d8ce9d --- /dev/null +++ b/src/nutrient_dws/types/build_instruction.py @@ -0,0 +1,13 @@ +from typing import TypedDict + +from typing_extensions import NotRequired + +from nutrient_dws.types.build_actions import BuildAction +from nutrient_dws.types.build_output import BuildOutput +from nutrient_dws.types.input_parts import Part + + +class BuildInstructions(TypedDict): + parts: list[Part] + actions: NotRequired[list[BuildAction]] + output: NotRequired[BuildOutput] diff --git a/src/nutrient_dws/types/build_output.py b/src/nutrient_dws/types/build_output.py new file mode 100644 index 0000000..e314b6f --- /dev/null +++ b/src/nutrient_dws/types/build_output.py @@ -0,0 +1,125 @@ +from typing import Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.misc import OcrLanguage, PageRange + + +class Metadata(TypedDict): + title: NotRequired[str] + author: NotRequired[str] + + +class Label(TypedDict): + pages: list[int] + label: str + + +PDFUserPermission = Literal[ + "printing", + "modification", + "extract", + "annotations_and_forms", + "fill_forms", + "extract_accessibility", + "assemble", + "print_high_quality", +] + + +class OptimizePdf(TypedDict): + grayscaleText: NotRequired[bool] + 
grayscaleGraphics: NotRequired[bool] + grayscaleImages: NotRequired[bool] + grayscaleFormFields: NotRequired[bool] + grayscaleAnnotations: NotRequired[bool] + disableImages: NotRequired[bool] + mrcCompression: NotRequired[bool] + imageOptimizationQuality: NotRequired[int] + linearize: NotRequired[bool] + + +class BasePDFOutput(TypedDict): + metadata: NotRequired[Metadata] + labels: NotRequired[list[Label]] + user_password: NotRequired[str] + owner_password: NotRequired[str] + user_permissions: NotRequired[list[PDFUserPermission]] + optimize: NotRequired[OptimizePdf] + + +PDFOutputOptions = BasePDFOutput + + +class PDFOutput(BasePDFOutput): + type: NotRequired[Literal["pdf"]] + + +class PDFAOutputOptions(PDFOutputOptions): + conformance: NotRequired[ + Literal[ + "pdfa-1a", "pdfa-1b", "pdfa-2a", "pdfa-2u", "pdfa-2b", "pdfa-3a", "pdfa-3u" + ] + ] + vectorization: NotRequired[bool] + rasterization: NotRequired[bool] + + +class PDFAOutput(PDFAOutputOptions): + type: Literal["pdfa"] + + +PDFUAOutputOptions = BasePDFOutput + + +class PDFUAOutput(PDFUAOutputOptions): + type: Literal["pdfua"] + + +class ImageOutputOptions(TypedDict): + format: NotRequired[Literal["png", "jpeg", "jpg", "webp"]] + pages: NotRequired[PageRange] + width: NotRequired[float] + height: NotRequired[float] + dpi: NotRequired[float] + + +class ImageOutput(ImageOutputOptions): + type: Literal["image"] + + +class JSONContentOutputOptions(TypedDict): + plainText: NotRequired[bool] + structuredText: NotRequired[bool] + keyValuePairs: NotRequired[bool] + tables: NotRequired[bool] + language: NotRequired[OcrLanguage | list[OcrLanguage]] + + +class JSONContentOutput(JSONContentOutputOptions): + type: Literal["json-content"] + + +class OfficeOutput(TypedDict): + type: Literal["docx", "xlsx", "pptx"] + + +class HTMLOutput(TypedDict): + type: Literal["html"] + layout: NotRequired[Literal["page", "reflow"]] + + +class MarkdownOutput(TypedDict): + type: Literal["markdown"] + + +BuildOutput = Union[ + PDFOutput, 
+ PDFAOutput, + PDFUAOutput, + ImageOutput, + JSONContentOutput, + OfficeOutput, + HTMLOutput, + MarkdownOutput, +] diff --git a/src/nutrient_dws/types/build_response_json.py b/src/nutrient_dws/types/build_response_json.py new file mode 100644 index 0000000..06fd943 --- /dev/null +++ b/src/nutrient_dws/types/build_response_json.py @@ -0,0 +1,129 @@ +from typing import TypedDict + +from typing_extensions import NotRequired + +PlainText = str + + +class JsonContentsBbox(TypedDict): + """Represents a rectangular region on the page.""" + + left: float + top: float + width: float + height: float + + +class Character(TypedDict): + """Character in structured text.""" + + bbox: JsonContentsBbox + char: str + + +class Line(TypedDict): + """Line in structured text.""" + + bbox: JsonContentsBbox + text: str + + +class Word(TypedDict): + """Word in structured text.""" + + bbox: JsonContentsBbox + text: str + + +class Paragraph(TypedDict): + """Paragraph in structured text.""" + + bbox: JsonContentsBbox + text: str + + +class StructuredText(TypedDict): + """Structured text content.""" + + characters: NotRequired[list[Character]] + lines: NotRequired[list[Line]] + paragraphs: NotRequired[list[Paragraph]] + words: NotRequired[list[Word]] + + +class KVPKey(TypedDict): + """Key-value pair key.""" + + bbox: JsonContentsBbox + confidence: float + text: str + + +class KVPValue(TypedDict): + """Key-value pair value.""" + + bbox: JsonContentsBbox + confidence: float + text: str + + +class KeyValuePair(TypedDict): + """Detected key-value pair.""" + + confidence: float + key: KVPKey + value: KVPValue + + +class TableCell(TypedDict): + """Table cell.""" + + bbox: JsonContentsBbox + rowIndex: int + colIndex: int + text: str + + +class TableColumn(TypedDict): + """Table column.""" + + bbox: JsonContentsBbox + + +class TableLine(TypedDict): + """Table line.""" + + bbox: JsonContentsBbox + + +class TableRow(TypedDict): + """Table row.""" + + bbox: JsonContentsBbox + + +class Table(TypedDict): 
+ """Detected table.""" + + confidence: float + bbox: JsonContentsBbox + cells: list[TableCell] + columns: list[TableColumn] + lines: list[TableLine] + rows: list[TableRow] + + +class PageJsonContents(TypedDict): + """JSON content for a single page.""" + + pageIndex: int + plainText: NotRequired[PlainText] + structuredText: NotRequired[StructuredText] + keyValuePairs: NotRequired[list[KeyValuePair]] + tables: NotRequired[list[Table]] + + +class BuildResponseJsonContents(TypedDict): + """Build response JSON contents.""" + + pages: NotRequired[list[PageJsonContents]] diff --git a/src/nutrient_dws/types/create_auth_token.py b/src/nutrient_dws/types/create_auth_token.py new file mode 100644 index 0000000..f288af2 --- /dev/null +++ b/src/nutrient_dws/types/create_auth_token.py @@ -0,0 +1,34 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + + +class CreateAuthTokenParameters(TypedDict): + allowedOperations: NotRequired[ + list[ + Literal[ + "annotations_api", + "compression_api", + "data_extraction_api", + "digital_signatures_api", + "document_editor_api", + "html_conversion_api", + "image_conversion_api", + "image_rendering_api", + "email_conversion_api", + "linearization_api", + "ocr_api", + "office_conversion_api", + "pdfa_api", + "pdf_to_office_conversion_api", + "redaction_api", + ] + ] + ] + allowedOrigins: NotRequired[list[str]] + expirationTime: NotRequired[int] + + +class CreateAuthTokenResponse(TypedDict): + id: NotRequired[str] + accessToken: NotRequired[str] diff --git a/src/nutrient_dws/types/error_response.py b/src/nutrient_dws/types/error_response.py new file mode 100644 index 0000000..9aa254c --- /dev/null +++ b/src/nutrient_dws/types/error_response.py @@ -0,0 +1,15 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + + +class FailingPath(TypedDict): + path: NotRequired[str] + details: NotRequired[str] + + +class HostedErrorResponse(TypedDict): + details: NotRequired[str] + status: 
NotRequired[Literal[400, 402, 408, 413, 422, 500]] + requestId: NotRequired[str] + failingPaths: NotRequired[list[FailingPath]] diff --git a/src/nutrient_dws/types/file_handle.py b/src/nutrient_dws/types/file_handle.py new file mode 100644 index 0000000..ba99ff2 --- /dev/null +++ b/src/nutrient_dws/types/file_handle.py @@ -0,0 +1,11 @@ +from typing import TypedDict, Union + +from typing_extensions import NotRequired + + +class RemoteFileHandle(TypedDict): + url: str + sha256: NotRequired[str] + + +FileHandle = Union[RemoteFileHandle, str] diff --git a/src/nutrient_dws/types/input_parts.py b/src/nutrient_dws/types/input_parts.py new file mode 100644 index 0000000..4dfe86a --- /dev/null +++ b/src/nutrient_dws/types/input_parts.py @@ -0,0 +1,63 @@ +from typing import Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.build_actions import BuildAction +from nutrient_dws.types.file_handle import FileHandle +from nutrient_dws.types.misc import PageLayout, PageRange + + +class FilePartOptions(TypedDict): + password: NotRequired[str] + pages: NotRequired[PageRange] + layout: NotRequired[PageLayout] + content_type: NotRequired[str] + actions: NotRequired[list[BuildAction]] + + +class FilePart(FilePartOptions): + file: FileHandle + + +class HTMLPartOptions(TypedDict): + layout: NotRequired[PageLayout] + + +class HTMLPart(HTMLPartOptions): + html: FileHandle + assets: NotRequired[list[str]] + actions: NotRequired[list[BuildAction]] + + +class NewPagePartOptions(TypedDict): + pageCount: NotRequired[int] + layout: NotRequired[PageLayout] + + +class NewPagePart(NewPagePartOptions): + page: Literal["new"] + actions: NotRequired[list[BuildAction]] + + +DocumentId = str + + +class DocumentEngineID(TypedDict): + id: DocumentId | Literal["#self"] + layer: NotRequired[str] + + +class DocumentPartOptions(TypedDict): + password: NotRequired[str] + pages: NotRequired[PageRange] + layer: NotRequired[str] + + +class DocumentPart(TypedDict): + 
# Any part accepted by the Build API.
Part = Union[FilePart, HTMLPart, NewPagePart, DocumentPart]


class PdfId(TypedDict):
    """Identifier pair of a PDF as recorded in Instant JSON."""

    # Presumably the stable /ID entry of the PDF; TODO confirm.
    permanent: NotRequired[str]
    # Presumably the revision-dependent /ID entry; TODO confirm.
    changing: NotRequired[str]


class InstantJson(TypedDict):
    """Top-level Nutrient Instant JSON document payload."""

    format: Literal["https://pspdfkit.com/instant-json/v1"]
    annotations: NotRequired[list[Annotation]]
    attachments: NotRequired[Attachments]
    formFields: NotRequired[list[FormField]]
    formFieldValues: NotRequired[list[FormFieldValue]]
    bookmarks: NotRequired[list[Bookmark]]
    comments: NotRequired[list[CommentContent]]
    # PDF object ids present in the source document but skipped in this
    # payload — presumably unsupported objects; TODO confirm.
    skippedPdfObjectIds: NotRequired[list[int]]
    pdfId: NotRequired[PdfId]
Literal["goToRemote"] + relativePath: str + namedDestination: NotRequired[str] + + +class GoToEmbeddedAction(BaseAction): + type: Literal["goToEmbedded"] + relativePath: str + newWindow: NotRequired[bool] + targetType: NotRequired[Literal["parent", "child"]] + + +class LaunchAction(BaseAction): + type: Literal["launch"] + filePath: str + + +class URIAction(BaseAction): + type: Literal["uri"] + uri: str + + +class AnnotationReference(TypedDict): + fieldName: NotRequired[str] + pdfObjectId: NotRequired[int] + + +class HideAction(BaseAction): + type: Literal["hide"] + hide: bool + annotationReferences: list[AnnotationReference] + + +class JavaScriptAction(BaseAction): + type: Literal["javascript"] + script: str + + +class SubmitFormAction(BaseAction): + type: Literal["submitForm"] + uri: str + flags: list[ + Literal[ + "includeExclude", + "includeNoValueFields", + "exportFormat", + "getMethod", + "submitCoordinated", + "xfdf", + "includeAppendSaves", + "includeAnnotations", + "submitPDF", + "canonicalFormat", + "excludeNonUserAnnotations", + "excludeFKey", + "embedForm", + ] + ] + fields: NotRequired[list[AnnotationReference]] + + +class ResetFormAction(BaseAction): + type: Literal["resetForm"] + flags: NotRequired[Literal["includeExclude"]] + fields: NotRequired[list[AnnotationReference]] + + +class NamedAction(BaseAction): + type: Literal["named"] + action: Literal[ + "nextPage", + "prevPage", + "firstPage", + "lastPage", + "goBack", + "goForward", + "goToPage", + "find", + "print", + "outline", + "search", + "brightness", + "zoomIn", + "zoomOut", + "saveAs", + "info", + ] + + +Action = Union[ + GoToAction, + GoToRemoteAction, + GoToEmbeddedAction, + LaunchAction, + URIAction, + HideAction, + JavaScriptAction, + SubmitFormAction, + ResetFormAction, + NamedAction, +] diff --git a/src/nutrient_dws/types/instant_json/attachments.py b/src/nutrient_dws/types/instant_json/attachments.py new file mode 100644 index 0000000..d84f0be --- /dev/null +++ 
class Attachment(TypedDict):
    """Binary attachment embedded in Instant JSON."""

    # Attachment payload — presumably base64-encoded; TODO confirm.
    binary: NotRequired[str]
    # MIME type of the payload.
    contentType: NotRequired[str]


# Mapping of attachment id -> attachment, or None when absent.
Attachments = Optional[dict[str, Attachment]]


class Bookmark(TypedDict):
    """Instant JSON bookmark entry."""

    name: NotRequired[str]
    type: Literal["pspdfkit/bookmark"]
    # Schema version.
    v: Literal[1]
    # Action executed when the bookmark is activated.
    action: Action
    pdfBookmarkId: NotRequired[str]
+ creatorName: NotRequired[str] + customData: NotRequired[CustomData | None] + pdfObjectId: NotRequired[PdfObjectId] + updatedAt: NotRequired[IsoDateTime] + + +CommentContent = Union[V2, V1] diff --git a/src/nutrient_dws/types/instant_json/form_field.py b/src/nutrient_dws/types/instant_json/form_field.py new file mode 100644 index 0000000..e8d0b19 --- /dev/null +++ b/src/nutrient_dws/types/instant_json/form_field.py @@ -0,0 +1,117 @@ +from typing import Literal, TypedDict, Union + +from typing_extensions import NotRequired + +from nutrient_dws.types.instant_json.actions import Action + + +class BaseFormField(TypedDict): + v: Literal[1] + id: NotRequired[str] + name: str + label: str + annotationIds: list[str] + pdfObjectId: NotRequired[int] + flags: NotRequired[list[Literal["readOnly", "required", "noExport"]]] + + +class ButtonFormField(BaseFormField): + type: Literal["pspdfkit/form-field/button"] + buttonLabel: str + + +class FormFieldOption(TypedDict): + label: str + value: str + + +FormFieldOptions = list[FormFieldOption] + + +FormFieldDefaultValues = list[str] + + +class FormFieldAdditionalActionsEvent(TypedDict): + onChange: NotRequired[Action] + onCalculate: NotRequired[Action] + + +class ChoiceFormField(TypedDict): + options: FormFieldOptions + multiSelect: NotRequired[bool] + commitOnChange: NotRequired[bool] + defaultValues: NotRequired[FormFieldDefaultValues] + additionalActions: NotRequired[FormFieldAdditionalActionsEvent] + + +class FormFieldAdditionalActionsInput(TypedDict): + onInput: NotRequired[Action] + onFormat: NotRequired[Action] + + +class AdditionalActions( + FormFieldAdditionalActionsEvent, FormFieldAdditionalActionsInput +): + pass + + +class ListBoxFormField(BaseFormField): + type: NotRequired[Literal["pspdfkit/form-field/listbox"]] + additionalActions: NotRequired[AdditionalActions] + options: FormFieldOptions + multiSelect: NotRequired[bool] + commitOnChange: NotRequired[bool] + defaultValues: NotRequired[FormFieldDefaultValues] + + 
class ComboBoxFormField(BaseFormField, ChoiceFormField):
    """Editable drop-down (combo box) form field."""

    type: NotRequired[Literal["pspdfkit/form-field/combobox"]]
    # Whether free-text entry is allowed in addition to the options.
    edit: bool
    doNotSpellCheck: bool


class CheckboxFormField(BaseFormField):
    """Checkbox form field."""

    type: Literal["pspdfkit/form-field/checkbox"]
    options: FormFieldOptions
    defaultValues: FormFieldDefaultValues
    additionalActions: NotRequired[FormFieldAdditionalActionsEvent]


# Default value of a single-valued form field.
FormFieldDefaultValue = str


class RadioButtonFormField(BaseFormField):
    """Radio-button group form field."""

    type: Literal["pspdfkit/form-field/radio"]
    options: FormFieldOptions
    defaultValue: NotRequired[FormFieldDefaultValue]
    noToggleToOff: NotRequired[bool]
    radiosInUnison: NotRequired[bool]


class TextFormField(BaseFormField):
    """Single- or multi-line text form field."""

    type: Literal["pspdfkit/form-field/text"]
    # True renders the field as a password input.
    password: NotRequired[bool]
    maxLength: NotRequired[int]
    doNotSpellCheck: bool
    doNotScroll: bool
    multiLine: bool
    comb: bool
    defaultValue: FormFieldDefaultValue
    richText: NotRequired[bool]
    richTextValue: NotRequired[str]
    additionalActions: NotRequired[AdditionalActions]


class SignatureFormField(BaseFormField):
    """Digital-signature form field."""

    type: NotRequired[Literal["pspdfkit/form-field/signature"]]


# Union of all supported form-field variants.
FormField = Union[
    ButtonFormField,
    ListBoxFormField,
    ComboBoxFormField,
    CheckboxFormField,
    RadioButtonFormField,
    TextFormField,
    SignatureFormField,
]
class PageRange(TypedDict):
    """Page range; either bound may be omitted."""

    start: NotRequired[int]
    end: NotRequired[int]


class Pages(TypedDict):
    """Page range with both bounds required."""

    start: int
    end: int


class Size(TypedDict):
    """Explicit page size; unit not shown here — presumably points. TODO confirm."""

    width: NotRequired[float]
    height: NotRequired[float]


class Margin(TypedDict):
    """Page margins on each side."""

    left: NotRequired[float]
    top: NotRequired[float]
    right: NotRequired[float]
    bottom: NotRequired[float]


class PageLayout(TypedDict):
    """Page layout: orientation plus a named or explicit size and margins."""

    orientation: NotRequired[Literal["portrait", "landscape"]]
    # Either a well-known paper-size name or an explicit Size.
    size: NotRequired[
        Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "Letter", "Legal"]
        | Size
    ]
    margin: NotRequired[Margin]
"por", + "pus", + "que", + "ron", + "rus", + "san", + "sin", + "slk", + "slv", + "snd", + "sp1", + "spa", + "sqi", + "srp", + "sun", + "swa", + "swe", + "syr", + "tam", + "tat", + "tel", + "tgk", + "tgl", + "tha", + "tir", + "ton", + "tur", + "uig", + "ukr", + "urd", + "uzb", + "vie", + "yid", + "yor", +] + + +class WatermarkDimension(TypedDict): + value: float + unit: Literal["pt", "%"] + + +PageIndex = int + + +AnnotationBbox = list[float] + + +AnnotationOpacity = float + + +PdfObjectId = int + + +AnnotationCustomData = Optional[dict[str, Any]] + + +Rect = list[float] + + +AnnotationRotation = Literal[0, 90, 180, 270] + + +AnnotationNote = str + + +BlendMode = Literal[ + "normal", + "multiply", + "screen", + "overlay", + "darken", + "lighten", + "colorDodge", + "colorBurn", + "hardLight", + "softLight", + "difference", + "exclusion", +] + + +IsCommentThreadRoot = bool + + +CloudyBorderIntensity = float + + +CloudyBorderInset = list[float] + + +FillColor = str + + +MeasurementScale = TypedDict( + "MeasurementScale", + { + "unitFrom": NotRequired[Literal["in", "mm", "cm", "pt"]], + "unitTo": NotRequired[ + Literal["in", "mm", "cm", "pt", "ft", "m", "yd", "km", "mi"] + ], + "from": NotRequired[float], + "to": NotRequired[float], + }, +) + + +MeasurementPrecision = Literal["whole", "oneDp", "twoDp", "threeDp", "fourDp"] + + +FontSizeInt = int + + +FontStyle = list[Literal["bold", "italic"]] + + +FontColor = str + + +Font = str + + +HorizontalAlign = Literal["left", "center", "right"] + + +VerticalAlign = Literal["top", "center", "bottom"] + + +Point = list[float] + + +LineCap = Literal[ + "square", + "circle", + "diamond", + "openArrow", + "closedArrow", + "butt", + "reverseOpenArrow", + "reverseClosedArrow", + "slash", +] + + +BorderStyle = Literal["solid", "dashed", "beveled", "inset", "underline"] + + +class LineCaps(TypedDict): + start: NotRequired[LineCap] + end: NotRequired[LineCap] + + +AnnotationPlainText = str + +BackgroundColor = str + +FontSizeAuto = 
Literal["auto"] + + +Intensity = float + + +class Lines(TypedDict): + intensities: NotRequired[list[list[Intensity]]] + points: NotRequired[list[list[Point]]] diff --git a/src/nutrient_dws/types/redact_data.py b/src/nutrient_dws/types/redact_data.py new file mode 100644 index 0000000..2496883 --- /dev/null +++ b/src/nutrient_dws/types/redact_data.py @@ -0,0 +1,29 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + +from nutrient_dws.types.misc import Pages + + +class RemoteFile(TypedDict): + url: str + + +class Document(TypedDict): + file: NotRequired[str | RemoteFile] + pages: NotRequired[list[int] | Pages] + + +class Confidence(TypedDict): + threshold: float + + +class RedactOptions(TypedDict): + confidence: NotRequired[Confidence] + + +class RedactData(TypedDict): + documents: list[Document] + criteria: str + redaction_state: NotRequired[Literal["stage", "apply"]] + options: NotRequired[RedactOptions] diff --git a/src/nutrient_dws/types/sign_request.py b/src/nutrient_dws/types/sign_request.py new file mode 100644 index 0000000..1d16181 --- /dev/null +++ b/src/nutrient_dws/types/sign_request.py @@ -0,0 +1,27 @@ +from typing import Literal, TypedDict + +from typing_extensions import NotRequired + + +class Appearance(TypedDict): + mode: NotRequired[ + Literal["signatureOnly", "signatureAndDescription", "descriptionOnly"] + ] + contentType: NotRequired[str] + showWatermark: NotRequired[bool] + showSignDate: NotRequired[bool] + showDateTimezone: NotRequired[bool] + + +class Position(TypedDict): + pageIndex: int + rect: list[float] + + +class CreateDigitalSignature(TypedDict): + signatureType: Literal["cms", "cades"] + flatten: NotRequired[bool] + formFieldName: NotRequired[str] + appearance: NotRequired[Appearance] + position: NotRequired[Position] + cadesLevel: NotRequired[Literal["b-lt", "b-t", "b-b"]] diff --git a/src/nutrient_dws/utils/__init__.py b/src/nutrient_dws/utils/__init__.py new file mode 100644 index 0000000..d51fdc5 
def get_library_version() -> str:
    """Return the installed version of the nutrient-dws package.

    Resolution order:
      1. If the ``PYTHON_ENV`` environment variable is ``"development"``,
         report the development placeholder immediately.
      2. Otherwise ask :mod:`importlib.metadata` for the installed
         distribution's version.
      3. On any lookup failure, fall back to the placeholder.
    """
    fallback = "0.0.0-dev"
    if os.getenv("PYTHON_ENV") == "development":
        return fallback
    try:
        return pkg_version("nutrient-dws")
    except Exception:
        # Package metadata unavailable (e.g. running from a source checkout).
        return fallback


def get_user_agent() -> str:
    """Build the User-Agent header value sent with HTTP requests."""
    return "nutrient-dws/" + get_library_version()
def workflow(
    api_key: str | Callable[[], str],
    base_url: str | None = None,
    timeout: int | None = None,
) -> WorkflowInitialStage:
    """Create a new workflow builder with a staged interface.

    Args:
        api_key: API key string, or a zero-argument callable returning one.
        base_url: Optional override for the DWS base URL.
        timeout: Optional request timeout.

    Returns:
        A new staged workflow builder instance.

    Example:
        ```python
        from nutrient_dws import workflow

        # Create and run a workflow
        result = await (
            workflow("your-api-key")
            .add_file_part("document.pdf")
            .apply_action(BuildActions.ocr("english"))
            .output_pdf()
            .execute()
        )
        ```
    """
    # Wrap the loose arguments into the options object the builder expects.
    client_options = NutrientClientOptions(
        apiKey=api_key, baseUrl=base_url, timeout=timeout
    )
    return StagedWorkflowBuilder(client_options)
def main() -> None:
    """Append a Cursor rule file embedding the Nutrient DWS documentation.

    Reads ``LLM_DOC.md`` from the package root and appends it, wrapped in
    Cursor rule front matter, to ``.cursor/rules/nutrient-dws-doc.mdc`` in
    the current working directory. Exits with status 1 on any failure.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    doc_path = os.path.join(script_dir, "..", "..", "LLM_DOC.md")

    user_project_root = os.getcwd()
    output_directory = os.path.join(user_project_root, ".cursor/rules")
    output_file = os.path.join(output_directory, "nutrient-dws-doc.mdc")

    try:
        # Read inside the try block so a missing/unreadable LLM_DOC.md is
        # reported via the error path below instead of an unhandled traceback.
        with open(doc_path, encoding="utf-8") as file:
            documentation = file.read()

        rule = f"""
---
description: This rule explains how to use the Nutrient DWS Python Client (`nutrient-dws`) for operations with document processing operations including conversion, merging, compression, watermarking, signage, and text extraction.
globs:
alwaysApply: false
---

{documentation}
"""
        # exist_ok avoids the race in the original check-then-create pattern.
        os.makedirs(output_directory, exist_ok=True)
        with open(output_file, "a", encoding="utf-8") as f:
            f.write(rule)

        print(
            f"📄 Updated Cursor Rules to point to Nutrient DWS documentation at {output_file}."
        )
    except Exception as err:
        print(f"Failed to update Cursor Rule: {err}", file=sys.stderr)
        sys.exit(1)
def main() -> None:
    """Append a Junie guideline pointing at the Nutrient DWS documentation.

    Appends a short usage rule to ``.junie/guidelines.md`` in the current
    working directory, referencing ``LLM_DOC.md`` at the package root by
    relative path. Exits with status 1 on any failure.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    doc_path = os.path.join(script_dir, "..", "..", "LLM_DOC.md")

    user_project_root = os.getcwd()
    output_directory = os.path.join(user_project_root, ".junie")
    output_file = os.path.join(output_directory, "guidelines.md")

    # Path written into the rule so it resolves from the user's project root.
    relative_doc_path = os.path.relpath(doc_path, user_project_root)

    try:
        rule = f"""
# Nutrient DWS Python Client Usage
- Use the `nutrient-dws` package for operations with document processing operations including conversion, merging, compression, watermarking, signage, and text extraction.
- Package Documentation and Examples can be found at: {relative_doc_path}
"""
        # exist_ok avoids the race in the original check-then-create pattern.
        os.makedirs(output_directory, exist_ok=True)
        with open(output_file, "a", encoding="utf-8") as f:
            f.write(rule)

        print(
            f"📄 Updated Junie Code Rules to point to Nutrient DWS documentation at {relative_doc_path}."
        )
    except Exception as err:
        print(f"Failed to update .junie/guidelines.md file: {err}", file=sys.stderr)
        sys.exit(1)
@pytest.fixture
def mock_workflow_instance():
    """Create a mock workflow instance for testing.

    Mirrors the fluent staged-builder API: every part/action method returns
    the workflow mock itself so calls can be chained, and every output_*
    method returns a shared output-stage mock whose execute() resolves to a
    canned successful PDF result and whose dry_run() resolves to a bare
    success dict.
    """
    mock_output_stage = AsyncMock()
    mock_output_stage.execute.return_value = {
        "success": True,
        "output": {
            "buffer": b"test-buffer",
            "mimeType": "application/pdf",
            "filename": "output.pdf",
        },
    }
    mock_output_stage.dry_run.return_value = {"success": True}

    mock_workflow = AsyncMock()
    # Builder methods are fluent: each returns the workflow mock itself.
    mock_workflow.add_file_part.return_value = mock_workflow
    mock_workflow.add_html_part.return_value = mock_workflow
    mock_workflow.add_new_page.return_value = mock_workflow
    mock_workflow.add_document_part.return_value = mock_workflow
    mock_workflow.apply_actions.return_value = mock_workflow
    mock_workflow.apply_action.return_value = mock_workflow
    # Every output method ends the chain with the same output stage.
    mock_workflow.output_pdf.return_value = mock_output_stage
    mock_workflow.output_pdfa.return_value = mock_output_stage
    mock_workflow.output_pdfua.return_value = mock_output_stage
    mock_workflow.output_image.return_value = mock_output_stage
    mock_workflow.output_office.return_value = mock_output_stage
    mock_workflow.output_html.return_value = mock_output_stage
    mock_workflow.output_markdown.return_value = mock_output_stage
    mock_workflow.output_json.return_value = mock_output_stage

    return mock_workflow


@pytest.fixture
def valid_client_options():
    """Valid client options for testing."""
    return {"apiKey": "test-api-key", "baseUrl": "https://api.test.com/v1", "timeout": None}

@pytest.fixture
def unit_client():
    # Client wired with dummy credentials for unit tests; performs no I/O
    # at construction time.
    return NutrientClient(api_key="test-api-key", base_url="https://api.test.com/v1")
class XfdfAnnotation(TypedDict):
    """Shape of an annotation fed to :meth:`TestDocumentGenerator.generate_xfdf_content`."""

    type: Literal["highlight", "text", "square", "circle"]
    page: int
    rect: List[int]
    content: Optional[str]
    color: Optional[str]


class TestDocumentGenerator:
    """Generate test documents and content for testing purposes."""

    @staticmethod
    def generate_simple_pdf_content(content: str = "Test PDF Document") -> bytes:
        """Generate a simple PDF-like byte stream for testing.

        Note: this is not a fully valid PDF; it only mimics the structure
        closely enough for file-handling code under test.
        """
        # NOTE(review): the original object-dictionary markup was mangled in
        # extraction; reconstructed as a minimal plausible skeleton.
        pdf = f"""%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]/Contents 4 0 R>>endobj
4 0 obj<</Length 60>>stream
BT /F1 12 Tf 100 700 Td ({content}) Tj ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000262 00000 n
trailer<</Size 5/Root 1 0 R>>
startxref
356
%%EOF"""
        return pdf.encode("utf-8")

    @staticmethod
    def generate_pdf_with_sensitive_data() -> bytes:
        """Generate PDF-like content with sensitive data patterns for redaction testing."""
        # Plain string: the original used an f-string with no placeholders.
        content = """Personal Information:
Name: John Doe
SSN: 123-45-6789
Email: john.doe@example.com
Phone: (555) 123-4567
Credit Card: 4111-1111-1111-1111
Medical Record: MR-2024-12345
License: DL-ABC-123456"""
        return TestDocumentGenerator.generate_simple_pdf_content(content)

    @staticmethod
    def generate_pdf_with_table() -> bytes:
        """Generate PDF-like content with table data patterns."""
        # Plain string: the original used an f-string with no placeholders.
        content = """Sales Report 2024
Product | Q1 | Q2 | Q3 | Q4
Widget A | 100 | 120 | 140 | 160
Widget B | 80 | 90 | 100 | 110
Widget C | 60 | 70 | 80 | 90"""
        return TestDocumentGenerator.generate_simple_pdf_content(content)

    @staticmethod
    def generate_html_content(
        title: str = "Test Document",
        include_styles: bool = True,
        include_table: bool = False,
        include_images: bool = False,
        include_form: bool = False,
    ) -> bytes:
        """Generate HTML content for testing.

        Args:
            title: Document title used in <title> and the <h1> heading.
            include_styles: Emit a small inline stylesheet.
            include_table: Append a sample data table.
            include_images: Append an image-placeholder section.
            include_form: Append a sample form.

        Returns:
            UTF-8 encoded HTML document.
        """
        # NOTE(review): the original markup was mangled in extraction; the
        # fragments below are an equivalent reconstruction.
        styles = (
            """
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; }
        h1 { color: #333; }
        .highlight { background-color: yellow; }
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ccc; padding: 8px; text-align: left; }
    </style>"""
            if include_styles
            else ""
        )
        table = (
            """
    <h2>Data Table</h2>
    <table>
        <tr><th>Product</th><th>Price</th><th>Quantity</th><th>Total</th></tr>
        <tr><td>Widget A</td><td>$10.00</td><td>5</td><td>$50.00</td></tr>
        <tr><td>Widget B</td><td>$15.00</td><td>3</td><td>$45.00</td></tr>
        <tr><td>Widget C</td><td>$20.00</td><td>2</td><td>$40.00</td></tr>
    </table>"""
            if include_table
            else ""
        )
        images = (
            """
    <h2>Images</h2>
    <p>Below is a placeholder for image content:</p>
    <div class="image-placeholder">Image Placeholder</div>"""
            if include_images
            else ""
        )
        form = (
            """
    <h2>Form Example</h2>
    <form>
        <div><label for="name">Name:</label> <input type="text" id="name" name="name" /></div>
        <div><label for="email">Email:</label> <input type="email" id="email" name="email" /></div>
        <div><label for="message">Message:</label> <textarea id="message" name="message"></textarea></div>
    </form>"""
            if include_form
            else ""
        )

        html = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8" />
    <title>{title}</title>{styles}
</head>
<body>
    <h1>{title}</h1>
    <p class="highlight">This is a test document with highlighted text for PDF conversion testing.</p>
    <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.</p>
    {table}{images}{form}
</body>
</html>"""
        return html.encode("utf-8")

    @staticmethod
    def generate_xfdf_content(
        annotations: Optional[list[XfdfAnnotation]] = None,
    ) -> bytes:
        """Generate XFDF annotation content.

        Fixes over the original:
        - fragments are accumulated, so every annotation appears in the
          output (the original overwrote the buffer each iteration and
          emitted only the last annotation);
        - ``${...}`` JavaScript-template leftovers no longer inject a
          literal ``$`` into the content;
        - missing/None ``color`` and ``content`` fall back to defaults
          instead of raising KeyError or emitting the string "None".
        """
        if annotations is None:
            annotations = [
                {
                    "type": "highlight",
                    "page": 0,
                    "rect": [100, 100, 200, 150],
                    "color": "#FFFF00",
                    "content": "Important text",
                },
            ]

        fragments: list[str] = []
        for annot in annotations:
            rect_str = ",".join(str(coord) for coord in annot["rect"])
            color = annot.get("color") or "#FFFF00"
            page = annot["page"]
            kind = annot["type"]
            if kind == "highlight":
                fragments.append(
                    f'<highlight page="{page}" rect="{rect_str}" color="{color}">'
                    f"<contents>{annot.get('content') or 'Highlighted text'}</contents>"
                    f"</highlight>"
                )
            elif kind == "text":
                fragments.append(
                    f'<text page="{page}" rect="{rect_str}" color="{color}">'
                    f"<contents>{annot.get('content') or 'Note'}</contents>"
                    f"</text>"
                )
            elif kind == "square":
                fragments.append(
                    f'<square page="{page}" rect="{rect_str}" color="{color}"/>'
                )
            elif kind == "circle":
                fragments.append(
                    f'<circle page="{page}" rect="{rect_str}" color="{color}"/>'
                )

        xfdf = f"""<?xml version="1.0" encoding="UTF-8"?>
<xfdf xmlns="http://ns.adobe.com/xfdf/" xml:space="preserve">
  <annots>
    {"".join(fragments)}
  </annots>
</xfdf>"""
        return xfdf.encode("utf-8")

    @staticmethod
    def generate_instant_json_content(annotations: Optional[list] = None) -> bytes:
        """Generate Instant JSON annotation content.

        Each annotation is given a sequential id plus UTC created/updated
        timestamps before serialization.
        """
        annotations = annotations or [
            {
                "v": 2,
                "type": "pspdfkit/text",
                "pageIndex": 0,
                "bbox": [100, 100, 200, 150],
                "content": "Test annotation",
                "fontSize": 14,
                "opacity": 1,
                "horizontalAlign": "left",
                "verticalAlign": "top",
            }
        ]
        now = datetime.now(timezone.utc).isoformat()
        instant_data = {
            "format": "https://pspdfkit.com/instant-json/v1",
            "annotations": [
                {
                    **annotation,
                    "id": f"annotation_{i}",
                    "createdAt": now,
                    "updatedAt": now,
                }
                for i, annotation in enumerate(annotations)
            ],
        }
        return json.dumps(instant_data).encode("utf-8")
dictionary") + + if "success" not in result: + raise ValueError("Result must have success property") + if not result.get("success") or "output" not in result: + raise ValueError("Result must be successful with output") + + output = result["output"] + if not isinstance(output.get("buffer"), (bytes, bytearray)): + raise ValueError("Output buffer must be bytes or bytearray") + if output.get("mimeType") != "application/pdf": + raise ValueError("Output must be PDF") + if len(output["buffer"]) == 0: + raise ValueError("Output buffer cannot be empty") + + # Check PDF header + header = output["buffer"][:5].decode(errors="ignore") + if not header.startswith("%PDF-"): + raise ValueError("Invalid PDF header") + + @staticmethod + def validate_office_output( + result: Any, format: Literal["docx", "xlsx", "pptx"] + ) -> None: + """Validates Office document output""" + mime_types = { + "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", + } + + if ( + not isinstance(result, dict) + or not result.get("success") + or "output" not in result + ): + raise ValueError("Result must be successful with output") + + output = result["output"] + if not isinstance(output.get("buffer"), (bytes, bytearray)): + raise ValueError("Output buffer must be bytes or bytearray") + if len(output["buffer"]) == 0: + raise ValueError("Output buffer cannot be empty") + if output.get("mimeType") != mime_types[format]: + raise ValueError(f"Expected {format} MIME type") + + @staticmethod + def validate_image_output( + result: Any, format: Literal["png", "jpeg", "jpg", "webp"] | None = None + ) -> None: + """Validates image output""" + if ( + not isinstance(result, dict) + or not result.get("success") + or "output" not in result + ): + raise ValueError("Result must be successful with output") + + output = 
result["output"] + if not isinstance(output.get("buffer"), (bytes, bytearray)): + raise ValueError("Output buffer must be bytes or bytearray") + if len(output["buffer"]) == 0: + raise ValueError("Output buffer cannot be empty") + + if format: + format_mime_types = { + "png": ["image/png"], + "jpg": ["image/jpeg"], + "jpeg": ["image/jpeg"], + "webp": ["image/webp"], + } + valid_mimes = format_mime_types.get(format, [f"image/{format}"]) + if output.get("mimeType") not in valid_mimes: + raise ValueError( + f"Expected format {format}, got {output.get('mimeType')}" + ) + else: + if not isinstance(output.get("mimeType"), str) or not output[ + "mimeType" + ].startswith("image/"): + raise ValueError("Expected image MIME type") + + @staticmethod + def validate_json_output(result: Any) -> None: + """Validates JSON extraction output""" + if ( + not isinstance(result, dict) + or not result.get("success") + or "output" not in result + ): + raise ValueError("Result must be successful with output") + + output = result["output"] + if "data" not in output: + raise ValueError("Output must have data property") + if not isinstance(output["data"], dict): + raise ValueError("Output data must be an object") + + @staticmethod + def validate_error_response( + result: Any, expected_error_type: str | None = None + ) -> None: + """Validates error response""" + if not isinstance(result, dict): + raise ValueError("Result must be a dictionary") + + if result.get("success"): + raise ValueError("Result should not be successful") + if not isinstance(result.get("errors"), list): + raise ValueError("Result must have errors array") + if len(result["errors"]) == 0: + raise ValueError("Errors array cannot be empty") + + if expected_error_type: + has_expected_error = any( + isinstance(e, dict) + and "error" in e + and ( + e["error"].get("name") == expected_error_type + or e["error"].get("code") == expected_error_type + ) + for e in result["errors"] + ) + if not has_expected_error: + raise 
ValueError(f"Expected error type {expected_error_type} not found") + + +sample_pdf = Path(__file__).parent.joinpath("data", "sample.pdf").read_bytes() + +sample_docx = Path(__file__).parent.joinpath("data", "sample.docx").read_bytes() + +sample_png = Path(__file__).parent.joinpath("data", "sample.png").read_bytes() diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py deleted file mode 100644 index a0ee81b..0000000 --- a/tests/integration/conftest.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Pytest configuration for integration tests.""" - -import pytest - - -def pytest_configure(config): - """Configure pytest for integration tests.""" - # Add custom markers - config.addinivalue_line( - "markers", "integration: marks tests as integration tests (may require API key)" - ) - - -def pytest_collection_modifyitems(config, items): - """Automatically mark integration tests.""" - for item in items: - if "integration" in str(item.fspath): - item.add_marker(pytest.mark.integration) diff --git a/tests/integration/integration_config.py.example b/tests/integration/integration_config.py.example deleted file mode 100644 index 75b359d..0000000 --- a/tests/integration/integration_config.py.example +++ /dev/null @@ -1,13 +0,0 @@ -"""Integration test configuration template. - -Copy this file to integration_config.py and fill in your API key. -""" - -# Your Nutrient DWS API key -API_KEY = "your-api-key-here" - -# Optional: Override base URL if testing against different environment -# BASE_URL = "https://api.nutrient.io/build" - -# Optional: Request timeout for integration tests -# TIMEOUT = 60 \ No newline at end of file diff --git a/tests/integration/test_direct_api_integration.py b/tests/integration/test_direct_api_integration.py deleted file mode 100644 index a36b1c9..0000000 --- a/tests/integration/test_direct_api_integration.py +++ /dev/null @@ -1,683 +0,0 @@ -"""Comprehensive integration tests for Direct API methods. 
- -These tests require a valid API key configured in integration_config.py and -test all Direct API methods against the live Nutrient DWS API. -""" - -import pytest - -from nutrient_dws import NutrientClient -from nutrient_dws.file_handler import get_pdf_page_count - -try: - from . import integration_config # type: ignore[attr-defined] - - API_KEY: str | None = integration_config.API_KEY - BASE_URL: str | None = getattr(integration_config, "BASE_URL", None) - TIMEOUT: int = getattr(integration_config, "TIMEOUT", 60) -except ImportError: - API_KEY = None - BASE_URL = None - TIMEOUT = 60 - - -def assert_is_pdf(file_path_or_bytes: str | bytes) -> None: - """Assert that a file or bytes is a valid PDF. - - Args: - file_path_or_bytes: Path to file or bytes content to check. - """ - if isinstance(file_path_or_bytes, (str, bytes)): - if isinstance(file_path_or_bytes, str): - with open(file_path_or_bytes, "rb") as f: - content = f.read(8) - else: - content = file_path_or_bytes[:8] - - # Check PDF magic number - assert content.startswith(b"%PDF-"), ( - f"File does not start with PDF magic number, got: {content!r}" - ) - else: - raise ValueError("Input must be file path string or bytes") - - -@pytest.mark.skipif(not API_KEY, reason="No API key configured in integration_config.py") -class TestDirectAPIIntegration: - """Comprehensive integration tests for all Direct API methods.""" - - @pytest.fixture - def client(self): - """Create a client with the configured API key.""" - client = NutrientClient(api_key=API_KEY, timeout=TIMEOUT) - yield client - client.close() - - @pytest.fixture - def sample_pdf_path(self): - """Get path to sample PDF file for testing.""" - import os - - return os.path.join(os.path.dirname(__file__), "..", "data", "sample.pdf") - - @pytest.fixture - def sample_docx_path(self): - """Get path to sample DOCX file for testing.""" - import os - - return os.path.join(os.path.dirname(__file__), "..", "data", "sample.docx") - - @pytest.fixture - def 
sample_multipage_pdf_path(self): - """Get path to multi-page sample PDF file for testing.""" - import os - - return os.path.join(os.path.dirname(__file__), "..", "data", "sample_multipage.pdf") - - # Tests for convert_to_pdf - def test_convert_to_pdf_from_docx(self, client, sample_docx_path): - """Test convert_to_pdf method with DOCX input.""" - result = client.convert_to_pdf(sample_docx_path) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_convert_to_pdf_with_output_file(self, client, sample_docx_path, tmp_path): - """Test convert_to_pdf method saving to output file.""" - output_path = str(tmp_path / "converted.pdf") - - result = client.convert_to_pdf(sample_docx_path, output_path=output_path) - - assert result is None - assert (tmp_path / "converted.pdf").exists() - assert (tmp_path / "converted.pdf").stat().st_size > 0 - assert_is_pdf(output_path) - - def test_convert_to_pdf_from_pdf_passthrough(self, client, sample_pdf_path): - """Test convert_to_pdf method with PDF input (should pass through).""" - result = client.convert_to_pdf(sample_pdf_path) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - # Tests for flatten_annotations - def test_flatten_annotations_integration(self, client, sample_pdf_path): - """Test flatten_annotations method with live API.""" - result = client.flatten_annotations(sample_pdf_path) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_flatten_annotations_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test flatten_annotations method saving to output file.""" - output_path = str(tmp_path / "flattened.pdf") - - result = client.flatten_annotations(sample_pdf_path, output_path=output_path) - - assert result is None - assert (tmp_path / "flattened.pdf").exists() - assert_is_pdf(output_path) - - # Tests for rotate_pages - def test_rotate_pages_integration(self, client, sample_pdf_path): - 
"""Test rotate_pages method with live API.""" - result = client.rotate_pages(sample_pdf_path, degrees=90) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_rotate_pages_specific_pages(self, client, sample_pdf_path): - """Test rotate_pages method with specific page indexes.""" - result = client.rotate_pages(sample_pdf_path, degrees=180, page_indexes=[0]) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_rotate_pages_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test rotate_pages method saving to output file.""" - output_path = str(tmp_path / "rotated.pdf") - - result = client.rotate_pages(sample_pdf_path, degrees=270, output_path=output_path) - - assert result is None - assert (tmp_path / "rotated.pdf").exists() - assert_is_pdf(output_path) - - # Tests for ocr_pdf - def test_ocr_pdf_integration(self, client, sample_pdf_path): - """Test ocr_pdf method with live API.""" - result = client.ocr_pdf(sample_pdf_path, language="english") - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_ocr_pdf_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test ocr_pdf method saving to output file.""" - output_path = str(tmp_path / "ocr.pdf") - - result = client.ocr_pdf(sample_pdf_path, language="english", output_path=output_path) - - assert result is None - assert (tmp_path / "ocr.pdf").exists() - assert_is_pdf(output_path) - - # Tests for watermark_pdf - def test_watermark_pdf_text_integration(self, client, sample_pdf_path): - """Test watermark_pdf method with text watermark.""" - result = client.watermark_pdf( - sample_pdf_path, text="DRAFT", width=200, height=100, opacity=0.5 - ) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_watermark_pdf_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test watermark_pdf method saving to output file.""" - 
output_path = str(tmp_path / "watermarked.pdf") - - result = client.watermark_pdf( - sample_pdf_path, - text="CONFIDENTIAL", - width=150, - height=75, - position="top-right", - output_path=output_path, - ) - - assert result is None - assert (tmp_path / "watermarked.pdf").exists() - assert_is_pdf(output_path) - - # Tests for apply_redactions - def test_apply_redactions_integration(self, client, sample_pdf_path): - """Test apply_redactions method with live API.""" - result = client.apply_redactions(sample_pdf_path) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_apply_redactions_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test apply_redactions method saving to output file.""" - output_path = str(tmp_path / "redacted.pdf") - - result = client.apply_redactions(sample_pdf_path, output_path=output_path) - - assert result is None - assert (tmp_path / "redacted.pdf").exists() - assert_is_pdf(output_path) - - # Tests for merge_pdfs - def test_merge_pdfs_integration(self, client, sample_pdf_path, tmp_path): - """Test merge_pdfs method with live API.""" - # Create a second PDF by copying the sample - second_pdf_path = str(tmp_path / "second.pdf") - import shutil - - shutil.copy2(sample_pdf_path, second_pdf_path) - - result = client.merge_pdfs([sample_pdf_path, second_pdf_path]) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_merge_pdfs_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test merge_pdfs method saving to output file.""" - # Create a second PDF by copying the sample - second_pdf_path = str(tmp_path / "second.pdf") - output_path = str(tmp_path / "merged.pdf") - import shutil - - shutil.copy2(sample_pdf_path, second_pdf_path) - - result = client.merge_pdfs([sample_pdf_path, second_pdf_path], output_path=output_path) - - assert result is None - assert (tmp_path / "merged.pdf").exists() - assert_is_pdf(output_path) - - def 
test_merge_pdfs_error_single_file(self, client, sample_pdf_path): - """Test merge_pdfs method with single file raises error.""" - with pytest.raises(ValueError, match="At least 2 files required"): - client.merge_pdfs([sample_pdf_path]) - - # Tests for split_pdf - def test_split_pdf_integration(self, client, sample_multipage_pdf_path, tmp_path): - """Test split_pdf method with live API.""" - # Test splitting PDF into two parts - multi-page PDF has 3 pages - page_ranges = [ - {"start": 0, "end": 0}, # First page - {"start": 1}, # Remaining pages - ] - - # Test getting bytes back - result = client.split_pdf(sample_multipage_pdf_path, page_ranges=page_ranges) - - assert isinstance(result, list) - assert len(result) == 2 # Should return exactly 2 parts - assert all(isinstance(pdf_bytes, bytes) for pdf_bytes in result) - assert all(len(pdf_bytes) > 0 for pdf_bytes in result) - - # Verify both results are valid PDFs - for pdf_bytes in result: - assert_is_pdf(pdf_bytes) - - # Verify the number of pages in each output PDF - total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert get_pdf_page_count(result[0]) == 1 # First PDF should have 1 page - assert ( - get_pdf_page_count(result[1]) == total_page_count - 1 - ) # Second PDF should have the remaining pages - - def test_split_pdf_with_output_files(self, client, sample_multipage_pdf_path, tmp_path): - """Test split_pdf method saving to output files.""" - output_paths = [str(tmp_path / "page1.pdf"), str(tmp_path / "remaining.pdf")] - - page_ranges = [ - {"start": 0, "end": 0}, # First page - {"start": 1}, # Remaining pages - ] - - # Test saving to files - result = client.split_pdf( - sample_multipage_pdf_path, page_ranges=page_ranges, output_paths=output_paths - ) - - # Should return empty list when saving to files - assert result == [] - - # Check that output files were created - assert (tmp_path / "page1.pdf").exists() - assert (tmp_path / "page1.pdf").stat().st_size > 0 - assert_is_pdf(str(tmp_path / 
"page1.pdf")) - - # Verify the number of pages in the first output PDF - assert get_pdf_page_count(str(tmp_path / "page1.pdf")) == 1 # First PDF should have 1 page - - # Second file should exist since sample PDF has multiple pages - assert (tmp_path / "remaining.pdf").exists() - assert (tmp_path / "remaining.pdf").stat().st_size > 0 - assert_is_pdf(str(tmp_path / "remaining.pdf")) - - # Verify the number of pages in the second output PDF - total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert ( - get_pdf_page_count(str(tmp_path / "remaining.pdf")) == total_page_count - 1 - ) # Second PDF should have remaining pages - - def test_split_pdf_no_ranges_error(self, client, sample_pdf_path): - """Test split_pdf with no ranges returns first page by default.""" - # When no page_ranges provided, should default to first page - result = client.split_pdf(sample_pdf_path) - - assert isinstance(result, list) - assert len(result) == 1 # Should return single PDF (first page) - assert isinstance(result[0], bytes) - assert len(result[0]) > 0 - assert_is_pdf(result[0]) - - # Verify the number of pages in the output PDF - assert get_pdf_page_count(result[0]) == 1 # Should contain only the first page - - def test_split_pdf_output_paths_length_mismatch_error(self, client, sample_pdf_path): - """Test split_pdf method with mismatched output_paths and page_ranges lengths.""" - page_ranges = [{"start": 0, "end": 1}, {"start": 1}] - output_paths = ["page1.pdf"] # Only one path for two ranges - - with pytest.raises(ValueError, match="output_paths length must match page_ranges length"): - client.split_pdf(sample_pdf_path, page_ranges=page_ranges, output_paths=output_paths) - - def test_split_pdf_too_many_ranges_error(self, client, sample_pdf_path): - """Test split_pdf method with too many ranges raises error.""" - # Create 51 ranges (exceeds the 50 limit) - page_ranges = [{"start": i, "end": i + 1} for i in range(51)] - - with pytest.raises(ValueError, match="Maximum 50 page 
ranges allowed"): - client.split_pdf(sample_pdf_path, page_ranges=page_ranges) - - # Tests for duplicate_pdf_pages - def test_duplicate_pdf_pages_basic(self, client, sample_pdf_path): - """Test duplicate_pdf_pages method with basic duplication.""" - # Test duplicating first page twice (works with single-page PDF) - result = client.duplicate_pdf_pages(sample_pdf_path, page_indexes=[0, 0]) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - # Verify the number of pages in the output PDF - assert get_pdf_page_count(result) == 2 # Should have 2 pages (duplicated the first page) - - def test_duplicate_pdf_pages_reorder(self, client, sample_multipage_pdf_path): - """Test duplicate_pdf_pages method with page reordering.""" - # Test reordering pages (multi-page PDF has 3 pages) - result = client.duplicate_pdf_pages(sample_multipage_pdf_path, page_indexes=[1, 0]) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - # Verify the number of pages in the output PDF - assert get_pdf_page_count(result) == 2 # Should have 2 pages (page 2 and page 1) - - def test_duplicate_pdf_pages_with_output_file( - self, client, sample_multipage_pdf_path, tmp_path - ): - """Test duplicate_pdf_pages method saving to output file.""" - output_path = str(tmp_path / "duplicated.pdf") - - # Test duplicating and saving to file (multi-page PDF has 3 pages) - result = client.duplicate_pdf_pages( - sample_multipage_pdf_path, page_indexes=[0, 0, 1], output_path=output_path - ) - - # Should return None when saving to file - assert result is None - - # Check that output file was created - assert (tmp_path / "duplicated.pdf").exists() - assert (tmp_path / "duplicated.pdf").stat().st_size > 0 - assert_is_pdf(output_path) - - # Verify the number of pages in the output PDF - assert get_pdf_page_count(output_path) == 3 # Should have 3 pages (page 1, page 1, page 2) - - def test_duplicate_pdf_pages_negative_indexes(self, client, 
sample_pdf_path): - """Test duplicate_pdf_pages method with negative indexes.""" - # Test using negative indexes (last page - works with single-page PDF) - result = client.duplicate_pdf_pages(sample_pdf_path, page_indexes=[-1, 0, -1]) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - # Verify the number of pages in the output PDF - assert ( - get_pdf_page_count(result) == 3 - ) # Should have 3 pages (last page, first page, last page) - - def test_duplicate_pdf_pages_empty_indexes_error(self, client, sample_pdf_path): - """Test duplicate_pdf_pages method with empty page_indexes raises error.""" - with pytest.raises(ValueError, match="page_indexes cannot be empty"): - client.duplicate_pdf_pages(sample_pdf_path, page_indexes=[]) - - # Tests for delete_pdf_pages - def test_delete_pdf_pages_basic(self, client, sample_multipage_pdf_path): - """Test delete_pdf_pages method with basic page deletion.""" - # Test deleting first page (multi-page PDF has 3 pages) - result = client.delete_pdf_pages(sample_multipage_pdf_path, page_indexes=[0]) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert ( - get_pdf_page_count(result) == total_page_count - 1 - ) # Should have 2 pages (deleted first page from 3-page PDF) - - def test_delete_pdf_pages_multiple(self, client, sample_multipage_pdf_path): - """Test delete_pdf_pages method with multiple page deletion.""" - # Test deleting multiple pages (deleting pages 1 and 3 from 3-page PDF) - result = client.delete_pdf_pages(sample_multipage_pdf_path, page_indexes=[0, 2]) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert ( - get_pdf_page_count(result) == total_page_count - 2 - ) # 
Should have 1 page (deleted pages 1 and 3 from 3-page PDF) - - def test_delete_pdf_pages_with_output_file(self, client, sample_multipage_pdf_path, tmp_path): - """Test delete_pdf_pages method saving to output file.""" - output_path = str(tmp_path / "pages_deleted.pdf") - - # Test deleting pages and saving to file - result = client.delete_pdf_pages( - sample_multipage_pdf_path, page_indexes=[1], output_path=output_path - ) - - # Should return None when saving to file - assert result is None - - # Check that output file was created - assert (tmp_path / "pages_deleted.pdf").exists() - assert (tmp_path / "pages_deleted.pdf").stat().st_size > 0 - assert_is_pdf(output_path) - - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert ( - get_pdf_page_count(output_path) == total_page_count - 1 - ) # Should have 2 pages (deleted page 2 from 3-page PDF) - - def test_delete_pdf_pages_negative_indexes_error(self, client, sample_pdf_path): - """Test delete_pdf_pages method with negative indexes raises error.""" - # Currently negative indexes are not supported for deletion - with pytest.raises(ValueError, match="Negative page indexes not yet supported"): - client.delete_pdf_pages(sample_pdf_path, page_indexes=[-1]) - - def test_delete_pdf_pages_empty_indexes_error(self, client, sample_pdf_path): - """Test delete_pdf_pages method with empty page_indexes raises error.""" - with pytest.raises(ValueError, match="page_indexes cannot be empty"): - client.delete_pdf_pages(sample_pdf_path, page_indexes=[]) - - def test_delete_pdf_pages_duplicate_indexes(self, client, sample_multipage_pdf_path): - """Test delete_pdf_pages method with duplicate page indexes.""" - # Test that duplicate indexes are handled correctly (should remove duplicates) - result = client.delete_pdf_pages(sample_multipage_pdf_path, page_indexes=[0, 0, 1]) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - # Verify 
the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert ( - get_pdf_page_count(result) == total_page_count - 2 - ) # Should have 1 page (deleted pages 1 and 2 from 3-page PDF) - - # Tests for add_page - def test_add_page_at_beginning(self, client, sample_pdf_path): - """Test add_page method inserting at the beginning.""" - # Test inserting at beginning (insert_index=0) - result = client.add_page(sample_pdf_path, insert_index=0) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 1 - - def test_add_page_multiple_pages(self, client, sample_multipage_pdf_path): - """Test add_page method with multiple pages.""" - # Test adding multiple blank pages before second page - result = client.add_page(sample_multipage_pdf_path, insert_index=1, page_count=3) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 3 - - def test_add_page_at_end(self, client, sample_pdf_path): - """Test add_page method inserting at the end.""" - # Test inserting at end using -1 - result = client.add_page(sample_pdf_path, insert_index=-1, page_count=2) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 2 - - def test_add_page_before_specific_page(self, client, sample_multipage_pdf_path): - """Test add_page method inserting before a specific page.""" - # Test inserting before page 3 (insert_index=2) - result = client.add_page(sample_multipage_pdf_path, 
insert_index=2, page_count=1) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 1 - - def test_add_page_custom_size_orientation(self, client, sample_pdf_path): - """Test add_page method with custom page size and orientation.""" - # Test adding Letter-sized landscape pages at beginning - result = client.add_page( - sample_pdf_path, - insert_index=0, - page_size="Letter", - orientation="landscape", - page_count=2, - ) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 2 - - def test_add_page_with_output_file(self, client, sample_multipage_pdf_path, tmp_path): - """Test add_page method saving to output file.""" - output_path = str(tmp_path / "with_blank_pages.pdf") - - # Test adding pages and saving to file - result = client.add_page( - sample_multipage_pdf_path, insert_index=1, page_count=2, output_path=output_path - ) - - # Should return None when saving to file - assert result is None - - # Check that output file was created - assert (tmp_path / "with_blank_pages.pdf").exists() - assert (tmp_path / "with_blank_pages.pdf").stat().st_size > 0 - assert_is_pdf(output_path) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert get_pdf_page_count(output_path) == total_page_count + 2 - - def test_add_page_different_page_sizes(self, client, sample_pdf_path): - """Test add_page method with different page sizes.""" - # Test various page sizes - page_sizes = ["A4", "Letter", "Legal", "A3", "A5"] - - for page_size in page_sizes: - result = client.add_page(sample_pdf_path, insert_index=0, 
page_size=page_size) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 1 - - def test_add_page_invalid_page_count_error(self, client, sample_pdf_path): - """Test add_page method with invalid page_count raises error.""" - # Test zero page count - with pytest.raises(ValueError, match="page_count must be at least 1"): - client.add_page(sample_pdf_path, insert_index=0, page_count=0) - - # Test negative page count - with pytest.raises(ValueError, match="page_count must be at least 1"): - client.add_page(sample_pdf_path, insert_index=0, page_count=-1) - - # Test excessive page count - with pytest.raises(ValueError, match="page_count cannot exceed 100"): - client.add_page(sample_pdf_path, insert_index=0, page_count=101) - - def test_add_page_invalid_position_error(self, client, sample_pdf_path): - """Test add_page method with invalid insert_index raises error.""" - # Test invalid negative position (anything below -1) - with pytest.raises(ValueError, match="insert_index must be -1"): - client.add_page(sample_pdf_path, insert_index=-2, page_count=1) - - with pytest.raises(ValueError, match="insert_index must be -1"): - client.add_page(sample_pdf_path, insert_index=-5, page_count=1) - - # Tests for set_page_label - def test_set_page_label_integration(self, client, sample_pdf_path, tmp_path): - """Test set_page_label method with live API.""" - labels = [{"pages": {"start": 0, "end": 0}, "label": "Cover"}] - - output_path = str(tmp_path / "labeled.pdf") - - # Try to set page labels - result = client.set_page_label(sample_pdf_path, labels, output_path=output_path) - - # If successful, verify output - assert result is None # Should return None when output_path provided - assert (tmp_path / "labeled.pdf").exists() - assert_is_pdf(output_path) - - def 
test_set_page_label_return_bytes(self, client, sample_pdf_path): - """Test set_page_label method returning bytes.""" - labels = [{"pages": {"start": 0, "end": 0}, "label": "i"}] - - # Test getting bytes back - result = client.set_page_label(sample_pdf_path, labels) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_set_page_label_multiple_ranges(self, client, sample_multipage_pdf_path): - """Test set_page_label method with multiple page ranges.""" - labels = [ - {"pages": {"start": 0, "end": 0}, "label": "i"}, - {"pages": {"start": 1, "end": 1}, "label": "intro"}, - ] - - result = client.set_page_label(sample_multipage_pdf_path, labels) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_set_page_label_single_page(self, client, sample_pdf_path): - """Test set_page_label method with single page label.""" - labels = [{"pages": {"start": 0, "end": 0}, "label": "Cover Page"}] - - result = client.set_page_label(sample_pdf_path, labels) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_set_page_label_empty_labels_error(self, client, sample_pdf_path): - """Test set_page_label method with empty labels raises error.""" - with pytest.raises(ValueError, match="labels list cannot be empty"): - client.set_page_label(sample_pdf_path, labels=[]) - - def test_set_page_label_invalid_label_config_error(self, client, sample_pdf_path): - """Test set_page_label method with invalid label configuration raises error.""" - # Missing 'pages' key - with pytest.raises(ValueError, match="missing required 'pages' key"): - client.set_page_label(sample_pdf_path, labels=[{"label": "test"}]) - - # Missing 'label' key - with pytest.raises(ValueError, match="missing required 'label' key"): - client.set_page_label(sample_pdf_path, labels=[{"pages": {"start": 0}}]) - - # Invalid pages format - with pytest.raises(ValueError, match="'pages' must be a dict with 
'start' key"): - client.set_page_label(sample_pdf_path, labels=[{"pages": "invalid", "label": "test"}]) diff --git a/tests/integration/test_live_api.py b/tests/integration/test_live_api.py deleted file mode 100644 index 4591f42..0000000 --- a/tests/integration/test_live_api.py +++ /dev/null @@ -1,570 +0,0 @@ -"""Integration tests against the live Nutrient DWS API. - -These tests require a valid API key configured in integration_config.py. -""" - -from __future__ import annotations - -import pytest - -from nutrient_dws import NutrientClient -from nutrient_dws.file_handler import get_pdf_page_count - -try: - from . import integration_config # type: ignore[attr-defined] - - API_KEY: str | None = integration_config.API_KEY - BASE_URL: str | None = getattr(integration_config, "BASE_URL", None) - TIMEOUT: int = getattr(integration_config, "TIMEOUT", 60) -except ImportError: - API_KEY = None - BASE_URL = None - TIMEOUT = 60 - - -def assert_is_pdf(file_path_or_bytes: str | bytes) -> None: - """Assert that a file or bytes is a valid PDF. - - Args: - file_path_or_bytes: Path to file or bytes content to check. 
- """ - if isinstance(file_path_or_bytes, (str, bytes)): - if isinstance(file_path_or_bytes, str): - with open(file_path_or_bytes, "rb") as f: - content = f.read(8) - else: - content = file_path_or_bytes[:8] - - # Check PDF magic number - assert content.startswith(b"%PDF-"), ( - f"File does not start with PDF magic number, got: {content!r}" - ) - else: - raise ValueError("Input must be file path string or bytes") - - -@pytest.mark.skipif(not API_KEY, reason="No API key configured in integration_config.py") -class TestLiveAPI: - """Integration tests against live API.""" - - @pytest.fixture - def client(self): - """Create a client with the configured API key.""" - client = NutrientClient(api_key=API_KEY, timeout=TIMEOUT) - yield client - client.close() - - @pytest.fixture - def sample_pdf_path(self): - """Get path to sample PDF file for testing.""" - import os - - return os.path.join(os.path.dirname(__file__), "..", "data", "sample.pdf") - - def test_client_initialization(self): - """Test that client initializes correctly with API key.""" - client = NutrientClient(api_key=API_KEY) - assert client._api_key == API_KEY - client.close() - - def test_client_missing_api_key(self): - """Test that client works without API key but fails on API calls.""" - client = NutrientClient() - # Should not raise during initialization - assert client is not None - client.close() - - def test_basic_api_connectivity(self, client, sample_pdf_path): - """Test basic API connectivity with a simple operation.""" - # This test will depend on what operations are available - # For now, we'll test that we can create a builder without errors - builder = client.build(input_file=sample_pdf_path) - assert builder is not None - - @pytest.mark.skip(reason="Requires specific tool implementation") - def test_convert_operation(self, client, sample_pdf_path, tmp_path): - """Test a basic convert operation (example - adjust based on available tools).""" - # This is an example - adjust based on actual available 
tools - # output_path = tmp_path / "output.pdf" - # result = client.convert_to_pdf(input_file=sample_pdf_path, output_path=str(output_path)) - - # assert output_path.exists() - # assert output_path.stat().st_size > 0 - - def test_builder_api_basic(self, client, sample_pdf_path): - """Test basic builder API functionality.""" - builder = client.build(input_file=sample_pdf_path) - - # Test that we can add steps without errors - # This will need to be updated based on actual available tools - # builder.add_step("example-tool", {}) - - assert builder is not None - - def test_split_pdf_integration(self, client, sample_pdf_path, tmp_path): - """Test split_pdf method with live API.""" - # Test splitting PDF into two parts - sample PDF should have multiple pages - page_ranges = [ - {"start": 0, "end": 0}, # First page - {"start": 1}, # Remaining pages - ] - - # Test getting bytes back - result = client.split_pdf(sample_pdf_path, page_ranges=page_ranges) - - assert isinstance(result, list) - assert len(result) == 2 # Should return exactly 2 parts since sample has multiple pages - assert all(isinstance(pdf_bytes, bytes) for pdf_bytes in result) - assert all(len(pdf_bytes) > 0 for pdf_bytes in result) - - # Verify both results are valid PDFs - for pdf_bytes in result: - assert_is_pdf(pdf_bytes) - - # Verify the number of pages in each output PDF - assert get_pdf_page_count(result[0]) == 1 # First PDF should have 1 page - # The second PDF should have the remaining pages (total pages - 1) - total_pages = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result[1]) == total_pages - 1 - - def test_split_pdf_with_output_files(self, client, sample_pdf_path, tmp_path): - """Test split_pdf method saving to output files.""" - output_paths = [str(tmp_path / "page1.pdf"), str(tmp_path / "remaining.pdf")] - - page_ranges = [ - {"start": 0, "end": 0}, # First page - {"start": 1}, # Remaining pages - ] - - # Test saving to files - result = client.split_pdf( - sample_pdf_path, 
page_ranges=page_ranges, output_paths=output_paths - ) - - # Should return empty list when saving to files - assert result == [] - - # Check that output files were created - assert (tmp_path / "page1.pdf").exists() - assert (tmp_path / "page1.pdf").stat().st_size > 0 - assert_is_pdf(str(tmp_path / "page1.pdf")) - - # Verify the number of pages in the first output PDF - assert get_pdf_page_count(str(tmp_path / "page1.pdf")) == 1 # First PDF should have 1 page - - # Second file should exist since sample PDF has multiple pages - assert (tmp_path / "remaining.pdf").exists() - assert (tmp_path / "remaining.pdf").stat().st_size > 0 - assert_is_pdf(str(tmp_path / "remaining.pdf")) - - # Verify the number of pages in the second output PDF - # The second PDF should have the remaining pages (total pages - 1) - total_pages = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(str(tmp_path / "remaining.pdf")) == total_pages - 1 - - def test_split_pdf_single_page_default(self, client, sample_pdf_path): - """Test split_pdf with default behavior (single page).""" - # Test default splitting (should extract first page) - result = client.split_pdf(sample_pdf_path) - - assert isinstance(result, list) - assert len(result) == 1 - assert isinstance(result[0], bytes) - assert len(result[0]) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result[0]) - - # Verify the number of pages in the output PDF - assert get_pdf_page_count(result[0]) == 1 # Should contain only the first page - - def test_set_page_label_integration(self, client, sample_pdf_path, tmp_path): - """Test set_page_label method with live API.""" - labels = [{"pages": {"start": 0, "end": 0}, "label": "Cover"}] - - output_path = str(tmp_path / "labeled.pdf") - - # Try to set page labels - result = client.set_page_label(sample_pdf_path, labels, output_path=output_path) - - # If successful, verify output - assert result is None # Should return None when output_path provided - assert (tmp_path / 
"labeled.pdf").exists() - assert_is_pdf(output_path) - - def test_set_page_label_return_bytes(self, client, sample_pdf_path): - """Test set_page_label method returning bytes.""" - labels = [{"pages": {"start": 0, "end": 0}, "label": "i"}] - - # Test getting bytes back - result = client.set_page_label(sample_pdf_path, labels) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_set_page_label_multiple_ranges(self, client, sample_pdf_path): - """Test set_page_label method with multiple page ranges.""" - labels = [ - {"pages": {"start": 0, "end": 0}, "label": "i"}, - {"pages": {"start": 1, "end": 1}, "label": "intro"}, - {"pages": {"start": 2, "end": 2}, "label": "final"}, - ] - - result = client.set_page_label(sample_pdf_path, labels) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_set_page_label_single_page(self, client, sample_pdf_path): - """Test set_page_label method with single page label.""" - labels = [{"pages": {"start": 0, "end": 0}, "label": "Cover Page"}] - - result = client.set_page_label(sample_pdf_path, labels) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_set_page_label_empty_labels_error(self, client, sample_pdf_path): - """Test set_page_label method with empty labels raises error.""" - with pytest.raises(ValueError, match="labels list cannot be empty"): - client.set_page_label(sample_pdf_path, labels=[]) - - def test_set_page_label_invalid_label_config_error(self, client, sample_pdf_path): - """Test set_page_label method with invalid label configuration raises error.""" - # Missing 'pages' key - with pytest.raises(ValueError, match="missing required 'pages' key"): - client.set_page_label(sample_pdf_path, labels=[{"label": "test"}]) - - # Missing 'label' key - with pytest.raises(ValueError, match="missing required 'label' key"): - client.set_page_label(sample_pdf_path, labels=[{"pages": {"start": 0}}]) - 
- # Invalid pages format - with pytest.raises(ValueError, match="'pages' must be a dict with 'start' key"): - client.set_page_label(sample_pdf_path, labels=[{"pages": "invalid", "label": "test"}]) - - def test_duplicate_pdf_pages_basic(self, client, sample_pdf_path): - """Test duplicate_pdf_pages method with basic duplication.""" - # Test duplicating first page twice - result = client.duplicate_pdf_pages(sample_pdf_path, page_indexes=[0, 0]) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - - # Verify the number of pages in the output PDF - assert get_pdf_page_count(result) == 2 # Should have 2 pages (duplicated the first page) - - def test_duplicate_pdf_pages_reorder(self, client, sample_pdf_path): - """Test duplicate_pdf_pages method with page reordering.""" - # Test reordering pages (assumes sample PDF has at least 2 pages) - result = client.duplicate_pdf_pages(sample_pdf_path, page_indexes=[1, 0]) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - - # Verify the number of pages in the output PDF - assert get_pdf_page_count(result) == 2 # Should have 2 pages (page 2 and page 1) - - def test_duplicate_pdf_pages_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test duplicate_pdf_pages method saving to output file.""" - output_path = str(tmp_path / "duplicated.pdf") - - # Test duplicating and saving to file - result = client.duplicate_pdf_pages( - sample_pdf_path, page_indexes=[0, 0, 1], output_path=output_path - ) - - # Should return None when saving to file - assert result is None - - # Check that output file was created - assert (tmp_path / "duplicated.pdf").exists() - assert (tmp_path / "duplicated.pdf").stat().st_size > 0 - assert_is_pdf(output_path) - - # Verify the number of pages in the output PDF - assert get_pdf_page_count(output_path) == 3 # Should have 3 pages (page 1, page 1, page 2) - - def 
test_duplicate_pdf_pages_negative_indexes(self, client, sample_pdf_path): - """Test duplicate_pdf_pages method with negative indexes.""" - # Test using negative indexes (last page) - result = client.duplicate_pdf_pages(sample_pdf_path, page_indexes=[-1, 0, -1]) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - - # Verify the number of pages in the output PDF - assert ( - get_pdf_page_count(result) == 3 - ) # Should have 3 pages (last page, first page, last page) - - def test_duplicate_pdf_pages_empty_indexes_error(self, client, sample_pdf_path): - """Test duplicate_pdf_pages method with empty page_indexes raises error.""" - with pytest.raises(ValueError, match="page_indexes cannot be empty"): - client.duplicate_pdf_pages(sample_pdf_path, page_indexes=[]) - - def test_delete_pdf_pages_basic(self, client, sample_pdf_path): - """Test delete_pdf_pages method with basic page deletion.""" - # Test deleting first page (assuming sample PDF has at least 2 pages) - result = client.delete_pdf_pages(sample_pdf_path, page_indexes=[0]) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - - # Verify the number of pages in the output PDF - total_pages = get_pdf_page_count(sample_pdf_path) - assert ( - get_pdf_page_count(result) == total_pages - 1 - ) # Should have one less page than original - - def test_delete_pdf_pages_multiple(self, client, sample_pdf_path): - """Test delete_pdf_pages method with multiple page deletion.""" - # Test deleting multiple pages - result = client.delete_pdf_pages(sample_pdf_path, page_indexes=[0, 2]) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - - # Verify the number of pages in the output PDF - total_pages = get_pdf_page_count(sample_pdf_path) - # Should have two less pages than original (deleted pages 1 and 3) - assert 
get_pdf_page_count(result) == total_pages - 2 - - def test_delete_pdf_pages_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test delete_pdf_pages method saving to output file.""" - output_path = str(tmp_path / "pages_deleted.pdf") - - # Test deleting pages and saving to file - result = client.delete_pdf_pages(sample_pdf_path, page_indexes=[1], output_path=output_path) - - # Should return None when saving to file - assert result is None - - # Check that output file was created - assert (tmp_path / "pages_deleted.pdf").exists() - assert (tmp_path / "pages_deleted.pdf").stat().st_size > 0 - assert_is_pdf(output_path) - - # Verify the number of pages in the output PDF - total_pages = get_pdf_page_count(sample_pdf_path) - # Should have one less page than original (deleted page 2) - assert get_pdf_page_count(output_path) == total_pages - 1 - - def test_delete_pdf_pages_negative_indexes_error(self, client, sample_pdf_path): - """Test delete_pdf_pages method with negative indexes raises error.""" - # Currently negative indexes are not supported for deletion - with pytest.raises(ValueError, match="Negative page indexes not yet supported"): - client.delete_pdf_pages(sample_pdf_path, page_indexes=[-1]) - - def test_delete_pdf_pages_empty_indexes_error(self, client, sample_pdf_path): - """Test delete_pdf_pages method with empty page_indexes raises error.""" - with pytest.raises(ValueError, match="page_indexes cannot be empty"): - client.delete_pdf_pages(sample_pdf_path, page_indexes=[]) - - def test_delete_pdf_pages_duplicate_indexes(self, client, sample_pdf_path): - """Test delete_pdf_pages method with duplicate page indexes.""" - # Test that duplicate indexes are handled correctly (should remove duplicates) - result = client.delete_pdf_pages(sample_pdf_path, page_indexes=[0, 0, 1]) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - - # Verify the number of pages in the output PDF - 
total_pages = get_pdf_page_count(sample_pdf_path) - # Should have two less pages than original (deleted pages 1 and 2) - assert get_pdf_page_count(result) == total_pages - 2 - - @pytest.fixture - def sample_docx_path(self): - """Get path to sample DOCX file for testing.""" - import os - - return os.path.join(os.path.dirname(__file__), "..", "data", "sample.docx") - - def test_convert_to_pdf_from_docx(self, client, sample_docx_path): - """Test convert_to_pdf method with DOCX input.""" - # Test converting DOCX to PDF and getting bytes back - result = client.convert_to_pdf(sample_docx_path) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - - def test_convert_to_pdf_with_output_file(self, client, sample_docx_path, tmp_path): - """Test convert_to_pdf method saving to output file.""" - output_path = str(tmp_path / "converted.pdf") - - # Test converting and saving to file - result = client.convert_to_pdf(sample_docx_path, output_path=output_path) - - # Should return None when saving to file - assert result is None - - # Check that output file was created - assert (tmp_path / "converted.pdf").exists() - assert (tmp_path / "converted.pdf").stat().st_size > 0 - assert_is_pdf(output_path) - - def test_convert_to_pdf_from_pdf_passthrough(self, client, sample_pdf_path): - """Test convert_to_pdf method with PDF input (should pass through).""" - # Test that PDF input passes through unchanged - result = client.convert_to_pdf(sample_pdf_path) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - - def test_add_page_at_beginning(self, client, sample_pdf_path): - """Test add_page method inserting at the beginning.""" - # Test inserting at beginning (insert_index=0) - result = client.add_page(sample_pdf_path, insert_index=0) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - 
# Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 1 - - def test_add_page_multiple_pages(self, client, sample_pdf_path): - """Test add_page method with multiple pages.""" - # Test adding multiple blank pages before second page - result = client.add_page(sample_pdf_path, insert_index=1, page_count=3) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 3 - - def test_add_page_at_end(self, client, sample_pdf_path): - """Test add_page method inserting at the end.""" - # Test inserting at end using -1 - result = client.add_page(sample_pdf_path, insert_index=-1, page_count=2) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 2 - - def test_add_page_before_specific_page(self, client, sample_pdf_path): - """Test add_page method inserting before a specific page.""" - # Test inserting before page 3 (insert_index=2) - result = client.add_page(sample_pdf_path, insert_index=2, page_count=1) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 1 - - def test_add_page_custom_size_orientation(self, client, sample_pdf_path): - """Test add_page method with custom page size and orientation.""" - # Test adding Letter-sized landscape pages at beginning - result = client.add_page( - 
sample_pdf_path, - insert_index=0, - page_size="Letter", - orientation="landscape", - page_count=2, - ) - - assert isinstance(result, bytes) - assert len(result) > 0 - - # Verify result is a valid PDF - assert_is_pdf(result) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 2 - - def test_add_page_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test add_page method saving to output file.""" - output_path = str(tmp_path / "with_blank_pages.pdf") - - # Test adding pages and saving to file - result = client.add_page( - sample_pdf_path, insert_index=1, page_count=2, output_path=output_path - ) - - # Should return None when saving to file - assert result is None - - # Check that output file was created - assert (tmp_path / "with_blank_pages.pdf").exists() - assert (tmp_path / "with_blank_pages.pdf").stat().st_size > 0 - assert_is_pdf(output_path) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(output_path) == total_page_count + 2 - - def test_add_page_different_page_sizes(self, client, sample_pdf_path): - """Test add_page method with different page sizes.""" - # Test various page sizes - page_sizes = ["A4", "Letter", "Legal", "A3", "A5"] - - for page_size in page_sizes: - result = client.add_page(sample_pdf_path, insert_index=0, page_size=page_size) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - # Verify the number of pages in the output PDF - total_page_count = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result) == total_page_count + 1 - - def test_add_page_invalid_page_count_error(self, client, sample_pdf_path): - """Test add_page method with invalid page_count raises error.""" - # Test zero page count - with pytest.raises(ValueError, match="page_count must be at least 1"): - 
client.add_page(sample_pdf_path, insert_index=0, page_count=0) - - # Test negative page count - with pytest.raises(ValueError, match="page_count must be at least 1"): - client.add_page(sample_pdf_path, insert_index=0, page_count=-1) - - def test_add_page_invalid_position_error(self, client, sample_pdf_path): - """Test add_page method with invalid insert_index raises error.""" - # Test invalid negative position (anything below -1) - with pytest.raises(ValueError, match="insert_index must be -1"): - client.add_page(sample_pdf_path, insert_index=-2, page_count=1) - - with pytest.raises(ValueError, match="insert_index must be -1"): - client.add_page(sample_pdf_path, insert_index=-5, page_count=1) diff --git a/tests/integration/test_new_tools_integration.py b/tests/integration/test_new_tools_integration.py deleted file mode 100644 index 7dd70e2..0000000 --- a/tests/integration/test_new_tools_integration.py +++ /dev/null @@ -1,453 +0,0 @@ -"""Integration tests for newly added Direct API methods. - -These tests require a valid API key configured in integration_config.py and -test the new Direct API methods against the live Nutrient DWS API. -""" - -from pathlib import Path - -import pytest - -from nutrient_dws import NutrientClient - -try: - from . import integration_config # type: ignore[attr-defined] - - API_KEY: str | None = integration_config.API_KEY - BASE_URL: str | None = getattr(integration_config, "BASE_URL", None) - TIMEOUT: int = getattr(integration_config, "TIMEOUT", 60) -except ImportError: - API_KEY = None - BASE_URL = None - TIMEOUT = 60 - - -def assert_is_pdf(file_path_or_bytes: str | bytes) -> None: - """Assert that a file or bytes is a valid PDF. - - Args: - file_path_or_bytes: Path to file or bytes content to check. 
- """ - if isinstance(file_path_or_bytes, (str, bytes)): - if isinstance(file_path_or_bytes, str): - with open(file_path_or_bytes, "rb") as f: - content = f.read(8) - else: - content = file_path_or_bytes[:8] - - # Check PDF magic number - assert content.startswith(b"%PDF-"), ( - f"File does not start with PDF magic number, got: {content!r}" - ) - else: - raise ValueError("Input must be file path string or bytes") - - -@pytest.mark.skipif(not API_KEY, reason="No API key configured in integration_config.py") -class TestCreateRedactionsIntegration: - """Integration tests for create_redactions methods.""" - - @pytest.fixture - def client(self): - """Create a client with the configured API key.""" - return NutrientClient(api_key=API_KEY, timeout=TIMEOUT) - - @pytest.fixture - def sample_pdf_with_sensitive_data(self, tmp_path): - """Create a PDF with sensitive data for testing redactions.""" - # For now, we'll use a sample PDF. In a real scenario, we'd create one with sensitive data - sample_path = Path(__file__).parent.parent / "data" / "sample.pdf" - if not sample_path.exists(): - pytest.skip(f"Sample PDF not found at {sample_path}") - return str(sample_path) - - def test_create_redactions_preset_ssn(self, client, sample_pdf_with_sensitive_data): - """Test creating redactions with SSN preset.""" - result = client.create_redactions_preset( - sample_pdf_with_sensitive_data, preset="social-security-number" - ) - assert_is_pdf(result) - assert len(result) > 0 - - def test_create_redactions_preset_with_output_file( - self, client, sample_pdf_with_sensitive_data, tmp_path - ): - """Test creating redactions with preset and saving to file.""" - output_path = tmp_path / "redacted_preset.pdf" - result = client.create_redactions_preset( - sample_pdf_with_sensitive_data, - preset="international-phone-number", - output_path=str(output_path), - ) - assert result is None - assert output_path.exists() - assert_is_pdf(str(output_path)) - - def test_create_redactions_regex(self, client, 
sample_pdf_with_sensitive_data): - """Test creating redactions with regex pattern.""" - # Pattern for simple numbers (which should exist in any PDF) - result = client.create_redactions_regex( - sample_pdf_with_sensitive_data, pattern=r"\d+", case_sensitive=False - ) - assert_is_pdf(result) - assert len(result) > 0 - - def test_create_redactions_text(self, client, sample_pdf_with_sensitive_data): - """Test creating redactions for exact text matches.""" - # Use a very common letter that should exist - result = client.create_redactions_text( - sample_pdf_with_sensitive_data, - text="a", - case_sensitive=False, - ) - assert_is_pdf(result) - assert len(result) > 0 - - def test_create_redactions_with_appearance(self, client, sample_pdf_with_sensitive_data): - """Test creating redactions with custom appearance.""" - result = client.create_redactions_text( - sample_pdf_with_sensitive_data, - text="e", # Very common letter - case_sensitive=False, - appearance_fill_color="#FF0000", - appearance_stroke_color="#000000", - ) - assert_is_pdf(result) - assert len(result) > 0 - - -@pytest.mark.skipif(not API_KEY, reason="No API key configured in integration_config.py") -class TestOptimizePDFIntegration: - """Integration tests for optimize_pdf method.""" - - @pytest.fixture - def client(self): - """Create a client with the configured API key.""" - return NutrientClient(api_key=API_KEY, timeout=TIMEOUT) - - @pytest.fixture - def sample_pdf_path(self): - """Get path to sample PDF file.""" - sample_path = Path(__file__).parent.parent / "data" / "sample.pdf" - if not sample_path.exists(): - pytest.skip(f"Sample PDF not found at {sample_path}") - return str(sample_path) - - def test_optimize_pdf_basic(self, client, sample_pdf_path): - """Test basic PDF optimization.""" - result = client.optimize_pdf(sample_pdf_path) - assert_is_pdf(result) - assert len(result) > 0 - - def test_optimize_pdf_grayscale(self, client, sample_pdf_path): - """Test PDF optimization with grayscale options.""" - 
result = client.optimize_pdf( - sample_pdf_path, grayscale_text=True, grayscale_graphics=True, grayscale_images=True - ) - assert_is_pdf(result) - assert len(result) > 0 - - def test_optimize_pdf_image_optimization_quality(self, client, sample_pdf_path): - """Test PDF optimization with image optimization quality.""" - result = client.optimize_pdf(sample_pdf_path, image_optimization_quality=2) - assert_is_pdf(result) - assert len(result) > 0 - - def test_optimize_pdf_linearize(self, client, sample_pdf_path): - """Test PDF optimization with linearization.""" - result = client.optimize_pdf(sample_pdf_path, linearize=True) - assert_is_pdf(result) - assert len(result) > 0 - - def test_optimize_pdf_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test PDF optimization with output file.""" - output_path = tmp_path / "optimized.pdf" - result = client.optimize_pdf( - sample_pdf_path, - grayscale_images=True, - image_optimization_quality=3, - output_path=str(output_path), - ) - assert result is None - assert output_path.exists() - assert_is_pdf(str(output_path)) - - def test_optimize_pdf_invalid_quality_raises_error(self, client, sample_pdf_path): - """Test that invalid image quality raises ValueError.""" - with pytest.raises(ValueError, match="image_optimization_quality must be between 1 and 4"): - client.optimize_pdf(sample_pdf_path, image_optimization_quality=0) - - with pytest.raises(ValueError, match="image_optimization_quality must be between 1 and 4"): - client.optimize_pdf(sample_pdf_path, image_optimization_quality=5) - - with pytest.raises(ValueError, match="No optimization is enabled"): - client.optimize_pdf(sample_pdf_path, image_optimization_quality=None) - - -@pytest.mark.skipif(not API_KEY, reason="No API key configured in integration_config.py") -class TestPasswordProtectPDFIntegration: - """Integration tests for password_protect_pdf method.""" - - @pytest.fixture - def client(self): - """Create a client with the configured API key.""" - return 
NutrientClient(api_key=API_KEY, timeout=TIMEOUT) - - @pytest.fixture - def sample_pdf_path(self): - """Get path to sample PDF file.""" - sample_path = Path(__file__).parent.parent / "data" / "sample.pdf" - if not sample_path.exists(): - pytest.skip(f"Sample PDF not found at {sample_path}") - return str(sample_path) - - def test_password_protect_user_password(self, client, sample_pdf_path): - """Test password protection with user password only.""" - result = client.password_protect_pdf(sample_pdf_path, user_password="test123") - assert_is_pdf(result) - assert len(result) > 0 - - def test_password_protect_both_passwords(self, client, sample_pdf_path): - """Test password protection with both user and owner passwords.""" - result = client.password_protect_pdf( - sample_pdf_path, user_password="user123", owner_password="owner456" - ) - assert_is_pdf(result) - assert len(result) > 0 - - def test_password_protect_with_permissions(self, client, sample_pdf_path): - """Test password protection with custom permissions.""" - result = client.password_protect_pdf( - sample_pdf_path, - user_password="test123", - permissions=["extract", "annotations_and_forms"], - ) - assert_is_pdf(result) - assert len(result) > 0 - - def test_password_protect_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test password protection with output file.""" - output_path = tmp_path / "protected.pdf" - result = client.password_protect_pdf( - sample_pdf_path, - user_password="secret", - owner_password="admin", - permissions=["printing"], - output_path=str(output_path), - ) - assert result is None - assert output_path.exists() - assert_is_pdf(str(output_path)) - - def test_password_protect_no_password_raises_error(self, client, sample_pdf_path): - """Test that no password raises ValueError.""" - with pytest.raises( - ValueError, match="At least one of user_password or owner_password must be provided" - ): - client.password_protect_pdf(sample_pdf_path) - - -@pytest.mark.skipif(not API_KEY, 
reason="No API key configured in integration_config.py") -class TestSetPDFMetadataIntegration: - """Integration tests for set_pdf_metadata method.""" - - @pytest.fixture - def client(self): - """Create a client with the configured API key.""" - return NutrientClient(api_key=API_KEY, timeout=TIMEOUT) - - @pytest.fixture - def sample_pdf_path(self): - """Get path to sample PDF file.""" - sample_path = Path(__file__).parent.parent / "data" / "sample.pdf" - if not sample_path.exists(): - pytest.skip(f"Sample PDF not found at {sample_path}") - return str(sample_path) - - def test_set_pdf_metadata_title_author(self, client, sample_pdf_path): - """Test setting PDF title and author.""" - result = client.set_pdf_metadata( - sample_pdf_path, title="Test Document", author="Test Author" - ) - assert_is_pdf(result) - assert len(result) > 0 - - def test_set_pdf_metadata_all_supported_fields(self, client, sample_pdf_path): - """Test setting all supported PDF metadata fields (title and author).""" - result = client.set_pdf_metadata( - sample_pdf_path, - title="Complete Test Document", - author="John Doe", - ) - assert_is_pdf(result) - assert len(result) > 0 - - def test_set_pdf_metadata_with_output_file(self, client, sample_pdf_path, tmp_path): - """Test setting PDF metadata with output file.""" - output_path = tmp_path / "metadata.pdf" - result = client.set_pdf_metadata( - sample_pdf_path, - title="Output Test", - author="Test Author", - output_path=str(output_path), - ) - assert result is None - assert output_path.exists() - assert_is_pdf(str(output_path)) - - def test_set_pdf_metadata_no_fields_raises_error(self, client, sample_pdf_path): - """Test that no metadata fields raises ValueError.""" - with pytest.raises(ValueError, match="At least one metadata field must be provided"): - client.set_pdf_metadata(sample_pdf_path) - - -@pytest.mark.skipif(not API_KEY, reason="No API key configured in integration_config.py") -class TestApplyInstantJSONIntegration: - """Integration tests 
for apply_instant_json method.""" - - @pytest.fixture - def client(self): - """Create a client with the configured API key.""" - return NutrientClient(api_key=API_KEY, timeout=TIMEOUT) - - @pytest.fixture - def sample_pdf_path(self): - """Get path to sample PDF file.""" - sample_path = Path(__file__).parent.parent / "data" / "sample.pdf" - if not sample_path.exists(): - pytest.skip(f"Sample PDF not found at {sample_path}") - return str(sample_path) - - @pytest.fixture - def sample_instant_json(self, tmp_path): - """Create a sample Instant JSON file.""" - json_content = """{ - "format": "https://pspdfkit.com/instant-json/v1", - "annotations": [ - { - "v": 2, - "type": "pspdfkit/text", - "pageIndex": 0, - "bbox": [100, 100, 200, 150], - "content": "Test annotation", - "fontSize": 14, - "opacity": 1, - "horizontalAlign": "left", - "verticalAlign": "top" - } - ] - }""" - json_path = tmp_path / "annotations.json" - json_path.write_text(json_content) - return str(json_path) - - def test_apply_instant_json_from_file(self, client, sample_pdf_path, sample_instant_json): - """Test applying Instant JSON from file.""" - result = client.apply_instant_json(sample_pdf_path, sample_instant_json) - assert_is_pdf(result) - assert len(result) > 0 - - def test_apply_instant_json_from_bytes(self, client, sample_pdf_path): - """Test applying Instant JSON from bytes.""" - json_bytes = b"""{ - "format": "https://pspdfkit.com/instant-json/v1", - "annotations": [ - { - "v": 2, - "type": "pspdfkit/text", - "pageIndex": 0, - "bbox": [100, 100, 200, 150], - "content": "Test annotation", - "fontSize": 14, - "opacity": 1, - "horizontalAlign": "left", - "verticalAlign": "top" - } - ] - }""" - result = client.apply_instant_json(sample_pdf_path, json_bytes) - assert_is_pdf(result) - assert len(result) > 0 - - def test_apply_instant_json_with_output_file( - self, client, sample_pdf_path, sample_instant_json, tmp_path - ): - """Test applying Instant JSON with output file.""" - output_path = tmp_path 
/ "annotated.pdf" - result = client.apply_instant_json( - sample_pdf_path, sample_instant_json, output_path=str(output_path) - ) - assert result is None - assert output_path.exists() - assert_is_pdf(str(output_path)) - - @pytest.mark.skip(reason="Requires valid Instant JSON URL") - def test_apply_instant_json_from_url(self, client, sample_pdf_path): - """Test applying Instant JSON from URL.""" - # This test would require a valid URL with Instant JSON content - pass - - -@pytest.mark.skipif(not API_KEY, reason="No API key configured in integration_config.py") -class TestApplyXFDFIntegration: - """Integration tests for apply_xfdf method.""" - - @pytest.fixture - def client(self): - """Create a client with the configured API key.""" - return NutrientClient(api_key=API_KEY, timeout=TIMEOUT) - - @pytest.fixture - def sample_pdf_path(self): - """Get path to sample PDF file.""" - sample_path = Path(__file__).parent.parent / "data" / "sample.pdf" - if not sample_path.exists(): - pytest.skip(f"Sample PDF not found at {sample_path}") - return str(sample_path) - - @pytest.fixture - def sample_xfdf(self, tmp_path): - """Create a sample XFDF file.""" - xfdf_content = """ - - - - Test XFDF annotation - - -""" - xfdf_path = tmp_path / "annotations.xfdf" - xfdf_path.write_text(xfdf_content) - return str(xfdf_path) - - def test_apply_xfdf_from_file(self, client, sample_pdf_path, sample_xfdf): - """Test applying XFDF from file.""" - result = client.apply_xfdf(sample_pdf_path, sample_xfdf) - assert_is_pdf(result) - assert len(result) > 0 - - def test_apply_xfdf_from_bytes(self, client, sample_pdf_path): - """Test applying XFDF from bytes.""" - xfdf_bytes = b""" - - - - -""" - result = client.apply_xfdf(sample_pdf_path, xfdf_bytes) - assert_is_pdf(result) - assert len(result) > 0 - - def test_apply_xfdf_with_output_file(self, client, sample_pdf_path, sample_xfdf, tmp_path): - """Test applying XFDF with output file.""" - output_path = tmp_path / "xfdf_annotated.pdf" - result = 
client.apply_xfdf(sample_pdf_path, sample_xfdf, output_path=str(output_path)) - assert result is None - assert output_path.exists() - assert_is_pdf(str(output_path)) - - @pytest.mark.skip(reason="Requires valid XFDF URL") - def test_apply_xfdf_from_url(self, client, sample_pdf_path): - """Test applying XFDF from URL.""" - # This test would require a valid URL with XFDF content - pass diff --git a/tests/integration/test_smoke.py b/tests/integration/test_smoke.py deleted file mode 100644 index e9b20bb..0000000 --- a/tests/integration/test_smoke.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Basic smoke test to validate integration test setup.""" - -import pytest - -from nutrient_dws import NutrientClient - -# Type annotation for mypy -API_KEY: str | None = None - -try: - from . import integration_config # type: ignore[attr-defined] - - API_KEY = integration_config.API_KEY -except (ImportError, AttributeError): - API_KEY = None - - -@pytest.mark.skipif(not API_KEY, reason="No API key available") -def test_api_connection(): - """Test that we can connect to the API.""" - client = NutrientClient(api_key=API_KEY) - # Just verify client initialization works - assert client._api_key == API_KEY - assert hasattr(client, "convert_to_pdf") - assert hasattr(client, "build") diff --git a/tests/integration/test_watermark_image_file_integration.py b/tests/integration/test_watermark_image_file_integration.py deleted file mode 100644 index 09a1b4d..0000000 --- a/tests/integration/test_watermark_image_file_integration.py +++ /dev/null @@ -1,236 +0,0 @@ -"""Integration tests for image file watermark functionality.""" - -import os -from pathlib import Path - -import pytest - -from nutrient_dws import NutrientClient - -try: - from . 
import integration_config # type: ignore[attr-defined] - - API_KEY: str | None = integration_config.API_KEY - BASE_URL: str | None = getattr(integration_config, "BASE_URL", None) - TIMEOUT: int = getattr(integration_config, "TIMEOUT", 60) -except ImportError: - API_KEY = None - BASE_URL = None - TIMEOUT = 60 - - -def assert_is_pdf(file_path_or_bytes: str | bytes) -> None: - """Assert that a file or bytes is a valid PDF.""" - if isinstance(file_path_or_bytes, str): - with open(file_path_or_bytes, "rb") as f: - content = f.read(8) - else: - content = file_path_or_bytes[:8] - - assert content.startswith(b"%PDF-"), ( - f"File does not start with PDF magic number, got: {content!r}" - ) - - -def create_test_image(tmp_path: Path, filename: str = "watermark.png") -> str: - """Create a simple test PNG image.""" - try: - # Try to use PIL to create a proper image - from PIL import Image - - img = Image.new("RGB", (100, 100), color="red") - image_path = tmp_path / filename - img.save(str(image_path)) - return str(image_path) - except ImportError: - # Fallback to a simple but valid PNG if PIL is not available - # This is a 50x50 red PNG image - png_data = ( - b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a\x00\x00\x00\x0d\x49\x48\x44\x52" - b"\x00\x00\x00\x32\x00\x00\x00\x32\x08\x02\x00\x00\x00\x91\x5d\x1f" - b"\xe6\x00\x00\x00\x4b\x49\x44\x41\x54\x78\x9c\xed\xce\xb1\x01\x00" - b"\x10\x00\xc0\x30\xfc\xff\x33\x0f\x58\x32\x31\x34\x17\x64\xee\xf1" - b"\xa3\xf5\x3a\x70\x57\x4b\xd4\x12\xb5\x44\x2d\x51\x4b\xd4\x12\xb5" - b"\x44\x2d\x51\x4b\xd4\x12\xb5\x44\x2d\x51\x4b\xd4\x12\xb5\x44\x2d" - b"\x51\x4b\xd4\x12\xb5\x44\x2d\x51\x4b\xd4\x12\xb5\x44\x2d\x71\x00" - b"\x41\xaa\x01\x63\x85\xb8\x32\xab\x00\x00\x00\x00\x49\x45\x4e\x44" - b"\xae\x42\x60\x82" - ) - image_path = tmp_path / filename - image_path.write_bytes(png_data) - return str(image_path) - - -@pytest.mark.skipif(not API_KEY, reason="No API key configured in integration_config.py") -class TestWatermarkImageFileIntegration: - """Integration 
tests for image file watermark functionality.""" - - @pytest.fixture - def client(self): - """Create a client with the configured API key.""" - client = NutrientClient(api_key=API_KEY, timeout=TIMEOUT) - yield client - client.close() - - @pytest.fixture - def sample_pdf_path(self): - """Get path to sample PDF file for testing.""" - return os.path.join(os.path.dirname(__file__), "..", "data", "sample.pdf") - - def test_watermark_pdf_with_image_file_path(self, client, sample_pdf_path, tmp_path): - """Test watermark_pdf with local image file path.""" - # Create a test image - image_path = create_test_image(tmp_path) - - result = client.watermark_pdf( - sample_pdf_path, - image_file=image_path, - width=100, - height=50, - opacity=0.5, - position="bottom-right", - ) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_watermark_pdf_with_image_bytes(self, client, sample_pdf_path): - """Test watermark_pdf with image as bytes.""" - # Create a proper PNG image as bytes - try: - import io - - from PIL import Image - - img = Image.new("RGB", (100, 100), color="blue") - img_buffer = io.BytesIO() - img.save(img_buffer, format="PNG") - png_bytes = img_buffer.getvalue() - except ImportError: - # Fallback to a 50x50 red PNG if PIL is not available - png_bytes = ( - b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a\x00\x00\x00\x0d\x49\x48\x44\x52" - b"\x00\x00\x00\x32\x00\x00\x00\x32\x08\x02\x00\x00\x00\x91\x5d\x1f" - b"\xe6\x00\x00\x00\x4b\x49\x44\x41\x54\x78\x9c\xed\xce\xb1\x01\x00" - b"\x10\x00\xc0\x30\xfc\xff\x33\x0f\x58\x32\x31\x34\x17\x64\xee\xf1" - b"\xa3\xf5\x3a\x70\x57\x4b\xd4\x12\xb5\x44\x2d\x51\x4b\xd4\x12\xb5" - b"\x44\x2d\x51\x4b\xd4\x12\xb5\x44\x2d\x51\x4b\xd4\x12\xb5\x44\x2d" - b"\x51\x4b\xd4\x12\xb5\x44\x2d\x51\x4b\xd4\x12\xb5\x44\x2d\x71\x00" - b"\x41\xaa\x01\x63\x85\xb8\x32\xab\x00\x00\x00\x00\x49\x45\x4e\x44" - b"\xae\x42\x60\x82" - ) - - result = client.watermark_pdf( - sample_pdf_path, - image_file=png_bytes, - width=150, - 
height=75, - opacity=0.8, - position="top-left", - ) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_watermark_pdf_with_image_file_output_path(self, client, sample_pdf_path, tmp_path): - """Test watermark_pdf with image file saving to output path.""" - # Create a test image - image_path = create_test_image(tmp_path) - output_path = str(tmp_path / "watermarked_with_image.pdf") - - result = client.watermark_pdf( - sample_pdf_path, - image_file=image_path, - width=200, - height=100, - opacity=0.7, - position="center", - output_path=output_path, - ) - - assert result is None - assert (tmp_path / "watermarked_with_image.pdf").exists() - assert (tmp_path / "watermarked_with_image.pdf").stat().st_size > 0 - assert_is_pdf(output_path) - - def test_watermark_pdf_with_file_like_object(self, client, sample_pdf_path, tmp_path): - """Test watermark_pdf with image as file-like object.""" - # Create a test image - image_path = create_test_image(tmp_path) - - # Read as file-like object - with open(image_path, "rb") as image_file: - result = client.watermark_pdf( - sample_pdf_path, - image_file=image_file, - width=120, - height=60, - opacity=0.6, - position="top-center", - ) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_builder_api_with_image_file_watermark(self, client, sample_pdf_path, tmp_path): - """Test Builder API with image file watermark.""" - # Create a test image - image_path = create_test_image(tmp_path) - - # Use builder API - result = ( - client.build(sample_pdf_path) - .add_step( - "watermark-pdf", - options={ - "image_file": image_path, - "width": 180, - "height": 90, - "opacity": 0.4, - "position": "bottom-left", - }, - ) - .execute() - ) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) - - def test_multiple_watermarks_with_image_files(self, client, sample_pdf_path, tmp_path): - """Test applying multiple watermarks including 
image files.""" - # Create test images - image1_path = create_test_image(tmp_path, "watermark1.png") - - # Chain multiple watermark operations - result = ( - client.build(sample_pdf_path) - .add_step( - "watermark-pdf", - options={ - "text": "DRAFT", - "width": 200, - "height": 100, - "opacity": 0.3, - "position": "center", - }, - ) - .add_step( - "watermark-pdf", - options={ - "image_file": image1_path, - "width": 100, - "height": 50, - "opacity": 0.5, - "position": "top-right", - }, - ) - .execute() - ) - - assert isinstance(result, bytes) - assert len(result) > 0 - assert_is_pdf(result) diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..3e57352 --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,1133 @@ +"""Integration tests for Nutrient DWS Python Client. + +These tests require a valid API key and make real API calls. +Set NUTRIENT_API_KEY environment variable to run these tests. + +To run these tests with a live API: +1. Set NUTRIENT_API_KEY environment variable +2. 
Run: NUTRIENT_API_KEY=your_key pytest tests/test_integration.py +""" + +import os +import pytest +from dotenv import load_dotenv + +from nutrient_dws import NutrientClient +from nutrient_dws.builder.constant import BuildActions +from nutrient_dws.errors import NutrientError, ValidationError +from tests.helpers import ( + TestDocumentGenerator, + ResultValidator, + sample_pdf, + sample_docx, + sample_png, +) + +load_dotenv() # Load environment variables + +# Skip integration tests unless explicitly enabled with valid API key +should_run_integration_tests = bool(os.getenv("NUTRIENT_API_KEY")) + +# Use conditional pytest.mark based on environment +pytestmark = pytest.mark.skipif( + not should_run_integration_tests, + reason="Integration tests require NUTRIENT_API_KEY environment variable", +) + +@pytest.fixture(scope="class") +def integration_client(): + """Create client instance for testing.""" + return NutrientClient(api_key=os.getenv("NUTRIENT_API_KEY", ""), base_url=os.getenv("NUTRIENT_BASE_URL", "https://api.nutrient.io")) + + +class TestIntegrationDirectMethods: + """Integration tests with live API - direct client methods.""" + + def test_account_and_authentication_methods(self, integration_client): + """Test account information and authentication methods.""" + + @pytest.mark.asyncio + async def test_get_account_info(self, integration_client): + """Test retrieving account information.""" + account_info = await integration_client.get_account_info() + + assert account_info is not None + assert "subscriptionType" in account_info + assert isinstance(account_info["subscriptionType"], str) + assert "apiKeys" in account_info + + @pytest.mark.asyncio + async def test_create_and_delete_token(self, integration_client): + """Test creating and deleting authentication tokens.""" + token_params = { + "expirationTime": 0, + } + + token = await integration_client.create_token(token_params) + + assert token is not None + assert "id" in token + assert isinstance(token["id"], str) 
+ assert "accessToken" in token + assert isinstance(token["accessToken"], str) + + # Clean up - delete the token we just created + await integration_client.delete_token(token["id"]) + + @pytest.mark.asyncio + async def test_sign_pdf_document(self, integration_client): + """Test signing PDF documents.""" + result = await integration_client.sign(sample_pdf) + + assert result is not None + assert isinstance(result["buffer"], (bytes, bytearray)) + assert result["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_sign_pdf_with_custom_image(self, integration_client): + """Test signing PDF with custom signature image.""" + result = await integration_client.sign( + sample_pdf, + None, + { + "image": sample_png, + }, + ) + + assert result is not None + assert isinstance(result["buffer"], (bytes, bytearray)) + assert result["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_create_redactions_ai(self, integration_client): + """Test AI-powered redaction functionality.""" + sensitive_document = TestDocumentGenerator.generate_pdf_with_sensitive_data() + result = await integration_client.create_redactions_ai(sensitive_document, "Redact Email") + + assert result is not None + assert isinstance(result["buffer"], (bytes, bytearray)) + assert result["mimeType"] == "application/pdf" + assert result["filename"] == "output.pdf" + + @pytest.mark.asyncio + async def test_create_redactions_ai_with_page_range(self, integration_client): + """Test AI redaction with specific page range.""" + result = await integration_client.create_redactions_ai( + sample_pdf, + "Redact Email", + "apply", + { + "start": 1, + "end": 2, + }, + ) + + assert result is not None + assert isinstance(result["buffer"], (bytes, bytearray)) + assert result["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + @pytest.mark.parametrize( + "input_data,input_type,output_type,expected_mime", + [ + (sample_pdf, "pdf", "pdfa", "application/pdf"), + (sample_pdf, "pdf", 
"pdfua", "application/pdf"), + ( + sample_pdf, + "pdf", + "docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ), + ( + sample_pdf, + "pdf", + "xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ( + sample_pdf, + "pdf", + "pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + (sample_docx, "docx", "pdf", "application/pdf"), + (sample_pdf, "pdf", "png", "image/png"), + (sample_pdf, "pdf", "jpeg", "image/jpeg"), + (sample_pdf, "pdf", "jpg", "image/jpeg"), + (sample_pdf, "pdf", "webp", "image/webp"), + # (sample_pdf, "pdf", "html", "text/html"), # FIXME: 500 error upstream + (sample_pdf, "pdf", "markdown", "text/markdown"), + ], + ) + async def test_convert_formats( + self, integration_client, input_data, input_type, output_type, expected_mime + ): + """Test document format conversion.""" + result = await integration_client.convert(input_data, output_type) + + assert result is not None + + if output_type not in ["markdown", "html"]: + assert isinstance(result.get("buffer"), (bytes, bytearray)) + else: + assert isinstance(result.get("content"), str) + assert result["mimeType"] == expected_mime + + @pytest.mark.asyncio + async def test_ocr_single_language(self, integration_client): + """Test OCR with single language.""" + result = await integration_client.ocr(sample_png, "english") + + assert result is not None + assert isinstance(result["buffer"], (bytes, bytearray)) + assert result["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_ocr_multiple_languages(self, integration_client): + """Test OCR with multiple languages.""" + result = await integration_client.ocr(sample_png, ["english", "spanish"]) + + assert result is not None + assert isinstance(result["buffer"], (bytes, bytearray)) + assert result["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_watermark_text(self, integration_client): + """Test text 
watermarking.""" + result = await integration_client.watermark_text( + sample_pdf, + "CONFIDENTIAL", + { + "opacity": 0.5, + "fontSize": 48, + "rotation": 45, + }, + ) + + assert result is not None + assert isinstance(result["buffer"], (bytes, bytearray)) + assert result["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_watermark_image(self, integration_client): + """Test image watermarking.""" + result = await integration_client.watermark_image( + sample_pdf, + sample_png, + { + "opacity": 0.5, + }, + ) + + assert result is not None + assert isinstance(result["buffer"], (bytes, bytearray)) + assert result["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_merge_pdf_files(self, integration_client): + """Test merging multiple PDF files.""" + result = await integration_client.merge([sample_pdf, sample_pdf, sample_pdf]) + + assert result is not None + assert isinstance(result["buffer"], (bytes, bytearray)) + assert result["mimeType"] == "application/pdf" + # Merged PDF should be larger than single PDF + assert len(result["buffer"]) > len(sample_pdf) + + @pytest.mark.asyncio + @pytest.mark.parametrize( + "optimization_options", + [ + {"imageOptimizationQuality": 1}, # low + {"imageOptimizationQuality": 2}, # medium + {"imageOptimizationQuality": 3}, # high + {"imageOptimizationQuality": 4, "mrcCompression": True}, # maximum + ], + ) + async def test_optimize_pdf(self, integration_client, optimization_options): + """Test PDF optimization with different options.""" + result = await integration_client.optimize(sample_pdf, optimization_options) + + assert result is not None + assert isinstance(result["buffer"], (bytes, bytearray)) + assert result["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_extract_text(self, integration_client): + """Test text extraction from PDF.""" + result = await integration_client.extract_text(sample_pdf) + + assert result is not None + assert "data" in result + assert 
isinstance(result["data"], dict) + + @pytest.mark.asyncio + async def test_flatten_pdf(self, integration_client): + """Test flattening PDF annotations.""" + result = await integration_client.flatten(sample_pdf) + + assert result is not None + assert result["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_rotate_pdf(self, integration_client): + """Test rotating PDF pages.""" + result = await integration_client.rotate(sample_pdf, 90) + + assert result is not None + assert result["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_rotate_pdf_page_range(self, integration_client): + """Test rotating specific page range.""" + result = await integration_client.rotate(sample_pdf, 180, {"start": 1, "end": 3}) + + assert result is not None + assert result["mimeType"] == "application/pdf" + + +class TestIntegrationErrorHandling: + """Test error handling scenarios with live API.""" + + @pytest.mark.asyncio + async def test_invalid_file_input(self, integration_client): + """Test handling of invalid file input.""" + with pytest.raises((ValidationError, NutrientError)): + await integration_client.convert(None, "pdf") + + @pytest.mark.asyncio + async def test_invalid_api_key(self): + """Test handling of invalid API key.""" + invalid_client = NutrientClient(api_key="invalid-api-key") + + with pytest.raises(NutrientError, match="HTTP 401"): + await invalid_client.convert(b"test", "pdf") + + @pytest.mark.asyncio + async def test_network_timeout(self): + """Test handling of network timeouts.""" + timeout_client = NutrientClient(api_key=os.getenv("NUTRIENT_API_KEY", ""), timeout=1) + + with pytest.raises(NutrientError): + await timeout_client.convert(sample_docx, "pdf") + + +class TestIntegrationWorkflowBuilder: + """Integration tests for workflow builder with live API.""" + + @pytest.mark.asyncio + async def test_complex_workflow_multiple_parts_actions(self, integration_client): + """Test complex workflow with multiple parts and 
actions.""" + pdf1 = TestDocumentGenerator.generate_pdf_with_table() + pdf2 = TestDocumentGenerator.generate_pdf_with_sensitive_data() + html = TestDocumentGenerator.generate_html_content() + + result = await ( + integration_client.workflow() + .add_file_part(pdf1, None, [BuildActions.rotate(90)]) + .add_html_part(html) + .add_file_part(pdf2) + .add_new_page({"layout": {"size": "A4"}}) + .apply_actions( + [ + BuildActions.watermark_text("DRAFT", {"opacity": 0.3}), + BuildActions.flatten(), + ] + ) + .output_pdfua() + .execute(on_progress=lambda step, total: None) + ) + + assert result["success"] is True + assert isinstance(result["output"]["buffer"], (bytes, bytearray)) + assert result["output"]["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_workflow_dry_run(self, integration_client): + """Test workflow dry run analysis.""" + result = await ( + integration_client.workflow() + .add_file_part(sample_pdf) + .apply_action(BuildActions.ocr(["english", "french"])) + .output_pdf() + .dry_run() + ) + + assert result["success"] is True + assert "analysis" in result + assert result["analysis"]["cost"] >= 0 + assert "required_features" in result["analysis"] + + @pytest.mark.asyncio + async def test_workflow_redaction_actions(self, integration_client): + """Test workflow with redaction actions.""" + result = await ( + integration_client.workflow() + .add_file_part(sample_pdf) + .apply_actions( + [ + BuildActions.create_redactions_text( + "confidential", {}, {"caseSensitive": False} + ), + BuildActions.apply_redactions(), + ] + ) + .output_pdf() + .execute() + ) + + assert result["success"] is True + assert result["output"]["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_workflow_regex_redactions(self, integration_client): + """Test workflow with regex redaction actions.""" + result = await ( + integration_client.workflow() + .add_file_part(sample_pdf) + .apply_actions( + [ + BuildActions.create_redactions_regex( + 
r"\d{3}-\d{2}-\d{4}", {}, {"caseSensitive": False} + ), + BuildActions.apply_redactions(), + ] + ) + .output_pdf() + .execute() + ) + + assert result["success"] is True + assert result["output"]["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_workflow_preset_redactions(self, integration_client): + """Test workflow with preset redaction actions.""" + result = await ( + integration_client.workflow() + .add_file_part(sample_pdf) + .apply_actions( + [ + BuildActions.create_redactions_preset("email-address"), + BuildActions.apply_redactions(), + ] + ) + .output_pdf() + .execute() + ) + + assert result["success"] is True + assert result["output"]["mimeType"] == "application/pdf" + + @pytest.mark.asyncio + async def test_workflow_instant_json_xfdf(self, integration_client): + """Test workflow with Instant JSON and XFDF actions.""" + pdf_file = sample_pdf + json_file = TestDocumentGenerator.generate_instant_json_content() + xfdf_file = TestDocumentGenerator.generate_xfdf_content() + + # Test apply_instant_json + instant_json_result = await ( + integration_client.workflow() + .add_file_part(pdf_file) + .apply_action(BuildActions.apply_instant_json(json_file)) + .output_pdf() + .execute() + ) + + assert instant_json_result["success"] is True + assert instant_json_result["output"]["mimeType"] == "application/pdf" + + # Test apply_xfdf + xfdf_result = await ( + integration_client.workflow() + .add_file_part(pdf_file) + .apply_action(BuildActions.apply_xfdf(xfdf_file)) + .output_pdf() + .execute() + ) + + assert xfdf_result["success"] is True + assert xfdf_result["output"]["mimeType"] == "application/pdf" + + +class TestIntegrationRedactionOperations: + """Test redaction operations with live API.""" + + @pytest.mark.asyncio + async def test_text_based_redactions(self, integration_client, test_sensitive_pdf): + """Test text-based redactions.""" + result = await ( + integration_client.workflow() + .add_file_part(test_sensitive_pdf) + .apply_actions( + [ 
+ BuildActions.create_redactions_text("123-45-6789"), + BuildActions.create_redactions_text("john.doe@example.com"), + BuildActions.apply_redactions(), + ] + ) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_regex_redactions_ssn_pattern(self, integration_client, test_sensitive_pdf): + """Test regex redactions for SSN pattern.""" + result = await ( + integration_client.workflow() + .add_file_part(test_sensitive_pdf) + .apply_actions( + [ + BuildActions.create_redactions_regex( + r"\d{3}-\d{2}-\d{4}" + ), # SSN pattern + BuildActions.apply_redactions(), + ] + ) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_multiple_regex_patterns(self, integration_client, test_sensitive_pdf): + """Test multiple regex redaction patterns.""" + result = await ( + integration_client.workflow() + .add_file_part(test_sensitive_pdf) + .apply_actions( + [ + BuildActions.create_redactions_regex( + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}" + ), # Email + BuildActions.create_redactions_regex( + r"\(\d{3}\) \d{3}-\d{4}" + ), # Phone + BuildActions.create_redactions_regex( + r"\d{4}-\d{4}-\d{4}-\d{4}" + ), # Credit card + BuildActions.apply_redactions(), + ] + ) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_preset_redactions_common_patterns(self, integration_client, test_sensitive_pdf): + """Test preset redactions for common patterns.""" + result = await ( + integration_client.workflow() + .add_file_part(test_sensitive_pdf) + .apply_actions( + [ + BuildActions.create_redactions_preset("email-address"), + BuildActions.create_redactions_preset("international-phone-number"), + BuildActions.create_redactions_preset("social-security-number"), + BuildActions.apply_redactions(), + ] + ) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + 
+class TestIntegrationImageWatermarking: + """Test image watermarking with live API.""" + + @pytest.mark.asyncio + async def test_image_watermark_basic(self, integration_client, test_table_pdf): + """Test basic image watermarking.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .apply_action( + BuildActions.watermark_image( + sample_png, + { + "opacity": 0.3, + "width": {"value": 200, "unit": "pt"}, + "height": {"value": 100, "unit": "pt"}, + }, + ) + ) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_image_watermark_custom_positioning(self, integration_client, test_table_pdf): + """Test image watermarking with custom positioning.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .apply_action( + BuildActions.watermark_image( + sample_png, + { + "opacity": 0.5, + "width": {"value": 150, "unit": "pt"}, + "height": {"value": 150, "unit": "pt"}, + "top": {"value": 100, "unit": "pt"}, + "left": {"value": 100, "unit": "pt"}, + }, + ) + ) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + +class TestIntegrationHtmlToPdfConversion: + """Test HTML to PDF conversion with live API.""" + + + + @pytest.mark.asyncio + async def test_html_to_pdf_default_settings(self, integration_client, test_html_content): + """Test HTML to PDF conversion with default settings.""" + result = await ( + integration_client.workflow().add_html_part(test_html_content).output_pdf().execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_html_with_actions(self, integration_client, test_html_content): + """Test HTML conversion with applied actions.""" + result = await ( + integration_client.workflow() + .add_html_part(test_html_content) + .apply_actions( + [ + BuildActions.watermark_text("DRAFT", {"opacity": 0.3}), + BuildActions.flatten(), + ] + ) + .output_pdf() + 
.execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_combine_html_with_pdf(self, integration_client, test_html_content): + """Test combining HTML with existing PDF.""" + test_table_pdf = TestDocumentGenerator.generate_pdf_with_table() + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .add_html_part(test_html_content) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + +class TestIntegrationAnnotationOperations: + """Test annotation operations with live API.""" + + @pytest.mark.asyncio + async def test_apply_xfdf_annotations( + self, integration_client, test_table_pdf, test_xfdf_content + ): + """Test applying XFDF annotations to PDF.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .apply_action(BuildActions.apply_xfdf(test_xfdf_content)) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_apply_xfdf_and_flatten( + self, integration_client, test_table_pdf, test_xfdf_content + ): + """Test applying XFDF and flattening annotations.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .apply_actions( + [BuildActions.apply_xfdf(test_xfdf_content), BuildActions.flatten()] + ) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_apply_instant_json_annotations( + self, integration_client, test_table_pdf, test_instant_json_content + ): + """Test applying Instant JSON annotations.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .apply_action(BuildActions.apply_instant_json(test_instant_json_content)) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + +class TestIntegrationAdvancedPdfOptions: + """Test advanced PDF options with live API.""" + + @pytest.mark.asyncio + 
async def test_password_protected_pdf(self, integration_client, test_sensitive_pdf): + """Test creating password-protected PDF.""" + result = await ( + integration_client.workflow() + .add_file_part(test_sensitive_pdf) + .output_pdf( + { + "user_password": "user123", + "owner_password": "owner456", + } + ) + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_pdf_permissions(self, integration_client, test_table_pdf): + """Test setting PDF permissions.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .output_pdf( + { + "owner_password": "owner123", + "user_permissions": ["printing", "extract", "fill_forms"], + } + ) + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_pdf_metadata(self, integration_client, test_table_pdf): + """Test setting PDF metadata.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .output_pdf( + { + "metadata": { + "title": "Test Document", + "author": "Test Author", + }, + } + ) + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_pdf_optimization_advanced(self, integration_client, test_table_pdf): + """Test PDF optimization with advanced settings.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .output_pdf( + { + "optimize": { + "mrcCompression": True, + "imageOptimizationQuality": 3, + "linearize": True, + }, + } + ) + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_pdfa_advanced_options(self, integration_client, test_table_pdf): + """Test PDF/A with specific conformance level.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .output_pdfa( + { + "conformance": "pdfa-2a", + "vectorization": True, + "rasterization": True, + } + ) + .execute() + ) + + 
ResultValidator.validate_pdf_output(result) + + +class TestIntegrationOfficeFormatOutputs: + """Test Office format outputs with live API.""" + + @pytest.mark.asyncio + async def test_pdf_to_excel(self, integration_client, test_table_pdf): + """Test converting PDF to Excel (XLSX).""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .output_office("xlsx") + .execute() + ) + + ResultValidator.validate_office_output(result, "xlsx") + + @pytest.mark.asyncio + async def test_pdf_to_powerpoint(self, integration_client, test_table_pdf): + """Test converting PDF to PowerPoint (PPTX).""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .output_office("pptx") + .execute() + ) + + ResultValidator.validate_office_output(result, "pptx") + + +class TestIntegrationImageOutputOptions: + """Test image output options with live API.""" + + @pytest.mark.asyncio + async def test_pdf_to_jpeg_custom_dpi(self, integration_client, test_table_pdf): + """Test converting PDF to JPEG with custom DPI.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .output_image("jpeg", {"dpi": 300}) + .execute() + ) + + ResultValidator.validate_image_output(result, "jpeg") + + @pytest.mark.asyncio + async def test_pdf_to_webp(self, integration_client, test_table_pdf): + """Test converting PDF to WebP format.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .output_image("webp", {"height": 300}) + .execute() + ) + + ResultValidator.validate_image_output(result, "webp") + + +class TestIntegrationJsonContentExtraction: + """Test JSON content extraction with live API.""" + + @pytest.mark.asyncio + async def test_extract_tables(self, integration_client, test_table_pdf): + """Test extracting tables from PDF.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .output_json({"tables": True}) + .execute() + ) + + 
ResultValidator.validate_json_output(result) + + @pytest.mark.asyncio + async def test_extract_key_value_pairs(self, integration_client, test_table_pdf): + """Test extracting key-value pairs.""" + result = await ( + integration_client.workflow() + .add_file_part(test_table_pdf) + .output_json({"keyValuePairs": True}) + .execute() + ) + + ResultValidator.validate_json_output(result) + + @pytest.mark.asyncio + async def test_extract_specific_page_range(self, integration_client, test_sensitive_pdf): + """Test extracting content from specific page range.""" + result = await ( + integration_client.workflow() + .add_file_part(test_sensitive_pdf, {"pages": {"start": 0, "end": 0}}) + .output_json() + .execute() + ) + + ResultValidator.validate_json_output(result) + + +class TestIntegrationComplexWorkflows: + """Test complex multi-format workflows with live API.""" + + @pytest.mark.asyncio + async def test_combine_html_pdf_images_with_actions(self, integration_client): + """Test combining HTML, PDF, and images with various actions.""" + test_sensitive_pdf = TestDocumentGenerator.generate_pdf_with_sensitive_data() + test_html_content = TestDocumentGenerator.generate_html_content() + + result = await ( + integration_client.workflow() + # Add existing PDF + .add_file_part(test_sensitive_pdf, None, [BuildActions.rotate(90)]) + # Add HTML content + .add_html_part(test_html_content) + # Add image as new page + .add_file_part(sample_png) + # Add blank page + .add_new_page({"layout": {"size": "A4"}}) + # Apply global actions + .apply_actions( + [ + BuildActions.watermark_text( + "CONFIDENTIAL", + { + "opacity": 0.2, + "fontSize": 60, + "rotation": 45, + }, + ), + BuildActions.flatten(), + ] + ) + .output_pdf({"optimize": {"imageOptimizationQuality": 2}}) + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_document_assembly_with_redactions(self, integration_client): + """Test document assembly with redactions.""" + pdf1 = 
TestDocumentGenerator.generate_simple_pdf_content("SSN: 123-45-6789") + pdf2 = TestDocumentGenerator.generate_simple_pdf_content( + "email: secret@example.com" + ) + + result = await ( + integration_client.workflow() + # First document with redactions + .add_file_part( + pdf1, + None, + [ + BuildActions.create_redactions_regex(r"\d{3}-\d{2}-\d{4}"), + BuildActions.apply_redactions(), + ], + ) + # Second document with different redactions + .add_file_part( + pdf2, + None, + [ + BuildActions.create_redactions_regex( + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}" + ), + BuildActions.apply_redactions(), + ], + ) + # Apply watermark to entire document + .apply_action( + BuildActions.watermark_text( + "REDACTED COPY", + { + "opacity": 0.3, + "fontSize": 48, + "fontColor": "#FF0000", + }, + ) + ) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + +class TestIntegrationErrorScenarios: + """Test error scenarios with live API.""" + + @pytest.mark.asyncio + async def test_invalid_html_content(self, integration_client): + """Test handling of invalid HTML content.""" + invalid_html = b"Invalid HTML" + + result = await ( + integration_client.workflow().add_html_part(invalid_html).output_pdf().execute() + ) + + # Should still succeed with best-effort HTML processing + assert result["success"] is True + + @pytest.mark.asyncio + async def test_invalid_xfdf_content(self, integration_client): + """Test handling of invalid XFDF content.""" + invalid_xfdf = b'' + + result = await ( + integration_client.workflow() + .add_file_part(b"%PDF-1.4") + .apply_action(BuildActions.apply_xfdf(invalid_xfdf)) + .output_pdf() + .execute() + ) + + # Error handling may vary - check if it succeeds or fails gracefully + assert "success" in result + + @pytest.mark.asyncio + async def test_invalid_instant_json(self, integration_client): + """Test handling of invalid Instant JSON.""" + invalid_json = "{ invalid json }" + + result = await ( + 
integration_client.workflow() + .add_file_part(b"%PDF-1.4") + .apply_action(BuildActions.apply_instant_json(invalid_json)) + .output_pdf() + .execute() + ) + + # Error handling may vary - check if it succeeds or fails gracefully + assert "success" in result + + +class TestIntegrationPerformanceAndLimits: + """Test performance and limits with live API.""" + + @pytest.mark.asyncio + async def test_workflow_with_many_actions(self, integration_client): + """Test workflow with many actions.""" + actions = [] + # Add multiple watermarks + for i in range(5): + actions.append( + BuildActions.watermark_text( + f"Layer {i + 1}", + { + "opacity": 0.1, + "fontSize": 20 + i * 10, + "rotation": i * 15, + }, + ) + ) + # Add multiple redaction patterns + actions.extend( + [ + BuildActions.create_redactions_regex(r"\d{3}-\d{2}-\d{4}"), + BuildActions.create_redactions_regex( + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}" + ), + BuildActions.apply_redactions(), + BuildActions.flatten(), + ] + ) + + result = await ( + integration_client.workflow() + .add_file_part(sample_pdf) + .apply_actions(actions) + .output_pdf() + .execute() + ) + + ResultValidator.validate_pdf_output(result) + + @pytest.mark.asyncio + async def test_workflow_with_many_parts(self, integration_client): + """Test workflow with many parts.""" + parts = [] + for i in range(10): + parts.append( + TestDocumentGenerator.generate_simple_pdf_content(f"Page {i + 1}") + ) + + workflow = integration_client.workflow() + for part in parts: + workflow = workflow.add_file_part(part) + + result = await workflow.output_pdf().execute() + + ResultValidator.validate_pdf_output(result) + + +class TestIntegrationPatternsMock: + """Mock integration patterns for CI/development environments.""" + + def test_workflow_builder_pattern(self): + """Test workflow builder pattern structure.""" + client = NutrientClient(api_key="mock-key") + workflow = client.workflow() + + assert workflow is not None + assert hasattr(workflow, 
"add_file_part") + assert callable(workflow.add_file_part) + + def test_all_convenience_methods_available(self): + """Test that all convenience methods are available.""" + client = NutrientClient(api_key="mock-key") + + # Core methods + assert hasattr(client, "workflow") + assert callable(client.workflow) + + # Document conversion methods + assert hasattr(client, "convert") + assert callable(client.convert) + assert hasattr(client, "ocr") + assert callable(client.ocr) + assert hasattr(client, "extract_text") + assert callable(client.extract_text) + + # Document manipulation methods + assert hasattr(client, "watermark_text") + assert callable(client.watermark_text) + assert hasattr(client, "watermark_image") + assert callable(client.watermark_image) + assert hasattr(client, "merge") + assert callable(client.merge) + assert hasattr(client, "optimize") + assert callable(client.optimize) + assert hasattr(client, "flatten") + assert callable(client.flatten) + assert hasattr(client, "rotate") + assert callable(client.rotate) + + def test_type_safety_workflow_builder(self): + """Test type safety with workflow builder.""" + client = NutrientClient(api_key="mock-key") + + # Python should maintain method chaining + stage1 = client.workflow() + stage2 = stage1.add_file_part("test.pdf") + stage3 = stage2.output_pdf() + + # Each stage should have appropriate methods + assert hasattr(stage1, "add_file_part") + assert callable(stage1.add_file_part) + assert hasattr(stage2, "apply_action") + assert callable(stage2.apply_action) + assert hasattr(stage3, "execute") + assert callable(stage3.execute) diff --git a/tests/unit/test_builder.py b/tests/unit/test_builder.py index 23cd422..29ccd6f 100644 --- a/tests/unit/test_builder.py +++ b/tests/unit/test_builder.py @@ -1,500 +1,1234 @@ -"""Comprehensive unit tests for Builder API.""" +"""Tests for StagedWorkflowBuilder functionality.""" -import io -from unittest.mock import Mock, patch +from unittest.mock import AsyncMock, MagicMock, 
patch +from typing import Any import pytest -from nutrient_dws.builder import BuildAPIWrapper - - -class TestBuilderInitialization: - """Test suite for BuildAPIWrapper initialization.""" - - def test_builder_init_with_string_path(self): - """Test builder initialization with string file path.""" - builder = BuildAPIWrapper(None, "test.pdf") - assert builder._input_file == "test.pdf" - assert builder._actions == [] - assert builder._parts == [{"file": "file"}] - assert "file" in builder._files - - def test_builder_init_with_bytes(self): - """Test builder initialization with bytes input.""" - content = b"PDF content" - builder = BuildAPIWrapper(None, content) - assert builder._input_file == content - assert builder._actions == [] - assert builder._parts == [{"file": "file"}] - assert "file" in builder._files - - def test_builder_init_with_file_like_object(self): - """Test builder initialization with file-like object.""" - file_obj = io.BytesIO(b"File content") - file_obj.name = "test.pdf" - - builder = BuildAPIWrapper(None, file_obj) - assert builder._input_file == file_obj - assert builder._actions == [] - assert builder._parts == [{"file": "file"}] - assert "file" in builder._files - - def test_builder_init_with_mock_client(self): - """Test builder initialization with mock client.""" - mock_client = Mock() - builder = BuildAPIWrapper(mock_client, "test.pdf") - assert builder._client == mock_client - - -class TestBuilderAddStep: - """Test suite for BuildAPIWrapper add_step method.""" - - def setup_method(self): - """Set up test fixtures.""" - self.builder = BuildAPIWrapper(None, "test.pdf") - - def test_add_step_basic(self): - """Test adding basic step without options.""" - result = self.builder.add_step("convert-to-pdf") - - assert result is self.builder # Should return self for chaining - assert len(self.builder._actions) == 1 - assert self.builder._actions[0]["type"] == "convert-to-pdf" - - def test_add_step_with_options(self): - """Test adding step with 
options.""" - result = self.builder.add_step("rotate-pages", options={"degrees": 90}) - - assert result is self.builder - assert len(self.builder._actions) == 1 - assert self.builder._actions[0]["type"] == "rotate" - assert self.builder._actions[0]["rotateBy"] == 90 - - def test_add_step_with_complex_options(self): - """Test adding step with complex options.""" - options = { - "text": "CONFIDENTIAL", - "width": 200, - "height": 100, - "opacity": 0.5, - "position": "center", +from nutrient_dws.builder.builder import StagedWorkflowBuilder +from nutrient_dws.builder.constant import BuildActions, BuildOutputs +from nutrient_dws.errors import ValidationError, NutrientError + + + + + +@pytest.fixture +def mock_send_request(): + """Mock the send request method.""" + + async def mock_request(endpoint, data): + if endpoint == "/build": + return b"mock-response" + elif endpoint == "/analyze_build": + return {"cost": 1.0, "required_features": {}} + return b"mock-response" + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + side_effect=mock_request, + ) as mock: + yield mock + + +@pytest.fixture +def mock_validate_file_input(): + """Mock file input validation.""" + with patch("nutrient_dws.inputs.validate_file_input") as mock: + mock.return_value = True + yield mock + + +@pytest.fixture +def mock_is_remote_file_input(): + """Mock remote file input check.""" + with patch("nutrient_dws.inputs.is_remote_file_input") as mock: + mock.return_value = False + yield mock + + +@pytest.fixture +def mock_process_file_input(): + """Mock file input processing.""" + + async def mock_process(file_input): + return ("test-content", "application/pdf") + + with patch( + "nutrient_dws.inputs.process_file_input", side_effect=mock_process + ) as mock: + yield mock + + +class TestStagedWorkflowBuilderConstructor: + """Tests for StagedWorkflowBuilder constructor.""" + + def test_create_workflow_builder_with_valid_options(self, valid_client_options): + """Test creating a 
workflow builder with valid options.""" + builder = StagedWorkflowBuilder(valid_client_options) + assert builder is not None + assert builder.build_instructions == {"parts": []} + assert builder.assets == {} + assert builder.asset_index == 0 + assert builder.current_step == 0 + assert builder.is_executed is False + + +class TestStagedWorkflowBuilderPrivateMethods: + """Tests for StagedWorkflowBuilder private methods.""" + + def test_register_asset_with_valid_file(self, valid_client_options): + """Test registering a valid file asset.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ) as mock_validate: + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ) as mock_is_remote: + builder = StagedWorkflowBuilder(valid_client_options) + test_file = "test.pdf" + + asset_key = builder._register_asset(test_file) + + assert asset_key == "asset_0" + assert builder.assets["asset_0"] == test_file + assert builder.asset_index == 1 + mock_validate.assert_called_once_with(test_file) + mock_is_remote.assert_called_once_with(test_file) + + def test_register_asset_with_invalid_file(self, valid_client_options): + """Test registering an invalid file asset throws ValidationError.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=False + ): + builder = StagedWorkflowBuilder(valid_client_options) + + with pytest.raises( + ValidationError, match="Invalid file input provided to workflow" + ): + builder._register_asset("invalid-file") + + def test_register_asset_with_remote_file(self, valid_client_options): + """Test registering a remote file throws ValidationError.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=True + ): + builder = StagedWorkflowBuilder(valid_client_options) + + with pytest.raises( + ValidationError, + match="Remote file input doesn't 
need to be registered", + ): + builder._register_asset("https://example.com/file.pdf") + + def test_register_multiple_assets_increments_counter(self, valid_client_options): + """Test that registering multiple assets increments the counter.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + builder = StagedWorkflowBuilder(valid_client_options) + + first_key = builder._register_asset("file1.pdf") + second_key = builder._register_asset("file2.pdf") + + assert first_key == "asset_0" + assert second_key == "asset_1" + assert builder.asset_index == 2 + + def test_ensure_not_executed_with_fresh_workflow(self, valid_client_options): + """Test ensure_not_executed passes with fresh workflow.""" + builder = StagedWorkflowBuilder(valid_client_options) + # Should not raise exception + builder._ensure_not_executed() + + def test_ensure_not_executed_with_executed_workflow(self, valid_client_options): + """Test ensure_not_executed throws error when workflow is executed.""" + builder = StagedWorkflowBuilder(valid_client_options) + builder.is_executed = True + + with pytest.raises( + ValidationError, match="This workflow has already been executed" + ): + builder._ensure_not_executed() + + def test_validate_with_no_parts(self, valid_client_options): + """Test validate throws error when workflow has no parts.""" + builder = StagedWorkflowBuilder(valid_client_options) + + with pytest.raises(ValidationError, match="Workflow has no parts to execute"): + builder._validate() + + def test_validate_with_parts_but_no_output(self, valid_client_options): + """Test validate adds default PDF output when no output is specified.""" + builder = StagedWorkflowBuilder(valid_client_options) + builder.build_instructions["parts"] = [{"file": "asset_0"}] + + builder._validate() + + assert builder.build_instructions["output"] == {"type": "pdf"} + + def 
test_validate_with_parts_and_output(self, valid_client_options): + """Test validate passes when workflow has parts and output.""" + builder = StagedWorkflowBuilder(valid_client_options) + builder.build_instructions["parts"] = [{"file": "asset_0"}] + builder.build_instructions["output"] = {"type": "png", "format": "png"} + + # Should not raise exception + builder._validate() + + def test_process_action_with_regular_action(self, valid_client_options): + """Test processing a regular action without file input.""" + builder = StagedWorkflowBuilder(valid_client_options) + action = {"type": "ocr", "language": "english"} + + result = builder._process_action(action) + + assert result == action + + def test_process_action_with_file_input_action( + self, valid_client_options, mock_validate_file_input, mock_is_remote_file_input + ): + """Test processing an action that requires file registration.""" + builder = StagedWorkflowBuilder(valid_client_options) + + # Create a mock action with file input + mock_action = MagicMock() + mock_action.__needsFileRegistration = True + mock_action.fileInput = "watermark.png" + mock_action.createAction.return_value = { + "type": "watermark", + "image": "asset_0", + } + + result = builder._process_action(mock_action) + + assert result == {"type": "watermark", "image": "asset_0"} + mock_action.createAction.assert_called_once_with("asset_0") + + def test_process_action_with_remote_file_input_action( + self, valid_client_options, mock_validate_file_input, mock_is_remote_file_input + ): + """Test processing an action with remote file input.""" + mock_is_remote_file_input.return_value = True + builder = StagedWorkflowBuilder(valid_client_options) + + # Create a mock action with remote file input + mock_action = MagicMock() + mock_action.__needsFileRegistration = True + mock_action.fileInput = "https://example.com/watermark.png" + mock_action.createAction.return_value = { + "type": "watermark", + "image": {"url": "https://example.com/watermark.png"}, 
} - result = self.builder.add_step("watermark-pdf", options=options) - - assert result is self.builder - assert len(self.builder._actions) == 1 - action = self.builder._actions[0] - assert action["type"] == "watermark" - assert action["text"] == "CONFIDENTIAL" - assert action["width"] == 200 - assert action["height"] == 100 - assert action["opacity"] == 0.5 - assert action["position"] == "center" - - def test_add_multiple_steps(self): - """Test adding multiple steps.""" - self.builder.add_step("convert-to-pdf") - self.builder.add_step("rotate-pages", options={"degrees": 90}) - self.builder.add_step("watermark-pdf", options={"text": "DRAFT"}) - - assert len(self.builder._actions) == 3 - assert self.builder._actions[0]["type"] == "convert-to-pdf" - assert self.builder._actions[1]["type"] == "rotate" - assert self.builder._actions[2]["type"] == "watermark" - - -class TestBuilderChaining: - """Test suite for BuildAPIWrapper method chaining.""" - - def test_basic_chaining(self): - """Test basic method chaining.""" - builder = BuildAPIWrapper(None, "test.pdf") - result = ( - builder.add_step("convert-to-pdf") - .add_step("rotate-pages", options={"degrees": 90}) - .add_step("watermark-pdf", options={"text": "DRAFT"}) - ) + + result = builder._process_action(mock_action) + + expected_file_handle = {"url": "https://example.com/watermark.png"} + assert result == {"type": "watermark", "image": expected_file_handle} + mock_action.createAction.assert_called_once_with(expected_file_handle) + + def test_is_action_with_file_input_returns_true_for_file_action( + self, valid_client_options + ): + """Test is_action_with_file_input returns True for actions with file input.""" + builder = StagedWorkflowBuilder(valid_client_options) + + # Create a mock action that needs file registration + mock_action = MagicMock() + mock_action.__needsFileRegistration = True + mock_action.fileInput = "test.png" + mock_action.createAction = MagicMock() + + result = 
builder._is_action_with_file_input(mock_action) + + assert result is True + + def test_is_action_with_file_input_returns_false_for_regular_action( + self, valid_client_options + ): + """Test is_action_with_file_input returns False for regular actions.""" + builder = StagedWorkflowBuilder(valid_client_options) + action = {"type": "ocr", "language": "english"} + + result = builder._is_action_with_file_input(action) + + assert result is False + + @pytest.mark.asyncio + async def test_prepare_files_processes_assets_concurrently( + self, valid_client_options + ): + """Test prepare_files processes all assets concurrently.""" + + async def mock_process(file_input): + if file_input == "file1.pdf": + return ("content1", "application/pdf") + elif file_input == "file2.pdf": + return ("content2", "application/pdf") + return ("test-content", "application/pdf") + + with patch( + "nutrient_dws.builder.builder.process_file_input", side_effect=mock_process + ) as mock_process_file: + builder = StagedWorkflowBuilder(valid_client_options) + builder.assets = {"asset_0": "file1.pdf", "asset_1": "file2.pdf"} + + result = await builder._prepare_files() + + assert result == { + "asset_0": ("content1", "application/pdf"), + "asset_1": ("content2", "application/pdf"), + } + assert mock_process_file.call_count == 2 + + def test_cleanup_resets_builder_state(self, valid_client_options): + """Test cleanup resets the builder to initial state.""" + builder = StagedWorkflowBuilder(valid_client_options) + builder.assets = {"asset_0": "test.pdf"} + builder.asset_index = 5 + builder.current_step = 2 + + builder._cleanup() + + assert builder.assets == {} + assert builder.asset_index == 0 + assert builder.current_step == 0 + assert builder.is_executed is True + + +class TestStagedWorkflowBuilderPartMethods: + """Tests for StagedWorkflowBuilder part methods.""" + + def test_add_file_part_with_local_file( + self, valid_client_options, mock_validate_file_input, mock_is_remote_file_input + ): + """Test 
adding a file part with local file.""" + builder = StagedWorkflowBuilder(valid_client_options) + test_file = "test.pdf" + + result = builder.add_file_part(test_file) assert result is builder - assert len(builder._actions) == 3 - assert all("type" in action for action in builder._actions) - - def test_chaining_with_output_options(self): - """Test chaining with output options.""" - builder = BuildAPIWrapper(None, "test.pdf") - result = ( - builder.add_step("convert-to-pdf") - .set_output_options(metadata={"title": "Test"}, optimize=True) - .add_step("watermark-pdf", options={"text": "FINAL"}) - ) + assert len(builder.build_instructions["parts"]) == 1 + + file_part = builder.build_instructions["parts"][0] + assert file_part["file"] == "asset_0" + assert builder.assets["asset_0"] == test_file + + def test_add_file_part_with_remote_file( + self, valid_client_options, mock_validate_file_input, mock_is_remote_file_input + ): + """Test adding a file part with remote file URL.""" + mock_is_remote_file_input.return_value = True + builder = StagedWorkflowBuilder(valid_client_options) + test_url = "https://example.com/document.pdf" + + result = builder.add_file_part(test_url) assert result is builder - assert len(builder._actions) == 2 - assert builder._output_options["metadata"]["title"] == "Test" - assert builder._output_options["optimize"] is True - - def test_complex_workflow_chaining(self): - """Test complex workflow with multiple operations.""" - builder = BuildAPIWrapper(None, "document.docx") - result = ( - builder.add_step("convert-to-pdf") - .add_step("ocr-pdf", options={"language": "english"}) - .add_step("rotate-pages", options={"degrees": 90, "page_indexes": [0, 2]}) - .add_step("watermark-pdf", options={"text": "PROCESSED"}) - .add_step("flatten-annotations") - .set_output_options(optimize=True, metadata={"title": "Processed Document"}) - ) + assert len(builder.build_instructions["parts"]) == 1 + + file_part = builder.build_instructions["parts"][0] + assert 
file_part["file"] == {"url": test_url} + assert len(builder.assets) == 0 # Remote files are not registered + + def test_add_file_part_with_options_and_actions( + self, valid_client_options, mock_validate_file_input, mock_is_remote_file_input + ): + """Test adding a file part with options and actions.""" + builder = StagedWorkflowBuilder(valid_client_options) + test_file = "test.pdf" + options = {"pages": {"start": 0, "end": 5}} + actions = [{"type": "ocr", "language": "english"}] + + result = builder.add_file_part(test_file, options, actions) assert result is builder - assert len(builder._actions) == 5 - assert builder._actions[0]["type"] == "convert-to-pdf" - assert builder._actions[1]["type"] == "ocr" - assert builder._actions[2]["type"] == "rotate" - assert builder._actions[3]["type"] == "watermark" - assert builder._actions[4]["type"] == "flatten" - - -class TestBuilderOutputOptions: - """Test suite for BuildAPIWrapper output options.""" - - def setup_method(self): - """Set up test fixtures.""" - self.builder = BuildAPIWrapper(None, "test.pdf") - - def test_set_output_options_basic(self): - """Test setting basic output options.""" - result = self.builder.set_output_options(optimize=True) - - assert result is self.builder - assert self.builder._output_options["optimize"] is True - - def test_set_output_options_metadata(self): - """Test setting output options with metadata.""" - metadata = {"title": "Test Doc", "author": "Test Author"} - result = self.builder.set_output_options(metadata=metadata) - - assert result is self.builder - assert self.builder._output_options["metadata"]["title"] == "Test Doc" - assert self.builder._output_options["metadata"]["author"] == "Test Author" - - def test_set_output_options_multiple_calls(self): - """Test multiple calls to set_output_options merge properly.""" - self.builder.set_output_options(optimize=True) - self.builder.set_output_options(metadata={"title": "Test"}) - self.builder.set_output_options(compress=True) - - assert 
self.builder._output_options["optimize"] is True - assert self.builder._output_options["metadata"]["title"] == "Test" - assert self.builder._output_options["compress"] is True - - def test_set_output_options_overwrites_same_key(self): - """Test that setting same option key overwrites previous value.""" - self.builder.set_output_options(optimize=True) - self.builder.set_output_options(optimize=False) - - assert self.builder._output_options["optimize"] is False - - def test_set_output_options_complex_metadata(self): - """Test setting complex metadata structure.""" - metadata = { - "title": "Complex Document", - "author": "Test Author", - "subject": "Test Subject", - "keywords": ["test", "document", "processing"], - "custom": {"version": "1.0", "department": "Engineering"}, - } - result = self.builder.set_output_options(metadata=metadata) - - assert result is self.builder - assert self.builder._output_options["metadata"] == metadata - - -class TestBuilderToolToActionMapping: - """Test suite for BuildAPIWrapper tool to action mapping.""" - - def setup_method(self): - """Set up test fixtures.""" - self.builder = BuildAPIWrapper(None, "test.pdf") - - def test_map_tool_to_action_convert_to_pdf(self): - """Test mapping convert-to-pdf tool.""" - self.builder.add_step("convert-to-pdf") - action = self.builder._actions[0] - assert action["type"] == "convert-to-pdf" - - def test_map_tool_to_action_flatten_annotations(self): - """Test mapping flatten-annotations tool.""" - self.builder.add_step("flatten-annotations") - action = self.builder._actions[0] - assert action["type"] == "flatten" - - def test_map_tool_to_action_rotate_pages(self): - """Test mapping rotate-pages tool with options.""" - self.builder.add_step("rotate-pages", options={"degrees": 180}) - action = self.builder._actions[0] - assert action["type"] == "rotate" - assert action["rotateBy"] == 180 - - def test_map_tool_to_action_rotate_pages_with_page_indexes(self): - """Test mapping rotate-pages tool with page 
indexes.""" - self.builder.add_step("rotate-pages", options={"degrees": 90, "page_indexes": [0, 2, 4]}) - action = self.builder._actions[0] - assert action["type"] == "rotate" - assert action["rotateBy"] == 90 - assert action["pageIndexes"] == [0, 2, 4] - - def test_map_tool_to_action_ocr_pdf(self): - """Test mapping ocr-pdf tool with language.""" - self.builder.add_step("ocr-pdf", options={"language": "german"}) - action = self.builder._actions[0] - assert action["type"] == "ocr" - assert action["language"] == "deu" # Maps to API format - - def test_map_tool_to_action_ocr_pdf_english(self): - """Test mapping ocr-pdf tool with english language.""" - self.builder.add_step("ocr-pdf", options={"language": "english"}) - action = self.builder._actions[0] - assert action["type"] == "ocr" - assert action["language"] == "english" - - def test_map_tool_to_action_watermark_pdf(self): - """Test mapping watermark-pdf tool with all options.""" + file_part = builder.build_instructions["parts"][0] + assert file_part["pages"] == {"start": 0, "end": 5} + assert file_part["actions"] == [{"type": "ocr", "language": "english"}] + + def test_add_file_part_throws_error_when_executed(self, valid_client_options): + """Test add_file_part throws error when workflow is already executed.""" + builder = StagedWorkflowBuilder(valid_client_options) + builder.is_executed = True + + with pytest.raises( + ValidationError, match="This workflow has already been executed" + ): + builder.add_file_part("test.pdf") + + def test_add_html_part_with_local_file( + self, valid_client_options, mock_validate_file_input, mock_is_remote_file_input + ): + """Test adding an HTML part with local file.""" + builder = StagedWorkflowBuilder(valid_client_options) + html_content = b"Test" + + result = builder.add_html_part(html_content) + + assert result is builder + assert len(builder.build_instructions["parts"]) == 1 + + html_part = builder.build_instructions["parts"][0] + assert html_part["html"] == "asset_0" + assert 
builder.assets["asset_0"] == html_content + + def test_add_html_part_with_remote_url( + self, valid_client_options, mock_validate_file_input, mock_is_remote_file_input + ): + """Test adding an HTML part with remote URL.""" + mock_is_remote_file_input.return_value = True + builder = StagedWorkflowBuilder(valid_client_options) + html_url = "https://example.com/page.html" + + result = builder.add_html_part(html_url) + + assert result is builder + html_part = builder.build_instructions["parts"][0] + assert html_part["html"] == {"url": html_url} + assert len(builder.assets) == 0 + + def test_add_html_part_with_assets_and_actions( + self, valid_client_options, mock_validate_file_input, mock_is_remote_file_input + ): + """Test adding HTML part with assets and actions.""" + builder = StagedWorkflowBuilder(valid_client_options) + html_content = b"Test" + assets = [b"p {color: red;}", b"img {width: 100px;}"] + options = {"layout": "page"} + actions = [{"type": "ocr", "language": "english"}] + + result = builder.add_html_part(html_content, assets, options, actions) + + assert result is builder + html_part = builder.build_instructions["parts"][0] + assert html_part["html"] == "asset_0" + assert html_part["layout"] == "page" + assert html_part["assets"] == ["asset_1", "asset_2"] + assert html_part["actions"] == [{"type": "ocr", "language": "english"}] + assert len(builder.assets) == 3 # HTML + 2 assets + + def test_add_html_part_with_remote_assets_throws_error( + self, valid_client_options, mock_validate_file_input, mock_is_remote_file_input + ): + """Test adding HTML part with remote assets throws ValidationError.""" + + def is_remote_side_effect(input_file): + return input_file.startswith("https://") + + mock_is_remote_file_input.side_effect = is_remote_side_effect + builder = StagedWorkflowBuilder(valid_client_options) + html_content = b"Test" + assets = ["https://example.com/style.css"] + + with pytest.raises(ValidationError, match="Assets file input cannot be a URL"): + 
builder.add_html_part(html_content, assets) + + def test_add_new_page_with_no_options(self, valid_client_options): + """Test adding a new page with no options.""" + builder = StagedWorkflowBuilder(valid_client_options) + + result = builder.add_new_page() + + assert result is builder + assert len(builder.build_instructions["parts"]) == 1 + + page_part = builder.build_instructions["parts"][0] + assert page_part["page"] == "new" + + def test_add_new_page_with_options_and_actions(self, valid_client_options): + """Test adding a new page with options and actions.""" + builder = StagedWorkflowBuilder(valid_client_options) + options = {"pageCount": 3, "layout": "A4"} + actions = [{"type": "ocr", "language": "english"}] + + result = builder.add_new_page(options, actions) + + assert result is builder + page_part = builder.build_instructions["parts"][0] + assert page_part["page"] == "new" + assert page_part["pageCount"] == 3 + assert page_part["layout"] == "A4" + assert page_part["actions"] == [{"type": "ocr", "language": "english"}] + + def test_add_document_part_with_basic_options(self, valid_client_options): + """Test adding a document part with basic options.""" + builder = StagedWorkflowBuilder(valid_client_options) + document_id = "doc-123" + + result = builder.add_document_part(document_id) + + assert result is builder + assert len(builder.build_instructions["parts"]) == 1 + + doc_part = builder.build_instructions["parts"][0] + assert doc_part["document"] == {"id": "doc-123"} + + def test_add_document_part_with_options_and_actions(self, valid_client_options): + """Test adding a document part with options and actions.""" + builder = StagedWorkflowBuilder(valid_client_options) + document_id = "doc-123" options = { - "text": "CONFIDENTIAL", - "width": 300, - "height": 150, - "opacity": 0.7, - "position": "top-right", + "layer": "layer1", + "password": "secret", + "pages": {"start": 0, "end": 10}, } - self.builder.add_step("watermark-pdf", options=options) - action = 
self.builder._actions[0] - assert action["type"] == "watermark" - assert action["text"] == "CONFIDENTIAL" - assert action["width"] == 300 - assert action["height"] == 150 - assert action["opacity"] == 0.7 - assert action["position"] == "top-right" - - def test_map_tool_to_action_watermark_pdf_defaults(self): - """Test mapping watermark-pdf tool with minimal options.""" - self.builder.add_step("watermark-pdf", options={"text": "TEST"}) - action = self.builder._actions[0] - assert action["type"] == "watermark" - assert action["text"] == "TEST" - assert action["width"] == 200 # Default - assert action["height"] == 100 # Default - - def test_map_tool_to_action_apply_redactions(self): - """Test mapping apply-redactions tool.""" - self.builder.add_step("apply-redactions") - action = self.builder._actions[0] - assert action["type"] == "applyRedactions" - - -class TestBuilderFileHandling: - """Test suite for BuildAPIWrapper file handling.""" - - def test_builder_stores_file_for_upload(self): - """Test that builder stores files for later upload.""" - builder = BuildAPIWrapper(None, "test.pdf") - - # The file is stored for later preparation during execute - assert "file" in builder._files - assert builder._files["file"] == "test.pdf" - - def test_builder_handles_bytes_input(self): - """Test that builder handles bytes input.""" - content = b"PDF content bytes" - builder = BuildAPIWrapper(None, content) - - assert "file" in builder._files - assert builder._files["file"] == content - - -class TestBuilderExecute: - """Test suite for BuildAPIWrapper execute method.""" - - def setup_method(self): - """Set up test fixtures.""" - self.mock_client = Mock() - self.mock_client._http_client = Mock() - self.builder = BuildAPIWrapper(self.mock_client, "test.pdf") - - @patch("nutrient_dws.builder.prepare_file_for_upload") - @patch("nutrient_dws.builder.save_file_output") - def test_execute_without_output_path(self, mock_save, mock_prepare): - """Test execute without output path returns 
bytes.""" - mock_prepare.return_value = ("file", ("test.pdf", b"content", "application/pdf")) - self.mock_client._http_client.post.return_value = b"processed content" - - result = self.builder.execute() - - assert result == b"processed content" - mock_save.assert_not_called() - self.mock_client._http_client.post.assert_called_once() - - @patch("nutrient_dws.builder.prepare_file_for_upload") - @patch("nutrient_dws.builder.save_file_output") - def test_execute_with_output_path(self, mock_save, mock_prepare): - """Test execute with output path saves file.""" - mock_prepare.return_value = ("file", ("test.pdf", b"content", "application/pdf")) - self.mock_client._http_client.post.return_value = b"processed content" - - result = self.builder.execute("output.pdf") - - assert result is None - mock_save.assert_called_once_with(b"processed content", "output.pdf") - self.mock_client._http_client.post.assert_called_once() - - @patch("nutrient_dws.builder.prepare_file_for_upload") - def test_execute_builds_correct_instructions(self, mock_prepare): - """Test that execute builds correct instructions.""" - mock_prepare.return_value = ("file", ("test.pdf", b"content", "application/pdf")) - self.mock_client._http_client.post.return_value = b"result" - - self.builder.add_step("convert-to-pdf") - self.builder.add_step("watermark-pdf", options={"text": "TEST"}) - self.builder.set_output_options(optimize=True) - - self.builder.execute() - - # Verify the client.post was called with correct parameters - call_args = self.mock_client._http_client.post.call_args - assert call_args[0][0] == "/build" # endpoint - assert "files" in call_args[1] - assert "json_data" in call_args[1] - - # Check the instruction structure - instructions = call_args[1]["json_data"] - assert "parts" in instructions - assert "actions" in instructions - assert "output" in instructions # Should include output options - assert len(instructions["actions"]) == 2 - assert instructions["actions"][0]["type"] == 
"convert-to-pdf" - assert instructions["actions"][1]["type"] == "watermark" - - @patch("nutrient_dws.builder.prepare_file_for_upload") - def test_execute_with_no_client_raises_error(self, mock_prepare): - """Test that execute without client raises appropriate error.""" - mock_prepare.return_value = ("file", ("test.pdf", b"content", "application/pdf")) - builder = BuildAPIWrapper(None, "test.pdf") - - with pytest.raises(AttributeError): - builder.execute() - - @patch("nutrient_dws.builder.prepare_file_for_upload") - def test_execute_propagates_client_errors(self, mock_prepare): - """Test that execute propagates errors from HTTP client.""" - from nutrient_dws.exceptions import APIError - - mock_prepare.return_value = ("file", ("test.pdf", b"content", "application/pdf")) - self.mock_client._http_client.post.side_effect = APIError("API error", 400, "Bad request") - - with pytest.raises(APIError): - self.builder.execute() - - -class TestBuilderEdgeCases: - """Test edge cases and boundary conditions.""" - - @patch("nutrient_dws.builder.prepare_file_for_upload") - def test_builder_with_empty_actions(self, mock_prepare): - """Test builder with no actions added.""" - mock_prepare.return_value = ("file", ("test.pdf", b"content", "application/pdf")) - mock_client = Mock() - mock_client._http_client = Mock() - mock_client._http_client.post.return_value = b"empty workflow result" - - builder = BuildAPIWrapper(mock_client, "test.pdf") - result = builder.execute() - - assert result == b"empty workflow result" - - # Verify instructions have empty actions - call_args = mock_client._http_client.post.call_args - instructions = call_args[1]["json_data"] - assert instructions["actions"] == [] - - @patch("nutrient_dws.builder.prepare_file_for_upload") - def test_builder_action_order_preservation(self, mock_prepare): - """Test that actions are executed in the correct order.""" - mock_prepare.return_value = ("file", ("test.pdf", b"content", "application/pdf")) - mock_client = Mock() - 
mock_client._http_client = Mock() - mock_client._http_client.post.return_value = b"ordered result" - - builder = BuildAPIWrapper(mock_client, "test.pdf") - builder.add_step("convert-to-pdf") - builder.add_step("ocr-pdf", options={"language": "english"}) - builder.add_step("rotate-pages", options={"degrees": 90}) - builder.add_step("watermark-pdf", options={"text": "FINAL"}) - - builder.execute() - - # Verify action order - call_args = mock_client._http_client.post.call_args - instructions = call_args[1]["json_data"] - actions = instructions["actions"] - - assert len(actions) == 4 - assert actions[0]["type"] == "convert-to-pdf" - assert actions[1]["type"] == "ocr" - assert actions[2]["type"] == "rotate" - assert actions[3]["type"] == "watermark" - - def test_builder_with_large_file_input(self): - """Test builder with large file input.""" - large_content = b"x" * (10 * 1024 * 1024) # 10MB - - builder = BuildAPIWrapper(None, large_content) - assert builder._input_file == large_content - - def test_builder_set_page_labels(self): - """Test setting page labels.""" - builder = BuildAPIWrapper(None, "test.pdf") - - labels = [ - {"pages": {"start": 0, "end": 2}, "label": "Introduction"}, - {"pages": {"start": 3, "end": 9}, "label": "Chapter 1"}, - {"pages": {"start": 10}, "label": "Appendix"}, + actions = [{"type": "ocr", "language": "english"}] + + result = builder.add_document_part(document_id, options, actions) + + assert result is builder + doc_part = builder.build_instructions["parts"][0] + assert doc_part["document"] == {"id": "doc-123", "layer": "layer1"} + assert doc_part["password"] == "secret" + assert doc_part["pages"] == {"start": 0, "end": 10} + assert doc_part["actions"] == [{"type": "ocr", "language": "english"}] + + +class TestStagedWorkflowBuilderActionMethods: + """Tests for StagedWorkflowBuilder action methods.""" + + def test_apply_actions_with_multiple_actions(self, valid_client_options): + """Test applying multiple actions to workflow.""" + builder = 
StagedWorkflowBuilder(valid_client_options) + actions = [{"type": "ocr", "language": "english"}, {"type": "flatten"}] + + result = builder.apply_actions(actions) + + assert result is builder + assert builder.build_instructions["actions"] == actions + + def test_apply_actions_extends_existing_actions(self, valid_client_options): + """Test that apply_actions extends existing actions.""" + builder = StagedWorkflowBuilder(valid_client_options) + builder.build_instructions["actions"] = [{"type": "rotate", "rotateBy": 90}] + + new_actions = [{"type": "ocr", "language": "english"}] + result = builder.apply_actions(new_actions) + + assert result is builder + expected_actions = [ + {"type": "rotate", "rotateBy": 90}, + {"type": "ocr", "language": "english"}, ] + assert builder.build_instructions["actions"] == expected_actions + + def test_apply_action_with_single_action(self, valid_client_options): + """Test applying a single action to workflow.""" + builder = StagedWorkflowBuilder(valid_client_options) + action = {"type": "ocr", "language": "english"} + + result = builder.apply_action(action) + + assert result is builder + assert builder.build_instructions["actions"] == [action] + + def test_apply_actions_with_file_input_action( + self, valid_client_options, mock_validate_file_input, mock_is_remote_file_input + ): + """Test applying actions that require file registration.""" + builder = StagedWorkflowBuilder(valid_client_options) + + # Create a mock action with file input + mock_action = MagicMock() + mock_action.__needsFileRegistration = True + mock_action.fileInput = "watermark.png" + mock_action.createAction.return_value = { + "type": "watermark", + "image": "asset_0", + } + + result = builder.apply_actions([mock_action]) + + assert result is builder + assert builder.build_instructions["actions"] == [ + {"type": "watermark", "image": "asset_0"} + ] + + def test_apply_actions_throws_error_when_executed(self, valid_client_options): + """Test apply_actions throws error 
when workflow is already executed.""" + builder = StagedWorkflowBuilder(valid_client_options) + builder.is_executed = True + + with pytest.raises( + ValidationError, match="This workflow has already been executed" + ): + builder.apply_actions([{"type": "ocr", "language": "english"}]) + + +class TestStagedWorkflowBuilderOutputMethods: + """Tests for StagedWorkflowBuilder output methods.""" - result = builder.set_page_labels(labels) + def test_output_pdf_with_no_options(self, valid_client_options): + """Test setting PDF output with no options.""" + builder = StagedWorkflowBuilder(valid_client_options) - assert result is builder # Should return self for chaining - assert builder._output_options["labels"] == labels + result = builder.output_pdf() - def test_builder_set_page_labels_chaining(self): - """Test page labels can be chained with other operations.""" - builder = BuildAPIWrapper(None, "test.pdf") + assert result is builder + assert builder.build_instructions["output"] == {"type": "pdf"} + + def test_output_pdf_with_options(self, valid_client_options): + """Test setting PDF output with options.""" + builder = StagedWorkflowBuilder(valid_client_options) + options = {"user_password": "secret", "owner_password": "owner"} + + result = builder.output_pdf(options) + + assert result is builder + expected_output = { + "type": "pdf", + "user_password": "secret", + "owner_password": "owner", + } + assert builder.build_instructions["output"] == expected_output + + def test_output_pdfa_with_options(self, valid_client_options): + """Test setting PDF/A output with options.""" + builder = StagedWorkflowBuilder(valid_client_options) + options = {"conformance": "pdfa-2b", "vectorization": True} + + result = builder.output_pdfa(options) - labels = [{"pages": {"start": 0, "end": 0}, "label": "Cover"}] + assert result is builder + expected_output = { + "type": "pdfa", + "conformance": "pdfa-2b", + "vectorization": True, + } + assert builder.build_instructions["output"] == 
expected_output - result = ( - builder.add_step("rotate-pages", options={"degrees": 90}) - .set_page_labels(labels) - .set_output_options(metadata={"title": "Test"}) - ) + def test_output_pdfua_with_options(self, valid_client_options): + """Test setting PDF/UA output with options.""" + builder = StagedWorkflowBuilder(valid_client_options) + options = {"metadata": {"title": "Accessible Document"}} + + result = builder.output_pdfua(options) assert result is builder - assert len(builder._actions) == 1 - assert builder._output_options["labels"] == labels - assert builder._output_options["metadata"]["title"] == "Test" + expected_output = { + "type": "pdfua", + "metadata": {"title": "Accessible Document"}, + } + assert builder.build_instructions["output"] == expected_output - def test_builder_options_none_handling(self): - """Test builder handles None options gracefully.""" - builder = BuildAPIWrapper(None, "test.pdf") - result = builder.add_step("convert-to-pdf", options=None) + def test_output_image_with_dpi(self, valid_client_options): + """Test setting image output with DPI option.""" + builder = StagedWorkflowBuilder(valid_client_options) + options = {"dpi": 300} + + result = builder.output_image("png", options) assert result is builder - assert len(builder._actions) == 1 - assert builder._actions[0]["type"] == "convert-to-pdf" + expected_output = {"type": "image", "format": "png", "dpi": 300} + assert builder.build_instructions["output"] == expected_output + + def test_output_image_with_dimensions(self, valid_client_options): + """Test setting image output with width and height.""" + builder = StagedWorkflowBuilder(valid_client_options) + options = {"width": 800, "height": 600} + + result = builder.output_image("jpeg", options) + + assert result is builder + expected_output = { + "type": "image", + "format": "jpeg", + "width": 800, + "height": 600, + } + assert builder.build_instructions["output"] == expected_output + + def 
test_output_image_without_required_options_throws_error( + self, valid_client_options + ): + """Test that image output without required options throws ValidationError.""" + builder = StagedWorkflowBuilder(valid_client_options) + + with pytest.raises( + ValidationError, + match="Image output requires at least one of the following options: dpi, height, width", + ): + builder.output_image("png") + + with pytest.raises( + ValidationError, + match="Image output requires at least one of the following options: dpi, height, width", + ): + builder.output_image("png", {}) + + def test_output_office_formats(self, valid_client_options): + """Test setting office output formats.""" + builder = StagedWorkflowBuilder(valid_client_options) + + # Test DOCX + result_docx = builder.output_office("docx") + assert result_docx is builder + assert builder.build_instructions["output"] == {"type": "docx"} + + # Test XLSX + builder2 = StagedWorkflowBuilder(valid_client_options) + result_xlsx = builder2.output_office("xlsx") + assert result_xlsx is builder2 + assert builder2.build_instructions["output"] == {"type": "xlsx"} + + # Test PPTX + builder3 = StagedWorkflowBuilder(valid_client_options) + result_pptx = builder3.output_office("pptx") + assert result_pptx is builder3 + assert builder3.build_instructions["output"] == {"type": "pptx"} + + def test_output_html_with_default_layout(self, valid_client_options): + """Test setting HTML output with default layout.""" + builder = StagedWorkflowBuilder(valid_client_options) + + result = builder.output_html() + + assert result is builder + assert builder.build_instructions["output"] == { + "type": "html", + "layout": "page", + } + + def test_output_html_with_reflow_layout(self, valid_client_options): + """Test setting HTML output with reflow layout.""" + builder = StagedWorkflowBuilder(valid_client_options) + + result = builder.output_html("reflow") + + assert result is builder + assert builder.build_instructions["output"] == { + "type": "html", + 
"layout": "reflow", + } + + def test_output_markdown(self, valid_client_options): + """Test setting Markdown output.""" + builder = StagedWorkflowBuilder(valid_client_options) + + result = builder.output_markdown() + + assert result is builder + assert builder.build_instructions["output"] == {"type": "markdown"} + + def test_output_json_with_options(self, valid_client_options): + """Test setting JSON content output with options.""" + builder = StagedWorkflowBuilder(valid_client_options) + options = {"plainText": True, "tables": False} - def test_builder_empty_output_options(self): - """Test builder with empty output options.""" - builder = BuildAPIWrapper(None, "test.pdf") - result = builder.set_output_options() + result = builder.output_json(options) assert result is builder - # Should still create empty output options dict - assert isinstance(builder._output_options, dict) + expected_output = {"type": "json-content", "plainText": True, "tables": False} + assert builder.build_instructions["output"] == expected_output + + def test_output_methods_throw_error_when_executed(self, valid_client_options): + """Test output methods throw error when workflow is already executed.""" + builder = StagedWorkflowBuilder(valid_client_options) + builder.is_executed = True + + with pytest.raises( + ValidationError, match="This workflow has already been executed" + ): + builder.output_pdf() + + +class TestStagedWorkflowBuilderExecutionMethods: + """Tests for StagedWorkflowBuilder execution methods.""" + + @pytest.mark.asyncio + async def test_execute_with_pdf_output(self, valid_client_options): + """Test executing workflow with PDF output.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + with patch( + "nutrient_dws.builder.builder.process_file_input" + ) as mock_process: + mock_process.return_value = ("test-content", "application/pdf") + + async 
def mock_request(endpoint, data): + return b"pdf-content" + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + side_effect=mock_request, + ) as mock_send: + builder = StagedWorkflowBuilder(valid_client_options) + builder.add_file_part("test.pdf") + builder.output_pdf() + + result = await builder.execute() + + assert result["success"] is True + assert result["errors"] == [] + assert result["output"]["buffer"] == b"pdf-content" + assert result["output"]["mimeType"] == "application/pdf" + assert result["output"]["filename"] == "output.pdf" + assert builder.is_executed is True + mock_send.assert_called_once() + + @pytest.mark.asyncio + async def test_execute_with_json_output(self, valid_client_options): + """Test executing workflow with JSON content output.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + with patch( + "nutrient_dws.builder.builder.process_file_input" + ) as mock_process: + mock_process.return_value = ("test-content", "application/pdf") + + mock_response_data = {"pages": [{"plainText": "Some text"}]} + + async def mock_request(endpoint, data): + return mock_response_data + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + side_effect=mock_request, + ): + builder = StagedWorkflowBuilder(valid_client_options) + builder.add_file_part("test.pdf") + builder.output_json({"plainText": True}) + + result = await builder.execute() + + assert result["success"] is True + assert result["output"]["data"] == mock_response_data + + @pytest.mark.asyncio + async def test_execute_with_html_output(self, valid_client_options): + """Test executing workflow with HTML output.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + with patch( + 
"nutrient_dws.builder.builder.process_file_input" + ) as mock_process: + mock_process.return_value = ("test-content", "application/pdf") + + async def mock_request(endpoint, data): + return b"Content" + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + side_effect=mock_request, + ): + builder = StagedWorkflowBuilder(valid_client_options) + builder.add_file_part("test.pdf") + builder.output_html("page") + + result = await builder.execute() + + assert result["success"] is True + assert ( + result["output"]["content"] + == "Content" + ) + assert result["output"]["mimeType"] == "text/html" + assert result["output"]["filename"] == "output.html" + + @pytest.mark.asyncio + async def test_execute_with_markdown_output(self, valid_client_options): + """Test executing workflow with Markdown output.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + with patch( + "nutrient_dws.builder.builder.process_file_input" + ) as mock_process: + mock_process.return_value = ("test-content", "application/pdf") + + async def mock_request(endpoint, data): + return b"# Header\n\nContent" + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + side_effect=mock_request, + ): + builder = StagedWorkflowBuilder(valid_client_options) + builder.add_file_part("test.pdf") + builder.output_markdown() + + result = await builder.execute() + + assert result["success"] is True + assert result["output"]["content"] == "# Header\n\nContent" + assert result["output"]["mimeType"] == "text/markdown" + assert result["output"]["filename"] == "output.md" + + @pytest.mark.asyncio + async def test_execute_with_progress_callback(self, valid_client_options): + """Test executing workflow with progress callback.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + 
"nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + with patch( + "nutrient_dws.builder.builder.process_file_input" + ) as mock_process: + mock_process.return_value = ("test-content", "application/pdf") + + progress_calls = [] + + def on_progress(current: int, total: int) -> None: + progress_calls.append((current, total)) + + async def mock_request(endpoint, data): + return b"pdf-content" + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + side_effect=mock_request, + ): + builder = StagedWorkflowBuilder(valid_client_options) + builder.add_file_part("test.pdf") + builder.output_pdf() + + await builder.execute(on_progress=on_progress) + + assert progress_calls == [(1, 3), (2, 3), (3, 3)] + + @pytest.mark.asyncio + async def test_execute_handles_validation_error(self, valid_client_options): + """Test execute handles validation errors properly.""" + builder = StagedWorkflowBuilder(valid_client_options) + # Don't add any parts - should trigger validation error + + result = await builder.execute() + + assert result["success"] is False + assert len(result["errors"]) == 1 + assert result["errors"][0]["step"] == 1 + assert isinstance(result["errors"][0]["error"], ValidationError) + assert builder.is_executed is True + + @pytest.mark.asyncio + async def test_execute_handles_request_error(self, valid_client_options): + """Test execute handles request errors properly.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + with patch( + "nutrient_dws.builder.builder.process_file_input" + ) as mock_process: + mock_process.return_value = ("test-content", "application/pdf") + + async def mock_request(endpoint, data): + raise Exception("Network error") + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + side_effect=mock_request, + ): + builder = 
StagedWorkflowBuilder(valid_client_options) + builder.add_file_part("test.pdf") + builder.output_pdf() + + result = await builder.execute() + + assert result["success"] is False + assert len(result["errors"]) == 1 + assert result["errors"][0]["step"] == 2 + assert str(result["errors"][0]["error"]) == "Network error" + + @pytest.mark.asyncio + async def test_execute_throws_error_when_already_executed( + self, valid_client_options + ): + """Test execute throws error when workflow is already executed.""" + builder = StagedWorkflowBuilder(valid_client_options) + builder.is_executed = True + + with pytest.raises( + ValidationError, match="This workflow has already been executed" + ): + await builder.execute() + + @pytest.mark.asyncio + async def test_dry_run_success(self, valid_client_options): + """Test successful dry run execution.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + mock_analysis_data = {"estimatedTime": 5.2, "cost": 0.10} + + async def mock_request(endpoint, data): + return mock_analysis_data + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + side_effect=mock_request, + ) as mock_send: + builder = StagedWorkflowBuilder(valid_client_options) + builder.add_file_part("test.pdf") + + result = await builder.dry_run() + + assert result["success"] is True + assert result["errors"] == [] + assert result["analysis"] == mock_analysis_data + mock_send.assert_called_once() + # Verify it called the analyze_build endpoint + call_args = mock_send.call_args + assert call_args[0][0] == "/analyze_build" + + @pytest.mark.asyncio + async def test_dry_run_handles_validation_error(self, valid_client_options): + """Test dry run handles validation errors properly.""" + builder = StagedWorkflowBuilder(valid_client_options) + # Don't add any parts - should trigger validation error + + result = await 
builder.dry_run() + + assert result["success"] is False + assert len(result["errors"]) == 1 + assert result["errors"][0]["step"] == 0 + assert isinstance(result["errors"][0]["error"], ValidationError) + + @pytest.mark.asyncio + async def test_dry_run_handles_request_error( + self, + valid_client_options, + mock_send_request, + mock_validate_file_input, + mock_is_remote_file_input, + ): + """Test dry run handles request errors properly.""" + mock_send_request.side_effect = Exception("Analysis failed") + builder = StagedWorkflowBuilder(valid_client_options) + builder.add_file_part("test.pdf") + + result = await builder.dry_run() + + assert result["success"] is False + assert len(result["errors"]) == 1 + assert result["errors"][0]["step"] == 0 + assert str(result["errors"][0]["error"]) == "Analysis failed" + + @pytest.mark.asyncio + async def test_dry_run_throws_error_when_already_executed( + self, valid_client_options + ): + """Test dry run throws error when workflow is already executed.""" + builder = StagedWorkflowBuilder(valid_client_options) + builder.is_executed = True + + with pytest.raises( + ValidationError, match="This workflow has already been executed" + ): + await builder.dry_run() + + +class TestStagedWorkflowBuilderChaining: + """Tests for StagedWorkflowBuilder method chaining and type safety.""" + + @pytest.mark.asyncio + async def test_complete_workflow_chaining(self, valid_client_options): + """Test complete workflow with method chaining.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + with patch( + "nutrient_dws.builder.builder.process_file_input" + ) as mock_process: + mock_process.return_value = ("test-content", "application/pdf") + + async def mock_request(endpoint, data): + return b"pdf-content" + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + side_effect=mock_request, + ): 
+ builder = StagedWorkflowBuilder(valid_client_options) + + result = await ( + builder.add_file_part("test.pdf") + .apply_action({"type": "ocr", "language": "english"}) + .output_pdf({"user_password": "secret"}) + .execute() + ) + + assert result["success"] is True + assert result["output"]["buffer"] == b"pdf-content" + + # Verify the build instructions were set correctly + assert len(builder.build_instructions["parts"]) == 1 + assert builder.build_instructions["actions"] == [ + {"type": "ocr", "language": "english"} + ] + assert ( + builder.build_instructions["output"]["user_password"] + == "secret" + ) + + @pytest.mark.asyncio + async def test_complex_workflow_with_multiple_parts_and_actions( + self, valid_client_options + ): + """Test complex workflow with multiple parts and actions.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + with patch( + "nutrient_dws.builder.builder.process_file_input" + ) as mock_process: + mock_process.return_value = ("test-content", "application/pdf") + + async def mock_request(endpoint, data): + return b"merged-pdf-content" + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + side_effect=mock_request, + ): + builder = StagedWorkflowBuilder(valid_client_options) + + result = await ( + builder.add_file_part( + "doc1.pdf", {"pages": {"start": 0, "end": 5}} + ) + .add_file_part( + "doc2.pdf", {"pages": {"start": 2, "end": 8}} + ) + .add_new_page({"pageCount": 1}) + .apply_actions( + [ + {"type": "ocr", "language": "english"}, + {"type": "flatten"}, + ] + ) + .output_pdf({"metadata": {"title": "Merged Document"}}) + .execute() + ) + + assert result["success"] is True + assert len(builder.build_instructions["parts"]) == 3 + assert len(builder.build_instructions["actions"]) == 2 + + +class TestStagedWorkflowBuilderIntegration: + """Integration tests for 
StagedWorkflowBuilder with real BuildActions.""" + + @pytest.mark.asyncio + async def test_workflow_with_watermark_action(self, valid_client_options): + """Test workflow with watermark action that requires file registration.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + with patch( + "nutrient_dws.builder.builder.process_file_input" + ) as mock_process: + mock_process.return_value = ("test-content", "application/pdf") + + async def mock_request(endpoint, data): + return b"watermarked-pdf" + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + side_effect=mock_request, + ): + builder = StagedWorkflowBuilder(valid_client_options) + + # Create a watermark action that needs file registration + watermark_action = BuildActions.watermark_image("logo.png") + + result = await ( + builder.add_file_part("document.pdf") + .apply_action(watermark_action) + .output_pdf() + .execute() + ) + + assert result["success"] is True + + # Verify that actions were applied (the specific structure depends on implementation) + # Note: assets are cleaned up after execution, but the build instructions remain + assert len(builder.build_instructions["actions"]) == 1 + + @pytest.mark.asyncio + async def test_workflow_with_mixed_actions(self, valid_client_options): + """Test workflow with mix of regular actions and file-input actions.""" + with patch( + "nutrient_dws.builder.builder.validate_file_input", return_value=True + ): + with patch( + "nutrient_dws.builder.builder.is_remote_file_input", return_value=False + ): + with patch( + "nutrient_dws.builder.builder.process_file_input" + ) as mock_process: + mock_process.return_value = ("test-content", "application/pdf") + + async def mock_request(endpoint, data): + return b"processed-pdf" + + with patch( + "nutrient_dws.builder.base_builder.BaseBuilder._send_request", + 
side_effect=mock_request, + ): + builder = StagedWorkflowBuilder(valid_client_options) + + # Mix of regular actions and actions requiring file registration + actions = [ + BuildActions.ocr("english"), # Regular action + BuildActions.watermark_image( + "watermark.png" + ), # File input action + BuildActions.flatten(), # Regular action + BuildActions.apply_instant_json( + "annotations.json" + ), # File input action + ] + + result = await ( + builder.add_file_part("document.pdf") + .apply_actions(actions) + .output_pdf() + .execute() + ) + + assert result["success"] is True + + # Verify actions were applied (the specific structure depends on implementation) + # Note: assets are cleaned up after execution, but the build instructions remain + processed_actions = builder.build_instructions["actions"] + assert len(processed_actions) == 4 diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 2b09768..d2d7cf7 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -1,223 +1,868 @@ -"""Unit tests for NutrientClient.""" - -import os - -from nutrient_dws.client import NutrientClient - - -def test_client_init_with_api_key(): - """Test client initialization with API key.""" - client = NutrientClient(api_key="test-key") - assert client is not None - assert client._http_client._api_key == "test-key" - - -def test_client_init_with_env_var(): - """Test client initialization with environment variable.""" - # Save original value - original = os.environ.get("NUTRIENT_API_KEY") - - try: - os.environ["NUTRIENT_API_KEY"] = "env-key" - client = NutrientClient() - assert client._http_client._api_key == "env-key" - finally: - # Restore original value - if original is not None: - os.environ["NUTRIENT_API_KEY"] = original - else: - os.environ.pop("NUTRIENT_API_KEY", None) - - -def test_client_init_precedence(): - """Test that explicit API key takes precedence over env var.""" - # Save original value - original = os.environ.get("NUTRIENT_API_KEY") - - try: - 
os.environ["NUTRIENT_API_KEY"] = "env-key" - client = NutrientClient(api_key="explicit-key") - assert client._http_client._api_key == "explicit-key" - finally: - # Restore original value - if original is not None: - os.environ["NUTRIENT_API_KEY"] = original - else: - os.environ.pop("NUTRIENT_API_KEY", None) - - -def test_client_build_method(): - """Test that build() returns a BuildAPIWrapper.""" - client = NutrientClient(api_key="test-key") - builder = client.build("test.pdf") - - # Check class name to avoid import issues - assert builder.__class__.__name__ == "BuildAPIWrapper" - - -def test_client_has_direct_api_methods(): - """Test that client has direct API methods.""" - client = NutrientClient(api_key="test-key") - - # Check that direct API methods exist (from DirectAPIMixin) - assert hasattr(client, "convert_to_pdf") - assert hasattr(client, "flatten_annotations") - assert hasattr(client, "rotate_pages") - assert hasattr(client, "watermark_pdf") - assert hasattr(client, "ocr_pdf") - assert hasattr(client, "apply_redactions") - assert hasattr(client, "merge_pdfs") - assert hasattr(client, "split_pdf") - assert hasattr(client, "duplicate_pdf_pages") - assert hasattr(client, "delete_pdf_pages") - assert hasattr(client, "add_page") - assert hasattr(client, "set_page_label") - - -def test_client_context_manager(): - """Test client can be used as context manager.""" - with NutrientClient(api_key="test-key") as client: +"""Tests for NutrientClient functionality.""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from nutrient_dws import NutrientClient +from nutrient_dws.errors import ValidationError, NutrientError + + +class TestNutrientClientConstructor: + """Tests for NutrientClient constructor.""" + + def test_create_client_with_valid_options(self, valid_client_options, unit_client): + assert unit_client is not None + assert unit_client.options == valid_client_options + + def test_create_client_with_minimal_options(self): + client = 
NutrientClient(api_key="test-key") assert client is not None - # Check that HTTP client is not closed - assert hasattr(client._http_client, "_session") + assert client.options["apiKey"] == "test-key" + + def test_create_client_with_async_api_key_function(self): + async def get_api_key(): + return "async-key" + + client = NutrientClient(api_key=get_api_key) + assert client is not None + assert callable(client.options["apiKey"]) + + def test_throw_validation_error_for_missing_options(self): + with pytest.raises(ValidationError, match="API key is required"): + NutrientClient(None) + + def test_throw_validation_error_for_missing_api_key(self): + with pytest.raises(TypeError, match="missing 1 required positional argument"): + NutrientClient() + + def test_throw_validation_error_for_invalid_api_key_type(self): + with pytest.raises( + ValidationError, + match="API key must be a string or a function that returns a string", + ): + NutrientClient(api_key=123) + + def test_throw_validation_error_for_invalid_base_url_type(self): + with pytest.raises(ValidationError, match="Base URL must be a string"): + NutrientClient(api_key="test-key", base_url=123) + + +class TestNutrientClientWorkflow: + """Tests for NutrientClient workflow method.""" + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + def test_create_workflow_instance( + self, mock_staged_workflow_builder, valid_client_options, unit_client + ): + mock_workflow_instance = MagicMock() + mock_staged_workflow_builder.return_value = mock_workflow_instance + + workflow = unit_client.workflow() + + mock_staged_workflow_builder.assert_called_once_with(valid_client_options) + assert workflow == mock_workflow_instance + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + def test_pass_client_options_to_workflow(self, mock_staged_workflow_builder): + custom_options = {"apiKey": "custom-key", "baseUrl": "https://custom.api.com", "timeout": None} + client = NutrientClient(api_key=custom_options["apiKey"], 
base_url=custom_options["baseUrl"]) + + client.workflow() + + mock_staged_workflow_builder.assert_called_once_with(custom_options) - # After exiting context, HTTP client session should be closed - # We can't directly check if closed, but the method should have been called + @patch("nutrient_dws.client.StagedWorkflowBuilder") + def test_workflow_with_timeout_override( + self, mock_staged_workflow_builder, valid_client_options, unit_client + ): + override_timeout = 5000 + + unit_client.workflow(override_timeout) + expected_options = valid_client_options.copy() + expected_options["timeout"] = override_timeout + mock_staged_workflow_builder.assert_called_once_with(expected_options) -def test_client_close(): - """Test client close method.""" - client = NutrientClient(api_key="test-key") - # Verify HTTP client exists - assert hasattr(client, "_http_client") +class TestNutrientClientOcr: + """Tests for NutrientClient OCR functionality.""" - # Close should not raise an error - client.close() + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_perform_ocr_with_single_language_and_default_pdf_output( + self, mock_staged_workflow_builder, unit_client + ): + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"test-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + ) -def test_set_page_label_validation(): - """Test set_page_label method validation logic.""" - from unittest.mock import Mock, patch + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance - import pytest + file = "test-file.pdf" + language = "english" + + result = await unit_client.ocr(file, language) + + # Verify the workflow was called 
correctly + mock_workflow_instance.add_file_part.assert_called_once_with( + file, None, [{"type": "ocr", "language": "english"}] + ) + mock_workflow_instance.output_pdf.assert_called_once() + mock_output_stage.execute.assert_called_once() - client = NutrientClient(api_key="test-key") - client._http_client = Mock() # Mock the HTTP client to avoid actual API calls + # Verify the result + assert result["buffer"] == b"test-buffer" + assert result["mimeType"] == "application/pdf" - with ( - patch("nutrient_dws.file_handler.get_pdf_page_count") as mock_pdf_page_count, + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_perform_ocr_with_multiple_languages( + self, mock_staged_workflow_builder, unit_client ): - mock_pdf_page_count.return_value = 10 - # Test empty labels list - with pytest.raises(ValueError, match="labels list cannot be empty"): - client.set_page_label("test.pdf", []) + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"test-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "test-file.pdf" + languages = ["english", "spanish"] - # Test invalid label config (not a dict) - with pytest.raises(ValueError, match="Label configuration 0 must be a dictionary"): - client.set_page_label("test.pdf", ["invalid"]) # type: ignore[list-item] + await unit_client.ocr(file, languages) + + # Verify the workflow was called correctly + mock_workflow_instance.add_file_part.assert_called_once_with( + file, None, [{"type": "ocr", "language": ["english", "spanish"]}] + ) + mock_workflow_instance.output_pdf.assert_called_once() + 
mock_output_stage.execute.assert_called_once() - # Test missing 'pages' key - with pytest.raises(ValueError, match="Label configuration 0 missing required 'pages' key"): - client.set_page_label("test.pdf", [{"label": "Test"}]) - # Test missing 'label' key - with pytest.raises(ValueError, match="Label configuration 0 missing required 'label' key"): - client.set_page_label("test.pdf", [{"pages": {"start": 0}}]) +class TestNutrientClientWatermarkText: + """Tests for NutrientClient text watermark functionality.""" + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_add_text_watermark_with_default_options( + self, mock_staged_workflow_builder, unit_client + ): + + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"test-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "test-file.pdf" + text = "CONFIDENTIAL" + + await unit_client.watermark_text(file, text) + + # Check that add_file_part was called with the watermark action + call_args = mock_workflow_instance.add_file_part.call_args + assert call_args[0][0] == file # First positional arg (file) + assert call_args[0][1] is None # Second positional arg (options) + + # Check the watermark action structure + actions = call_args[0][2] # Third positional arg (actions) + assert len(actions) == 1 + watermark_action = actions[0] + assert watermark_action["type"] == "watermark" + assert watermark_action["text"] == "CONFIDENTIAL" + assert watermark_action["width"] == {"value": 100, "unit": "%"} + assert watermark_action["height"] == {"value": 100, "unit": "%"} + + 
mock_workflow_instance.output_pdf.assert_called_once() + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_add_text_watermark_with_custom_options( + self, mock_staged_workflow_builder, unit_client + ): + + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"test-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "test-file.pdf" + text = "DRAFT" + options = { + "opacity": 0.5, + "fontSize": 24, + "fontColor": "#ff0000", + "rotation": 45, + } + + await unit_client.watermark_text(file, text, options) + + # Check that add_file_part was called with the correct watermark action + call_args = mock_workflow_instance.add_file_part.call_args + actions = call_args[0][2] + watermark_action = actions[0] + + assert watermark_action["type"] == "watermark" + assert watermark_action["text"] == "DRAFT" + assert watermark_action["opacity"] == 0.5 + assert watermark_action["fontSize"] == 24 + assert watermark_action["fontColor"] == "#ff0000" + assert watermark_action["rotation"] == 45 + + +class TestNutrientClientWatermarkImage: + """Tests for NutrientClient image watermark functionality.""" + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_add_image_watermark_with_default_options( + self, mock_staged_workflow_builder, unit_client + ): + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"test-buffer", + "mimeType": "application/pdf", + "filename": 
"output.pdf", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "test-file.pdf" + image = "watermark.png" + + await unit_client.watermark_image(file, image) + + # Check that add_file_part was called with the watermark action + call_args = mock_workflow_instance.add_file_part.call_args + assert call_args[0][0] == file + assert call_args[0][1] is None + + # Check the watermark action has the right properties (file input needs registration) + actions = call_args[0][2] + assert len(actions) == 1 + watermark_action = actions[0] + + # Check that it's an action that needs file registration + assert hasattr(watermark_action, "fileInput") + assert hasattr(watermark_action, "createAction") + assert watermark_action.fileInput == "watermark.png" + + mock_workflow_instance.output_pdf.assert_called_once() + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_add_image_watermark_with_custom_options( + self, mock_staged_workflow_builder, unit_client + ): + + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"test-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "test-file.pdf" + image = "watermark.png" + options = {"opacity": 0.5, "rotation": 45} + + await unit_client.watermark_image(file, image, options) + + # Check that add_file_part was called with the watermark action + call_args = mock_workflow_instance.add_file_part.call_args + actions = 
call_args[0][2] + watermark_action = actions[0] + + # Check that it's an action that needs file registration with the right file input + assert hasattr(watermark_action, "fileInput") + assert hasattr(watermark_action, "createAction") + assert watermark_action.fileInput == "watermark.png" + + +class TestNutrientClientMerge: + """Tests for NutrientClient merge functionality.""" + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_merge_multiple_files( + self, mock_staged_workflow_builder, unit_client + ): + + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"test-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + files = ["file1.pdf", "file2.pdf", "file3.pdf"] + + result = await unit_client.merge(files) + + # Check that add_file_part was called for each file + assert mock_workflow_instance.add_file_part.call_count == 3 + mock_workflow_instance.add_file_part.assert_any_call("file1.pdf") + mock_workflow_instance.add_file_part.assert_any_call("file2.pdf") + mock_workflow_instance.add_file_part.assert_any_call("file3.pdf") + + mock_workflow_instance.output_pdf.assert_called_once() + mock_output_stage.execute.assert_called_once() + + # Verify the result + assert result["buffer"] == b"test-buffer" + + @pytest.mark.asyncio + async def test_throw_validation_error_when_less_than_2_files_provided( + self, valid_client_options, unit_client + ): + files = ["file1.pdf"] - # Test invalid pages config (not a dict) with pytest.raises( - ValueError, match="Label configuration 0 'pages' must be a dict with 'start' key" + ValidationError, match="At 
least 2 files are required for merge operation" ): - client.set_page_label("test.pdf", [{"pages": "invalid", "label": "Test"}]) + await unit_client.merge(files) + + @pytest.mark.asyncio + async def test_throw_validation_error_when_empty_array_provided( + self, unit_client + ): + files = [] - # Test missing 'start' key in pages with pytest.raises( - ValueError, match="Label configuration 0 'pages' must be a dict with 'start' key" + ValidationError, match="At least 2 files are required for merge operation" ): - client.set_page_label("test.pdf", [{"pages": {"end": 5}, "label": "Test"}]) - - -def test_set_page_label_valid_config(): - """Test set_page_label with valid configuration.""" - from unittest.mock import Mock, patch - - client = NutrientClient(api_key="test-key") - - # Mock HTTP client and file handler functions - mock_http_client = Mock() - mock_http_client.post.return_value = b"mock_pdf_bytes" - client._http_client = mock_http_client - - with ( - patch("nutrient_dws.file_handler.prepare_file_for_upload") as mock_prepare, - patch("nutrient_dws.file_handler.save_file_output") as mock_save, - patch("nutrient_dws.file_handler.get_pdf_page_count") as mock_pdf_page_count, - ): - mock_prepare.return_value = ("file", ("filename.pdf", b"mock_file_data", "application/pdf")) - mock_pdf_page_count.return_value = 10 - - # Test valid configuration - labels = [ - {"pages": {"start": 0, "end": 2}, "label": "Introduction"}, - {"pages": {"start": 3}, "label": "Content"}, - ] - - result = client.set_page_label("test.pdf", labels) - - # Expected normalized labels (implementation only includes 'end' if explicitly provided) - expected_normalized_labels = [ - {"pages": {"start": 0, "end": 2}, "label": "Introduction"}, - {"pages": {"start": 3}, "label": "Content"}, # No 'end' means to end of document - ] - - # Verify the API call was made with correct parameters - mock_http_client.post.assert_called_once_with( - "/build", - files={"file": ("filename.pdf", b"mock_file_data", 
"application/pdf")}, - json_data={ - "parts": [{"file": "file"}], - "actions": [], - "output": {"labels": expected_normalized_labels}, - }, + await unit_client.merge(files) + + +class TestNutrientClientExtractText: + """Tests for NutrientClient extract text functionality.""" + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_extract_text_from_document( + self, mock_staged_workflow_builder, unit_client + ): + + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "data": {"pages": [{"plainText": "Extracted text content"}]}, + "mimeType": "application/json", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_json.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "test-file.pdf" + + result = await unit_client.extract_text(file) + + # Verify the workflow was called correctly + mock_workflow_instance.add_file_part.assert_called_once_with(file, None) + mock_workflow_instance.output_json.assert_called_once_with( + {"plainText": True, "tables": False} + ) + mock_output_stage.execute.assert_called_once() + + # Verify the result + assert result["data"] == {"pages": [{"plainText": "Extracted text content"}]} + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_extract_text_with_page_range( + self, mock_staged_workflow_builder, unit_client + ): + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "data": {"pages": [{"plainText": "Extracted text content"}]}, + "mimeType": "application/json", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + 
mock_workflow_instance.output_json.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "test-file.pdf" + pages = {"start": 0, "end": 2} + + await unit_client.extract_text(file, pages) + + # Verify the workflow was called with page options + call_args = mock_workflow_instance.add_file_part.call_args + assert call_args[0][0] == file # First positional arg (file) + assert call_args[0][1] == { + "pages": {"start": 0, "end": 2} + } # Second positional arg (part options) + + +class TestNutrientClientExtractTable: + """Tests for NutrientClient extract table functionality.""" + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_extract_table_from_document( + self, mock_staged_workflow_builder, unit_client + ): + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "data": {"pages": [{"tables": [{"rows": [["cell1", "cell2"]]}]}]}, + "mimeType": "application/json", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_json.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "test-file.pdf" + + result = await unit_client.extract_table(file) + + # Verify the workflow was called correctly + mock_workflow_instance.add_file_part.assert_called_once_with(file, None) + mock_workflow_instance.output_json.assert_called_once_with( + {"plainText": False, "tables": True} + ) + mock_output_stage.execute.assert_called_once() + + # Verify the result + assert "tables" in result["data"]["pages"][0] + + +class TestNutrientClientExtractKeyValuePairs: + """Tests for NutrientClient extract key-value pairs functionality.""" + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def 
test_extract_key_value_pairs_from_document( + self, mock_staged_workflow_builder, unit_client + ): + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "data": { + "pages": [ + {"keyValuePairs": [{"key": "Name", "value": "John Doe"}]} + ] + }, + "mimeType": "application/json", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_json.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "test-file.pdf" + + result = await unit_client.extract_key_value_pairs(file) + + # Verify the workflow was called correctly + mock_workflow_instance.add_file_part.assert_called_once_with(file, None) + mock_workflow_instance.output_json.assert_called_once_with( + {"plainText": False, "tables": False, "keyValuePairs": True} ) + mock_output_stage.execute.assert_called_once() + + # Verify the result + assert "keyValuePairs" in result["data"]["pages"][0] + - # Verify result - assert result == b"mock_pdf_bytes" +class TestNutrientClientConvert: + """Tests for NutrientClient convert functionality.""" + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_convert_docx_to_pdf( + self, mock_staged_workflow_builder, unit_client + ): + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"pdf-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + ) - # Verify save_file_output was not called (no output_path) - mock_save.assert_not_called() + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + 
mock_staged_workflow_builder.return_value = mock_workflow_instance + file = "document.docx" + target_format = "pdf" -def test_set_page_label_with_output_path(): - """Test set_page_label with output path.""" - from unittest.mock import Mock, patch + result = await unit_client.convert(file, target_format) - client = NutrientClient(api_key="test-key") + # Verify the workflow was called correctly + mock_workflow_instance.add_file_part.assert_called_once_with(file) + mock_workflow_instance.output_pdf.assert_called_once() + mock_output_stage.execute.assert_called_once() - # Mock HTTP client and file handler functions - mock_http_client = Mock() - mock_http_client.post.return_value = b"mock_pdf_bytes" - client._http_client = mock_http_client + # Verify the result + assert result["buffer"] == b"pdf-buffer" + assert result["mimeType"] == "application/pdf" - with ( - patch("nutrient_dws.file_handler.prepare_file_for_upload") as mock_prepare, - patch("nutrient_dws.file_handler.save_file_output") as mock_save, - patch("nutrient_dws.file_handler.get_pdf_page_count") as mock_pdf_page_count, + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_convert_pdf_to_image( + self, mock_staged_workflow_builder, unit_client ): - mock_prepare.return_value = ("file", ("filename.pdf", b"mock_file_data", "application/pdf")) - mock_pdf_page_count.return_value = 10 + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"png-buffer", + "mimeType": "image/png", + "filename": "output.png", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_image.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "document.pdf" + target_format = "png" + + result = await unit_client.convert(file, 
target_format) + + # Verify the workflow was called correctly + mock_workflow_instance.add_file_part.assert_called_once_with(file) + mock_workflow_instance.output_image.assert_called_once_with("png", {"dpi": 300}) + mock_output_stage.execute.assert_called_once() + + # Verify the result + assert result["buffer"] == b"png-buffer" + assert result["mimeType"] == "image/png" + + @pytest.mark.asyncio + async def test_convert_unsupported_format_throws_error(self, unit_client): + file = "document.pdf" + target_format = "unsupported" + + with pytest.raises( + ValidationError, match="Unsupported target format: unsupported" + ): + await unit_client.convert(file, target_format) + + +class TestNutrientClientPasswordProtect: + """Tests for NutrientClient password protection functionality.""" + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_password_protect_pdf( + self, mock_staged_workflow_builder, unit_client + ): + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"protected-pdf-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "document.pdf" + user_password = "user123" + owner_password = "owner456" + + result = await unit_client.password_protect(file, user_password, owner_password) - labels = [{"pages": {"start": 0, "end": 0}, "label": "Cover"}] + # Verify the workflow was called correctly + mock_workflow_instance.add_file_part.assert_called_once_with(file) - result = client.set_page_label("test.pdf", labels, output_path="/path/to/output.pdf") + # Check the PDF output options + call_args = 
mock_workflow_instance.output_pdf.call_args + pdf_options = call_args[0][0] # First positional argument + assert pdf_options["user_password"] == user_password + assert pdf_options["owner_password"] == owner_password - # Verify the API call was made - mock_http_client.post.assert_called_once() + mock_output_stage.execute.assert_called_once() - # Verify save_file_output was called with correct parameters - mock_save.assert_called_once_with(b"mock_pdf_bytes", "/path/to/output.pdf") + # Verify the result + assert result["buffer"] == b"protected-pdf-buffer" - # Verify result is None when output_path is provided - assert result is None + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_password_protect_pdf_with_permissions( + self, mock_staged_workflow_builder, unit_client + ): + # Setup mock workflow + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"protected-pdf-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + ) + + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + + file = "document.pdf" + user_password = "user123" + owner_password = "owner456" + permissions = ["printing", "extract_accessibility"] + + result = await unit_client.password_protect( + file, user_password, owner_password, permissions + ) + + # Check the PDF output options include permissions + call_args = mock_workflow_instance.output_pdf.call_args + pdf_options = call_args[0][0] + assert pdf_options["user_permissions"] == permissions + + +class TestNutrientClientProcessTypedWorkflowResult: + """Tests for NutrientClient _process_typed_workflow_result method.""" + + def test_process_successful_workflow_result(self, unit_client): + + result = { + 
"success": True, + "output": {"buffer": b"test-buffer", "mimeType": "application/pdf"}, + } + + processed_result = unit_client._process_typed_workflow_result(result) + assert processed_result == result["output"] + + def test_process_failed_workflow_result_with_errors(self, unit_client): + test_error = NutrientError("Test error", "TEST_ERROR") + result = {"success": False, "errors": [{"error": test_error}], "output": None} + + with pytest.raises(NutrientError, match="Test error"): + unit_client._process_typed_workflow_result(result) + + def test_process_failed_workflow_result_without_errors(self, unit_client): + result = {"success": False, "errors": [], "output": None} + + with pytest.raises( + NutrientError, + match="Workflow operation failed without specific error details", + ): + unit_client._process_typed_workflow_result(result) + + def test_process_successful_workflow_result_without_output(self, unit_client): + result = {"success": True, "output": None} + + with pytest.raises( + NutrientError, + match="Workflow completed successfully but no output was returned", + ): + unit_client._process_typed_workflow_result(result) + + +class TestNutrientClientAccountInfo: + """Tests for NutrientClient account info functionality.""" + + @patch("nutrient_dws.client.send_request") + @pytest.mark.asyncio + async def test_get_account_info(self, mock_send_request, valid_client_options, unit_client): + expected_account_info = { + "subscriptionType": "premium", + "remainingCredits": 1000, + } + + mock_send_request.return_value = {"data": expected_account_info, "status": 200} + + result = await unit_client.get_account_info() + + # Verify the request was made correctly + mock_send_request.assert_called_once_with( + { + "method": "GET", + "endpoint": "/account/info", + "data": None, + "headers": None, + }, + valid_client_options, + ) + + # Verify the result + assert result == expected_account_info + + +class TestNutrientClientCreateToken: + """Tests for NutrientClient create token 
functionality.""" + + @patch("nutrient_dws.client.send_request") + @pytest.mark.asyncio + async def test_create_token(self, mock_send_request, valid_client_options, unit_client): + params = {"allowedOperations": ["annotations_api"], "expirationTime": 3600} + + expected_token_response = {"id": "token-123", "token": "jwt-token-string"} + + mock_send_request.return_value = { + "data": expected_token_response, + "status": 200, + } + + result = await unit_client.create_token(params) + + # Verify the request was made correctly + mock_send_request.assert_called_once_with( + {"method": "POST", "endpoint": "/tokens", "data": params, "headers": None}, + valid_client_options, + ) + + # Verify the result + assert result == expected_token_response + + +class TestNutrientClientDeleteToken: + """Tests for NutrientClient delete token functionality.""" + + @patch("nutrient_dws.client.send_request") + @pytest.mark.asyncio + async def test_delete_token(self, mock_send_request, valid_client_options, unit_client): + + token_id = "token-123" + + mock_send_request.return_value = {"data": None, "status": 204} + + await unit_client.delete_token(token_id) + + # Verify the request was made correctly + mock_send_request.assert_called_once_with( + { + "method": "DELETE", + "endpoint": "/tokens", + "data": {"id": token_id}, + "headers": None, + }, + valid_client_options, + ) diff --git a/tests/unit/test_constant.py b/tests/unit/test_constant.py new file mode 100644 index 0000000..6a8afa0 --- /dev/null +++ b/tests/unit/test_constant.py @@ -0,0 +1,487 @@ +"""Tests for BuildActions and BuildOutputs factory functions.""" + +import pytest + +from nutrient_dws.builder.constant import BuildActions, BuildOutputs +from nutrient_dws.inputs import FileInput + + +class TestBuildActions: + """Tests for BuildActions factory functions.""" + + def test_ocr_with_single_language(self): + action = BuildActions.ocr("english") + + assert action == {"type": "ocr", "language": "english"} + + def 
test_ocr_with_multiple_languages(self): + languages = ["english", "spanish"] + action = BuildActions.ocr(languages) + + assert action == {"type": "ocr", "language": ["english", "spanish"]} + + def test_rotate_90_degrees(self): + action = BuildActions.rotate(90) + + assert action == {"type": "rotate", "rotateBy": 90} + + def test_rotate_180_degrees(self): + action = BuildActions.rotate(180) + + assert action == {"type": "rotate", "rotateBy": 180} + + def test_rotate_270_degrees(self): + action = BuildActions.rotate(270) + + assert action == {"type": "rotate", "rotateBy": 270} + + def test_watermark_text_with_minimal_options(self): + default_dimensions = { + "width": {"value": 100, "unit": "%"}, + "height": {"value": 100, "unit": "%"}, + } + + action = BuildActions.watermark_text("CONFIDENTIAL", default_dimensions) + + assert action == { + "type": "watermark", + "text": "CONFIDENTIAL", + "width": {"value": 100, "unit": "%"}, + "height": {"value": 100, "unit": "%"}, + "rotation": 0, + } + + def test_watermark_text_with_all_options(self): + options = { + "width": {"value": 100, "unit": "%"}, + "height": {"value": 100, "unit": "%"}, + "opacity": 0.5, + "rotation": 45, + "fontSize": 24, + "fontColor": "#ff0000", + "fontFamily": "Arial", + "fontStyle": ["bold", "italic"], + "top": {"value": 10, "unit": "pt"}, + "left": {"value": 20, "unit": "pt"}, + "right": {"value": 30, "unit": "pt"}, + "bottom": {"value": 40, "unit": "pt"}, + } + + action = BuildActions.watermark_text("DRAFT", options) + + assert action == { + "type": "watermark", + "text": "DRAFT", + "width": {"value": 100, "unit": "%"}, + "height": {"value": 100, "unit": "%"}, + "opacity": 0.5, + "rotation": 45, + "fontSize": 24, + "fontColor": "#ff0000", + "fontFamily": "Arial", + "fontStyle": ["bold", "italic"], + "top": {"value": 10, "unit": "pt"}, + "left": {"value": 20, "unit": "pt"}, + "right": {"value": 30, "unit": "pt"}, + "bottom": {"value": 40, "unit": "pt"}, + } + + def 
test_watermark_image_with_minimal_options(self): + image = "logo.png" + default_dimensions = { + "width": {"value": 100, "unit": "%"}, + "height": {"value": 100, "unit": "%"}, + } + + action = BuildActions.watermark_image(image, default_dimensions) + + # Check that action requires file registration by having fileInput and createAction method + assert hasattr(action, "fileInput") + assert hasattr(action, "createAction") + assert action.fileInput == "logo.png" + + result = action.createAction("asset_0") + assert result == { + "type": "watermark", + "image": "asset_0", + "width": {"value": 100, "unit": "%"}, + "height": {"value": 100, "unit": "%"}, + "rotation": 0, + } + + def test_watermark_image_with_all_options(self): + image = "watermark.png" + options = { + "width": {"value": 100, "unit": "%"}, + "height": {"value": 100, "unit": "%"}, + "opacity": 0.3, + "rotation": 30, + "top": {"value": 10, "unit": "pt"}, + "left": {"value": 20, "unit": "pt"}, + "right": {"value": 30, "unit": "pt"}, + "bottom": {"value": 40, "unit": "pt"}, + } + + action = BuildActions.watermark_image(image, options) + + # Check that action requires file registration by having fileInput and createAction method + assert hasattr(action, "fileInput") + assert hasattr(action, "createAction") + assert action.fileInput == "watermark.png" + + result = action.createAction("asset_0") + assert result == { + "type": "watermark", + "image": "asset_0", + "width": {"value": 100, "unit": "%"}, + "height": {"value": 100, "unit": "%"}, + "opacity": 0.3, + "rotation": 30, + "top": {"value": 10, "unit": "pt"}, + "left": {"value": 20, "unit": "pt"}, + "right": {"value": 30, "unit": "pt"}, + "bottom": {"value": 40, "unit": "pt"}, + } + + def test_flatten_without_annotation_ids(self): + action = BuildActions.flatten() + + assert action == {"type": "flatten"} + + def test_flatten_with_annotation_ids(self): + annotation_ids = ["ann1", "ann2", 123] + action = BuildActions.flatten(annotation_ids) + + assert action == 
{"type": "flatten", "annotationIds": ["ann1", "ann2", 123]} + + def test_apply_instant_json(self): + file: FileInput = "annotations.json" + action = BuildActions.apply_instant_json(file) + + # Check that action requires file registration by having fileInput and createAction method + assert hasattr(action, "fileInput") + assert hasattr(action, "createAction") + assert action.fileInput == "annotations.json" + + result = action.createAction("asset_0") + assert result == {"type": "applyInstantJson", "file": "asset_0"} + + def test_apply_xfdf(self): + file: FileInput = "annotations.xfdf" + action = BuildActions.apply_xfdf(file) + + # Check that action requires file registration by having fileInput and createAction method + assert hasattr(action, "fileInput") + assert hasattr(action, "createAction") + assert action.fileInput == "annotations.xfdf" + + result = action.createAction("asset_1") + assert result == {"type": "applyXfdf", "file": "asset_1"} + + def test_apply_redactions(self): + action = BuildActions.apply_redactions() + + assert action == {"type": "applyRedactions"} + + def test_create_redactions_text_with_minimal_options(self): + text = "confidential" + action = BuildActions.create_redactions_text(text) + + assert action == { + "type": "createRedactions", + "strategy": "text", + "strategyOptions": {"text": "confidential"}, + } + + def test_create_redactions_text_with_all_options(self): + text = "secret" + options = {} + strategy_options = {"caseSensitive": True, "wholeWord": True} + + action = BuildActions.create_redactions_text(text, options, strategy_options) + + assert action == { + "type": "createRedactions", + "strategy": "text", + "strategyOptions": { + "text": "secret", + "caseSensitive": True, + "wholeWord": True, + }, + } + + def test_create_redactions_regex_with_minimal_options(self): + regex = r"\d{3}-\d{2}-\d{4}" + action = BuildActions.create_redactions_regex(regex) + + assert action == { + "type": "createRedactions", + "strategy": "regex", + 
"strategyOptions": {"regex": r"\d{3}-\d{2}-\d{4}"}, + } + + def test_create_redactions_regex_with_all_options(self): + regex = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}" + options = {} + strategy_options = {"caseSensitive": False} + + action = BuildActions.create_redactions_regex(regex, options, strategy_options) + + assert action == { + "type": "createRedactions", + "strategy": "regex", + "strategyOptions": { + "regex": r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", + "caseSensitive": False, + }, + } + + def test_create_redactions_preset_with_minimal_options(self): + preset = "date" + action = BuildActions.create_redactions_preset(preset) + + assert action == { + "type": "createRedactions", + "strategy": "preset", + "strategyOptions": {"preset": "date"}, + } + + def test_create_redactions_preset_with_all_options(self): + preset = "email-address" + options = {} + strategy_options = {"start": 1} + + action = BuildActions.create_redactions_preset(preset, options, strategy_options) + + assert action == { + "type": "createRedactions", + "strategy": "preset", + "strategyOptions": {"preset": "email-address", "start": 1}, + } + + +class TestBuildOutputs: + """Tests for BuildOutputs factory functions.""" + + def test_pdf_with_no_options(self): + output = BuildOutputs.pdf() + + assert output == {"type": "pdf"} + + def test_pdf_with_all_options(self): + options = { + "metadata": {"title": "Test Document"}, + "labels": [{"pages": [0], "label": "Page I-III"}], + "user_password": "user123", + "owner_password": "owner123", + "user_permissions": ["print"], + "optimize": {"print": True}, + } + + output = BuildOutputs.pdf(options) + + assert output == { + "type": "pdf", + "metadata": {"title": "Test Document"}, + "labels": [{"pages": [0], "label": "Page I-III"}], + "user_password": "user123", + "owner_password": "owner123", + "user_permissions": ["print"], + "optimize": {"print": True}, + } + + def test_pdfa_with_no_options(self): + output = BuildOutputs.pdfa() + + assert output == 
{"type": "pdfa"} + + def test_pdfa_with_all_options(self): + options = { + "conformance": "pdfa-1b", + "vectorization": True, + "rasterization": False, + "metadata": {"title": "Test Document"}, + "user_password": "user123", + "owner_password": "owner123", + } + + output = BuildOutputs.pdfa(options) + + assert output == { + "type": "pdfa", + "conformance": "pdfa-1b", + "vectorization": True, + "rasterization": False, + "metadata": {"title": "Test Document"}, + "user_password": "user123", + "owner_password": "owner123", + } + + def test_image_with_default_options(self): + output = BuildOutputs.image("png") + + assert output == {"type": "image", "format": "png"} + + def test_image_with_custom_options(self): + options = {"dpi": 300, "pages": {"start": 1, "end": 5}} + + output = BuildOutputs.image("png", options) + + assert output == { + "type": "image", + "format": "png", + "dpi": 300, + "pages": {"start": 1, "end": 5}, + } + + def test_pdfua_with_no_options(self): + output = BuildOutputs.pdfua() + + assert output == {"type": "pdfua"} + + def test_pdfua_with_all_options(self): + options = { + "metadata": {"title": "Accessible Document"}, + "labels": [{"pages": [0], "label": "Cover Page"}], + "user_password": "user123", + "owner_password": "owner123", + "user_permissions": ["print"], + "optimize": {"print": True}, + } + + output = BuildOutputs.pdfua(options) + + assert output == { + "type": "pdfua", + "metadata": {"title": "Accessible Document"}, + "labels": [{"pages": [0], "label": "Cover Page"}], + "user_password": "user123", + "owner_password": "owner123", + "user_permissions": ["print"], + "optimize": {"print": True}, + } + + def test_json_content_with_default_options(self): + output = BuildOutputs.jsonContent() + + assert output == {"type": "json-content"} + + def test_json_content_with_custom_options(self): + options = { + "plainText": False, + "structuredText": True, + "keyValuePairs": True, + "tables": False, + "language": "english", + } + + output = 
BuildOutputs.jsonContent(options) + + assert output == { + "type": "json-content", + "plainText": False, + "structuredText": True, + "keyValuePairs": True, + "tables": False, + "language": "english", + } + + def test_office_docx(self): + output = BuildOutputs.office("docx") + + assert output == {"type": "docx"} + + def test_office_xlsx(self): + output = BuildOutputs.office("xlsx") + + assert output == {"type": "xlsx"} + + def test_office_pptx(self): + output = BuildOutputs.office("pptx") + + assert output == {"type": "pptx"} + + def test_html_with_page_layout(self): + output = BuildOutputs.html("page") + + assert output == {"type": "html", "layout": "page"} + + def test_html_with_reflow_layout(self): + output = BuildOutputs.html("reflow") + + assert output == {"type": "html", "layout": "reflow"} + + def test_markdown(self): + output = BuildOutputs.markdown() + + assert output == {"type": "markdown"} + + def test_get_mime_type_for_pdf_output(self): + output = BuildOutputs.pdf() + result = BuildOutputs.getMimeTypeForOutput(output) + + assert result == {"mimeType": "application/pdf", "filename": "output.pdf"} + + def test_get_mime_type_for_pdfa_output(self): + output = BuildOutputs.pdfa() + result = BuildOutputs.getMimeTypeForOutput(output) + + assert result == {"mimeType": "application/pdf", "filename": "output.pdf"} + + def test_get_mime_type_for_pdfua_output(self): + output = BuildOutputs.pdfua() + result = BuildOutputs.getMimeTypeForOutput(output) + + assert result == {"mimeType": "application/pdf", "filename": "output.pdf"} + + def test_get_mime_type_for_image_output_with_custom_format(self): + output = BuildOutputs.image("jpeg") + result = BuildOutputs.getMimeTypeForOutput(output) + + assert result == {"mimeType": "image/jpeg", "filename": "output.jpeg"} + + def test_get_mime_type_for_docx_output(self): + output = BuildOutputs.office("docx") + result = BuildOutputs.getMimeTypeForOutput(output) + + assert result == { + "mimeType": 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "filename": "output.docx", + } + + def test_get_mime_type_for_xlsx_output(self): + output = BuildOutputs.office("xlsx") + result = BuildOutputs.getMimeTypeForOutput(output) + + assert result == { + "mimeType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "filename": "output.xlsx", + } + + def test_get_mime_type_for_pptx_output(self): + output = BuildOutputs.office("pptx") + result = BuildOutputs.getMimeTypeForOutput(output) + + assert result == { + "mimeType": "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "filename": "output.pptx", + } + + def test_get_mime_type_for_html_output(self): + output = BuildOutputs.html("page") + result = BuildOutputs.getMimeTypeForOutput(output) + + assert result == {"mimeType": "text/html", "filename": "output.html"} + + def test_get_mime_type_for_markdown_output(self): + output = BuildOutputs.markdown() + result = BuildOutputs.getMimeTypeForOutput(output) + + assert result == {"mimeType": "text/markdown", "filename": "output.md"} + + def test_get_mime_type_for_unknown_output(self): + # Create an output with unknown type + unknown_output = {"type": "unknown"} + result = BuildOutputs.getMimeTypeForOutput(unknown_output) + + assert result == {"mimeType": "application/octet-stream", "filename": "output"} diff --git a/tests/unit/test_direct_api.py b/tests/unit/test_direct_api.py deleted file mode 100644 index 9284df9..0000000 --- a/tests/unit/test_direct_api.py +++ /dev/null @@ -1,479 +0,0 @@ -"""Comprehensive unit tests for Direct API methods.""" - -import tempfile -from typing import BinaryIO, cast -from unittest.mock import Mock, patch - -import pytest - -from nutrient_dws.client import NutrientClient - - -class TestDirectAPIMethods: - """Test suite for Direct API methods.""" - - def setup_method(self): - """Set up test fixtures.""" - self.client = NutrientClient(api_key="test-key") - self.mock_response 
= b"mocked-pdf-content" - - @patch("nutrient_dws.client.NutrientClient.build") - def test_convert_to_pdf_with_bytes_return(self, mock_build): - """Test convert_to_pdf returns bytes when no output_path.""" - mock_builder = Mock() - mock_builder.execute.return_value = self.mock_response - mock_build.return_value = mock_builder - - result = self.client.convert_to_pdf(b"test content") - - assert result == self.mock_response - mock_build.assert_called_once_with(b"test content") - mock_builder.execute.assert_called_once_with(None) - - @patch("nutrient_dws.client.NutrientClient.build") - def test_convert_to_pdf_with_output_path(self, mock_build): - """Test convert_to_pdf saves to file when output_path provided.""" - mock_builder = Mock() - mock_builder.execute.return_value = None - mock_build.return_value = mock_builder - - result = self.client.convert_to_pdf("input.docx", "output.pdf") - - assert result is None - mock_build.assert_called_once_with("input.docx") - mock_builder.execute.assert_called_once_with("output.pdf") - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_flatten_annotations(self, mock_process): - """Test flatten_annotations method.""" - mock_process.return_value = self.mock_response - - result = self.client.flatten_annotations("test.pdf") - - assert result == self.mock_response - mock_process.assert_called_once_with("flatten-annotations", "test.pdf", None) - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_flatten_annotations_with_output_path(self, mock_process): - """Test flatten_annotations with output path.""" - mock_process.return_value = None - - result = self.client.flatten_annotations("test.pdf", "output.pdf") - - assert result is None - mock_process.assert_called_once_with("flatten-annotations", "test.pdf", "output.pdf") - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_rotate_pages_default_params(self, mock_process): - """Test rotate_pages with default parameters.""" - 
mock_process.return_value = self.mock_response - - result = self.client.rotate_pages("test.pdf") - - assert result == self.mock_response - mock_process.assert_called_once_with("rotate-pages", "test.pdf", None, degrees=0) - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_rotate_pages_with_degrees(self, mock_process): - """Test rotate_pages with specific degrees.""" - mock_process.return_value = self.mock_response - - result = self.client.rotate_pages("test.pdf", degrees=90) - - assert result == self.mock_response - mock_process.assert_called_once_with("rotate-pages", "test.pdf", None, degrees=90) - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_rotate_pages_with_page_indexes(self, mock_process): - """Test rotate_pages with specific page indexes.""" - mock_process.return_value = self.mock_response - - result = self.client.rotate_pages("test.pdf", degrees=180, page_indexes=[0, 2, 4]) - - assert result == self.mock_response - mock_process.assert_called_once_with( - "rotate-pages", "test.pdf", None, degrees=180, page_indexes=[0, 2, 4] - ) - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_ocr_pdf_default_language(self, mock_process): - """Test ocr_pdf with default language.""" - mock_process.return_value = self.mock_response - - result = self.client.ocr_pdf("test.pdf") - - assert result == self.mock_response - mock_process.assert_called_once_with("ocr-pdf", "test.pdf", None, language="english") - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_ocr_pdf_custom_language(self, mock_process): - """Test ocr_pdf with custom language.""" - mock_process.return_value = self.mock_response - - result = self.client.ocr_pdf("test.pdf", language="german") - - assert result == self.mock_response - mock_process.assert_called_once_with("ocr-pdf", "test.pdf", None, language="german") - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_watermark_pdf_with_text(self, 
mock_process): - """Test watermark_pdf with text watermark.""" - mock_process.return_value = self.mock_response - - result = self.client.watermark_pdf("test.pdf", text="CONFIDENTIAL") - - assert result == self.mock_response - mock_process.assert_called_once_with( - "watermark-pdf", - "test.pdf", - None, - text="CONFIDENTIAL", - width=200, - height=100, - opacity=1.0, - position="center", - ) - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_watermark_pdf_with_image_url(self, mock_process): - """Test watermark_pdf with image URL.""" - mock_process.return_value = self.mock_response - - result = self.client.watermark_pdf( - "test.pdf", - image_url="https://example.com/logo.png", - width=150, - height=75, - opacity=0.5, - position="top-right", - ) - - assert result == self.mock_response - mock_process.assert_called_once_with( - "watermark-pdf", - "test.pdf", - None, - image_url="https://example.com/logo.png", - width=150, - height=75, - opacity=0.5, - position="top-right", - ) - - def test_watermark_pdf_no_text_or_image_raises_error(self): - """Test watermark_pdf raises ValueError when neither text nor image_url provided.""" - err_msg = "Either text, image_url, or image_file must be provided" - with pytest.raises(ValueError, match=err_msg): - self.client.watermark_pdf("test.pdf") - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_apply_redactions(self, mock_process): - """Test apply_redactions method.""" - mock_process.return_value = self.mock_response - - result = self.client.apply_redactions("test.pdf") - - assert result == self.mock_response - mock_process.assert_called_once_with("apply-redactions", "test.pdf", None) - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_apply_redactions_with_output_path(self, mock_process): - """Test apply_redactions with output path.""" - mock_process.return_value = None - - result = self.client.apply_redactions("test.pdf", "redacted.pdf") - - assert result is None 
- mock_process.assert_called_once_with("apply-redactions", "test.pdf", "redacted.pdf") - - @patch("nutrient_dws.file_handler.prepare_file_for_upload") - @patch("nutrient_dws.file_handler.save_file_output") - def test_merge_pdfs_returns_bytes(self, mock_save, mock_prepare): - """Test merge_pdfs returns bytes when no output_path.""" - # Mock file preparation - mock_prepare.side_effect = [ - ("file0", ("file0", b"content1", "application/pdf")), - ("file1", ("file1", b"content2", "application/pdf")), - ] - - # Mock HTTP client - self.client._http_client.post = Mock(return_value=self.mock_response) # type: ignore - - result = self.client.merge_pdfs(["file1.pdf", "file2.pdf"]) # type: ignore[arg-type] - - assert result == self.mock_response - assert mock_prepare.call_count == 2 - mock_save.assert_not_called() - - # Verify HTTP client was called correctly - self.client._http_client.post.assert_called_once() - call_args = self.client._http_client.post.call_args - assert call_args[0][0] == "/build" - assert "files" in call_args[1] - assert "json_data" in call_args[1] - - @patch("nutrient_dws.file_handler.prepare_file_for_upload") - @patch("nutrient_dws.file_handler.save_file_output") - def test_merge_pdfs_saves_to_file(self, mock_save, mock_prepare): - """Test merge_pdfs saves to file when output_path provided.""" - # Mock file preparation - mock_prepare.side_effect = [ - ("file0", ("file0", b"content1", "application/pdf")), - ("file1", ("file1", b"content2", "application/pdf")), - ] - - # Mock HTTP client - self.client._http_client.post = Mock(return_value=self.mock_response) # type: ignore - - result = self.client.merge_pdfs(["file1.pdf", "file2.pdf"], "merged.pdf") - - assert result is None - mock_save.assert_called_once_with(self.mock_response, "merged.pdf") - - def test_merge_pdfs_insufficient_files_raises_error(self): - """Test merge_pdfs raises ValueError when less than 2 files provided.""" - with pytest.raises(ValueError, match="At least 2 files required for 
merge"): - self.client.merge_pdfs(["single_file.pdf"]) - - with pytest.raises(ValueError, match="At least 2 files required for merge"): - self.client.merge_pdfs([]) - - @patch("nutrient_dws.file_handler.prepare_file_for_upload") - def test_merge_pdfs_multiple_files(self, mock_prepare): - """Test merge_pdfs with multiple files.""" - # Mock file preparation for 3 files - mock_prepare.side_effect = [ - ("file0", ("file0", b"content1", "application/pdf")), - ("file1", ("file1", b"content2", "application/pdf")), - ("file2", ("file2", b"content3", "application/pdf")), - ] - - # Mock HTTP client - self.client._http_client.post = Mock(return_value=self.mock_response) # type: ignore - - files = ["file1.pdf", "file2.pdf", "file3.pdf"] - result = self.client.merge_pdfs(files) # type: ignore[arg-type] - - assert result == self.mock_response - assert mock_prepare.call_count == 3 - - # Verify the instruction structure - call_args = self.client._http_client.post.call_args - json_data = call_args[1]["json_data"] - assert len(json_data["parts"]) == 3 - assert json_data["parts"][0] == {"file": "file0"} - assert json_data["parts"][1] == {"file": "file1"} - assert json_data["parts"][2] == {"file": "file2"} - assert json_data["actions"] == [] - - -class TestDirectAPIFileTypes: - """Test Direct API methods with different file input types.""" - - def setup_method(self): - """Set up test fixtures.""" - self.client = NutrientClient(api_key="test-key") - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_direct_api_with_file_path(self, mock_process): - """Test Direct API methods with file path input.""" - mock_process.return_value = b"result" - - self.client.flatten_annotations("/path/to/file.pdf") - mock_process.assert_called_once_with("flatten-annotations", "/path/to/file.pdf", None) - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_direct_api_with_bytes_input(self, mock_process): - """Test Direct API methods with bytes input.""" - 
mock_process.return_value = b"result" - file_content = b"PDF content here" - - self.client.ocr_pdf(file_content) - mock_process.assert_called_once_with("ocr-pdf", file_content, None, language="english") - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_direct_api_with_file_like_object(self, mock_process): - """Test Direct API methods with file-like object.""" - mock_process.return_value = b"result" - - with tempfile.NamedTemporaryFile() as temp_file: - temp_file.write(b"test content") - temp_file.seek(0) - - self.client.rotate_pages(cast("BinaryIO", temp_file), degrees=90) - mock_process.assert_called_once_with( - "rotate-pages", cast("BinaryIO", temp_file), None, degrees=90 - ) - - -class TestDirectAPIErrorHandling: - """Test error handling in Direct API methods.""" - - def setup_method(self): - """Set up test fixtures.""" - self.client = NutrientClient(api_key="test-key") - - def test_watermark_pdf_validation_error(self): - """Test watermark_pdf parameter validation.""" - err_msg = "Either text, image_url, or image_file must be provided" - - # Test missing text, image_url, and image_file - with pytest.raises(ValueError, match=err_msg): - self.client.watermark_pdf("test.pdf") - - # Test empty text and no image_url or image_file - with pytest.raises(ValueError, match=err_msg): - self.client.watermark_pdf("test.pdf", text="") - - # Test None text and no image_url or image_file - with pytest.raises(ValueError, match=err_msg): - self.client.watermark_pdf("test.pdf", text=None) - - def test_merge_pdfs_validation_error(self): - """Test merge_pdfs parameter validation.""" - # Test empty list - with pytest.raises(ValueError, match="At least 2 files required for merge"): - self.client.merge_pdfs([]) - - # Test single file - with pytest.raises(ValueError, match="At least 2 files required for merge"): - self.client.merge_pdfs(["single.pdf"]) - - @patch("nutrient_dws.client.NutrientClient._process_file") - def 
test_direct_api_propagates_exceptions(self, mock_process): - """Test that Direct API methods propagate exceptions from _process_file.""" - from nutrient_dws.exceptions import APIError - - mock_process.side_effect = APIError("API error", 400, "Bad request") - - with pytest.raises(APIError): - self.client.flatten_annotations("test.pdf") - - @patch("nutrient_dws.client.NutrientClient.build") - def test_convert_to_pdf_propagates_exceptions(self, mock_build): - """Test that convert_to_pdf propagates exceptions from build().execute().""" - from nutrient_dws.exceptions import AuthenticationError - - mock_builder = Mock() - mock_builder.execute.side_effect = AuthenticationError("Invalid API key") - mock_build.return_value = mock_builder - - with pytest.raises(AuthenticationError): - self.client.convert_to_pdf("test.docx") - - -class TestDirectAPIBoundaryConditions: - """Test boundary conditions and edge cases for Direct API methods.""" - - def setup_method(self): - """Set up test fixtures.""" - self.client = NutrientClient(api_key="test-key") - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_rotate_pages_boundary_degrees(self, mock_process): - """Test rotate_pages with boundary degree values.""" - mock_process.return_value = b"result" - - # Test valid degree values - for degrees in [90, 180, 270, -90]: - self.client.rotate_pages("test.pdf", degrees=degrees) - mock_process.assert_called_with("rotate-pages", "test.pdf", None, degrees=degrees) - - # Test zero degrees (no rotation) - self.client.rotate_pages("test.pdf", degrees=0) - mock_process.assert_called_with("rotate-pages", "test.pdf", None, degrees=0) - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_watermark_pdf_boundary_opacity(self, mock_process): - """Test watermark_pdf with boundary opacity values.""" - mock_process.return_value = b"result" - - # Test minimum opacity - self.client.watermark_pdf("test.pdf", text="TEST", opacity=0.0) - mock_process.assert_called_with( 
- "watermark-pdf", - "test.pdf", - None, - text="TEST", - width=200, - height=100, - opacity=0.0, - position="center", - ) - - # Test maximum opacity - self.client.watermark_pdf("test.pdf", text="TEST", opacity=1.0) - mock_process.assert_called_with( - "watermark-pdf", - "test.pdf", - None, - text="TEST", - width=200, - height=100, - opacity=1.0, - position="center", - ) - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_watermark_pdf_all_positions(self, mock_process): - """Test watermark_pdf with all valid position values.""" - mock_process.return_value = b"result" - - positions = [ - "top-left", - "top-center", - "top-right", - "center", - "bottom-left", - "bottom-center", - "bottom-right", - ] - - for position in positions: - self.client.watermark_pdf("test.pdf", text="TEST", position=position) - mock_process.assert_called_with( - "watermark-pdf", - "test.pdf", - None, - text="TEST", - width=200, - height=100, - opacity=1.0, - position=position, - ) - - @patch("nutrient_dws.client.NutrientClient._process_file") - def test_ocr_pdf_all_languages(self, mock_process): - """Test ocr_pdf with all supported languages.""" - mock_process.return_value = b"result" - - languages = ["english", "eng", "deu", "german"] - - for language in languages: - self.client.ocr_pdf("test.pdf", language=language) - mock_process.assert_called_with("ocr-pdf", "test.pdf", None, language=language) - - @patch("nutrient_dws.file_handler.prepare_file_for_upload") - def test_merge_pdfs_maximum_files(self, mock_prepare): - """Test merge_pdfs with many files.""" - # Create 10 files to test performance with larger lists - files = [f"file{i}.pdf" for i in range(10)] - - # Mock file preparation - mock_prepare.side_effect = [ - (f"file{i}", (f"file{i}", f"content{i}".encode(), "application/pdf")) for i in range(10) - ] - - # Mock HTTP client - self.client._http_client.post = Mock(return_value=b"merged_result") # type: ignore - - result = self.client.merge_pdfs(files) # type: 
ignore[arg-type] - - assert result == b"merged_result" - assert mock_prepare.call_count == 10 - - # Verify instruction structure - call_args = self.client._http_client.post.call_args - json_data = call_args[1]["json_data"] - assert len(json_data["parts"]) == 10 - assert json_data["actions"] == [] diff --git a/tests/unit/test_errors.py b/tests/unit/test_errors.py new file mode 100644 index 0000000..05e7c6c --- /dev/null +++ b/tests/unit/test_errors.py @@ -0,0 +1,233 @@ +"""Simple error tests for actual functionality.""" + +import json +import pytest + +from nutrient_dws.errors import ( + APIError, + AuthenticationError, + NetworkError, + NutrientError, + ValidationError, +) + + +class TestNutrientError: + def test_create_base_error_with_message_and_code(self): + error = NutrientError("Test error", "TEST_ERROR") + + assert error.message == "Test error" + assert error.code == "TEST_ERROR" + assert error.__class__.__name__ == "NutrientError" + assert hasattr(error, "__traceback__") + assert error.details is None + assert error.status_code is None + + def test_include_details_when_provided(self): + details = {"foo": "bar", "baz": 123} + error = NutrientError("Test error", "TEST_ERROR", details) + + assert error.details == details + + def test_include_status_code_when_provided(self): + error = NutrientError("Test error", "TEST_ERROR", {"foo": "bar"}, 400) + + assert error.status_code == 400 + + def test_is_instance_of_exception(self): + error = NutrientError("Test error", "TEST_ERROR") + + assert isinstance(error, Exception) + assert isinstance(error, NutrientError) + + +class TestValidationError: + def test_create_validation_error_with_default_code(self): + error = ValidationError("Invalid input") + + assert error.message == "Invalid input" + assert error.code == "VALIDATION_ERROR" + assert error.__class__.__name__ == "ValidationError" + + def test_inherit_from_nutrient_error(self): + error = ValidationError("Invalid input") + + assert isinstance(error, Exception) + 
assert isinstance(error, NutrientError) + assert isinstance(error, ValidationError) + + def test_accept_details_and_status_code(self): + details = {"field": "email", "reason": "invalid format"} + error = ValidationError("Invalid input", details, 422) + + assert error.details == details + assert error.status_code == 422 + + +class TestAPIError: + def test_create_api_error_with_status_code(self): + error = APIError("Server error", 500) + + assert error.message == "Server error" + assert error.code == "API_ERROR" + assert error.__class__.__name__ == "APIError" + assert error.status_code == 500 + + def test_inherit_from_nutrient_error(self): + error = APIError("Server error", 500) + + assert isinstance(error, Exception) + assert isinstance(error, NutrientError) + assert isinstance(error, APIError) + + def test_accept_details(self): + details = {"endpoint": "/convert", "method": "POST"} + error = APIError("Server error", 500, details) + + assert error.details == details + + +class TestAuthenticationError: + def test_create_authentication_error_with_default_code(self): + error = AuthenticationError("Invalid API key") + + assert error.message == "Invalid API key" + assert error.code == "AUTHENTICATION_ERROR" + assert error.__class__.__name__ == "AuthenticationError" + + def test_inherit_from_nutrient_error(self): + error = AuthenticationError("Invalid API key") + + assert isinstance(error, Exception) + assert isinstance(error, NutrientError) + assert isinstance(error, AuthenticationError) + + def test_accept_details_and_status_code(self): + details = {"reason": "expired token"} + error = AuthenticationError("Invalid API key", details, 401) + + assert error.details == details + assert error.status_code == 401 + + +class TestNetworkError: + def test_create_network_error_with_default_code(self): + error = NetworkError("Connection failed") + + assert error.message == "Connection failed" + assert error.code == "NETWORK_ERROR" + assert error.__class__.__name__ == "NetworkError" 
+ + def test_inherit_from_nutrient_error(self): + error = NetworkError("Connection failed") + + assert isinstance(error, Exception) + assert isinstance(error, NutrientError) + assert isinstance(error, NetworkError) + + def test_accept_details(self): + details = {"timeout": 30000, "endpoint": "https://api.nutrient.io"} + error = NetworkError("Connection failed", details) + + assert error.details == details + + +class TestErrorSerialization: + def test_serialize_to_json_correctly(self): + error = ValidationError("Invalid input", {"field": "email"}, 422) + error_dict = { + "message": error.message, + "code": error.code, + "name": error.__class__.__name__, + "details": error.details, + "status_code": error.status_code, + } + json_str = json.dumps(error_dict) + parsed = json.loads(json_str) + + assert parsed["message"] == "Invalid input" + assert parsed["code"] == "VALIDATION_ERROR" + assert parsed["name"] == "ValidationError" + assert parsed["details"] == {"field": "email"} + assert parsed["status_code"] == 422 + + def test_maintain_error_properties_when_caught(self): + def throw_and_catch(): + try: + raise APIError("Test error", 500, {"foo": "bar"}) + except Exception as e: + return e + + error = throw_and_catch() + assert error is not None + assert error.message == "Test error" + assert error.code == "API_ERROR" + assert error.status_code == 500 + assert error.details == {"foo": "bar"} + + +class TestToStringMethod: + def test_format_error_with_default_code(self): + error = NutrientError("Test error") + assert str(error) == "NutrientError: Test error" + + def test_include_custom_code_when_provided(self): + error = NutrientError("Test error", "CUSTOM_CODE") + assert str(error) == "NutrientError: Test error (CUSTOM_CODE)" + + def test_include_status_code_when_provided(self): + error = NutrientError("Test error", "CUSTOM_CODE", {}, 404) + assert str(error) == "NutrientError: Test error (CUSTOM_CODE) [HTTP 404]" + + def test_include_status_code_without_custom_code(self): 
+ error = NutrientError("Test error", "NUTRIENT_ERROR", {}, 500) + assert str(error) == "NutrientError: Test error [HTTP 500]" + + +class TestWrapMethod: + def test_return_original_error_if_nutrient_error(self): + original_error = ValidationError("Original error") + wrapped_error = NutrientError.wrap(original_error) + + assert wrapped_error is original_error + + def test_wrap_standard_exception_instances(self): + original_error = Exception("Standard error") + wrapped_error = NutrientError.wrap(original_error) + + assert isinstance(wrapped_error, NutrientError) + assert wrapped_error.message == "Standard error" + assert wrapped_error.code == "WRAPPED_ERROR" + assert wrapped_error.details == { + "originalError": "Exception", + "originalMessage": "Standard error", + "stack": None, + } + + def test_wrap_standard_exception_instances_with_custom_message(self): + original_error = Exception("Standard error") + wrapped_error = NutrientError.wrap(original_error, "Custom prefix") + + assert isinstance(wrapped_error, NutrientError) + assert wrapped_error.message == "Custom prefix: Standard error" + assert wrapped_error.code == "WRAPPED_ERROR" + + def test_handle_non_exception_objects(self): + wrapped_error = NutrientError.wrap("String error") + + assert isinstance(wrapped_error, NutrientError) + assert wrapped_error.message == "An unknown error occurred" + assert wrapped_error.code == "UNKNOWN_ERROR" + assert wrapped_error.details == { + "originalError": "String error", + } + + def test_handle_non_exception_objects_with_custom_message(self): + wrapped_error = NutrientError.wrap(None, "Custom message") + + assert isinstance(wrapped_error, NutrientError) + assert wrapped_error.message == "Custom message" + assert wrapped_error.code == "UNKNOWN_ERROR" + assert wrapped_error.details == { + "originalError": "None", + } diff --git a/tests/unit/test_exceptions.py b/tests/unit/test_exceptions.py deleted file mode 100644 index 862bf7c..0000000 --- a/tests/unit/test_exceptions.py +++ 
/dev/null @@ -1,86 +0,0 @@ -"""Unit tests for exceptions module.""" - -from nutrient_dws.exceptions import ( - APIError, - AuthenticationError, - FileProcessingError, - NutrientError, - NutrientTimeoutError, - ValidationError, -) - - -def test_nutrient_error(): - """Test base exception.""" - exc = NutrientError("Test error") - assert str(exc) == "Test error" - assert isinstance(exc, Exception) - - -def test_authentication_error(): - """Test authentication error.""" - exc = AuthenticationError("Invalid API key") - assert str(exc) == "Invalid API key" - assert isinstance(exc, NutrientError) - - -def test_api_error_basic(): - """Test API error without additional context.""" - exc = APIError("API request failed") - assert str(exc) == "API request failed" - assert isinstance(exc, NutrientError) - assert exc.status_code is None - assert exc.response_body is None - assert exc.request_id is None - - -def test_api_error_with_status(): - """Test API error with status code.""" - exc = APIError("Not found", status_code=404) - assert exc.status_code == 404 - assert "Status: 404" in str(exc) - - -def test_api_error_full_context(): - """Test API error with all context.""" - exc = APIError( - "Server error", - status_code=500, - response_body='{"error": "Internal server error"}', - request_id="req-123", - ) - assert exc.status_code == 500 - assert exc.response_body == '{"error": "Internal server error"}' - assert exc.request_id == "req-123" - assert "Status: 500" in str(exc) - assert "Request ID: req-123" in str(exc) - assert "Response:" in str(exc) - - -def test_validation_error(): - """Test validation error.""" - exc = ValidationError("Invalid input") - assert str(exc) == "Invalid input" - assert isinstance(exc, NutrientError) - assert exc.errors == {} - - -def test_validation_error_with_details(): - """Test validation error with error details.""" - errors = {"field": "Invalid value"} - exc = ValidationError("Validation failed", errors=errors) - assert exc.errors == errors - - 
-def test_timeout_error(): - """Test timeout error.""" - exc = NutrientTimeoutError("Request timed out") - assert str(exc) == "Request timed out" - assert isinstance(exc, NutrientError) - - -def test_file_processing_error(): - """Test file processing error.""" - exc = FileProcessingError("Failed to process file") - assert str(exc) == "Failed to process file" - assert isinstance(exc, NutrientError) diff --git a/tests/unit/test_file_handler.py b/tests/unit/test_file_handler.py deleted file mode 100644 index d834bd3..0000000 --- a/tests/unit/test_file_handler.py +++ /dev/null @@ -1,516 +0,0 @@ -"""Comprehensive unit tests for file handling utilities.""" - -import io -import os -import tempfile -from pathlib import Path -from typing import BinaryIO, cast -from unittest.mock import Mock, patch - -import pytest - -from nutrient_dws.file_handler import ( - DEFAULT_CHUNK_SIZE, - get_file_size, - prepare_file_for_upload, - prepare_file_input, - save_file_output, - stream_file_content, -) - - -class TestPrepareFileInput: - """Test suite for prepare_file_input function.""" - - def test_prepare_file_input_from_bytes(self): - """Test preparing file input from bytes.""" - content = b"Hello, World!" 
- result, filename = prepare_file_input(content) - assert result == content - assert filename == "document" - - def test_prepare_file_input_from_string_io(self): - """Test preparing file input from StringIO-like object.""" - # Using BytesIO instead of StringIO for binary compatibility - content = b"Test content" - file_obj = io.BytesIO(content) - result, filename = prepare_file_input(file_obj) - assert result == content - assert filename == "document" - - def test_prepare_file_input_from_file_path_string(self): - """Test preparing file input from file path string.""" - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - content = b"Test file content" - temp_file.write(content) - temp_file.flush() - - try: - result, filename = prepare_file_input(temp_file.name) - assert result == content - assert filename == os.path.basename(temp_file.name) - finally: - os.unlink(temp_file.name) - - def test_prepare_file_input_from_pathlib_path(self): - """Test preparing file input from pathlib.Path object.""" - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - content = b"Test pathlib content" - temp_file.write(content) - temp_file.flush() - - try: - path = Path(temp_file.name) - result, filename = prepare_file_input(path) - assert result == content - assert filename == path.name - finally: - os.unlink(temp_file.name) - - def test_prepare_file_input_from_file_handle(self): - """Test preparing file input from file handle.""" - with tempfile.NamedTemporaryFile() as temp_file: - content = b"File handle content" - temp_file.write(content) - temp_file.seek(0) - - result, filename = prepare_file_input(cast("BinaryIO", temp_file)) - assert result == content - assert filename == os.path.basename(temp_file.name) - - def test_prepare_file_input_from_string_file_handle(self): - """Test preparing file input from file handle with string content.""" - string_content = "String content" - string_file = io.StringIO(string_content) - string_file.name = "test.txt" - - result, 
filename = prepare_file_input(cast("BinaryIO", string_file)) - assert result == string_content.encode() - assert filename == "test.txt" - - def test_prepare_file_input_file_not_found_string(self): - """Test FileNotFoundError for non-existent file path string.""" - with pytest.raises(FileNotFoundError, match="File not found: /non/existent/file.txt"): - prepare_file_input("/non/existent/file.txt") - - def test_prepare_file_input_file_not_found_path(self): - """Test FileNotFoundError for non-existent pathlib.Path.""" - path = Path("/non/existent/file.txt") - with pytest.raises(FileNotFoundError, match="File not found:"): - prepare_file_input(path) - - def test_prepare_file_input_unsupported_type(self): - """Test ValueError for unsupported input type.""" - with pytest.raises(ValueError, match="Unsupported file input type"): - prepare_file_input(123) # type: ignore - - def test_prepare_file_input_file_handle_with_path_name(self): - """Test file handle with path-like name attribute.""" - with tempfile.NamedTemporaryFile() as temp_file: - content = b"Content with path name" - temp_file.write(content) - temp_file.seek(0) - - # Mock the name to be a path-like object - temp_file.name = Path(temp_file.name) # type: ignore - - result, filename = prepare_file_input(cast("BinaryIO", temp_file)) - assert result == content - assert filename == os.path.basename(str(temp_file.name)) - - -class TestPrepareFileForUpload: - """Test suite for prepare_file_for_upload function.""" - - def test_prepare_file_for_upload_small_file(self): - """Test preparing small file for upload (loads into memory).""" - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - content = b"Small file content" - temp_file.write(content) - temp_file.flush() - - try: - field_name, (filename, file_content, content_type) = prepare_file_for_upload( - temp_file.name, "test_field" - ) - - assert field_name == "test_field" - assert filename == os.path.basename(temp_file.name) - assert file_content == content - 
assert content_type == "application/octet-stream" - finally: - os.unlink(temp_file.name) - - def test_prepare_file_for_upload_large_file(self): - """Test preparing large file for upload (uses file handle).""" - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - # Create a file larger than 10MB threshold - large_content = b"x" * (11 * 1024 * 1024) # 11MB - temp_file.write(large_content) - temp_file.flush() - - try: - field_name, (filename, file_handle, content_type) = prepare_file_for_upload( - temp_file.name, "large_field" - ) - - assert field_name == "large_field" - assert filename == os.path.basename(temp_file.name) - assert hasattr(file_handle, "read") # Should be file handle - assert content_type == "application/octet-stream" - - # Clean up the file handle - if hasattr(file_handle, "close"): - file_handle.close() - finally: - os.unlink(temp_file.name) - - def test_prepare_file_for_upload_pathlib_path(self): - """Test preparing pathlib.Path for upload.""" - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - content = b"Pathlib content" - temp_file.write(content) - temp_file.flush() - - try: - path = Path(temp_file.name) - field_name, (filename, file_content, content_type) = prepare_file_for_upload(path) - - assert field_name == "file" # default field name - assert filename == path.name - assert file_content == content - assert content_type == "application/octet-stream" - finally: - os.unlink(temp_file.name) - - def test_prepare_file_for_upload_bytes(self): - """Test preparing bytes for upload.""" - content = b"Bytes content" - - field_name, (filename, file_content, content_type) = prepare_file_for_upload( - content, "bytes_field" - ) - - assert field_name == "bytes_field" - assert filename == "document" - assert file_content == content - assert content_type == "application/octet-stream" - - def test_prepare_file_for_upload_file_handle(self): - """Test preparing file handle for upload.""" - content = b"File handle content" - file_obj = 
io.BytesIO(content) - file_obj.name = "test_file.pdf" - - field_name, (filename, file_handle, content_type) = prepare_file_for_upload( - file_obj, "handle_field" - ) - - assert field_name == "handle_field" - assert filename == "test_file.pdf" - assert file_handle is file_obj - assert content_type == "application/octet-stream" - - def test_prepare_file_for_upload_file_handle_with_path_name(self): - """Test file handle with path-like name attribute.""" - content = b"Content with path name" - file_obj = io.BytesIO(content) - file_obj.name = Path("/path/to/test_file.pdf") - - field_name, (filename, file_handle, content_type) = prepare_file_for_upload(file_obj) - - assert field_name == "file" - assert filename == "test_file.pdf" # basename extracted - assert file_handle is file_obj - assert content_type == "application/octet-stream" - - def test_prepare_file_for_upload_file_not_found(self): - """Test FileNotFoundError for non-existent file.""" - with pytest.raises(FileNotFoundError, match="File not found: /non/existent/file.txt"): - prepare_file_for_upload("/non/existent/file.txt") - - def test_prepare_file_for_upload_unsupported_type(self): - """Test ValueError for unsupported input type.""" - with pytest.raises(ValueError, match="Unsupported file input type"): - prepare_file_for_upload(123) # type: ignore - - -class TestSaveFileOutput: - """Test suite for save_file_output function.""" - - def test_save_file_output_basic(self): - """Test basic file saving.""" - content = b"Test content to save" - - with tempfile.TemporaryDirectory() as temp_dir: - output_path = os.path.join(temp_dir, "output.pdf") - save_file_output(content, output_path) - - # Verify file was saved correctly - saved_content = Path(output_path).read_bytes() - assert saved_content == content - - def test_save_file_output_creates_directories(self): - """Test that save_file_output creates parent directories.""" - content = b"Content with nested path" - - with tempfile.TemporaryDirectory() as temp_dir: - 
output_path = os.path.join(temp_dir, "nested", "deep", "output.pdf") - save_file_output(content, output_path) - - # Verify directories were created - assert os.path.exists(os.path.dirname(output_path)) - - # Verify file was saved correctly - saved_content = Path(output_path).read_bytes() - assert saved_content == content - - def test_save_file_output_overwrites_existing(self): - """Test that save_file_output overwrites existing files.""" - original_content = b"Original content" - new_content = b"New content" - - with tempfile.TemporaryDirectory() as temp_dir: - output_path = os.path.join(temp_dir, "overwrite.pdf") - - # Create initial file - Path(output_path).write_bytes(original_content) - - # Overwrite with new content - save_file_output(new_content, output_path) - - # Verify new content - saved_content = Path(output_path).read_bytes() - assert saved_content == new_content - - @patch("pathlib.Path.mkdir") - @patch("pathlib.Path.write_bytes") - def test_save_file_output_propagates_os_error(self, mock_write, mock_mkdir): - """Test that save_file_output propagates OSError.""" - mock_write.side_effect = OSError("Permission denied") - mock_mkdir.return_value = None # mkdir succeeds - - with pytest.raises(OSError, match="Permission denied"): - save_file_output(b"content", "/some/path") - - -class TestStreamFileContent: - """Test suite for stream_file_content function.""" - - def test_stream_file_content_basic(self): - """Test basic file streaming.""" - content = b"Content to stream in chunks" - - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.write(content) - temp_file.flush() - - try: - chunks = list(stream_file_content(temp_file.name, chunk_size=8)) - streamed_content = b"".join(chunks) - - assert streamed_content == content - assert len(chunks) == 4 # 26 bytes in chunks of 8 - finally: - os.unlink(temp_file.name) - - def test_stream_file_content_large_file(self): - """Test streaming large file with default chunk size.""" - # Create content 
larger than default chunk size - content = b"x" * (DEFAULT_CHUNK_SIZE + 1000) - - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.write(content) - temp_file.flush() - - try: - chunks = list(stream_file_content(temp_file.name)) - streamed_content = b"".join(chunks) - - assert streamed_content == content - assert len(chunks) == 2 # Should be split into 2 chunks - finally: - os.unlink(temp_file.name) - - def test_stream_file_content_empty_file(self): - """Test streaming empty file.""" - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.flush() # Empty file - - try: - chunks = list(stream_file_content(temp_file.name)) - assert chunks == [] - finally: - os.unlink(temp_file.name) - - def test_stream_file_content_file_not_found(self): - """Test FileNotFoundError for non-existent file.""" - with pytest.raises(FileNotFoundError, match="File not found: /non/existent/file.txt"): - list(stream_file_content("/non/existent/file.txt")) - - def test_stream_file_content_custom_chunk_size(self): - """Test streaming with custom chunk size.""" - content = b"Custom chunk size test content" - - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.write(content) - temp_file.flush() - - try: - chunks = list(stream_file_content(temp_file.name, chunk_size=5)) - streamed_content = b"".join(chunks) - - assert streamed_content == content - assert len(chunks) == 6 # 30 bytes in chunks of 5 - assert all(len(chunk) <= 5 for chunk in chunks) - finally: - os.unlink(temp_file.name) - - -class TestGetFileSize: - """Test suite for get_file_size function.""" - - def test_get_file_size_from_bytes(self): - """Test getting file size from bytes.""" - content = b"Hello, World!" 
- size = get_file_size(content) - assert size == 13 - - def test_get_file_size_from_bytesio(self): - """Test getting file size from BytesIO.""" - content = b"Test content" - file_obj = io.BytesIO(content) - size = get_file_size(file_obj) - assert size == 12 - - def test_get_file_size_from_file_path(self): - """Test getting file size from file path string.""" - content = b"File path content test" - - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.write(content) - temp_file.flush() - - try: - size = get_file_size(temp_file.name) - assert size == len(content) - finally: - os.unlink(temp_file.name) - - def test_get_file_size_from_pathlib_path(self): - """Test getting file size from pathlib.Path.""" - content = b"Pathlib size test" - - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.write(content) - temp_file.flush() - - try: - path = Path(temp_file.name) - size = get_file_size(path) - assert size == len(content) - finally: - os.unlink(temp_file.name) - - def test_get_file_size_file_not_found(self): - """Test get_file_size returns None for non-existent file.""" - size = get_file_size("/non/existent/file.txt") - assert size is None - - def test_get_file_size_seekable_file_object(self): - """Test get_file_size with seekable file object.""" - content = b"Seekable file content" - - with tempfile.NamedTemporaryFile() as temp_file: - temp_file.write(content) - temp_file.seek(5) # Move to middle of file - - size = get_file_size(cast("BinaryIO", temp_file)) - assert size == len(content) - - # Verify position was restored - assert temp_file.tell() == 5 - - def test_get_file_size_non_seekable_file_object(self): - """Test get_file_size with non-seekable file object.""" - # Create a mock file object that raises on seek operations - mock_file = Mock() - mock_file.seek.side_effect = io.UnsupportedOperation("not seekable") - mock_file.tell.side_effect = io.UnsupportedOperation("not seekable") - - size = get_file_size(mock_file) - 
assert size is None - - def test_get_file_size_file_object_with_os_error(self): - """Test get_file_size handles OSError during seeking.""" - mock_file = Mock() - mock_file.tell.side_effect = OSError("OS error during tell") - - size = get_file_size(mock_file) - assert size is None - - def test_get_file_size_unsupported_type(self): - """Test get_file_size returns None for unsupported types.""" - size = get_file_size(123) # type: ignore - assert size is None - - def test_get_file_size_empty_bytes(self): - """Test get_file_size with empty bytes.""" - size = get_file_size(b"") - assert size == 0 - - def test_get_file_size_empty_file(self): - """Test get_file_size with empty file.""" - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.flush() # Empty file - - try: - size = get_file_size(temp_file.name) - assert size == 0 - finally: - os.unlink(temp_file.name) - - -class TestFileHandlerEdgeCases: - """Test edge cases and boundary conditions.""" - - def test_prepare_file_input_bytesio_at_end(self): - """Test prepare_file_input with BytesIO positioned at end.""" - content = b"BytesIO at end test" - file_obj = io.BytesIO(content) - file_obj.seek(0, 2) # Seek to end - - result, filename = prepare_file_input(file_obj) - assert result == content - assert filename == "document" - - def test_prepare_file_for_upload_exactly_10mb(self): - """Test prepare_file_for_upload with file exactly at 10MB threshold.""" - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - # Create exactly 10MB file - content = b"x" * (10 * 1024 * 1024) - temp_file.write(content) - temp_file.flush() - - try: - field_name, (filename, file_content, content_type) = prepare_file_for_upload( - temp_file.name - ) - - # Should load into memory (not streaming) at exactly 10MB - assert isinstance(file_content, bytes) - assert file_content == content - finally: - os.unlink(temp_file.name) - - def test_file_handle_name_attribute_edge_cases(self): - """Test file handle with various name 
attribute types.""" - content = b"Name attribute test" - - # Test with bytes name - file_obj = io.BytesIO(content) - file_obj.name = b"/path/to/file.pdf" - - result, filename = prepare_file_input(file_obj) - assert result == content - assert filename == "file.pdf" diff --git a/tests/unit/test_http.py b/tests/unit/test_http.py new file mode 100644 index 0000000..0c3a556 --- /dev/null +++ b/tests/unit/test_http.py @@ -0,0 +1,657 @@ +"""HTTP layer tests for Nutrient DWS Python Client.""" + +import json +import re +from typing import Literal +from unittest.mock import MagicMock, patch + +import pytest +import httpx + +from nutrient_dws.errors import ( + APIError, + ValidationError, + AuthenticationError, + NetworkError, +) +from nutrient_dws.http import ( + send_request, + RequestConfig, + NutrientClientOptions, + BuildRequestData, + resolve_api_key, + extract_error_message, + create_http_error, +) +from nutrient_dws.inputs import NormalizedFileData + +class TestResolveApiKey: + async def test_resolve_string_api_key(self): + result = await resolve_api_key("test-api-key") + assert result == "test-api-key" + + async def test_resolve_function_api_key(self): + def api_key_func(): + return "function-api-key" + + result = await resolve_api_key(api_key_func) + assert result == "function-api-key" + + async def test_resolve_async_function_api_key(self): + async def get_token(): + # Your token retrieval logic here + return 'async-function-api-key' + result = await resolve_api_key(get_token) + assert result == "async-function-api-key" + + async def test_function_returns_empty_string(self): + def empty_key_func(): + return "" + + with pytest.raises( + AuthenticationError, match="API key function must return a non-empty string" + ): + await resolve_api_key(empty_key_func) + + async def test_function_returns_none(self): + def none_key_func(): + return None + + with pytest.raises( + AuthenticationError, match="API key function must return a non-empty string" + ): + await 
resolve_api_key(none_key_func) + + async def test_function_throws_error(self): + def error_key_func(): + raise Exception("Token fetch failed") + + with pytest.raises( + AuthenticationError, match="Failed to resolve API key from function" + ): + await resolve_api_key(error_key_func) + + +class TestExtractErrorMessage: + def test_extract_error_description(self): + data = {"error_description": "API key is invalid"} + result = extract_error_message(data) + assert result == "API key is invalid" + + def test_extract_error_message(self): + data = {"error_message": "Request failed"} + result = extract_error_message(data) + assert result == "Request failed" + + def test_extract_message(self): + data = {"message": "Validation failed"} + result = extract_error_message(data) + assert result == "Validation failed" + + def test_extract_nested_error_message(self): + data = {"error": {"message": "Nested error"}} + result = extract_error_message(data) + assert result == "Nested error" + + def test_extract_errors_array(self): + data = {"errors": ["First error", "Second error"]} + result = extract_error_message(data) + assert result == "First error" + + def test_extract_errors_object_array(self): + data = {"errors": [{"message": "Object error"}]} + result = extract_error_message(data) + assert result == "Object error" + + def test_no_error_message(self): + data = {"other": "data"} + result = extract_error_message(data) + assert result is None + + +class TestCreateHttpError: + def test_create_authentication_error_401(self): + error = create_http_error(401, "Unauthorized", {"message": "Invalid API key"}) + assert isinstance(error, AuthenticationError) + assert error.message == "Invalid API key" + assert error.status_code == 401 + + def test_create_authentication_error_403(self): + error = create_http_error(403, "Forbidden", {"message": "Access denied"}) + assert isinstance(error, AuthenticationError) + assert error.message == "Access denied" + assert error.status_code == 403 + + def 
test_create_validation_error_400(self): + error = create_http_error(400, "Bad Request", {"message": "Invalid parameters"}) + assert isinstance(error, ValidationError) + assert error.message == "Invalid parameters" + assert error.status_code == 400 + + def test_create_api_error_500(self): + error = create_http_error( + 500, "Internal Server Error", {"message": "Server error"} + ) + assert isinstance(error, APIError) + assert error.message == "Server error" + assert error.status_code == 500 + + def test_fallback_message(self): + error = create_http_error(404, "Not Found", {}) + assert error.message == "HTTP 404: Not Found" + + +class TestSendRequest: + def setup_method(self): + self.mock_client_options: NutrientClientOptions = { + "apiKey": "test-api-key", + "baseUrl": "https://api.test.com/v1", + "timeout": None, + } + + @pytest.mark.asyncio + async def test_successful_get_request(self): + mock_response_data = {"result": "success"} + + with patch("httpx.AsyncClient") as mock_client: + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.reason_phrase = "OK" + mock_response.headers = {"content-type": "application/json"} + mock_response.json.return_value = mock_response_data + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + config: RequestConfig[Literal["GET"], Literal["/account/info"], None] = { + "endpoint": "/account/info", + "method": "GET", + "data": None, + "headers": None, + } + + result = await send_request(config, self.mock_client_options) + + # Verify the request was made correctly + call_kwargs = mock_client.return_value.__aenter__.return_value.request.call_args.kwargs + assert call_kwargs["method"] == "GET" + assert call_kwargs["url"] == "https://api.test.com/v1/account/info" + assert call_kwargs["headers"]["Authorization"] == "Bearer test-api-key" + assert re.match(r'^nutrient-dws/\d+\.\d+\.\d+(?:-[a-zA-Z0-9.-]+)?', call_kwargs["headers"]["User-Agent"]) + assert 
call_kwargs["timeout"] is None + + assert result["data"] == {"result": "success"} + assert result["status"] == 200 + assert result["statusText"] == "OK" + assert result["headers"]["content-type"] == "application/json" + + @pytest.mark.asyncio + async def test_handle_function_api_key(self): + def api_key_func(): + return "function-api-key" + + async_options: NutrientClientOptions = { + "apiKey": api_key_func, + "baseUrl": None, + "timeout": None, + } + + with patch("httpx.AsyncClient") as mock_client: + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.reason_phrase = "OK" + mock_response.headers = {} + mock_response.json.return_value = {"result": "success"} + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + config: RequestConfig[Literal["GET"], Literal["/account/info"], None] = { + "endpoint": "/account/info", + "method": "GET", + "data": None, + "headers": None, + } + + await send_request(config, async_options) + + # Verify function API key was used + mock_client.return_value.__aenter__.return_value.request.assert_called_once() + call_kwargs = mock_client.return_value.__aenter__.return_value.request.call_args.kwargs + assert call_kwargs["headers"]["Authorization"] == "Bearer function-api-key" + + @pytest.mark.asyncio + async def test_throw_authentication_error_for_invalid_function_api_key(self): + def empty_key_func(): + return "" + + async_options: NutrientClientOptions = { + "apiKey": empty_key_func, + "baseUrl": None, + "timeout": None, + } + + config: RequestConfig[Literal["GET"], Literal["/account/info"], None] = { + "endpoint": "/account/info", + "method": "GET", + "data": None, + "headers": None, + } + + with pytest.raises( + AuthenticationError, match="API key function must return a non-empty string" + ): + await send_request(config, async_options) + + @pytest.mark.asyncio + async def test_throw_authentication_error_when_function_fails(self): + def error_key_func(): + raise 
Exception("Token fetch failed") + + async_options: NutrientClientOptions = { + "apiKey": error_key_func, + "baseUrl": None, + "timeout": None, + } + + config: RequestConfig[Literal["GET"], Literal["/account/info"], None] = { + "endpoint": "/account/info", + "method": "GET", + "data": None, + "headers": None, + } + + with pytest.raises( + AuthenticationError, match="Failed to resolve API key from function" + ): + await send_request(config, async_options) + + @pytest.mark.asyncio + async def test_send_json_data_with_proper_headers(self): + with patch("httpx.AsyncClient") as mock_client: + mock_response = MagicMock() + mock_response.status_code = 201 + mock_response.reason_phrase = "Created" + mock_response.headers = {} + mock_response.json.return_value = {"id": 123} + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + # Use analyze_build endpoint for JSON-only requests + config: RequestConfig = { + "endpoint": "/analyze_build", + "method": "POST", + "data": {"instructions": {"parts": [{"file": "test.pdf"}]}}, + "headers": None, + } + + await send_request(config, self.mock_client_options) + + call_kwargs = mock_client.return_value.__aenter__.return_value.request.call_args.kwargs + assert "json" in call_kwargs + + @pytest.mark.asyncio + async def test_send_files_with_form_data(self): + with patch("httpx.AsyncClient") as mock_client: + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.reason_phrase = "OK" + mock_response.headers = {} + mock_response.json.return_value = {"uploaded": True} + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + file_data: NormalizedFileData = (b"\x01\x02\x03\x04", "file.bin") + files_map: dict[str, NormalizedFileData] = {"document": file_data} + + build_data: BuildRequestData = { + "files": files_map, + "instructions": { + "parts": [{"file": "document"}], + "output": {"type": "pdf"}, + }, + } + + config: 
RequestConfig[Literal["POST"], Literal["/build"], BuildRequestData] = { + "endpoint": "/build", + "method": "POST", + "data": build_data, + "headers": None, + } + + await send_request(config, self.mock_client_options) + + call_kwargs = mock_client.return_value.__aenter__.return_value.request.call_args.kwargs + assert "files" in call_kwargs + assert "data" in call_kwargs + + # Check that files are properly formatted + files = call_kwargs["files"] + assert "document" in files + assert files["document"][0] == "file.bin" # filename + assert files["document"][1] == b"\x01\x02\x03\x04" # content + + # Check that instructions are JSON-encoded + data = call_kwargs["data"] + assert "instructions" in data + instructions = json.loads(data["instructions"]) + assert instructions["parts"] == [{"file": "document"}] + + @pytest.mark.asyncio + async def test_handle_401_authentication_error(self): + with patch("httpx.AsyncClient") as mock_client: + mock_response = MagicMock() + mock_response.status_code = 401 + mock_response.reason_phrase = "Unauthorized" + mock_response.headers = {} + mock_response.json.return_value = {"error": "Invalid API key"} + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + config: RequestConfig[Literal["GET"], Literal["/account/info"], None] = { + "endpoint": "/account/info", + "method": "GET", + "data": None, + "headers": None, + } + + with pytest.raises(AuthenticationError) as exc_info: + await send_request(config, self.mock_client_options) + + error = exc_info.value + assert error.message == "Invalid API key" + assert error.code == "AUTHENTICATION_ERROR" + assert error.status_code == 401 + + @pytest.mark.asyncio + async def test_handle_400_validation_error(self): + with patch("httpx.AsyncClient") as mock_client: + mock_response = MagicMock() + mock_response.status_code = 400 + mock_response.reason_phrase = "Bad Request" + mock_response.headers = {} + mock_response.json.return_value = {"message": "Invalid 
parameters"} + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + # Use analyze_build endpoint for JSON-only requests + config: RequestConfig = { + "endpoint": "/analyze_build", + "method": "POST", + "data": {"instructions": {}}, + "headers": None, + } + + with pytest.raises(ValidationError) as exc_info: + await send_request(config, self.mock_client_options) + + error = exc_info.value + assert error.message == "Invalid parameters" + assert error.code == "VALIDATION_ERROR" + assert error.status_code == 400 + + @pytest.mark.asyncio + async def test_handle_network_errors(self): + with patch("httpx.AsyncClient") as mock_client: + network_error = httpx.RequestError("Network Error") + network_error.request = httpx.Request( + "GET", "https://api.test.com/v1/account/info" + ) + + mock_client.return_value.__aenter__.return_value.request.side_effect = ( + network_error + ) + + config: RequestConfig[Literal["GET"], Literal["/account/info"], None] = { + "endpoint": "/account/info", + "method": "GET", + "data": None, + "headers": None, + } + + with pytest.raises(NetworkError) as exc_info: + await send_request(config, self.mock_client_options) + + error = exc_info.value + assert error.message == "Network request failed" + assert error.code == "NETWORK_ERROR" + + @pytest.mark.asyncio + async def test_not_leak_api_key_in_network_error_details(self): + with patch("httpx.AsyncClient") as mock_client: + network_error = httpx.RequestError("Network Error") + network_error.request = httpx.Request( + "GET", "https://api.test.com/v1/account/info" + ) + + mock_client.return_value.__aenter__.return_value.request.side_effect = ( + network_error + ) + + config: RequestConfig[Literal["GET"], Literal["/account/info"], None] = { + "endpoint": "/account/info", + "method": "GET", + "data": None, + "headers": { + "Authorization": "Bearer secret-api-key-that-should-not-leak", + "Content-Type": "application/json", + "X-Custom-Header": "custom-value", + 
}, + } + + with pytest.raises(NetworkError) as exc_info: + await send_request(config, self.mock_client_options) + + error = exc_info.value + assert error.message == "Network request failed" + assert error.code == "NETWORK_ERROR" + + # Verify headers are present but Authorization is sanitized + assert "headers" in error.details + headers = error.details["headers"] + assert headers["X-Custom-Header"] == "custom-value" + assert "Authorization" not in headers + + # Verify the API key is not present in the stringified error + error_string = json.dumps(error.details) + assert "secret-api-key-that-should-not-leak" not in error_string + + @pytest.mark.asyncio + async def test_use_custom_timeout(self): + with patch("httpx.AsyncClient") as mock_client: + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.reason_phrase = "OK" + mock_response.headers = {} + mock_response.json.return_value = {} + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + config: RequestConfig[Literal["GET"], Literal["/account/info"], None] = { + "endpoint": "/account/info", + "method": "GET", + "data": None, + "headers": None, + } + + custom_options = {**self.mock_client_options, "timeout": 60} + await send_request(config, custom_options) + + call_kwargs = mock_client.return_value.__aenter__.return_value.request.call_args.kwargs + assert call_kwargs["timeout"] == 60 + + @pytest.mark.asyncio + async def test_use_default_timeout_when_not_specified(self): + with patch("httpx.AsyncClient") as mock_client: + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.reason_phrase = "OK" + mock_response.headers = {} + mock_response.json.return_value = {} + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + config: RequestConfig[Literal["GET"], Literal["/account/info"], None] = { + "endpoint": "/account/info", + "method": "GET", + "data": None, + "headers": None, + } 
+ + await send_request(config, self.mock_client_options) + + call_kwargs = mock_client.return_value.__aenter__.return_value.request.call_args.kwargs + assert call_kwargs["timeout"] is None + + @pytest.mark.asyncio + async def test_handle_multiple_files_in_request(self): + with patch("httpx.AsyncClient") as mock_client: + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.reason_phrase = "OK" + mock_response.headers = {} + mock_response.json.return_value = {"success": True} + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + files_map: dict[str, NormalizedFileData] = { + "file1": (b"\x01\x02\x03", "file1.bin"), + "file2": (b"\x04\x05\x06", "file2.bin"), + "file3": (b"\x07\x08\x09", "file3.bin"), + } + + build_data: BuildRequestData = { + "files": files_map, + "instructions": { + "parts": [{"file": "file1"}, {"file": "file2"}, {"file": "file3"}], + "output": {"type": "pdf"}, + }, + } + + config: RequestConfig[Literal["POST"], Literal["/build"], BuildRequestData] = { + "endpoint": "/build", + "method": "POST", + "data": build_data, + "headers": None, + } + + await send_request(config, self.mock_client_options) + + call_kwargs = mock_client.return_value.__aenter__.return_value.request.call_args.kwargs + files = call_kwargs["files"] + + # Check all files are present + assert "file1" in files + assert "file2" in files + assert "file3" in files + + # Check file content + assert files["file1"] == ("file1.bin", b"\x01\x02\x03") + assert files["file2"] == ("file2.bin", b"\x04\x05\x06") + assert files["file3"] == ("file3.bin", b"\x07\x08\x09") + + @pytest.mark.asyncio + async def test_handle_binary_response_data(self): + with patch("httpx.AsyncClient") as mock_client: + binary_data = b"PDF content here" + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.reason_phrase = "OK" + mock_response.headers = {"content-type": "application/pdf"} + mock_response.json.side_effect = 
json.JSONDecodeError("Not JSON", "", 0) + mock_response.content = binary_data + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + # Use analyze_build endpoint for JSON-only requests + config: RequestConfig = { + "endpoint": "/analyze_build", + "method": "POST", + "data": {"instructions": {"parts": [{"file": "test.pdf"}]}}, + "headers": None, + } + + result = await send_request(config, self.mock_client_options) + + assert result["data"] == binary_data + assert result["headers"]["content-type"] == "application/pdf" + + @pytest.mark.asyncio + async def test_strip_trailing_slashes_from_base_url(self): + with patch("httpx.AsyncClient") as mock_client: + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.reason_phrase = "OK" + mock_response.headers = {} + mock_response.json.return_value = {} + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + options_with_trailing_slash: NutrientClientOptions = { + "apiKey": "test-key", + "baseUrl": "https://api.nutrient.io/", + "timeout": None, + } + + config: RequestConfig[Literal["GET"], Literal["/account/info"], None] = { + "endpoint": "/account/info", + "method": "GET", + "data": None, + "headers": None, + } + + await send_request(config, options_with_trailing_slash) + + call_kwargs = mock_client.return_value.__aenter__.return_value.request.call_args.kwargs + assert call_kwargs["url"] == "https://api.nutrient.io/account/info" + + @pytest.mark.asyncio + async def test_use_default_base_url_when_none(self): + with patch("httpx.AsyncClient") as mock_client: + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.reason_phrase = "OK" + mock_response.headers = {} + mock_response.json.return_value = {} + + mock_client.return_value.__aenter__.return_value.request.return_value = ( + mock_response + ) + + options_without_base_url: NutrientClientOptions = { + "apiKey": "test-key", + "baseUrl": 
None, + "timeout": None, + } + + config: RequestConfig[Literal["GET"], Literal["/account/info"], None] = { + "endpoint": "/account/info", + "method": "GET", + "data": None, + "headers": None, + } + + await send_request(config, options_without_base_url) + + call_kwargs = mock_client.return_value.__aenter__.return_value.request.call_args.kwargs + assert call_kwargs["url"] == "https://api.nutrient.io/account/info" diff --git a/tests/unit/test_http_client.py b/tests/unit/test_http_client.py deleted file mode 100644 index a05d157..0000000 --- a/tests/unit/test_http_client.py +++ /dev/null @@ -1,375 +0,0 @@ -"""Comprehensive unit tests for HTTPClient.""" - -import json -from unittest.mock import Mock, patch - -import pytest -import requests - -from nutrient_dws.exceptions import ( - APIError, - AuthenticationError, - NutrientTimeoutError, -) -from nutrient_dws.http_client import HTTPClient - - -class TestHTTPClientInitialization: - """Test suite for HTTPClient initialization.""" - - def test_http_client_init_default(self): - """Test HTTP client initialization with defaults.""" - client = HTTPClient(api_key="test-key") - assert client._api_key == "test-key" - assert client._base_url == "https://api.pspdfkit.com" - assert client._timeout == 300 - - def test_http_client_init_custom_timeout(self): - """Test HTTP client with custom timeout.""" - client = HTTPClient(api_key="test-key", timeout=60) - assert client._timeout == 60 - - def test_http_client_init_no_api_key(self): - """Test HTTP client initialization without API key.""" - client = HTTPClient(api_key=None) - assert client._api_key is None - - def test_http_client_init_empty_api_key(self): - """Test HTTP client initialization with empty API key.""" - client = HTTPClient(api_key="") - assert client._api_key == "" - - def test_http_client_creates_session(self): - """Test that HTTP client creates a requests session.""" - client = HTTPClient(api_key="test-key") - assert hasattr(client, "_session") - assert 
isinstance(client._session, requests.Session) - - def test_http_client_session_headers(self): - """Test that session has proper headers set.""" - client = HTTPClient(api_key="test-key") - assert "Authorization" in client._session.headers - assert client._session.headers["Authorization"] == "Bearer test-key" - assert "User-Agent" in client._session.headers - assert "nutrient-dws" in client._session.headers["User-Agent"] - - def test_http_client_context_manager(self): - """Test HTTP client can be used as context manager.""" - with HTTPClient(api_key="test-key") as client: - assert client is not None - assert hasattr(client, "_session") - - -class TestHTTPClientMethods: - """Test suite for HTTPClient HTTP methods.""" - - def setup_method(self): - """Set up test fixtures.""" - self.client = HTTPClient(api_key="test-key") - - @patch("requests.Session.request") - def test_post_method_with_json(self, mock_request): - """Test POST request with JSON data.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.content = b"POST response" - mock_request.return_value = mock_response - - json_data = {"key": "value"} - result = self.client.post("/test", json_data=json_data) - - assert result == b"POST response" - # Check that the request was made correctly - assert mock_request.called - call_args = mock_request.call_args - assert call_args[0][0] == "POST" - assert call_args[0][1] == "https://api.pspdfkit.com/test" - - @patch("requests.Session.request") - def test_post_method_with_files(self, mock_request): - """Test POST request with files.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.content = b"POST response" - mock_request.return_value = mock_response - - files = {"file": ("test.pdf", b"file content", "application/pdf")} - result = self.client.post("/test", files=files) - - assert result == b"POST response" - # Check that the request was made correctly - assert mock_request.called - call_args = mock_request.call_args - 
assert call_args[0][0] == "POST" - assert call_args[0][1] == "https://api.pspdfkit.com/test" - - @patch("requests.Session.request") - def test_post_with_both_files_and_json(self, mock_request): - """Test POST request with both files and JSON data.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.content = b"POST response" - mock_request.return_value = mock_response - - files = {"file": ("test.pdf", b"file content", "application/pdf")} - json_data = {"actions": [{"type": "rotate"}]} - result = self.client.post("/test", files=files, json_data=json_data) - - assert result == b"POST response" - assert mock_request.called - - def test_post_without_api_key_raises_error(self): - """Test POST without API key raises AuthenticationError.""" - client = HTTPClient(api_key=None) - - with pytest.raises(AuthenticationError, match="API key is required"): - client.post("/test") - - -class TestHTTPClientErrorHandling: - """Test suite for HTTPClient error handling.""" - - def setup_method(self): - """Set up test fixtures.""" - self.client = HTTPClient(api_key="test-key") - - @patch("requests.Session.request") - def test_authentication_error_401(self, mock_request): - """Test 401 authentication error handling.""" - mock_response = Mock() - mock_response.status_code = 401 - mock_response.text = "Unauthorized" - mock_response.json.side_effect = json.JSONDecodeError("Expecting value", "doc", 0) - mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError() - mock_request.return_value = mock_response - - with pytest.raises(AuthenticationError, match="HTTP 401: Unauthorized"): - self.client.post("/test") - - @patch("requests.Session.request") - def test_authentication_error_403(self, mock_request): - """Test 403 forbidden error handling.""" - mock_response = Mock() - mock_response.status_code = 403 - mock_response.text = "Forbidden" - mock_response.json.side_effect = json.JSONDecodeError("Expecting value", "doc", 0) - 
mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError() - mock_request.return_value = mock_response - - with pytest.raises(AuthenticationError, match="HTTP 403: Forbidden"): - self.client.post("/test") - - @patch("requests.Session.request") - def test_api_error_400(self, mock_request): - """Test 400 bad request error handling.""" - mock_response = Mock() - mock_response.status_code = 400 - mock_response.text = "Bad request" - mock_response.json.side_effect = json.JSONDecodeError("Expecting value", "doc", 0) - mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError() - mock_request.return_value = mock_response - - with pytest.raises(APIError) as exc_info: - self.client.post("/test") - - assert exc_info.value.status_code == 400 - assert exc_info.value.response_body == "Bad request" - - @patch("requests.Session.request") - def test_api_error_500(self, mock_request): - """Test 500 internal server error handling.""" - mock_response = Mock() - mock_response.status_code = 500 - mock_response.text = "Internal server error" - mock_response.json.side_effect = json.JSONDecodeError("Expecting value", "doc", 0) - mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError() - mock_request.return_value = mock_response - - with pytest.raises(APIError) as exc_info: - self.client.post("/test") - - assert exc_info.value.status_code == 500 - assert exc_info.value.response_body == "Internal server error" - - @patch("requests.Session.request") - def test_timeout_error(self, mock_request): - """Test timeout error handling.""" - mock_request.side_effect = requests.Timeout("Request timed out") - - with pytest.raises(NutrientTimeoutError, match="Request timed out"): - self.client.post("/test") - - @patch("requests.Session.request") - def test_connection_error(self, mock_request): - """Test connection error handling.""" - mock_request.side_effect = requests.ConnectionError("Connection failed") - - with pytest.raises(APIError, 
match="Connection failed"): - self.client.post("/test") - - @patch("requests.Session.request") - def test_requests_exception(self, mock_request): - """Test generic requests exception handling.""" - mock_request.side_effect = requests.RequestException("Request failed") - - with pytest.raises(APIError, match="Request failed"): - self.client.post("/test") - - @patch("requests.Session.request") - def test_api_error_with_json_response(self, mock_request): - """Test API error with JSON error response.""" - mock_response = Mock() - mock_response.status_code = 422 - mock_response.text = '{"message": "Validation failed", "details": "Invalid file format"}' - mock_response.json.return_value = { - "message": "Validation failed", - "details": "Invalid file format", - } - mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError() - mock_request.return_value = mock_response - - from nutrient_dws.exceptions import ValidationError - - with pytest.raises(ValidationError) as exc_info: - self.client.post("/test") - - assert "Validation failed" in str(exc_info.value) - - -class TestHTTPClientResponseHandling: - """Test suite for HTTPClient response handling.""" - - def setup_method(self): - """Set up test fixtures.""" - self.client = HTTPClient(api_key="test-key") - - @patch("requests.Session.request") - def test_successful_response_with_content(self, mock_request): - """Test successful response with content.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.content = b"PDF content here" - mock_response.raise_for_status.return_value = None - mock_request.return_value = mock_response - - result = self.client.post("/test") - assert result == b"PDF content here" - - @patch("requests.Session.request") - def test_successful_response_empty_content(self, mock_request): - """Test successful response with empty content.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.content = b"" - 
mock_response.raise_for_status.return_value = None - mock_request.return_value = mock_response - - result = self.client.post("/test") - assert result == b"" - - @patch("requests.Session.request") - def test_successful_response_201(self, mock_request): - """Test successful 201 Created response.""" - mock_response = Mock() - mock_response.status_code = 201 - mock_response.content = b"Created content" - mock_response.raise_for_status.return_value = None - mock_request.return_value = mock_response - - result = self.client.post("/test") - assert result == b"Created content" - - @patch("requests.Session.request") - def test_successful_response_204(self, mock_request): - """Test successful 204 No Content response.""" - mock_response = Mock() - mock_response.status_code = 204 - mock_response.content = b"" - mock_response.raise_for_status.return_value = None - mock_request.return_value = mock_response - - result = self.client.post("/test") - assert result == b"" - - -class TestHTTPClientContextManager: - """Test suite for HTTPClient context manager functionality.""" - - def test_context_manager_enters_and_exits(self): - """Test context manager enter and exit.""" - with HTTPClient(api_key="test-key") as client: - assert client is not None - assert hasattr(client, "_session") - - # Session should be closed after exiting context - # Note: We can't directly test if session is closed in requests, - # but we can verify the close method was accessible - - def test_context_manager_exception_handling(self): - """Test context manager handles exceptions properly.""" - try: - with HTTPClient(api_key="test-key") as client: - assert client is not None - raise ValueError("Test exception") - except ValueError: - pass # Exception should be propagated - - def test_manual_close(self): - """Test manual close method.""" - client = HTTPClient(api_key="test-key") - - # Close should not raise an error - client.close() - - # Verify session is accessible (requests doesn't provide a closed property) 
- assert hasattr(client, "_session") - - -class TestHTTPClientEdgeCases: - """Test edge cases and boundary conditions.""" - - def setup_method(self): - """Set up test fixtures.""" - self.client = HTTPClient(api_key="test-key") - - @patch("requests.Session.post") - def test_request_with_all_parameters(self, mock_post): - """Test request with all possible parameters.""" - mock_response = Mock() - mock_response.status_code = 200 - mock_response.content = b"Full request" - mock_response.raise_for_status.return_value = None - mock_post.return_value = mock_response - - files = {"file": ("test.pdf", b"content", "application/pdf")} - json_data = {"action": "process"} - data = {"key": "value"} - - result = self.client.post("/test", json_data=json_data, files=files, data=data) - - assert result == b"Full request" - mock_post.assert_called_once() - - @patch("requests.Session.post") - def test_very_large_response(self, mock_post): - """Test handling of very large response.""" - large_content = b"x" * (10 * 1024 * 1024) # 10MB - mock_response = Mock() - mock_response.status_code = 200 - mock_response.content = large_content - mock_response.raise_for_status.return_value = None - mock_post.return_value = mock_response - - result = self.client.post("/test") - assert result == large_content - assert len(result) == 10 * 1024 * 1024 - - def test_client_with_none_api_key_no_auth_header(self): - """Test that None API key doesn't set Authorization header.""" - client = HTTPClient(api_key=None) - assert "Authorization" not in client._session.headers - - def test_client_with_empty_api_key_no_auth_header(self): - """Test that empty API key doesn't set Authorization header.""" - client = HTTPClient(api_key="") - assert "Authorization" not in client._session.headers diff --git a/tests/unit/test_inputs.py b/tests/unit/test_inputs.py new file mode 100644 index 0000000..e8f126f --- /dev/null +++ b/tests/unit/test_inputs.py @@ -0,0 +1,295 @@ +import io +from pathlib import Path +from 
unittest.mock import AsyncMock, Mock, patch + +import pytest + +from nutrient_dws.inputs import ( + get_pdf_page_count, + is_remote_file_input, + is_valid_pdf, + process_file_input, + process_remote_file_input, + validate_file_input, + FileInput, +) +from tests.helpers import sample_pdf, TestDocumentGenerator + + +def create_test_bytes(content: str = "test content") -> bytes: + """Create test bytes data.""" + return content.encode("utf-8") + + +class TestValidateFileInput: + def test_validate_string_inputs(self): + assert validate_file_input("test.pdf") is True + assert validate_file_input("https://example.com/file.pdf") is True + + def test_validate_bytes_objects(self): + test_bytes = create_test_bytes() + assert validate_file_input(test_bytes) is True + + def test_validate_path_objects(self): + with ( + patch("pathlib.Path.exists", return_value=True), + patch("pathlib.Path.is_file", return_value=True), + ): + assert validate_file_input(Path("test.pdf")) is True + + def test_validate_file_like_objects(self): + mock_file = io.BytesIO(b"test content") + assert validate_file_input(mock_file) is True + + def test_reject_invalid_inputs(self): + assert validate_file_input(None) is False + assert validate_file_input(123) is False + assert validate_file_input({}) is False + + +class TestProcessFileInputBytes: + @pytest.mark.asyncio + async def test_process_bytes_object(self): + test_bytes = create_test_bytes("test content") + result = await process_file_input(test_bytes) + + assert result[0] == test_bytes + assert result[1] == "document" + + +class TestProcessFileInputFilePath: + @pytest.mark.asyncio + async def test_process_file_path_string(self): + mock_file_data = b"test file content" + + with ( + patch("pathlib.Path.exists", return_value=True), + patch("aiofiles.open") as mock_aiofiles_open, + ): + mock_file = AsyncMock() + mock_file.read = AsyncMock(return_value=mock_file_data) + mock_context_manager = AsyncMock() + mock_context_manager.__aenter__ = 
AsyncMock(return_value=mock_file) + mock_context_manager.__aexit__ = AsyncMock(return_value=None) + mock_aiofiles_open.return_value = mock_context_manager + + result = await process_file_input("/path/to/test.pdf") + + assert result[1] == "test.pdf" + assert result[0] == mock_file_data + + @pytest.mark.asyncio + async def test_process_path_object(self): + mock_file_data = b"test file content" + test_path = Path("/path/to/test.pdf") + + with ( + patch("pathlib.Path.exists", return_value=True), + patch("aiofiles.open") as mock_aiofiles_open, + ): + mock_file = AsyncMock() + mock_file.read = AsyncMock(return_value=mock_file_data) + mock_context_manager = AsyncMock() + mock_context_manager.__aenter__ = AsyncMock(return_value=mock_file) + mock_context_manager.__aexit__ = AsyncMock(return_value=None) + mock_aiofiles_open.return_value = mock_context_manager + + result = await process_file_input(test_path) + + assert result[1] == "test.pdf" + assert result[0] == mock_file_data + + @pytest.mark.asyncio + async def test_throw_error_for_non_existent_file(self): + with patch("pathlib.Path.exists", return_value=False): + with pytest.raises(FileNotFoundError): + await process_file_input("/path/to/nonexistent.pdf") + + @pytest.mark.asyncio + async def test_throw_error_for_other_errors(self): + with ( + patch("pathlib.Path.exists", return_value=True), + patch("aiofiles.open", side_effect=OSError("Some other error")), + ): + with pytest.raises(OSError): + await process_file_input("/path/to/test.pdf") + + +class TestProcessFileInputFileObjects: + @pytest.mark.asyncio + async def test_process_sync_file_object(self): + test_content = b"test file content" + mock_file = io.BytesIO(test_content) + mock_file.name = "test.pdf" + + result = await process_file_input(mock_file) + + assert result[0] == test_content + assert result[1] == "test.pdf" + + @pytest.mark.asyncio + async def test_process_file_object_without_name(self): + test_content = b"test file content" + mock_file = 
io.BytesIO(test_content) + + result = await process_file_input(mock_file) + + assert result[0] == test_content + assert result[1] == "document" + + +class TestIsRemoteFileInput: + @pytest.mark.parametrize( + "input_data,expected", + [ + ("https://example.com/test.pdf", True), + ("http://example.com/test.pdf", True), + ("ftp://example.com/test.pdf", True), + ("test.pdf", False), + ("/path/to/test.pdf", False), + (b"test", False), + (Path("test.pdf"), False), + ], + ) + def test_remote_file_detection(self, input_data, expected): + assert is_remote_file_input(input_data) is expected + + +class TestProcessFileInputInvalidInputs: + @pytest.mark.asyncio + async def test_throw_for_unsupported_types(self): + with pytest.raises(ValueError): + await process_file_input(123) + + with pytest.raises(ValueError): + await process_file_input({}) + + @pytest.mark.asyncio + async def test_throw_for_none(self): + with pytest.raises(ValueError): + await process_file_input(None) + + +class TestProcessRemoteFileInput: + @pytest.mark.asyncio + async def test_process_url_string_input(self): + mock_response_data = b"test pdf content" + + with patch("httpx.AsyncClient") as mock_client: + mock_response = AsyncMock() + mock_response.content = mock_response_data + mock_response.headers = {} + mock_response.raise_for_status = Mock(return_value=None) + mock_client.return_value.__aenter__.return_value.get.return_value = ( + mock_response + ) + + result = await process_remote_file_input("https://example.com/test.pdf") + + assert result[0] == mock_response_data + assert result[1] == "downloaded_file" + + @pytest.mark.asyncio + async def test_process_url_with_content_disposition_header(self): + mock_response_data = b"test pdf content" + + with patch("httpx.AsyncClient") as mock_client: + mock_response = AsyncMock() + mock_response.content = mock_response_data + mock_response.headers = { + "content-disposition": 'attachment; filename="document.pdf"' + } + mock_response.raise_for_status = 
Mock(return_value=None) + mock_client.return_value.__aenter__.return_value.get.return_value = ( + mock_response + ) + + result = await process_remote_file_input("https://example.com/test.pdf") + + assert result[0] == mock_response_data + assert result[1] == "document.pdf" + + @pytest.mark.asyncio + async def test_throw_error_for_http_error(self): + with patch("httpx.AsyncClient") as mock_client: + mock_response = AsyncMock() + mock_response.raise_for_status = Mock(side_effect=Exception("HTTP 404")) + mock_client.return_value.__aenter__.return_value.get.return_value = ( + mock_response + ) + + with pytest.raises(Exception): + await process_remote_file_input("https://example.com/test.pdf") + + +class TestGetPdfPageCount: + def test_pdf_with_1_page(self): + pdf_bytes = TestDocumentGenerator.generate_simple_pdf_content("Text") + result = get_pdf_page_count(pdf_bytes) + assert result == 1 + + def test_pdf_with_6_pages(self): + result = get_pdf_page_count(sample_pdf) + assert result == 6 + + def test_throw_for_invalid_pdf_no_objects(self): + invalid_pdf = b"%PDF-1.4\n%%EOF" + + with pytest.raises(ValueError, match="Could not find /Catalog object"): + get_pdf_page_count(invalid_pdf) + + def test_throw_for_invalid_pdf_no_catalog(self): + invalid_pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /NotCatalog >>\nendobj\n%%EOF" + + with pytest.raises(ValueError, match="Could not find /Catalog object"): + get_pdf_page_count(invalid_pdf) + + def test_throw_for_catalog_without_pages_reference(self): + invalid_pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\n%%EOF" + + with pytest.raises(ValueError, match="Could not find /Pages reference"): + get_pdf_page_count(invalid_pdf) + + def test_throw_for_missing_pages_object(self): + invalid_pdf = ( + b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n%%EOF" + ) + + with pytest.raises(ValueError, match="Could not find root /Pages object"): + get_pdf_page_count(invalid_pdf) + + def test_throw_for_pages_object_without_count(self): + 
invalid_pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages >>\nendobj\n%%EOF" + + with pytest.raises(ValueError, match="Could not find /Count"): + get_pdf_page_count(invalid_pdf) + + +class TestIsValidPdf: + def test_return_true_for_valid_pdf_files(self): + # Test with generated PDF + valid_pdf_bytes = TestDocumentGenerator.generate_simple_pdf_content( + "Test content" + ) + result = is_valid_pdf(valid_pdf_bytes) + assert result is True + + # Test with sample PDF + result = is_valid_pdf(sample_pdf) + assert result is True + + def test_return_false_for_non_pdf_files(self): + # Test with non-PDF bytes + non_pdf_bytes = b"This is not a PDF file" + result = is_valid_pdf(non_pdf_bytes) + assert result is False + + def test_return_false_for_partial_pdf_header(self): + # Test with partial PDF header + partial_pdf = b"%PD" + result = is_valid_pdf(partial_pdf) + assert result is False + + def test_return_false_for_empty_bytes(self): + result = is_valid_pdf(b"") + assert result is False diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py new file mode 100644 index 0000000..5a0c7cd --- /dev/null +++ b/tests/unit/test_utils.py @@ -0,0 +1,92 @@ +import os +from unittest import mock + +import pytest +import re +from nutrient_dws.utils import ( + get_user_agent, + get_library_version, +) + + +class TestUtilityFunctions: + """Unit tests for utility functions""" + + def test_get_library_version_returns_valid_semver(self): + """Should return a valid semver version string""" + version = get_library_version() + + assert version is not None + assert isinstance(version, str) + assert len(version) > 0 + + # Check if it matches semver pattern (major.minor.patch) + semver_pattern = r'^\d+\.\d+\.\d+(?:-[a-zA-Z0-9.-]+)?$' + assert re.match(semver_pattern, version) + + def test_get_library_version_consistency(self): + """Should return the version consistently""" + version = get_library_version() + + # The version should match whatever 
is in the package metadata + # We don't hardcode the expected version to avoid breaking on version updates + assert re.match(r'^\d+\.\d+\.\d+(?:-[a-zA-Z0-9.-]+)?$', version) + assert len(version) > 0 + + def test_get_user_agent_returns_formatted_string(self): + """Should return a properly formatted User-Agent string""" + user_agent = get_user_agent() + + assert user_agent is not None + assert isinstance(user_agent, str) + assert len(user_agent) > 0 + + def test_get_user_agent_follows_expected_format(self): + """Should follow the expected User-Agent format""" + user_agent = get_user_agent() + + # Should match: nutrient-dws/VERSION + expected_pattern = r'^nutrient-dws/\d+\.\d+\.\d+(?:-[a-zA-Z0-9.-]+)?$' + assert re.match(expected_pattern, user_agent) + + @mock.patch.dict( + os.environ, + { + "PYTHON_ENV": "development", + }, + clear=True, + ) + def test_get_user_agent_follows_expected_format_development(self): + """Should follow the expected User-Agent format in development""" + user_agent = get_user_agent() + + # Should match: nutrient-dws/VERSION + assert user_agent == "nutrient-dws/0.0.0-dev" + + + def test_get_user_agent_includes_correct_library_name(self): + """Should include the correct library name""" + user_agent = get_user_agent() + + assert 'nutrient-dws' in user_agent + + def test_get_user_agent_includes_current_library_version(self): + """Should include the current library version""" + user_agent = get_user_agent() + version = get_library_version() + + assert version in user_agent + + def test_get_user_agent_consistency(self): + """Should have consistent format across multiple calls""" + user_agent1 = get_user_agent() + user_agent2 = get_user_agent() + + assert user_agent1 == user_agent2 + + def test_get_user_agent_expected_format_with_current_version(self): + """Should return the expected User-Agent format with current version""" + user_agent = get_user_agent() + version = get_library_version() + + assert user_agent == f"nutrient-dws/{version}" diff 
"""Unit tests for image file watermark functionality."""

from io import BytesIO
from unittest.mock import MagicMock, patch

import pytest

from nutrient_dws import NutrientClient


class TestWatermarkImageFile:
    """Tests for watermarking PDFs with an uploaded image file."""

    @pytest.fixture
    def client(self):
        """Build a client wired to a dummy API key."""
        return NutrientClient(api_key="test_key")

    @pytest.fixture
    def mock_http_client(self, client):
        """Swap the client's HTTP layer for a MagicMock returning canned bytes."""
        http_mock = MagicMock()
        http_mock.post.return_value = b"PDF content"
        client._http_client = http_mock
        return http_mock

    def test_watermark_pdf_with_image_file_bytes(self, client, mock_http_client):
        """watermark_pdf accepts raw image bytes and builds the right request."""
        result = client.watermark_pdf(
            b"PDF file content",
            image_file=b"PNG image data",
            width=150,
            height=75,
            opacity=0.8,
            position="top-right",
        )

        assert result == b"PDF content"

        # Exactly one POST, aimed at the Build API endpoint.
        mock_http_client.post.assert_called_once()
        post_call = mock_http_client.post.call_args
        assert post_call[0][0] == "/build"

        # Both the source PDF and the watermark image were uploaded.
        uploaded = post_call[1]["files"]
        assert "file" in uploaded
        assert "watermark" in uploaded

        # The Build instructions reference the uploaded parts and carry
        # the watermark geometry through unchanged.
        payload = post_call[1]["json_data"]
        assert payload["parts"] == [{"file": "file"}]
        assert len(payload["actions"]) == 1

        watermark_action = payload["actions"][0]
        assert watermark_action["type"] == "watermark"
        assert watermark_action["width"] == 150
        assert watermark_action["height"] == 75
        assert watermark_action["opacity"] == 0.8
        assert watermark_action["position"] == "top-right"
        assert watermark_action["image"] == "watermark"

    def test_watermark_pdf_with_image_file_object(self, client, mock_http_client):
        """watermark_pdf accepts the image as a file-like object."""
        pdf_stream = BytesIO(b"PDF file content")
        image_stream = BytesIO(b"PNG image data")

        result = client.watermark_pdf(pdf_stream, image_file=image_stream, width=200, height=100)

        assert result == b"PDF content"

        # The stream content must have been uploaded as the watermark part.
        uploaded = mock_http_client.post.call_args[1]["files"]
        assert "watermark" in uploaded

    def test_watermark_pdf_with_output_path(self, client, mock_http_client):
        """With output_path set, the result is saved to disk and None returned."""
        with patch("nutrient_dws.file_handler.save_file_output") as mock_save:
            result = client.watermark_pdf(
                b"PDF file content", image_file=b"PNG image data", output_path="output.pdf"
            )

            assert result is None
            mock_save.assert_called_once_with(b"PDF content", "output.pdf")

    def test_watermark_pdf_error_no_watermark_type(self, client):
        """watermark_pdf rejects calls that supply no watermark source at all."""
        err_msg = "Either text, image_url, or image_file must be provided"
        with pytest.raises(ValueError, match=err_msg):
            client.watermark_pdf(b"PDF content")

    def test_watermark_pdf_text_still_works(self, client, mock_http_client):
        """Text watermarks keep routing through the _process_file path."""
        with patch.object(client, "_process_file", return_value=b"PDF content") as mock_process:
            result = client.watermark_pdf(
                b"PDF content", text="CONFIDENTIAL", width=200, height=100
            )

            assert result == b"PDF content"
            mock_process.assert_called_once_with(
                "watermark-pdf",
                b"PDF content",
                None,
                width=200,
                height=100,
                opacity=1.0,
                position="center",
                text="CONFIDENTIAL",
            )

    def test_watermark_pdf_url_still_works(self, client, mock_http_client):
        """URL watermarks keep routing through the _process_file path."""
        with patch.object(client, "_process_file", return_value=b"PDF content") as mock_process:
            result = client.watermark_pdf(
                b"PDF content", image_url="https://example.com/logo.png", width=200, height=100
            )

            assert result == b"PDF content"
            mock_process.assert_called_once_with(
                "watermark-pdf",
                b"PDF content",
                None,
                width=200,
                height=100,
                opacity=1.0,
                position="center",
                image_url="https://example.com/logo.png",
            )

    def test_builder_api_with_image_file(self, client, mock_http_client):
        """The builder pipeline supports image-file watermark steps."""
        pipeline = client.build(b"PDF content")
        pipeline.add_step(
            "watermark-pdf",
            options={
                "image_file": b"PNG image data",
                "width": 150,
                "height": 75,
                "opacity": 0.5,
                "position": "bottom-right",
            },
        )

        result = pipeline.execute()

        assert result == b"PDF content"

        mock_http_client.post.assert_called_once()
        post_call = mock_http_client.post.call_args

        # Builder uploads use a generated watermark_* field name rather
        # than the fixed "watermark" key used by the direct-call path.
        uploaded = post_call[1]["files"]
        assert "file" in uploaded
        assert any("watermark" in key for key in uploaded)

        payload = post_call[1]["json_data"]
        assert len(payload["actions"]) == 1

        watermark_action = payload["actions"][0]
        assert watermark_action["type"] == "watermark"
        assert watermark_action["width"] == 150
        assert watermark_action["height"] == 75
        assert watermark_action["opacity"] == 0.5
        assert watermark_action["position"] == "bottom-right"
        assert watermark_action["image"].startswith("watermark_")

    def test_watermark_pdf_precedence(self, client, mock_http_client):
        """Text wins when several watermark sources are supplied together."""
        # The implementation picks the first valid source (text > url > file),
        # so supplying text must route through the text code path.
        with patch.object(client, "_process_file", return_value=b"PDF content") as mock_process:
            client.watermark_pdf(b"PDF content", text="TEXT", width=100, height=50)

            mock_process.assert_called_once()
            kwargs = mock_process.call_args[1]
            assert "text" in kwargs
            assert kwargs["text"] == "TEXT"