diff --git a/.github/workflows/algolia-indexing.yml b/.github/workflows/algolia-indexing.yml deleted file mode 100644 index 173d58c78..000000000 --- a/.github/workflows/algolia-indexing.yml +++ /dev/null @@ -1,51 +0,0 @@ ---- -name: Search Indexing -on: - workflow_dispatch: - inputs: - mode: - description: 'Type of indexing. "index" to push to Algolia, "console" for dry run.' - required: true - default: "index" - type: choice - options: - - console - - index - -jobs: - build-and-index: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Node v18 for Yarn v4 - uses: actions/setup-node@v3 - with: - node-version: "18.19.0" # Current LTS version - - - name: Enable Corepack for Yarn - run: corepack enable - - - name: Install Dependencies - run: yarn install - env: - YARN_ENABLE_IMMUTABLE_INSTALLS: false - - - name: Build site - run: yarn build - - env: - NODE_OPTIONS: "--max_old_space_size=8192" - PREFIX_PATHS: true # equivalent to --prefix-paths flag for 'gatsby build' - REPO_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO_OWNER: ${{ github.repository_owner }} - REPO_NAME: ${{ github.event.repository.name }} - REPO_BRANCH: ${{ github.ref_name }} - GATSBY_ALGOLIA_APPLICATION_ID: ${{ secrets.AIO_ALGOLIA_APPLICATION_ID }} - GATSBY_ALGOLIA_SEARCH_API_KEY: ${{ secrets.AIO_ALGOLIA_SEARCH_API_KEY }} - ALGOLIA_WRITE_API_KEY: ${{ secrets.AIO_ALGOLIA_WRITE_API_KEY }} - ALGOLIA_INDEXATION_MODE: ${{ github.event.inputs.mode || 'index' }} - GATSBY_ALGOLIA_INDEX_NAME: ${{ secrets.ALGOLIA_INDEX_NAME || github.event.repository.name }} - GATSBY_FEDS_PRIVACY_ID: ${{ secrets.AIO_FEDS_PRIVACY_ID }} - GATSBY_SITE_DOMAIN_URL: https://developer.adobe.com diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index d35d20379..78b5a439e 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -8,9 +8,10 @@ on: required: true default: "dev" clean: - description: "Clean cache (yes|no)" + description: "Clean 
cache" required: true - default: "no" + type: boolean + default: false excludeSubfolder: description: "Exclude a subfolder from deletion" required: false @@ -24,252 +25,13 @@ on: - console - index jobs: - set-state: - runs-on: ubuntu-latest - outputs: - deploy_prod: ${{ contains(github.event.inputs.env, 'prod') }} - deploy_dev: ${{ contains(github.event.inputs.env, 'dev') }} - clean_cache: ${{ contains(github.event.inputs.clean, 'yes') }} - path_prefix: ${{ steps.get_path_prefix.outputs.path_prefix }} - branch_short_ref: ${{ steps.get_branch.outputs.branch }} - exclude_subfolder: ${{ github.event.inputs.excludeSubfolder }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Get pathPrefix - uses: actions/github-script@v6 - id: get_path_prefix - with: - script: | - const script = require('./.github/scripts/get-path-prefix.js'); - script({ core }); - result-encoding: string - - name: Get branch name - shell: bash - run: echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: get_branch - - echo-state: - needs: [set-state] - runs-on: ubuntu-latest - steps: - - run: echo "Deploy to dev - ${{ needs.set-state.outputs.deploy_dev }}" - - run: echo "Deploy to prod - ${{ needs.set-state.outputs.deploy_prod }}" - - run: echo "Clean cache - ${{ needs.set-state.outputs.clean_cache }}" - - run: echo "Repository org - ${{ github.event.repository.owner.login }}" - - run: echo "Repository name - ${{ github.event.repository.name }}" - - run: echo "Repository branch - ${{ needs.set-state.outputs.branch_short_ref }}" - - run: echo "Path prefix - ${{ needs.set-state.outputs.path_prefix }}" - - run: echo "Exclude subfolder - ${{ needs.set-state.outputs.exclude_subfolder }}" - - pre-build-dev: - needs: [set-state] - runs-on: ubuntu-latest - if: needs.set-state.outputs.deploy_dev == 'true' - steps: - - name: check dev azure connection string - if: env.AIO_AZURE_DEV_CONNECTION_STRING == null - run: | - echo "::error::Please set the Azure Blob Storage connection 
string as AIO_AZURE_DEV_CONNECTION_STRING in Github Secrets" - exit 1 - env: - AIO_AZURE_DEV_CONNECTION_STRING: ${{ secrets.AIO_AZURE_DEV_CONNECTION_STRING }} - - build-dev: - defaults: - run: - shell: bash - needs: [set-state, pre-build-dev] - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Setup Node v16 for Yarn v3 - uses: actions/setup-node@v3 - with: - node-version: "16.15.0" # Current LTS version - - - name: Enable Corepack for Yarn v3 - run: corepack enable - - - name: Install Yarn v3 - uses: borales/actions-yarn@v3 - with: - cmd: set version stable - - - name: Install Dependencies - uses: borales/actions-yarn@v3 - env: - YARN_ENABLE_IMMUTABLE_INSTALLS: false - with: - cmd: install - - - name: Gatsby Cache - uses: actions/cache@v3 - with: - path: | - public - .cache - key: ${{ needs.set-state.outputs.branch_short_ref }}-gatsby-cache-${{ github.run_id }} - restore-keys: | - ${{ needs.set-state.outputs.branch_short_ref }}-gatsby-cache- - - - name: Clean Cache - if: needs.set-state.outputs.clean_cache == 'true' - uses: borales/actions-yarn@v3 - with: - cmd: clean - - - name: Build site - uses: borales/actions-yarn@v3 - with: - cmd: build - env: - PREFIX_PATHS: true # equivalent to --prefix-paths flag for 'gatsby build' - PATH_PREFIX: ${{ needs.set-state.outputs.path_prefix }} - GATSBY_ADOBE_LAUNCH_SRC: ${{ secrets.AIO_ADOBE_LAUNCH_DEV_SRC }} - GATSBY_ADDITIONAL_ADOBE_ANALYTICS_ACCOUNTS: ${{ secrets.AIO_REPORT_SUITE_DEV}} - GATSBY_ADOBE_ANALYTICS_ENV: "dev" - REPO_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO_OWNER: ${{ github.event.repository.owner.login }} - REPO_NAME: ${{ github.event.repository.name }} - REPO_BRANCH: ${{ needs.set-state.outputs.branch_short_ref }} - GOOGLE_OAUTH_CLIENT_ID: ${{ secrets.GOOGLE_OAUTH_CLIENT_ID }} - GOOGLE_OAUTH_CLIENT_SECRET: ${{ secrets.GOOGLE_OAUTH_CLIENT_SECRET }} - GOOGLE_DOCS_TOKEN: ${{ secrets.GOOGLE_DOCS_TOKEN }} - GOOGLE_DOCS_FOLDER_ID: ${{ secrets.GOOGLE_DOCS_FOLDER_ID }} - 
GATSBY_IMS_SRC: ${{ secrets.AIO_IMS_DEV_SRC }} - GATSBY_IMS_CONFIG: ${{ secrets.AIO_IMS_DEV_CONFIG }} - GATSBY_ALGOLIA_APPLICATION_ID: ${{ secrets.AIO_ALGOLIA_APPLICATION_ID }} - GATSBY_ALGOLIA_SEARCH_API_KEY: ${{ secrets.AIO_ALGOLIA_SEARCH_API_KEY }} - GATSBY_ALGOLIA_APP_ID: ${{ secrets.AIO_ALGOLIA_APP_ID }} - GATSBY_ALGOLIA_API_KEY: ${{ secrets.AIO_ALGOLIA_API_KEY }} - GATSBY_ALGOLIA_INDEX_ALL_SRC: ${{ secrets.AIO_ALGOLIA_INDEX_ALL_SRC }} - GATSBY_ALGOLIA_SEARCH_INDEX: ${{ secrets.AIO_ALGOLIA_SEARCH_INDEX }} - GATSBY_ALGOLIA_INDEX_ENV_PREFIX: ${{ secrets.AIO_ALGOLIA_INDEX_ENV_PREFIX }} - GATSBY_FEDS_PRIVACY_ID: ${{ secrets.AIO_FEDS_PRIVACY_ID }} - GATSBY_SITE_DOMAIN_URL: https://developer-stage.adobe.com - NODE_OPTIONS: "--max_old_space_size=8192" - - name: Deploy - uses: AdobeDocs/static-website-deploy@master - with: - enabled-static-website: "true" - source: "public" - target: ${{ needs.set-state.outputs.path_prefix }} - connection-string: ${{ secrets.AIO_AZURE_DEV_CONNECTION_STRING }} - remove-existing-files: "true" - exclude-subfolder: ${{ needs.set-state.outputs.exclude_subfolder }} - - name: Purge Fastly Cache - uses: AdobeDocs/gatsby-fastly-purge-action@master - with: - fastly-token: ${{ secrets.AIO_FASTLY_TOKEN }} - fastly-url: "${{ secrets.AIO_FASTLY_DEV_URL}}${{ needs.set-state.outputs.path_prefix }}" - - pre-build-production: - needs: [set-state] - runs-on: ubuntu-latest - if: needs.set-state.outputs.deploy_prod == 'true' - steps: - - name: check prod azure connection string - if: env.AIO_AZURE_PROD_CONNECTION_STRING == null - run: | - echo "::error::Please set the Azure Blob Storage connection string as AIO_AZURE_PROD_CONNECTION_STRING in Github Secrets" - exit 1 - env: - AIO_AZURE_PROD_CONNECTION_STRING: ${{ secrets.AIO_AZURE_PROD_CONNECTION_STRING }} - - build-production: - defaults: - run: - shell: bash - needs: [set-state, pre-build-production] - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Setup Node 
v16 for Yarn v3 - uses: actions/setup-node@v3 - with: - node-version: "16.15.0" # Current LTS version - - - name: Enable Corepack for Yarn v3 - run: corepack enable - - - name: Install Yarn v3 - uses: borales/actions-yarn@v3 - with: - cmd: set version stable - - - name: Install Dependencies - uses: borales/actions-yarn@v3 - env: - YARN_ENABLE_IMMUTABLE_INSTALLS: false - with: - cmd: install - - - name: Gatsby Cache - uses: actions/cache@v3 - with: - path: | - public - .cache - key: ${{ needs.set-state.outputs.branch_short_ref }}-gatsby-cache-${{ github.run_id }} - restore-keys: | - ${{ needs.set-state.outputs.branch_short_ref }}-gatsby-cache- - - - name: Clean Cache - if: needs.set-state.outputs.clean_cache == 'true' - uses: borales/actions-yarn@v3 - with: - cmd: clean - - - name: Build site - uses: borales/actions-yarn@v3 - with: - cmd: build - env: - PREFIX_PATHS: true # equivalent to --prefix-paths flag for 'gatsby build' - PATH_PREFIX: ${{ needs.set-state.outputs.path_prefix }} - GATSBY_ADOBE_LAUNCH_SRC: ${{ secrets.AIO_ADOBE_LAUNCH_PROD_SRC }} - GATSBY_ADDITIONAL_ADOBE_ANALYTICS_ACCOUNTS: ${{ secrets.AIO_REPORT_SUITE_PROD }} - GATSBY_ADOBE_ANALYTICS_ENV: "production" - REPO_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO_OWNER: ${{ github.event.repository.owner.login }} - REPO_NAME: ${{ github.event.repository.name }} - REPO_BRANCH: ${{ needs.set-state.outputs.branch_short_ref }} - GOOGLE_OAUTH_CLIENT_ID: ${{ secrets.GOOGLE_OAUTH_CLIENT_ID }} - GOOGLE_OAUTH_CLIENT_SECRET: ${{ secrets.GOOGLE_OAUTH_CLIENT_SECRET }} - GOOGLE_DOCS_TOKEN: ${{ secrets.GOOGLE_DOCS_TOKEN }} - GOOGLE_DOCS_FOLDER_ID: ${{ secrets.GOOGLE_DOCS_FOLDER_ID }} - GATSBY_IMS_SRC: ${{ secrets.AIO_IMS_PROD_SRC }} - GATSBY_IMS_CONFIG: ${{ secrets.AIO_IMS_PROD_CONFIG }} - GATSBY_ALGOLIA_APPLICATION_ID: ${{ secrets.AIO_ALGOLIA_APPLICATION_ID }} - GATSBY_ALGOLIA_SEARCH_API_KEY: ${{ secrets.AIO_ALGOLIA_SEARCH_API_KEY }} - GATSBY_ALGOLIA_APP_ID: ${{ secrets.AIO_ALGOLIA_APP_ID }} - 
GATSBY_ALGOLIA_API_KEY: ${{ secrets.AIO_ALGOLIA_API_KEY }} - ALGOLIA_WRITE_API_KEY: ${{ secrets.AIO_ALGOLIA_WRITE_API_KEY }} - ALGOLIA_INDEXATION_MODE: ${{ github.event.inputs.index-mode || 'index' }} - GATSBY_ALGOLIA_INDEX_NAME: ${{ secrets.ALGOLIA_INDEX_NAME || github.event.repository.name }} - GATSBY_ALGOLIA_INDEX_ALL_SRC: ${{ secrets.AIO_ALGOLIA_INDEX_ALL_SRC }} - GATSBY_ALGOLIA_SEARCH_INDEX: ${{ secrets.AIO_ALGOLIA_SEARCH_INDEX }} - GATSBY_ALGOLIA_INDEX_ENV_PREFIX: ${{ secrets.AIO_ALGOLIA_INDEX_ENV_PREFIX }} - GATSBY_FEDS_PRIVACY_ID: ${{ secrets.AIO_FEDS_PRIVACY_ID }} - GATSBY_SITE_DOMAIN_URL: https://developer.adobe.com - NODE_OPTIONS: "--max_old_space_size=8192" - - name: Deploy - uses: AdobeDocs/static-website-deploy@master - with: - enabled-static-website: "true" - source: "public" - target: ${{ needs.set-state.outputs.path_prefix }} - connection-string: ${{ secrets.AIO_AZURE_PROD_CONNECTION_STRING }} - remove-existing-files: "true" - exclude-subfolder: ${{ needs.set-state.outputs.exclude_subfolder }} - - name: Purge Fastly Cache - uses: AdobeDocs/gatsby-fastly-purge-action@master - with: - fastly-token: ${{ secrets.AIO_FASTLY_TOKEN }} - fastly-url: "${{ secrets.AIO_FASTLY_PROD_URL }}${{ needs.set-state.outputs.path_prefix }}" + deployment: + name: Deployment + uses: AdobeDocs/adp-devsite-workflow/.github/workflows/gatsby-deploy.yml@main + secrets: inherit + with: + env: ${{ inputs.env }} + clean: ${{ inputs.clean }} + excludeSubfolder: ${{ inputs.excludeSubfolder }} + index-mode: ${{ inputs.index-mode }} + NODE_OPTIONS: "--max-old-space-size=8192" diff --git a/.github/workflows/test-pull-request.yml b/.github/workflows/test-pull-request.yml index 115c3d00c..cf9aa31d5 100644 --- a/.github/workflows/test-pull-request.yml +++ b/.github/workflows/test-pull-request.yml @@ -57,33 +57,20 @@ jobs: VALIDATE_ALL_CODEBASE: false VALIDATE_GITHUB_ACTIONS: true - - name: Setup Node v16 for Yarn v3 + - name: Setup Node v20 for Yarn v3 uses: actions/setup-node@v3 with: - 
node-version: '16.15.0' # Current LTS version + node-version: '20.19.5' # Current LTS version - name: Enable Corepack for Yarn v3 run: corepack enable - - name: Install Yarn v3 - uses: borales/actions-yarn@v3 - with: - cmd: set version stable - - name: Install dependencies - uses: borales/actions-yarn@v3 - env: - YARN_ENABLE_IMMUTABLE_INSTALLS: false - with: - cmd: install + run: yarn install - name: Check internal links - uses: borales/actions-yarn@v3 - with: - cmd: test:links + run: yarn test:links - name: Build site if: ${{ success() }} - uses: borales/actions-yarn@v3 - with: - cmd: build + run: yarn build diff --git a/README.md b/README.md index 6b9c9048e..1928c4338 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## How to develop -For local development, simply use : +For local development, simply use: ```bash $ yarn install diff --git a/gatsby-browser.js b/gatsby-browser.js index 3d146e8c7..19967e7cf 100644 --- a/gatsby-browser.js +++ b/gatsby-browser.js @@ -262,6 +262,13 @@ export const onRouteUpdate = ({ location, prevLocation }) => { ) { pageHeadTittle = "PDF Services API Extract PDF"; } else if ( + window.location.pathname.indexOf( + "pdf-services-api/howtos/pdf-to-markdown-api/" + ) >= 0 + ) { + pageHeadTittle = "PDF Services API PDF to Markdown API"; + } + else if ( window.location.pathname.indexOf( "pdf-services-api/howtos/pdf-properties/" ) >= 0 diff --git a/gatsby-config.js b/gatsby-config.js index 1a6250c89..e0659db92 100644 --- a/gatsby-config.js +++ b/gatsby-config.js @@ -33,6 +33,11 @@ module.exports = { description: 'Create, combine and export PDFs', path: '../document-services/apis/pdf-services/' }, + { + title: 'PDF to Markdown', + description: 'Convert PDF documents to Markdown format', + path: '../document-services/apis/pdf-to-markdown/' + }, { title: 'PDF Accessibility Auto-Tag', description: 'Auto-tag PDF content to improve accessibility', @@ -229,6 +234,10 @@ module.exports = { title: 'Extract PDF', path: 
'overview/pdf-services-api/howtos/extract-pdf.md' }, + { + title: 'PDF to Markdown API', + path: 'overview/pdf-services-api/howtos/pdf-to-markdown-api.md' + }, { title: 'Get PDF Properties', path: 'overview/pdf-services-api/howtos/pdf-properties.md' @@ -261,6 +270,10 @@ module.exports = { title: 'PDF Electronic Seal', path: 'overview/pdf-services-api/howtos/electronic-seal-api.md' }, + { + title: 'PDF To Markdown', + path: 'overview/pdf-services-api/howtos/pdf-to-markdown-api.md' + }, { title: 'PDF Watermark', path: 'overview/pdf-services-api/howtos/pdf-watermark-api.md' @@ -366,20 +379,26 @@ module.exports = { path: 'overview/pdf-extract-api/quickstarts', pages: [ { - title:'Node.js', - path: 'overview/pdf-extract-api/quickstarts/nodejs/index.md' - }, - { - title:'Java', - path: 'overview/pdf-extract-api/quickstarts/java/index.md' - }, - { - title:'.NET', - path: 'overview/pdf-extract-api/quickstarts/dotnet/index.md' - }, - { - title:'Python', - path: 'overview/pdf-extract-api/quickstarts/python/index.md' + title: 'Extract PDF', + path: 'overview/pdf-extract-api/quickstarts/extract-pdf/index.md', + pages: [ + { + title:'Node.js', + path: 'overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/index.md' + }, + { + title:'Java', + path: 'overview/pdf-extract-api/quickstarts/extract-pdf/java/index.md' + }, + { + title:'.NET', + path: 'overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/index.md' + }, + { + title:'Python', + path: 'overview/pdf-extract-api/quickstarts/extract-pdf/python/index.md' + } + ] } ] }, @@ -391,6 +410,10 @@ module.exports = { { title: 'PDF Extract API', path: 'overview/pdf-extract-api/howtos/extract-api.md' + }, + { + title: 'PDF to Markdown API', + path: 'overview/pdf-extract-api/howtos/pdf-to-markdown-api.md' } ] } @@ -716,6 +739,10 @@ module.exports = { title: 'Extract PDF', path: 'overview/legacy-documentation/pdf-services-api/howtos/extract-pdf.md' }, + { + title: 'PDF to Markdown', + path: 
'overview/legacy-documentation/pdf-services-api/howtos/pdf-to-markdown-api.md' + }, { title: 'Get PDF Properties', path: 'overview/legacy-documentation/pdf-services-api/howtos/pdf-properties.md' diff --git a/src/pages/overview/document-generation-api/stylingformattingtags.md b/src/pages/overview/document-generation-api/stylingformattingtags.md index 95d2ef04d..fa7ca62e9 100644 --- a/src/pages/overview/document-generation-api/stylingformattingtags.md +++ b/src/pages/overview/document-generation-api/stylingformattingtags.md @@ -121,6 +121,16 @@ JSON representation of the input data: - [ & (ordered lists and unordered lists)](/overview/document-generation-api/templatetags/#insert-list-using-ul-and-ol-html-elements) +- (superscript) + +- (subscript) + +- , , , , , (headings) + +- (left, center, right, justify) + +- (units: pt, px, em, cm, mm, in) + JSON representation of the input data: ```json diff --git a/src/pages/overview/limits.md b/src/pages/overview/limits.md index 93d3d4535..a075b1fb5 100644 --- a/src/pages/overview/limits.md +++ b/src/pages/overview/limits.md @@ -19,7 +19,7 @@ for purchase options. Document Transactions for the Operations below resulting in a Document will be counted with the following page limits: -
  • Extract Operation: 1 Document Transaction for up to 5 pages
  • Accessibility Auto-Tag Operation: 10 Document Transactions per page
  • Electronic Seal Operation: 10 Document Transactions per PDF
  • All other Operations: 1 Document Transaction for up to 50 pages
+
  • Extract and PDF To Markdown Operations: 1 Document Transaction for up to 5 pages
  • Accessibility Auto-Tag Operation: 10 Document Transactions per page
  • Electronic Seal Operation: 10 Document Transactions per PDF
  • All other Operations: 1 Document Transaction for up to 50 pages
@@ -30,8 +30,8 @@ Document Transactions for the Operations below resulting in a Document will be c | Accessibility Auto-Tag | Tag the content in a PDF to improve accessibility | 10 Document Transactions (per page) | | Electronic Seal | Applies an organizational seal to a PDF using a digital certificate | 10 Document Transactions (per PDF) | | Document Generation | Create PDF and Word documents from your own authored Microsoft Word templates and JSON data | Document Transaction | -| Extract | Extracts PDF Elements such as text, images, tables in a structured format from a PDF. | Document Transaction (per 5 pages) | -| Create | Create PDF from static/dynamic HTML, HTML(with inline CSS), HTML(specified via URL), MS Office and other supported file types. File types: BMP, DOC, DOCX, GIF, JPEG, JPG, PNG, PPT, PPTX, RTF, TIF, TIFF, TXT, XLS, XLSX, ZIP | Document Transaction | +| Extract and PDF To Markdown | Extracts PDF Elements such as text, images, tables in a structured format from a PDF. | Document Transaction (per 5 pages) | +| Create | Create PDF from static/dynamic HTML, HTML(with inline CSS), HTML(specified via URL), MS Office and other supported file types. File types: BMP, DOC, DOCX, GIF, JPEG, JPG, MD, PNG, PPT, PPTX, RTF, TIF, TIFF, TXT, XLS, XLSX, ZIP | Document Transaction | | Export | Convert PDF to MS Office and other supported file types. File types: DOC, DOCX, JPEG, PNG, PPTX, RTF, XLSX | Document Transaction | | Combine | Combine multiple PDFs or pages in PDFs to a single PDF | Document Transaction | | OCR | Convert scanned PDF to editable and searchable PDF | Document Transaction | @@ -58,7 +58,7 @@ Document Transactions for the Operations below resulting in a Document will be c | Document Generation | Generating a fifteen-page PDF Document from a two-page Microsoft Word template and 1MB JSON file consumes one Document Transaction. 
| 1 (Operation: Document Generation) x 1 (Document) = 1 Document Transaction | | Accessibility Auto-Tag | Auto-tagging a five-page PDF consumes 50 Document Transactions. | 1 (Operation: Auto-Tag) X 1 (5-page document) = 50 Document Transactions | | Electronic Seal | Sealing one PDF Document consumes 10 Document Transactions. | 1 (Operation: Electronic Sealing) X 1 (document) = 10 Document Transactions | -| Extract PDF | Extracting PDF Elements from an 8-page PDF Document consumes two Document Transactions. | 1 (Operation: Extract PDF) x 2 (5-page document) = 2 Document Transactions | +| Extract PDF and PDF To Markdown | Extracting PDF Elements from an 8-page PDF Document consumes two Document Transactions. | 1 (Operation: Extract PDF) x 2 (5-page document) = 2 Document Transactions | | Create, Export | Converting ten single page Word Documents into ten PDF Documents consumes ten Document Transactions. | 1 (Operation: create) x 10 (documents) = 10 Document Transactions | | Combine | Combining six different PDF Documents into a single 250-page PDF Document consumes five Document Transactions. | 1 (Operation: combine) x 5 (50-page documents) = 5 Document Transactions | | OCR | Running OCR on a single 50-page document consumes one Document Transaction. | 1 (Operation: OCR) x 1 (50-page document) = 1 Document Transaction | @@ -74,15 +74,16 @@ Document Transactions for the Operations below resulting in a Document will be c There are several usage limits that apply to PDF Services API and its underlying Operations based on one initial endpoint request. Files submitted for processing that exceed usage limits below will fail and result in an error message. 
-| USAGE LIMIT | VALUE | -|---------------------------------------------------------------------| ----------- | -| Document limit (Combine, Insert, Replace, Split) | 20 | -| File size (for all documents)** | 100MB | -| Output images per Document Transaction (Export) | 50 | -| Page limit (Extract and Accessibility Auto-Tag)* | 400 | -| Page limit (Scanned - Extract and Accessibility Auto-Tag)* | 150 | -| JSON file size (Document Generation,HTML to PDF and Import PDFForm) | 10MB | -| Maximum Requests Per Minute | 100 RPM** (Enterprise),
25 RPM (Free Tier)| +| USAGE LIMIT | VALUE | +|-----------------------------------------------------------------------------| ----------- | +| Document limit (Combine, Insert, Replace, Split) | 20 | +| File size (for all documents)** | 100MB | +| Output images per Document Transaction (Export) | 50 | +| Page limit (Extract and PDF To Markdown)* | 400 | +| Page limit (Accessibility Auto-Tag)* | 200 | +| Page limit (Scanned - Extract, PDF To Markdown and Accessibility Auto-Tag)* | 150 | +| JSON file size (Document Generation,HTML to PDF and Import PDFForm) | 10MB | +| Maximum Requests Per Minute | 100 RPM** (Enterprise),
25 RPM (Free Tier)| **Page limits may be lower for documents with a large number of tables.* diff --git a/src/pages/overview/pdf-extract-api/gettingstarted.md b/src/pages/overview/pdf-extract-api/gettingstarted.md index 5403e30d7..57a7c8ab6 100644 --- a/src/pages/overview/pdf-extract-api/gettingstarted.md +++ b/src/pages/overview/pdf-extract-api/gettingstarted.md @@ -70,9 +70,21 @@ To create a job for the operation, please use the `assetID` obtained in Step 2 For creating the job, please refer to the corresponding API spec for the particular [PDF Operation](../../../apis). +**Choose your operation endpoint:** + +- **For JSON output (Extract PDF):** Use the `/operation/extractpdf` endpoint +- **For Markdown output (PDF to Markdown):** Use the `/operation/pdftomarkdown` endpoint + +[Extract PDF API Reference](../../../apis/#operation/pdfoperations.extractpdf) | [PDF to Markdown API Reference](../../../apis/#operation/pdfoperations.pdftomarkdown) + ## Step 4 : Fetching the status -Once the job is successfully created, you need to poll the at the `location` returned in response header of Step 3 by using the following API +Once the job is successfully created, you need to poll the job status at the `location` returned in the response header of Step 3 by using the following API. + +**Status polling endpoints:** + +- **For Extract PDF (JSON):** `/operation/extractpdf/{jobId}/status` +- **For PDF to Markdown:** `/operation/pdftomarkdown/{jobId}/status` You can read more about the API in detail [here](../../../apis/#operation/pdfoperations.compresspdf.jobstatus).
diff --git a/src/pages/overview/pdf-extract-api/howtos/pdf-to-markdown-api.md b/src/pages/overview/pdf-extract-api/howtos/pdf-to-markdown-api.md new file mode 100644 index 000000000..ec81a8ddc --- /dev/null +++ b/src/pages/overview/pdf-extract-api/howtos/pdf-to-markdown-api.md @@ -0,0 +1,118 @@ +--- +title: PDF to Markdown | Adobe PDF Services +description: Learn about the PDF to Markdown service that converts PDF documents into well-formatted LLM-friendly Markdown text. +--- + +# PDF to Markdown + +The PDF to Markdown API (included with the PDF Services API) is a cloud-based web service that automatically converts PDF documents – native or scanned – into well-formatted LLM-friendly Markdown text. This service preserves the document's structure and formatting while converting it into a format that's widely used for LLM flows, content authoring and documentation. + +## Structured Information Output Format + +The output of a PDF to Markdown operation includes: + +- A primary `.md` file containing the converted Markdown content + +### Output Structure + +The following is a summary of key elements in the converted Markdown: + +#### Elements + +Ordered list of semantic elements converted from the PDF document, preserving the natural reading order and document structure. 
The conversion handles: + +- Text content with proper Markdown syntax +- Document hierarchy and structure +- Inline formatting and emphasis +- Links and references +- Images and figures +- Tables and complex layouts + +#### Content Types + +The API processes various content types as follows: + +##### Text Elements + +- **Headings**: Converted to appropriate Markdown heading levels (H1-H6) +- **Paragraphs**: Preserved with proper spacing and formatting +- **Lists**: Both ordered and unordered lists with proper nesting +- **Text Emphasis**: Bold, italic, and other text formatting +- **Links**: Preserved with proper Markdown link syntax + +##### Images and Figures + +- Provided as base64-embedded images in the Markdown output +- Referenced correctly in the Markdown output +- Original quality preserved +- Proper alt-text and captions maintained + +##### Tables + +- Converted to Markdown table syntax +- Column alignment preserved +- Cell content formatting maintained +- Complex table structures supported + +#### Element Types and Paths + +The API recognizes and converts the following structural elements: + +| Category | Element Type | Description | +| --------- | ----------------- | --------------------------------------------------------- | +| Aside | Aside | Content that is not part of the regular content flow | +| Figure | Figure | Non-reflowable constructs such as graphs, images, and flowcharts | +| Footnote | Footnote | Footnote | +| Headings | H, H1, H2, etc | Heading levels | +| List | L, Li, Lbl, Lbody | List and list item elements | +| Paragraph | P, ParagraphSpan | Paragraphs and paragraph segments | +| Reference | Reference | Links | +| Section | Sect | Logical section of the document | +| StyleSpan | StyleSpan | Styling variations within text | +| Table | Table, TD, TH, TR | Table elements | +| Title | Title | Document title | + +### Reading Order + +The reading order in the output Markdown maintains: + +- Natural document flow +- Proper content hierarchy +- 
Column-based layouts +- Page transitions +- Inline elements and references + +## Use Cases + +The PDF to Markdown API is particularly valuable for: + +- **LLM and RAG ingestion**: Convert PDFs to Markdown for chunking, embeddings, and retrieval-augmented generation (RAG). +- **Prompt and context packaging**: Produce Markdown that is easy to paste, structure, and cite in prompts and agent workflows. +- **Training data preparation**: Create LLM fine-tuning datasets from PDF content after review, cleanup, and labeling. +- **Doc-as-code workflows**: Bring PDF content into Git-based review, versioning, diffing, and static-site generators. +- **Knowledge base publishing**: Migrate PDFs into documentation platforms and internal wikis as clean, editable Markdown. +- **Legacy and archive modernization**: Convert historical PDFs so they become searchable, editable, and maintainable. +- **Automated document processing**: Standardize PDF-to-text conversion inside ETL and document-processing pipelines. +- **Enterprise search and indexing**: Feed converted Markdown into internal search systems and knowledge repositories. +- **Compliance and audit readiness**: Make PDF policies, SOPs, and manuals searchable and easier to review for changes. +- **Content QA and change tracking**: Compare converted Markdown across document versions to detect updates and regressions. +- **Analytics and classification**: Use Markdown output for topic modeling, tagging, deduplication, and routing workflows. +- **Localization workflows**: Convert to Markdown as a starting point for translation and multi-language documentation. + +## API Limitations + +For File Constraints and Processing Limits, see [Licensing and Usage Limits](../dcserviceslicensing/). 
+ +### Document Requirements + +- Files must be unprotected or allow content copying +- No support for: + - Hidden objects (JavaScript, OCG) + - XFA and fillable forms + - Complex annotations + - CAD drawings or vector art + - Password-protected content + +## REST API + +See our public API Reference for [PDF to Markdown API](../../../apis/#tag/PDF-To-Markdown). diff --git a/src/pages/overview/pdf-extract-api/index.md b/src/pages/overview/pdf-extract-api/index.md index 395519948..13378d86f 100644 --- a/src/pages/overview/pdf-extract-api/index.md +++ b/src/pages/overview/pdf-extract-api/index.md @@ -6,15 +6,50 @@ title: PDF Extract API | Adobe PDF Services **What is Extract?** -[The PDF Extract API (included with the PDF Services API)](./howtos/extract-api.md) is a cloud-based web service that uses Adobe’s Sensei AI technology to automatically extract content and structural information from PDF documents – native or scanned – and to output it in a structured JSON format. The service extracts text, complex tables, and figures as follows: +The PDF Extract API suite (included with the PDF Services API) is a cloud-based web service that uses Adobe's Sensei AI technology to automatically extract content and structural information from PDF documents – native or scanned. Two output formats are available: + +- **Structured JSON output** ([Extract PDF](./howtos/extract-api.md)): Extracts content and structural information in JSON format +- **Markdown output** ([PDF to Markdown](./howtos/pdf-to-markdown-api.md)): Converts PDF content to well-formatted, LLM-friendly Markdown text + +Both formats extract text, complex tables, and figures from PDF documents: - Text is extracted in contextual blocks – paragraphs, headings, lists, footnotes, etc. – and includes font, styling, and other text formatting information. -- Tables are extracted and parsed with the contents and table formatting information delivered for each cell. 
The service automatically identifies table cells that span multiple rows or columns. Table data is delivered within the resulting JSON and can also optionally be output in CSV and XLSX files. Tables are also output as PNG images allowing the table data to be visually validated. -- Objects that are identified as figures or images are extracted as PNG files. +- Tables are extracted and parsed with the contents and table formatting information delivered for each cell. The service automatically identifies table cells that span multiple rows or columns. +- For Extract PDF (JSON output), table data is delivered within the resulting JSON and can also optionally be output in CSV and XLSX files. Tables are also output as PNG images allowing the table data to be visually validated. + +- Objects that are identified as figures or images are extracted as PNG files (Extract PDF) or as base64-embedded images (PDF to Markdown). + +## Choose Your Output Format + +The PDF Extract API provides two distinct endpoints under the same product umbrella: + +### JSON (Extract PDF) + +Best for: +- Structured downstream processing +- Layout and structure analysis +- Detailed element type information +- Custom data extraction workflows +- Integration with data analysis systems + +The JSON output captures document structure information, such as the natural reading order of the various extracted elements and the layout of the elements on each given page. Table data can optionally be delivered in CSV and XLSX files, and images are extracted as PNG files. + +[Learn more about Extract PDF →](./howtos/extract-api.md) + +### Markdown (PDF to Markdown) + +Best for: +- LLM ingestion and prompt creation +- Training/Fine-tuning LLMs with PDF content +- Documentation workflows +- Content republishing and migration +- Modern documentation systems +- Searchable knowledge repositories + +The Markdown output preserves document structure and reading order while converting content to a widely used text format.
Tables are converted to Markdown table syntax, and figures can be embedded as base64 images. -The JSON output also captures document structure information, such as the natural reading order of the various extracted elements and the layout of the elements on each given page. +[Learn more about PDF to Markdown →](./howtos/pdf-to-markdown-api.md) -The PDF Extract API provides a method for developers to extract and structure content for use in a number of downstream applications including content republishing, content processing, data analysis, and content aggregation, management, and search. The PDF Extract API can be embedded into any application using the [PDFServices SDK](../pdf-extract-api/gettingstarted#sdk) for Node.js, Python, .NET and Java. [Start with a Free Tier which includes 500 free Document Transactions per month.](https://acrobatservices.adobe.com/dc-integration-creation-app-cdn/main.html?api=pdf-extract-api) diff --git a/src/pages/overview/pdf-extract-api/quickstarts/dotnet/Adobe Extract API Sample.pdf b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/Adobe Extract API Sample.pdf similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/dotnet/Adobe Extract API Sample.pdf rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/Adobe Extract API Sample.pdf diff --git a/src/pages/overview/pdf-extract-api/quickstarts/dotnet/index.md b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/index.md similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/dotnet/index.md rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/index.md diff --git a/src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot1.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot1.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot1.png rename to 
src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot1.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot2_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot2_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot2_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot2_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot3_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot3_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot3_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot3_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot5_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot5_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot5_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot5_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot6_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot6_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot6_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot6_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot9.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot9.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/dotnet/shot9.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/dotnet/shot9.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/index.md b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/index.md new file mode 
100644 index 000000000..e4226348c --- /dev/null +++ b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/index.md @@ -0,0 +1,12 @@ +--- +title: Quickstarts | Extract PDF API | Adobe PDF Services +--- + +# Extract PDF - Quickstarts + +The following quickstarts will help you run your first successful operation and are tailored to our supported SDKs: + +* [Node.js](nodejs/) +* [Java](java/) +* [.NET](dotnet) +* [Python](python) \ No newline at end of file diff --git a/src/pages/overview/pdf-extract-api/quickstarts/java/Adobe Extract API Sample.pdf b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/Adobe Extract API Sample.pdf similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/java/Adobe Extract API Sample.pdf rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/Adobe Extract API Sample.pdf diff --git a/src/pages/overview/pdf-extract-api/quickstarts/java/index.md b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/index.md similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/java/index.md rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/index.md diff --git a/src/pages/overview/pdf-extract-api/quickstarts/java/shot1.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot1.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/java/shot1.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot1.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/java/shot2_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot2_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/java/shot2_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot2_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/java/shot3_spc.png 
b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot3_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/java/shot3_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot3_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/java/shot5_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot5_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/java/shot5_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot5_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/java/shot6_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot6_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/java/shot6_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot6_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/java/shot7.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot7.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/java/shot7.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot7.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/java/shot9.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot9.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/java/shot9.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/java/shot9.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/nodejs/Adobe Extract API Sample.pdf b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/Adobe Extract API Sample.pdf similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/nodejs/Adobe Extract API Sample.pdf rename to 
src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/Adobe Extract API Sample.pdf diff --git a/src/pages/overview/pdf-extract-api/quickstarts/nodejs/index.md b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/index.md similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/nodejs/index.md rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/index.md diff --git a/src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot1.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot1.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot1.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot1.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot2_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot2_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot2_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot2_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot3_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot3_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot3_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot3_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot5_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot5_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot5_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot5_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot6_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot6_spc.png 
similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot6_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot6_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot7.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot7.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot7.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot7.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot8.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot8.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot8.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot8.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot9.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot9.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/nodejs/shot9.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/nodejs/shot9.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/python/Adobe Extract API Sample.pdf b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/Adobe Extract API Sample.pdf similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/python/Adobe Extract API Sample.pdf rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/Adobe Extract API Sample.pdf diff --git a/src/pages/overview/pdf-extract-api/quickstarts/python/index.md b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/index.md similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/python/index.md rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/index.md diff --git 
a/src/pages/overview/pdf-extract-api/quickstarts/python/shot1.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot1.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/python/shot1.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot1.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/python/shot2_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot2_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/python/shot2_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot2_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/python/shot3_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot3_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/python/shot3_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot3_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/python/shot5_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot5_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/python/shot5_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot5_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/python/shot6_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot6_spc.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/python/shot6_spc.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot6_spc.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/python/shot7.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot7.png similarity index 100% rename from 
src/pages/overview/pdf-extract-api/quickstarts/python/shot7.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot7.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/python/shot8.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot8.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/python/shot8.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot8.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/python/shot9.png b/src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot9.png similarity index 100% rename from src/pages/overview/pdf-extract-api/quickstarts/python/shot9.png rename to src/pages/overview/pdf-extract-api/quickstarts/extract-pdf/python/shot9.png diff --git a/src/pages/overview/pdf-extract-api/quickstarts/index.md b/src/pages/overview/pdf-extract-api/quickstarts/index.md index 5198b8e5d..daa8c90b1 100644 --- a/src/pages/overview/pdf-extract-api/quickstarts/index.md +++ b/src/pages/overview/pdf-extract-api/quickstarts/index.md @@ -4,9 +4,6 @@ title: Quickstarts | PDF Extract API | Adobe PDF Services # Quickstarts -Want to quickly test out PDF Extract API? The following quickstarts will help you run your first successful operation and are tailored to our supported SDKs: +Want to quickly test out Extract PDF API and PDF To Markdown API? 
Choose your operation to get started: -* [Node.js](nodejs/) -* [Java](java/) -* [.NET](dotnet) -* [Python](python) \ No newline at end of file +* [Extract PDF](extract-pdf/) \ No newline at end of file diff --git a/src/pages/overview/pdf-services-api/dcserviceslicensing.md b/src/pages/overview/pdf-services-api/dcserviceslicensing.md index 8b0b4de39..2e31125f9 100644 --- a/src/pages/overview/pdf-services-api/dcserviceslicensing.md +++ b/src/pages/overview/pdf-services-api/dcserviceslicensing.md @@ -22,21 +22,23 @@ will be counted with the following page limits: Document Transactions for the Operations below resulting in a Document -will be counted with the following page limits:
  • Extract PDF Operation: up to 5 pages per Document Transaction
  • All other Operations: up to 50 pages per Document Transaction
+will be counted with the following page limits:
  • Extract PDF and PDF To Markdown Operations: 1 Document Transaction for up to 5 pages
  • Accessibility Auto-Tag Operation: 10 Document Transactions per page
  • Electronic Seal Operation: 10 Document Transactions per PDF
  • All other Operations: 1 Document Transaction for up to 50 pages
### Metrics -| Operation | Capability | Metric | -| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------ | -| Document Generation | Create PDF and Word documents from your own authored Microsoft Word templates and JSON data | Document Transaction | -| Extract PDF | Extracts PDF Elements such as text, images, tables in a structured format from a PDF. | Document Transaction (up to 5 pages) | +| Operation | Capability | Metric | +| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------ | +| Accessibility Auto-Tag | Tag the content in a PDF to improve accessibility | 10 Document Transactions (per page) | +| Electronic Seal | Applies an organizational seal to a PDF using a digital certificate | 10 Document Transactions (per PDF) | +| Document Generation | Create PDF and Word documents from your own authored Microsoft Word templates and JSON data | Document Transaction | +| Extract and PDF To Markdown | Extracts PDF Elements such as text, images, tables in a structured format from a PDF. PDF To Markdown converts PDF to LLM-friendly Markdown. | Document Transaction (per 5 pages) | | Create | Create PDF from static/dynamic HTML, HTML(with inline CSS), HTML(specified via URL), MS Office and other supported file types. File types: BMP, DOC, DOCX, GIF, JPEG, JPG, PNG, PPT, PPTX, RTF, TIF, TIFF, TXT, XLS, XLSX, ZIP | Document Transaction | | Export | Convert PDF to MS Office and other supported file types. 
File types: DOC, DOCX, JPEG, PNG, PPTX, RTF, XLSX | Document Transaction | | Combine | Combine multiple PDFs or pages in PDFs to a single PDF | Document Transaction | | OCR | Convert scanned PDF to editable and searchable PDF | Document Transaction | | Compress | Reduce the size of a PDF | Document Transaction | -| Protect | Set user password in a PDF to prevent others from opening and viewing the Document | Document Transaction | -| Remove Password | Remove password security from a PDF document. This can only be accomplished using the owner password of the document, which must be passed in the operation. | Document Transaction | +| Protect | Set user password in a PDF to prevent others from opening and viewing the Document | Document Transaction: the 50-page count does not apply | +| Remove Password | Remove password security from a PDF document. This can only be accomplished using the owner password of the document, which must be passed in the operation. | Document Transaction: the 50-page count does not apply | | Linearize | Optimize a PDF for [Fast Web View](https://helpx.adobe.com/acrobat/using/optimizing-pdfs-acrobat-pro.html#EnableFastWebViewinaPDF) | Document Transaction | | Split | Split a PDF document into multiple smaller PDFs | Document Transaction | | Insert Pages | Insert one or more pages from different PDFs into a PDF | Document Transaction | @@ -51,13 +53,15 @@ will be counted with the following page limits:
  • Extract PDF Operation: up The number of documents used to calculate Document Transactions is -rounded up on a 5-page basis for Extract PDF and on a 50-page basis +rounded up on a 5-page basis for Extract PDF and PDF To Markdown and on a 50-page basis for other PDF Tools. | Operation | Example | Calculation | | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | Document Generation | Generating a fifteen-page PDF Document from a two-page Microsoft Word template and 1MB JSON file consumes one Document Transaction | 1 (Operation: Document Generation) x 1 (Document) = 1 Document Transaction | -| Extract PDF | Extracting PDF Elements from an 8 page PDF Document consumes two Document Transactions | 1 (Operation: Extract PDF) x 2 (5 page document) = 2 Document Transactions | +| Accessibility Auto-Tag | Auto-tagging a five-page PDF consumes 50 Document Transactions. | 1 (Operation: Auto-Tag) x 5 (pages) x 10 (Document Transactions per page) = 50 Document Transactions | +| Electronic Seal | Sealing one PDF Document consumes 10 Document Transactions. | 1 (Operation: Electronic Seal) x 1 (document) x 10 (Document Transactions per PDF) = 10 Document Transactions | +| Extract PDF and PDF To Markdown | Extracting PDF Elements or converting to Markdown from an 8-page PDF Document consumes two Document Transactions | 1 (Operation: Extract PDF or PDF To Markdown) x 2 (5-page document) = 2 Document Transactions | | Create, Export | Converting ten single page Word Documents into ten PDF Documents consumes ten Document Transactions.
| 1 (Operation: create) x 10 (documents) = 10 Document Transactions | | Combine | Combining six different PDF Documents into a single 250 page PDF Document consumes five Document Transactions. | 1 (Operation: combine) x 5 (50 page documents) = 5 Document Transactions | | OCR | Running OCR on a single 50 page document consumes one Document Transaction | 1 (Operation: OCR) x 1 (50 page document) = 1 Document Transaction | @@ -76,15 +80,17 @@ There are several usage limits that apply to operation metrics. Files submitted for processing that exceed content limits below will fail and result in an error message. -| Usage limit | Value | -| --------------------------------------------------------------- | ----------- | -| Document limit (combine, insert, replace, split) | 20 | -| Document file size | 100MB | -| Output images per Document Transaction (export) | 50 | -| Page limit (OCR)\* | 100 | -| Page limit(Non-Scanned - Extract API)\* | 400 | -| JSON file size (Document Generation and HTML to PDF) | 10MB | -| Maximum Requests Per Minute | 25 RPM | +| Usage limit | Value | +| --------------------------------------------------------------------------- | ----------- | +| Document limit (combine, insert, replace, split) | 20 | +| Document file size (for all documents) | 100MB | +| Output images per Document Transaction (export) | 50 | +| Page limit (Extract and PDF To Markdown)\* | 400 | +| Page limit (Accessibility Auto-Tag)\* | 200 | +| Page limit (Scanned - Extract, PDF To Markdown and Accessibility Auto-Tag)\* | 150 | +| Page limit (OCR)\* | 100 | +| JSON file size (Document Generation and HTML to PDF) | 10MB | +| Maximum Requests Per Minute | 25 RPM | **Page limits may be lower for documents with a large number of tables.* diff --git a/src/pages/overview/pdf-services-api/howtos/pdf-to-markdown-api.md b/src/pages/overview/pdf-services-api/howtos/pdf-to-markdown-api.md new file mode 100644 index 000000000..ec81a8ddc --- /dev/null +++ 
b/src/pages/overview/pdf-services-api/howtos/pdf-to-markdown-api.md @@ -0,0 +1,118 @@ +--- +title: PDF to Markdown | Adobe PDF Services +description: Learn about the PDF to Markdown service that converts PDF documents into well-formatted LLM-friendly Markdown text. +--- + +# PDF to Markdown + +The PDF to Markdown API (included with the PDF Services API) is a cloud-based web service that automatically converts PDF documents – native or scanned – into well-formatted LLM-friendly Markdown text. This service preserves the document's structure and formatting while converting it into a format that's widely used for LLM flows, content authoring and documentation. + +## Structured Information Output Format + +The output of a PDF to Markdown operation includes: + +- A primary `.md` file containing the converted Markdown content + +### Output Structure + +The following is a summary of key elements in the converted Markdown: + +#### Elements + +Ordered list of semantic elements converted from the PDF document, preserving the natural reading order and document structure. 
The conversion handles: + +- Text content with proper Markdown syntax +- Document hierarchy and structure +- Inline formatting and emphasis +- Links and references +- Images and figures +- Tables and complex layouts + +#### Content Types + +The API processes various content types as follows: + +##### Text Elements + +- **Headings**: Converted to appropriate Markdown heading levels (H1-H6) +- **Paragraphs**: Preserved with proper spacing and formatting +- **Lists**: Both ordered and unordered lists with proper nesting +- **Text Emphasis**: Bold, italic, and other text formatting +- **Links**: Preserved with proper Markdown link syntax + +##### Images and Figures + +- Provided as base64-embedded images in the Markdown output +- Referenced correctly in the Markdown output +- Original quality preserved +- Proper alt-text and captions maintained + +##### Tables + +- Converted to Markdown table syntax +- Column alignment preserved +- Cell content formatting maintained +- Complex table structures supported + +#### Element Types and Paths + +The API recognizes and converts the following structural elements: + +| Category | Element Type | Description | +| --------- | ----------------- | --------------------------------------------------------- | +| Aside | Aside | Content that is not part of the regular content flow | +| Figure | Figure | Non-reflowable constructs such as graphs, images, and flowcharts | +| Footnote | Footnote | Footnote | +| Headings | H, H1, H2, etc | Heading levels | +| List | L, Li, Lbl, Lbody | List and list item elements | +| Paragraph | P, ParagraphSpan | Paragraphs and paragraph segments | +| Reference | Reference | Links | +| Section | Sect | Logical section of the document | +| StyleSpan | StyleSpan | Styling variations within text | +| Table | Table, TD, TH, TR | Table elements | +| Title | Title | Document title | + +### Reading Order + +The reading order in the output Markdown maintains: + +- Natural document flow +- Proper content hierarchy +- 
Column-based layouts +- Page transitions +- Inline elements and references + +## Use Cases + +The PDF to Markdown API is particularly valuable for: + +- **LLM and RAG ingestion**: Convert PDFs to Markdown for chunking, embeddings, and retrieval-augmented generation (RAG). +- **Prompt and context packaging**: Produce Markdown that is easy to paste, structure, and cite in prompts and agent workflows. +- **Training data preparation**: Create LLM fine-tuning datasets from PDF content after review, cleanup, and labeling. +- **Doc-as-code workflows**: Bring PDF content into Git-based review, versioning, diffing, and static-site generators. +- **Knowledge base publishing**: Migrate PDFs into documentation platforms and internal wikis as clean, editable Markdown. +- **Legacy and archive modernization**: Convert historical PDFs so they become searchable, editable, and maintainable. +- **Automated document processing**: Standardize PDF-to-text conversion inside ETL and document-processing pipelines. +- **Enterprise search and indexing**: Feed converted Markdown into internal search systems and knowledge repositories. +- **Compliance and audit readiness**: Make PDF policies, SOPs, and manuals searchable and easier to review for changes. +- **Content QA and change tracking**: Compare converted Markdown across document versions to detect updates and regressions. +- **Analytics and classification**: Use Markdown output for topic modeling, tagging, deduplication, and routing workflows. +- **Localization workflows**: Convert to Markdown as a starting point for translation and multi-language documentation. + +## API Limitations + +For File Constraints and Processing Limits, see [Licensing and Usage Limits](../dcserviceslicensing/). 
+ +### Document Requirements + +- Files must be unprotected or allow content copying +- No support for: + - Hidden objects (JavaScript, OCG) + - XFA and fillable forms + - Complex annotations + - CAD drawings or vector art + - Password-protected content + +## REST API + +See our public API Reference for [PDF to Markdown API](../../../apis/#tag/PDF-To-Markdown). diff --git a/src/pages/resources/openapi.json b/src/pages/resources/openapi.json index 0ec0dca16..992158275 100644 --- a/src/pages/resources/openapi.json +++ b/src/pages/resources/openapi.json @@ -38,6 +38,10 @@ "name": "Extract PDF", "description": "Extract content from PDF documents and output it in a structured JSON format, along with tables and figures" }, + { + "name": "PDF To Markdown", + "description": "Extract content from PDF documents and output it as well-formatted, LLM-friendly Markdown text, along with tables and figures" + }, { "name": "Html to PDF", "description": "Convert HTML Resources to a PDF File" @@ -130,6 +134,7 @@ "Export PDF", "Export PDF Form Data", "Extract PDF", + "PDF To Markdown", "Html to PDF", "Import PDF Form Data", "Linearize PDF", @@ -140,6 +145,7 @@ "PDF Electronic Seal", "PDF Properties", "PDF To Images", + "PDF To Markdown", "PDF Watermark", "Protect PDF", "Remove Protection", @@ -816,16 +822,364 @@ ], "responses": { "200": { - "description": "The request was successful.", + "description": "The request was successful.", + "headers": { + "Content-Type": { + "description": "The media type of the response body, which is application/json.", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique ID to identify each individual request.", + "schema": { + "type": "string" + } + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AssetMetadata" + } + } + } + }, + "400": { + "description": "Bad Request.
The request was invalid or cannot be otherwise served.", + "headers": { + "content-type": { + "description": "The content type of the POST API call response", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique value to identify this request. If x-request-id is specified in the POST request, this value will be the same as the input.", + "schema": { + "type": "string" + } + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/400" + } + } + } + }, + "401": { + "description": "Unauthorized.", + "headers": { + "content-type": { + "description": "The content type of the error JSON response.", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique value to identify this request. If x-request-id is specified in the POST request, this value will be the same as the input.", + "schema": { + "type": "string" + } + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/401" + } + } + } + }, + "429": { + "description": "Caller doesn't have sufficient quota for this operation.", + "headers": { + "content-type": { + "description": "The content type of the POST API call response", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique value to identify this request. If x-request-id is specified in the POST request, this value will be the same as the input.", + "schema": { + "type": "string" + } + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Asset429" + } + } + } + }, + "500": { + "description": "Internal Server Error. The server has encountered an error and is unable to process your request at this time.", + "headers": { + "content-type": { + "description": "The content type of the POST API call response", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique value to identify this request. 
If x-request-id is specified in the POST request, this value will be the same as the input.", + "schema": { + "type": "string" + } + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/500" + } + } + } + } + } + } + }, + "/operation/documentgeneration": { + "post": { + "tags": [ + "Document Generation" + ], + "summary": "Merge Word based templates with input JSON data to create Word and PDF documents", + "description": "Merges the input JSON data with Word based templates to create dynamic documents. To learn more about document generation and document templates, please see the documentation.", + "operationId": "pdfoperations.documentgeneration", + "parameters": [ + { + "name": "Authorization", + "in": "header", + "description": "Bearer + Token (Learn more about getting the access token)", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "x-api-key", + "in": "header", + "description": "The clientId from the generated credentials", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "description": "Params for Document Generation Operation. Refer to the External Section below for using external storage with Document Generation API", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/DocumentGenerationInternal" + }, + { + "$ref": "#/components/schemas/DocumentGenerationExternal" + } + ] + } + } + }, + "required": true + }, + "responses": { + "201": { + "description": "Request creation for the operation and status uri generated, which can be found in the 'location' header.", + "headers": { + "location": { + "description": "Job status URI for polling the results", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique value to identify this request. 
If x-request-id is specified in the POST request, this value will be the same as the input.", + "schema": { + "type": "string" + } + } + } + }, + "400": { + "description": "Bad Request. The request was invalid or cannot be otherwise served.", + "headers": { + "content-type": { + "description": "The content type of the POST API call response", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique value to identify this request. If x-request-id is specified in the POST request, this value will be the same as the input.", + "schema": { + "type": "string" + } + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/400" + } + } + } + }, + "404": { + "description": "Resource Not Found.", + "headers": { + "content-type": { + "description": "The content type of the POST API call response", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique value to identify this request. If x-request-id is specified in the POST request, this value will be the same as the input.", + "schema": { + "type": "string" + } + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/404" + } + } + } + }, + "401": { + "description": "Unauthorized.", + "headers": { + "content-type": { + "description": "The content type of the error JSON response.", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique value to identify this request. 
If x-request-id is specified in the POST request, this value will be the same as the input.", + "schema": { + "type": "string" + } + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/401" + } + } + } + }, + "429": { + "description": "Caller doesn't have sufficient quota for this operation.", + "headers": { + "content-type": { + "description": "The content type of the POST API call response", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique value to identify this request. If x-request-id is specified in the POST request, this value will be the same as the input.", + "schema": { + "type": "string" + } + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/429" + } + } + } + }, + "500": { + "description": "Internal Server Error. The server has encountered an error and is unable to process your request at this time.", + "headers": { + "content-type": { + "description": "The content type of the POST API call response", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique value to identify this request. 
If x-request-id is specified in the POST request, this value will be the same as the input.", + "schema": { + "type": "string" + } + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/500" + } + } + } + } + }, + "x-codegen-request-body-name": "body" + } + }, + "/operation/documentgeneration/{jobID}/status": { + "get": { + "tags": [ + "Document Generation" + ], + "summary": "Poll the document generation job for completion", + "operationId": "pdfoperations.documentgeneration.jobstatus", + "parameters": [ + { + "name": "jobID", + "in": "path", + "description": "Job ID of the request", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "Authorization", + "in": "header", + "description": "Bearer + Token (Learn more about getting the access token)", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "x-api-key", + "in": "header", + "description": "The clientId from the generated credentials", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "A response with job information.", "headers": { - "Content-Type": { - "description": "The media type of the response body, which is application/json.", + "content-type": { + "description": "The content type of the status call response.", "schema": { "type": "string" } }, "x-request-id": { - "description": "A unique ID to identify each individual request.", + "description": "A unique value to identify this request. If x-request-id is specified in the POST request, this value will be the same as the input.", "schema": { "type": "string" } @@ -834,13 +1188,37 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/AssetMetadata" + "$ref": "#/components/schemas/DocumentGenerationJobStatus" } } } }, "400": { "description": "Bad Request. 
The request was invalid or cannot be otherwise served.", + "headers": { + "content-type": { + "description": "The content type of the status call response", + "schema": { + "type": "string" + } + }, + "x-request-id": { + "description": "A unique value to identify this request. If x-request-id is specified in the POST request, this value will be the same as the input.", + "schema": { + "type": "string" + } + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Status400" + } + } + } + }, + "404": { + "description": "Resource Not Found.", "headers": { "content-type": { "description": "The content type of the POST API call response", @@ -858,7 +1236,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/400" + "$ref": "#/components/schemas/Satus404" } } } @@ -867,7 +1245,13 @@ "description": "Unauthorized.", "headers": { "content-type": { - "description": "The content type of the error JSON response.", + "description": "The content type of the POST API call response", + "schema": { + "type": "string" + } + }, + "x-server": { + "description": "The name and version of the server.", "schema": { "type": "string" } @@ -882,7 +1266,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/401" + "$ref": "#/components/schemas/Satus401" } } } @@ -906,7 +1290,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Asset429" + "$ref": "#/components/schemas/Satus429" } } } @@ -930,7 +1314,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/500" + "$ref": "#/components/schemas/Satus500" } } } @@ -938,14 +1322,14 @@ } } }, - "/operation/documentgeneration": { + "/operation/pdftomarkdown": { "post": { "tags": [ - "Document Generation" + "PDF To Markdown" ], - "summary": "Merge Word based templates with input JSON data to create Word and PDF documents", - "description": "Merges the input JSON data with Word based templates to 
create dynamic documents. To learn more about document generation and document templates, please see the documentation.", - "operationId": "pdfoperations.documentgeneration", + "summary": "Extract content from PDF documents and output it in a markdown format, along with tables and figures", + "description": "Extract PDF Content, Tables content and Tables/Figures renditions from a PDF document. Various available options are: \n\n1. Extract figures or images in base64 format", + "operationId": "pdfoperations.pdftomarkdown", "parameters": [ { "name": "Authorization", @@ -967,16 +1351,16 @@ } ], "requestBody": { - "description": "Params for Document Generation Operation. Refer to the External Section below for using external storage with Document Generation API", + "description": "Params for PDF to Markdown Operation. Refer to the External Section below for using external storage with PDF to Markdown Operation.", "content": { "application/json": { "schema": { "oneOf": [ { - "$ref": "#/components/schemas/DocumentGenerationInternal" + "$ref": "#/components/schemas/PDFToMarkdownInternal" }, { - "$ref": "#/components/schemas/DocumentGenerationExternal" + "$ref": "#/components/schemas/PDFToMarkdownExternal" } ] } @@ -1059,6 +1443,12 @@ "type": "string" } }, + "x-server": { + "description": "The name and version of the server.", + "schema": { + "type": "string" + } + }, "x-request-id": { "description": "A unique value to identify this request. 
If x-request-id is specified in the POST request, this value will be the same as the input.", "schema": { @@ -1126,13 +1516,13 @@ "x-codegen-request-body-name": "body" } }, - "/operation/documentgeneration/{jobID}/status": { + "/operation/pdftomarkdown/{jobID}/status": { "get": { "tags": [ - "Document Generation" + "PDF To Markdown" ], - "summary": "Poll the document generation job for completion", - "operationId": "pdfoperations.documentgeneration.jobstatus", + "summary": "Poll the PDF to markdown job for completion", + "operationId": "pdfoperations.pdftomarkdown.jobstatus", "parameters": [ { "name": "jobID", @@ -1182,7 +1572,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/DocumentGenerationJobStatus" + "$ref": "#/components/schemas/PDFToMarkdownJobStatus" } } } @@ -8897,6 +9287,101 @@ }, "components": { "schemas": { + "PDFToMarkdownInternal": { + "title": "Internal", + "description": "Params for PDF to Markdown Operation", + "type": "object", + "required": [ + "assetID" + ], + "properties": { + "assetID": { + "description": "A file assetID. 
For more details click here .", + "type": "string" + }, + "getFigures": { + "description": "Extract figures or images in base64 format", + "type": "boolean", + "default": "false" + }, + "notifiers": { + "$ref": "#/components/schemas/notifiers" + } + }, + "example": { + "assetID": "urn:aaid:AS:UE1:23c30ee0-2c4d-46d6-87f2-087832fca718", + "getFigures": false, + "notifiers": [ + { + "type": "CALLBACK", + "data": { + "url": "https://dummy.callback.org/", + "headers": { + "x-api-key": "dummykey", + "access-token": "dummytoken" + } + } + } + ] + }, + "additionalProperties": false + }, + "PDFToMarkdownExternal": { + "title": "External", + "description": "Params for PDF to Markdown Operation using external storage.", + "type": "object", + "required": [ + "input" + ], + "properties": { + "input": { + "$ref": "#/components/schemas/ExternalStorageInput" + }, + "output": { + "$ref": "#/components/schemas/ExternalStorageOutput" + }, + "params": { + "description": "Params for PDF to Markdown processing", + "type": "object", + "properties": { + "getFigures": { + "description": "Extract figures or images in base64 format", + "type": "boolean", + "default": "false" + } + } + }, + "notifiers": { + "$ref": "#/components/schemas/notifiers" + } + }, + "example": { + "input": { + "uri": 
"https://dcplatformstorageservice-dev-us-east-1.s3-accelerate.amazonaws.com/dc-platformService-automation_dc-platformService-automation%40AdobeID/1c4f4674-ce8d-4b21-a69d-60aeae35bf43?X-Amz-Security-Token=FwoGZXIvYXdzEBkaDK%2By2wxl94khIbkxzCLTAQn6n6Wo0vFSul%2FpXW66aFX4T%2BPxtuOy%2Bz8eTxrnexeJRvMreBHNQm1myLwp20MkE%2Bb0H%2BwYgOhFaepi9AMml1aLNxXn1UPnEWJ7y8llhvsrXHimEfWvb3TMAkZddgUIDBue8oGUYqm4f2s0sMvPWBCxI45zM0%2F37EK%2B4JnIo1SlrKNm0GSZ44AEiOAhXupQ8ih6KoUbUciD3Biile6CwTMVIhME3mJiRSgVK6W91EaDn8%2Ba3mU%2BVvU1K9sgDSPZ%2F81DOpj25pvMW%2B1cMuCtUNsu9KUo7dHvpAYyLYiy%2FPGEmO9EquKjfMPRr17PAjeunD1QdgbRss4ysG%2B6XF2Has8zsGqX1sQalA%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20230628T081557Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3599&X-Amz-Credential=ASIAU5PA7W47IMX73XEA%2F20230628%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=ac6998566dbbde22509b128fe94d1cb5d3146cd3fb8ba78d7068e10d61302ec2", + "storage": "S3" + }, + "output": { + "uri": "https://dcplatformstorageservice-dev-us-east-1.s3-accelerate.amazonaws.com/dc-platformService-automation_dc-platformService-automation%40AdobeID/f02f9927-4971-4589-8fdf-41ff56c2d520?X-Amz-Security-Token=FwoGZXIvYXdzEBkaDK%2By2wxl94khIbkxzCLTAQn6n6Wo0vFSul%2FpXW66aFX4T%2BPxtuOy%2Bz8eTxrnexeJRvMreBHNQm1myLwp20MkE%2Bb0H%2BwYgOhFaepi9AMml1aLNxXn1UPnEWJ7y8llhvsrXHimEfWvb3TMAkZddgUIDBue8oGUYqm4f2s0sMvPWBCxI45zM0%2F37EK%2B4JnIo1SlrKNm0GSZ44AEiOAhXupQ8ih6KoUbUciD3Biile6CwTMVIhME3mJiRSgVK6W91EaDn8%2Ba3mU%2BVvU1K9sgDSPZ%2F81DOpj25pvMW%2B1cMuCtUNsu9KUo7dHvpAYyLYiy%2FPGEmO9EquKjfMPRr17PAjeunD1QdgbRss4ysG%2B6XF2Has8zsGqX1sQalA%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20230628T081559Z&X-Amz-SignedHeaders=content-type%3Bhost&X-Amz-Expires=3600&X-Amz-Credential=ASIAU5PA7W47IMX73XEA%2F20230628%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=4f765277eb6e36bd5f7bf9d42be24244440092b5a705eadf178c78a9a9fb5d71", + "storage": "S3" + }, + "params": { + "getFigures": true + }, + "notifiers": [ + { + "type": "CALLBACK", + "data": { + "url": 
"https://dummy.callback.org/", + "headers": { + "x-api-key": "dummykey", + "access-token": "dummytoken" + } + } + } + ] + }, + "additionalProperties": false + }, "Token": { "description": "Params for generating the Access Token.", "type": "object", @@ -13005,6 +13490,19 @@ } ] }, + "PDFToMarkdownJobStatus": { + "oneOf": [ + { + "$ref": "#/components/schemas/inprogress" + }, + { + "$ref": "#/components/schemas/done" + }, + { + "$ref": "#/components/schemas/failed" + } + ] + }, "HtmlToPdfJobStatus": { "oneOf": [ { @@ -14098,6 +14596,17 @@ } ] }, + "PDFToMarkdownDone": { + "title": "done", + "oneOf": [ + { + "$ref": "#/components/schemas/PDFToMarkdownDoneInternal" + }, + { + "$ref": "#/components/schemas/PDFToMarkdownDoneExternal" + } + ] + }, "ExtractPDFDoneInternal": { "title": "Internal", "description": "Response in case of 'done' status", @@ -14193,17 +14702,101 @@ } } }, - "ExtractPDFDoneExternal": { + "PDFToMarkdownDoneInternal": { + "title": "Internal", + "description": "Response in case of 'done' status", + "type": "object", + "properties": { + "status": { + "description": "Job Status", + "type": "string", + "enum": [ + "done" + ] + }, + "asset": { + "type": "object", + "description": "Asset of markdown file containing converted content of pdf file. For more details click here .", + "properties": { + "metadata": { + "type": "object", + "description": "metadata details of output asset.", + "properties": { + "size": { + "description": "The size of the Resource in bytes. 
This value helps in making range requests.", + "type": "integer" + }, + "type": { + "description": "The media type of the Resource.", + "type": "string", + "enum": [ + "text/markdown" + ] + } + } + }, + "assetID": { + "description": "An asset ID identifying an asset that is globally unique and never reused.", + "type": "string" + }, + "downloadUri": { + "description": "The URL used to download the Resource directly to the cloud provider.", + "type": "string" + } + } + } + }, + "example": { + "status": "done", + "asset": { + "metadata": { + "type": "text/markdown", + "size": 200791 + }, + "downloadUri": "https://dcplatformstorageservice-dev-us-east-1.s3-accelerate.amazonaws.com/6bb12fd8-3233-4340-916b-4835917857be?X-Amz-Security-Token=FwoGZXIvYXdzEE0aDOLg7PFwVB1bjEMxQCLTAe3pjf%2Fgl2Pj%2FcjOaY%2BHfduju9SXTp1Pn8C4GapIXm%2F8tuR4cGPYGC0goU21qZxCq9R%2F8z2bOmB2EL%2BZrhbPLbaNcpzf5Vud%2B3Bmn61MEJiBdU%2BhZqasX5YhVxdnzhfpl5KfKeq2kwROVMqJcyHGdxw5h0%2Bi0sD2I8sqkbPmnBi0WOtYNwz7TQq42oe8W5KYHpq6WMya9OQgx0u7qg0inYwBnQu5UQ9NJJQY2MSU11IuZ0uE%2B%2FNAPuq3VfEIn3txK%2FFfIxGz9%2BInehhHroKjBFULzy0olY2rlwYyLQDwHHr2eyuy%2BaoGVaq%2BSUNH8T0OKIicbbZfQ5wjF0hK2FzVXwfebtDG4qq%2BiA%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220803T190422Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=ASIAU5PA7W47AH3PA2JV%2F20220803%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=5d8c3f421c68c009b4471919a319ba460495c72afe51674d23266bc124fe9a56", + "assetID": "urn:aaid:AS:UE1:23c30ee0-2c4d-46d6-87f2-087832fca718" + } + } + }, + "PDFToMarkdownDoneExternal": { "title": "External", "oneOf": [ { - "$ref": "#/components/schemas/ExtractPDFDoneExternalWithoutOutput" + "$ref": "#/components/schemas/PDFToMarkdownDoneExternalWithoutOutput" }, { "$ref": "#/components/schemas/external" } ] }, + "PDFToMarkdownDoneExternalWithoutOutput": { + "title": "External without an output URI in the request", + "description": "Response in case of 'done' status when output URI is not passed in the request", + "type":
"object", + "properties": { + "status": { + "description": "Job Status", + "type": "string", + "enum": [ + "done" + ] + }, + "asset": { + "$ref": "#/components/schemas/responseAsset" + } + }, + "example": { + "status": "done", + "asset": { + "metadata": { + "type": "text/markdown", + "size": 200791 + }, + "downloadUri": "https://dcplatformstorageservice-dev-us-east-1.s3-accelerate.amazonaws.com/6bb12fd8-3233-4340-916b-4835917857be?X-Amz-Security-Token=FwoGZXIvYXdzEE0aDOLg7PFwVB1bjEMxQCLTAe3pjf%2Fgl2Pj%2FcjOaY%2BHfduju9SXTp1Pn8C4GapIXm%2F8tuR4cGPYGC0goU21qZxCq9R%2F8z2bOmB2EL%2BZrhbPLbaNcpzf5Vud%2B3Bmn61MEJiBdU%2BhZqasX5YhVxdnzhfpl5KfKeq2kwROVMqJcyHGdxw5h0%2Bi0sD2I8sqkbPmnBi0WOtYNwz7TQq42oe8W5KYHpq6WMya9OQgx0u7qg0inYwBnQu5UQ9NJJQY2MSU11IuZ0uE%2B%2FNAPuq3VfEIn3txK%2FFfIxGz9%2BInehhHroKjBFULzy0olY2rlwYyLQDwHHr2eyuy%2BaoGVaq%2BSUNH8T0OKIicbbZfQ5wjF0hK2FzVXwfebtDG4qq%2BiA%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220803T190422Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=ASIAU5PA7W47AH3PA2JV%2F20220803%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=5d8c3f421c68c009b4471919a319ba460495c72afe51674d23266bc124fe9a56", + "assetID": "urn:aaid:AS:UE1:23c30ee0-2c4d-46d6-87f2-087832fca718" + } + } + }, "ExtractPDFDoneExternalWithoutOutput" : { "title": "External without an output URI in the request", "description": "Response in case of 'done' status when ouput uri is not passed in the rquest",