diff --git a/gatsby-config.js b/gatsby-config.js index dfed96822..9d6160b69 100644 --- a/gatsby-config.js +++ b/gatsby-config.js @@ -394,6 +394,20 @@ module.exports = { path: 'overview/pdf-extract-api/quickstarts/extract-pdf/python/index.md' } ] + }, + { + title: 'PDF to Markdown', + path: 'overview/pdf-extract-api/quickstarts/pdf-to-markdown/index.md', + pages: [ + { + title:'.NET', + path: 'overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/index.md' + }, + { + title:'Python', + path: 'overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/index.md' + } + ] } ] }, diff --git a/src/pages/apis/index.md b/src/pages/apis/index.md index 76ea35820..99cf0a16b 100644 --- a/src/pages/apis/index.md +++ b/src/pages/apis/index.md @@ -1,6 +1,6 @@ --- title: Adobe PDF Services Open API spec description: The OpenAPI spec for Adobe PDF Services API endpoints, parameters, and responses. -openAPISpec: https://raw.githubusercontent.com/AdobeDocs/pdfservices-api-documentation/main/src/pages/resources/openapi.json +openAPISpec: https://raw.githubusercontent.com/AdobeDocs/pdfservices-api-documentation/supratims/python-dotnet-sdk-release/src/pages/resources/openapi.json --- -[] diff --git a/src/pages/overview/pdf-extract-api/howtos/extract-api.md b/src/pages/overview/pdf-extract-api/howtos/extract-api.md index 90fb582fe..59e904ebe 100644 --- a/src/pages/overview/pdf-extract-api/howtos/extract-api.md +++ b/src/pages/overview/pdf-extract-api/howtos/extract-api.md @@ -2330,3 +2330,392 @@ curl --location --request POST 'https://pdf-services.adobe.io/operation/extractp "includeStyling": true }' ``` + +## Extract Text and Tables and Header-Footer Information + +The sample below adds an option to get header-footer information from given PDFs. + +Please refer the [API usage guide](./api-usage.md) to understand how to use our APIs. 
+ + + +#### .NET + +```javascript +// Get the samples from https://github.com/adobe/PDFServices.NET.SDK.Samples +// Run the sample: +// cd ExtractPDFWithIncludeHeaderFooter/ +// dotnet run ExtractPDFWithIncludeHeaderFooter.csproj +namespace ExtractPDFWithIncludeHeaderFooter +{ + class Program + { + private static readonly ILog log = LogManager.GetLogger(typeof(Program)); + + static void Main() + { + // Configure the logging. + ConfigureLogging(); + try + { + // Initial setup, create credentials instance + ICredentials credentials = new ServicePrincipalCredentials( + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_ID"), + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_SECRET")); + + // Creates a PDF Services instance + PDFServices pdfServices = new PDFServices(credentials); + + // Creates an asset from source file and upload + using Stream inputStream = File.OpenRead(@"extractPDFInput.pdf"); + IAsset asset = pdfServices.Upload(inputStream, PDFServicesMediaType.PDF.GetMIMETypeValue()); + + // Create parameters for the job + ExtractPDFParams extractPDFParams = ExtractPDFParams.ExtractPDFParamsBuilder() + .AddElementsToExtract(new List<ExtractElementType>(new[] + { ExtractElementType.TEXT, ExtractElementType.TABLES })) + .AddIncludeHeaderFooter(true) // Enable header and footer extraction + .Build(); + + // Creates a new job instance + ExtractPDFJob extractPDFJob = new ExtractPDFJob(asset).SetParams(extractPDFParams); + + // Submits the job and gets the job result + String location = pdfServices.Submit(extractPDFJob); + PDFServicesResponse<ExtractPDFResult> pdfServicesResponse = + pdfServices.GetJobResult<ExtractPDFResult>(location, typeof(ExtractPDFResult)); + + // Get content from the resulting asset(s) + IAsset resultAsset = pdfServicesResponse.Result.Resource; + StreamAsset streamAsset = pdfServices.GetContent(resultAsset); + + // Creating output streams and copying stream asset's content to it + String outputFilePath = CreateOutputFilePath(); + new FileInfo(Directory.GetCurrentDirectory() +
outputFilePath).Directory.Create(); + Stream outputStream = File.OpenWrite(Directory.GetCurrentDirectory() + outputFilePath); + streamAsset.Stream.CopyTo(outputStream); + outputStream.Close(); + + Console.WriteLine("Successfully extracted PDF content with header and footer information!"); + Console.WriteLine("Output saved to: " + Directory.GetCurrentDirectory() + outputFilePath); + } + catch (ServiceUsageException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (ServiceApiException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (SDKException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (IOException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (Exception ex) + { + log.Error("Exception encountered while executing operation", ex); + } + } + + static void ConfigureLogging() + { + ILoggerRepository logRepository = LogManager.GetRepository(Assembly.GetEntryAssembly()); + XmlConfigurator.Configure(logRepository, new FileInfo("log4net.config")); + } + + // Generates a string containing a directory structure and file name for the output file. 
+ private static String CreateOutputFilePath() + { + String timeStamp = DateTime.Now.ToString("yyyy'-'MM'-'dd'T'HH'-'mm'-'ss"); + return ("/output/extract" + timeStamp + ".zip"); + } + } +} +``` + +#### Python + +```python +# Get the samples from https://github.com/adobe/pdfservices-python-sdk-samples +# Run the sample: +# python src/extractpdf/extract_text_table_with_header_footer_from_pdf.py + +# Initialize the logger +logging.basicConfig(level=logging.INFO) + +class ExtractTextTableWithHeaderFooterFromPDF: + def __init__(self): + try: + file = open('src/resources/extractPdfInput.pdf', 'rb') + input_stream = file.read() + file.close() + + # Initial setup, create credentials instance + credentials = ServicePrincipalCredentials( + client_id=os.getenv('PDF_SERVICES_CLIENT_ID'), + client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET') + ) + + # Creates a PDF Services instance + pdf_services = PDFServices(credentials=credentials) + + # Creates an asset(s) from source file(s) and upload + input_asset = pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF) + + # Create parameters for the job with header/footer extraction enabled + extract_pdf_params = ExtractPDFParams( + elements_to_extract=[ExtractElementType.TEXT, ExtractElementType.TABLES], + include_header_footer=True # Extract header and footer information from PDF + ) + + # Creates a new job instance + extract_pdf_job = ExtractPDFJob(input_asset=input_asset, extract_pdf_params=extract_pdf_params) + + # Submit the job and gets the job result + location = pdf_services.submit(extract_pdf_job) + pdf_services_response = pdf_services.get_job_result(location, ExtractPDFResult) + + # Get content from the resulting asset(s) + result_asset: CloudAsset = pdf_services_response.get_result().get_resource() + stream_asset: StreamAsset = pdf_services.get_content(result_asset) + + # Creates an output stream and copy stream asset's content to it + output_file_path = self.create_output_file_path() + with 
open(output_file_path, "wb") as file: + file.write(stream_asset.get_input_stream()) + + except (ServiceApiException, ServiceUsageException, SdkException) as e: + logging.exception(f'Exception encountered while executing operation: {e}') + + # Generates a string containing a directory structure and file name for the output file + @staticmethod + def create_output_file_path() -> str: + now = datetime.now() + time_stamp = now.strftime("%Y-%m-%dT%H-%M-%S") + os.makedirs("output/ExtractTextTableWithHeaderFooterFromPDF", exist_ok=True) + return f"output/ExtractTextTableWithHeaderFooterFromPDF/extract{time_stamp}.zip" + + +if __name__ == "__main__": + ExtractTextTableWithHeaderFooterFromPDF() +``` + +#### REST API + +```javascript +// Please refer to our REST API docs for more information +// https://developer.adobe.com/document-services/docs/apis/#tag/Extract-PDF + +curl --location --request POST 'https://pdf-services.adobe.io/operation/extractpdf' \ +--header 'x-api-key: {{Placeholder for client_id}}' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer {{Placeholder for token}}' \ +--data-raw '{ + "assetID": "urn:aaid:AS:UE1:23c30ee0-2e4d-46d6-87f2-087832fca718", + "elementsToExtract": [ + "text", + "tables" + ], + "includeHeaderFooter": true +}' +``` + +## Extract Text and Tables and Encapsulated Text from list of elements + +The sample below adds an option to extract encapsulated text content from a list of elements. + +Please refer to the [API usage guide](./api-usage.md) to understand how to use our APIs. + + + + +#### .NET + +```javascript +// Get the samples from https://github.com/adobe/PDFServices.NET.SDK.Samples +// Run the sample: +// cd ExtractPDFWithTagEncapsulatedText/ +// dotnet run ExtractPDFWithTagEncapsulatedText.csproj + +namespace ExtractPDFWithTagEncapsulatedText +{ + class Program + { + private static readonly ILog log = LogManager.GetLogger(typeof(Program)); + + static void Main() + { + // Configure the logging.
+ ConfigureLogging(); + try + { + // Initial setup, create credentials instance + ICredentials credentials = new ServicePrincipalCredentials( + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_ID"), + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_SECRET")); + + // Creates a PDF Services instance + PDFServices pdfServices = new PDFServices(credentials); + + // Creates an asset from source file and upload + using Stream inputStream = File.OpenRead(@"extractPDFInput.pdf"); + IAsset asset = pdfServices.Upload(inputStream, PDFServicesMediaType.PDF.GetMIMETypeValue()); + + // Create parameters for the job + ExtractPDFParams extractPDFParams = ExtractPDFParams.ExtractPDFParamsBuilder() + .AddElementsToExtract(new List<ExtractElementType>(new[] + { ExtractElementType.TEXT, ExtractElementType.TABLES })) + .AddTagEncapsulatedTextType(TagEncapsulatedTextType.FIGURE) // Enable figure tag encapsulation + .Build(); + + // Creates a new job instance + ExtractPDFJob extractPDFJob = new ExtractPDFJob(asset).SetParams(extractPDFParams); + + // Submits the job and gets the job result + String location = pdfServices.Submit(extractPDFJob); + PDFServicesResponse<ExtractPDFResult> pdfServicesResponse = + pdfServices.GetJobResult<ExtractPDFResult>(location, typeof(ExtractPDFResult)); + + // Get content from the resulting asset(s) + IAsset resultAsset = pdfServicesResponse.Result.Resource; + StreamAsset streamAsset = pdfServices.GetContent(resultAsset); + + // Creating output streams and copying stream asset's content to it + String outputFilePath = CreateOutputFilePath(); + new FileInfo(Directory.GetCurrentDirectory() + outputFilePath).Directory.Create(); + Stream outputStream = File.OpenWrite(Directory.GetCurrentDirectory() + outputFilePath); + streamAsset.Stream.CopyTo(outputStream); + outputStream.Close(); + + Console.WriteLine("Successfully extracted PDF content with tag encapsulated text for figures!"); + Console.WriteLine("Output saved to: " + Directory.GetCurrentDirectory() + outputFilePath); + } + catch
(ServiceUsageException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (ServiceApiException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (SDKException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (IOException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (Exception ex) + { + log.Error("Exception encountered while executing operation", ex); + } + } + + static void ConfigureLogging() + { + ILoggerRepository logRepository = LogManager.GetRepository(Assembly.GetEntryAssembly()); + XmlConfigurator.Configure(logRepository, new FileInfo("log4net.config")); + } + + // Generates a string containing a directory structure and file name for the output file. + private static String CreateOutputFilePath() + { + String timeStamp = DateTime.Now.ToString("yyyy'-'MM'-'dd'T'HH'-'mm'-'ss"); + return ("/output/extract" + timeStamp + ".zip"); + } + } +} +``` + +#### Python + +```python +# Get the samples from https://github.com/adobe/pdfservices-python-sdk-samples +# Run the sample: +# python src/extractpdf/extract_text_with_encapsulated_text_from_pdf.py + +# Initialize the logger +logging.basicConfig(level=logging.INFO) + +class ExtractTextWithEncapsulatedTextFromPDF: + def __init__(self): + try: + file = open('src/resources/extractPdfInput.pdf', 'rb') + input_stream = file.read() + file.close() + + # Initial setup, create credentials instance + credentials = ServicePrincipalCredentials( + client_id=os.getenv('PDF_SERVICES_CLIENT_ID'), + client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET') + ) + + # Creates a PDF Services instance + pdf_services = PDFServices(credentials=credentials) + + # Creates an asset(s) from source file(s) and upload + input_asset = pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF) + + # Create parameters for the job with encapsulated text extraction from figures + 
extract_pdf_params = ExtractPDFParams( + elements_to_extract=[ExtractElementType.TEXT, ExtractElementType.TABLES], + tag_encapsulated_text=["Figure"] # Extract encapsulated text content from figures + ) + + # Creates a new job instance + extract_pdf_job = ExtractPDFJob(input_asset=input_asset, extract_pdf_params=extract_pdf_params) + + # Submit the job and gets the job result + location = pdf_services.submit(extract_pdf_job) + pdf_services_response = pdf_services.get_job_result(location, ExtractPDFResult) + + # Get content from the resulting asset(s) + result_asset: CloudAsset = pdf_services_response.get_result().get_resource() + stream_asset: StreamAsset = pdf_services.get_content(result_asset) + + # Creates an output stream and copy stream asset's content to it + output_file_path = self.create_output_file_path() + with open(output_file_path, "wb") as file: + file.write(stream_asset.get_input_stream()) + + except (ServiceApiException, ServiceUsageException, SdkException) as e: + logging.exception(f'Exception encountered while executing operation: {e}') + + # Generates a string containing a directory structure and file name for the output file + @staticmethod + def create_output_file_path() -> str: + now = datetime.now() + time_stamp = now.strftime("%Y-%m-%dT%H-%M-%S") + os.makedirs("output/ExtractTextWithEncapsulatedTextFromPDF", exist_ok=True) + return f"output/ExtractTextWithEncapsulatedTextFromPDF/extract{time_stamp}.zip" + + +if __name__ == "__main__": + ExtractTextWithEncapsulatedTextFromPDF() +``` + +#### REST API + +```javascript +// Please refer our REST API docs for more information +// https://developer.adobe.com/document-services/docs/apis/#tag/Extract-PDF + +curl --location --request POST 'https://pdf-services.adobe.io/operation/extractpdf' \ +--header 'x-api-key: {{Placeholder for client_id}}' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer {{Placeholder for token}}' \ +--data-raw '{ + "assetID": 
"urn:aaid:AS:UE1:23c30ee0-2e4d-46d6-87f2-087832fca718", + "elementsToExtract": [ + "text", + "tables" + ], + "tagEncapsulatedText": ["Figure"] +}' +``` \ No newline at end of file diff --git a/src/pages/overview/pdf-extract-api/quickstarts/index.md b/src/pages/overview/pdf-extract-api/quickstarts/index.md index daa8c90b1..433fd4a9d 100644 --- a/src/pages/overview/pdf-extract-api/quickstarts/index.md +++ b/src/pages/overview/pdf-extract-api/quickstarts/index.md @@ -6,4 +6,5 @@ title: Quickstarts | PDF Extract API | Adobe PDF Services Want to quickly test out Extract PDF API and PDF To Markdown API? Choose your operation to get started: -* [Extract PDF](extract-pdf/) \ No newline at end of file +* [Extract PDF](extract-pdf/) +* [PDF to Markdown](pdf-to-markdown/) \ No newline at end of file diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/Adobe Extract API Sample.pdf b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/Adobe Extract API Sample.pdf new file mode 100644 index 000000000..7d9bc59e4 Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/Adobe Extract API Sample.pdf differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/index.md b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/index.md new file mode 100644 index 000000000..6054ede3a --- /dev/null +++ b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/index.md @@ -0,0 +1,299 @@ +--- +title: .NET | Quickstarts | PDF to Markdown API | Adobe PDF Services +--- + +# Quickstart for PDF to Markdown API (.NET) + +To get started using Adobe PDF to Markdown API, let's walk through a simple scenario - taking an input PDF document and extracting its elements into Markdown format. Once the PDF has been converted, we'll save the Markdown output.
In this guide, we will walk you through the complete process for creating a program that will accomplish this task. + +## Prerequisites + +To complete this guide, you will need: + +* [.NET: version 8.0 or above](https://dotnet.microsoft.com/en-us/download) +* [.NET SDK](https://dotnet.microsoft.com/en-us/download/dotnet/8.0) +* A build tool: Either Visual Studio or .NET Core CLI. +* An Adobe ID. If you do not have one, the credential setup will walk you through creating one. +* A way to edit code. No specific editor is required for this guide. + +## Step One: Getting credentials + +1) To begin, open your browser to . If you are not already logged in to Adobe.com, you will need to sign in or create a new user. Using a personal email account, rather than a federated ID, is recommended. + +![Sign in](./shot1.png) + +2) After registering or logging in, you will then be asked to name your new credentials. Use the name, "New Project". + +3) Change the "Choose language" setting to ".Net". + +4) Also note the checkbox by, "Create personalized code sample." This will include a large set of samples along with your credentials. These can be helpful for learning more later. + +5) Click the checkbox saying you agree to the developer terms and then click "Create credentials." + +![Project setup](./shot2_spc.png) + +6) After your credentials are created, they are automatically downloaded: + +![alt](./shot3_spc.png) + +## Step Two: Setting up the project + +1) In your Downloads folder, find the ZIP file with your credentials: PDFServicesSDK-.NetSamples.zip. If you unzip that archive, you will find a folder of samples and the `pdfservices-api-credentials.json` file. + +![alt](./shot5_spc.png) + +2) Take the `pdfservices-api-credentials.json` file and place it in a new directory. + +3) In your new directory, create a new file, `PDFToMarkdown.csproj`. This file will declare our requirements as well as help define the application we're creating.
+ +```xml + + + + Exe + net8.0 + + + + + + + + + + Always + + + Always + + + + +``` + +This file will define what dependencies we need and how the application will be built. + +Our application will take a PDF, `Adobe Extract API Sample.pdf` (downloadable from +[here](./Adobe%20Extract%20API%20Sample.pdf)) and extract its contents. The results will be saved as a `.md` file with a timestamp in the filename. + +4) In your editor, open the directory where you previously copied the credentials and created the `csproj` file. Create a new file, `Program.cs`. + +Now you're ready to begin coding. + +## Step Three: Creating the application + +1) We'll begin by including our required dependencies: + +```javascript +using System; +using System.IO; +using System.Reflection; +using Adobe.PDFServicesSDK; +using Adobe.PDFServicesSDK.auth; +using Adobe.PDFServicesSDK.exception; +using Adobe.PDFServicesSDK.io; +using Adobe.PDFServicesSDK.pdfjobs.jobs; +using Adobe.PDFServicesSDK.pdfjobs.parameters.pdftomarkdown; +using Adobe.PDFServicesSDK.pdfjobs.results; +using log4net; +using log4net.Config; +using log4net.Repository; +``` + +2) Now let's define our main class and `Main` method: + +```javascript +namespace PDFToMarkdown +{ + class Program + { + private static readonly ILog log = LogManager.GetLogger(typeof(Program)); + static void Main() + { + + } + } +} +``` + +3) Set the environment variables `PDF_SERVICES_CLIENT_ID` and `PDF_SERVICES_CLIENT_SECRET` by running the following commands and replacing placeholders `YOUR CLIENT ID` and `YOUR CLIENT SECRET` with the credentials present in the `pdfservices-api-credentials.json` file: +- **Windows:** + - `set PDF_SERVICES_CLIENT_ID=<YOUR CLIENT ID>` + - `set PDF_SERVICES_CLIENT_SECRET=<YOUR CLIENT SECRET>` + +- **MacOS/Linux:** + - `export PDF_SERVICES_CLIENT_ID=<YOUR CLIENT ID>` + - `export PDF_SERVICES_CLIENT_SECRET=<YOUR CLIENT SECRET>` + +4) Next, we can create our credentials and use them to create a PDF Services instance: + +```javascript +// Initial setup, create credentials instance +ICredentials
credentials = new ServicePrincipalCredentials( + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_ID"), + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_SECRET")); + +// Creates a PDF Services instance +PDFServices pdfServices = new PDFServices(credentials); +``` + +5) Now, let's upload the asset: + +```javascript +// Creates an asset from source file and upload +using Stream inputStream = File.OpenRead(@"Adobe Extract API Sample.pdf"); +IAsset asset = pdfServices.Upload(inputStream, PDFServicesMediaType.PDF.GetMIMETypeValue()); +``` + +This defines which PDF will be converted. In a real application, this value would typically be dynamic. + +6) Now, let's create the job: + +```javascript +// Create parameters for the job +PDFToMarkdownParams pdfToMarkdownParams = new PDFToMarkdownParams.Builder() + .WithGetFigures(true) + .Build(); + +// Creates a new job instance +PDFToMarkdownJob pdfToMarkdownJob = new PDFToMarkdownJob(asset) + .SetParams(pdfToMarkdownParams); +``` + +This set of code defines what we're doing (a PDF to Markdown conversion operation). The `WithGetFigures(true)` option will extract figures and images as base64-embedded images in the Markdown output. + +7) The next code block submits the job and gets the job result: + +```javascript +// Submits the job and gets the job result +String location = pdfServices.Submit(pdfToMarkdownJob); +PDFServicesResponse<PDFToMarkdownResult> pdfServicesResponse = + pdfServices.GetJobResult<PDFToMarkdownResult>(location, typeof(PDFToMarkdownResult)); + +// Get content from the resulting asset(s) +IAsset resultAsset = pdfServicesResponse.Result.Asset; +StreamAsset streamAsset = pdfServices.GetContent(resultAsset); +``` + +This code runs the PDF to Markdown conversion process and gets the content of the result asset.
+ +8) The next code block saves the result at the specified location: + +```javascript +// Creating output file path with timestamp +String outputFilePath = CreateOutputFilePath(); +new FileInfo(Directory.GetCurrentDirectory() + outputFilePath).Directory.Create(); +Stream outputStream = File.OpenWrite(Directory.GetCurrentDirectory() + outputFilePath); +streamAsset.Stream.CopyTo(outputStream); +outputStream.Close(); +``` + +9) Add a helper method to create the output file path with a timestamp: + +```javascript +private static String CreateOutputFilePath() +{ + String timeStamp = DateTime.Now.ToString("yyyy'-'MM'-'dd'T'HH'-'mm'-'ss"); + return ("/output/pdfToMarkdown" + timeStamp + ".md"); +} +``` + +![Example running in the command line](./shot9.png) + +Here's the complete application (`Program.cs`): + +```javascript +using System; +using System.IO; +using System.Reflection; +using Adobe.PDFServicesSDK; +using Adobe.PDFServicesSDK.auth; +using Adobe.PDFServicesSDK.exception; +using Adobe.PDFServicesSDK.io; +using Adobe.PDFServicesSDK.pdfjobs.jobs; +using Adobe.PDFServicesSDK.pdfjobs.parameters.pdftomarkdown; +using Adobe.PDFServicesSDK.pdfjobs.results; +using log4net; +using log4net.Config; +using log4net.Repository; + +namespace PDFToMarkdown +{ + class Program + { + private static readonly ILog log = LogManager.GetLogger(typeof(Program)); + + static void Main() + { + ConfigureLogging(); + try + { + ICredentials credentials = new ServicePrincipalCredentials( + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_ID"), + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_SECRET")); + + PDFServices pdfServices = new PDFServices(credentials); + + using Stream inputStream = File.OpenRead(@"Adobe Extract API Sample.pdf"); + IAsset asset = pdfServices.Upload(inputStream, PDFServicesMediaType.PDF.GetMIMETypeValue()); + + PDFToMarkdownParams pdfToMarkdownParams = new PDFToMarkdownParams.Builder() + .WithGetFigures(true) + .Build(); + + PDFToMarkdownJob pdfToMarkdownJob
= new PDFToMarkdownJob(asset) + .SetParams(pdfToMarkdownParams); + + String location = pdfServices.Submit(pdfToMarkdownJob); + PDFServicesResponse<PDFToMarkdownResult> pdfServicesResponse = + pdfServices.GetJobResult<PDFToMarkdownResult>(location, typeof(PDFToMarkdownResult)); + + IAsset resultAsset = pdfServicesResponse.Result.Asset; + StreamAsset streamAsset = pdfServices.GetContent(resultAsset); + + String outputFilePath = CreateOutputFilePath(); + new FileInfo(Directory.GetCurrentDirectory() + outputFilePath).Directory.Create(); + Stream outputStream = File.OpenWrite(Directory.GetCurrentDirectory() + outputFilePath); + streamAsset.Stream.CopyTo(outputStream); + outputStream.Close(); + } + catch (ServiceUsageException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (ServiceApiException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (SDKException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (IOException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (Exception ex) + { + log.Error("Exception encountered while executing operation", ex); + } + } + + static void ConfigureLogging() + { + ILoggerRepository logRepository = LogManager.GetRepository(Assembly.GetEntryAssembly()); + XmlConfigurator.Configure(logRepository, new FileInfo("log4net.config")); + } + + private static String CreateOutputFilePath() + { + String timeStamp = DateTime.Now.ToString("yyyy'-'MM'-'dd'T'HH'-'mm'-'ss"); + return ("/output/pdfToMarkdown" + timeStamp + ".md"); + } + } +} +``` + +## Next Steps + +Now that you've successfully performed your first operation, [review the documentation](https://developer.adobe.com/document-services/docs/overview/pdf-services-api/) for many other examples and reach out on our [forums](https://community.adobe.com/t5/document-services-apis/ct-p/ct-Document-Cloud-SDK) with any questions.
Also remember the samples you downloaded while creating your credentials also have many demos. diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot1.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot1.png new file mode 100644 index 000000000..cca4151ad Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot1.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot2_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot2_spc.png new file mode 100644 index 000000000..d91f4d403 Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot2_spc.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot3_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot3_spc.png new file mode 100644 index 000000000..fa1c1ea8e Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot3_spc.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot5_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot5_spc.png new file mode 100644 index 000000000..4dd116750 Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot5_spc.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot6_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot6_spc.png new file mode 100644 index 000000000..063f61938 Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot6_spc.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot9.png 
b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot9.png new file mode 100644 index 000000000..b9ae6e41d Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/dotnet/shot9.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/index.md b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/index.md new file mode 100644 index 000000000..7730c05fb --- /dev/null +++ b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/index.md @@ -0,0 +1,10 @@ +--- +title: Quickstarts | PDF to Markdown API | Adobe PDF Services +--- + +# PDF To Markdown - Quickstarts + +The following quickstarts will help you run your first successful operation and are tailored to our supported SDKs: + +* [.NET](dotnet) +* [Python](python) \ No newline at end of file diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/Adobe Extract API Sample.pdf b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/Adobe Extract API Sample.pdf new file mode 100644 index 000000000..7d9bc59e4 Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/Adobe Extract API Sample.pdf differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/index.md b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/index.md new file mode 100644 index 000000000..656bb6e65 --- /dev/null +++ b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/index.md @@ -0,0 +1,205 @@ +--- +title: Python | Quickstarts | PDF to Markdown API | Adobe PDF Services +--- + +# Getting Started with PDF to Markdown API (Python) + +To get started using Adobe PDF to Markdown API, let's walk through a simple scenario - taking an input PDF document and extracting its elements into Markdown format. Once the PDF has been converted, we'll save the Markdown output. 
In this guide, we will walk you through the complete process for creating a program that will accomplish this task. + +## Prerequisites + +To complete this guide, you will need: + +* [Python](https://www.python.org/downloads/) - Python 3.10 or higher is required. +* An Adobe ID. If you do not have one, the credential setup will walk you through creating one. +* A way to edit code. No specific editor is required for this guide. + + +## Step One: Getting credentials + +1) To begin, open your browser to . If you are not already logged in to Adobe.com, you will need to sign in or create a new user. Using a personal email account, rather than a federated ID, is recommended. + +![Sign in](./shot1.png) + +2) After registering or logging in, you will then be asked to name your new credentials. Use the name, "New Project". + +3) Change the "Choose language" setting to "Python". + +4) Also note the checkbox by, "Create personalized code sample." This will include a large set of samples along with your credentials. These can be helpful for learning more later. + +5) Click the checkbox saying you agree to the developer terms and then click "Create credentials." + +![Project setup](./shot2_spc.png) + +6) After your credentials are created, they are automatically downloaded: + +![alt](./shot3_spc.png) + +## Step Two: Setting up the project + +1) In your Downloads folder, find the ZIP file with your credentials: PDFServicesSDK-Python Samples.zip. If you unzip that archive, you will find a README file, a folder of samples and the `pdfservices-api-credentials.json` file. + +![alt](./shot5_spc.png) + +2) Take the `pdfservices-api-credentials.json` file and place it in a new directory. Remember that these credential files are important and should be stored safely. + +3) At the command line, change to the directory you created, and run the following command to install the Python SDK: `pip install pdfservices-sdk`.
+ +![alt](./shot7.png) + +At this point, we've installed the Python SDK for Adobe PDF Services API as a dependency for our project and have copied over our credentials files. + +Our application will take a PDF, `Adobe Extract API Sample.pdf` (downloadable from [here](/ +Adobe%20Extract%20API%20Sample.pdf)), and extract its contents. Save the PDF in your project directory as `pdfToMarkdownInput.pdf`, the file name the code below opens. The results will be saved as a `.md` file with a timestamp in the filename. + +4) In your editor, open the directory where you previously copied the credentials. Create a new file, `extract.py`. + +Now you're ready to begin coding. + +## Step Three: Creating the application + +1) We'll begin by including our required dependencies: + +```python +import logging +import os +from datetime import datetime + +from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials +from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException +from adobe.pdfservices.operation.io.cloud_asset import CloudAsset +from adobe.pdfservices.operation.io.stream_asset import StreamAsset +from adobe.pdfservices.operation.pdf_services import PDFServices +from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType +from adobe.pdfservices.operation.pdfjobs.jobs.pdf_to_markdown_job import PDFToMarkdownJob +from adobe.pdfservices.operation.pdfjobs.result.pdf_to_markdown_result import PDFToMarkdownResult +``` + +These imports bring in the Adobe PDF Services SDK components needed for PDF to Markdown conversion. + +2) Next, we set up the SDK to use our credentials. + +```python +# Initial setup, create credentials instance +credentials = ServicePrincipalCredentials( + client_id=os.getenv('PDF_SERVICES_CLIENT_ID'), + client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET') +) +``` + +This code reads the client ID and client secret from the `PDF_SERVICES_CLIENT_ID` and `PDF_SERVICES_CLIENT_SECRET` environment variables and creates the credentials object that will be used later. Set both variables, using the values from the credentials you downloaded previously, before running the program. 
+ +3) Now, let's create the operation: + +```python +# Creates a PDF Services instance +pdf_services = PDFServices(credentials=credentials) + +# Creates an asset(s) from source file(s) and upload +input_asset = pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF) + +# Creates a new job instance +pdf_to_markdown_job = PDFToMarkdownJob(input_asset=input_asset) +``` + +This code creates a PDF to Markdown conversion job. The job will convert the PDF content to Markdown format, preserving document structure and formatting. + +4) The next code block executes the operation: + +```python +# Submit the job and gets the job result +location = pdf_services.submit(pdf_to_markdown_job) +pdf_services_response = pdf_services.get_job_result(location, PDFToMarkdownResult) + +# Get content from the resulting asset(s) +result_asset: CloudAsset = pdf_services_response.get_result().get_asset() +stream_asset: StreamAsset = pdf_services.get_content(result_asset) + +# Creates an output stream and copy stream asset's content to it +output_file_path = self.create_output_file_path() +with open(output_file_path, "wb") as file: + file.write(stream_asset.get_input_stream()) +``` + +This code runs the PDF to Markdown conversion process and then stores the result Markdown file to the file system. 
+ +![alt](./shot8.png) + +Here's the complete application (`extract.py`): + +```python +import logging +import os +from datetime import datetime + +from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials +from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException +from adobe.pdfservices.operation.io.cloud_asset import CloudAsset +from adobe.pdfservices.operation.io.stream_asset import StreamAsset +from adobe.pdfservices.operation.pdf_services import PDFServices +from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType +from adobe.pdfservices.operation.pdfjobs.jobs.pdf_to_markdown_job import PDFToMarkdownJob +from adobe.pdfservices.operation.pdfjobs.result.pdf_to_markdown_result import PDFToMarkdownResult + +# Initialize the logger +logging.basicConfig(level=logging.INFO) + +# This sample illustrates how to convert a PDF file to Markdown format. +# +# Refer to README.md for instructions on how to run the samples. 
+ +class PDFToMarkdown: + def __init__(self): + try: + file = open('./pdfToMarkdownInput.pdf', 'rb') + input_stream = file.read() + file.close() + + # Initial setup, create credentials instance + credentials = ServicePrincipalCredentials( + client_id=os.getenv('PDF_SERVICES_CLIENT_ID'), + client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET') + ) + + # Creates a PDF Services instance + pdf_services = PDFServices(credentials=credentials) + + # Creates an asset(s) from source file(s) and upload + input_asset = pdf_services.upload(input_stream=input_stream, + mime_type=PDFServicesMediaType.PDF) + + # Creates a new job instance + pdf_to_markdown_job = PDFToMarkdownJob(input_asset=input_asset) + + # Submit the job and gets the job result + location = pdf_services.submit(pdf_to_markdown_job) + pdf_services_response = pdf_services.get_job_result(location, PDFToMarkdownResult) + + # Get content from the resulting asset(s) + result_asset: CloudAsset = pdf_services_response.get_result().get_asset() + stream_asset: StreamAsset = pdf_services.get_content(result_asset) + + # Creates an output stream and copy stream asset's content to it + output_file_path = self.create_output_file_path() + with open(output_file_path, "wb") as file: + file.write(stream_asset.get_input_stream()) + + except (ServiceApiException, ServiceUsageException, SdkException) as e: + logging.exception(f'Exception encountered while executing operation: {e}') + + # Generates a string containing a directory structure and file name for the output file + @staticmethod + def create_output_file_path() -> str: + now = datetime.now() + time_stamp = now.strftime("%Y-%m-%dT%H-%M-%S") + os.makedirs("output/PDFToMarkdown", exist_ok=True) + return f"output/PDFToMarkdown/markdown{time_stamp}.md" + + +if __name__ == "__main__": + PDFToMarkdown() +``` + +## Next Steps + +Now that you've successfully performed your first operation, [review the 
documentation](https://developer.adobe.com/document-services/docs/overview/pdf-services-api/) for many other examples and reach out on our [forums](https://community.adobe.com/t5/document-services-apis/ct-p/ct-Document-Cloud-SDK?page=1&sort=latest_replies&filter=all&lang=all&tabid=discussions&topics=label-documentgenerationapi) with any questions. Also remember that the samples you downloaded while creating your credentials include many demos. diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot1.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot1.png new file mode 100644 index 000000000..cca4151ad Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot1.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot2_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot2_spc.png new file mode 100644 index 000000000..b486a8e2e Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot2_spc.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot3_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot3_spc.png new file mode 100644 index 000000000..fa1c1ea8e Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot3_spc.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot5_spc.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot5_spc.png new file mode 100644 index 000000000..8b09c67be Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot5_spc.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot6_spc.png 
b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot6_spc.png new file mode 100644 index 000000000..5949871fd Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot6_spc.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot7.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot7.png new file mode 100644 index 000000000..f85593aff Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot7.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot8.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot8.png new file mode 100644 index 000000000..947b5acde Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot8.png differ diff --git a/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot9.png b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot9.png new file mode 100644 index 000000000..06ce78ca9 Binary files /dev/null and b/src/pages/overview/pdf-extract-api/quickstarts/pdf-to-markdown/python/shot9.png differ diff --git a/src/pages/overview/pdf-services-api/howtos/create-pdf.md b/src/pages/overview/pdf-services-api/howtos/create-pdf.md index 4fb5db5b0..22ac28ee5 100644 --- a/src/pages/overview/pdf-services-api/howtos/create-pdf.md +++ b/src/pages/overview/pdf-services-api/howtos/create-pdf.md @@ -26,6 +26,7 @@ following formats: - Microsoft Excel (XLS, XLSX) - Text (TXT, RTF) - Image (BMP, JPEG, GIF, TIFF, PNG) +- Markdown (MARKDOWN) diff --git a/src/pages/overview/pdf-services-api/howtos/pdf-to-markdown-api.md b/src/pages/overview/pdf-services-api/howtos/pdf-to-markdown-api.md index ec81a8ddc..ad81ac970 100644 --- a/src/pages/overview/pdf-services-api/howtos/pdf-to-markdown-api.md +++ 
b/src/pages/overview/pdf-services-api/howtos/pdf-to-markdown-api.md @@ -116,3 +116,343 @@ For File Constraints and Processing Limits, see [Licensing and Usage Limits](../ ## REST API See our public API Reference for [PDF to Markdown API](../../../apis/#tag/PDF-To-Markdown). + +## Get Markdown from a PDF + +Use the sample below to convert a PDF to Markdown. + +Please refer to the [API usage guide](../api-usage.md) to understand how to use our APIs. + + + +#### .NET + +```javascript +// Get the samples from https://github.com/adobe/PDFServices.NET.SDK.Samples +// Run the sample: +// cd PDFToMarkdown/ +// dotnet run PDFToMarkdown.csproj +namespace PDFToMarkdown +{ + class Program + { + private static readonly ILog log = LogManager.GetLogger(typeof(Program)); + + static void Main() + { + ConfigureLogging(); + try + { + ICredentials credentials = new ServicePrincipalCredentials( + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_ID"), + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_SECRET")); + + PDFServices pdfServices = new PDFServices(credentials); + + using Stream inputStream = File.OpenRead(@"pdfToMarkdownInput.pdf"); + IAsset asset = pdfServices.Upload(inputStream, PDFServicesMediaType.PDF.GetMIMETypeValue()); + + + PDFToMarkdownJob pdfToMarkdownJob = new PDFToMarkdownJob(asset); + + String location = pdfServices.Submit(pdfToMarkdownJob); + PDFServicesResponse pdfServicesResponse = + pdfServices.GetJobResult(location, typeof(PDFToMarkdownResult)); + + IAsset resultAsset = pdfServicesResponse.Result.Asset; + StreamAsset streamAsset = pdfServices.GetContent(resultAsset); + + String outputFilePath = CreateOutputFilePath(); + new FileInfo(Directory.GetCurrentDirectory() + outputFilePath).Directory.Create(); + Stream outputStream = File.OpenWrite(Directory.GetCurrentDirectory() + outputFilePath); + streamAsset.Stream.CopyTo(outputStream); + outputStream.Close(); + } + catch (ServiceUsageException ex) + { + log.Error("Exception encountered while executing 
operation", ex); + } + catch (ServiceApiException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (SDKException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (IOException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (Exception ex) + { + log.Error("Exception encountered while executing operation", ex); + } + } + + static void ConfigureLogging() + { + ILoggerRepository logRepository = LogManager.GetRepository(Assembly.GetEntryAssembly()); + XmlConfigurator.Configure(logRepository, new FileInfo("log4net.config")); + } + + private static String CreateOutputFilePath() + { + String timeStamp = DateTime.Now.ToString("yyyy'-'MM'-'dd'T'HH'-'mm'-'ss"); + return ("/output/pdfToMarkdown" + timeStamp + ".md"); + } + } +} +``` + +#### Python + +```python +# Get the samples https://github.com/adobe/pdfservices-python-sdk-samples +# Run the sample: +# python src/pdftomarkdown/pdf_to_markdown.py + +# Initialize the logger +logging.basicConfig(level=logging.INFO) + +class PDFToMarkdown: + def __init__(self): + try: + file = open('src/resources/pdfToMarkdownInput.pdf', 'rb') + input_stream = file.read() + file.close() + + # Initial setup, create credentials instance + credentials = ServicePrincipalCredentials( + client_id=os.getenv('PDF_SERVICES_CLIENT_ID'), + client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET') + ) + + # Creates a PDF Services instance + pdf_services = PDFServices(credentials=credentials) + + # Creates an asset(s) from source file(s) and upload + input_asset = pdf_services.upload(input_stream=input_stream, + mime_type=PDFServicesMediaType.PDF) + + # Creates a new job instance + pdf_to_markdown_job = PDFToMarkdownJob(input_asset=input_asset) + + # Submit the job and gets the job result + location = pdf_services.submit(pdf_to_markdown_job) + pdf_services_response = pdf_services.get_job_result(location, PDFToMarkdownResult) + + # Get content 
from the resulting asset(s) + result_asset: CloudAsset = pdf_services_response.get_result().get_asset() + stream_asset: StreamAsset = pdf_services.get_content(result_asset) + + # Creates an output stream and copy stream asset's content to it + output_file_path = self.create_output_file_path() + with open(output_file_path, "wb") as file: + file.write(stream_asset.get_input_stream()) + + except (ServiceApiException, ServiceUsageException, SdkException) as e: + logging.exception(f'Exception encountered while executing operation: {e}') + + # Generates a string containing a directory structure and file name for the output file + @staticmethod + def create_output_file_path() -> str: + now = datetime.now() + time_stamp = now.strftime("%Y-%m-%dT%H-%M-%S") + os.makedirs("output/PDFToMarkdown", exist_ok=True) + return f"output/PDFToMarkdown/markdown{time_stamp}.md" + + +if __name__ == "__main__": + PDFToMarkdown() +``` + +#### REST API + +```javascript +// Please refer our REST API docs for more information +// https://developer.adobe.com/document-services/docs/apis/#tag/PDF-To-Markdown + +curl --location --request POST 'https://pdf-services.adobe.io/operation/pdftomarkdown' \ +--header 'x-api-key: {{Placeholder for client_id}}' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer {{Placeholder for token}}' \ +--data-raw '{ + "assetID": "urn:aaid:AS:UE1:23c30ee0-2e4d-46d6-87f2-087832fca718" +}' +``` + +## Get Markdown from a PDF with Figures + +Use the sample below to convert a PDF to Markdown, including the figures embedded in the PDF. + +Please refer to the [API usage guide](../api-usage.md) to understand how to use our APIs. 
+ + + +#### .NET + +```javascript +// Get the samples from https://github.com/adobe/PDFServices.NET.SDK.Samples +// Run the sample: +// cd PDFToMarkdownWithFigures/ +// dotnet run PDFToMarkdownWithFigures.csproj +namespace PDFToMarkdownWithFigures +{ + class Program + { + private static readonly ILog log = LogManager.GetLogger(typeof(Program)); + + static void Main() + { + ConfigureLogging(); + try + { + ICredentials credentials = new ServicePrincipalCredentials( + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_ID"), + Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_SECRET")); + + PDFServices pdfServices = new PDFServices(credentials); + + using Stream inputStream = File.OpenRead(@"pdfToMarkdownInput.pdf"); + IAsset asset = pdfServices.Upload(inputStream, PDFServicesMediaType.PDF.GetMIMETypeValue()); + + // Create parameters for the job (include figure renditions in the output) + PDFToMarkdownParams pdfToMarkdownParams = PDFToMarkdownParams.PDFToMarkdownParamsBuilder() + .WithGetFigures(true) + .Build(); + + PDFToMarkdownJob pdfToMarkdownJob = new PDFToMarkdownJob(asset) + .SetParams(pdfToMarkdownParams); + + String location = pdfServices.Submit(pdfToMarkdownJob); + PDFServicesResponse pdfServicesResponse = + pdfServices.GetJobResult(location, typeof(PDFToMarkdownResult)); + + IAsset resultAsset = pdfServicesResponse.Result.Asset; + StreamAsset streamAsset = pdfServices.GetContent(resultAsset); + + String outputFilePath = CreateOutputFilePath(); + new FileInfo(Directory.GetCurrentDirectory() + outputFilePath).Directory.Create(); + Stream outputStream = File.OpenWrite(Directory.GetCurrentDirectory() + outputFilePath); + streamAsset.Stream.CopyTo(outputStream); + outputStream.Close(); + } + catch (ServiceUsageException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (ServiceApiException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (SDKException ex) + { + 
log.Error("Exception encountered while executing operation", ex); + } + catch (IOException ex) + { + log.Error("Exception encountered while executing operation", ex); + } + catch (Exception ex) + { + log.Error("Exception encountered while executing operation", ex); + } + } + + static void ConfigureLogging() + { + ILoggerRepository logRepository = LogManager.GetRepository(Assembly.GetEntryAssembly()); + XmlConfigurator.Configure(logRepository, new FileInfo("log4net.config")); + } + + private static String CreateOutputFilePath() + { + String timeStamp = DateTime.Now.ToString("yyyy'-'MM'-'dd'T'HH'-'mm'-'ss"); + return ("/output/pdfToMarkdownWithFigures" + timeStamp + ".md"); + } + } +} +``` + +#### Python + +```python +# Get the samples https://github.com/adobe/pdfservices-python-sdk-samples +# Run the sample: +# python src/pdftomarkdown/pdf_to_markdown.py + +# Initialize the logger +logging.basicConfig(level=logging.INFO) + +class PDFToMarkdownWithOptions: + def __init__(self): + try: + file = open('src/resources/pdfToMarkdownInput.pdf', 'rb') + input_stream = file.read() + file.close() + + # Initial setup, create credentials instance + credentials = ServicePrincipalCredentials( + client_id=os.getenv('PDF_SERVICES_CLIENT_ID'), + client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET') + ) + + # Creates a PDF Services instance + pdf_services = PDFServices(credentials=credentials) + + # Creates an asset(s) from source file(s) and upload + input_asset = pdf_services.upload(input_stream=input_stream, + mime_type=PDFServicesMediaType.PDF) + + # Create parameters for the job with figures extraction enabled + pdf_to_markdown_params = PDFToMarkdownParams(get_figures=True) + + # Creates a new job instance + pdf_to_markdown_job = PDFToMarkdownJob(input_asset=input_asset, + pdf_to_markdown_params=pdf_to_markdown_params) + + # Submit the job and gets the job result + location = pdf_services.submit(pdf_to_markdown_job) + pdf_services_response = pdf_services.get_job_result(location, 
PDFToMarkdownResult) + + # Get content from the resulting asset(s) + result_asset: CloudAsset = pdf_services_response.get_result().get_asset() + stream_asset: StreamAsset = pdf_services.get_content(result_asset) + + # Creates an output stream and copy stream asset's content to it + output_file_path = self.create_output_file_path() + with open(output_file_path, "wb") as file: + file.write(stream_asset.get_input_stream()) + + except (ServiceApiException, ServiceUsageException, SdkException) as e: + logging.exception(f'Exception encountered while executing operation: {e}') + + # Generates a string containing a directory structure and file name for the output file + @staticmethod + def create_output_file_path() -> str: + now = datetime.now() + time_stamp = now.strftime("%Y-%m-%dT%H-%M-%S") + os.makedirs("output/PDFToMarkdownWithOptions", exist_ok=True) + return f"output/PDFToMarkdownWithOptions/markdown{time_stamp}.md" + + +if __name__ == "__main__": + PDFToMarkdownWithOptions() +``` + +#### REST API + +```javascript +// Please refer our REST API docs for more information +// https://developer.adobe.com/document-services/docs/apis/#tag/PDF-To-Markdown + +curl --location --request POST 'https://pdf-services.adobe.io/operation/pdftomarkdown' \ +--header 'x-api-key: {{Placeholder for client_id}}' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer {{Placeholder for token}}' \ +--data-raw '{ +"assetID": "urn:aaid:AS:UE1:23c30ee0-2e4d-46d6-87f2-087832fca718", + "getFigures": true +}' +``` \ No newline at end of file diff --git a/src/pages/overview/releasenotes.md b/src/pages/overview/releasenotes.md index 62888662e..7fa1e0077 100644 --- a/src/pages/overview/releasenotes.md +++ b/src/pages/overview/releasenotes.md @@ -139,7 +139,7 @@ const pageLayout = new PDFServicesSdk.CreatePDF.options.html.PageLayout(); ```javascript - + ``` @@ -154,7 +154,7 @@ using Adobe.PDFServicesSDK; - Add the following dependency in your project's requirements.txt file: 
```javascript -pdfservices-sdk~=4.1.0 +pdfservices-sdk~=4.3.0 ``` ## Archived Documentation @@ -180,6 +180,20 @@ Upgrading to the latest SDK should not break existing applications. ## Change history +### April 6, 2026; .NET SDK 4.4.0 Release + +- Added support for [PDF to Markdown](../pdf-extract-api/howtos/pdf-to-markdown/) operation in PDF Services .NET SDK. +- Introduced support for generating PDFs from Markdown files in Create PDF operation +- Added includeRenderedHtml Parameter in the HTML to PDF operation +- Introduced includeHeaderFooter and tagEncapsulatedText Params in Extract operation + +### April 6, 2026; Python SDK 4.3.0 Release + +- Added support for [PDF to Markdown](../pdf-extract-api/howtos/pdf-to-markdown/) operation in PDF Services Python SDK. +- Introduced support for generating PDFs from Markdown files in Create PDF operation +- Added includeRenderedHtml Parameter in the HTML to PDF operation +- Introduced includeHeaderFooter and tagEncapsulatedText Params in Extract operation + ### September 29, 2025; .NET SDK 4.3.1 patch release - Bug fixes and stability improvements. 
diff --git a/src/pages/resources/Platform PDF Services Postman Collection.zip b/src/pages/resources/Platform PDF Services Postman Collection.zip index 709cac2b2..8cb344870 100644 Binary files a/src/pages/resources/Platform PDF Services Postman Collection.zip and b/src/pages/resources/Platform PDF Services Postman Collection.zip differ diff --git a/src/pages/resources/Platform PDF Services with External Storage Postman Collection.zip b/src/pages/resources/Platform PDF Services with External Storage Postman Collection.zip index 7ebc8f088..7f3e38bbe 100644 Binary files a/src/pages/resources/Platform PDF Services with External Storage Postman Collection.zip and b/src/pages/resources/Platform PDF Services with External Storage Postman Collection.zip differ diff --git a/src/pages/resources/openapi.json b/src/pages/resources/openapi.json index bc0fed5bd..1e82011d7 100644 --- a/src/pages/resources/openapi.json +++ b/src/pages/resources/openapi.json @@ -24,7 +24,7 @@ }, { "name": "Create PDF", - "description": "Create PDF document from Microsoft Office documents (Word, Excel and PowerPoint) and Image file formats." + "description": "Create PDF document from Microsoft Office documents (Word, Excel and PowerPoint), Markdown and Image file formats." }, { "name": "Export PDF", @@ -2095,7 +2095,7 @@ "Create PDF" ], "summary": "Create PDF document from non PDF document.", - "description": "Create PDF document from Microsoft Office documents (Word, Excel and PowerPoint) and Image file formats.", + "description": "Create PDF document from Microsoft Office documents (Word, Excel and PowerPoint), Markdown and Image file formats.", "operationId": "pdfoperations.createpdf", "parameters": [ { @@ -10008,7 +10008,7 @@ "type": "object", "properties": { "assetID": { - "description": "A file assetID. Supported file formats are bmp, doc, docx, gif, jpeg, jpg, png, ppt, pptx, rtf, tif, tiff, txt, xls and xlsx. For more details click here .", + "description": "A file assetID. 
Supported file formats are bmp, doc, docx, gif, jpeg, jpg, png, ppt, pptx, rtf, tif, tiff, txt, xls, xlsx and md. For more details click here .", "type": "string" }, "documentLanguage": { @@ -10626,6 +10626,11 @@ "type": "number", "default": 100 }, + "includeRenderedHtml": { + "description": "When set to `true`, the operation returns a ZIP file containing both the generated PDF and the rendered HTML content. Valid and accepted input type of files are - HTML files and ZIP files (containing HTML + assets)", + "type": "boolean", + "default": false + }, "notifiers": { "$ref": "#/components/schemas/notifiers" } @@ -10639,6 +10644,7 @@ "pageHeight": 8.5 }, "waitTimeToLoad": 100, + "includeRenderedHtml": true, "notifiers": [ { "type": "CALLBACK", @@ -10703,6 +10709,11 @@ "description": "Specifies the maximum time (in milliseconds) to wait before finalizing the PDF. After this duration, the HTML is captured and converted into a PDF, regardless of whether all elements have finished loading or rendering. If a customer provides a custom value, it overrides the default.", "type": "number", "default": 100 + }, + "includeRenderedHtml": { + "description": "When set to `true`, the operation returns a ZIP file containing both the generated PDF and the rendered HTML content. Valid and accepted input type of files are - HTML files and ZIP files (containing HTML + assets)", + "type": "boolean", + "default": false } } }, @@ -10726,7 +10737,8 @@ "pageWidth": 8.5, "pageHeight": 11 }, - "waitTimeToLoad": 100 + "waitTimeToLoad": 100, + "includeRenderedHtml": true }, "notifiers": [ {