From cfb342cd7493f6767371e00b8440ce7423df467e Mon Sep 17 00:00:00 2001 From: Mmesek <13630781+Mmesek@users.noreply.github.com> Date: Fri, 27 Jun 2025 14:22:00 +0200 Subject: [PATCH 1/9] Implement PoC transform handling Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com> --- app/main.py | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 4 deletions(-) diff --git a/app/main.py b/app/main.py index 0aa38c5..0d2c589 100644 --- a/app/main.py +++ b/app/main.py @@ -1,8 +1,103 @@ -from fastapi import FastAPI +from fastapi import FastAPI, UploadFile +from enum import Enum +from pandas import DataFrame +from typing import Callable +from docx import Document +from odf.opendocument import OpenDocumentText +from odf.table import Table, TableRow, TableCell +import tabula + + +def from_odt(file): + # Load the ODT document + doc = OpenDocumentText(file) + + # Iterate through the tables in the document + for table in doc.getElementsByType(Table): + for row in table.getElementsByType(TableRow): + # Extract text from each cell in the row + row_data = [] + for cell in row.getElementsByType(TableCell): + # Get the text content of the cell + cell_text = "".join( + node.data + for node in cell.childNodes + if node.nodeType == node.TEXT_NODE + ) + row_data.append(cell_text) + print(row_data) # Print or process the row data as needed + + +def from_pdf(file_path): + # Read tables from the PDF file + tables = tabula.read_pdf(file_path, pages="all", multiple_tables=True) + + # Iterate through the extracted tables + for i, table in enumerate(tables): + print(f"Table {i + 1}:") + print(table) # Print or process the table as needed + app = FastAPI() -@app.get("/") -def read_root(): - return {"Hello": "World"} +class Formats(Enum): + CSV = "csv" + XML = "xml" + JSON = "json" + PDF = "pdf" + + +class ValidExtensions(Enum): + CSV = "csv" + XML = "xml" + JSON = "json" + PDF = "pdf" + DOC = "doc" + DOCX = "docx" + ODT = "odt" + + +FUNCTIONS: 
dict[Formats, Callable[[DataFrame], str]] = { + Formats.CSV: lambda x: x.to_csv(), + Formats.JSON: lambda x: x.to_json(), + Formats.XML: lambda x: x.to_xml(), +} + + +def from_docx(file): + # Load the document + doc = Document(file) + + # Iterate through the tables in the document + for table in doc.tables: + for row in table.rows: + # Extract text from each cell in the row + row_data = [cell.text for cell in row.cells] + + +@app.post("/") +def transform(format: Formats, file: UploadFile): + """ + Transforms given file to another format + Parameters + ---------- + format: + Destination format to use + file: + File to transform. Requires valid extension + """ + extension = file.filename.split(".")[-1] + if extension in {"csv", "xml", "json"}: + data = file.file.readlines() + df = DataFrame(data[1:], columns=data[0]) + elif extension in {"docx"}: + df = from_docx(file.file) + elif extension in {"odt"}: + df = from_odt(file.file) + elif extension in {"pdf"}: + df = from_pdf(file.file) + elif extension in {"doc"}: + return "ERROR: Not yet supported" + + return FUNCTIONS.get(format, lambda x: 404)(df) From e9e99f6c5cf166fa8e051d5948ebbf2d161ebdfc Mon Sep 17 00:00:00 2001 From: Mmesek <13630781+Mmesek@users.noreply.github.com> Date: Fri, 27 Jun 2025 14:23:54 +0200 Subject: [PATCH 2/9] Implement functions to convert tables from docs Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com> --- app/main.py | 150 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 95 insertions(+), 55 deletions(-) diff --git a/app/main.py b/app/main.py index 0d2c589..6ac7455 100644 --- a/app/main.py +++ b/app/main.py @@ -1,43 +1,12 @@ -from fastapi import FastAPI, UploadFile +from fastapi import FastAPI, Response, UploadFile from enum import Enum -from pandas import DataFrame +from pandas import DataFrame, concat, read_csv, read_json, read_xml from typing import Callable from docx import Document from odf.opendocument import OpenDocumentText from odf.table import 
Table, TableRow, TableCell import tabula - -def from_odt(file): - # Load the ODT document - doc = OpenDocumentText(file) - - # Iterate through the tables in the document - for table in doc.getElementsByType(Table): - for row in table.getElementsByType(TableRow): - # Extract text from each cell in the row - row_data = [] - for cell in row.getElementsByType(TableCell): - # Get the text content of the cell - cell_text = "".join( - node.data - for node in cell.childNodes - if node.nodeType == node.TEXT_NODE - ) - row_data.append(cell_text) - print(row_data) # Print or process the row data as needed - - -def from_pdf(file_path): - # Read tables from the PDF file - tables = tabula.read_pdf(file_path, pages="all", multiple_tables=True) - - # Iterate through the extracted tables - for i, table in enumerate(tables): - print(f"Table {i + 1}:") - print(table) # Print or process the table as needed - - app = FastAPI() @@ -45,7 +14,6 @@ class Formats(Enum): CSV = "csv" XML = "xml" JSON = "json" - PDF = "pdf" class ValidExtensions(Enum): @@ -58,22 +26,76 @@ class ValidExtensions(Enum): ODT = "odt" -FUNCTIONS: dict[Formats, Callable[[DataFrame], str]] = { - Formats.CSV: lambda x: x.to_csv(), - Formats.JSON: lambda x: x.to_json(), - Formats.XML: lambda x: x.to_xml(), +FUNCTIONS: dict[Formats, Callable[[DataFrame], Response]] = { + Formats.CSV: lambda df: Response( + content=df.to_csv(index=False), media_type="text/csv" + ), + Formats.JSON: lambda df: Response( + content=df.to_json(orient="records"), media_type="application/json" + ), + Formats.XML: lambda df: Response( + content=df.to_xml(index=False), media_type="application/xml" + ), } -def from_docx(file): - # Load the document - doc = Document(file) +def from_odt(file) -> DataFrame: + """ + Extracts tables from an ODT file and returns a concatenated DataFrame. 
+ """ + doc = OpenDocumentText(file) + dfs = [] + for table in doc.getElementsByType(Table): + data = [] + for row in table.getElementsByType(TableRow): + row_data = [] + for cell in row.getElementsByType(TableCell): + cell_text = "".join( + node.data + for node in cell.childNodes + if node.nodeType == node.TEXT_NODE + ) + row_data.append(cell_text) + data.append(row_data) + if data: + header = data[0] + df = DataFrame(data[1:], columns=header) + dfs.append(df) + if not dfs: + return DataFrame() + return concat(dfs, ignore_index=True) + + +def from_pdf(file_path: str) -> DataFrame: + """ + Extracts tables from a PDF file and returns a concatenated DataFrame. + """ + try: + tables = tabula.read_pdf(file_path, pages="all", multiple_tables=True) + except Exception: + return DataFrame() + if not tables: + return DataFrame() + return concat(tables, ignore_index=True) - # Iterate through the tables in the document + +def from_docx(file) -> DataFrame: + """ + Extracts tables from a DOCX file and returns a concatenated DataFrame. + """ + doc = Document(file) + dfs = [] for table in doc.tables: + data = [] for row in table.rows: - # Extract text from each cell in the row - row_data = [cell.text for cell in row.cells] + data.append([cell.text for cell in row.cells]) + if data: + header = data[0] + df = DataFrame(data[1:], columns=header) + dfs.append(df) + if not dfs: + return DataFrame() + return concat(dfs, ignore_index=True) @app.post("/") @@ -87,17 +109,35 @@ def transform(format: Formats, file: UploadFile): file: File to transform. 
Requires valid extension """ - extension = file.filename.split(".")[-1] - if extension in {"csv", "xml", "json"}: - data = file.file.readlines() - df = DataFrame(data[1:], columns=data[0]) - elif extension in {"docx"}: + extension = file.filename.split(".")[-1] if file.filename else "" + + if extension == "csv": + df = read_csv(file.file) + elif extension == "xml": + df = read_xml(file.file) + elif extension == "json": + df = read_json(file.file) + elif extension == "docx": df = from_docx(file.file) - elif extension in {"odt"}: + elif extension == "odt": df = from_odt(file.file) - elif extension in {"pdf"}: + elif extension == "pdf": df = from_pdf(file.file) - elif extension in {"doc"}: - return "ERROR: Not yet supported" - - return FUNCTIONS.get(format, lambda x: 404)(df) + elif extension == "doc": + return Response( + content="ERROR: .doc format is not supported for in-memory structured data extraction without external tools.", + status_code=400, + ) + else: + return Response( + content=f"Unsupported file extension: {extension}", status_code=400 + ) + + if df.empty: + return Response(content="Could not extract data from file.", status_code=400) + + if func := FUNCTIONS.get(format): + return func(df) + return Response( + content=f"Unsupported output format: {format.value}", status_code=400 + ) From 57f8a7b3e12cc87e3da0c9b6f53bbf8cfb2e4ab1 Mon Sep 17 00:00:00 2001 From: Mmesek <13630781+Mmesek@users.noreply.github.com> Date: Fri, 27 Jun 2025 14:24:00 +0200 Subject: [PATCH 3/9] Update requirements Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com> --- requirements.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d3feed5..194c53a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,6 @@ -fastapi[standard]>=0.115.13 \ No newline at end of file +fastapi[standard]>=0.115.13 +python-multipart +pandas +python-docx +odfpy +tabula-py From 4e5ab475f214dc90e1865d4b419f8516d5204dde Mon 
Sep 17 00:00:00 2001 From: Mmesek <13630781+Mmesek@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:13:39 +0200 Subject: [PATCH 4/9] Organize imports & Add typehints Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com> --- app/main.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/app/main.py b/app/main.py index 6ac7455..4ac5933 100644 --- a/app/main.py +++ b/app/main.py @@ -1,11 +1,12 @@ -from fastapi import FastAPI, Response, UploadFile from enum import Enum -from pandas import DataFrame, concat, read_csv, read_json, read_xml -from typing import Callable -from docx import Document -from odf.opendocument import OpenDocumentText -from odf.table import Table, TableRow, TableCell +from typing import BinaryIO, Callable + import tabula +from docx import Document +from fastapi import FastAPI, Response, UploadFile +from odf.opendocument import load +from odf.table import Table, TableCell, TableRow +from pandas import DataFrame, concat, read_csv, read_json, read_xml app = FastAPI() @@ -39,11 +40,11 @@ class ValidExtensions(Enum): } -def from_odt(file) -> DataFrame: +def from_odt(file: BinaryIO) -> DataFrame: """ Extracts tables from an ODT file and returns a concatenated DataFrame. """ - doc = OpenDocumentText(file) + doc = load(file) dfs = [] for table in doc.getElementsByType(Table): data = [] @@ -66,7 +67,7 @@ def from_odt(file) -> DataFrame: return concat(dfs, ignore_index=True) -def from_pdf(file_path: str) -> DataFrame: +def from_pdf(file_path: BinaryIO) -> DataFrame: """ Extracts tables from a PDF file and returns a concatenated DataFrame. """ @@ -79,7 +80,7 @@ def from_pdf(file_path: str) -> DataFrame: return concat(tables, ignore_index=True) -def from_docx(file) -> DataFrame: +def from_docx(file: BinaryIO) -> DataFrame: """ Extracts tables from a DOCX file and returns a concatenated DataFrame. 
""" From 77009cb24c068338f2c208dd27afc29266415177 Mon Sep 17 00:00:00 2001 From: Mmesek <13630781+Mmesek@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:14:07 +0200 Subject: [PATCH 5/9] Add local build instructions Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com> --- README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1585063..ef442fd 100644 --- a/README.md +++ b/README.md @@ -4,4 +4,16 @@ docker run --rm -it -p YOUR_PORT:80 ghcr.io/Atomic-IT/DataManager-Python-Tools ``` API Documentation URL: -- http://YOUR_IP:YOUR_PORT/docs \ No newline at end of file +- http://YOUR_IP:YOUR_PORT/docs + +# Local build + +## Build Docker +```sh +docker build -t dm-tools -f Dockerfile +``` + +## Run locally built image +```sh +docker run -p 1080:80 -it --rm dm-tools +``` From a9164f29d19123cd3b3fa6423e5588c4fabc36bc Mon Sep 17 00:00:00 2001 From: Mmesek <13630781+Mmesek@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:21:15 +0200 Subject: [PATCH 6/9] Fix extracting tables from .odt Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com> --- app/main.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/app/main.py b/app/main.py index 4ac5933..0d24f77 100644 --- a/app/main.py +++ b/app/main.py @@ -6,6 +6,7 @@ from fastapi import FastAPI, Response, UploadFile from odf.opendocument import load from odf.table import Table, TableCell, TableRow +from odf.teletype import extractText from pandas import DataFrame, concat, read_csv, read_json, read_xml app = FastAPI() @@ -51,11 +52,7 @@ def from_odt(file: BinaryIO) -> DataFrame: for row in table.getElementsByType(TableRow): row_data = [] for cell in row.getElementsByType(TableCell): - cell_text = "".join( - node.data - for node in cell.childNodes - if node.nodeType == node.TEXT_NODE - ) + cell_text = extractText(cell) row_data.append(cell_text) data.append(row_data) if data: From 7aab2be385d1c50d2200c97a696938300fdc0ad4 Mon 
Sep 17 00:00:00 2001 From: Mmesek <13630781+Mmesek@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:26:52 +0200 Subject: [PATCH 7/9] Rename file_path to file to better reflect type Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com> --- app/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/main.py b/app/main.py index 0d24f77..c5126b1 100644 --- a/app/main.py +++ b/app/main.py @@ -64,12 +64,12 @@ def from_odt(file: BinaryIO) -> DataFrame: return concat(dfs, ignore_index=True) -def from_pdf(file_path: BinaryIO) -> DataFrame: +def from_pdf(file: BinaryIO) -> DataFrame: """ Extracts tables from a PDF file and returns a concatenated DataFrame. """ try: - tables = tabula.read_pdf(file_path, pages="all", multiple_tables=True) + tables = tabula.read_pdf(file, pages="all", multiple_tables=True) except Exception: return DataFrame() if not tables: From cb717cab0584bdd5a11c0b56f23b14ca9178d437 Mon Sep 17 00:00:00 2001 From: Mmesek <13630781+Mmesek@users.noreply.github.com> Date: Tue, 24 Jun 2025 01:15:55 +0200 Subject: [PATCH 8/9] Create codeql.yml --- .github/workflows/codeql.yml | 100 +++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 .github/workflows/codeql.yml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..57612d5 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,100 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. 
+# +name: "CodeQL Advanced" + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + schedule: + - cron: '28 12 * * 3' + +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners (GitHub.com only) + # Consider using larger runners or machines with greater resources for possible analysis time improvements. + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + permissions: + # required for all workflows + security-events: write + + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + include: + - language: actions + build-mode: none + - language: python + build-mode: none + # CodeQL supports the following values keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' + # Use `c-cpp` to analyze code written in C, C++ or both + # Use 'java-kotlin' to analyze code written in Java, Kotlin or both + # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both + # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, + # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 
+ # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how + # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Add any setup steps before running the `github/codeql-action/init` action. + # This includes steps like installing compilers or runtimes (`actions/setup-node` + # or others). This is typically only required for manual builds. + # - name: Setup runtime (example) + # uses: actions/setup-example@v1 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + # If the analyze step fails for one of the languages you are analyzing with + # "We were unable to automatically build your code", modify the matrix above + # to set the build mode to "manual" for that language. Then modify this step + # to build your code. + # ℹ️ Command-line programs to run using the OS shell. 
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + - if: matrix.build-mode == 'manual' + shell: bash + run: | + echo 'If you are using a "manual" build mode for one or more of the' \ + 'languages you are analyzing, replace this with the commands to build' \ + 'your code, for example:' + echo ' make bootstrap' + echo ' make release' + exit 1 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" From 48cb6346ef3da553d89f68c790433f7315a54b13 Mon Sep 17 00:00:00 2001 From: Mmesek <13630781+Mmesek@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:34:57 +0200 Subject: [PATCH 9/9] Fix formatting for linting action Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com> --- app/main.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/app/main.py b/app/main.py index c5126b1..3f6897f 100644 --- a/app/main.py +++ b/app/main.py @@ -29,15 +29,9 @@ class ValidExtensions(Enum): FUNCTIONS: dict[Formats, Callable[[DataFrame], Response]] = { - Formats.CSV: lambda df: Response( - content=df.to_csv(index=False), media_type="text/csv" - ), - Formats.JSON: lambda df: Response( - content=df.to_json(orient="records"), media_type="application/json" - ), - Formats.XML: lambda df: Response( - content=df.to_xml(index=False), media_type="application/xml" - ), + Formats.CSV: lambda df: Response(content=df.to_csv(index=False), media_type="text/csv"), + Formats.JSON: lambda df: Response(content=df.to_json(orient="records"), media_type="application/json"), + Formats.XML: lambda df: Response(content=df.to_xml(index=False), media_type="application/xml") } @@ -127,15 +121,11 @@ def transform(format: Formats, file: UploadFile): status_code=400, ) else: - return Response( - content=f"Unsupported file extension: {extension}", status_code=400 - ) + return Response(content=f"Unsupported file extension: {extension}", 
status_code=400) if df.empty: return Response(content="Could not extract data from file.", status_code=400) if func := FUNCTIONS.get(format): return func(df) - return Response( - content=f"Unsupported output format: {format.value}", status_code=400 - ) + return Response(content=f"Unsupported output format: {format.value}", status_code=400)