From cfb342cd7493f6767371e00b8440ce7423df467e Mon Sep 17 00:00:00 2001
From: Mmesek <13630781+Mmesek@users.noreply.github.com>
Date: Fri, 27 Jun 2025 14:22:00 +0200
Subject: [PATCH 1/9] Implement PoC transform handling

Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com>
---
 app/main.py | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 99 insertions(+), 4 deletions(-)

diff --git a/app/main.py b/app/main.py
index 0aa38c5..0d2c589 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,8 +1,103 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, UploadFile
+from enum import Enum
+from pandas import DataFrame
+from typing import Callable
+from docx import Document
+from odf.opendocument import OpenDocumentText
+from odf.table import Table, TableRow, TableCell
+import tabula
+
+
+def from_odt(file):
+    # Load the ODT document
+    doc = OpenDocumentText(file)
+
+    # Iterate through the tables in the document
+    for table in doc.getElementsByType(Table):
+        for row in table.getElementsByType(TableRow):
+            # Extract text from each cell in the row
+            row_data = []
+            for cell in row.getElementsByType(TableCell):
+                # Get the text content of the cell
+                cell_text = "".join(
+                    node.data
+                    for node in cell.childNodes
+                    if node.nodeType == node.TEXT_NODE
+                )
+                row_data.append(cell_text)
+            print(row_data)  # Print or process the row data as needed
+
+
+def from_pdf(file_path):
+    # Read tables from the PDF file
+    tables = tabula.read_pdf(file_path, pages="all", multiple_tables=True)
+
+    # Iterate through the extracted tables
+    for i, table in enumerate(tables):
+        print(f"Table {i + 1}:")
+        print(table)  # Print or process the table as needed
+
 
 app = FastAPI()
 
 
-@app.get("/")
-def read_root():
-    return {"Hello": "World"}
+class Formats(Enum):
+    CSV = "csv"
+    XML = "xml"
+    JSON = "json"
+    PDF = "pdf"
+
+
+class ValidExtensions(Enum):
+    CSV = "csv"
+    XML = "xml"
+    JSON = "json"
+    PDF = "pdf"
+    DOC = "doc"
+    DOCX = "docx"
+    ODT = "odt"
+
+
+FUNCTIONS: dict[Formats, Callable[[DataFrame], str]] = {
+    Formats.CSV: lambda x: x.to_csv(),
+    Formats.JSON: lambda x: x.to_json(),
+    Formats.XML: lambda x: x.to_xml(),
+}
+
+
+def from_docx(file):
+    # Load the document
+    doc = Document(file)
+
+    # Iterate through the tables in the document
+    for table in doc.tables:
+        for row in table.rows:
+            # Extract text from each cell in the row
+            row_data = [cell.text for cell in row.cells]
+
+
+@app.post("/")
+def transform(format: Formats, file: UploadFile):
+    """
+    Transforms given file to another format
+    Parameters
+    ----------
+    format:
+        Destination format to use
+    file:
+        File to transform. Requires valid extension
+    """
+    extension = file.filename.split(".")[-1]
+    if extension in {"csv", "xml", "json"}:
+        data = file.file.readlines()
+        df = DataFrame(data[1:], columns=data[0])
+    elif extension in {"docx"}:
+        df = from_docx(file.file)
+    elif extension in {"odt"}:
+        df = from_odt(file.file)
+    elif extension in {"pdf"}:
+        df = from_pdf(file.file)
+    elif extension in {"doc"}:
+        return "ERROR: Not yet supported"
+
+    return FUNCTIONS.get(format, lambda x: 404)(df)

From e9e99f6c5cf166fa8e051d5948ebbf2d161ebdfc Mon Sep 17 00:00:00 2001
From: Mmesek <13630781+Mmesek@users.noreply.github.com>
Date: Fri, 27 Jun 2025 14:23:54 +0200
Subject: [PATCH 2/9] Implement functions to convert tables from docs

Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com>
---
 app/main.py | 150 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 95 insertions(+), 55 deletions(-)

diff --git a/app/main.py b/app/main.py
index 0d2c589..6ac7455 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,43 +1,12 @@
-from fastapi import FastAPI, UploadFile
+from fastapi import FastAPI, Response, UploadFile
 from enum import Enum
-from pandas import DataFrame
+from pandas import DataFrame, concat, read_csv, read_json, read_xml
 from typing import Callable
 from docx import Document
 from odf.opendocument import OpenDocumentText
 from odf.table import Table, TableRow, TableCell
 import tabula
 
-
-def from_odt(file):
-    # Load the ODT document
-    doc = OpenDocumentText(file)
-
-    # Iterate through the tables in the document
-    for table in doc.getElementsByType(Table):
-        for row in table.getElementsByType(TableRow):
-            # Extract text from each cell in the row
-            row_data = []
-            for cell in row.getElementsByType(TableCell):
-                # Get the text content of the cell
-                cell_text = "".join(
-                    node.data
-                    for node in cell.childNodes
-                    if node.nodeType == node.TEXT_NODE
-                )
-                row_data.append(cell_text)
-            print(row_data)  # Print or process the row data as needed
-
-
-def from_pdf(file_path):
-    # Read tables from the PDF file
-    tables = tabula.read_pdf(file_path, pages="all", multiple_tables=True)
-
-    # Iterate through the extracted tables
-    for i, table in enumerate(tables):
-        print(f"Table {i + 1}:")
-        print(table)  # Print or process the table as needed
-
-
 app = FastAPI()
 
 
@@ -45,7 +14,6 @@ class Formats(Enum):
     CSV = "csv"
     XML = "xml"
     JSON = "json"
-    PDF = "pdf"
 
 
 class ValidExtensions(Enum):
@@ -58,22 +26,76 @@ class ValidExtensions(Enum):
     ODT = "odt"
 
 
-FUNCTIONS: dict[Formats, Callable[[DataFrame], str]] = {
-    Formats.CSV: lambda x: x.to_csv(),
-    Formats.JSON: lambda x: x.to_json(),
-    Formats.XML: lambda x: x.to_xml(),
+FUNCTIONS: dict[Formats, Callable[[DataFrame], Response]] = {
+    Formats.CSV: lambda df: Response(
+        content=df.to_csv(index=False), media_type="text/csv"
+    ),
+    Formats.JSON: lambda df: Response(
+        content=df.to_json(orient="records"), media_type="application/json"
+    ),
+    Formats.XML: lambda df: Response(
+        content=df.to_xml(index=False), media_type="application/xml"
+    ),
 }
 
 
-def from_docx(file):
-    # Load the document
-    doc = Document(file)
+def from_odt(file) -> DataFrame:
+    """
+    Extracts tables from an ODT file and returns a concatenated DataFrame.
+    """
+    doc = OpenDocumentText(file)
+    dfs = []
+    for table in doc.getElementsByType(Table):
+        data = []
+        for row in table.getElementsByType(TableRow):
+            row_data = []
+            for cell in row.getElementsByType(TableCell):
+                cell_text = "".join(
+                    node.data
+                    for node in cell.childNodes
+                    if node.nodeType == node.TEXT_NODE
+                )
+                row_data.append(cell_text)
+            data.append(row_data)
+        if data:
+            header = data[0]
+            df = DataFrame(data[1:], columns=header)
+            dfs.append(df)
+    if not dfs:
+        return DataFrame()
+    return concat(dfs, ignore_index=True)
+
+
+def from_pdf(file_path: str) -> DataFrame:
+    """
+    Extracts tables from a PDF file and returns a concatenated DataFrame.
+    """
+    try:
+        tables = tabula.read_pdf(file_path, pages="all", multiple_tables=True)
+    except Exception:
+        return DataFrame()
+    if not tables:
+        return DataFrame()
+    return concat(tables, ignore_index=True)
 
-    # Iterate through the tables in the document
+
+def from_docx(file) -> DataFrame:
+    """
+    Extracts tables from a DOCX file and returns a concatenated DataFrame.
+    """
+    doc = Document(file)
+    dfs = []
     for table in doc.tables:
+        data = []
         for row in table.rows:
-            # Extract text from each cell in the row
-            row_data = [cell.text for cell in row.cells]
+            data.append([cell.text for cell in row.cells])
+        if data:
+            header = data[0]
+            df = DataFrame(data[1:], columns=header)
+            dfs.append(df)
+    if not dfs:
+        return DataFrame()
+    return concat(dfs, ignore_index=True)
 
 
 @app.post("/")
@@ -87,17 +109,35 @@ def transform(format: Formats, file: UploadFile):
     file:
         File to transform. Requires valid extension
     """
-    extension = file.filename.split(".")[-1]
-    if extension in {"csv", "xml", "json"}:
-        data = file.file.readlines()
-        df = DataFrame(data[1:], columns=data[0])
-    elif extension in {"docx"}:
+    extension = file.filename.split(".")[-1] if file.filename else ""
+
+    if extension == "csv":
+        df = read_csv(file.file)
+    elif extension == "xml":
+        df = read_xml(file.file)
+    elif extension == "json":
+        df = read_json(file.file)
+    elif extension == "docx":
         df = from_docx(file.file)
-    elif extension in {"odt"}:
+    elif extension == "odt":
         df = from_odt(file.file)
-    elif extension in {"pdf"}:
+    elif extension == "pdf":
         df = from_pdf(file.file)
-    elif extension in {"doc"}:
-        return "ERROR: Not yet supported"
-
-    return FUNCTIONS.get(format, lambda x: 404)(df)
+    elif extension == "doc":
+        return Response(
+            content="ERROR: .doc format is not supported for in-memory structured data extraction without external tools.",
+            status_code=400,
+        )
+    else:
+        return Response(
+            content=f"Unsupported file extension: {extension}", status_code=400
+        )
+
+    if df.empty:
+        return Response(content="Could not extract data from file.", status_code=400)
+
+    if func := FUNCTIONS.get(format):
+        return func(df)
+    return Response(
+        content=f"Unsupported output format: {format.value}", status_code=400
+    )

From 57f8a7b3e12cc87e3da0c9b6f53bbf8cfb2e4ab1 Mon Sep 17 00:00:00 2001
From: Mmesek <13630781+Mmesek@users.noreply.github.com>
Date: Fri, 27 Jun 2025 14:24:00 +0200
Subject: [PATCH 3/9] Update requirements

Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com>
---
 requirements.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index d3feed5..194c53a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,6 @@
-fastapi[standard]>=0.115.13
\ No newline at end of file
+fastapi[standard]>=0.115.13
+python-multipart
+pandas
+python-docx
+odfpy
+tabula-py

From 4e5ab475f214dc90e1865d4b419f8516d5204dde Mon Sep 17 00:00:00 2001
From: Mmesek <13630781+Mmesek@users.noreply.github.com>
Date: Thu, 28 Aug 2025 19:13:39 +0200
Subject: [PATCH 4/9] Organize imports & Add typehints

Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com>
---
 app/main.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/app/main.py b/app/main.py
index 6ac7455..4ac5933 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,11 +1,12 @@
-from fastapi import FastAPI, Response, UploadFile
 from enum import Enum
-from pandas import DataFrame, concat, read_csv, read_json, read_xml
-from typing import Callable
-from docx import Document
-from odf.opendocument import OpenDocumentText
-from odf.table import Table, TableRow, TableCell
+from typing import BinaryIO, Callable
+
 import tabula
+from docx import Document
+from fastapi import FastAPI, Response, UploadFile
+from odf.opendocument import load
+from odf.table import Table, TableCell, TableRow
+from pandas import DataFrame, concat, read_csv, read_json, read_xml
 
 app = FastAPI()
 
@@ -39,11 +40,11 @@ class ValidExtensions(Enum):
 }
 
 
-def from_odt(file) -> DataFrame:
+def from_odt(file: BinaryIO) -> DataFrame:
     """
     Extracts tables from an ODT file and returns a concatenated DataFrame.
     """
-    doc = OpenDocumentText(file)
+    doc = load(file)
     dfs = []
     for table in doc.getElementsByType(Table):
         data = []
@@ -66,7 +67,7 @@ def from_odt(file) -> DataFrame:
     return concat(dfs, ignore_index=True)
 
 
-def from_pdf(file_path: str) -> DataFrame:
+def from_pdf(file_path: BinaryIO) -> DataFrame:
     """
     Extracts tables from a PDF file and returns a concatenated DataFrame.
     """
@@ -79,7 +80,7 @@ def from_pdf(file_path: str) -> DataFrame:
     return concat(tables, ignore_index=True)
 
 
-def from_docx(file) -> DataFrame:
+def from_docx(file: BinaryIO) -> DataFrame:
     """
     Extracts tables from a DOCX file and returns a concatenated DataFrame.
     """

From 77009cb24c068338f2c208dd27afc29266415177 Mon Sep 17 00:00:00 2001
From: Mmesek <13630781+Mmesek@users.noreply.github.com>
Date: Thu, 28 Aug 2025 19:14:07 +0200
Subject: [PATCH 5/9] Add local build instructions

Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com>
---
 README.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1585063..ef442fd 100644
--- a/README.md
+++ b/README.md
@@ -4,4 +4,16 @@ docker run --rm -it -p YOUR_PORT:80 ghcr.io/Atomic-IT/DataManager-Python-Tools
 ```
 
 API Documentation URL:
-- http://YOUR_IP:YOUR_PORT/docs
\ No newline at end of file
+- http://YOUR_IP:YOUR_PORT/docs
+
+# Local build
+
+## Build Docker
+```sh
+docker build -t dm-tools -f Dockerfile .
+```
+
+## Run locally built image
+```sh
+docker run -p 1080:80 -it --rm dm-tools
+```

From a9164f29d19123cd3b3fa6423e5588c4fabc36bc Mon Sep 17 00:00:00 2001
From: Mmesek <13630781+Mmesek@users.noreply.github.com>
Date: Thu, 28 Aug 2025 19:21:15 +0200
Subject: [PATCH 6/9] Fix extracting tables from .odt

Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com>
---
 app/main.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/app/main.py b/app/main.py
index 4ac5933..0d24f77 100644
--- a/app/main.py
+++ b/app/main.py
@@ -6,6 +6,7 @@
 from fastapi import FastAPI, Response, UploadFile
 from odf.opendocument import load
 from odf.table import Table, TableCell, TableRow
+from odf.teletype import extractText
 from pandas import DataFrame, concat, read_csv, read_json, read_xml
 
 app = FastAPI()
@@ -51,11 +52,7 @@ def from_odt(file: BinaryIO) -> DataFrame:
         for row in table.getElementsByType(TableRow):
             row_data = []
             for cell in row.getElementsByType(TableCell):
-                cell_text = "".join(
-                    node.data
-                    for node in cell.childNodes
-                    if node.nodeType == node.TEXT_NODE
-                )
+                cell_text = extractText(cell)
                 row_data.append(cell_text)
             data.append(row_data)
         if data:

From 7aab2be385d1c50d2200c97a696938300fdc0ad4 Mon Sep 17 00:00:00 2001
From: Mmesek <13630781+Mmesek@users.noreply.github.com>
Date: Thu, 28 Aug 2025 19:26:52 +0200
Subject: [PATCH 7/9] Rename file_path to file to better reflect type

Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com>
---
 app/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/app/main.py b/app/main.py
index 0d24f77..c5126b1 100644
--- a/app/main.py
+++ b/app/main.py
@@ -64,12 +64,12 @@ def from_odt(file: BinaryIO) -> DataFrame:
     return concat(dfs, ignore_index=True)
 
 
-def from_pdf(file_path: BinaryIO) -> DataFrame:
+def from_pdf(file: BinaryIO) -> DataFrame:
     """
     Extracts tables from a PDF file and returns a concatenated DataFrame.
     """
     try:
-        tables = tabula.read_pdf(file_path, pages="all", multiple_tables=True)
+        tables = tabula.read_pdf(file, pages="all", multiple_tables=True)
     except Exception:
         return DataFrame()
     if not tables:

From cb717cab0584bdd5a11c0b56f23b14ca9178d437 Mon Sep 17 00:00:00 2001
From: Mmesek <13630781+Mmesek@users.noreply.github.com>
Date: Tue, 24 Jun 2025 01:15:55 +0200
Subject: [PATCH 8/9] Create codeql.yml

---
 .github/workflows/codeql.yml | 100 +++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 .github/workflows/codeql.yml

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 0000000..57612d5
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,100 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL Advanced"
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+  schedule:
+    - cron: '28 12 * * 3'
+
+jobs:
+  analyze:
+    name: Analyze (${{ matrix.language }})
+    # Runner size impacts CodeQL analysis time. To learn more, please see:
+    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
+    #   - https://gh.io/supported-runners-and-hardware-resources
+    #   - https://gh.io/using-larger-runners (GitHub.com only)
+    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
+    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
+    permissions:
+      # required for all workflows
+      security-events: write
+
+      # required to fetch internal or private CodeQL packs
+      packages: read
+
+      # only required for workflows in private repositories
+      actions: read
+      contents: read
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+        - language: actions
+          build-mode: none
+        - language: python
+          build-mode: none
+        # CodeQL supports the following values keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
+        # Use `c-cpp` to analyze code written in C, C++ or both
+        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
+        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
+        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
+        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
+        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
+        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+
+    # Add any setup steps before running the `github/codeql-action/init` action.
+    # This includes steps like installing compilers or runtimes (`actions/setup-node`
+    # or others). This is typically only required for manual builds.
+    # - name: Setup runtime (example)
+    #   uses: actions/setup-example@v1
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v3
+      with:
+        languages: ${{ matrix.language }}
+        build-mode: ${{ matrix.build-mode }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+
+        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+        # queries: security-extended,security-and-quality
+
+    # If the analyze step fails for one of the languages you are analyzing with
+    # "We were unable to automatically build your code", modify the matrix above
+    # to set the build mode to "manual" for that language. Then modify this step
+    # to build your code.
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+    - if: matrix.build-mode == 'manual'
+      shell: bash
+      run: |
+        echo 'If you are using a "manual" build mode for one or more of the' \
+          'languages you are analyzing, replace this with the commands to build' \
+          'your code, for example:'
+        echo '  make bootstrap'
+        echo '  make release'
+        exit 1
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v3
+      with:
+        category: "/language:${{matrix.language}}"

From 48cb6346ef3da553d89f68c790433f7315a54b13 Mon Sep 17 00:00:00 2001
From: Mmesek <13630781+Mmesek@users.noreply.github.com>
Date: Thu, 28 Aug 2025 19:34:57 +0200
Subject: [PATCH 9/9] Fix formatting for linting action

Signed-off-by: Mmesek <13630781+Mmesek@users.noreply.github.com>
---
 app/main.py | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/app/main.py b/app/main.py
index c5126b1..3f6897f 100644
--- a/app/main.py
+++ b/app/main.py
@@ -29,15 +29,9 @@ class ValidExtensions(Enum):
 
 
 FUNCTIONS: dict[Formats, Callable[[DataFrame], Response]] = {
-    Formats.CSV: lambda df: Response(
-        content=df.to_csv(index=False), media_type="text/csv"
-    ),
-    Formats.JSON: lambda df: Response(
-        content=df.to_json(orient="records"), media_type="application/json"
-    ),
-    Formats.XML: lambda df: Response(
-        content=df.to_xml(index=False), media_type="application/xml"
-    ),
+    Formats.CSV: lambda df: Response(content=df.to_csv(index=False), media_type="text/csv"),
+    Formats.JSON: lambda df: Response(content=df.to_json(orient="records"), media_type="application/json"),
+    Formats.XML: lambda df: Response(content=df.to_xml(index=False), media_type="application/xml")
 }
 
 
@@ -127,15 +121,11 @@ def transform(format: Formats, file: UploadFile):
             status_code=400,
         )
     else:
-        return Response(
-            content=f"Unsupported file extension: {extension}", status_code=400
-        )
+        return Response(content=f"Unsupported file extension: {extension}", status_code=400)
 
     if df.empty:
         return Response(content="Could not extract data from file.", status_code=400)
 
     if func := FUNCTIONS.get(format):
         return func(df)
-    return Response(
-        content=f"Unsupported output format: {format.value}", status_code=400
-    )
+    return Response(content=f"Unsupported output format: {format.value}", status_code=400)