Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL Advanced"

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
  schedule:
    - cron: '28 12 * * 3'

jobs:
  analyze:
    name: Analyze (${{ matrix.language }})
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners (GitHub.com only)
    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    permissions:
      # required for all workflows
      security-events: write

      # required to fetch internal or private CodeQL packs
      packages: read

      # only required for workflows in private repositories
      actions: read
      contents: read

    strategy:
      fail-fast: false
      matrix:
        include:
          - language: actions
            build-mode: none
          - language: python
            build-mode: none
        # CodeQL supports the following values for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
        # Use `c-cpp` to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Add any setup steps before running the `github/codeql-action/init` action.
      # This includes steps like installing compilers or runtimes (`actions/setup-node`
      # or others). This is typically only required for manual builds.
      # - name: Setup runtime (example)
      #   uses: actions/setup-example@v1

      # Initializes the CodeQL tools for scanning.
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: ${{ matrix.language }}
          build-mode: ${{ matrix.build-mode }}
          # If you wish to specify custom queries, you can do so here or in a config file.
          # By default, queries listed here will override any specified in a config file.
          # Prefix the list here with "+" to use these queries and those in the config file.

          # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
          # queries: security-extended,security-and-quality

      # If the analyze step fails for one of the languages you are analyzing with
      # "We were unable to automatically build your code", modify the matrix above
      # to set the build mode to "manual" for that language. Then modify this step
      # to build your code.
      # ℹ️ Command-line programs to run using the OS shell.
      # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
      - if: matrix.build-mode == 'manual'
        shell: bash
        run: |
          echo 'If you are using a "manual" build mode for one or more of the' \
            'languages you are analyzing, replace this with the commands to build' \
            'your code, for example:'
          echo ' make bootstrap'
          echo ' make release'
          exit 1

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v3
        with:
          category: "/language:${{matrix.language}}"
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,16 @@ docker run --rm -it -p YOUR_PORT:80 ghcr.io/Atomic-IT/DataManager-Python-Tools
```

API Documentation URL:
- http://YOUR_IP:YOUR_PORT/docs
- http://YOUR_IP:YOUR_PORT/docs

# Local build

## Build Docker
```sh
docker build -t dm-tools -f Dockerfile .
```

## Run locally built image
```sh
docker run -p 1080:80 -it --rm dm-tools
```
131 changes: 127 additions & 4 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,131 @@
from fastapi import FastAPI
from enum import Enum
from typing import BinaryIO, Callable

import tabula
from docx import Document
from fastapi import FastAPI, Response, UploadFile
from odf.opendocument import load
from odf.table import Table, TableCell, TableRow
from odf.teletype import extractText
from pandas import DataFrame, concat, read_csv, read_json, read_xml

app = FastAPI()


@app.get("/")
def read_root():
    """Health-check style root endpoint; responds with a fixed greeting."""
    return dict(Hello="World")
# Output formats the API can serialize a DataFrame into.
# Built with the functional Enum API; members and values are unchanged.
Formats = Enum("Formats", {"CSV": "csv", "XML": "xml", "JSON": "json"})
Formats.__doc__ = "Destination formats supported by the transform endpoint."


# Upload extensions the service recognizes.
# NOTE(review): not referenced by the visible code paths — presumably kept for
# documentation/validation purposes; confirm before removing.
ValidExtensions = Enum(
    "ValidExtensions",
    {
        "CSV": "csv",
        "XML": "xml",
        "JSON": "json",
        "PDF": "pdf",
        "DOC": "doc",
        "DOCX": "docx",
        "ODT": "odt",
    },
)
ValidExtensions.__doc__ = "File extensions accepted as transform input."


def _serialize_csv(df: DataFrame) -> Response:
    """Render *df* as CSV text (index column omitted)."""
    return Response(content=df.to_csv(index=False), media_type="text/csv")


def _serialize_json(df: DataFrame) -> Response:
    """Render *df* as a JSON array of row objects."""
    return Response(content=df.to_json(orient="records"), media_type="application/json")


def _serialize_xml(df: DataFrame) -> Response:
    """Render *df* as an XML document (index column omitted)."""
    return Response(content=df.to_xml(index=False), media_type="application/xml")


# Maps each output format to the serializer that produces its HTTP response.
FUNCTIONS: dict[Formats, Callable[[DataFrame], Response]] = {
    Formats.CSV: _serialize_csv,
    Formats.JSON: _serialize_json,
    Formats.XML: _serialize_xml,
}


def from_odt(file: BinaryIO) -> DataFrame:
    """
    Extracts tables from an ODT file and returns a concatenated DataFrame.

    The first row of each table is used as that table's column header.
    Returns an empty DataFrame when the document contains no tables.
    """
    document = load(file)
    frames = []
    for table in document.getElementsByType(Table):
        # One list of cell texts per table row.
        rows = [
            [extractText(cell) for cell in row.getElementsByType(TableCell)]
            for row in table.getElementsByType(TableRow)
        ]
        if rows:
            frames.append(DataFrame(rows[1:], columns=rows[0]))
    return concat(frames, ignore_index=True) if frames else DataFrame()


def from_pdf(file: BinaryIO) -> DataFrame:
    """
    Extracts tables from a PDF file and returns a concatenated DataFrame.

    Best-effort: any extraction failure, or a PDF with no detectable tables,
    yields an empty DataFrame rather than an exception.
    """
    try:
        frames = tabula.read_pdf(file, pages="all", multiple_tables=True)
    except Exception:
        frames = None
    if not frames:
        return DataFrame()
    return concat(frames, ignore_index=True)


def from_docx(file: BinaryIO) -> DataFrame:
    """
    Extracts tables from a DOCX file and returns a concatenated DataFrame.

    The first row of each table is used as that table's column header.
    Returns an empty DataFrame when the document contains no tables.
    """
    frames = []
    for table in Document(file).tables:
        rows = [[cell.text for cell in row.cells] for row in table.rows]
        if rows:
            frames.append(DataFrame(rows[1:], columns=rows[0]))
    return concat(frames, ignore_index=True) if frames else DataFrame()


@app.post("/")
def transform(format: Formats, file: UploadFile):
    """
    Transforms given file to another format

    Parameters
    ----------
    format:
        Destination format to use
    file:
        File to transform. Requires valid extension

    Returns the converted data with the matching media type, or a 400
    response when the extension is unsupported, the file is unparsable,
    or no tabular data could be extracted.
    """
    # Lower-case so uploads like "DATA.CSV" match the supported extensions.
    extension = file.filename.split(".")[-1].lower() if file.filename else ""

    if extension == "doc":
        return Response(
            content="ERROR: .doc format is not supported for in-memory structured data extraction without external tools.",
            status_code=400,
        )

    # Dispatch table: extension -> parser producing a DataFrame.
    readers = {
        "csv": read_csv,
        "xml": read_xml,
        "json": read_json,
        "docx": from_docx,
        "odt": from_odt,
        "pdf": from_pdf,
    }
    reader = readers.get(extension)
    if reader is None:
        return Response(content=f"Unsupported file extension: {extension}", status_code=400)

    try:
        df = reader(file.file)
    except Exception:
        # A malformed upload previously escaped as an unhandled exception
        # (HTTP 500); report it as a client error instead.
        return Response(content="Could not extract data from file.", status_code=400)

    if df.empty:
        return Response(content="Could not extract data from file.", status_code=400)

    if func := FUNCTIONS.get(format):
        return func(df)
    return Response(content=f"Unsupported output format: {format.value}", status_code=400)
7 changes: 6 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
fastapi[standard]>=0.115.13
fastapi[standard]>=0.115.13
python-multipart
pandas
python-docx
odfpy
tabula-py
Loading