diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..57612d5 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,100 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL Advanced" + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + schedule: + - cron: '28 12 * * 3' + +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners (GitHub.com only) + # Consider using larger runners or machines with greater resources for possible analysis time improvements. 
+ runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + permissions: + # required for all workflows + security-events: write + + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + include: + - language: actions + build-mode: none + - language: python + build-mode: none + # CodeQL supports the following values keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' + # Use `c-cpp` to analyze code written in C, C++ or both + # Use 'java-kotlin' to analyze code written in Java, Kotlin or both + # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both + # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, + # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. + # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how + # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Add any setup steps before running the `github/codeql-action/init` action. + # This includes steps like installing compilers or runtimes (`actions/setup-node` + # or others). This is typically only required for manual builds. + # - name: Setup runtime (example) + # uses: actions/setup-example@v1 + + # Initializes the CodeQL tools for scanning. 
+ - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + # If the analyze step fails for one of the languages you are analyzing with + # "We were unable to automatically build your code", modify the matrix above + # to set the build mode to "manual" for that language. Then modify this step + # to build your code. + # ℹ️ Command-line programs to run using the OS shell. + # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + - if: matrix.build-mode == 'manual' + shell: bash + run: | + echo 'If you are using a "manual" build mode for one or more of the' \ + 'languages you are analyzing, replace this with the commands to build' \ + 'your code, for example:' + echo ' make bootstrap' + echo ' make release' + exit 1 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/README.md b/README.md index 1585063..ef442fd 100644 --- a/README.md +++ b/README.md @@ -4,4 +4,16 @@ docker run --rm -it -p YOUR_PORT:80 ghcr.io/Atomic-IT/DataManager-Python-Tools ``` API Documentation URL: -- http://YOUR_IP:YOUR_PORT/docs \ No newline at end of file +- http://YOUR_IP:YOUR_PORT/docs + +# Local build + +## Build Docker +```sh +docker build -t dm-tools -f Dockerfile . +``` + +## Run locally built image +```sh +docker run -p 
1080:80 -it --rm dm-tools
+```
diff --git a/app/main.py b/app/main.py
index 0aa38c5..3f6897f 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,8 +1,160 @@
-from fastapi import FastAPI
+import logging
+from collections.abc import Iterable
+from enum import Enum
+from typing import BinaryIO, Callable
+
+import tabula
+from docx import Document
+from fastapi import FastAPI, Response, UploadFile
+from odf.opendocument import load
+from odf.table import Table, TableCell, TableRow
+from odf.teletype import extractText
+from pandas import DataFrame, concat, read_csv, read_json, read_xml
+
+logger = logging.getLogger(__name__)
 
 app = FastAPI()
 
 
-@app.get("/")
-def read_root():
-    return {"Hello": "World"}
+class Formats(Enum):
+    """Output formats a parsed table can be serialized to."""
+
+    CSV = "csv"
+    XML = "xml"
+    JSON = "json"
+
+
+class ValidExtensions(Enum):
+    """Upload file extensions the transform endpoint recognizes."""
+
+    CSV = "csv"
+    XML = "xml"
+    JSON = "json"
+    PDF = "pdf"
+    DOC = "doc"
+    DOCX = "docx"
+    ODT = "odt"
+
+
+# Serializers keyed by output format; each wraps a DataFrame in an HTTP response.
+FUNCTIONS: dict[Formats, Callable[[DataFrame], Response]] = {
+    Formats.CSV: lambda df: Response(content=df.to_csv(index=False), media_type="text/csv"),
+    Formats.JSON: lambda df: Response(content=df.to_json(orient="records"), media_type="application/json"),
+    Formats.XML: lambda df: Response(content=df.to_xml(index=False), media_type="application/xml"),
+}
+
+
+def _tables_to_frame(tables: Iterable[list[list[str]]]) -> DataFrame:
+    """Builds one DataFrame from row-major tables, using each first row as the header."""
+    frames = [DataFrame(rows[1:], columns=rows[0]) for rows in tables if rows]
+    if not frames:
+        return DataFrame()
+    return concat(frames, ignore_index=True)
+
+
+def from_odt(file: BinaryIO) -> DataFrame:
+    """
+    Extracts tables from an ODT file and returns a concatenated DataFrame.
+
+    The first row of each table is used as its header. Returns an empty
+    DataFrame when no table rows are found.
+    """
+    doc = load(file)
+    return _tables_to_frame(
+        [
+            [extractText(cell) for cell in row.getElementsByType(TableCell)]
+            for row in table.getElementsByType(TableRow)
+        ]
+        for table in doc.getElementsByType(Table)
+    )
+
+
+def from_pdf(file: BinaryIO) -> DataFrame:
+    """
+    Extracts tables from a PDF file and returns a concatenated DataFrame.
+
+    Extraction is best-effort: a failure inside tabula is logged and an empty
+    DataFrame is returned instead of raising.
+    """
+    try:
+        tables = tabula.read_pdf(file, pages="all", multiple_tables=True)
+    except Exception:
+        # Keep the best-effort contract, but leave a trace of what went wrong.
+        logger.exception("Failed to extract tables from PDF")
+        return DataFrame()
+    if not tables:
+        return DataFrame()
+    return concat(tables, ignore_index=True)
+
+
+def from_docx(file: BinaryIO) -> DataFrame:
+    """
+    Extracts tables from a DOCX file and returns a concatenated DataFrame.
+
+    The first row of each table is used as its header. Returns an empty
+    DataFrame when no table rows are found.
+    """
+    doc = Document(file)
+    return _tables_to_frame(
+        [[cell.text for cell in row.cells] for row in table.rows]
+        for table in doc.tables
+    )
+
+
+# Parsers keyed by upload extension. DOC is absent on purpose: ``transform``
+# rejects it with a dedicated message.
+_READERS: dict[ValidExtensions, Callable[[BinaryIO], DataFrame]] = {
+    ValidExtensions.CSV: read_csv,
+    ValidExtensions.XML: read_xml,
+    ValidExtensions.JSON: read_json,
+    ValidExtensions.DOCX: from_docx,
+    ValidExtensions.ODT: from_odt,
+    ValidExtensions.PDF: from_pdf,
+}
+
+
+@app.post("/")
+def transform(format: Formats, file: UploadFile) -> Response:
+    """
+    Transforms given file to another format
+
+    Parameters
+    ----------
+    format:
+        Destination format to use
+    file:
+        File to transform. Requires valid extension (matched case-insensitively)
+
+    Returns
+    -------
+    Response
+        The converted document, or a 400 response explaining why the upload
+        could not be converted.
+    """
+    extension = file.filename.split(".")[-1] if file.filename else ""
+
+    try:
+        # Match case-insensitively so that e.g. "REPORT.CSV" is accepted.
+        kind = ValidExtensions(extension.lower())
+    except ValueError:
+        return Response(content=f"Unsupported file extension: {extension}", status_code=400)
+
+    if kind is ValidExtensions.DOC:
+        return Response(
+            content="ERROR: .doc format is not supported for in-memory structured data extraction without external tools.",
+            status_code=400,
+        )
+
+    try:
+        df = _READERS[kind](file.file)
+    except Exception:
+        # A malformed upload is a client error, so answer 400 rather than 500.
+        logger.exception("Failed to parse uploaded %s file", kind.value)
+        return Response(content="Could not extract data from file.", status_code=400)
+
+    if df.empty:
+        return Response(content="Could not extract data from file.", status_code=400)
+
+    if func := FUNCTIONS.get(format):
+        return func(df)
+    return Response(content=f"Unsupported output format: {format.value}", status_code=400)
diff --git a/requirements.txt b/requirements.txt
index d3feed5..194c53a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,6 @@
-fastapi[standard]>=0.115.13
\ No newline at end of
file +fastapi[standard]>=0.115.13 +python-multipart +pandas +python-docx +odfpy +tabula-py