Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL Advanced"

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
  schedule:
    - cron: '28 12 * * 3'

jobs:
  analyze:
    name: Analyze (${{ matrix.language }})
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners (GitHub.com only)
    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    permissions:
      # required for all workflows
      security-events: write

      # required to fetch internal or private CodeQL packs
      packages: read

      # only required for workflows in private repositories
      actions: read
      contents: read

    strategy:
      fail-fast: false
      matrix:
        include:
          - language: actions
            build-mode: none
          - language: python
            build-mode: none
        # CodeQL supports the following values for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
        # Use `c-cpp` to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Add any setup steps before running the `github/codeql-action/init` action.
      # This includes steps like installing compilers or runtimes (`actions/setup-node`
      # or others). This is typically only required for manual builds.
      # - name: Setup runtime (example)
      #   uses: actions/setup-example@v1

      # Initializes the CodeQL tools for scanning.
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: ${{ matrix.language }}
          build-mode: ${{ matrix.build-mode }}
          # If you wish to specify custom queries, you can do so here or in a config file.
          # By default, queries listed here will override any specified in a config file.
          # Prefix the list here with "+" to use these queries and those in the config file.

          # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
          # queries: security-extended,security-and-quality

      # If the analyze step fails for one of the languages you are analyzing with
      # "We were unable to automatically build your code", modify the matrix above
      # to set the build mode to "manual" for that language. Then modify this step
      # to build your code.
      # ℹ️ Command-line programs to run using the OS shell.
      # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
      - if: matrix.build-mode == 'manual'
        shell: bash
        run: |
          echo 'If you are using a "manual" build mode for one or more of the' \
            'languages you are analyzing, replace this with the commands to build' \
            'your code, for example:'
          echo ' make bootstrap'
          echo ' make release'
          exit 1

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v3
        with:
          category: "/language:${{matrix.language}}"
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,16 @@ docker run --rm -it -p YOUR_PORT:80 ghcr.io/Atomic-IT/DataManager-Python-Tools
```

API Documentation URL:
- http://YOUR_IP:YOUR_PORT/docs
- http://YOUR_IP:YOUR_PORT/docs

# Local build

## Build Docker
```sh
docker build -t dm-tools -f Dockerfile .
```

## Run locally built image
```sh
docker run -p 1080:80 -it --rm dm-tools
```
131 changes: 127 additions & 4 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,131 @@
from fastapi import FastAPI
from enum import Enum
from typing import BinaryIO, Callable

import tabula
from docx import Document
from fastapi import FastAPI, Response, UploadFile
from odf.opendocument import load
from odf.table import Table, TableCell, TableRow
from odf.teletype import extractText
from pandas import DataFrame, concat, read_csv, read_json, read_xml

app = FastAPI()


@app.get("/")
def read_root():
    """Health-check style root endpoint; responds with a fixed greeting."""
    return dict(Hello="World")
# Output formats the API can serialize a DataFrame into.
# Built with the functional Enum API; members and values are unchanged.
Formats = Enum("Formats", {"CSV": "csv", "XML": "xml", "JSON": "json"})
Formats.__doc__ = "Destination formats supported by the transform endpoint."


# Upload extensions the service recognizes.
# NOTE(review): not referenced by the visible code paths — presumably kept for
# documentation/validation purposes; confirm before removing.
ValidExtensions = Enum(
    "ValidExtensions",
    {
        "CSV": "csv",
        "XML": "xml",
        "JSON": "json",
        "PDF": "pdf",
        "DOC": "doc",
        "DOCX": "docx",
        "ODT": "odt",
    },
)
ValidExtensions.__doc__ = "File extensions accepted as transform input."


def _serialize_csv(df: DataFrame) -> Response:
    """Render *df* as CSV text (index column omitted)."""
    return Response(content=df.to_csv(index=False), media_type="text/csv")


def _serialize_json(df: DataFrame) -> Response:
    """Render *df* as a JSON array of row objects."""
    return Response(content=df.to_json(orient="records"), media_type="application/json")


def _serialize_xml(df: DataFrame) -> Response:
    """Render *df* as an XML document (index column omitted)."""
    return Response(content=df.to_xml(index=False), media_type="application/xml")


# Maps each output format to the serializer that produces its HTTP response.
FUNCTIONS: dict[Formats, Callable[[DataFrame], Response]] = {
    Formats.CSV: _serialize_csv,
    Formats.JSON: _serialize_json,
    Formats.XML: _serialize_xml,
}


def from_odt(file: BinaryIO) -> DataFrame:
    """
    Extracts tables from an ODT file and returns a concatenated DataFrame.

    The first row of each table is used as that table's column header.
    Returns an empty DataFrame when the document contains no tables.
    """
    document = load(file)
    frames = []
    for table in document.getElementsByType(Table):
        # One list of cell texts per table row.
        rows = [
            [extractText(cell) for cell in row.getElementsByType(TableCell)]
            for row in table.getElementsByType(TableRow)
        ]
        if rows:
            frames.append(DataFrame(rows[1:], columns=rows[0]))
    return concat(frames, ignore_index=True) if frames else DataFrame()


def from_pdf(file: BinaryIO) -> DataFrame:
    """
    Extracts tables from a PDF file and returns a concatenated DataFrame.

    Best-effort: any extraction failure, or a PDF with no detectable tables,
    yields an empty DataFrame rather than an exception.
    """
    try:
        frames = tabula.read_pdf(file, pages="all", multiple_tables=True)
    except Exception:
        frames = None
    if not frames:
        return DataFrame()
    return concat(frames, ignore_index=True)


def from_docx(file: BinaryIO) -> DataFrame:
    """
    Extracts tables from a DOCX file and returns a concatenated DataFrame.

    The first row of each table is used as that table's column header.
    Returns an empty DataFrame when the document contains no tables.
    """
    frames = []
    for table in Document(file).tables:
        rows = [[cell.text for cell in row.cells] for row in table.rows]
        if rows:
            frames.append(DataFrame(rows[1:], columns=rows[0]))
    return concat(frames, ignore_index=True) if frames else DataFrame()


@app.post("/")
def transform(format: Formats, file: UploadFile):
    """
    Transforms given file to another format

    Parameters
    ----------
    format:
        Destination format to use
    file:
        File to transform. Requires valid extension

    Returns the converted data with the matching media type, or a 400
    response when the extension is unsupported, the file is unparsable,
    or no tabular data could be extracted.
    """
    # Lower-case so uploads like "DATA.CSV" match the supported extensions.
    extension = file.filename.split(".")[-1].lower() if file.filename else ""

    if extension == "doc":
        return Response(
            content="ERROR: .doc format is not supported for in-memory structured data extraction without external tools.",
            status_code=400,
        )

    # Dispatch table: extension -> parser producing a DataFrame.
    readers = {
        "csv": read_csv,
        "xml": read_xml,
        "json": read_json,
        "docx": from_docx,
        "odt": from_odt,
        "pdf": from_pdf,
    }
    reader = readers.get(extension)
    if reader is None:
        return Response(content=f"Unsupported file extension: {extension}", status_code=400)

    try:
        df = reader(file.file)
    except Exception:
        # A malformed upload previously escaped as an unhandled exception
        # (HTTP 500); report it as a client error instead.
        return Response(content="Could not extract data from file.", status_code=400)

    if df.empty:
        return Response(content="Could not extract data from file.", status_code=400)

    if func := FUNCTIONS.get(format):
        return func(df)
    return Response(content=f"Unsupported output format: {format.value}", status_code=400)
7 changes: 6 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
fastapi[standard]>=0.115.13
fastapi[standard]>=0.115.13
python-multipart
pandas
python-docx
odfpy
tabula-py
Loading