diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0ae4eb5..d47cd65 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,85 +1,88 @@ -name: ci - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - schedule: - - cron: '0 0 * * *' # Runs every day at 00:00 UTC - -jobs: - gotest: - name: Test ocr worker - runs-on: ubuntu-latest - steps: - - name: Set up Go 1.x - uses: actions/setup-go@v6 - with: - go-version: stable - id: go - - - name: Check out code into the Go module directory - uses: actions/checkout@v6 - - - name: Build - run: go build -v ./... - - - name: gofmt - run: | - GOFMTOUT=$(gofmt -l .) - if [[ ! -z "${GOFMTOUT}" ]]; then - echo "FATAL: gofmt violation(s), please fix" - echo $GOFMTOUT - exit -1 - fi - - name: go vet - run: go vet ./... - - - name: Test - run: go test -v ./... - - build_push: - name: Build and Push - needs: [gotest] - runs-on: ubuntu-latest - steps: - - name: Check out code - uses: actions/checkout@v6 - - - name: Docker meta - id: docker_meta - uses: docker/metadata-action@v5.10.0 - with: - images: ashirt/ocr-worker - tags: | - type=sha - type=ref,event=branch - type=ref,event=pr - flavor: | - latest=false - - name: Login to Docker Hub - uses: docker/login-action@v3.7.0 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Build and Push PR - if: github.ref != 'refs/heads/main' - uses: docker/build-push-action@v6.18.0 - with: - context: . - file: Dockerfile - tags: ${{ steps.docker_meta.outputs.tags }} - labels: ${{ steps.docker_meta.outputs.labels }} - push: true # Push with pr-### and sha-xxxxxxx tags - - - name: Build and Push Latest - if: github.ref == 'refs/heads/main' - uses: docker/build-push-action@v6.18.0 - with: - context: . 
- file: Dockerfile - tags: ${{ steps.docker_meta.outputs.tags }}, ashirt/ocr-worker:latest #Add latest tag for main - labels: ${{ steps.docker_meta.outputs.labels }} +name: ci + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + schedule: + - cron: '0 0 * * *' # Runs every day at 00:00 UTC + +jobs: + gotest: + name: Test ocr worker + runs-on: ubuntu-latest + steps: + - name: Set up Go 1.x + uses: actions/setup-go@v6 + with: + go-version: stable + id: go + + - name: Check out code into the Go module directory + uses: actions/checkout@v6 + + - name: Install packages + run: sudo apt install -y libtesseract-dev tesseract-ocr-eng + + - name: Build + run: go build -v ./... + + - name: gofmt + run: | + GOFMTOUT=$(gofmt -l .) + if [[ ! -z "${GOFMTOUT}" ]]; then + echo "FATAL: gofmt violation(s), please fix" + echo $GOFMTOUT + exit -1 + fi + - name: go vet + run: go vet ./... + + - name: Test + run: go test -v ./... + + build_push: + name: Build and Push + needs: [gotest] + runs-on: ubuntu-latest + steps: + - name: Check out code + uses: actions/checkout@v6 + + - name: Docker meta + id: docker_meta + uses: docker/metadata-action@v5.10.0 + with: + images: ashirt/ocr-worker + tags: | + type=sha + type=ref,event=branch + type=ref,event=pr + flavor: | + latest=false + - name: Login to Docker Hub + uses: docker/login-action@v3.7.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and Push PR + if: github.ref != 'refs/heads/main' + uses: docker/build-push-action@v6.18.0 + with: + context: . + file: Dockerfile + tags: ${{ steps.docker_meta.outputs.tags }} + labels: ${{ steps.docker_meta.outputs.labels }} + push: true # Push with pr-### and sha-xxxxxxx tags + + - name: Build and Push Latest + if: github.ref == 'refs/heads/main' + uses: docker/build-push-action@v6.18.0 + with: + context: . 
+ file: Dockerfile + tags: ${{ steps.docker_meta.outputs.tags }}, ashirt/ocr-worker:latest #Add latest tag for main + labels: ${{ steps.docker_meta.outputs.labels }} push: true \ No newline at end of file diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index 620ec88..3db87f9 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -19,6 +19,10 @@ jobs: - uses: actions/setup-go@v6 with: go-version: stable + + - name: Install packages + run: sudo apt install -y libtesseract-dev tesseract-ocr-eng + - name: golangci-lint uses: golangci/golangci-lint-action@v9 with: diff --git a/Dockerfile b/Dockerfile index 3bb557f..d2656de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,21 @@ -FROM golang:1.25-alpine AS build - -RUN mkdir app -COPY . ./app/ -WORKDIR /go/app -RUN go build -v ./... - -FROM alpine:latest - -RUN apk add --no-cache tesseract-ocr && \ - adduser -h /home/ashirt -S -D ashirt - -USER ashirt -WORKDIR /home/ashirt - -COPY --from=build /go/app/ocr-worker /home/ashirt/ocr-worker - +FROM golang:1.25-alpine AS build + +RUN mkdir app && \ + apk add --no-cache build-base leptonica-dev tesseract-ocr-dev + +COPY . ./app/ +WORKDIR /go/app + +RUN go build -v ./cmd/... 
+ +FROM alpine:latest + +RUN apk add --no-cache tesseract-ocr tesseract-ocr-data-eng && \ + adduser -h /home/ashirt -S -D ashirt + +USER ashirt +WORKDIR /home/ashirt + +COPY --from=build /go/app/ocr-worker /home/ashirt/ocr-worker + CMD ["ocr-worker"] \ No newline at end of file diff --git a/cmd/ocr-worker/main.go b/cmd/ocr-worker/main.go index 70a7fe5..3576685 100644 --- a/cmd/ocr-worker/main.go +++ b/cmd/ocr-worker/main.go @@ -1,7 +1,31 @@ package main -import "fmt" +import ( + "log" + "log/slog" + "net/http" + "os" + + "github.com/ashirt-ops/ocr-worker/internal/handlers" + "github.com/ashirt-ops/ocr-worker/internal/textextractor" + "github.com/jrozner/weby/middleware" + "github.com/jrozner/weby/rlog" +) func main() { - fmt.Println("Hello world!") + var handler slog.Handler = slog.NewTextHandler(os.Stdout, nil) + + handler = rlog.RequestIDHandler{Handler: handler} + logger := slog.New(handler) + + extractor := textextractor.NewTesseract() + + env := handlers.New(extractor) + mux := env.Routes() + + mux.Use(middleware.RequestID) + mux.Use(middleware.WrapResponse) + mux.Use(middleware.Logger(logger)) + + log.Fatal(http.ListenAndServe(":8080", mux)) } diff --git a/go.mod b/go.mod index e8162a8..e1a2d58 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,14 @@ module github.com/ashirt-ops/ocr-worker go 1.25.6 + +require ( + github.com/jrozner/weby v0.1.0 + github.com/otiai10/gosseract v2.2.1+incompatible +) + +require ( + github.com/google/uuid v1.6.0 // indirect + github.com/otiai10/mint v1.6.3 // indirect + golang.org/x/net v0.49.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..9e887b5 --- /dev/null +++ b/go.sum @@ -0,0 +1,10 @@ +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/jrozner/weby v0.1.0 h1:kI+DXnYHsSB6Kmmil8i7Wqh5O1oF1c1xOhnVSONh+BA= +github.com/jrozner/weby v0.1.0/go.mod 
// Client is an HTTP client that signs every request it sends with an
// HMAC-SHA256 signature computed from the configured secret key.
//
// The embedded *http.Client performs the actual round trip, so its fields
// (Transport, Timeout, ...) may be adjusted by the caller after New returns.
type Client struct {
	*http.Client
	// base is stored by New; no code in this file reads it yet.
	base string
	// accessKey is sent in clear as the first half of the Authorization header.
	accessKey string
	// secretKey is the HMAC key; it never leaves the process.
	secretKey []byte
}

// New returns a Client that signs requests with the given key pair.
//
// The underlying http.Client is created with default settings, which means
// no timeout; callers that need a deadline should attach a context to each
// request or set Client.Timeout on the returned value.
func New(base, accessKey string, secretKey []byte) *Client {
	return &Client{
		Client:    &http.Client{},
		base:      base,
		accessKey: accessKey,
		secretKey: secretKey,
	}
}

// Do signs r and sends it with the embedded http.Client.
//
// It mutates r: the Date header is overwritten with the current time, the
// Authorization header is overwritten with "<accessKey>:<base64 signature>",
// and a non-nil body is read in full and replaced with an in-memory copy.
// Both headers are set (not appended) so that a request value that is sent
// more than once never carries a stale signature next to the fresh one.
func (c *Client) Do(r *http.Request) (*http.Response, error) {
	// HTTP dates must be expressed in GMT. http.TimeFormat is RFC 1123 with
	// the zone fixed to "GMT" and requires a UTC time to be correct.
	r.Header.Set("Date", time.Now().UTC().Format(http.TimeFormat))

	signature, err := generateSignature(r, c.secretKey)
	if err != nil {
		return nil, fmt.Errorf("signing request: %w", err)
	}

	encodedSignature := base64.StdEncoding.EncodeToString(signature)
	r.Header.Set("Authorization", fmt.Sprintf("%s:%s", c.accessKey, encodedSignature))

	return c.Client.Do(r)
}

// generateSignature returns the HMAC-SHA256, keyed with key, of
//
//	METHOD "\n" REQUEST-URI "\n" DATE-HEADER "\n" SHA256(body)
//
// where SHA256(body) is the raw 32-byte digest of the request body.
//
// A non-nil r.Body is consumed, closed, and replaced with a fresh reader
// over the same bytes so the request can still be sent afterwards. A nil
// body is treated as empty and left nil.
func generateSignature(r *http.Request, key []byte) ([]byte, error) {
	var payload []byte
	if r.Body != nil {
		var err error
		payload, err = io.ReadAll(r.Body)
		if err != nil {
			return nil, fmt.Errorf("reading request body: %w", err)
		}

		// Close the original body so we don't leak it.
		if err = r.Body.Close(); err != nil {
			return nil, fmt.Errorf("closing request body: %w", err)
		}

		// Hand the transport a readable copy of exactly what was hashed.
		r.Body = io.NopCloser(bytes.NewReader(payload))
	}

	// Hash the buffered bytes, not r.Body: the original stream is already
	// drained at this point.
	bodyDigest := sha256.Sum256(payload)

	m := new(bytes.Buffer)
	m.WriteString(r.Method)
	m.WriteString("\n")
	m.WriteString(r.URL.RequestURI())
	m.WriteString("\n")
	m.WriteString(r.Header.Get("Date"))
	m.WriteString("\n")
	m.Write(bodyDigest[:])

	mac := hmac.New(sha256.New, key)
	mac.Write(m.Bytes())
	return mac.Sum(nil), nil
}
"content_type", request.ContentType) + return + } + +error: + response := messages.Response{ + Action: "rejected", + Content: "", + } + + w.WriteHeader(http.StatusOK) + err = json.NewEncoder(w).Encode(&response) + if err != nil { + slog.ErrorContext(r.Context(), "unable to serialize response", "error", err) + } +} diff --git a/internal/messages/request.go b/internal/messages/request.go new file mode 100644 index 0000000..3902ca5 --- /dev/null +++ b/internal/messages/request.go @@ -0,0 +1,7 @@ +package messages + +type Request struct { + ContentType string `json:"content_type"` + EvidenceUUID string `json:"evidence_uuid"` + OperationSlug string `json:"operation_slug"` +} diff --git a/internal/messages/response.go b/internal/messages/response.go new file mode 100644 index 0000000..318f644 --- /dev/null +++ b/internal/messages/response.go @@ -0,0 +1,6 @@ +package messages + +type Response struct { + Action string `json:"action"` + Content string `json:"content"` +} diff --git a/internal/textextractor/tesseract.go b/internal/textextractor/tesseract.go new file mode 100644 index 0000000..50ce6f1 --- /dev/null +++ b/internal/textextractor/tesseract.go @@ -0,0 +1,36 @@ +package textextractor + +import ( + "github.com/otiai10/gosseract" +) + +type Tesseract struct{} + +// NewTesseract returns a new Tesseract TextExtractor +func NewTesseract() TextExtractor { + return &Tesseract{} +} + +func (t *Tesseract) ExtractText(img []byte) (string, error) { + client := gosseract.NewClient() + defer func() { + // TODO: we're doing this because golangci-lint complains about an unchecked error + // We can either add an exclusion to it's config or used a named return to surface' + // this up + _ = client.Close() + }() + + // NOTE: For some reason this segfaults without calling SetLanguage even though the + // docs say "eng" should be the default + err := client.SetLanguage("eng") + if err != nil { + return "", err + } + + err = client.SetImageFromBytes(img) + if err != nil { + return "", 
err + } + + return client.Text() +} diff --git a/internal/textextractor/tesseract_test.go b/internal/textextractor/tesseract_test.go new file mode 100644 index 0000000..f162d3d --- /dev/null +++ b/internal/textextractor/tesseract_test.go @@ -0,0 +1,22 @@ +package textextractor + +import ( + "os" + "testing" +) + +func TestTesseract(t *testing.T) { + data, err := os.ReadFile("./testdata/ashirt.png") + if err != nil { + t.Fatalf("unable to read test data: %v", err) + } + extractor := NewTesseract() + extracted, err := extractor.ExtractText(data) + if err != nil { + t.Fatalf("unable to extract text: %v", err) + } + + if extracted != "ASHIRT" { + t.Fatalf("extracted text does not match expected text, expected: %v, got: %v", "ashirt", extracted) + } +} diff --git a/internal/textextractor/testdata/ashirt.png b/internal/textextractor/testdata/ashirt.png new file mode 100755 index 0000000..9fb27e2 Binary files /dev/null and b/internal/textextractor/testdata/ashirt.png differ diff --git a/internal/textextractor/textextractor.go b/internal/textextractor/textextractor.go new file mode 100644 index 0000000..718f599 --- /dev/null +++ b/internal/textextractor/textextractor.go @@ -0,0 +1,7 @@ +package textextractor + +// TextExtractor is an interface for extracting text from image files with various backends +type TextExtractor interface { + // ExtractText identifies and returns all text found within an image file + ExtractText(img []byte) (string, error) +}