From a94ca0c6563387e9532f47e032ef2331b8274eb6 Mon Sep 17 00:00:00 2001 From: Joe Rozner Date: Tue, 3 Feb 2026 22:49:27 -0800 Subject: [PATCH 1/9] WIP --- cmd/ocr-worker/main.go | 28 +++++++- go.mod | 2 + go.sum | 2 + internal/client/client.go | 87 +++++++++++++++++++++++++ internal/client/evidence.go | 1 + internal/client/operations.go | 1 + internal/handlers/handlers.go | 55 ++++++++++++++++ internal/messages/request.go | 7 ++ internal/messages/response.go | 6 ++ internal/textextractor/tesseract.go | 12 ++++ internal/textextractor/textextractor.go | 7 ++ 11 files changed, 206 insertions(+), 2 deletions(-) create mode 100644 go.sum create mode 100644 internal/client/client.go create mode 100644 internal/client/evidence.go create mode 100644 internal/client/operations.go create mode 100644 internal/handlers/handlers.go create mode 100644 internal/messages/request.go create mode 100644 internal/messages/response.go create mode 100644 internal/textextractor/tesseract.go create mode 100644 internal/textextractor/textextractor.go diff --git a/cmd/ocr-worker/main.go b/cmd/ocr-worker/main.go index 70a7fe5..3576685 100644 --- a/cmd/ocr-worker/main.go +++ b/cmd/ocr-worker/main.go @@ -1,7 +1,31 @@ package main -import "fmt" +import ( + "log" + "log/slog" + "net/http" + "os" + + "github.com/ashirt-ops/ocr-worker/internal/handlers" + "github.com/ashirt-ops/ocr-worker/internal/textextractor" + "github.com/jrozner/weby/middleware" + "github.com/jrozner/weby/rlog" +) func main() { - fmt.Println("Hello world!") + var handler slog.Handler = slog.NewTextHandler(os.Stdout, nil) + + handler = rlog.RequestIDHandler{Handler: handler} + logger := slog.New(handler) + + extractor := textextractor.NewTesseract() + + env := handlers.New(extractor) + mux := env.Routes() + + mux.Use(middleware.RequestID) + mux.Use(middleware.WrapResponse) + mux.Use(middleware.Logger(logger)) + + log.Fatal(http.ListenAndServe(":8080", mux)) } diff --git a/go.mod b/go.mod index e8162a8..a78c5d4 100644 --- 
a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/ashirt-ops/ocr-worker go 1.25.6 + +require github.com/jrozner/weby v0.1.0 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..92c4090 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/jrozner/weby v0.1.0 h1:kI+DXnYHsSB6Kmmil8i7Wqh5O1oF1c1xOhnVSONh+BA= +github.com/jrozner/weby v0.1.0/go.mod h1:cBLmTAkOScydeEd02pmDkSeQfdZUQ9Y3r3hcLSUZrqI= diff --git a/internal/client/client.go b/internal/client/client.go new file mode 100644 index 0000000..b6f7595 --- /dev/null +++ b/internal/client/client.go @@ -0,0 +1,87 @@ +package client + +import ( + "bytes" + "crypto/hmac" + "crypto/sha256" + "encoding/base64" + "fmt" + "io" + "net/http" + "time" +) + +type Client struct { + *http.Client + base string + accessKey string + secretKey []byte +} + +func New(base, accessKey string, secretKey []byte) *Client { + c := &http.Client{} + client := &Client{ + Client: c, + base: base, + accessKey: accessKey, + secretKey: secretKey, + } + + return client +} + +func (c *Client) Do(r *http.Request) (*http.Response, error) { + date := time.Now().Format(time.RFC1123) + + r.Header.Set("Date", date) + + signature, err := generateSignature(r, c.secretKey) + if err != nil { + return nil, err + } + + encodedSignature := base64.StdEncoding.EncodeToString(signature) + + r.Header.Add("Authorization", fmt.Sprintf("%s:%s", c.accessKey, encodedSignature)) + + return c.Client.Do(r) +} + +func generateSignature(r *http.Request, key []byte) ([]byte, error) { + // copy the body into a buffer so it can be both hashed and re-sent + body := bytes.NewBuffer([]byte{}) + _, err := io.Copy(body, r.Body) + if err != nil { + return nil, err + } + + // close the original body so we don't leak it + err = r.Body.Close() + if err != nil { + return nil, err + } + + // shasum the buffered copy: r.Body is already drained and closed, and Bytes() does not consume the buffer + requestBodySHA256 := sha256.New() + _, err = requestBodySHA256.Write(body.Bytes()) + if err != nil { + return nil, err + } + + // hand the still-unread buffer back to the request so the transport can send it + // (do not Reset() the buffer: that would discard the payload and send an empty body) + 
r.Body = io.NopCloser(body) + + m := new(bytes.Buffer) + m.WriteString(r.Method) + m.WriteString("\n") + m.WriteString(r.URL.RequestURI()) + m.WriteString("\n") + m.WriteString(r.Header.Get("Date")) + m.WriteString("\n") + m.Write(requestBodySHA256.Sum(nil)) + + mac := hmac.New(sha256.New, key) + mac.Write(m.Bytes()) + return mac.Sum(nil), nil +} diff --git a/internal/client/evidence.go b/internal/client/evidence.go new file mode 100644 index 0000000..da13c8e --- /dev/null +++ b/internal/client/evidence.go @@ -0,0 +1 @@ +package client diff --git a/internal/client/operations.go b/internal/client/operations.go new file mode 100644 index 0000000..da13c8e --- /dev/null +++ b/internal/client/operations.go @@ -0,0 +1 @@ +package client diff --git a/internal/handlers/handlers.go b/internal/handlers/handlers.go new file mode 100644 index 0000000..0f0f6d7 --- /dev/null +++ b/internal/handlers/handlers.go @@ -0,0 +1,55 @@ +package handlers + +import ( + "encoding/json" + "log/slog" + "net/http" + + "github.com/ashirt-ops/ocr-worker/internal/messages" + "github.com/ashirt-ops/ocr-worker/internal/textextractor" + "github.com/jrozner/weby" +) + +type Env struct { + extractor textextractor.TextExtractor +} + +func New(extractor textextractor.TextExtractor) *Env { + return &Env{ + extractor: extractor, + } +} + +func (e *Env) Routes() *weby.ServeMux { + mux := weby.NewServeMux() + mux.HandleFunc("/process", e.Process) + + return mux +} + +func (e *Env) Process(w http.ResponseWriter, r *http.Request) { + var request messages.Request + + err := json.NewDecoder(r.Body).Decode(&request) + if err != nil { + slog.ErrorContext(r.Context(), "error decoding request body", "error", err) + goto error + } + + if request.ContentType != "IMAGE" { + slog.DebugContext(r.Context(), "unsupported content type, skipping processing", "content_type", request.ContentType) + return + } + +error: + response := messages.Response{ + Action: "rejected", + Content: "", + } + + w.WriteHeader(http.StatusOK) + 
err = json.NewEncoder(w).Encode(&response) + if err != nil { + slog.ErrorContext(r.Context(), "unable to serialize response", "error", err) + } +} diff --git a/internal/messages/request.go b/internal/messages/request.go new file mode 100644 index 0000000..3902ca5 --- /dev/null +++ b/internal/messages/request.go @@ -0,0 +1,7 @@ +package messages + +type Request struct { + ContentType string `json:"content_type"` + EvidenceUUID string `json:"evidence_uuid"` + OperationSlug string `json:"operation_slug"` +} diff --git a/internal/messages/response.go b/internal/messages/response.go new file mode 100644 index 0000000..318f644 --- /dev/null +++ b/internal/messages/response.go @@ -0,0 +1,6 @@ +package messages + +type Response struct { + Action string `json:"action"` + Content string `json:"content"` +} diff --git a/internal/textextractor/tesseract.go b/internal/textextractor/tesseract.go new file mode 100644 index 0000000..bb640e5 --- /dev/null +++ b/internal/textextractor/tesseract.go @@ -0,0 +1,12 @@ +package textextractor + +type Tesseract struct{} + +// NewTesseract returns a new Tesseract TextExtractor +func NewTesseract() TextExtractor { + return &Tesseract{} +} + +func (t *Tesseract) ExtractText(img []byte) (string, error) { + return "", nil +} diff --git a/internal/textextractor/textextractor.go b/internal/textextractor/textextractor.go new file mode 100644 index 0000000..718f599 --- /dev/null +++ b/internal/textextractor/textextractor.go @@ -0,0 +1,7 @@ +package textextractor + +// TextExtractor is an interface for extracting text from image files with various backends +type TextExtractor interface { + // ExtractText identifies and returns all text found within an image file + ExtractText(img []byte) (string, error) +} From fba40355c788d8f30ee9c76f8edf389dcfaf13a2 Mon Sep 17 00:00:00 2001 From: Joe Rozner Date: Tue, 3 Feb 2026 22:50:48 -0800 Subject: [PATCH 2/9] Add missing modules --- go.mod | 2 ++ go.sum | 2 ++ 2 files changed, 4 insertions(+) diff --git 
a/go.mod b/go.mod index a78c5d4..720e3e4 100644 --- a/go.mod +++ b/go.mod @@ -3,3 +3,5 @@ module github.com/ashirt-ops/ocr-worker go 1.25.6 require github.com/jrozner/weby v0.1.0 + +require github.com/google/uuid v1.6.0 // indirect diff --git a/go.sum b/go.sum index 92c4090..f97397e 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,4 @@ +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/jrozner/weby v0.1.0 h1:kI+DXnYHsSB6Kmmil8i7Wqh5O1oF1c1xOhnVSONh+BA= github.com/jrozner/weby v0.1.0/go.mod h1:cBLmTAkOScydeEd02pmDkSeQfdZUQ9Y3r3hcLSUZrqI= From bde34c8c67a76e4299125d6fb0a215a4a4e3ebf6 Mon Sep 17 00:00:00 2001 From: Joe Rozner Date: Tue, 3 Feb 2026 22:56:21 -0800 Subject: [PATCH 3/9] Fix copy --- Dockerfile | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3bb557f..8726041 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,18 @@ -FROM golang:1.25-alpine AS build - -RUN mkdir app -COPY . ./app/ -WORKDIR /go/app -RUN go build -v ./... - -FROM alpine:latest - -RUN apk add --no-cache tesseract-ocr && \ - adduser -h /home/ashirt -S -D ashirt - -USER ashirt -WORKDIR /home/ashirt - -COPY --from=build /go/app/ocr-worker /home/ashirt/ocr-worker - +FROM golang:1.25-alpine AS build + +RUN mkdir app +COPY . ./app/ +WORKDIR /go/app +RUN go build -v ./cmd/... 
+ +FROM alpine:latest + +RUN apk add --no-cache tesseract-ocr && \ + adduser -h /home/ashirt -S -D ashirt + +USER ashirt +WORKDIR /home/ashirt + +COPY --from=build /go/app/ocr-worker /home/ashirt/ocr-worker + CMD ["ocr-worker"] \ No newline at end of file From ff7675e77bbd46aedea3e9ca0d59f67a828d71fa Mon Sep 17 00:00:00 2001 From: Joe Rozner Date: Wed, 4 Feb 2026 17:35:54 -0800 Subject: [PATCH 4/9] Working tesseract extractor --- .github/workflows/ci.yml | 171 +++++++++++---------- Dockerfile | 2 +- go.mod | 11 +- go.sum | 6 + internal/textextractor/tesseract.go | 19 ++- internal/textextractor/tesseract_test.go | 22 +++ internal/textextractor/testdata/ashirt.png | Bin 0 -> 3520 bytes 7 files changed, 143 insertions(+), 88 deletions(-) create mode 100644 internal/textextractor/tesseract_test.go create mode 100755 internal/textextractor/testdata/ashirt.png diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0ae4eb5..622ace2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,85 +1,88 @@ -name: ci - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - schedule: - - cron: '0 0 * * *' # Runs every day at 00:00 UTC - -jobs: - gotest: - name: Test ocr worker - runs-on: ubuntu-latest - steps: - - name: Set up Go 1.x - uses: actions/setup-go@v6 - with: - go-version: stable - id: go - - - name: Check out code into the Go module directory - uses: actions/checkout@v6 - - - name: Build - run: go build -v ./... - - - name: gofmt - run: | - GOFMTOUT=$(gofmt -l .) - if [[ ! -z "${GOFMTOUT}" ]]; then - echo "FATAL: gofmt violation(s), please fix" - echo $GOFMTOUT - exit -1 - fi - - name: go vet - run: go vet ./... - - - name: Test - run: go test -v ./... 
- - build_push: - name: Build and Push - needs: [gotest] - runs-on: ubuntu-latest - steps: - - name: Check out code - uses: actions/checkout@v6 - - - name: Docker meta - id: docker_meta - uses: docker/metadata-action@v5.10.0 - with: - images: ashirt/ocr-worker - tags: | - type=sha - type=ref,event=branch - type=ref,event=pr - flavor: | - latest=false - - name: Login to Docker Hub - uses: docker/login-action@v3.7.0 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Build and Push PR - if: github.ref != 'refs/heads/main' - uses: docker/build-push-action@v6.18.0 - with: - context: . - file: Dockerfile - tags: ${{ steps.docker_meta.outputs.tags }} - labels: ${{ steps.docker_meta.outputs.labels }} - push: true # Push with pr-### and sha-xxxxxxx tags - - - name: Build and Push Latest - if: github.ref == 'refs/heads/main' - uses: docker/build-push-action@v6.18.0 - with: - context: . - file: Dockerfile - tags: ${{ steps.docker_meta.outputs.tags }}, ashirt/ocr-worker:latest #Add latest tag for main - labels: ${{ steps.docker_meta.outputs.labels }} +name: ci + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + schedule: + - cron: '0 0 * * *' # Runs every day at 00:00 UTC + +jobs: + gotest: + name: Test ocr worker + runs-on: ubuntu-latest + steps: + - name: Set up Go 1.x + uses: actions/setup-go@v6 + with: + go-version: stable + id: go + + - name: Check out code into the Go module directory + uses: actions/checkout@v6 + + - name: Install packages + run: apt install -y tesseract-ocr-dev tesseract-ocr-eng + + - name: Build + run: go build -v ./... + + - name: gofmt + run: | + GOFMTOUT=$(gofmt -l .) + if [[ ! -z "${GOFMTOUT}" ]]; then + echo "FATAL: gofmt violation(s), please fix" + echo $GOFMTOUT + exit -1 + fi + - name: go vet + run: go vet ./... + + - name: Test + run: go test -v ./... 
+ + build_push: + name: Build and Push + needs: [gotest] + runs-on: ubuntu-latest + steps: + - name: Check out code + uses: actions/checkout@v6 + + - name: Docker meta + id: docker_meta + uses: docker/metadata-action@v5.10.0 + with: + images: ashirt/ocr-worker + tags: | + type=sha + type=ref,event=branch + type=ref,event=pr + flavor: | + latest=false + - name: Login to Docker Hub + uses: docker/login-action@v3.7.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and Push PR + if: github.ref != 'refs/heads/main' + uses: docker/build-push-action@v6.18.0 + with: + context: . + file: Dockerfile + tags: ${{ steps.docker_meta.outputs.tags }} + labels: ${{ steps.docker_meta.outputs.labels }} + push: true # Push with pr-### and sha-xxxxxxx tags + + - name: Build and Push Latest + if: github.ref == 'refs/heads/main' + uses: docker/build-push-action@v6.18.0 + with: + context: . + file: Dockerfile + tags: ${{ steps.docker_meta.outputs.tags }}, ashirt/ocr-worker:latest #Add latest tag for main + labels: ${{ steps.docker_meta.outputs.labels }} push: true \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 8726041..794a88b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ RUN go build -v ./cmd/... 
FROM alpine:latest -RUN apk add --no-cache tesseract-ocr && \ +RUN apk add --no-cache tesseract-ocr tesseract-ocr-data-eng && \ adduser -h /home/ashirt -S -D ashirt USER ashirt diff --git a/go.mod b/go.mod index 720e3e4..e1a2d58 100644 --- a/go.mod +++ b/go.mod @@ -2,6 +2,13 @@ module github.com/ashirt-ops/ocr-worker go 1.25.6 -require github.com/jrozner/weby v0.1.0 +require ( + github.com/jrozner/weby v0.1.0 + github.com/otiai10/gosseract v2.2.1+incompatible +) -require github.com/google/uuid v1.6.0 // indirect +require ( + github.com/google/uuid v1.6.0 // indirect + github.com/otiai10/mint v1.6.3 // indirect + golang.org/x/net v0.49.0 // indirect +) diff --git a/go.sum b/go.sum index f97397e..9e887b5 100644 --- a/go.sum +++ b/go.sum @@ -2,3 +2,9 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/jrozner/weby v0.1.0 h1:kI+DXnYHsSB6Kmmil8i7Wqh5O1oF1c1xOhnVSONh+BA= github.com/jrozner/weby v0.1.0/go.mod h1:cBLmTAkOScydeEd02pmDkSeQfdZUQ9Y3r3hcLSUZrqI= +github.com/otiai10/gosseract v2.2.1+incompatible h1:Ry5ltVdpdp4LAa2bMjsSJH34XHVOV7XMi41HtzL8X2I= +github.com/otiai10/gosseract v2.2.1+incompatible/go.mod h1:XrzWItCzCpFRZ35n3YtVTgq5bLAhFIkascoRo8G32QE= +github.com/otiai10/mint v1.6.3 h1:87qsV/aw1F5as1eH1zS/yqHY85ANKVMgkDrf9rcxbQs= +github.com/otiai10/mint v1.6.3/go.mod h1:MJm72SBthJjz8qhefc4z1PYEieWmy8Bku7CjcAqyUSM= +golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= +golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= diff --git a/internal/textextractor/tesseract.go b/internal/textextractor/tesseract.go index bb640e5..162d7c2 100644 --- a/internal/textextractor/tesseract.go +++ b/internal/textextractor/tesseract.go @@ -1,5 +1,9 @@ package textextractor +import ( + "github.com/otiai10/gosseract" +) + type Tesseract struct{} // NewTesseract returns a new Tesseract TextExtractor @@ -8,5 +12,18 @@ 
func NewTesseract() TextExtractor { } func (t *Tesseract) ExtractText(img []byte) (string, error) { - return "", nil + client := gosseract.NewClient() + defer client.Close() + + err := client.SetLanguage("eng") + if err != nil { + return "", err + } + + err = client.SetImageFromBytes(img) + if err != nil { + return "", err + } + + return client.Text() } diff --git a/internal/textextractor/tesseract_test.go b/internal/textextractor/tesseract_test.go new file mode 100644 index 0000000..f162d3d --- /dev/null +++ b/internal/textextractor/tesseract_test.go @@ -0,0 +1,22 @@ +package textextractor + +import ( + "os" + "testing" +) + +func TestTesseract(t *testing.T) { + data, err := os.ReadFile("./testdata/ashirt.png") + if err != nil { + t.Fatalf("unable to read test data: %v", err) + } + extractor := NewTesseract() + extracted, err := extractor.ExtractText(data) + if err != nil { + t.Fatalf("unable to extract text: %v", err) + } + + if extracted != "ASHIRT" { + t.Fatalf("extracted text does not match expected text, expected: %v, got: %v", "ashirt", extracted) + } +} diff --git a/internal/textextractor/testdata/ashirt.png b/internal/textextractor/testdata/ashirt.png new file mode 100755 index 0000000000000000000000000000000000000000..9fb27e223d21cb91a42fc591fa768df56b1d8905 GIT binary patch literal 3520 zcmbVP_ct33v`4E#OKZ0A@&y3Xl+G^8I4-4S@mr-Rz;~IXe3st zQJbn!GqtM5^ZtbQ?m72<&gY!_!##JLdvA(~(L-h?J|-F(8fLgI%i!W0ItawoEN2SgEIGbLxnvKLejYV`THWJf?a};NWWkYf7JG+_6HYE zQvd6u6@+|@^7QuuT6p>*X)HpJKp6;72Z;d6$;c@HWo1-kl~nG@12qhRCgv6~4*ka$ zQSicH8Wv9qHw!2IEv$=U+CPnLy``E-e@d%qU2IBMOb!IVwN&)o&8wapi9%-h8t)l{ z_5~r|py{I{Ibvp25T_-&SPv*rOY4zA%eT^r@rZg)r%nKMAFp_Yvxy_>J_BM7%6aq=1Y)Z9E@Ao0-e*>XO_xA zla7+4be_<4lw*L^Hbs6>Se9#Hrzli|&r4mgFN^z}nkgGSXv7DVH~$5hBQm%0I=F1z z;IMc~|1l$Y>(pjZ;VO5g1^@R?k5)dOH*;sSWW8s7MCeJ$i0=KjlDB2@HwIXWD}2IV zFi~HH-om)qc{|t$4~3*#NK&YEC6m?SI!O;#+k?_m^2516yz1)s+@a%dkte^=-+6{f zFs?PV1BSEH5!N{SMu$Bq4my88({qm0H8#YM@JFGc5m~ju7X#|JJRRjLyXgRF5Q7T>sf`7#mO#>}H?jHEF 
z>}nDsK3?;Ch+Yy@?R~f%-C0f8Q=MyS>!fFmZ;U(GQf;urejxO_p*aph6@$ySy(^dO zdAMO5XtdydEAj5!gRfWjG4@;Bms#VxF};Z1CEvyTrwjmty}-Ry)3Z>wEnSjCbr+ zf--z+0yN~DYk$DQ(CBRjF}VDK-+Fl&m9BmXZ?7pE^Ty_l$LwJ3Y-IeY`zO#89^(Fvvr zs?^67Nw2Sad+vl!EY#iZp)bvaBZMxmkY#K`T>q*%VIbV6YCrINvP)CtGK2e%dginWTL_@4ZoVfxV~|1~BL zARDsFbN;u3o(po_n&`_e-3Qity1DTV|4!6Hyb~QC_3S58NO$`$?T=}%c*LoVBaUI& z+tOFQ&L-tXj>;pQ}pIl zxv@F&TG6NQrpj9&pahd!)y)E?@>`EQo8ZmKUgEFu^_(T)sv-qxc%=#QTCB3w^V&bH zpV+iBWq(MluD&fRSNpR6v*(m=(ju+VgUN)J_Pd$hR7VMwMP$#>9-+)5)ZC&c!y!@( zXLT$_o-pW%0N7GVsrAh~kIAMj23cLHw&9~>R=QH*GBK)@3pee}1zCnlUsh>di6&}9 z#1AMTvrvTB5%J1DVNIQh;EyA$NbFVv`6*BAS|*m)m&o}JBV%>K<6O-u&~7_lkhUD4 zj@A9?_3hEEh5jKrcJ`4pV*^HF%9}+5Fe-}On$Q3ZyoWdreOL|T>&d3VDori_Swt_sZM&~_{6?SqaS_a zT6M*6HYc9NPOONTczz8ggu0G7l>%D!GSs z>Bkz}KvD)I?82;vTOtMRz}@T9Vr^~Q%F%?9?WC&w>;5zNl(&i0N_XYay&eB>V_lt8 zs8s7wQp$is7Rpwj2hl^5a?I<_j`d5cOjj#+;%r7*Ju zh~VfAMOkr+LFWRsW24TRMCYJy(~}1KiX&-B@m1Sw??2b}i+>Tr zuKF|z?}~+cEX4VIA(SN+7h(1XRoi6Z&uGC;?_YN}-0~_4+1%JI zMnnO#e~+UYTyc~7kEnv?Ak@ZIX*~f0_E#3s<}$d#ZdJfsb9;O(MWU(C%ITkSc^>!C z>D&vyI|apt>{q1&eAfm>6l@z!77ym@pz8w`q6Mk-izSF1YWUglU;3W0ThXAWx(jrv z)yXYQMtmCtVMi@8n8Y=XcMp62Qk%ZOQIw6|AO`3XYB4giwfyxf@J<1!F7GyLa}VKe z<=W{F9c@D?=KFnOO`PWVuHJ!&yYkP%7>o&A68( zd_2h(_SuSf7PM;JY8qXlUts^*T{poRx61$*a%sXYR)280F?P-TCBIP%>_8w2>x=Hyj54DeG&5Xv-n|3>(mJ~U1urCi-US>Eh?;-eZWO7{6 z?q>P9DyN@yEuY8->3T}JRyzmc_c>B=n%^vG@BV$Wf``Ubetu`;i!YlW`+N5hqSZ-S z2Bxmn@|1}Q(~LcsP>`B(MMxSKuVh3+y9|0XwU?Zp?V`CSmve2m*AWR;c{X;DK{avL z)s;7pIE@RP4Zp(D3KWKn<8xI*%(2V;P}v?0u2N zo;2R$zw)K|g4TaJSH}WCPJ`BkyoKHE0*l8@4owpuMulAowAT%2)9wH4^GizsXtI~YHo{M0)8jR+u{;@|w(*)*Jg zAC2Z>|IIJaEx(7dfGk;OWuv_RGf`^2ZEUj+&y=2!;#D`h?zBaD1_;qk!jjigoc-ke@7((~ z%-M515OnVqjS6W;PU~#?Sz8{M5BF>vki`7^E7PUeRL3kIv$&@QX9}WI8P2Zo2P7bsm8a88$JM)Y?6Lrix ze*~q^8K=w0iQ9pe0ilCpfI$dT=byCy6FsL7Kcm*+I4)UVd{;DZZ6nxwP3O4(0g8si Ad;kCd literal 0 HcmV?d00001 From b047ca0cf0662161832a86a67665d6e34d37f4a9 Mon Sep 17 00:00:00 2001 From: Joe Rozner Date: Wed, 4 Feb 2026 17:36:42 -0800 Subject: [PATCH 5/9] Add missing 
package --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 794a88b..bd343c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ RUN go build -v ./cmd/... FROM alpine:latest -RUN apk add --no-cache tesseract-ocr tesseract-ocr-data-eng && \ +RUN apk add --no-cache tesseract-ocr tesseract-ocr-dev tesseract-ocr-data-eng && \ adduser -h /home/ashirt -S -D ashirt USER ashirt From 3938ca3757feb1c556aa95a6d48b185cd6f1cf5e Mon Sep 17 00:00:00 2001 From: Joe Rozner Date: Wed, 4 Feb 2026 17:39:39 -0800 Subject: [PATCH 6/9] Fix deps --- .github/workflows/ci.yml | 2 +- .github/workflows/golangci-lint.yml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 622ace2..48e803b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: uses: actions/checkout@v6 - name: Install packages - run: apt install -y tesseract-ocr-dev tesseract-ocr-eng + run: sudo apt install -y tesseract-ocr-dev tesseract-ocr-eng - name: Build run: go build -v ./... 
diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index 620ec88..f30ba8c 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -19,6 +19,10 @@ jobs: - uses: actions/setup-go@v6 with: go-version: stable + + - name: Install packages + run: sudo apt install -y tesseract-ocr-dev tesseract-ocr-eng + - name: golangci-lint uses: golangci/golangci-lint-action@v9 with: From a5855fa9d1891ff679ea34db0ff7f5f8079d9077 Mon Sep 17 00:00:00 2001 From: Joe Rozner Date: Wed, 4 Feb 2026 17:42:37 -0800 Subject: [PATCH 7/9] Fix more deps --- .github/workflows/ci.yml | 2 +- .github/workflows/golangci-lint.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 48e803b..d47cd65 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: uses: actions/checkout@v6 - name: Install packages - run: sudo apt install -y tesseract-ocr-dev tesseract-ocr-eng + run: sudo apt install -y libtesseract-dev tesseract-ocr-eng - name: Build run: go build -v ./... 
diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index f30ba8c..3db87f9 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -21,7 +21,7 @@ jobs: go-version: stable - name: Install packages - run: sudo apt install -y tesseract-ocr-dev tesseract-ocr-eng + run: sudo apt install -y libtesseract-dev tesseract-ocr-eng - name: golangci-lint uses: golangci/golangci-lint-action@v9 From d621c111f10cfc3acfacd2383bec8cb7f3c8c138 Mon Sep 17 00:00:00 2001 From: Joe Rozner Date: Wed, 4 Feb 2026 17:47:36 -0800 Subject: [PATCH 8/9] Fix lint issue --- internal/textextractor/tesseract.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/internal/textextractor/tesseract.go b/internal/textextractor/tesseract.go index 162d7c2..50ce6f1 100644 --- a/internal/textextractor/tesseract.go +++ b/internal/textextractor/tesseract.go @@ -13,8 +13,15 @@ func NewTesseract() TextExtractor { func (t *Tesseract) ExtractText(img []byte) (string, error) { client := gosseract.NewClient() - defer client.Close() + defer func() { + // TODO: we're doing this because golangci-lint complains about an unchecked error + // We can either add an exclusion to its config or use a named return to surface + // this up + _ = client.Close() + }() + // NOTE: For some reason this segfaults without calling SetLanguage even though the + // docs say "eng" should be the default err := client.SetLanguage("eng") if err != nil { return "", err From 353dcdbbdc8636eb905a1b60243f48e7fbe40c4b Mon Sep 17 00:00:00 2001 From: Joe Rozner Date: Wed, 4 Feb 2026 17:58:17 -0800 Subject: [PATCH 9/9] Closer but failing to leptonica --- Dockerfile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index bd343c2..d2656de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,16 @@ FROM golang:1.25-alpine AS build -RUN mkdir app +RUN mkdir app && \ + apk add --no-cache build-base leptonica-dev 
tesseract-ocr-dev + COPY . ./app/ WORKDIR /go/app + RUN go build -v ./cmd/... FROM alpine:latest -RUN apk add --no-cache tesseract-ocr tesseract-ocr-dev tesseract-ocr-data-eng && \ +RUN apk add --no-cache tesseract-ocr tesseract-ocr-data-eng && \ adduser -h /home/ashirt -S -D ashirt USER ashirt