diff --git a/Dockerfile b/Dockerfile index 2e22195..624c5a1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,6 +23,9 @@ RUN apt-get update && apt-get install -y chromium ENV CHROMIUM_BIN=/usr/bin/chromium ENV GOOGLE_CHROME_BIN=/usr/bin/chromium +# Install poppler-utils for pdftotext for academic calendar parsing +RUN apt-get install -y poppler-utils + # Copy build file from builder COPY --from=builder /app/api-tools /app/api-tools @@ -32,5 +35,5 @@ RUN chmod +x /app/runners/setup.sh ENTRYPOINT ["/app/runners/setup.sh"] # Optional .env copy for development -FROM base AS dev +FROM base AS local COPY .env /app/.env diff --git a/README.md b/README.md index 6d133ce..3979e26 100644 --- a/README.md +++ b/README.md @@ -98,8 +98,8 @@ Docker is used for automated running on Google Cloud Platform. More info [here]( To build the container for local testing first make sure all scripts in the `runners` folder have LF line endings then run: ``` -docker build --target dev -t my-runner:local . -docker run --rm -e ENVIRONMENT=dev -e RUNNER_SCRIPT_NAME=daily.sh my-runner:local +docker build --target local -t my-runner:local . +docker run --rm -e ENVIRONMENT=local -e RUNNER_SCRIPT_NAME=daily.sh my-runner:local ``` ## Questions? 
diff --git a/go.mod b/go.mod index 9150392..3773932 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,6 @@ require ( github.com/chromedp/chromedp v0.12.1 github.com/google/go-cmp v0.7.0 github.com/joho/godotenv v1.5.1 - github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 github.com/valyala/fastjson v1.6.4 go.mongodb.org/mongo-driver v1.17.3 golang.org/x/net v0.43.0 diff --git a/parser/academicCalendars.go b/parser/academicCalendars.go index 169d32a..b2e4d3c 100644 --- a/parser/academicCalendars.go +++ b/parser/academicCalendars.go @@ -1,3 +1,10 @@ +/* +This code requires pdftotext to be installed (man page: https://www.xpdfreader.com/pdftotext-man.html). +It is provided by poppler-utils: apt-get install -y poppler-utils +Every Go library I tried for PDF text extraction was either paid, had a +complicated installation process, or errored on one of the PDFs. +*/ + package parser import ( @@ -12,13 +19,14 @@ import ( "log" "net/http" "os" + "os/exec" "path/filepath" "strings" "sync" + "time" "github.com/UTDNebula/api-tools/utils" "github.com/UTDNebula/nebula-api/api/schema" - "github.com/ledongthuc/pdf" "google.golang.org/genai" ) @@ -73,7 +81,7 @@ PDF Content: func ParseAcademicCalendars(inDir string, outDir string) { // Get sub folder from output folder - outSubDir := filepath.Join(outDir, "academicCalendars") + inSubDir := filepath.Join(inDir, "academicCalendars") result := []schema.AcademicCalendar{} @@ -81,6 +89,7 @@ func ParseAcademicCalendars(inDir string, outDir string) { numWorkers := 10 jobs := make(chan string) var wg sync.WaitGroup + var mu sync.Mutex // Start worker goroutines for range numWorkers { @@ -92,16 +101,33 @@ func ParseAcademicCalendars(inDir string, outDir string) { academicCalendar, err := parsePdf(path) if err != nil { - panic(err) + if strings.Contains(err.Error(), "429") { + // Linearly increasing backoff (20s, 40s, 60s) when rate limited with a 429 + backoffs := []time.Duration{20 * time.Second, 40 * time.Second, 60 * time.Second} + for _, delay := range backoffs { + time.Sleep(delay) + 
academicCalendar, err = parsePdf(path) + if err == nil || !strings.Contains(err.Error(), "429") { + break + } + } + } + + if err != nil { + panic(err) + } } + + mu.Lock() result = append(result, academicCalendar) + mu.Unlock() log.Printf("Parsed %s!", filepath.Base(path)) } }() } - err := filepath.WalkDir(outSubDir, func(path string, d fs.DirEntry, err error) error { + err := filepath.WalkDir(inSubDir, func(path string, d fs.DirEntry, err error) error { if err != nil { return err } @@ -196,33 +222,20 @@ func parsePdf(path string) (schema.AcademicCalendar, error) { } // Read the text from the first page of a PDF +// Using external program pdftotext func readPdf(path string) (string, error) { - // Open the PDF - f, r, err := pdf.Open(path) - if err != nil { - return "", err - } - defer f.Close() - - // Make sure at least one page exists - if r.NumPage() < 1 { - return "", fmt.Errorf("no pages in PDF") - } + cmd := exec.Command("pdftotext", "-l", "1", "-raw", path, "-") - // Get the first page - page := r.Page(1) // pages are 1-indexed - if page.V.IsNull() { - return "", fmt.Errorf("failed to read page 1") - } + var out bytes.Buffer + var stderr bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = &stderr - // Read text - var buf bytes.Buffer - text := page.Content().Text - for _, t := range text { - buf.WriteString(t.S) // S is the actual string + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("failed to run pdftotext: %v (%s)", err, stderr.String()) } - return buf.String(), nil + return out.String(), nil } // Check cache for a response to the same prompt diff --git a/runners/setup.sh b/runners/setup.sh index 8e83122..8af8239 100644 --- a/runners/setup.sh +++ b/runners/setup.sh @@ -1,6 +1,6 @@ #!/bin/sh -if [ "$ENVIRONMENT" = "prod" ]; then +if [ "$ENVIRONMENT" = "gcp" ]; then # auth with service account gcloud secrets versions access latest --secret="$SERVICE_ACCOUNT_SECRET_NAME" > service_account.json gcloud auth activate-service-account 
--key-file=service_account.json