Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ RUN apt-get update && apt-get install -y chromium
ENV CHROMIUM_BIN=/usr/bin/chromium
ENV GOOGLE_CHROME_BIN=/usr/bin/chromium

# Install poppler-utils, which provides pdftotext (used for academic calendar parsing)
RUN apt-get install -y poppler-utils

# Copy build file from builder
COPY --from=builder /app/api-tools /app/api-tools

Expand All @@ -32,5 +35,5 @@ RUN chmod +x /app/runners/setup.sh
ENTRYPOINT ["/app/runners/setup.sh"]

# Optional .env copy for development
FROM base AS dev
FROM base AS local
COPY .env /app/.env
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ Docker is used for automated running on Google Cloud Platform. More info [here](

To build the container for local testing, first make sure all scripts in the `runners` folder have LF line endings, then run:
```
docker build --target dev -t my-runner:local .
docker run --rm -e ENVIRONMENT=dev -e RUNNER_SCRIPT_NAME=daily.sh my-runner:local
docker build --target local -t my-runner:local .
docker run --rm -e ENVIRONMENT=local -e RUNNER_SCRIPT_NAME=daily.sh my-runner:local
```

## Questions?
Expand Down
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ require (
github.com/chromedp/chromedp v0.12.1
github.com/google/go-cmp v0.7.0
github.com/joho/godotenv v1.5.1
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80
github.com/valyala/fastjson v1.6.4
go.mongodb.org/mongo-driver v1.17.3
golang.org/x/net v0.43.0
Expand Down
65 changes: 39 additions & 26 deletions parser/academicCalendars.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
/*
This code requires the pdftotext command-line tool to be installed:
https://www.xpdfreader.com/pdftotext-man.html
Install it with: apt-get install -y poppler-utils
All of the Go packages for PDF text extraction that I found were either paid,
had a complicated installation process, or errored on one of the PDFs.
*/

package parser

import (
Expand All @@ -12,13 +19,14 @@ import (
"log"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"

"github.com/UTDNebula/api-tools/utils"
"github.com/UTDNebula/nebula-api/api/schema"
"github.com/ledongthuc/pdf"
"google.golang.org/genai"
)

Expand Down Expand Up @@ -73,14 +81,15 @@ PDF Content:

func ParseAcademicCalendars(inDir string, outDir string) {
// Get sub folder from output folder
outSubDir := filepath.Join(outDir, "academicCalendars")
inSubDir := filepath.Join(inDir, "academicCalendars")

result := []schema.AcademicCalendar{}

// Parallel requests
numWorkers := 10
jobs := make(chan string)
var wg sync.WaitGroup
var mu sync.Mutex

// Start worker goroutines
for range numWorkers {
Expand All @@ -92,16 +101,33 @@ func ParseAcademicCalendars(inDir string, outDir string) {

academicCalendar, err := parsePdf(path)
if err != nil {
panic(err)
if strings.Contains(err.Error(), "429") {
// Linearly increasing backoff (20s, 40s, 60s) for 429 rate limiting
backoffs := []time.Duration{20 * time.Second, 40 * time.Second, 60 * time.Second}
for _, delay := range backoffs {
time.Sleep(delay)
academicCalendar, err = parsePdf(path)
if err == nil || !strings.Contains(err.Error(), "429") {
break
}
}
}

if err != nil {
panic(err)
}
}

mu.Lock()
result = append(result, academicCalendar)
mu.Unlock()

log.Printf("Parsed %s!", filepath.Base(path))
}
}()
}

err := filepath.WalkDir(outSubDir, func(path string, d fs.DirEntry, err error) error {
err := filepath.WalkDir(inSubDir, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
Expand Down Expand Up @@ -196,33 +222,20 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
}

// Read the text from the first page of a PDF
// Using external program pdftotext
func readPdf(path string) (string, error) {
// Open the PDF
f, r, err := pdf.Open(path)
if err != nil {
return "", err
}
defer f.Close()

// Make sure at least one page exists
if r.NumPage() < 1 {
return "", fmt.Errorf("no pages in PDF")
}
cmd := exec.Command("pdftotext", "-l", "1", "-raw", path, "-")

// Get the first page
page := r.Page(1) // pages are 1-indexed
if page.V.IsNull() {
return "", fmt.Errorf("failed to read page 1")
}
var out bytes.Buffer
var stderr bytes.Buffer
cmd.Stdout = &out
cmd.Stderr = &stderr

// Read text
var buf bytes.Buffer
text := page.Content().Text
for _, t := range text {
buf.WriteString(t.S) // S is the actual string
if err := cmd.Run(); err != nil {
return "", fmt.Errorf("failed to run pdftotext: %v (%s)", err, stderr.String())
}

return buf.String(), nil
return out.String(), nil
}

// Check cache for a response to the same prompt
Expand Down
2 changes: 1 addition & 1 deletion runners/setup.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/sh

if [ "$ENVIRONMENT" = "prod" ]; then
if [ "$ENVIRONMENT" = "gcp" ]; then
# auth with service account
gcloud secrets versions access latest --secret="$SERVICE_ACCOUNT_SECRET_NAME" > service_account.json
gcloud auth activate-service-account --key-file=service_account.json
Expand Down