diff --git a/.env.template b/.env.template
index 9bb0daf..d0c456d 100644
--- a/.env.template
+++ b/.env.template
@@ -5,6 +5,15 @@ LOGIN_ASTRA_USERNAME=
 LOGIN_ASTRA_PASSWORD=
 #Login to https://east.mymazevo.com/main-home then go to https://east.mymazevo.com/api/tenantsettings/GetApiKey
 MAZEVO_API_KEY=
+#Academic Calendars
+GOOGLE_GENAI_USE_VERTEXAI=
+GOOGLE_CLOUD_PROJECT=
+GOOGLE_APPLICATION_CREDENTIALS=
+GEMINI_SERVICE_ACCOUNT=
+NEBULA_API_URL=
+NEBULA_API_STORAGE_BUCKET=
+NEBULA_API_KEY=
+NEBULA_API_STORAGE_KEY=
 
 # Uploader
 MONGODB_URI=
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index d0b235d..f44f885 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -55,6 +55,20 @@ jobs:
           job: 'daily-prod'
           image: 'us-central1-docker.pkg.dev/api-tools-451421/runners/runner'
           region: us-central1
+
+      - name: Upload to weekly dev job
+        uses: google-github-actions/deploy-cloudrun@v2
+        with:
+          job: 'weekly-dev'
+          image: 'us-central1-docker.pkg.dev/api-tools-451421/runners/runner'
+          region: us-central1
+
+      - name: Upload to weekly prod job
+        uses: google-github-actions/deploy-cloudrun@v2
+        with:
+          job: 'weekly-prod'
+          image: 'us-central1-docker.pkg.dev/api-tools-451421/runners/runner'
+          region: us-central1
 
       - name: Upload to monthly dev job
         uses: google-github-actions/deploy-cloudrun@v2
diff --git a/.gitignore b/.gitignore
index 9471423..a6a1d1f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,6 +30,7 @@ yarn-error.log*
 *.env.development.local
 *.env.test.local
 *.env
+*service-account.json
 
 # vercel
 .vercel
diff --git a/go.mod b/go.mod
index 8f3726c..9150392 100644
--- a/go.mod
+++ b/go.mod
@@ -9,9 +9,11 @@ require (
 	github.com/chromedp/chromedp v0.12.1
 	github.com/google/go-cmp v0.7.0
 	github.com/joho/godotenv v1.5.1
+	github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80
 	github.com/valyala/fastjson v1.6.4
 	go.mongodb.org/mongo-driver v1.17.3
 	golang.org/x/net v0.43.0
+	google.golang.org/genai v1.32.0
 )
 
 require (
@@ -54,6 +56,7 @@ require (
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/googleapis/enterprise-certificate-proxy v0.3.5 // indirect
 	github.com/googleapis/gax-go/v2 v2.14.1 // indirect
+	github.com/gorilla/websocket v1.5.3 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/klauspost/compress v1.17.8 // indirect
diff --git a/go.sum b/go.sum
index 7a5d07a..760cac4 100644
--- a/go.sum
+++ b/go.sum
@@ -109,6 +109,8 @@ github.com/googleapis/enterprise-certificate-proxy v0.3.5 h1:VgzTY2jogw3xt39CusE
 github.com/googleapis/enterprise-certificate-proxy v0.3.5/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA=
 github.com/googleapis/gax-go/v2 v2.14.1 h1:hb0FFeiPaQskmvakKu5EbCbpntQn48jyHuvrkurSS/Q=
 github.com/googleapis/gax-go/v2 v2.14.1/go.mod h1:Hb/NubMaVM88SrNkvl8X/o8XWwDJEPqouaLeN2IUxoA=
+github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
+github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
 github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
 github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
 github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
@@ -243,6 +245,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 google.golang.org/api v0.224.0 h1:Ir4UPtDsNiwIOHdExr3fAj4xZ42QjK7uQte3lORLJwU=
 google.golang.org/api v0.224.0/go.mod h1:3V39my2xAGkodXy0vEqcEtkqgw2GtrFL5WuBZlCTCOQ=
+google.golang.org/genai v1.32.0 h1:kku/m3kWOncjnw8EIa2sgmrPLhaxFHaP+uqOq5ZckvI=
+google.golang.org/genai v1.32.0/go.mod h1:7pAilaICJlQBonjKKJNhftDFv3SREhZcTe9F6nRcjbg=
 google.golang.org/genproto v0.0.0-20250303144028-a0af3efb3deb h1:ITgPrl429bc6+2ZraNSzMDk3I95nmQln2fuPstKwFDE=
 google.golang.org/genproto v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:sAo5UzpjUwgFBCzupwhcLcxHVDK7vG5IqI30YnwX2eE=
 google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950=
diff --git a/main.go b/main.go
index fd8369c..7d0af33 100644
--- a/main.go
+++ b/main.go
@@ -46,6 +46,8 @@ func main() {
 	mazevo := flag.Bool("mazevo", false, "Alongside -scrape or -parse, signifies that Mazevo should be scraped/parsed.")
 	// Flag for map scraping, parsing, and uploading
 	mapFlag := flag.Bool("map", false, "Alongside -scrape, -parse, or -upload, signifies that the UTD map should be scraped/parsed/uploaded.")
+	// Flag for academic calendar scraping
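+	// Typical use (illustrative): `./api-tools -scrape -academicCalendars`, then
+	// `./api-tools -parse -academicCalendars`, then `./api-tools -upload -academicCalendars`,
+	// as wired up in runners/weekly.sh below.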
+	academicCalendars := flag.Bool("academicCalendars", false, "Alongside -scrape, -parse, or -upload, signifies that the academic calendars should be scraped/parsed/uploaded.")
 
 	// Flags for parsing
 	parse := flag.Bool("parse", false, "Puts the tool into parsing mode.")
@@ -114,6 +116,8 @@ func main() {
 		scrapers.ScrapeMazevo(*outDir)
 	case *mapFlag:
 		scrapers.ScrapeMapLocations(*outDir)
+	case *academicCalendars:
+		scrapers.ScrapeAcademicCalendars(*outDir)
 	default:
 		log.Panic("You must specify which type of scraping you would like to perform with one of the scraping flags!")
 	}
@@ -127,6 +131,8 @@ func main() {
 		parser.ParseMazevo(*inDir, *outDir)
 	case *mapFlag:
 		parser.ParseMapLocations(*inDir, *outDir)
+	case *academicCalendars:
+		parser.ParseAcademicCalendars(*inDir, *outDir)
 	default:
 		parser.Parse(*inDir, *outDir, *csvDir, *skipValidation)
 	}
@@ -136,6 +142,8 @@ func main() {
 		uploader.UploadEvents(*inDir)
 	case *mapFlag:
 		uploader.UploadMapLocations(*inDir)
+	case *academicCalendars:
+		uploader.UploadAcademicCalendars(*inDir)
 	default:
 		uploader.Upload(*inDir, *replace, *staticOnly)
 	}
diff --git a/parser/academicCalendars.go b/parser/academicCalendars.go
new file mode 100644
index 0000000..169d32a
--- /dev/null
+++ b/parser/academicCalendars.go
@@ -0,0 +1,361 @@
+package parser
+
+import (
+	"bytes"
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/fs"
+	"log"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+
+	"github.com/UTDNebula/api-tools/utils"
+	"github.com/UTDNebula/nebula-api/api/schema"
+	"github.com/ledongthuc/pdf"
+	"google.golang.org/genai"
+)
+
+// Store the client so it is only created once
+var once sync.Once
+var geminiClient *genai.Client
+
+// What gets sent to Gemini, with the PDF content added
+var prompt = `Parse this PDF content and generate the following JSON schema.
+
+{
+    _id: %s,
+    timeline: %s,
+    sessions: [
+        {
+            name: string,
+            begin: date string,
+            last_registration: date string,
+            late_registration: [date string, date string],
+            census_day: date string,
+            drop_deadlines: {
+                without_w: date string,
+                undergrad_approval_required: date string, // use end date
+                graduate_withdrawl_ends: date string,
+            },
+            end: date string,
+            reading_days: [date string],
+            exams: [date string, date string],
+            final_grading_period: [date string, date string],
+        }
+    ],
+    enrollment_opens: date string,
+    schedule_planner_available: date string,
+    online_add_swap_ends: date string,
+    last_readmission: date string,
+    last_from_waitlist: date string,
+    midterms_due: date string,
+    university_closings: [[date string, date string]], // for single days off use the same date string twice
+    no_classes: [[date string, date string]],
+}
+
+- Use the ISO 8601 format for date strings (2006-01-02).
+- There will be 3 sessions for Fall and Spring and 4 sessions for Summer.
+- You can determine the year for the dates based on the title. Be careful with Spring and Summer academic calendars: the 2025 ones, for example, may have some earlier dates, such as registration, in 2024.
+- Only use dates that are explicitly written in the PDF text.
+- Do not infer, estimate, or guess any date.
+- If a date is missing or unclear, return null for that field.
+
+PDF Content:
+
+%s`
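+
+// Illustrative expected output (not from a real run): the filled prompt should yield JSON like
+// {"_id": "25F", "timeline": "current", "sessions": [...], ...}, with every date string either
+// in ISO 8601 form (e.g. "2025-08-18") or null.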
+
+func ParseAcademicCalendars(inDir string, outDir string) {
+	// Get sub folder from the input folder (where the scraper wrote the PDFs)
+	inSubDir := filepath.Join(inDir, "academicCalendars")
+
+	result := []schema.AcademicCalendar{}
+
+	// Parallel requests; guard result with a mutex since the workers append concurrently
+	numWorkers := 10
+	jobs := make(chan string)
+	var wg sync.WaitGroup
+	var mu sync.Mutex
+
+	// Start worker goroutines
+	for range numWorkers {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for path := range jobs {
+				log.Printf("Parsing %s...", filepath.Base(path))
+
+				academicCalendar, err := parsePdf(path)
+				if err != nil {
+					panic(err)
+				}
+				mu.Lock()
+				result = append(result, academicCalendar)
+				mu.Unlock()
+
+				log.Printf("Parsed %s!", filepath.Base(path))
+			}
+		}()
+	}
+
+	err := filepath.WalkDir(inSubDir, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		if !d.IsDir() { // Is a file
+			jobs <- path
+		}
+		return nil
+	})
+	if err != nil {
+		panic(err)
+	}
+
+	close(jobs)
+
+	// Wait for workers to finish
+	wg.Wait()
+
+	utils.WriteJSON(fmt.Sprintf("%s/academicCalendars.json", outDir), result)
+}
+
+// Read a PDF, build a prompt for Gemini to parse it, check if it has already been asked in the cache, and ask Gemini if not
+func parsePdf(path string) (schema.AcademicCalendar, error) {
+	// "Fall 2025" to "25F"
+	filename := filepath.Base(path)
+	filename = filename[0 : len(filename)-4]
+	filenameParts := strings.Split(filename, "-")
+	if len(filenameParts) < 2 {
+		return schema.AcademicCalendar{}, fmt.Errorf("unexpected filename format: %s", filename)
+	}
+	name := filenameParts[1][len(filenameParts[1])-2:]
+	if strings.Contains(filenameParts[1], "Fall") {
+		name = name + "F"
+	} else if strings.Contains(filenameParts[1], "Spring") {
+		name = name + "S"
+	} else {
+		name = name + "U"
+	}
+	timeline := filenameParts[0]
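+	// Example mapping (assuming the scraper's "<time>-<Season> <year>.pdf" file naming):
+	//   "current-Fall 2025.pdf"  -> name "25F", timeline "current"
+	//   "future-Spring 2026.pdf" -> name "26S", timeline "future"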
+
+	// Read PDF
+	content, err := readPdf(path)
+	if err != nil {
+		return schema.AcademicCalendar{}, err
+	}
+
+	// Build prompt
+	promptFilled := fmt.Sprintf(prompt, name, timeline, content)
+
+	// Check cache
+	hashByte := sha256.Sum256([]byte(promptFilled))
+	hash := hex.EncodeToString(hashByte[:]) + ".json"
+	result, err := checkCache(hash)
+	if err != nil {
+		return schema.AcademicCalendar{}, err
+	}
+
+	// Skip AI if cache found
+	if result != "" {
+		log.Printf("Cache found for %s!", filename)
+	} else {
+		// Cache not found
+		log.Printf("No cache for %s, asking Gemini.", filename)
+
+		// AI
+		geminiClient := getGeminiClient()
+
+		// Send request with default config
+		response, err := geminiClient.Models.GenerateContent(context.Background(),
+			"gemini-2.5-pro",
+			genai.Text(promptFilled),
+			&genai.GenerateContentConfig{},
+		)
+		if err != nil {
+			return schema.AcademicCalendar{}, err
+		}
+		if len(response.Candidates) == 0 || response.Candidates[0].Content == nil || len(response.Candidates[0].Content.Parts) == 0 {
+			return schema.AcademicCalendar{}, fmt.Errorf("empty response from Gemini for %s", filename)
+		}
+
+		// Get response, remove backtick formatting if present
+		result = strings.ReplaceAll(strings.ReplaceAll(response.Candidates[0].Content.Parts[0].Text, "```json", ""), "```", "")
+
+		// Set cache for next time
+		err = setCache(hash, result)
+		if err != nil {
+			return schema.AcademicCalendar{}, err
+		}
+	}
+
+	// Build struct
+	var academicCalendar schema.AcademicCalendar
+	err = json.Unmarshal([]byte(result), &academicCalendar)
+	if err != nil {
+		return schema.AcademicCalendar{}, err
+	}
+
+	return academicCalendar, nil
+}
+
+// Read the text from the first page of a PDF
+func readPdf(path string) (string, error) {
+	// Open the PDF
+	f, r, err := pdf.Open(path)
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	// Make sure at least one page exists
+	if r.NumPage() < 1 {
+		return "", fmt.Errorf("no pages in PDF")
+	}
+
+	// Get the first page
+	page := r.Page(1) // pages are 1-indexed
+	if page.V.IsNull() {
+		return "", fmt.Errorf("failed to read page 1")
+	}
+
+	// Read text
+	var buf bytes.Buffer
+	text := page.Content().Text
+	for _, t := range text {
+		buf.WriteString(t.S) // S is the actual string
+	}
+
+	return buf.String(), nil
+}
+
+// Check cache for a response to the same prompt
+func checkCache(hash string) (string, error) {
+	apiUrl, apiBucket, apiKey, apiStorageKey, err := getNebulaKeys()
+	if err != nil {
+		return "", err
+	}
+
+	client := &http.Client{}
+
+	// Make request
+	req, err := http.NewRequest("GET", apiUrl+"storage/"+apiBucket+"/"+hash, nil)
+	if err != nil {
+		return "", err
+	}
+	req.Header.Add("x-api-key", apiKey)
+	req.Header.Add("x-storage-key", apiStorageKey)
+	resp, err := client.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	// Read the response body
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return "", err
+	}
+	var parsedBody schema.APIResponse[schema.ObjectInfo]
+	err = json.Unmarshal(body, &parsedBody)
+	if err != nil {
+		// If this errors, return ("", nil) to indicate not found
+		return "", nil
+	}
+
+	// Fetch object
+	req, err = http.NewRequest("GET", parsedBody.Data.MediaLink, nil)
+	if err != nil {
+		return "", err
+	}
+	resp, err = client.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	// Read the response body
+	body, err = io.ReadAll(resp.Body)
+	if err != nil {
+		return "", err
+	}
+
+	return string(body), nil
+}
+
+// Upload AI response to cache
+func setCache(hash string, result string) error {
+	apiUrl, apiBucket, apiKey, apiStorageKey, err := getNebulaKeys()
+	if err != nil {
+		return err
+	}
+
+	// Make request
+	bodyReader := bytes.NewBufferString(result)
+	req, err := http.NewRequest("POST", apiUrl+"storage/"+apiBucket+"/"+hash, bodyReader)
+	if err != nil {
+		return err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Add("x-api-key", apiKey)
+	req.Header.Add("x-storage-key", apiStorageKey)
+	client := &http.Client{}
+	resp, err := client.Do(req)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	return nil
+}
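+
+// Cache layout note: responses are keyed by sha256(filled prompt) and stored as
+// <NEBULA_API_URL>storage/<NEBULA_API_STORAGE_BUCKET>/<hash>.json, so re-running the
+// parser on an unchanged PDF (and unchanged prompt) skips the Gemini call entirely.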
+
+// Get all the keys to access the Nebula API storage routes
+func getNebulaKeys() (string, string, string, string, error) {
+	apiUrl, err := utils.GetEnv("NEBULA_API_URL")
+	if err != nil {
+		return "", "", "", "", err
+	}
+	apiBucket, err := utils.GetEnv("NEBULA_API_STORAGE_BUCKET")
+	if err != nil {
+		return "", "", "", "", err
+	}
+	apiKey, err := utils.GetEnv("NEBULA_API_KEY")
+	if err != nil {
+		return "", "", "", "", err
+	}
+	apiStorageKey, err := utils.GetEnv("NEBULA_API_STORAGE_KEY")
+	if err != nil {
+		return "", "", "", "", err
+	}
+
+	return apiUrl, apiBucket, apiKey, apiStorageKey, nil
+}
+
+// Create the client only once.
+// Auth comes from the GOOGLE_GENAI_USE_VERTEXAI, GOOGLE_CLOUD_PROJECT, and GOOGLE_APPLICATION_CREDENTIALS
+// environment variables plus a service account JSON file, which is created from GEMINI_SERVICE_ACCOUNT.
+func getGeminiClient() *genai.Client {
+	once.Do(func() {
+		// Create JSON file
+		serviceAccount, err := utils.GetEnv("GEMINI_SERVICE_ACCOUNT")
+		if err != nil {
+			panic(err)
+		}
+		jsonFile, err := utils.GetEnv("GOOGLE_APPLICATION_CREDENTIALS")
+		if err != nil {
+			panic(err)
+		}
+		err = os.WriteFile(jsonFile, []byte(serviceAccount), 0600) // credentials file: owner read/write only
+		if err != nil {
+			panic(err)
+		}
+
+		// Create client
+		geminiClient, err = genai.NewClient(context.Background(),
+			&genai.ClientConfig{
+				Project:  "api-tools-451421",
+				Location: "us-central1",
+				Backend:  genai.BackendVertexAI,
+			})
+		if err != nil {
+			panic(err)
+		}
+	})
+	return geminiClient
+}
diff --git a/runners/weekly.sh b/runners/weekly.sh
new file mode 100644
index 0000000..4345b71
--- /dev/null
+++ b/runners/weekly.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+# for weekly tasks to run
+
+# scrape, parse, and upload academic calendars
+./api-tools -headless -verbose -scrape -academicCalendars
+./api-tools -headless -verbose -parse -academicCalendars
+./api-tools -headless -verbose -upload -academicCalendars
diff --git a/scrapers/academicCalendars.go b/scrapers/academicCalendars.go
new file mode 100644
index 0000000..ca69bc5
--- /dev/null
+++ b/scrapers/academicCalendars.go
@@ -0,0 +1,133 @@
+/*
+	This file contains the code for the academic calendars scraper.
+*/
+
+package scrapers
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"net/url"
+	"os"
+	"path"
+	"path/filepath"
+
+	"github.com/UTDNebula/api-tools/utils"
+	"github.com/chromedp/cdproto/cdp"
+	"github.com/chromedp/chromedp"
+)
+
+type AcademicCalendar struct {
+	Title string
+	Href  string
+	Time  string
+}
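+
+// Example value (illustrative): AcademicCalendar{Title: "Fall 2025", Href: "https://.../<box-id>", Time: "current"}.
+// Time is "current", "future", or "past" depending on which section of the calendar page the link came from.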
+
+func ScrapeAcademicCalendars(outDir string) {
+	// Start chromedp
+	chromedpCtx, cancel := utils.InitChromeDp()
+
+	// Get sub folder from output folder
+	outSubDir := filepath.Join(outDir, "academicCalendars")
+
+	// Make output folder
+	os.RemoveAll(outSubDir)
+	err := os.MkdirAll(outSubDir, 0777)
+	if err != nil {
+		panic(err)
+	}
+
+	// Go to listings page
+	chromedp.RunResponse(chromedpCtx,
+		chromedp.Navigate(`https://www.utdallas.edu/academics/calendar/`),
+	)
+
+	// Extract data from links
+	// Current
+	academicCalendars := []AcademicCalendar{{"", "", "current"}}
+	chromedp.Run(chromedpCtx, chromedp.TextContent("h2.wp-block-heading", &academicCalendars[0].Title, chromedp.ByQuery))
+	var currentNode []*cdp.Node
+	chromedp.Run(chromedpCtx, chromedp.Nodes("a.wp-block-button__link", &currentNode, chromedp.ByQuery))
+	for i := 0; i < len(currentNode[0].Attributes); i += 2 {
+		if currentNode[0].Attributes[i] == "href" {
+			academicCalendars[0].Href = currentNode[0].Attributes[i+1]
+		}
+	}
+
+	// Future list
+	var futureNodes []*cdp.Node
+	chromedp.Run(chromedpCtx,
+		chromedp.Nodes(`//h2[normalize-space(text())="Future Terms"]/following-sibling::ul[1]//a`, &futureNodes, chromedp.BySearch),
+	)
+	academicCalendars = append(academicCalendars, extractTextAndHref(futureNodes, "future", chromedpCtx)...)
+
+	// Past list
+	var pastNodes []*cdp.Node
+	chromedp.Run(chromedpCtx,
+		chromedp.Nodes(`//h2[normalize-space(text())="Past Terms"]/following-sibling::div[1]//a`, &pastNodes, chromedp.BySearch),
+	)
+	academicCalendars = append(academicCalendars, extractTextAndHref(pastNodes, "past", chromedpCtx)...)
+
+	// Don't need ChromeDP anymore
+	cancel()
+
+	// Download all PDFs
+	for _, academicCalendar := range academicCalendars {
+		downloadPdfFromBox(academicCalendar.Href, academicCalendar.Time+"-"+academicCalendar.Title, outSubDir)
+	}
+}
+
+func extractTextAndHref(nodes []*cdp.Node, time string, chromedpCtx context.Context) []AcademicCalendar {
+	output := []AcademicCalendar{}
+
+	// Extract href and text
+	for _, n := range nodes {
+		var href, text string
+		// Get href attribute
+		for i := 0; i < len(n.Attributes); i += 2 {
+			if n.Attributes[i] == "href" {
+				href = n.Attributes[i+1]
+			}
+		}
+		// Get inner text
+		chromedp.Run(chromedpCtx, chromedp.TextContent(fmt.Sprintf(`a[href="%s"]`, href), &text, chromedp.ByQuery))
+
+		output = append(output, AcademicCalendar{text, href, time})
+	}
+
+	return output
+}
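+
+// downloadPdfFromBox assumes each calendar link points at a Box file whose shared ID is the
+// last path segment, e.g. (illustrative) an href ending in "/abc123" gets fetched from
+// https://utdallas.box.com/shared/static/abc123.pdf.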
+
+func downloadPdfFromBox(href string, filename string, outDir string) {
+	// Create blank file
+	out, err := os.Create(filepath.Join(outDir, fmt.Sprintf("%s.pdf", filename)))
+	if err != nil {
+		panic(err)
+	}
+	defer out.Close()
+
+	// Pull ID from link
+	parsedLink, err := url.Parse(href)
+	if err != nil {
+		panic(err)
+	}
+	fileId := path.Base(parsedLink.Path)
+
+	// Use box download link with ID
+	resp, err := http.Get(fmt.Sprintf("https://utdallas.box.com/shared/static/%s.pdf", fileId))
+	if err != nil {
+		panic(err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		panic(fmt.Errorf("failed to download %s: %s", filename, resp.Status))
+	}
+
+	// Output response to blank file
+	_, err = io.Copy(out, resp.Body)
+	if err != nil {
+		panic(err)
+	}
+
+	log.Printf("Scraped academic calendar %s!", filename)
+}
diff --git a/uploader/academicCalendarUploader.go b/uploader/academicCalendarUploader.go
new file mode 100644
index 0000000..803938a
--- /dev/null
+++ b/uploader/academicCalendarUploader.go
@@ -0,0 +1,49 @@
+/*
+	This file is responsible for handling uploading of parsed academic calendar data to MongoDB.
+*/
+
+package uploader
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"os"
+	"time"
+
+	"github.com/UTDNebula/nebula-api/api/schema"
+	"github.com/joho/godotenv"
+)
+
+// Note that this uploader assumes that the collection name matches the name of this file, which it should.
+// If the name of the collection ever changes, this file name should be updated accordingly.
+
+var academicCalendarFile = "academicCalendars.json"
+
+func UploadAcademicCalendars(inDir string) {
+	// Load env vars
+	if err := godotenv.Load(); err != nil {
+		log.Panic("Error loading .env file")
+	}
+
+	// Connect to mongo
+	client := connectDB()
+
+	// Get 5 minute context
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+
+	// Open data file for reading
+	fptr, err := os.Open(fmt.Sprintf("%s/"+academicCalendarFile, inDir))
+	if err != nil {
+		if os.IsNotExist(err) {
+			log.Panicf("File not found: %s", academicCalendarFile)
+		}
+		log.Panic(err)
+	}
+	defer fptr.Close()
+
+	UploadData[schema.AcademicCalendar](client, ctx, fptr, true)
+}