From cec6ca0a4be812d53d234ab0ba77ab2d72829414 Mon Sep 17 00:00:00 2001
From: mikehquan19
Date: Thu, 13 Mar 2025 19:54:39 -0500
Subject: [PATCH 01/11] Add unit tests for validator

---
 .env.template            |   4 +-
 parser/validator.go      | 155 +++++++------
 parser/validator_test.go | 457 +++++++++++++++++++++++++++++++++++++++
 utils/methods.go         |   5 +-
 4 files changed, 556 insertions(+), 65 deletions(-)
 create mode 100644 parser/validator_test.go

diff --git a/.env.template b/.env.template
index fd04e52..9bb0daf 100644
--- a/.env.template
+++ b/.env.template
@@ -1,4 +1,4 @@
-#Scrapers
+# Scrapers
 LOGIN_NETID=
 LOGIN_PASSWORD=
 LOGIN_ASTRA_USERNAME=
@@ -6,5 +6,5 @@ LOGIN_ASTRA_PASSWORD=
 #Login to https://east.mymazevo.com/main-home then go to https://east.mymazevo.com/api/tenantsettings/GetApiKey
 MAZEVO_API_KEY=
 
-#Uploader
+# Uploader
 MONGODB_URI=
diff --git a/parser/validator.go b/parser/validator.go
index 78ae337..e37da2d 100644
--- a/parser/validator.go
+++ b/parser/validator.go
@@ -2,10 +2,14 @@ package parser
 
 import (
 	"log"
+	"slices"
 
 	"github.com/UTDNebula/api-tools/utils"
+	"github.com/UTDNebula/nebula-api/api/schema"
+	"go.mongodb.org/mongo-driver/bson/primitive"
 )
 
+// Main validation, putting everything together
 func validate() {
 	// Set up deferred handler for panics to display validation fails
 	defer func() {
@@ -16,78 +20,33 @@ func validate() {
 	log.Printf("\nValidating courses...")
 	courseKeys := utils.GetMapKeys(Courses)
-	for i := 0; i < len(courseKeys)-1; i++ {
+	for i := range len(courseKeys) {
 		course1 := Courses[courseKeys[i]]
 		// Check for duplicate courses by comparing course_number, subject_prefix, and catalog_year as a compound key
 		for j := i + 1; j < len(courseKeys); j++ {
 			course2 := Courses[courseKeys[j]]
-			if course2.Catalog_year == course1.Catalog_year && course2.Course_number == course1.Course_number && course2.Subject_prefix == course1.Subject_prefix {
-				log.Printf("Duplicate course found for %s%s!", course1.Subject_prefix, course1.Course_number)
-				log.Printf("Course 1: %v\n\nCourse 2: %v", course1, course2)
-				log.Panic("Courses failed to validate!")
-			}
+			valDuplicateCourses(course1, course2)
 		}
 		// Make sure course isn't referencing any nonexistent sections, and that course-section references are consistent both ways
-		for _, sectionId := range course1.Sections {
-			section, exists := Sections[sectionId]
-			if !exists {
-				log.Printf("Nonexistent section reference found for %s%s!", course1.Subject_prefix, course1.Course_number)
-				log.Printf("Referenced section ID: %s\nCourse ID: %s", sectionId, course1.Id)
-				log.Panic("Courses failed to validate!")
-			}
-			if section.Course_reference != course1.Id {
-				log.Printf("Inconsistent section reference found for %s%s! The course references the section, but not vice-versa!", course1.Subject_prefix, course1.Course_number)
-				log.Printf("Referenced section ID: %s\nCourse ID: %s\nSection course reference: %s", sectionId, course1.Id, section.Course_reference)
-				log.Panic("Courses failed to validate!")
-			}
-		}
+		valCourseReference(course1, Sections)
 	}
 	courseKeys = nil
 	log.Print("No invalid courses!")
 
 	log.Print("Validating sections...")
 	sectionKeys := utils.GetMapKeys(Sections)
-	for i := 0; i < len(sectionKeys)-1; i++ {
+	for i := range len(sectionKeys) {
 		section1 := Sections[sectionKeys[i]]
 		// Check for duplicate sections by comparing section_number, course_reference, and academic_session as a compound key
 		for j := i + 1; j < len(sectionKeys); j++ {
 			section2 := Sections[sectionKeys[j]]
-			if section2.Section_number == section1.Section_number &&
-				section2.Course_reference == section1.Course_reference &&
-				section2.Academic_session == section1.Academic_session {
-				log.Print("Duplicate section found!")
-				log.Printf("Section 1: %v\n\nSection 2: %v", section1, section2)
-				log.Panic("Sections failed to validate!")
-			}
+			valDuplicateSections(section1, section2)
 		}
 		// Make sure section isn't referencing any nonexistent professors, and that section-professor references are consistent both ways
-		for _, profId := range section1.Professors {
-			professorKey, exists := ProfessorIDMap[profId]
-			if !exists {
-				log.Printf("Nonexistent professor reference found for section ID %s!", section1.Id)
-				log.Printf("Referenced professor ID: %s", profId)
-				log.Panic("Sections failed to validate!")
-			}
-			profRefsSection := false
-			for _, profSection := range Professors[professorKey].Sections {
-				if profSection == section1.Id {
-					profRefsSection = true
-					break
-				}
-			}
-			if !profRefsSection {
-				log.Printf("Inconsistent professor reference found for section ID %s! The section references the professor, but not vice-versa!", section1.Id)
-				log.Printf("Referenced professor ID: %s", profId)
-				log.Panic("Sections failed to validate!")
-			}
-		}
+		valSectionReferenceProf(section1, Professors, ProfessorIDMap)
+
 		// Make sure section isn't referencing a nonexistent course
-		_, exists := CourseIDMap[section1.Course_reference]
-		if !exists {
-			log.Printf("Nonexistent course reference found for section ID %s!", section1.Id)
-			log.Printf("Referenced course ID: %s", section1.Course_reference)
-			log.Panic("Sections failed to validate!")
-		}
+		valSectionReferenceCourse(section1, CourseIDMap)
 	}
 	sectionKeys = nil
 	log.Printf("No invalid sections!")
@@ -95,18 +54,92 @@ func validate() {
 	log.Printf("Validating professors...")
 	profKeys := utils.GetMapKeys(Professors)
 	// Check for duplicate professors by comparing first_name, last_name, and sections as a compound key
-	for i := 0; i < len(profKeys)-1; i++ {
+	for i := range len(profKeys) {
 		prof1 := Professors[profKeys[i]]
 		for j := i + 1; j < len(profKeys); j++ {
 			prof2 := Professors[profKeys[j]]
-			if prof2.First_name == prof1.First_name &&
-				prof2.Last_name == prof1.Last_name &&
-				prof2.Profile_uri == prof1.Profile_uri {
-				log.Printf("Duplicate professor found!")
-				log.Printf("Professor 1: %v\n\nProfessor 2: %v", prof1, prof2)
-				log.Panic("Professors failed to validate!")
-			}
+			valDuplicateProfs(prof1, prof2)
 		}
 	}
 	log.Printf("No invalid professors!")
 }
+
+// Validate if the courses are duplicates
+func valDuplicateCourses(course1 *schema.Course, course2 *schema.Course) {
+	if course1.Catalog_year == course2.Catalog_year && course1.Course_number == course2.Course_number &&
+		course1.Subject_prefix == course2.Subject_prefix {
+		log.Printf("Duplicate course found for %s%s!", course1.Subject_prefix, course1.Course_number)
+		log.Printf("Course 1: %v\n\nCourse 2: %v", course1, course2)
+		log.Panic("Courses failed to validate!")
+	}
+}
+
+// Validate course reference to sections
+func valCourseReference(course *schema.Course, sections map[primitive.ObjectID]*schema.Section) {
+	for _, sectionID := range course.Sections {
+		section, exists := sections[sectionID]
+		// validate if the course references a section not in the parsed sections
+		if !exists {
+			log.Printf("Nonexistent section reference found for %s%s!", course.Subject_prefix, course.Course_number)
+			log.Printf("Referenced section ID: %s\nCourse ID: %s", sectionID, course.Id)
+			log.Panic("Courses failed to validate!")
+		}
+
+		// validate that the referenced section references back to the course
+		if section.Course_reference != course.Id {
+			log.Printf("Inconsistent section reference found for %s%s! The course references the section, but not vice-versa!", course.Subject_prefix, course.Course_number)
+			log.Printf("Referenced section ID: %s\nCourse ID: %s\nSection course reference: %s", sectionID, course.Id, section.Course_reference)
+			log.Panic("Courses failed to validate!")
+		}
+	}
+}
+
+// Validate if the sections are duplicates
+func valDuplicateSections(section1 *schema.Section, section2 *schema.Section) {
+	if section1.Section_number == section2.Section_number && section1.Course_reference == section2.Course_reference &&
+		section1.Academic_session == section2.Academic_session {
+		log.Print("Duplicate section found!")
+		log.Printf("Section 1: %v\n\nSection 2: %v", section1, section2)
+		log.Panic("Sections failed to validate!")
+	}
+}
+
+// Validate section reference to professor
+func valSectionReferenceProf(section *schema.Section, profs map[string]*schema.Professor, profIDMap map[primitive.ObjectID]string) {
+	for _, profID := range section.Professors {
+		professorKey, exists := profIDMap[profID]
+		// validate if the section references a professor not in the parsed professors
+		if !exists {
+			log.Printf("Nonexistent professor reference found for section ID %s!", section.Id)
+			log.Printf("Referenced professor ID: %s", profID)
+			log.Panic("Sections failed to validate!")
+		}
+
+		// validate that the referenced professor references back to the section
+		if !slices.Contains(profs[professorKey].Sections, section.Id) {
+			log.Printf("Inconsistent professor reference found for section ID %s! The section references the professor, but not vice-versa!", section.Id)
+			log.Printf("Referenced professor ID: %s", profID)
+			log.Panic("Sections failed to validate!")
+		}
+	}
+}
+
+// Validate section reference to course
+func valSectionReferenceCourse(section *schema.Section, courseIDMap map[primitive.ObjectID]string) {
+	_, exists := courseIDMap[section.Course_reference]
+	// validate if the section references a course not in the parsed courses
+	if !exists {
+		log.Printf("Nonexistent course reference found for section ID %s!", section.Id)
+		log.Printf("Referenced course ID: %s", section.Course_reference)
+		log.Panic("Sections failed to validate!")
+	}
+}
+
+// Validate if the professors are duplicates
+func valDuplicateProfs(prof1 *schema.Professor, prof2 *schema.Professor) {
+	if prof1.First_name == prof2.First_name && prof1.Last_name == prof2.Last_name && prof1.Profile_uri == prof2.Profile_uri {
+		log.Printf("Duplicate professor found!")
+		log.Printf("Professor 1: %v\n\nProfessor 2: %v", prof1, prof2)
+		log.Panic("Professors failed to validate!")
+	}
+}
diff --git a/parser/validator_test.go b/parser/validator_test.go
new file mode 100644
index 0000000..8e629d2
--- /dev/null
+++ b/parser/validator_test.go
@@ -0,0 +1,457 @@
+package parser
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"log"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/UTDNebula/nebula-api/api/schema"
+	"go.mongodb.org/mongo-driver/bson/primitive"
+)
+
+// Globals for testing these validation units
+var testCourses []*schema.Course
+var testSections []*schema.Section
+var testProfessors []*schema.Professor
+
+// Map used to map index of test sections to test courses
+var indexMap map[int]int
+
+// Main to load the test data
+func TestMain(m *testing.M) {
+	// parse the test courses
+	data, err := os.ReadFile("./testdata/courses.json")
+	if err != nil {
+		panic(err)
+	}
+	err = json.Unmarshal(data, &testCourses)
+	if err != nil {
+		panic(err)
+	}
+
+	// parse the test sections
+	data, err = os.ReadFile("./testdata/sections.json")
+	if err != nil {
+		panic(err)
+	}
+	err = json.Unmarshal(data, &testSections)
+	if err != nil {
+		panic(err)
+	}
+
+	// parse the test professors
+	data, err = os.ReadFile("./testdata/professors.json")
+	if err != nil {
+		panic(err)
+	}
+	err = json.Unmarshal(data, &testProfessors)
+	if err != nil {
+		panic(err)
+	}
+
+	// map
+	indexMap = map[int]int{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4}
+
+	testRun := m.Run()
+	os.Exit(testRun)
+}
+
+// Test duplicate courses. Designed for fail cases
+func TestDuplicateCoursesFail(t *testing.T) {
+	for i := range len(testCourses) {
+		t.Run(fmt.Sprintf("Duplicate course %v", i), func(t *testing.T) {
+			testDuplicateFail("course", i, t)
+		})
+	}
+}
+
+// Test duplicate sections. Designed for fail cases
+func TestDuplicateSectionsFail(t *testing.T) {
+	for i := range len(testSections) {
+		t.Run(fmt.Sprintf("Duplicate section %v", i), func(t *testing.T) {
+			testDuplicateFail("section", i, t)
+		})
+	}
+}
+
+// Test duplicate professors. Designed for fail cases
+func TestDuplicateProfFail(t *testing.T) {
+	for i := range len(testProfessors) {
+		t.Run(fmt.Sprintf("Duplicate professor %v", i), func(t *testing.T) {
+			testDuplicateFail("professor", i, t)
+		})
+	}
+}
+
+// Test duplicate courses. Designed for pass cases
+func TestDuplicateCoursesPass(t *testing.T) {
+	for i := range len(testCourses) - 1 {
+		t.Run(fmt.Sprintf("Duplicate courses %v, %v", i, i+1), func(t *testing.T) {
+			testDuplicatePass("course", i, i+1, t)
+		})
+	}
+}
+
+// Test duplicate sections. Designed for pass cases
+func TestDuplicateSectionsPass(t *testing.T) {
+	for i := range len(testSections) - 1 {
+		t.Run(fmt.Sprintf("Duplicate sections %v, %v", i, i+1), func(t *testing.T) {
+			testDuplicatePass("section", i, i+1, t)
+		})
+	}
+}
+
+// Test duplicate professors. Designed for pass cases
+func TestDuplicateProfPass(t *testing.T) {
+	for i := range len(testProfessors) - 1 {
+		t.Run(fmt.Sprintf("Duplicate professors %v, %v", i, i+1), func(t *testing.T) {
+			testDuplicatePass("professor", i, i+1, t)
+		})
+	}
+}
+
+// Test if course references anything nonexistent. Designed for pass case
+func TestCourseReferencePass(t *testing.T) {
+	sectionMap := make(map[primitive.ObjectID]*schema.Section)
+	for _, section := range testSections {
+		sectionMap[section.Id] = section
+	}
+
+	// Buffer to capture the output
+	var logBuffer bytes.Buffer
+	log.SetOutput(&logBuffer)
+
+	defer func() {
+		logOutput := logBuffer.String()
+
+		if logOutput != "" {
+			t.Errorf("Expected nothing printed in log")
+		}
+		if r := recover(); r != nil {
+			t.Errorf("The function panicked unexpectedly for course")
+		}
+	}()
+
+	// Run func
+	for _, course := range testCourses {
+		valCourseReference(course, sectionMap)
+	}
+}
+
+// Test if the function logs expected msgs when course references non-existent sections
+// 2 types of fail:
+// - Course references non-existent section
+// - Section doesn't reference back to same course
+//
+// This is fail type 1
+func TestCourseReferenceFail1(t *testing.T) {
+	for key, value := range indexMap {
+		t.Run(fmt.Sprintf("Section %v & course %v", key, value), func(t *testing.T) {
+			testCourseReferenceFail(1, value, key, t)
+		})
+	}
+}
+
+// This is fail type 2
+func TestCourseReferenceFail2(t *testing.T) {
+	for key, value := range indexMap {
+		t.Run(fmt.Sprintf("Section %v & course %v", key, value), func(t *testing.T) {
+			testCourseReferenceFail(2, value, key, t)
+		})
+	}
+}
+
+// Test section reference to professor, designed for pass case
+func TestSectionReferenceProfPass(t *testing.T) {
+	// Build profIDMap & profs
+	profIDMap := make(map[primitive.ObjectID]string)
+	profs := make(map[string]*schema.Professor)
+
+	for _, professor := range testProfessors {
+		profIDMap[professor.Id] = professor.First_name + professor.Last_name
+		profs[professor.First_name+professor.Last_name] = professor
+	}
+
+	var logBuffer bytes.Buffer
+	log.SetOutput(&logBuffer)
+
+	defer func() {
+		logOutput := logBuffer.String()
+
+		if logOutput != "" {
+			t.Errorf("Expected nothing printed in log")
+		}
+		if r := recover(); r != nil {
+			t.Errorf("The function panicked unexpectedly for section")
+		}
+	}()
+
+	for _, section := range testSections {
+		valSectionReferenceProf(section, profs, profIDMap)
+	}
+}
+
+// Test section reference to professors, designed for fail case
+func TestSectionReferenceProfFail(t *testing.T) {
+	profIDMap := make(map[primitive.ObjectID]string)
+	profs := make(map[string]*schema.Professor)
+
+	for i, professor := range testProfessors {
+		if i != 0 {
+			profIDMap[professor.Id] = professor.First_name + professor.Last_name
+			profs[professor.First_name+professor.Last_name] = professor
+		}
+	}
+
+	var logBuffer bytes.Buffer
+	log.SetOutput(&logBuffer)
+
+	defer func() {
+		logOutput := logBuffer.String()
+
+		for _, msg := range []string{
+			"Nonexistent professor reference found for section ID ObjectID(\"67d07ee0c972c18731e23bea\")!",
+			"Referenced professor ID: ObjectID(\"67d07ee0c972c18731e23beb\")",
+		} {
+			if !strings.Contains(logOutput, msg) {
+				t.Errorf("The function didn't log the correct message. Expected \"%v\"", msg)
+			}
+		}
+
+		if r := recover(); r == nil {
+			t.Errorf("The function didn't panic")
+		} else {
+			if r != "Sections failed to validate!" {
+				t.Errorf("The function panicked with the wrong message")
+			}
+		}
+	}()
+
+	for _, section := range testSections {
+		valSectionReferenceProf(section, profs, profIDMap)
+	}
+}
+
+// Test section reference to course
+func TestSectionReferenceCourse(t *testing.T) {
+	courseIDMap := make(map[primitive.ObjectID]string)
+	for _, course := range testCourses {
+		courseIDMap[course.Id] = course.Internal_course_number + course.Catalog_year
+	}
+
+	var logBuffer bytes.Buffer
+	log.SetOutput(&logBuffer)
+
+	defer func() {
+		logOutput := logBuffer.String()
+
+		if logOutput != "" {
+			t.Errorf("Expected nothing printed in log")
+		}
+		if r := recover(); r != nil {
+			t.Errorf("The function panicked unexpectedly for section")
+		}
+	}()
+
+	for _, section := range testSections {
+		valSectionReferenceCourse(section, courseIDMap)
+	}
+}
+
+/* BELOW HERE ARE HELPER FUNCTIONS FOR TESTS ABOVE */
+
+// Helper function
+// Test if validate() throws errors when encountering duplicates
+// Designed for fail cases
+func testDuplicateFail(objType string, index int, t *testing.T) {
+	// the buffer used to capture the log output
+	var logBuffer bytes.Buffer
+	log.SetOutput(&logBuffer)
+
+	// determine the expected msgs and panic msgs based on object type
+	var expectedMsgs []string
+	var panicMsg string
+
+	switch objType {
+	case "course":
+		failCourse := testCourses[index]
+
+		// list of msgs it must print
+		expectedMsgs = []string{
+			fmt.Sprintf("Duplicate course found for %s%s!", failCourse.Subject_prefix, failCourse.Course_number),
+			fmt.Sprintf("Course 1: %v\n\nCourse 2: %v", failCourse, failCourse),
+		}
+		panicMsg = "Courses failed to validate!"
+	case "section":
+		failSection := testSections[index]
+
+		expectedMsgs = []string{
+			"Duplicate section found!",
+			fmt.Sprintf("Section 1: %v\n\nSection 2: %v", failSection, failSection),
+		}
+		panicMsg = "Sections failed to validate!"
+	case "professor":
+		failProf := testProfessors[index]
+
+		expectedMsgs = []string{
+			"Duplicate professor found!",
+			fmt.Sprintf("Professor 1: %v\n\nProfessor 2: %v", failProf, failProf),
+		}
+		panicMsg = "Professors failed to validate!"
+	}
+
+	defer func() {
+		logOutput := logBuffer.String() // log output after running the function
+
+		// log output needs to contain lines in the list
+		for _, msg := range expectedMsgs {
+			if !strings.Contains(logOutput, msg) {
+				t.Errorf("Expected the message for %v: %v", objType, msg)
+			}
+		}
+
+		// test whether func panics and sends the correct panic msg
+		if r := recover(); r == nil {
+			t.Errorf("The function didn't panic for %v", objType)
+		} else {
+			if r != panicMsg {
+				// The panic msg is incorrect
+				t.Errorf("The function outputted the wrong panic message for %v.", objType)
+			}
+		}
+	}()
+
+	// Run func
+	switch objType {
+	case "course":
+		valDuplicateCourses(testCourses[index], testCourses[index])
+	case "section":
+		valDuplicateSections(testSections[index], testSections[index])
+	case "professor":
+		valDuplicateProfs(testProfessors[index], testProfessors[index])
+	}
+}
+
+// Helper function
+// Test if func doesn't log anything and doesn't panic.
+// Designed for pass cases
+func testDuplicatePass(objType string, index1 int, index2 int, t *testing.T) {
+	// Buffer to capture the output
+	var logBuffer bytes.Buffer
+	log.SetOutput(&logBuffer)
+
+	defer func() {
+		logOutput := logBuffer.String()
+		if logOutput != "" {
+			t.Errorf("Expected nothing in log for " + objType)
+		}
+		if r := recover(); r != nil {
+			t.Errorf("The function panicked unexpectedly for " + objType)
+		}
+	}()
+
+	// Run func according to the object type. Choose pair of objects which are not duplicates
+	switch objType {
+	case "course":
+		valDuplicateCourses(testCourses[index1], testCourses[index2])
+	case "section":
+		valDuplicateSections(testSections[index1], testSections[index2])
+	case "professor":
+		valDuplicateProfs(testProfessors[index1], testProfessors[index2])
+	}
+}
+
+// Helper function for the case of course reference that fails
+// failType: 1 means it lacks one section
+// failType: 2 means one section's course reference has been modified
+func testCourseReferenceFail(failType int, courseIndex int, sectionIndex int, t *testing.T) {
+	sectionMap := make(map[primitive.ObjectID]*schema.Section)
+
+	var sectionID, originalID primitive.ObjectID // used to store IDs of modified sections
+
+	// Build the failed section map based on fail type
+	if failType == 1 {
+		// misses a section
+		for i, section := range testSections {
+			if sectionIndex != i {
+				sectionMap[section.Id] = section
+			} else {
+				sectionID = section.Id // Nonexistent ID referenced by course
+			}
+		}
+	} else {
+		// one section doesn't reference the correct course
+		for i, section := range testSections {
+			sectionMap[section.Id] = section
+			if sectionIndex == i {
+				// save the section ID and original course reference to be restored later on
+				sectionID = section.Id
+				originalID = section.Course_reference
+
+				// modify part
+				sectionMap[section.Id].Course_reference = primitive.NewObjectID()
+			}
+		}
+	}
+
+	// Expected msgs
+	var expectedMsgs []string
+
+	// The course that references nonexistent stuff
+	var failCourse *schema.Course
+
+	if failType == 1 {
+		failCourse = testCourses[courseIndex]
+
+		expectedMsgs = []string{
+			fmt.Sprintf("Nonexistent section reference found for %v%v!", failCourse.Subject_prefix, failCourse.Course_number),
+			fmt.Sprintf("Referenced section ID: %s\nCourse ID: %s", sectionID, failCourse.Id),
+		}
+	} else {
+		failCourse = testCourses[courseIndex]
+		failSection := testSections[sectionIndex]
+
+		expectedMsgs = []string{
+			fmt.Sprintf("Inconsistent section reference found for %v%v! The course references the section, but not vice-versa!",
+				failCourse.Subject_prefix, failCourse.Course_number),
+			fmt.Sprintf("Referenced section ID: %s\nCourse ID: %s\nSection course reference: %s",
+				failSection.Id, failCourse.Id, failSection.Course_reference),
+		}
+	}
+
+	// Buffer to capture the output
+	var logBuffer bytes.Buffer
+	log.SetOutput(&logBuffer)
+
+	defer func() {
+		logOutput := logBuffer.String()
+
+		for _, msg := range expectedMsgs {
+			if !strings.Contains(logOutput, msg) {
+				t.Errorf("The function didn't log the correct message. Expected \"%v\"", msg)
+			}
+		}
+
+		// restore to original course reference of modified section (if needed)
+		if failType == 2 {
+			sectionMap[sectionID].Course_reference = originalID
+		}
+
+		if r := recover(); r == nil {
+			t.Errorf("The function didn't panic")
+		} else {
+			if r != "Courses failed to validate!" {
+				t.Errorf("The function panicked with the wrong message")
+			}
+		}
+	}()
+
+	// Run func
+	for _, course := range testCourses {
+		valCourseReference(course, sectionMap)
+	}
+}
diff --git a/utils/methods.go b/utils/methods.go
index 9c97324..4d01fc1 100644
--- a/utils/methods.go
+++ b/utils/methods.go
@@ -38,13 +38,13 @@ func GetEnv(name string) (string, error) {
 func InitChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc) {
 	log.Printf("Initializing chromedp...")
 	if Headless {
-		chromedpCtx, cancelFnc = chromedp.NewContext(context.Background())
+		chromedpCtx, cancelFnc = chromedp.NewContext(context.Background(), chromedp.WithDebugf(log.Printf))
 	} else {
 		allocCtx, _ := chromedp.NewExecAllocator(context.Background())
 		chromedpCtx, cancelFnc = chromedp.NewContext(allocCtx)
 	}
 	log.Printf("Initialized chromedp!")
-	return
+	return chromedpCtx, cancelFnc
 }
 
 // This function generates a fresh auth token and returns the new headers
@@ -262,6 +262,7 @@ func RetryHTTP(requestCreator func() *http.Request, client *http.Client, retryCa
 	return res, err
 }
 
+// Get all the available course prefixes
 func GetCoursePrefixes(chromedpCtx context.Context) []string {
 	// Refresh the token
 	// refreshToken(chromedpCtx)
From 627137d1861e40607775439b73ef87d3ea15b96a Mon Sep 17 00:00:00 2001
From: mikehquan19
Date: Thu, 13 Mar 2025 20:11:44 -0500
Subject: [PATCH 02/11] Add unit test for validator

---
 parser/validator_test.go | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/parser/validator_test.go b/parser/validator_test.go
index 8e629d2..e1ca364 100644
--- a/parser/validator_test.go
+++ b/parser/validator_test.go
@@ -21,8 +21,7 @@ var testProfessors []*schema.Professor
 // Map used to map index of test sections to test courses
 var indexMap map[int]int
 
-// Main to load the test data
-func TestMain(m *testing.M) {
+func init() {
 	// parse the test courses
 	data, err := os.ReadFile("./testdata/courses.json")
 	if err != nil {
@@ -53,11 +52,7 @@
 		panic(err)
 	}
 
-	// map
 	indexMap = map[int]int{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4}
-
-	testRun := m.Run()
-	os.Exit(testRun)
 }
 
 // Test duplicate courses. Designed for fail cases
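A side note on the duplicate checks refactored in patch 01: valDuplicateCourses and its siblings are still driven by pairwise O(n²) loops over the key sets. The same detection can be done in one pass with a compound-key map; this is only a sketch, assuming the compared schema.Course fields print stably with %v, and findDuplicateCourses is an illustrative name rather than part of this series:

package parser

import (
	"fmt"
	"log"

	"github.com/UTDNebula/nebula-api/api/schema"
)

// findDuplicateCourses builds a compound key from the same three fields the
// pairwise loop compares, so each course is visited once instead of once per pair.
func findDuplicateCourses(courses []*schema.Course) {
	seen := make(map[string]*schema.Course, len(courses))
	for _, course := range courses {
		key := fmt.Sprintf("%v|%v|%v", course.Subject_prefix, course.Course_number, course.Catalog_year)
		if other, found := seen[key]; found {
			log.Printf("Course 1: %v\n\nCourse 2: %v", other, course)
			log.Panic("Courses failed to validate!")
		}
		seen[key] = course
	}
}

The same shape would work for sections (section_number, course_reference, academic_session) and professors (first_name, last_name, profile_uri).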
From f1a2f92a125b4215c1a239310795b8a62a25cc4d Mon Sep 17 00:00:00 2001
From: mikehquan19
Date: Mon, 21 Apr 2025 02:08:39 -0500
Subject: [PATCH 03/11] Refactor calendar scraper

---
 scrapers/calendar.go | 64 +++++++++++++++++++++++++++++---------------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/scrapers/calendar.go b/scrapers/calendar.go
index 8a130dd..176737c 100644
--- a/scrapers/calendar.go
+++ b/scrapers/calendar.go
@@ -39,18 +39,17 @@ func ScrapeCalendar(outDir string) {
 	events := []schema.Event{}
 
 	log.Printf("Scraping event page links")
-	//Grab all links to event pages
+	// Grab all links to event pages
 	var pageLinks []string = []string{}
 	_, err = chromedp.RunResponse(chromedpCtx,
 		chromedp.Navigate(CALENDAR_LINK),
-		chromedp.QueryAfter(".item.event_item.vevent > a",
+		chromedp.QueryAfter(".em-card_image > a",
 			func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
 				for _, node := range nodes {
 					href, hasHref := node.Attribute("href")
 					if !hasHref {
 						return errors.New("event card was missing an href")
 					}
-
 					pageLinks = append(pageLinks, href)
 				}
 				return nil
@@ -61,13 +60,17 @@ func ScrapeCalendar(outDir string) {
 		panic(err)
 	}
 	log.Printf("Scraped event page links!")
+	for _, page := range pageLinks {
+		// Print the links of the page
+		log.Println(page)
+	}
 
 	for _, page := range pageLinks {
-		//Navigate to page and get page summary
+		// Navigate to page and get page summary
 		summary := ""
 		_, err := chromedp.RunResponse(chromedpCtx,
 			chromedp.Navigate(page),
-			chromedp.QueryAfter(".summary",
+			chromedp.QueryAfter(".em-card_title",
 				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
 					if len(nodes) != 0 {
 						summary = trailingSpaceRegex.ReplaceAllString(getNodeText(nodes[0]), "")
@@ -126,13 +129,26 @@ func ScrapeCalendar(outDir string) {
 		}
 		utils.VPrintf("Scraped time: %s to %s ", dateTimeStart, dateTimeEnd)
 
-		//Grab Location of Event
-		var location string = ""
+		// Grab Location of Event
+
+		// If .location doesn't have children, then it's a virtual event
+		var location string = "Virtual Event" // Default
 		err = chromedp.Run(chromedpCtx,
+			// Grab the name of the location
+			chromedp.QueryAfter("p.location > a",
+				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
+					if len(nodes) != 0 {
+						location = getNodeText(nodes[0]) + "\n "
+					}
+					return nil
+				}, chromedp.AtLeast(0),
+			),
+			// Grab the address of the location (concatenated with the name)
 			chromedp.QueryAfter("p.location > span",
 				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
 					if len(nodes) != 0 {
-						location = getNodeText(nodes[0])
+						location += getNodeText(nodes[0])
 					}
 					return nil
 				}, chromedp.AtLeast(0),
@@ -143,13 +159,15 @@ func ScrapeCalendar(outDir string) {
 		}
 		utils.VPrintf("Scraped location: %s, ", location)
 
-		//Get description of event
+		// Get description of event
 		var description string = ""
 		err = chromedp.Run(chromedpCtx,
-			chromedp.QueryAfter(".description > p",
+			chromedp.QueryAfter(".em-about_description > p",
 				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					if len(nodes) != 0 {
-						description = getNodeText(nodes[0])
+					for _, node := range nodes {
+						if getNodeText(node) != "" {
+							description += getNodeText(node) + "\n\n"
+						}
 					}
 					return nil
 				}, chromedp.AtLeast(0),
@@ -160,7 +178,7 @@ func ScrapeCalendar(outDir string) {
 		}
 		utils.VPrintf("Scraped description: %s, ", description)
 
-		//Grab Event Type
+		// Grab Event Type
 		var eventType []string = []string{}
 		err = chromedp.Run(chromedpCtx,
 			chromedp.QueryAfter(".filter-event_types > p > a",
@@ -177,7 +195,7 @@ func ScrapeCalendar(outDir string) {
 		}
 		utils.VPrintf("Scraped event type: %s", eventType)
 
-		//Grab Target Audience
+		// Grab Target Audience
 		targetAudience := []string{}
 		err = chromedp.Run(chromedpCtx,
 			chromedp.QueryAfter(".filter-event_target_audience > p > a",
@@ -194,7 +212,7 @@ func ScrapeCalendar(outDir string) {
 		}
 		utils.VPrintf("Scraped target audience: %s, ", targetAudience)
 
-		//Grab Topic
+		// Grab Topic
 		topic := []string{}
 		err = chromedp.Run(chromedpCtx,
 			chromedp.QueryAfter(".filter-event_topic > p > a",
@@ -211,7 +229,7 @@ func ScrapeCalendar(outDir string) {
 		}
 		utils.VPrintf("Scraped topic: %s, ", topic)
 
-		//Grab Event Tags
+		// Grab Event Tags
 		tags := []string{}
 		err = chromedp.Run(chromedpCtx,
 			chromedp.QueryAfter(".event-tags > p > a",
@@ -228,7 +246,7 @@ func ScrapeCalendar(outDir string) {
 		}
 		utils.VPrintf("Scraped tags: %s, ", tags)
 
-		//Grab Website
+		// Grab Website
 		var eventWebsite string = ""
 		err = chromedp.Run(chromedpCtx,
 			chromedp.QueryAfter(".event-website > p > a",
@@ -249,7 +267,7 @@ func ScrapeCalendar(outDir string) {
 		}
 		utils.VPrintf("Scraped website: %s, ", eventWebsite)
 
-		//Grab Department
+		// Grab Department
 		var eventDepartment []string = []string{}
 		err = chromedp.Run(chromedpCtx,
 			chromedp.QueryAfter(".event-group > a",
@@ -266,7 +284,7 @@ func ScrapeCalendar(outDir string) {
 		}
 		utils.VPrintf("Scraped department: %s, ", eventDepartment)
 
-		//Grab Contact information
+		// Grab Contact information
 		var contactInformationName string = ""
 		var contactInformationEmail string = ""
 		var contactInformationPhone string = ""
@@ -279,10 +297,14 @@ func ScrapeCalendar(outDir string) {
 				return nil
 			}, chromedp.AtLeast(0),
 			),
-			chromedp.QueryAfter(".custom-field-contact_information_email",
+			chromedp.QueryAfter(".custom-field-contact_information_email > a",
 				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
 					if len(nodes) != 0 {
-						contactInformationEmail = getNodeText(nodes[0])
+						emailHref, hasEmailHref := nodes[0].Attribute("href")
+						if !hasEmailHref {
+							return errors.New("event contact doesn't have email")
+						}
+						contactInformationEmail = emailHref[7:]
 					}
 					return nil
 				}, chromedp.AtLeast(0),
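One detail of the description loop added in patch 03: appending "\n\n" after every paragraph leaves a trailing separator on the final string. Collecting the non-empty paragraphs and joining them avoids that; a minimal sketch (joinParagraphs is an illustrative helper, not part of the patch):

package scrapers

import "strings"

// joinParagraphs concatenates non-empty paragraphs with blank lines between
// them, without leaving a trailing separator after the last one.
func joinParagraphs(paragraphs []string) string {
	kept := make([]string, 0, len(paragraphs))
	for _, p := range paragraphs {
		if p != "" {
			kept = append(kept, p)
		}
	}
	return strings.Join(kept, "\n\n")
}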
From 27bcf6395b4e022ffe3af1ebb1b7107cce1e93b7 Mon Sep 17 00:00:00 2001
From: mikehquan19
Date: Mon, 21 Apr 2025 02:29:43 -0500
Subject: [PATCH 04/11] Just revert the previous erroneous commit

---
 utils/methods.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/methods.go b/utils/methods.go
index 2398351..2195240 100644
--- a/utils/methods.go
+++ b/utils/methods.go
@@ -38,7 +38,7 @@ func GetEnv(name string) (string, error) {
 func InitChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc) {
 	log.Printf("Initializing chromedp...")
 	if Headless {
-		chromedpCtx, cancelFnc = chromedp.NewContext(context.Background(), chromedp.WithDebugf(log.Printf))
+		chromedpCtx, cancelFnc = chromedp.NewContext(context.Background())
 	} else {
 		allocCtx, _ := chromedp.NewExecAllocator(context.Background())
 		chromedpCtx, cancelFnc = chromedp.NewContext(allocCtx)

From 616ead81a03055a4e515394e96ce775aef7c7cbe Mon Sep 17 00:00:00 2001
From: mikehquan19
Date: Mon, 21 Apr 2025 04:03:38 -0500
Subject: [PATCH 05/11] Some minor fixes and comments for readability

---
 scrapers/calendar.go | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/scrapers/calendar.go b/scrapers/calendar.go
index 176737c..1627f95 100644
--- a/scrapers/calendar.go
+++ b/scrapers/calendar.go
@@ -25,6 +25,7 @@ import (
 const CALENDAR_LINK string = "https://calendar.utdallas.edu/calendar"
 
 var trailingSpaceRegex *regexp.Regexp = regexp.MustCompile(`(\s{2,}?\s{2,})|(\n)`)
+var leadingSpaceRegex *regexp.Regexp = regexp.MustCompile(`^\s+`)
 
 func ScrapeCalendar(outDir string) {
 
@@ -61,7 +62,7 @@ func ScrapeCalendar(outDir string) {
 	}
 	log.Printf("Scraped event page links!")
 	for _, page := range pageLinks {
-		// Print the links of the page
+		// Print the links of the page to check
 		log.Println(page)
 	}
 
@@ -139,7 +140,8 @@ func ScrapeCalendar(outDir string) {
 			chromedp.QueryAfter("p.location > a",
 				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
 					if len(nodes) != 0 {
-						location = getNodeText(nodes[0]) + "\n "
+						// Location's name somehow contains leading space, trim it
+						location = leadingSpaceRegex.ReplaceAllString(getNodeText(nodes[0]), "")
 					}
 					return nil
 				}, chromedp.AtLeast(0),
@@ -148,7 +150,10 @@ func ScrapeCalendar(outDir string) {
 			chromedp.QueryAfter("p.location > span",
 				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
 					if len(nodes) != 0 {
-						location += getNodeText(nodes[0])
+						// There are cases where it doesn't show the address
+						if getNodeText(nodes[0]) != "" {
+							location += "\n" + getNodeText(nodes[0])
+						}
 					}
 					return nil
 				}, chromedp.AtLeast(0),
@@ -164,8 +169,9 @@ func ScrapeCalendar(outDir string) {
 		err = chromedp.Run(chromedpCtx,
 			chromedp.QueryAfter(".em-about_description > p",
 				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
+					// Concatenate all the sentences in the description together
 					for _, node := range nodes {
-						if getNodeText(node) != "" {
+						if getNodeText(node) != "" && getNodeText(node) != "\u00A0" {
 							description += getNodeText(node) + "\n\n"
 						}
 					}
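Patch 05 compiles leadingSpaceRegex (`^\s+`) just to strip leading whitespace from location names. The standard library can do the same without a regex; a roughly equivalent sketch (note that unicode.IsSpace is slightly broader than Go's `\s` class, which matches only ASCII whitespace):

package scrapers

import (
	"strings"
	"unicode"
)

// trimLeadingSpace removes leading whitespace from a scraped location name,
// covering what the ^\s+ regex is used for in the patch.
func trimLeadingSpace(raw string) string {
	return strings.TrimLeftFunc(raw, unicode.IsSpace)
}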
From d5f4b6e1b8c21337adda5a5986037f2fe4347577 Mon Sep 17 00:00:00 2001
From: mikehquan19
Date: Mon, 21 Apr 2025 14:15:25 -0500
Subject: [PATCH 06/11] Adjust the validation test a bit for readability

---
 parser/validator_test.go | 88 ++++++++++++++++++++--------------------
 1 file changed, 44 insertions(+), 44 deletions(-)

diff --git a/parser/validator_test.go b/parser/validator_test.go
index e1ca364..6a9c9ff 100644
--- a/parser/validator_test.go
+++ b/parser/validator_test.go
@@ -18,11 +18,11 @@ var testCourses []*schema.Course
 var testSections []*schema.Section
 var testProfessors []*schema.Professor
 
-// Map used to map index of test sections to test courses
+// Map index of test sections to test courses
 var indexMap map[int]int
 
 func init() {
-	// parse the test courses
+	// Parse the test courses
 	data, err := os.ReadFile("./testdata/courses.json")
 	if err != nil {
 		panic(err)
@@ -32,7 +32,7 @@ func init() {
 		panic(err)
 	}
 
-	// parse the test sections
+	// Parse the test sections
 	data, err = os.ReadFile("./testdata/sections.json")
 	if err != nil {
 		panic(err)
@@ -42,7 +42,7 @@ func init() {
 		panic(err)
 	}
 
-	// parse the test professors
+	// Parse the test professors
 	data, err = os.ReadFile("./testdata/professors.json")
 	if err != nil {
 		panic(err)
@@ -52,6 +52,7 @@ func init() {
 		panic(err)
 	}
 
+	// The correct mapping
 	indexMap = map[int]int{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4}
 }
 
@@ -142,20 +143,20 @@
 // - Section doesn't reference back to same course
 //
-// This is fail type 1
+// This is fail: missing
 func TestCourseReferenceFail1(t *testing.T) {
 	for key, value := range indexMap {
 		t.Run(fmt.Sprintf("Section %v & course %v", key, value), func(t *testing.T) {
-			testCourseReferenceFail(1, value, key, t)
+			testCourseReferenceFail("missing", value, key, t)
 		})
 	}
 }
 
-// This is fail type 2
+// This is fail: modified
 func TestCourseReferenceFail2(t *testing.T) {
 	for key, value := range indexMap {
 		t.Run(fmt.Sprintf("Section %v & course %v", key, value), func(t *testing.T) {
-			testCourseReferenceFail(2, value, key, t)
+			testCourseReferenceFail("modified", value, key, t)
 		})
 	}
 }
@@ -192,6 +193,7 @@
 
 // Test section reference to professors, designed for fail case
 func TestSectionReferenceProfFail(t *testing.T) {
+
 	profIDMap := make(map[primitive.ObjectID]string)
 	profs := make(map[string]*schema.Professor)
 
@@ -257,23 +259,22 @@ func TestSectionReferenceCourse(t *testing.T) {
 	}
 }
 
-/* BELOW HERE ARE HELPER FUNCTIONS FOR TESTS ABOVE */
+/******** BELOW HERE ARE HELPER FUNCTIONS FOR TESTS ABOVE ********/
 
-// Helper function
 // Test if validate() throws errors when encountering duplicates
 // Designed for fail cases
-func testDuplicateFail(objType string, index int, t *testing.T) {
+func testDuplicateFail(objType string, ix int, t *testing.T) {
 	// the buffer used to capture the log output
 	var logBuffer bytes.Buffer
 	log.SetOutput(&logBuffer)
 
-	// determine the expected msgs and panic msgs based on object type
+	// Determine the expected messages and panic messages based on object type
 	var expectedMsgs []string
 	var panicMsg string
 
 	switch objType {
 	case "course":
-		failCourse := testCourses[index]
+		failCourse := testCourses[ix]
 
 		// list of msgs it must print
 		expectedMsgs = []string{
@@ -282,7 +283,7 @@
 		}
 		panicMsg = "Courses failed to validate!"
 	case "section":
-		failSection := testSections[index]
+		failSection := testSections[ix]
 
 		expectedMsgs = []string{
 			"Duplicate section found!",
@@ -290,7 +291,7 @@
 		}
 		panicMsg = "Sections failed to validate!"
 	case "professor":
-		failProf := testProfessors[index]
+		failProf := testProfessors[ix]
 
 		expectedMsgs = []string{
 			"Duplicate professor found!",
@@ -302,14 +303,14 @@
 	defer func() {
 		logOutput := logBuffer.String() // log output after running the function
 
-		// log output needs to contain lines in the list
+		// Log output needs to contain lines in the list
 		for _, msg := range expectedMsgs {
 			if !strings.Contains(logOutput, msg) {
 				t.Errorf("Expected the message for %v: %v", objType, msg)
 			}
 		}
 
-		// test whether func panics and sends the correct panic msg
+		// Test whether func panics and sends the correct panic msg
 		if r := recover(); r == nil {
 			t.Errorf("The function didn't panic for %v", objType)
 		} else {
@@ -323,18 +324,17 @@
 	// Run func
 	switch objType {
 	case "course":
-		valDuplicateCourses(testCourses[index], testCourses[index])
+		valDuplicateCourses(testCourses[ix], testCourses[ix])
 	case "section":
-		valDuplicateSections(testSections[index], testSections[index])
+		valDuplicateSections(testSections[ix], testSections[ix])
 	case "professor":
-		valDuplicateProfs(testProfessors[index], testProfessors[index])
+		valDuplicateProfs(testProfessors[ix], testProfessors[ix])
 	}
 }
 
-// Helper function
 // Test if func doesn't log anything and doesn't panic.
 // Designed for pass cases
-func testDuplicatePass(objType string, index1 int, index2 int, t *testing.T) {
+func testDuplicatePass(objType string, ix1 int, ix2 int, t *testing.T) {
 	// Buffer to capture the output
 	var logBuffer bytes.Buffer
 	log.SetOutput(&logBuffer)
@@ -349,45 +349,45 @@
 		}
 	}()
 
-	// Run func according to the object type. Choose pair of objects which are not duplicates
+	// Run func according to the object type.
+	// Choose pair of objects which are not duplicates
 	switch objType {
 	case "course":
-		valDuplicateCourses(testCourses[index1], testCourses[index2])
+		valDuplicateCourses(testCourses[ix1], testCourses[ix2])
 	case "section":
-		valDuplicateSections(testSections[index1], testSections[index2])
+		valDuplicateSections(testSections[ix1], testSections[ix2])
 	case "professor":
-		valDuplicateProfs(testProfessors[index1], testProfessors[index2])
+		valDuplicateProfs(testProfessors[ix1], testProfessors[ix2])
 	}
 }
 
-// Helper function for the case of course reference that fails
-// failType: 1 means it lacks one section
-// failType: 2 means one section's course reference has been modified
-func testCourseReferenceFail(failType int, courseIndex int, sectionIndex int, t *testing.T) {
+// fail = "missing" means it lacks one section
+// fail = "modified" means one section's course reference has been modified
+func testCourseReferenceFail(fail string, courseIx int, sectionIx int, t *testing.T) {
 	sectionMap := make(map[primitive.ObjectID]*schema.Section)
 
 	var sectionID, originalID primitive.ObjectID // used to store IDs of modified sections
 
 	// Build the failed section map based on fail type
-	if failType == 1 {
-		// misses a section
+	if fail == "missing" {
+		// Misses a section
 		for i, section := range testSections {
-			if sectionIndex != i {
+			if sectionIx != i {
 				sectionMap[section.Id] = section
 			} else {
-				sectionID = section.Id // Nonexistent ID referenced by course
+				sectionID = section.Id // Nonexistent ID referenced by course
 			}
 		}
-	} else {
-		// one section doesn't reference the correct course
+	} else if fail == "modified" {
+		// One section doesn't reference the correct course
 		for i, section := range testSections {
 			sectionMap[section.Id] = section
-			if sectionIndex == i {
-				// save the section ID and original course reference to be restored later on
+			if sectionIx == i {
+				// Save the section ID and original course reference to be restored later on
 				sectionID = section.Id
 				originalID = section.Course_reference
 
-				// modify part
+				// Modified part
 				sectionMap[section.Id].Course_reference = primitive.NewObjectID()
 			}
 		}
@@ -399,16 +399,16 @@
 	// The course that references nonexistent stuff
 	var failCourse *schema.Course
 
-	if failType == 1 {
-		failCourse = testCourses[courseIndex]
+	if fail == "missing" {
+		failCourse = testCourses[courseIx]
 
 		expectedMsgs = []string{
 			fmt.Sprintf("Nonexistent section reference found for %v%v!", failCourse.Subject_prefix, failCourse.Course_number),
 			fmt.Sprintf("Referenced section ID: %s\nCourse ID: %s", sectionID, failCourse.Id),
 		}
 	} else {
-		failCourse = testCourses[courseIndex]
-		failSection := testSections[sectionIndex]
+		failCourse = testCourses[courseIx]
+		failSection := testSections[sectionIx]
 
 		expectedMsgs = []string{
 			fmt.Sprintf("Inconsistent section reference found for %v%v! The course references the section, but not vice-versa!",
 				failCourse.Subject_prefix, failCourse.Course_number),
@@ -431,8 +431,8 @@
 		}
 	}
 
-	// restore to original course reference of modified section (if needed)
-	if failType == 2 {
+	// Restore to original course reference of modified section (if needed)
+	if fail == "modified" {
 		sectionMap[sectionID].Course_reference = originalID
 	}
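Throughout the validator tests touched in patches 01-06, each test redirects the global logger with log.SetOutput(&logBuffer) and never restores it, so later logging lands in a stale buffer. A small helper can scope the capture to a single test; a sketch assuming only the standard library (captureLog is an illustrative name, and os.Stderr is log's default writer):

package parser

import (
	"bytes"
	"log"
	"os"
	"testing"
)

// captureLog redirects the global logger into a buffer and registers a
// cleanup that restores the default writer when the test finishes.
func captureLog(t *testing.T) *bytes.Buffer {
	var buf bytes.Buffer
	log.SetOutput(&buf)
	t.Cleanup(func() { log.SetOutput(os.Stderr) })
	return &buf
}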
From 401e11559b3399b6093ed00cc5d9180e8d086284 Mon Sep 17 00:00:00 2001
From: mikehquan19
Date: Fri, 1 Aug 2025 14:34:56 -0500
Subject: [PATCH 07/11] Implement API approach to getting calendar data

---
 scrapers/calendar.go | 120 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 117 insertions(+), 3 deletions(-)

diff --git a/scrapers/calendar.go b/scrapers/calendar.go
index 1627f95..7cf89eb 100644
--- a/scrapers/calendar.go
+++ b/scrapers/calendar.go
@@ -5,11 +5,13 @@
 package scrapers
 
 import (
+	"bytes"
 	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"log"
+	"net/http"
 	"os"
 	"regexp"
 	"time"
@@ -132,15 +134,15 @@ func ScrapeCalendar(outDir string) {
 
 		// Grab Location of Event
 
-		// If .location doesn't have children, then it's a virtual event
-		var location string = "Virtual Event" // Default
+		// If p.location doesn't have children, then it's a virtual event
+		var location string = "Virtual Event"
 		err = chromedp.Run(chromedpCtx,
 			// Grab the name of the location
 			chromedp.QueryAfter("p.location > a",
 				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
 					if len(nodes) != 0 {
-						// Location's name somehow contains leading space, trim it
+						// Location's name somehow contains leading space, so trim it
 						location = leadingSpaceRegex.ReplaceAllString(getNodeText(nodes[0]), "")
 					}
 					return nil
@@ -310,6 +312,7 @@ func ScrapeCalendar(outDir string) {
 					if !hasEmailHref {
 						return errors.New("event contact doesn't have email")
 					}
+					// Slicing the text to exclude "mailto:"
 					contactInformationEmail = emailHref[7:]
 				}
 				return nil
@@ -363,3 +366,114 @@ func ScrapeCalendar(outDir string) {
 	encoder.Encode(events)
 	fptr.Close()
 }
+
+// Get the calendar data through API instead of scraping from website
+func ScrapeAPICalendar(outDir string) {
+	err := os.MkdirAll(outDir, 0777)
+	if err != nil {
+		panic(err)
+	}
+	client := http.Client{Timeout: 30 * time.Second}
+	var events []schema.Event
+
+	for i := range 1 {
+		// Set up the API Request
+		calendarUrl := fmt.Sprintf("https://calendar.utdallas.edu/api/2/events?days=365&pp=100&page=%d", i)
+		req, err := http.NewRequest("GET", calendarUrl, nil)
+		if err != nil {
+			panic(err)
+		}
+
+		// Call API to get the response
+		res, err := client.Do(req)
+		if err != nil {
+			panic(err)
+		}
+		if res != nil && res.StatusCode != 200 {
+			log.Panicf("ERROR: Status was %s", res.Status)
+		}
+
+		buffer := bytes.Buffer{}
+		buffer.ReadFrom(res.Body)
+		res.Body.Close()
+
+		// Structure of the API response
+		type RawEvent struct {
+			Event map[string]any `json:"event"`
+		}
+
+		type APICalendarResponse struct {
+			Events []RawEvent        `json:"events"`
+			Page   map[string]int    `json:"page"`
+			Date   map[string]string `json:"date"`
+		}
+
+		var responseData APICalendarResponse
+		if err := json.Unmarshal(buffer.Bytes(), &responseData); err != nil {
+			panic(err)
+		}
+
+		for _, rawEvent := range responseData.Events {
+			filters := pullMap(rawEvent.Event["filters"])
+			eventType := []string{}
+			eventTopic := []string{}
+			eventAudience := []string{}
+
+			// Parse the event types, event topic, and event target audience
+			rawTypes := pullSlice(filters["event_types"])
+			for _, rawType := range rawTypes {
+				eventType = append(eventType, pullMap(rawType)["name"].(string))
+			}
+
+			rawTopic := pullSlice(filters["event_topic"])
+			for _, topic := range rawTopic {
+				eventTopic = append(eventTopic, pullMap(topic)["name"].(string))
+			}
+
+			rawAudience := pullSlice(filters["event_target_audience"])
+			for _, audience := range rawAudience {
+				eventAudience = append(eventAudience, pullMap(audience)["name"].(string))
+			}
+
+			// Parse the event departments
+			departments := []string{}
+			rawDepartments := pullSlice(rawEvent.Event["departments"])
+			for _, department := range rawDepartments {
+				departments = append(departments, pullMap(department)["name"].(string))
+			}
+
+			events = append(events, schema.Event{
+				Id:             primitive.NewObjectID(),
+				EventType:      eventType,
+				TargetAudience: eventAudience,
+				Topic:          eventTopic,
+				Department:     departments,
+			})
+		}
+	}
+
+	fptr, err := os.Create(fmt.Sprintf("%s/events1.json", outDir))
+	if err != nil {
+		panic(err)
+	}
+	encoder := json.NewEncoder(fptr)
+	encoder.SetIndent("", "\t")
+	encoder.Encode(events)
+	fptr.Close()
+}
+
+// Casting an any to a slice of any
+func pullSlice(data any) []any {
+	if array, ok := data.([]any); ok {
+		return array
+	}
+	return nil
+}
+
+// Casting an any to a map from string to any
+func pullMap(data any) map[string]any {
+	if dataMap, ok := data.(map[string]any); ok {
+		return dataMap
+	}
+	return nil
+}
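Patch 07 walks the events payload through map[string]any with the pullSlice/pullMap casts. An alternative is to declare only the fields the scraper reads and let encoding/json drop everything else during unmarshalling; a sketch assuming the JSON keys used above ("event", "title", "filters", "event_types", "name", "departments"):

package scrapers

// apiEvent mirrors just the parts of the payload the scraper consumes;
// keys not listed here are silently ignored by encoding/json.
type apiEvent struct {
	Event struct {
		Title   string `json:"title"`
		Filters struct {
			EventTypes []struct {
				Name string `json:"name"`
			} `json:"event_types"`
		} `json:"filters"`
		Departments []struct {
			Name string `json:"name"`
		} `json:"departments"`
	} `json:"event"`
}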
From 6508e800b644437bc7132da01fa9bc44d54cf455 Mon Sep 17 00:00:00 2001
From: mikehquan19
Date: Sat, 2 Aug 2025 17:13:11 -0500
Subject: [PATCH 08/11] Move api calendar to a separate option, can change
 later

---
 main.go                 |   4 +
 scrapers/apiCalendar.go | 196 ++++++++++++++++++++++++++++++++++++++++
 scrapers/calendar.go    | 113 -----------------------
 3 files changed, 200 insertions(+), 113 deletions(-)
 create mode 100644 scrapers/apiCalendar.go

diff --git a/main.go b/main.go
index 727c376..bdafd1b 100644
--- a/main.go
+++ b/main.go
@@ -39,6 +39,8 @@ func main() {
 	scrapeOrganizations := flag.Bool("organizations", false, "Alongside -scrape, signifies that SOC organizations should be scraped.")
 	// Flag for calendar scraping
 	scrapeCalendar := flag.Bool("calendar", false, "Alongside -scrape, signifies that calendar should be scraped.")
+	// Flag for api calendar scraping
+	scrapeAPICalendar := flag.Bool("apiCalendar", false, "Alongside -scrape, signifies that calendar should be scraped.")
 	// Flag for astra scraping and parsing
 	astra := flag.Bool("astra", false, "Alongside -scrape or -parse, signifies that Astra should be scraped/parsed.")
 	// Flag for mazevo scraping and parsing
@@ -104,6 +106,8 @@ func main() {
 		scrapers.ScrapeOrganizations(*outDir)
 	case *scrapeCalendar:
 		scrapers.ScrapeCalendar(*outDir)
+	case *scrapeAPICalendar:
+		scrapers.ScrapeAPICalendar(*outDir)
 	case *astra:
 		scrapers.ScrapeAstra(*outDir)
 	case *mazevo:
diff --git a/scrapers/apiCalendar.go b/scrapers/apiCalendar.go
new file mode 100644
index 0000000..74023eb
--- /dev/null
+++ b/scrapers/apiCalendar.go
@@ -0,0 +1,196 @@
+package scrapers
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"log"
+	"net/http"
+	"os"
+	"time"
+
+	"github.com/UTDNebula/api-tools/utils"
+	"github.com/UTDNebula/nebula-api/api/schema"
+	"go.mongodb.org/mongo-driver/bson/primitive"
+)
+
+// Structure of the API response
+type APICalendarResponse struct {
+	Events []struct {
+		Event map[string]interface{} `json:"event"`
+	} `json:"events"`
+	Page map[string]int    `json:"page"`
+	Date map[string]string `json:"date"`
+}
+
+// Get the calendar data through API instead of scraping from website
+func ScrapeAPICalendar(outDir string) {
+	err := os.MkdirAll(outDir, 0777)
+	if err != nil {
+		panic(err)
+	}
+	cli := http.Client{Timeout: 30 * time.Second}
+	var calendarData APICalendarResponse
+
+	// Get the total number of pages
+	log.Printf("Getting the number of pages...")
+	if err := scrapeAndUnmarshal(&cli, 0, &calendarData); err != nil {
+		panic(err)
+	}
+	numPages := calendarData.Page["total"]
+	log.Printf("The number of pages is %d!\n\n", numPages)
+
+	var events []schema.Event
+	for page := range numPages {
+		log.Printf("Scraping events of page %d...", page+1)
+		if err := scrapeAndUnmarshal(&cli, page+1, &calendarData); err != nil {
+			panic(err)
+		}
+		log.Printf("Scraped events of page %d successfully!\n", page+1)
+
+		log.Printf("Parsing the events of page %d...", page+1)
+		for _, rawEvent := range calendarData.Events {
+			// Parse the time
+			eventInstance := toMap(toMap(toSlice(rawEvent.Event["event_instances"])[0])["event_instance"])
+			startTime := parseTime(toString(eventInstance["start"]))
+			endTime := startTime
+			if toString(eventInstance["end"]) != "" {
+				endTime = parseTime(toString(eventInstance["end"]))
+			}
+
+			// Parse location
+			location := fmt.Sprintf("%s-%s", toString(rawEvent.Event["location_name"]), toString(rawEvent.Event["room_number"]))
+
+			// Parse the event types, event topic, and event target audience
+			filters := toMap(rawEvent.Event["filters"])
+			eventTypes := []string{}
+			eventTopics := []string{}
+			targetAudiences := []string{}
+
+			rawTypes := toSlice(filters["event_types"])
+			for _, rawType := range rawTypes {
+				eventTypes = append(eventTypes, toString(toMap(rawType)["name"]))
+			}
+
+			rawAudiences := toSlice(filters["event_target_audience"])
+			for _, audience := range rawAudiences {
+				targetAudiences = append(targetAudiences, toString(toMap(audience)["name"]))
+			}
+
+			rawTopics := toSlice(filters["event_topic"])
+			for _, topic := range rawTopics {
+				eventTopics = append(eventTopics, toString(toMap(topic)["name"]))
+			}
+
+			// Parse the event departments, and tags
+			departments := []string{}
+			tags := []string{}
+
+			rawTags := toSlice(rawEvent.Event["tags"])
+			for _, tag := range rawTags {
+				tags = append(tags, tag.(string))
+			}
+
+			rawDepartments := toSlice(rawEvent.Event["departments"])
+			for _, department := range rawDepartments {
+				departments = append(departments, toMap(department)["name"].(string))
+			}
+
+			// Parse the contact info. Note that some events won't have a contact phone number
+			rawContactInfo := toMap(rawEvent.Event["custom_fields"])
+			contactInfo := [3]string{}
+			for i, infoField := range []string{
+				"contact_information_name", "contact_information_email", "contact_information_phone",
+			} {
+				contactInfo[i] = toString(rawContactInfo[infoField])
+			}
+
+			events = append(events, schema.Event{
+				Id:                 primitive.NewObjectID(),
+				Summary:            toString(rawEvent.Event["title"]),
+				Location:           location,
+				StartTime:          startTime,
+				EndTime:            endTime,
+				Description:        toString(rawEvent.Event["description_text"]),
+				EventType:          eventTypes,
+				TargetAudience:     targetAudiences,
+				Topic:              eventTopics,
+				EventTags:          tags,
+				EventWebsite:       toString(rawEvent.Event["url"]),
+				Department:         departments,
+				ContactName:        contactInfo[0],
+				ContactEmail:       contactInfo[1],
+				ContactPhoneNumber: contactInfo[2],
+			})
+		}
+		log.Printf("Parsed the events of page %d successfully!\n\n", page+1)
+	}
+
+	if err := utils.WriteJSON(fmt.Sprintf("%s/api_events.json", outDir), events); err != nil {
+		panic(err)
+	}
+	log.Printf("Finished parsing %d events successfully!\n\n", len(events))
+}
+
+// Get the data from the API and unmarshal it into data
+func scrapeAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) error {
+	// Call API to get the byte data
+	calendarUrl := fmt.Sprintf("https://calendar.utdallas.edu/api/2/events?days=365&pp=100&page=%d", page)
+	req, err := http.NewRequest("GET", calendarUrl, nil)
+	if err != nil {
+		return err
+	}
+	res, err := client.Do(req)
+	if err != nil {
+		return err
+	}
+	if res != nil && res.StatusCode != 200 {
+		return fmt.Errorf("ERROR: Non-200 status is returned, %s", res.Status)
+	}
+
+	// Unmarshal bytes to the response data
+	buffer := bytes.Buffer{}
+	if _, err = buffer.ReadFrom(res.Body); err != nil {
+		return err
+	}
+	res.Body.Close()
+	if err = json.Unmarshal(buffer.Bytes(), &data); err != nil {
+		return err
+	}
+	return nil
+}
+
+// Casting an interface{} to a slice of interface{}
+func toSlice(data interface{}) []interface{} {
+	if array, ok := data.([]interface{}); ok {
+		return array
+	}
+	return nil
+}
+
+// Casting an interface{} to a map from string to interface{}
+func toMap(data interface{}) map[string]interface{} {
+	if dataMap, ok := data.(map[string]interface{}); ok {
+		return dataMap
+	}
+	return nil
+}
+
+// Casting an interface{} to string
+func toString(data interface{}) string {
+	if data != nil {
+		if dataString, ok := data.(string); ok {
+			return dataString
+		}
+	}
+	return ""
+}
+
+// Parse string time
+func parseTime(stringTime string) time.Time {
+	parsedTime, err := time.Parse(time.RFC3339, stringTime)
+	if err != nil {
+		panic(err)
+	}
+	return parsedTime
+}
diff --git a/scrapers/calendar.go b/scrapers/calendar.go
index 7cf89eb..adf6f82 100644
--- a/scrapers/calendar.go
+++ b/scrapers/calendar.go
@@ -5,13 +5,11 @@
 package scrapers
 
 import (
-	"bytes"
 	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"log"
-	"net/http"
 	"os"
 	"regexp"
 	"time"
@@ -366,114 +364,3 @@ func ScrapeCalendar(outDir string) {
 	encoder.Encode(events)
 	fptr.Close()
 }
-
-// Get the calendar data through API instead of scraping from website
-func ScrapeAPICalendar(outDir string) {
-	err := os.MkdirAll(outDir, 0777)
-	if err != nil {
-		panic(err)
-	}
-	client := http.Client{Timeout: 30 * time.Second}
-	var events []schema.Event
-
-	for i := range 1 {
-		// Set up the API Request
-		calendarUrl := fmt.Sprintf("https://calendar.utdallas.edu/api/2/events?days=365&pp=100&page=%d", i)
-		req, err := http.NewRequest("GET", calendarUrl, nil)
-		if err != nil {
-			panic(err)
-		}
-
-		// Call API to get the response
-		res, err := client.Do(req)
-		if err != nil {
-			panic(err)
-		}
-		if res != nil && res.StatusCode != 200 {
-			log.Panicf("ERROR: Status was %s", res.Status)
-		}
-
-		buffer := bytes.Buffer{}
-		buffer.ReadFrom(res.Body)
-		res.Body.Close()
-
-		// Structure of the API response
-		type RawEvent struct {
-			Event map[string]any `json:"event"`
-		}
-
-		type APICalendarResponse struct {
-			Events []RawEvent        `json:"events"`
-			Page   map[string]int    `json:"page"`
-			Date   map[string]string `json:"date"`
-		}
-
-		var responseData APICalendarResponse
-		if err := json.Unmarshal(buffer.Bytes(), &responseData); err != nil {
-			panic(err)
-		}
-
-		for _, rawEvent := range responseData.Events {
-			filters := pullMap(rawEvent.Event["filters"])
-			eventType := []string{}
-			eventTopic := []string{}
-			eventAudience := []string{}
-
-			// Parse the event types, event topic, and event target audience
-			rawTypes := pullSlice(filters["event_types"])
-			for _, rawType := range rawTypes {
-				eventType = append(eventType, pullMap(rawType)["name"].(string))
-			}
-
-			rawTopic := pullSlice(filters["event_topic"])
-			for _, topic := range rawTopic {
-				eventTopic = append(eventTopic, pullMap(topic)["name"].(string))
-			}
-
-			rawAudience := pullSlice(filters["event_target_audience"])
-			for _, audience := range rawAudience {
-				eventAudience = append(eventAudience, pullMap(audience)["name"].(string))
-			}
-
-			// Parse the event departments
-			departments := []string{}
-			rawDepartments := pullSlice(rawEvent.Event["departments"])
-			for _, department := range rawDepartments {
-				departments = append(departments, pullMap(department)["name"].(string))
-			}
-
-			events = append(events, schema.Event{
-				Id:             primitive.NewObjectID(),
-				EventType:      eventType,
-				TargetAudience: eventAudience,
-				Topic:          eventTopic,
-				Department:     departments,
-			})
-		}
-	}
-
-	fptr, err := os.Create(fmt.Sprintf("%s/events1.json", outDir))
-	if err != nil {
-		panic(err)
-	}
-	encoder := json.NewEncoder(fptr)
-	encoder.SetIndent("", "\t")
-	encoder.Encode(events)
-	fptr.Close()
-}
-
-// Casting an any to a slice of any
-func pullSlice(data any) []any {
-	if array, ok := data.([]any); ok {
-		return array
-	}
-	return nil
-}
-
-// Casting an any to a map from string to any
-func pullMap(data any) map[string]any {
-	if dataMap, ok := data.(map[string]any); ok {
-		return dataMap
-	}
-	return nil
-}

From 72018d75496ac66d69ea56da69e4974dbff3fa93 Mon Sep 17 00:00:00 2001
From: mikehquan19
Date: Sat, 2 Aug 2025 19:11:25 -0500
Subject: [PATCH 09/11] Change the name of the file to write into

---
 scrapers/apiCalendar.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapers/apiCalendar.go b/scrapers/apiCalendar.go
index 74023eb..161b5ec 100644
--- a/scrapers/apiCalendar.go
+++ b/scrapers/apiCalendar.go
@@ -126,7 +126,7 @@ func ScrapeAPICalendar(outDir string) {
 		log.Printf("Parsed the events of page %d successfully!\n\n", page+1)
 	}
 
-	if err := utils.WriteJSON(fmt.Sprintf("%s/api_events.json", outDir), events); err != nil {
+	if err := utils.WriteJSON(fmt.Sprintf("%s/events.json", outDir), events); err != nil {
 		panic(err)
 	}
 	log.Printf("Finished parsing %d events successfully!\n\n", len(events))
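Before the final rewrite below, one note on scrapeAndUnmarshal from patch 08: it copies the whole response body into a bytes.Buffer and only then unmarshals. Decoding the stream directly is equivalent for this payload and drops the intermediate copy; a sketch reusing the APICalendarResponse type (decodeCalendarPage is an illustrative name, not part of the series):

package scrapers

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// decodeCalendarPage fetches one page of events and decodes the body as it
// streams, instead of buffering it first.
func decodeCalendarPage(client *http.Client, url string, data *APICalendarResponse) error {
	res, err := client.Get(url)
	if err != nil {
		return err
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return fmt.Errorf("non-200 status returned: %s", res.Status)
	}
	return json.NewDecoder(res.Body).Decode(data)
}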
deleted file mode 100644
index 161b5ec..0000000
--- a/scrapers/apiCalendar.go
+++ /dev/null
@@ -1,196 +0,0 @@
-package scrapers
-
-import (
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"log"
-	"net/http"
-	"os"
-	"time"
-
-	"github.com/UTDNebula/api-tools/utils"
-	"github.com/UTDNebula/nebula-api/api/schema"
-	"go.mongodb.org/mongo-driver/bson/primitive"
-)
-
-// Structure of the API response
-type APICalendarResponse struct {
-	Events []struct {
-		Event map[string]interface{} `json:"event"`
-	} `json:"events"`
-	Page map[string]int    `json:"page"`
-	Date map[string]string `json:"date"`
-}
-
-// Get the calendar data through API instead of scraping from website
-func ScrapeAPICalendar(outDir string) {
-	err := os.MkdirAll(outDir, 0777)
-	if err != nil {
-		panic(err)
-	}
-	cli := http.Client{Timeout: 30 * time.Second}
-	var calendarData APICalendarResponse
-
-	// Get the total number of pages
-	log.Printf("Getting the number of pages...")
-	if err := scrapeAndUnmarshal(&cli, 0, &calendarData); err != nil {
-		panic(err)
-	}
-	numPages := calendarData.Page["total"]
-	log.Printf("The number of pages is %d!\n\n", numPages)
-
-	var events []schema.Event
-	for page := range numPages {
-		log.Printf("Scraping events of page %d...", page+1)
-		if err := scrapeAndUnmarshal(&cli, page+1, &calendarData); err != nil {
-			panic(err)
-		}
-		log.Printf("Scraped events of page %d successfully!\n", page+1)
-
-		log.Printf("Parsing the events of page %d...", page+1)
-		for _, rawEvent := range calendarData.Events {
-			// Parse the time
-			eventInstance := toMap(toMap(toSlice(rawEvent.Event["event_instances"])[0])["event_instance"])
-			startTime := parseTime(toString(eventInstance["start"]))
-			endTime := startTime
-			if toString(eventInstance["end"]) != "" {
-				endTime = parseTime(toString(eventInstance["end"]))
-			}
-
-			// Parse location
-			location := fmt.Sprintf("%s-%s", toString(rawEvent.Event["location_name"]), toString(rawEvent.Event["room_number"]))
-
-			// Parse the event types, event topic, and event target audience
-			filters := toMap(rawEvent.Event["filters"])
-			eventTypes := []string{}
-			eventTopics := []string{}
-			targetAudiences := []string{}
-
-			rawTypes := toSlice(filters["event_types"])
-			for _, rawType := range rawTypes {
-				eventTypes = append(eventTypes, toString(toMap(rawType)["name"]))
-			}
-
-			rawAudiences := toSlice(filters["event_target_audience"])
-			for _, audience := range rawAudiences {
-				targetAudiences = append(targetAudiences, toString(toMap(audience)["name"]))
-			}
-
-			rawTopics := toSlice(filters["event_topic"])
-			for _, topic := range rawTopics {
-				eventTopics = append(eventTopics, toString(toMap(topic)["name"]))
-			}
-
-			// Parse the event departments, and tags
-			departments := []string{}
-			tags := []string{}
-
-			rawTags := toSlice(rawEvent.Event["tags"])
-			for _, tag := range rawTags {
-				tags = append(tags, tag.(string))
-			}
-
-			rawDeparments := toSlice(rawEvent.Event["departments"])
-			for _, deparment := range rawDeparments {
-				departments = append(departments, toMap(deparment)["name"].(string))
-			}
-
-			// Parse the contact info, note that some events won't have contact phone number
-			rawContactInfo := toMap(rawEvent.Event["custom_fields"])
-			contactInfo := [3]string{}
-			for i, infoField := range []string{
-				"contact_information_name", "contact_information_email", "contact_information_phone",
-			} {
-				contactInfo[i] = toString(rawContactInfo[infoField])
-			}
-
-			events = append(events, schema.Event{
-				Id:                 primitive.NewObjectID(),
-				Summary:            toString(rawEvent.Event["title"]),
-				Location:           location,
-				StartTime:          startTime,
-				EndTime:            endTime,
-				Description:        toString(rawEvent.Event["description_text"]),
-				EventType:          eventTypes,
-				TargetAudience:     targetAudiences,
-				Topic:              eventTopics,
-				EventTags:          tags,
-				EventWebsite:       toString(rawEvent.Event["url"]),
-				Department:         departments,
-				ContactName:        contactInfo[0],
-				ContactEmail:       contactInfo[1],
-				ContactPhoneNumber: contactInfo[2],
-			})
-		}
-		log.Printf("Parsed the events of page %d successfully!\n\n", page+1)
-	}
-
-	if err := utils.WriteJSON(fmt.Sprintf("%s/events.json", outDir), events); err != nil {
-		panic(err)
-	}
-	log.Printf("Finished parsing %d events successfully!\n\n", len(events))
-}
-
-// Get the data from the API and unmarshal it into the response struct
-func scrapeAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) error {
-	// Call API to get the byte data
-	calendarUrl := fmt.Sprintf("https://calendar.utdallas.edu/api/2/events?days=365&pp=100&page=%d", page)
-	req, err := http.NewRequest("GET", calendarUrl, nil)
-	if err != nil {
-		return err
-	}
-	res, err := client.Do(req)
-	if err != nil {
-		return err
-	}
-	if res != nil && res.StatusCode != 200 {
-		return fmt.Errorf("ERROR: non-200 status returned: %s", res.Status)
-	}
-
-	// Unmarshal bytes to the response data
-	buffer := bytes.Buffer{}
-	if _, err = buffer.ReadFrom(res.Body); err != nil {
-		return err
-	}
-	res.Body.Close()
-	if err = json.Unmarshal(buffer.Bytes(), &data); err != nil {
-		return err
-	}
-	return nil
-}
-
-// Casting an interface{} to a slice of interface{}
-func toSlice(data interface{}) []interface{} {
-	if array, ok := data.([]interface{}); ok {
-		return array
-	}
-	return nil
-}
-
-// Casting an interface{} to a map from string to interface{}
-func toMap(data interface{}) map[string]interface{} {
-	if dataMap, ok := data.(map[string]interface{}); ok {
-		return dataMap
-	}
-	return nil
-}
-
-// Casting an interface{} to string
-func toString(data interface{}) string {
-	if data != nil {
-		if dataString, ok := data.(string); ok {
-			return dataString
-		}
-	}
-	return ""
-}
-
-// Parse an RFC3339 time string
-func parseTime(stringTime string) time.Time {
-	parsedTime, err := time.Parse(time.RFC3339, stringTime)
-	if err != nil {
-		panic(err)
-	}
-	return parsedTime
-}
diff --git a/scrapers/calendar.go b/scrapers/calendar.go
index adf6f82..0bdd92a 100644
--- a/scrapers/calendar.go
+++ b/scrapers/calendar.go
@@ -5,362 +5,197 @@ package scrapers
 
 import (
-	"context"
+	"bytes"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"log"
+	"net/http"
 	"os"
-	"regexp"
 	"time"
 
 	"github.com/UTDNebula/api-tools/utils"
 	"github.com/UTDNebula/nebula-api/api/schema"
-	"github.com/chromedp/cdproto/cdp"
-	"github.com/chromedp/cdproto/runtime"
-	"github.com/chromedp/chromedp"
 	"go.mongodb.org/mongo-driver/bson/primitive"
 )
 
-const CALENDAR_LINK string = "https://calendar.utdallas.edu/calendar"
+// Structure of the API response
+type RawEvent struct {
+	Event map[string]interface{} `json:"event"`
+}
 
-var trailingSpaceRegex *regexp.Regexp = regexp.MustCompile(`(\s{2,}?\s{2,})|(\n)`)
-var leadingSpaceRegex *regexp.Regexp = regexp.MustCompile(`^\s+`)
+type APICalendarResponse struct {
+	Events []RawEvent        `json:"events"`
+	Page   map[string]int    `json:"page"`
+	Date   map[string]string `json:"date"`
+}
 
+// Get the calendar data through the API instead of scraping the website
 func ScrapeCalendar(outDir string) {
-
-	chromedpCtx, cancel := utils.InitChromeDp()
-	defer cancel()
-
 	err := os.MkdirAll(outDir, 0777)
 	if err != nil {
 		panic(err)
 	}
+	cli := http.Client{Timeout: 15 * time.Second}
+	var calendarData APICalendarResponse
 
-	events := []schema.Event{}
-
-	log.Printf("Scraping event page links")
-	// Grab all links to event pages
-	var pageLinks []string = []string{}
-	_, err = chromedp.RunResponse(chromedpCtx,
-		chromedp.Navigate(CALENDAR_LINK),
-		chromedp.QueryAfter(".em-card_image > a",
-			func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-				for _, node := range nodes {
-					href, hasHref := node.Attribute("href")
-					if !hasHref {
-						return errors.New("event card was missing an href")
-					}
-					pageLinks = append(pageLinks, href)
-				}
-				return nil
-			},
-		),
-	)
-	if err != nil {
+	// Get the total number of pages
+	log.Printf("Getting the number of pages...")
+	if err := scrapeAndUnmarshal(&cli, 0, &calendarData); err != nil {
 		panic(err)
 	}
-	log.Printf("Scraped event page links!")
-	for _, page := range pageLinks {
-		// Print the links of the page to check
-		log.Println(page)
-	}
-
-	for _, page := range pageLinks {
-		// Navigate to page and get page summary
-		summary := ""
-		_, err := chromedp.RunResponse(chromedpCtx,
-			chromedp.Navigate(page),
-			chromedp.QueryAfter(".em-card_title",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					if len(nodes) != 0 {
-						summary = trailingSpaceRegex.ReplaceAllString(getNodeText(nodes[0]), "")
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-		)
+	numPages := calendarData.Page["total"]
+	log.Printf("The number of pages is %d!\n\n", numPages)
 
-		if err != nil {
+	var events []schema.Event
+	for page := range numPages {
+		log.Printf("Scraping events of page %d...", page+1)
+		if err := scrapeAndUnmarshal(&cli, page+1, &calendarData); err != nil {
 			panic(err)
 		}
-		utils.VPrintf("Navigated to page %s", summary)
-
-		// Grab date/time of the event
-		var dateTimeStart time.Time
-		var dateTimeEnd time.Time
-		err = chromedp.Run(chromedpCtx,
-			chromedp.QueryAfter(".dtstart",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					if len(nodes) != 0 {
-						timeStamp, hasTime := nodes[0].Attribute("title")
-						if !hasTime {
-							return errors.New("event does not have a start time")
-						}
-						formattedTime, err := time.Parse(time.RFC3339, timeStamp)
-						if err != nil {
-							return err
-						}
-
-						dateTimeStart = formattedTime
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-			chromedp.QueryAfter(".dtend",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					if len(nodes) != 0 {
-						timeStamp, hasTime := nodes[0].Attribute("title")
-						if !hasTime {
-							return errors.New("event does not have an end time")
-						}
-						formattedTime, err := time.Parse(time.RFC3339, timeStamp)
-						if err != nil {
-							return err
-						}
-
-						dateTimeEnd = formattedTime
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-		)
-		if err != nil {
-			continue
-		}
-		utils.VPrintf("Scraped time: %s to %s ", dateTimeStart, dateTimeEnd)
-
-		// Grab Location of Event
-
-		// If p.location doesn't have children, then it's an virtual event
-		var location string = "Virtual Event"
-
-		err = chromedp.Run(chromedpCtx,
-			// Grab the name of the location
-			chromedp.QueryAfter("p.location > a",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					if len(nodes) != 0 {
-						// Location's name somehow contains leading space, so trim it
-						location = leadingSpaceRegex.ReplaceAllString(getNodeText(nodes[0]), "")
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-			// Grab the address of the location (concatenated with the name)
-			chromedp.QueryAfter("p.location > span",
chromedp.QueryAfter("p.location > span", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - if len(nodes) != 0 { - // There are cases where it doesn't show the address - if getNodeText(nodes[0]) != "" { - location += "\n" + getNodeText(nodes[0]) - } - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - continue - } - utils.VPrintf("Scraped location: %s, ", location) - - // Get description of event - var description string = "" - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter(".em-about_description > p", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - // Concatenate all the sentences in the description together - for _, node := range nodes { - if getNodeText(node) != "" && getNodeText(node) != "\u00A0" { - description += getNodeText(node) + "\n\n" - } - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - continue - } - utils.VPrintf("Scraped description: %s, ", description) - - // Grab Event Type - var eventType []string = []string{} - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter(".filter-event_types > p > a", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - for _, node := range nodes { - eventType = append(eventType, getNodeText(node)) - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - panic(err) + log.Printf("Scraped events of page %d successfully!\n", page+1) + + log.Printf("Parsing the events of page %d...", page+1) + for _, rawEvent := range calendarData.Events { + // Parse the time + eventInstance := toMap(toMap(toSlice(rawEvent.Event["event_instances"])[0])["event_instance"]) + startTime := parseTime(toString(eventInstance["start"])) + endTime := startTime + if toString(eventInstance["end"]) != "" { + endTime = parseTime(toString(eventInstance["end"])) + } + + location := utils.TrimWhitespace(fmt.Sprintf("%s, %s", toString(rawEvent.Event["location_name"]), toString(rawEvent.Event["room_number"]))) + + // Parse the event types, event topic, and event target audience + filters := toMap(rawEvent.Event["filters"]) + eventTypes := []string{} + eventTopics := []string{} + targetAudiences := []string{} + + rawTypes := toSlice(filters["event_types"]) + for _, rawType := range rawTypes { + eventTypes = append(eventTypes, toString(toMap(rawType)["name"])) + } + + rawAudiences := toSlice(filters["event_target_audience"]) + for _, audience := range rawAudiences { + targetAudiences = append(targetAudiences, toString(toMap(audience)["name"])) + } + + rawTopics := toSlice(filters["event_topic"]) + for _, topic := range rawTopics { + eventTopics = append(eventTopics, toString(toMap(topic)["name"])) + } + + // Parse the event departments, and tags + departments := []string{} + tags := []string{} + + rawTags := toSlice(rawEvent.Event["tags"]) + for _, tag := range rawTags { + tags = append(tags, tag.(string)) + } + + rawDeparments := toSlice(rawEvent.Event["departments"]) + for _, deparment := range rawDeparments { + departments = append(departments, toMap(deparment)["name"].(string)) + } + + // Parse the contact info, =ote that some events won't have contact phone number + rawContactInfo := toMap(rawEvent.Event["custom_fields"]) + contactInfo := [3]string{} + for i, infoField := range []string{ + "contact_information_name", "contact_information_email", "contact_information_phone", + } { + contactInfo[i] = toString(rawContactInfo[infoField]) + } + + events = append(events, schema.Event{ + Id: 
+				Summary:            toString(rawEvent.Event["title"]),
+				Location:           location,
+				StartTime:          startTime,
+				EndTime:            endTime,
+				Description:        toString(rawEvent.Event["description_text"]),
+				EventType:          eventTypes,
+				TargetAudience:     targetAudiences,
+				Topic:              eventTopics,
+				EventTags:          tags,
+				EventWebsite:       toString(rawEvent.Event["url"]),
+				Department:         departments,
+				ContactName:        contactInfo[0],
+				ContactEmail:       contactInfo[1],
+				ContactPhoneNumber: contactInfo[2],
+			})
 		}
-		utils.VPrintf("Scraped event type: %s", eventType)
+		log.Printf("Parsed the events of page %d successfully!\n\n", page+1)
+	}
 
-		// Grab Target Audience
-		targetAudience := []string{}
-		err = chromedp.Run(chromedpCtx,
-			chromedp.QueryAfter(".filter-event_target_audience > p > a",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					for _, node := range nodes {
-						targetAudience = append(targetAudience, getNodeText(node))
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-		)
-		if err != nil {
-			panic(err)
-		}
-		utils.VPrintf("Scraped target audience: %s, ", targetAudience)
+	if err := utils.WriteJSON(fmt.Sprintf("%s/events.json", outDir), events); err != nil {
+		panic(err)
+	}
+	log.Printf("Finished parsing %d events successfully!\n\n", len(events))
+}
 
-		// Grab Topic
-		topic := []string{}
-		err = chromedp.Run(chromedpCtx,
-			chromedp.QueryAfter(".filter-event_topic > p > a",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					for _, node := range nodes {
-						topic = append(topic, getNodeText(node))
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-		)
-		if err != nil {
-			panic(err)
-		}
-		utils.VPrintf("Scraped topic: %s, ", topic)
+// Scrape the data from the API and unmarshal it into the response data
+func scrapeAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) error {
+	// Call API to get the byte data
+	calendarUrl := fmt.Sprintf("https://calendar.utdallas.edu/api/2/events?days=365&pp=100&page=%d", page)
+	req, err := http.NewRequest("GET", calendarUrl, nil)
+	if err != nil {
+		return err
+	}
+	res, err := client.Do(req)
+	if err != nil {
+		return err
+	}
+	if res != nil && res.StatusCode != 200 {
+		return fmt.Errorf("ERROR: non-200 status returned: %s", res.Status)
+	}
 
-		// Grab Event Tags
-		tags := []string{}
-		err = chromedp.Run(chromedpCtx,
-			chromedp.QueryAfter(".event-tags > p > a",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					for _, node := range nodes {
-						tags = append(tags, getNodeText(node))
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-		)
-		if err != nil {
-			panic(err)
-		}
-		utils.VPrintf("Scraped tags: %s, ", tags)
+	// Unmarshal bytes to the response data
+	buffer := bytes.Buffer{}
+	if _, err = buffer.ReadFrom(res.Body); err != nil {
+		return err
+	}
+	res.Body.Close()
+	if err = json.Unmarshal(buffer.Bytes(), &data); err != nil {
+		return err
+	}
+	return nil
+}
 
-		// Grab Website
-		var eventWebsite string = ""
-		err = chromedp.Run(chromedpCtx,
-			chromedp.QueryAfter(".event-website > p > a",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					if len(nodes) != 0 {
-						href, hasHref := nodes[0].Attribute("href")
-						if !hasHref {
-							return errors.New("event does not have website")
-						}
-						eventWebsite = href
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-		)
-		if err != nil {
-			continue
-		}
-		utils.VPrintf("Scraped website: %s, ", eventWebsite)
+// Casting an interface{} to a slice of interface{}
+func toSlice(data interface{}) []interface{} {
+	if array, ok := data.([]interface{}); ok {
+		return array
+	}
+	return nil
+}
 
-		// Grab Department
-		var eventDepartment []string = []string{}
-		err = chromedp.Run(chromedpCtx,
-			chromedp.QueryAfter(".event-group > a",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					for _, node := range nodes {
-						eventDepartment = append(eventDepartment, getNodeText(node))
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-		)
-		if err != nil {
-			panic(err)
-		}
-		utils.VPrintf("Scraped department: %s, ", eventDepartment)
+// Casting an interface{} to a map from string to interface{}
+func toMap(data interface{}) map[string]interface{} {
+	if dataMap, ok := data.(map[string]interface{}); ok {
+		return dataMap
+	}
+	return nil
+}
 
-		// Grab Contact information
-		var contactInformationName string = ""
-		var contactInformationEmail string = ""
-		var contactInformationPhone string = ""
-		err = chromedp.Run(chromedpCtx,
-			chromedp.QueryAfter(".custom-field-contact_information_name",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					if len(nodes) != 0 {
-						contactInformationName = getNodeText(nodes[0])
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-			chromedp.QueryAfter(".custom-field-contact_information_email > a",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					if len(nodes) != 0 {
-						emailHref, hasEmailHref := nodes[0].Attribute("href")
-						if !hasEmailHref {
-							return errors.New("event contact doesn't have email")
-						}
-						// Slicing the text to exclude "mailto:"
-						contactInformationEmail = emailHref[7:]
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-			chromedp.QueryAfter(".custom-field-contact_information_phone",
-				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
-					if len(nodes) != 0 {
-						contactInformationPhone = getNodeText(nodes[0])
-						if err != nil {
-							return err
-						}
-					}
-					return nil
-				}, chromedp.AtLeast(0),
-			),
-		)
-		if err != nil {
-			panic(err)
+// Casting an interface{} to a string; if the data is nil, the result is ""
+func toString(data interface{}) string {
+	if data != nil {
+		if dataString, ok := data.(string); ok {
+			return dataString
 		}
-		utils.VPrintf("Scraped contact name info: %s", contactInformationName)
-		utils.VPrintf("Scraped contact email info: %s", contactInformationEmail)
-		utils.VPrintf("Scraped contact phone info: %s", contactInformationPhone)
-
-		events = append(events, schema.Event{
-			Id:                 primitive.NewObjectID(),
-			Summary:            summary,
-			Location:           location,
-			StartTime:          dateTimeStart,
-			EndTime:            dateTimeEnd,
-			Description:        description,
-			EventType:          eventType,
-			TargetAudience:     targetAudience,
-			Topic:              topic,
-			EventTags:          tags,
-			EventWebsite:       eventWebsite,
-			Department:         eventDepartment,
-			ContactName:        contactInformationName,
-			ContactEmail:       contactInformationEmail,
-			ContactPhoneNumber: contactInformationPhone,
-		})
 	}
+	return ""
+}
 
-	// Write event data to output file
-	fptr, err := os.Create(fmt.Sprintf("%s/events.json", outDir))
+// Parse an RFC3339 time string
+func parseTime(stringTime string) time.Time {
+	parsedTime, err := time.Parse(time.RFC3339, stringTime)
 	if err != nil {
 		panic(err)
 	}
-	encoder := json.NewEncoder(fptr)
-	encoder.SetIndent("", "\t")
-	encoder.Encode(events)
-	fptr.Close()
+	return parsedTime
 }

From a2545161383ecbfeed1719c6844ae3fc6c458f01 Mon Sep 17 00:00:00 2001
From: mikehquan19
Date: Sun, 3 Aug 2025 19:56:26 -0500
Subject: [PATCH 11/11] Trim the space in location

---
 scrapers/calendar.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/scrapers/calendar.go b/scrapers/calendar.go
index 0bdd92a..4429c2e 100644
--- a/scrapers/calendar.go
+++ b/scrapers/calendar.go
@@ -11,6 +11,7 @@ import (
 	"log"
 	"net/http"
 	"os"
+	"strings"
 	"time"
 
 	"github.com/UTDNebula/api-tools/utils"
@@ -64,7 +65,8 @@ func ScrapeCalendar(outDir string) {
 				endTime = parseTime(toString(eventInstance["end"]))
 			}
 
-			location := utils.TrimWhitespace(fmt.Sprintf("%s, %s", toString(rawEvent.Event["location_name"]), toString(rawEvent.Event["room_number"])))
+			// Parse location
+			location := strings.Trim(fmt.Sprintf("%s, %s", toString(rawEvent.Event["location_name"]), toString(rawEvent.Event["room_number"])), " ,")
 
 			// Parse the event types, event topic, and event target audience
 			filters := toMap(rawEvent.Event["filters"])
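
A note on the trimming behavior in the final patch: because toString returns "" for missing fields, an event with no location_name or room_number would otherwise produce the stray string ", "; strings.Trim with the cutset " ," strips any combination of spaces and commas from both ends, leaving a clean or empty location. The following self-contained sketch illustrates this; the sample building and room values are hypothetical, and the snippet is illustrative only, not part of the patch series.

package main

import (
	"fmt"
	"strings"
)

// formatLocation mirrors the expression used in the patch above:
// join the location name and room number, then trim stray commas and spaces.
func formatLocation(locationName, roomNumber string) string {
	return strings.Trim(fmt.Sprintf("%s, %s", locationName, roomNumber), " ,")
}

func main() {
	fmt.Printf("%q\n", formatLocation("ECSW", "1.315"))     // "ECSW, 1.315"
	fmt.Printf("%q\n", formatLocation("Student Union", "")) // "Student Union"
	fmt.Printf("%q\n", formatLocation("", ""))              // "" (e.g. a virtual event)
}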