diff --git a/parser/validator_test.go b/parser/validator_test.go index e1ca364..6a9c9ff 100644 --- a/parser/validator_test.go +++ b/parser/validator_test.go @@ -18,11 +18,11 @@ var testCourses []*schema.Course var testSections []*schema.Section var testProfessors []*schema.Professor -// Map used to map index of test sections to test courses +// Map index of test sections to test courses var indexMap map[int]int func init() { - // parse the test courses + // Parse the test courses data, err := os.ReadFile("./testdata/courses.json") if err != nil { panic(err) @@ -32,7 +32,7 @@ func init() { panic(err) } - // parse the test sections + // Parse the test sections data, err = os.ReadFile("./testdata/sections.json") if err != nil { panic(err) @@ -42,7 +42,7 @@ func init() { panic(err) } - // parse the test professors + // Parse the test professors data, err = os.ReadFile("./testdata/professors.json") if err != nil { panic(err) @@ -52,6 +52,7 @@ func init() { panic(err) } + // The correct mapping indexMap = map[int]int{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4} } @@ -142,20 +143,20 @@ func TestCourseReferencePass(t *testing.T) { // - Course references non-existent section // - Section doesn't reference back to same course // -// This is fail type 1 +// This is fail: missing func TestCourseReferenceFail1(t *testing.T) { for key, value := range indexMap { t.Run(fmt.Sprintf("Section %v & course %v", key, value), func(t *testing.T) { - testCourseReferenceFail(1, value, key, t) + testCourseReferenceFail("missing", value, key, t) }) } } -// This is fail type 2 +// This is fail: modified func TestCourseReferenceFail2(t *testing.T) { for key, value := range indexMap { t.Run(fmt.Sprintf("Section %v & course %v", key, value), func(t *testing.T) { - testCourseReferenceFail(2, value, key, t) + testCourseReferenceFail("modified", value, key, t) }) } } @@ -192,6 +193,7 @@ func TestSectionReferenceProfPass(t *testing.T) { // Test section reference to professors, designed for fail case func 
TestSectionReferenceProfFail(t *testing.T) { + profIDMap := make(map[primitive.ObjectID]string) profs := make(map[string]*schema.Professor) @@ -257,23 +259,22 @@ func TestSectionReferenceCourse(t *testing.T) { } } -/* BELOW HERE ARE HELPER FUNCTION FOR TESTS ABOVE */ +/******** BELOW HERE ARE HELPER FUNCTION FOR TESTS ABOVE ********/ -// Helper function // Test if validate() throws erros when encountering duplicate // Design for fail cases -func testDuplicateFail(objType string, index int, t *testing.T) { +func testDuplicateFail(objType string, ix int, t *testing.T) { // the buffer used to capture the log output var logBuffer bytes.Buffer log.SetOutput(&logBuffer) - // determine the expected msgs and panic msgs based on object type + // Determine the expected messages and panic messages based on object type var expectedMsgs []string var panicMsg string switch objType { case "course": - failCourse := testCourses[index] + failCourse := testCourses[ix] // list of msgs it must print expectedMsgs = []string{ @@ -282,7 +283,7 @@ func testDuplicateFail(objType string, index int, t *testing.T) { } panicMsg = "Courses failed to validate!" case "section": - failSection := testSections[index] + failSection := testSections[ix] expectedMsgs = []string{ "Duplicate section found!", @@ -290,7 +291,7 @@ func testDuplicateFail(objType string, index int, t *testing.T) { } panicMsg = "Sections failed to validate!" 
case "professor": - failProf := testProfessors[index] + failProf := testProfessors[ix] expectedMsgs = []string{ "Duplicate professor found!", @@ -302,14 +303,14 @@ func testDuplicateFail(objType string, index int, t *testing.T) { defer func() { logOutput := logBuffer.String() // log output after running the function - // log output needs to contain lines in the list + // Log output needs to contain lines in the list for _, msg := range expectedMsgs { if !strings.Contains(logOutput, msg) { t.Errorf("Exptected the message for %v: %v", objType, msg) } } - // test whether func panics and sends the correct panic msg + // Test whether func panics and sends the correct panic msg if r := recover(); r == nil { t.Errorf("The function didn't panic for %v", objType) } else { @@ -323,18 +324,17 @@ func testDuplicateFail(objType string, index int, t *testing.T) { // Run func switch objType { case "course": - valDuplicateCourses(testCourses[index], testCourses[index]) + valDuplicateCourses(testCourses[ix], testCourses[ix]) case "section": - valDuplicateSections(testSections[index], testSections[index]) + valDuplicateSections(testSections[ix], testSections[ix]) case "professor": - valDuplicateProfs(testProfessors[index], testProfessors[index]) + valDuplicateProfs(testProfessors[ix], testProfessors[ix]) } } -// Helper function // Test if func doesn't log anything and doesn't panic. // Design for pass cases -func testDuplicatePass(objType string, index1 int, index2 int, t *testing.T) { +func testDuplicatePass(objType string, ix1 int, ix2 int, t *testing.T) { // Buffer to capture the output var logBuffer bytes.Buffer log.SetOutput(&logBuffer) @@ -349,45 +349,45 @@ func testDuplicatePass(objType string, index1 int, index2 int, t *testing.T) { } }() - // Run func according to the object type. Choose pair of objects which are not duplicate + // Run func according to the object type. 
+ // Choose pair of objects which are not duplicate switch objType { case "course": - valDuplicateCourses(testCourses[index1], testCourses[index2]) + valDuplicateCourses(testCourses[ix1], testCourses[ix2]) case "section": - valDuplicateSections(testSections[index1], testSections[index2]) + valDuplicateSections(testSections[ix1], testSections[ix2]) case "professor": - valDuplicateProfs(testProfessors[index1], testProfessors[index2]) + valDuplicateProfs(testProfessors[ix1], testProfessors[ix2]) } } -// Helper function for the case of course reference that fails -// failType: 1 means it lacks one sections -// failType: 2 means one section's course reference has been modified -func testCourseReferenceFail(failType int, courseIndex int, sectionIndex int, t *testing.T) { +// fail = "missing" means it lacks one sections +// fail = "modified" means one section's course reference has been modified +func testCourseReferenceFail(fail string, courseIx int, sectionIx int, t *testing.T) { sectionMap := make(map[primitive.ObjectID]*schema.Section) var sectionID, originalID primitive.ObjectID // used to store IDs of modified sections // Build the failed section map based on fail type - if failType == 1 { - // misses a section + if fail == "missing" { + // Misses a section for i, section := range testSections { - if sectionIndex != i { + if sectionIx != i { sectionMap[section.Id] = section } else { sectionID = section.Id // Nonexistent ID referenced by course } } - } else { - // one section doesn't reference to correct courses + } else if fail == "modified" { + // One section doesn't reference to correct courses for i, section := range testSections { sectionMap[section.Id] = section - if sectionIndex == i { - // save the section ID and original course reference to be restored later on + if sectionIx == i { + // Save the section ID and original course reference to be restored later on sectionID = section.Id originalID = section.Course_reference - // modify part + // Modified part 
sectionMap[section.Id].Course_reference = primitive.NewObjectID() } } @@ -399,16 +399,16 @@ func testCourseReferenceFail(failType int, courseIndex int, sectionIndex int, t // The course that references nonexistent stuff var failCourse *schema.Course - if failType == 1 { - failCourse = testCourses[courseIndex] + if fail == "missing" { + failCourse = testCourses[courseIx] expectedMsgs = []string{ fmt.Sprintf("Nonexistent section reference found for %v%v!", failCourse.Subject_prefix, failCourse.Course_number), fmt.Sprintf("Referenced section ID: %s\nCourse ID: %s", sectionID, failCourse.Id), } } else { - failCourse = testCourses[courseIndex] - failSection := testSections[sectionIndex] + failCourse = testCourses[courseIx] + failSection := testSections[sectionIx] expectedMsgs = []string{ fmt.Sprintf("Inconsistent section reference found for %v%v! The course references the section, but not vice-versa!", @@ -431,8 +431,8 @@ func testCourseReferenceFail(failType int, courseIndex int, sectionIndex int, t } } - // restore to original course reference of modified section (if needed) - if failType == 2 { + // Restore to original course reference of modified section (if needed) + if fail == "modified" { sectionMap[sectionID].Course_reference = originalID } diff --git a/scrapers/calendar.go b/scrapers/calendar.go index 8a130dd..4429c2e 100644 --- a/scrapers/calendar.go +++ b/scrapers/calendar.go @@ -5,333 +5,199 @@ package scrapers import ( - "context" + "bytes" "encoding/json" - "errors" "fmt" "log" + "net/http" "os" - "regexp" + "strings" "time" "github.com/UTDNebula/api-tools/utils" "github.com/UTDNebula/nebula-api/api/schema" - "github.com/chromedp/cdproto/cdp" - "github.com/chromedp/cdproto/runtime" - "github.com/chromedp/chromedp" "go.mongodb.org/mongo-driver/bson/primitive" ) -const CALENDAR_LINK string = "https://calendar.utdallas.edu/calendar" +// Structure of the API response +type RawEvent struct { + Event map[string]interface{} `json:"event"` +} -var 
trailingSpaceRegex *regexp.Regexp = regexp.MustCompile(`(\s{2,}?\s{2,})|(\n)`) +type APICalendarResponse struct { + Events []RawEvent `json:"events"` + Page map[string]int `json:"page"` + Date map[string]string `json:"date"` +} +// Get the calendar data through API instead of scraping from website func ScrapeCalendar(outDir string) { - - chromedpCtx, cancel := utils.InitChromeDp() - defer cancel() - err := os.MkdirAll(outDir, 0777) if err != nil { panic(err) } + cli := http.Client{Timeout: 15 * time.Second} + var calendarData APICalendarResponse - events := []schema.Event{} - - log.Printf("Scraping event page links") - //Grab all links to event pages - var pageLinks []string = []string{} - _, err = chromedp.RunResponse(chromedpCtx, - chromedp.Navigate(CALENDAR_LINK), - chromedp.QueryAfter(".item.event_item.vevent > a", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - for _, node := range nodes { - href, hasHref := node.Attribute("href") - if !hasHref { - return errors.New("event card was missing an href") - } - - pageLinks = append(pageLinks, href) - } - return nil - }, - ), - ) - if err != nil { + // Get the total number of pages + log.Printf("Getting the number of pages...") + if err := scrapeAndUnmarshal(&cli, 0, &calendarData); err != nil { panic(err) } - log.Printf("Scraped event page links!") - - for _, page := range pageLinks { - //Navigate to page and get page summary - summary := "" - _, err := chromedp.RunResponse(chromedpCtx, - chromedp.Navigate(page), - chromedp.QueryAfter(".summary", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - if len(nodes) != 0 { - summary = trailingSpaceRegex.ReplaceAllString(getNodeText(nodes[0]), "") - } - return nil - }, chromedp.AtLeast(0), - ), - ) + numPages := calendarData.Page["total"] + log.Printf("The number of pages is %d!\n\n", numPages) - if err != nil { + var events []schema.Event + for page := range numPages { + log.Printf("Scraping 
events of page %d...", page+1) + if err := scrapeAndUnmarshal(&cli, page+1, &calendarData); err != nil { panic(err) } - utils.VPrintf("Navigated to page %s", summary) - - // Grab date/time of the event - var dateTimeStart time.Time - var dateTimeEnd time.Time - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter(".dtstart", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - if len(nodes) != 0 { - timeStamp, hasTime := nodes[0].Attribute("title") - if !hasTime { - return errors.New("event does not have a start time") - } - formattedTime, err := time.Parse(time.RFC3339, timeStamp) - if err != nil { - return err - } - - dateTimeStart = formattedTime - } - return nil - }, chromedp.AtLeast(0), - ), - chromedp.QueryAfter(".dtend", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - if len(nodes) != 0 { - timeStamp, hasTime := nodes[0].Attribute("title") - if !hasTime { - return errors.New("event does not have an end time") - } - formattedTime, err := time.Parse(time.RFC3339, timeStamp) - if err != nil { - return err - } - - dateTimeEnd = formattedTime - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - continue + log.Printf("Scraped events of page %d successfully!\n", page+1) + + log.Printf("Parsing the events of page %d...", page+1) + for _, rawEvent := range calendarData.Events { + // Parse the time + eventInstance := toMap(toMap(toSlice(rawEvent.Event["event_instances"])[0])["event_instance"]) + startTime := parseTime(toString(eventInstance["start"])) + endTime := startTime + if toString(eventInstance["end"]) != "" { + endTime = parseTime(toString(eventInstance["end"])) + } + + // Parse location + location := strings.Trim(fmt.Sprintf("%s, %s", toString(rawEvent.Event["location_name"]), toString(rawEvent.Event["room_number"])), " ,") + + // Parse the event types, event topic, and event target audience + filters := toMap(rawEvent.Event["filters"]) + eventTypes := []string{} + 
eventTopics := []string{} + targetAudiences := []string{} + + rawTypes := toSlice(filters["event_types"]) + for _, rawType := range rawTypes { + eventTypes = append(eventTypes, toString(toMap(rawType)["name"])) + } + + rawAudiences := toSlice(filters["event_target_audience"]) + for _, audience := range rawAudiences { + targetAudiences = append(targetAudiences, toString(toMap(audience)["name"])) + } + + rawTopics := toSlice(filters["event_topic"]) + for _, topic := range rawTopics { + eventTopics = append(eventTopics, toString(toMap(topic)["name"])) + } + + // Parse the event departments and tags + departments := []string{} + tags := []string{} + + rawTags := toSlice(rawEvent.Event["tags"]) + for _, tag := range rawTags { + tags = append(tags, tag.(string)) + } + + rawDeparments := toSlice(rawEvent.Event["departments"]) + for _, deparment := range rawDeparments { + departments = append(departments, toMap(deparment)["name"].(string)) + } + + // Parse the contact info. Note that some events won't have contact phone number + rawContactInfo := toMap(rawEvent.Event["custom_fields"]) + contactInfo := [3]string{} + for i, infoField := range []string{ + "contact_information_name", "contact_information_email", "contact_information_phone", + } { + contactInfo[i] = toString(rawContactInfo[infoField]) + } + + events = append(events, schema.Event{ + Id: primitive.NewObjectID(), + Summary: toString(rawEvent.Event["title"]), + Location: location, + StartTime: startTime, + EndTime: endTime, + Description: toString(rawEvent.Event["description_text"]), + EventType: eventTypes, + TargetAudience: targetAudiences, + Topic: eventTopics, + EventTags: tags, + EventWebsite: toString(rawEvent.Event["url"]), + Department: departments, + ContactName: contactInfo[0], + ContactEmail: contactInfo[1], + ContactPhoneNumber: contactInfo[2], + }) } - utils.VPrintf("Scraped time: %s to %s ", dateTimeStart, dateTimeEnd) - - //Grab Location of Event - var location string = "" - err = 
chromedp.Run(chromedpCtx, - chromedp.QueryAfter("p.location > span", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - if len(nodes) != 0 { - location = getNodeText(nodes[0]) - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - continue - } - utils.VPrintf("Scraped location: %s, ", location) - - //Get description of event - var description string = "" - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter(".description > p", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - if len(nodes) != 0 { - description = getNodeText(nodes[0]) - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - continue - } - utils.VPrintf("Scraped description: %s, ", description) - - //Grab Event Type - var eventType []string = []string{} - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter(".filter-event_types > p > a", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - for _, node := range nodes { - eventType = append(eventType, getNodeText(node)) - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - panic(err) - } - utils.VPrintf("Scraped event type: %s", eventType) + log.Printf("Parsed the events of page %d successfully!\n\n", page+1) + } - //Grab Target Audience - targetAudience := []string{} - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter(".filter-event_target_audience > p > a", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - for _, node := range nodes { - targetAudience = append(targetAudience, getNodeText(node)) - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - panic(err) - } - utils.VPrintf("Scraped target audience: %s, ", targetAudience) + if err := utils.WriteJSON(fmt.Sprintf("%s/events.json", outDir), events); err != nil { + panic(err) + } + log.Printf("Finished parsing %d events successfully!\n\n", len(events)) +} - //Grab Topic - topic := 
[]string{} - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter(".filter-event_topic > p > a", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - for _, node := range nodes { - topic = append(topic, getNodeText(node)) - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - panic(err) - } - utils.VPrintf("Scraped topic: %s, ", topic) +// Scrape the data from the api and unmarshal it to response data +func scrapeAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) error { + // Call API to get the byte data + calendarUrl := fmt.Sprintf("https://calendar.utdallas.edu/api/2/events?days=365&pp=100&page=%d", page) + req, err := http.NewRequest("GET", calendarUrl, nil) + if err != nil { + return err + } + res, err := client.Do(req) + if err != nil { + return err + } + if res != nil && res.StatusCode != 200 { + return fmt.Errorf("ERROR: Non-200 status is returned, %s", res.Status) + } - //Grab Event Tags - tags := []string{} - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter(".event-tags > p > a", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - for _, node := range nodes { - tags = append(tags, getNodeText(node)) - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - panic(err) - } - utils.VPrintf("Scraped tags: %s, ", tags) + // Unmarshal bytes to the response data + buffer := bytes.Buffer{} + if _, err = buffer.ReadFrom(res.Body); err != nil { + return err + } + res.Body.Close() + if err = json.Unmarshal(buffer.Bytes(), &data); err != nil { + return err + } + return nil +} - //Grab Website - var eventWebsite string = "" - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter(".event-website > p > a", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - if len(nodes) != 0 { - href, hasHref := nodes[0].Attribute("href") - if !hasHref { - return errors.New("event does not have website") - } - eventWebsite = 
href - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - continue - } - utils.VPrintf("Scraped website: %s, ", eventWebsite) +// Casting an interface{} to an slice of interface{} +func toSlice(data interface{}) []interface{} { + if array, ok := data.([]interface{}); ok { + return array + } + return nil +} - //Grab Department - var eventDepartment []string = []string{} - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter(".event-group > a", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - for _, node := range nodes { - eventDepartment = append(eventDepartment, getNodeText(node)) - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - panic(err) - } - utils.VPrintf("Scraped department: %s, ", eventDepartment) +// Casting an interface{} to map from string to interface{} +func toMap(data interface{}) map[string]interface{} { + if dataMap, ok := data.(map[string]interface{}); ok { + return dataMap + } + return nil +} - //Grab Contact information - var contactInformationName string = "" - var contactInformationEmail string = "" - var contactInformationPhone string = "" - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter(".custom-field-contact_information_name", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - if len(nodes) != 0 { - contactInformationName = getNodeText(nodes[0]) - } - return nil - }, chromedp.AtLeast(0), - ), - chromedp.QueryAfter(".custom-field-contact_information_email", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - if len(nodes) != 0 { - contactInformationEmail = getNodeText(nodes[0]) - } - return nil - }, chromedp.AtLeast(0), - ), - chromedp.QueryAfter(".custom-field-contact_information_phone", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - if len(nodes) != 0 { - contactInformationPhone = getNodeText(nodes[0]) - if err != nil { - return err - } - } - 
return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - panic(err) +// Casting an interface{} to string, if the data is nil then string is "" +func toString(data interface{}) string { + if data != nil { + if dataString, ok := data.(string); ok { + return dataString } - utils.VPrintf("Scraped contact name info: %s", contactInformationName) - utils.VPrintf("Scraped contact email info: %s", contactInformationEmail) - utils.VPrintf("Scraped contact phone info: %s", contactInformationPhone) - - events = append(events, schema.Event{ - Id: primitive.NewObjectID(), - Summary: summary, - Location: location, - StartTime: dateTimeStart, - EndTime: dateTimeEnd, - Description: description, - EventType: eventType, - TargetAudience: targetAudience, - Topic: topic, - EventTags: tags, - EventWebsite: eventWebsite, - Department: eventDepartment, - ContactName: contactInformationName, - ContactEmail: contactInformationEmail, - ContactPhoneNumber: contactInformationPhone, - }) } + return "" +} - // Write event data to output file - fptr, err := os.Create(fmt.Sprintf("%s/events.json", outDir)) +// Parse string time +func parseTime(stringTime string) time.Time { + parsedTime, err := time.Parse(time.RFC3339, stringTime) if err != nil { panic(err) } - encoder := json.NewEncoder(fptr) - encoder.SetIndent("", "\t") - encoder.Encode(events) - fptr.Close() + return parsedTime }