Skip to content

Commit

Permalink
Astra scraper (#31)
Browse files Browse the repository at this point in the history
* Log in to Astra

* Attempt to call backend Astra API

* Uncomment cookie code

* Successful scrape!

TODOs:
sorting
scrape each day
look into login inputting user/pass in wrong sometimes

* Request in loop

* Scrape until 90 days of less than 10 events

After this and next semester, it seems there are only ever 2 events: one in FO 3.616 with no time (?) and one with no location that always shows up after the current semester and says either the holiday or "Events for Future Terms", as well as "No Events Allowed". This just scrapes 90 days into that and stops at about a year and 2 months out.

* Sort by start time

* Check if max events exceeded

* Run close commands when no longer needed

Closes chromedp once it is no longer necessary

* Start on previous day
  • Loading branch information
TyHil authored Oct 14, 2024
1 parent 28b3641 commit 6b96e8d
Show file tree
Hide file tree
Showing 6 changed files with 205 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#Scrapers
LOGIN_NETID=
LOGIN_PASSWORD=
LOGIN_ASTRA_USERNAME=
LOGIN_ASTRA_PASSWORD=
HEADLESS_MODE=false

#Uploader
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ require (
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335
github.com/chromedp/chromedp v0.10.0
github.com/joho/godotenv v1.5.1
github.com/valyala/fastjson v1.6.4
go.mongodb.org/mongo-driver v1.15.0
)

Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/valyala/fastjson v1.6.4 h1:uAUNq9Z6ymTgGhcm0UynUAB6tlbakBrz6CQFax3BXVQ=
github.com/valyala/fastjson v1.6.4/go.mod h1:CLCAqky6SMuOcxStkYQvblddUtoRxhYMGLrsQns1aXY=
github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c=
github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI=
github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY=
Expand Down
4 changes: 4 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ func main() {
scrapeOrganizations := flag.Bool("organizations", false, "Alongside -scrape, signifies that SOC organizations should be scraped.")
// Flag for event scraping
scrapeEvents := flag.Bool("events", false, "Alongside -scrape, signifies that events should be scraped.")
// Flag for astra scraping
scrapeAstra := flag.Bool("astra", false, "Alongside -scrape, signifies that Astra should be scraped.")

// Flags for parsing
parse := flag.Bool("parse", false, "Puts the tool into parsing mode.")
Expand Down Expand Up @@ -92,6 +94,8 @@ func main() {
scrapers.ScrapeOrganizations(*outDir)
case *scrapeEvents:
scrapers.ScrapeEvents(*outDir)
case *scrapeAstra:
scrapers.ScrapeAstra(*outDir)
default:
log.Panic("You must specify which type of scraping you would like to perform with one of the scraping flags!")
}
Expand Down
123 changes: 123 additions & 0 deletions scrapers/astra.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/*
This file contains the code for the Astra scraper.
*/

package scrapers

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"strings"
	"time"

	"github.com/UTDNebula/api-tools/utils"
	"github.com/joho/godotenv"
	"github.com/valyala/fastjson"
)

// MAX_EVENTS_PER_DAY caps the number of events requested per day; the scrape
// aborts if a day reports at least this many events, since the response
// would then be truncated at the request limit.
var MAX_EVENTS_PER_DAY = 5000

// ScrapeAstra signs into Astra via chromedp, then queries the Astra calendar
// API one day at a time, starting from yesterday, until it has seen 90
// consecutive days with fewer than 10 events. The raw per-day JSON responses
// are collected into a single JSON object keyed by date ("YYYY-MM-DD") and
// written to <outDir>/reservations.json. Panics on any request or I/O failure.
func ScrapeAstra(outDir string) {

	// Load env vars (Astra credentials are read by RefreshAstraToken)
	if err := godotenv.Load(); err != nil {
		log.Panic("Error loading .env file")
	}

	// Start chromedp; it is only needed for the login handshake below
	chromedpCtx, cancel := utils.InitChromeDp()

	// Make output folder
	if err := os.MkdirAll(outDir, 0777); err != nil {
		panic(err)
	}

	// Accumulate the output JSON object with a strings.Builder to avoid the
	// quadratic cost of repeatedly re-concatenating a growing string.
	var days strings.Builder
	days.WriteString("{")
	firstLoop := true // suppresses the leading comma on the first entry

	// Init http client
	tr := &http.Transport{
		MaxIdleConns:       10,
		IdleConnTimeout:    30 * time.Second,
		DisableCompression: true,
	}
	cli := &http.Client{Transport: tr}

	// Get cookies for auth, then shut chromedp down — the rest of the scrape
	// is plain HTTP requests.
	astraHeaders := utils.RefreshAstraToken(chromedpCtx)
	time.Sleep(500 * time.Millisecond)
	cancel() // Don't need chromedp anymore

	// Start on the previous day to make sure we have today's data,
	// regardless of what timezone the scraper is running in.
	date := time.Now().Add(-24 * time.Hour)

	// Number of consecutive days with fewer than 10 events; scraping stops
	// once this reaches 90.
	lt10EventsCount := 0

	for lt10EventsCount < 90 {
		formattedDate := date.Format("2006-01-02")
		log.Printf("Scraping %s...", formattedDate)

		// Request this day's events. The URL (field list, filter, and sort
		// order) mirrors a request the Astra web UI itself makes.
		url := fmt.Sprintf("https://www.aaiscloud.com/UTXDallas/~api/calendar/CalendarWeekGrid?_dc=%d&action=GET&start=0&limit=%d&isForWeekView=false&fields=ActivityId,ActivityPk,ActivityName,ParentActivityId,ParentActivityName,MeetingType,Description,StartDate,EndDate,DayOfWeek,StartMinute,EndMinute,ActivityTypeCode,ResourceId,CampusName,BuildingCode,RoomNumber,RoomName,LocationName,InstitutionId,SectionId,SectionPk,IsExam,IsCrosslist,IsAllDay,IsPrivate,EventId,EventPk,CurrentState,NotAllowedUsageMask,UsageColor,UsageColorIsPrimary,EventTypeColor,MaxAttendance,ActualAttendance,Capacity&filter=(StartDate%%3C%%3D%%22%sT23%%3A00%%3A00%%22)%%26%%26(EndDate%%3E%%3D%%22%sT00%%3A00%%3A00%%22)&page=1&sortOrder=%%2BStartDate,%%2BStartMinute", time.Now().UnixMilli(), MAX_EVENTS_PER_DAY, formattedDate, formattedDate)
		req, err := http.NewRequest("GET", url, nil)
		if err != nil {
			panic(err)
		}
		req.Header = astraHeaders
		res, err := cli.Do(req)
		if err != nil {
			panic(err)
		}
		if res.StatusCode != 200 {
			log.Panicf("ERROR: Status was: %s\nIf the status is 404, you've likely been IP ratelimited!", res.Status)
		}
		body, err := io.ReadAll(res.Body)
		if err != nil {
			panic(err)
		}
		res.Body.Close()

		// Abort if the response may have been truncated at the request limit
		numEvents := fastjson.GetInt(body, "totalRecords")
		if numEvents >= MAX_EVENTS_PER_DAY {
			log.Panic("ERROR: Max events per day exceeded!")
		}
		if numEvents < 10 {
			lt10EventsCount++
			if lt10EventsCount > 30 {
				log.Printf("There have been %d days in a row with fewer than 10 events.", lt10EventsCount)
			}
		} else {
			lt10EventsCount = 0
		}

		// Append `"<date>":<raw response>` to the output object
		if !firstLoop {
			days.WriteString(",")
		}
		firstLoop = false
		fmt.Fprintf(&days, "\"%s\":%s", formattedDate, body)
		date = date.Add(24 * time.Hour)
	}
	days.WriteString("}")

	// Write event data to output file
	if err := os.WriteFile(fmt.Sprintf("%s/reservations.json", outDir), []byte(days.String()), 0666); err != nil {
		panic(err)
	}
}
73 changes: 73 additions & 0 deletions utils/methods.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,79 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string {
}
}

// RefreshAstraToken signs into Astra (aaiscloud.com) using the
// LOGIN_ASTRA_USERNAME and LOGIN_ASTRA_PASSWORD environment variables and
// returns a set of request headers — including the session cookies — suitable
// for calling the Astra backend API directly. It panics if the credentials
// are missing, the login fails, or the UTXDallas.ASPXFORMSAUTH session cookie
// is never issued.
func RefreshAstraToken(chromedpCtx context.Context) map[string][]string {
	// Get username and password
	username, present := os.LookupEnv("LOGIN_ASTRA_USERNAME")
	if !present {
		log.Panic("LOGIN_ASTRA_USERNAME is missing from .env!")
	}
	password, present := os.LookupEnv("LOGIN_ASTRA_PASSWORD")
	if !present {
		log.Panic("LOGIN_ASTRA_PASSWORD is missing from .env!")
	}

	// Sign in: clear any stale cookies, then drive the Astra login form
	VPrintf("Signing in...")
	_, err := chromedp.RunResponse(chromedpCtx,
		chromedp.ActionFunc(func(ctx context.Context) error {
			return network.ClearBrowserCookies().Do(ctx)
		}),
		chromedp.Navigate(`https://www.aaiscloud.com/UTXDallas/logon.aspx?ReturnUrl=%2futxdallas%2fcalendars%2fdailygridcalendar.aspx`),
		chromedp.WaitVisible(`input#userNameField-inputEl`),
		chromedp.SendKeys(`input#userNameField-inputEl`, username),
		chromedp.SendKeys(`input#textfield-1029-inputEl`, password),
		chromedp.WaitVisible(`a#logonButton`),
		chromedp.Click(`a#logonButton`),
		chromedp.WaitVisible(`body`, chromedp.ByQuery),
	)
	if err != nil {
		panic(err)
	}

	// Collect every cookie into a single Cookie header value, verifying that
	// the forms-auth session cookie was actually issued.
	cookieStr := ""
	_, err = chromedp.RunResponse(chromedpCtx,
		chromedp.WaitVisible(`body`, chromedp.ByQuery),
		chromedp.ActionFunc(func(ctx context.Context) error {
			cookies, err := network.GetCookies().Do(ctx)
			gotToken := false
			for _, cookie := range cookies {
				cookieStr = fmt.Sprintf("%s%s=%s; ", cookieStr, cookie.Name, cookie.Value)
				if cookie.Name == "UTXDallas.ASPXFORMSAUTH" {
					// Fix: this log previously named the wrong cookie
					// ("PTGSESSID", copied from the SOC RefreshToken).
					VPrintf("Got new token: UTXDallas.ASPXFORMSAUTH = %s", cookie.Value)
					gotToken = true
				}
			}
			if !gotToken {
				return errors.New("failed to get a new token")
			}
			return err
		}),
	)
	if err != nil {
		panic(err)
	}

	// Return headers, copied from a request the actual site made
	return map[string][]string{
		"Host":                      {"www.aaiscloud.com"},
		"User-Agent":                {"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0"},
		"Accept":                    {"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"},
		"Accept-Language":           {"en-US,en;q=0.5"},
		"Accept-Encoding":           {"gzip, deflate, br, zstd"},
		"Connection":                {"keep-alive"},
		"Cookie":                    {cookieStr},
		"Upgrade-Insecure-Requests": {"1"},
		"Sec-Fetch-Dest":            {"document"},
		"Sec-Fetch-Mode":            {"navigate"},
		"Sec-Fetch-Site":            {"none"},
		"Sec-Fetch-User":            {"?1"},
		"Priority":                  {"u=0, i"},
	}
}

// Encodes and writes the given data as tab-indented JSON to the given filepath.
func WriteJSON(filepath string, data interface{}) error {
fptr, err := os.Create(filepath)
Expand Down

0 comments on commit 6b96e8d

Please sign in to comment.