package main import ( "strconv" "strings" log "github.com/Sirupsen/logrus" "github.com/gocolly/colly" ) func (app *App) ScrapeGithub(platform Platform) []Entry { var err error URL := platform.URL Languages := app.GetLanguages() UpdatePeriods := app.GetUpdatePeriods() current_language := Language{} current_update_period := UpdatePeriod{} Entries := []Entry{} c := app.customCollector([]string{"www.github.com", "github.com"}) c.OnHTML("ol.repo-list > li", func(e *colly.HTMLElement) { entry := Entry{} owner := Owner{} e.ForEach("div > h3", func(i int, e *colly.HTMLElement) { entry.URL = e.ChildAttr("a", "href") owner.Name = strings.TrimSuffix(e.ChildText("a > span"), " /") owner.Name = strings.TrimSpace(owner.Name) entry.Title = strings.TrimPrefix(e.ChildText("a"), owner.Name+" /") entry.Title = strings.TrimSpace(entry.Title) }) e.ForEach("div.py-1", func(i int, e *colly.HTMLElement) { entry.Synopsis = e.ChildText("p") }) e.ForEach("div.text-gray > a.muted-link", func(i int, e *colly.HTMLElement) { if strings.Contains(e.Attr("href"), "stargazers") { stars := strings.TrimSpace(strings.Replace(e.Text, ",", "", -1)) entry.Stars, err = strconv.Atoi(stars) if err != nil { Warn(err, "Github: Extracting stars from "+entry.Title+" failed") } } }) l := Language{} l.ID = current_language.ID l.Name = current_language.Name p := Platform{} p.ID = platform.ID p.Name = platform.Name p.URL = platform.URL u := UpdatePeriod{} u.ID = current_update_period.ID u.Name = current_update_period.Name owner.Platform = &platform owner.URL = URL + owner.Name entry.Owner = &owner entry.Platform = &p entry.Language = &l entry.UpdatePeriod = &u entry.Created_At = app.Now log.Debugf("%+v\n", owner) log.Debugf("%+v\n", entry) Entries = append(Entries, entry) }) for _, l := range Languages { current_language = l for _, t := range UpdatePeriods { current_update_period = t CURRENT_URL := platform.URL + "/trending/" + l.Name + "?since=" + t.Name log.Println("Crawling " + CURRENT_URL) err := c.Visit(CURRENT_URL) if err != nil { Warn(err, "Scraping Platform "+platform.Name+" failed with URL: "+CURRENT_URL) } } } return Entries }