package main

import (
	"time"

	"github.com/gocolly/colly"
	log "github.com/sirupsen/logrus"
)

// Scrape crawls all configured platforms concurrently and waits for every
// goroutine to report back on the wait channel.
func (app *App) Scrape(platforms []Platform) {
	wait := make(chan bool)
	count := 0
	for _, platform := range platforms {
		go app.ScrapeHTML(platform, wait)
		count++
	}
	// Wait until all goroutines have finished.
	for i := 0; i < count; i++ {
		<-wait
	}
}

// ScrapeHTML scrapes a single platform, retrying a few times if nothing was
// found, then stores the results.
func (app *App) ScrapeHTML(platform Platform, wait chan bool) {
	var entries []Entry

	// Retry up to three times if the platform returned no entries.
	for i := 1; i < 4; i++ {
		entries = app.ScrapePlatform(platform)
		if len(entries) >= 1 {
			break
		}
	}

	// If there are still no results, return early.
	if len(entries) == 0 {
		wait <- true
		return
	}

	if err := app.SaveEntries(entries); err != nil {
		Warn(err, "Saving entries failed. Platform: "+platform.Name)
	}
	wait <- true
}

// ScrapePlatform dispatches to the platform-specific crawler.
func (app *App) ScrapePlatform(platform Platform) []Entry {
	switch platform.Name {
	case "Github":
		return app.ScrapeGithub(platform)
	default:
		log.Println(platform.Name + ": No Crawler")
	}
	return []Entry{}
}

// customCollector sets the crawler config shared by all platform crawlers:
// user agent, allowed domains, robots.txt handling and rate limiting.
func (app *App) customCollector(allowedURLs []string) *colly.Collector {
	c := colly.NewCollector(
		colly.UserAgent(app.Config.UserAgent),
		colly.AllowedDomains(allowedURLs...),
	)
	c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: time.Duration(app.Config.Delay) * time.Second,
	})
	return c
}
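
// The following is a hypothetical sketch of a platform-specific crawler
// (such as ScrapeGithub, which is defined elsewhere in the app), showing how
// customCollector is intended to be wired up with colly's callbacks. The CSS
// selector, the Platform fields (URL, AllowedDomains) and the Entry fields
// (Title, Link) used here are assumptions for illustration, not part of the
// real implementation.
func (app *App) scrapeExample(platform Platform) []Entry {
	var entries []Entry

	// Reuse the shared collector configuration.
	c := app.customCollector(platform.AllowedDomains)

	// Collect one Entry per matching element; the selector is a placeholder.
	c.OnHTML("article h1 a", func(e *colly.HTMLElement) {
		entries = append(entries, Entry{
			Title: e.Text,
			Link:  e.Request.AbsoluteURL(e.Attr("href")),
		})
	})

	// Log failed requests instead of aborting the whole scrape.
	c.OnError(func(r *colly.Response, err error) {
		log.Println(platform.Name+": request failed:", err)
	})

	if err := c.Visit(platform.URL); err != nil {
		log.Println(platform.Name+": visit failed:", err)
	}

	return entries
}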