From 4dc18e3691127e058833fd9c7a5bbee333c3a66c Mon Sep 17 00:00:00 2001
From: Max
Date: Wed, 17 Apr 2019 09:44:56 +0200
Subject: Initial commit.

---
 scrape.go | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 scrape.go

diff --git a/scrape.go b/scrape.go
new file mode 100644
index 0000000..ebb02fa
--- /dev/null
+++ b/scrape.go
@@ -0,0 +1,79 @@
+package main
+
+import (
+	"time"
+
+	log "github.com/Sirupsen/logrus"
+	"github.com/gocolly/colly"
+)
+
+func (app *App) ScrapeHTML(platforms []Platform) {
+
+	wait := make(chan bool)
+	count := 0
+
+	for _, platform := range platforms {
+
+		go app.Scrape(platform, wait)
+		count++
+
+	}
+
+	// Wait until all goroutines have finished.
+	for i := 0; i < count; i++ {
+		<-wait
+	}
+}
+
+func (app *App) Scrape(platform Platform, wait chan bool) {
+	var entries []Entry
+
+	// Retry up to three times until at least one entry comes back.
+	for i := 1; i < 4; i++ {
+		entries = app.ScrapePlatform(platform)
+
+		if len(entries) >= 1 {
+			break
+		}
+	}
+
+	// If there are still no results, return early.
+	if len(entries) == 0 {
+		wait <- true
+		return
+	}
+
+	if err := app.SaveEntries(entries); err != nil {
+		Warn(err, "Saving entries failed. Platform: "+platform.Name)
+	}
+
+	wait <- true
+}
+
+func (app *App) ScrapePlatform(platform Platform) []Entry {
+
+	switch platform.Name {
+	case "Github":
+		return app.ScrapeGithub(platform)
+	default:
+		log.Println(platform.Name + ": No Crawler")
+	}
+
+	return []Entry{}
+}
+
+// customCollector returns a colly.Collector configured from the app config.
+func (app *App) customCollector(allowedDomains []string) *colly.Collector {
+	c := colly.NewCollector(
+		colly.UserAgent(app.Config.UserAgent),
+		colly.AllowedDomains(allowedDomains...),
+	)
+	c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT
+
+	c.Limit(&colly.LimitRule{
+		DomainGlob:  "*",
+		RandomDelay: time.Duration(app.Config.Delay) * time.Second,
+	})
+
+	return c
+}
--
cgit v1.2.3
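
A note on ScrapeHTML above: it fans out one goroutine per platform and joins by counting receives on an unbuffered channel. The sketch below shows the same fan-out/join expressed with sync.WaitGroup, a common alternative to the counting channel. App, Platform, and the Scrape body here are minimal stand-ins for the types defined elsewhere in this repository, not the commit's real definitions.

package main

import (
	"fmt"
	"sync"
)

// Stand-in types: the real App and Platform live elsewhere in this repo.
type Platform struct{ Name string }
type App struct{}

// Scrape is a placeholder for the per-platform work done in scrape.go.
func (app *App) Scrape(platform Platform) {
	fmt.Println("scraping", platform.Name)
}

// ScrapeHTML starts one goroutine per platform and blocks until all have
// finished, using a WaitGroup instead of the counting channel above.
func (app *App) ScrapeHTML(platforms []Platform) {
	var wg sync.WaitGroup
	for _, platform := range platforms {
		wg.Add(1)
		go func(p Platform) {
			defer wg.Done() // runs on every return path, including panics
			app.Scrape(p)
		}(platform) // pass the loop variable explicitly to avoid capture bugs
	}
	wg.Wait() // join: returns once every goroutine has called Done
}

func main() {
	app := &App{}
	app.ScrapeHTML([]Platform{{Name: "Github"}, {Name: "Gitlab"}})
}

With a WaitGroup the join needs no manual count variable, and the worker does not have to remember to send on a channel from every return path, as Scrape in the patch must.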
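
customCollector above only configures a collector; the callbacks and the actual visits happen in the platform scrapers (ScrapeGithub is not part of this commit). The sketch below shows how a collector built with these gocolly/colly v1 options is typically driven. The user agent, domain, delay, selector, and start URL are illustrative stand-ins for the app.Config values, not taken from this repository.

package main

import (
	"fmt"
	"time"

	"github.com/gocolly/colly"
)

func main() {
	// Mirror of the options set in customCollector, with literal values
	// standing in for app.Config (UserAgent, IgnoreRobotsTXT, Delay).
	c := colly.NewCollector(
		colly.UserAgent("example-scraper/0.1"),
		colly.AllowedDomains("github.com"),
	)
	c.IgnoreRobotsTxt = false

	c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 2 * time.Second,
	})

	// Callbacks are registered before visiting; here, print every link.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		fmt.Println(e.Attr("href"))
	})

	// Visit blocks until the request and its callbacks have completed.
	if err := c.Visit("https://github.com/trending"); err != nil {
		fmt.Println("visit failed:", err)
	}
}

The registration order matters: Visit on a synchronous collector processes the response immediately, so an OnHTML callback registered afterwards would never fire.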