From 8d68ac7c900241eb8499a94c23ab1f60750e7aed Mon Sep 17 00:00:00 2001
From: horus
Date: Fri, 15 Jun 2018 23:28:18 +0200
Subject: Introduces config for user agent, robots.txt and crawler delay.
 (crawler)

---
 crawler/scrape.go | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/crawler/scrape.go b/crawler/scrape.go
index 4bc66e0..f9e758d 100644
--- a/crawler/scrape.go
+++ b/crawler/scrape.go
@@ -1,7 +1,10 @@
 package main
 
 import (
+	"time"
+
 	log "github.com/Sirupsen/logrus"
+	"github.com/gocolly/colly"
 )
 
 func (app *App) ScrapeHTML(shops []Shop) {
@@ -41,6 +44,12 @@ func (app *App) Scrape(shop Shop, wait chan bool) {
 		}
 	}
 
+	// No results scraped: signal completion and return early.
+	if len(W) == 0 {
+		wait <- true
+		return
+	}
+
 	err = app.save_offer(W)
 	if err != nil {
 		Warn(err, "Saving offers failed. Shop: "+shop.Name)
@@ -78,3 +87,21 @@ func (app *App) ScrapeShop(shop Shop) []Angebot {
 
 	return []Angebot{}
 }
+
+/*
+ * customCollector builds a collector configured with the user agent,
+ * robots.txt policy and crawl delay from the app config.
+ */
+func (app *App) customCollector(allowed_urls []string) *colly.Collector {
+	c := colly.NewCollector(
+		colly.UserAgent(app.Config.UserAgent),
+		colly.AllowedDomains(allowed_urls...),
+	)
+	c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT
+
+	c.Limit(&colly.LimitRule{
+		DomainGlob:  "*",
+		RandomDelay: time.Duration(app.Config.Delay) * time.Second,
+	})
+
+	return c
+}
--
cgit v1.2.3
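
For context, a minimal, self-contained sketch of how the new collector might be wired into a crawl. The Config field names (UserAgent, IgnoreRobotsTXT, Delay) come straight from the diff above; the struct definitions, the example user agent, the example domain, and the OnHTML/Visit usage are illustrative assumptions, not part of the patch. Unlike the patch, the sketch also checks the error that Limit returns.

package main

import (
	"fmt"
	"time"

	"github.com/gocolly/colly"
)

// Config mirrors the three settings the patch reads from app.Config.
// The field names match the diff; the types are assumptions, since the
// Config struct itself is not part of this patch.
type Config struct {
	UserAgent       string
	IgnoreRobotsTXT bool
	Delay           int // upper bound, in seconds, for the randomized delay
}

type App struct {
	Config Config
}

// customCollector reproduces the function added by the patch, with one
// change: Limit returns an error (e.g. for an invalid domain glob),
// which the patch discards and this sketch surfaces.
func (app *App) customCollector(allowedDomains []string) *colly.Collector {
	c := colly.NewCollector(
		colly.UserAgent(app.Config.UserAgent),
		colly.AllowedDomains(allowedDomains...),
	)
	c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT

	// Sleep a random duration of up to Delay seconds before each
	// request, for every domain the collector visits.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: time.Duration(app.Config.Delay) * time.Second,
	}); err != nil {
		fmt.Println("limit rule:", err)
	}

	return c
}

func main() {
	// Example values only; the real ones come from the app's config.
	app := &App{Config: Config{
		UserAgent:       "example-crawler/0.1",
		IgnoreRobotsTXT: false,
		Delay:           2,
	}}

	c := app.customCollector([]string{"example.com"})
	c.OnHTML("title", func(e *colly.HTMLElement) {
		fmt.Println("page title:", e.Text)
	})
	if err := c.Visit("https://example.com/"); err != nil {
		fmt.Println("visit:", err)
	}
}

RandomDelay makes colly sleep a random duration of up to the configured bound before each request matching the rule, which spreads requests out without imposing a fixed rate; robots.txt is consulted only while IgnoreRobotsTxt is false.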