diff options
| author | horus | 2018-06-15 23:28:18 +0200 |
|---|---|---|
| committer | horus | 2018-06-15 23:28:18 +0200 |
| commit | 8d68ac7c900241eb8499a94c23ab1f60750e7aed (patch) | |
| tree | 3a5d444f866383d5cdefc512242dc2afa236641e /crawler/scrape.go | |
| parent | 0026ba55f03c5378d5773459fcdd7c6931ff42a5 (diff) | |
| download | alkobote-8d68ac7c900241eb8499a94c23ab1f60750e7aed.tar.gz | |
Introduces config for user agent, robots.txt and crawler delay. (crawler)
Diffstat (limited to 'crawler/scrape.go')
| -rw-r--r-- | crawler/scrape.go | 28 |
1 file changed, 28 insertions, 0 deletions
```diff
diff --git a/crawler/scrape.go b/crawler/scrape.go
index 4bc66e0..f9e758d 100644
--- a/crawler/scrape.go
+++ b/crawler/scrape.go
@@ -1,7 +1,10 @@
 package main
 
 import (
+	"time"
+
 	log "github.com/Sirupsen/logrus"
+	"github.com/gocolly/colly"
 )
 
 func (app *App) ScrapeHTML(shops []Shop) {
@@ -41,6 +44,13 @@ func (app *App) Scrape(shop Shop, wait chan bool) {
 		}
 	}
 
+	// if no results, return early
+	if len(W) == 0 {
+		wait <- true
+		return
+
+	}
+
 	err = app.save_offer(W)
 	if err != nil {
 		Warn(err, "Saving offers failed. Shop: "+shop.Name)
@@ -78,3 +88,21 @@ func (app *App) ScrapeShop(shop Shop) []Angebot {
 
 	return []Angebot{}
 }
+
+/*
+ * Sets the crawler config.
+ */
+func (app *App) customCollector(allowed_urls []string) *colly.Collector {
+	c := colly.NewCollector(
+		colly.UserAgent(app.Config.UserAgent),
+		colly.AllowedDomains(allowed_urls...),
+	)
+	c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT
+
+	c.Limit(&colly.LimitRule{
+		DomainGlob: "*",
+		RandomDelay: time.Duration(app.Config.Delay) * time.Second,
+	})
+
+	return c
+}
```
