summaryrefslogtreecommitdiff
path: root/crawler/scrape.go
diff options
context:
space:
mode:
authorhorus2018-06-15 23:28:18 +0200
committerhorus2018-06-15 23:28:18 +0200
commit8d68ac7c900241eb8499a94c23ab1f60750e7aed (patch)
tree3a5d444f866383d5cdefc512242dc2afa236641e /crawler/scrape.go
parent0026ba55f03c5378d5773459fcdd7c6931ff42a5 (diff)
downloadalkobote-8d68ac7c900241eb8499a94c23ab1f60750e7aed.tar.gz
Introduces config for user agent, robots.txt and crawler delay. (crawler)
Diffstat (limited to 'crawler/scrape.go')
-rw-r--r--crawler/scrape.go28
1 files changed, 28 insertions, 0 deletions
diff --git a/crawler/scrape.go b/crawler/scrape.go
index 4bc66e0..f9e758d 100644
--- a/crawler/scrape.go
+++ b/crawler/scrape.go
@@ -1,7 +1,10 @@
package main
import (
+ "time"
+
log "github.com/Sirupsen/logrus"
+ "github.com/gocolly/colly"
)
func (app *App) ScrapeHTML(shops []Shop) {
@@ -41,6 +44,13 @@ func (app *App) Scrape(shop Shop, wait chan bool) {
}
}
+ // if no results, return early
+ if len(W) == 0 {
+ wait <- true
+ return
+
+ }
+
err = app.save_offer(W)
if err != nil {
Warn(err, "Saving offers failed. Shop: "+shop.Name)
@@ -78,3 +88,21 @@ func (app *App) ScrapeShop(shop Shop) []Angebot {
return []Angebot{}
}
+
+/*
+ * customCollector builds a colly collector configured from app.Config.
+ *
+ * The collector uses app.Config.UserAgent as its user agent, is restricted
+ * to the entries of allowed_urls (passed to colly.AllowedDomains), and takes
+ * its IgnoreRobotsTxt flag from app.Config.IgnoreRobotsTXT. A limit rule
+ * matching every domain ("*") sets RandomDelay to app.Config.Delay seconds.
+ *
+ * NOTE(review): despite its name, allowed_urls is handed to AllowedDomains,
+ * so it presumably has to hold host names rather than full URLs; confirm
+ * against the callers.
+ *
+ * Returns the configured collector.
+ */
+func (app *App) customCollector(allowed_urls []string) *colly.Collector {
+ c := colly.NewCollector(
+ colly.UserAgent(app.Config.UserAgent),
+ colly.AllowedDomains(allowed_urls...), // requests outside these entries are meant to be rejected by colly (TODO confirm matching rules)
+ )
+ c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT // robots.txt handling is driven purely by configuration
+
+ c.Limit(&colly.LimitRule{ // NOTE(review): whatever Limit returns is not checked here; confirm that is intended
+ DomainGlob: "*", // glob that applies the rule to every domain
+ RandomDelay: time.Duration(app.Config.Delay) * time.Second, // app.Config.Delay is interpreted as a number of seconds
+ })
+
+ return c
+}