From 0026ba55f03c5378d5773459fcdd7c6931ff42a5 Mon Sep 17 00:00:00 2001 From: Max Date: Fri, 15 Jun 2018 19:38:04 +0200 Subject: Introduces central crawler config. (crawler) --- crawler/shop_bottleworld.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'crawler/shop_bottleworld.go') diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go index b92896d..de9fe13 100644 --- a/crawler/shop_bottleworld.go +++ b/crawler/shop_bottleworld.go @@ -12,10 +12,7 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { Shop_url := "https://www.bottleworld.de/aktuelle-sonderpreise/show/all" Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("bottleworld.de"), - colly.AllowedDomains("www.bottleworld.de"), - ) + c := customCollector([]string{"bottleworld.de", "www.bottleworld.de"}) c.OnHTML("li.item", func(e *colly.HTMLElement) { W := Angebot{} -- cgit v1.2.3 From 8d68ac7c900241eb8499a94c23ab1f60750e7aed Mon Sep 17 00:00:00 2001 From: horus Date: Fri, 15 Jun 2018 23:28:18 +0200 Subject: Introduces config for user agent, robots.txt and crawler delay. (crawler) --- crawler/shop_bottleworld.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'crawler/shop_bottleworld.go') diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go index de9fe13..d679b43 100644 --- a/crawler/shop_bottleworld.go +++ b/crawler/shop_bottleworld.go @@ -6,13 +6,18 @@ import ( // "github.com/PuerkitoBio/goquery" "github.com/gocolly/colly" + "log" + "time" ) func (app *App) ScrapeBottleWord(shop Shop) []Angebot { Shop_url := "https://www.bottleworld.de/aktuelle-sonderpreise/show/all" Whiskys := []Angebot{} - c := customCollector([]string{"bottleworld.de", "www.bottleworld.de"}) + c := app.customCollector([]string{"bottleworld.de", "www.bottleworld.de"}) + + log.Println(c.IgnoreRobotsTxt) + log.Println(time.Duration(app.Config.Delay)) c.OnHTML("li.item", func(e *colly.HTMLElement) { W := Angebot{} @@ -60,7 +65,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { W.Image_url = e.ChildAttr("img", "src") - e.Request.Visit(W.Url) + erro := e.Request.Visit(W.Url) + if erro != nil { + Warn(nil, W.Url+" "+erro.Error()) + } var ctx string W.Volume, ctx = get_volume(e) @@ -109,7 +117,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { e.Request.Ctx.Put("spirit_type", detect_spirit_type(text_noisy)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } -- cgit v1.2.3 From 482ac52e2db7ca3db7005dcc01d21b69da0faf89 Mon Sep 17 00:00:00 2001 From: horus Date: Fri, 15 Jun 2018 23:44:07 +0200 Subject: Removes unnecessary code. (crawler) --- crawler/shop_bottleworld.go | 5 ----- 1 file changed, 5 deletions(-) (limited to 'crawler/shop_bottleworld.go') diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go index d679b43..8722211 100644 --- a/crawler/shop_bottleworld.go +++ b/crawler/shop_bottleworld.go @@ -6,8 +6,6 @@ import ( // "github.com/PuerkitoBio/goquery" "github.com/gocolly/colly" - "log" - "time" ) func (app *App) ScrapeBottleWord(shop Shop) []Angebot { @@ -16,9 +14,6 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { c := app.customCollector([]string{"bottleworld.de", "www.bottleworld.de"}) - log.Println(c.IgnoreRobotsTxt) - log.Println(time.Duration(app.Config.Delay)) - c.OnHTML("li.item", func(e *colly.HTMLElement) { W := Angebot{} -- cgit v1.2.3