From d0b2f70f278924b264fce12b3da7c4c87cbe4593 Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 16:05:06 +0200 Subject: Adds scraper for Drankdozijn. (crawler) --- crawler/shop_drankdozijn.go | 192 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 crawler/shop_drankdozijn.go (limited to 'crawler/shop_drankdozijn.go') diff --git a/crawler/shop_drankdozijn.go b/crawler/shop_drankdozijn.go new file mode 100644 index 0000000..0a5cca4 --- /dev/null +++ b/crawler/shop_drankdozijn.go @@ -0,0 +1,192 @@ +package main + +import ( + "net/http" + "strconv" + "strings" + + log "github.com/Sirupsen/logrus" + "github.com/gocolly/colly" +) + +func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { + Shop_url_base := "https://drankdozijn.de/aanbiedingen/" + var Shop_url string + Async_url := "https://drankdozijn.de/async/scroll" + + Offers := []Angebot{} + + types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac"} + //types := map[int]string{240: "Cognac"} + var current_type string + + c := app.customCollector([]string{"drankdozijn.de"}) + + c.OnHTML(".product_top", func(e *colly.HTMLElement) { + + if e.Request.URL.String() != Shop_url && e.Request.URL.String() != Async_url { + //Debug(nil, "Drankdozijn.de: Request url ("+e.Request.URL.String()+") is not shop url ("+Shop_url+").") + return + } + + W := Angebot{} + + W.Shop = shop.Id + W.Spirit_type = current_type + + var err error + + e.ForEach(".product_image", func(i int, e *colly.HTMLElement) { + W.Url = e.ChildAttr("a", "href") + W.Image_url = e.ChildAttr("img", "src") + }) + e.ForEach(".product_title", func(i int, e *colly.HTMLElement) { + W.Name = e.ChildText("a") + }) + + if strings.Contains(W.Name, "+ gratis") || strings.Contains(W.Name, "& gratis") { + DebugOffer(W, "Drankdozijn: Skip Offer") + return + } + + e.ForEach(".product_price", func(i int, e *colly.HTMLElement) { + W.Original_price, err = convert_price(e.ChildText(".product_acties")) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".product_acties") + PrintlnOffer(W, "Drankdozijn: Converting original price failed") + return + } + W.Discounted_price, err = convert_price(e.ChildText(".product_aanbieding_prijs")) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".product_aanbieding_prijs") + PrintlnOffer(W, "Drankdozijn: Converting discounted price failed") + return + } + }) + + e.Request.Visit(W.Url) + + var ctx string + + W.Volume, ctx = get_volume(e) + if W.Volume == 0 { + W.error_msg = e.Request.Ctx.Get("volume") + W.error_ctx = ctx + PrintlnOffer(W, "Drankdozijn: Volume is zero") + return + } + + W.Abv, ctx = get_abv(e) + if W.Abv == 0 { + W.error_msg = "Drankdozijn: Abv is zero" + W.error_ctx = ctx + PrintlnOffer(W, "Drankdozijn: abv is zero") + return + } + + base_price_noisy := e.Request.Ctx.Get("base_price") + W.Base_price, err = convert_price(base_price_noisy) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".price_l") + PrintlnOffer(W, "Drankdozijn: Converting base price failed") + return + } + + if current_type == "Cognac" { + W.Spirit_type = e.Request.Ctx.Get("spirit_type") + } + + W.Website = e.Request.Ctx.Get("website") + + //DebugOffer(W, "DEBUG") + + Offers = append(Offers, W) + }) + + c.OnHTML(".main_price", func(e *colly.HTMLElement) { + //e.Request.Ctx.Put("base_price", strings.TrimPrefix(e.ChildText(".price_l"), "/L")) + e.Request.Ctx.Put("base_price", e.ChildText(".price_l")) + }) + + c.OnHTML(".main_description", func(e *colly.HTMLElement) { + prev := "" + count := 0 + e.ForEach(".col-xs-6", func(i int, e *colly.HTMLElement) { + if count%2 == 0 { + prev = e.Text + } else { + switch strings.TrimSpace(prev) { + case "Inhalt": + e.Request.Ctx.Put("volume", e.Text) + case "Alkoholgehalt": + e.Request.Ctx.Put("abv", e.Text) + case "Kategorie": + e.Request.Ctx.Put("spirit_type", e.Text) + } + + prev = "" + } + count++ + }) + }) + + c.OnHTML("body", func(e *colly.HTMLElement) { + if e.Request.URL.String() == Shop_url { + return + } + e.Request.Ctx.Put("website", string(e.Response.Body)) + }) + + var cookie *http.Cookie + var has_cookie bool + c.OnResponse(func(r *colly.Response) { + //log.Debug("Cookies:", c.Cookies(r.Request.URL.String())) + if len(c.Cookies(r.Request.URL.String())) > 0 { + has_cookie = true + cookie = c.Cookies(r.Request.URL.String())[0] + } + }) + + for groepnr, cur_type := range types { + current_type = cur_type + switch current_type { + case "Wodka": + Shop_url = Shop_url_base + "vodka" + case "Likör": + Shop_url = Shop_url_base + "likeuren" + default: + Shop_url = Shop_url_base + current_type + } + + //log.Debug(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": Error (Visit): "+err.Error()) + } + + c.OnRequest(func(r *colly.Request) { + r.Headers.Set("X-Requested-With", "XMLHttpRequest") + r.Headers.Set("Referer", Shop_url) + if has_cookie { + //log.Debug("Setting Cookie: " + cookie.String()) + r.Headers.Set("Cookie", cookie.String()) + } + }) + + for i := 12; true; i = i + 12 { + log.Debug("Crawling Drankdozijn: type = " + cur_type + " items = " + strconv.Itoa(i)) + err := c.Post(Async_url, map[string]string{"items": strconv.Itoa(i), "datum": "0", "groepnr": strconv.Itoa(groepnr)}) + if err != nil { + if "EOF" != err.Error() { + Warn(nil, shop.Name+": Error (Post): "+err.Error()) + } + break + } + } + } + + return Offers +} -- cgit v1.2.3 From 2588017275b32f8f433c732fbb89100fe87c3e96 Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 16:51:51 +0200 Subject: Adds champagner / Drankdozijn. (crawler) --- crawler/shop_drankdozijn.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'crawler/shop_drankdozijn.go') diff --git a/crawler/shop_drankdozijn.go b/crawler/shop_drankdozijn.go index 0a5cca4..adc8633 100644 --- a/crawler/shop_drankdozijn.go +++ b/crawler/shop_drankdozijn.go @@ -16,8 +16,8 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { Offers := []Angebot{} - types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac"} - //types := map[int]string{240: "Cognac"} + types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac", "Champagner": 100} + //types := map[int]string{100: "Champagner"} var current_type string c := app.customCollector([]string{"drankdozijn.de"}) @@ -98,6 +98,10 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { if current_type == "Cognac" { W.Spirit_type = e.Request.Ctx.Get("spirit_type") } + if current_type == "Champagner" && e.Request.Ctx.Get("spirit_type") != "Champagner" { + DebugOffer(W, "Drankdozijn: Skip Offer") + return + } W.Website = e.Request.Ctx.Get("website") @@ -157,6 +161,8 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { Shop_url = Shop_url_base + "vodka" case "Likör": Shop_url = Shop_url_base + "likeuren" + case "Champagner": + Shop_url = Shop_url_base + "wijn" default: Shop_url = Shop_url_base + current_type } -- cgit v1.2.3 From f61abc2069936f600c153d019b5f7a8c9a234e24 Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 17:00:43 +0200 Subject: Bugfix. (crawler) --- crawler/shop_drankdozijn.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'crawler/shop_drankdozijn.go') diff --git a/crawler/shop_drankdozijn.go b/crawler/shop_drankdozijn.go index adc8633..96d914d 100644 --- a/crawler/shop_drankdozijn.go +++ b/crawler/shop_drankdozijn.go @@ -16,8 +16,8 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { Offers := []Angebot{} - types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac", "Champagner": 100} - //types := map[int]string{100: "Champagner"} + types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac", 100: "Champagner"} + //types := map[int]string{240: "Likör"} var current_type string c := app.customCollector([]string{"drankdozijn.de"}) @@ -35,6 +35,7 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { W.Spirit_type = current_type var err error + var skip_offer bool e.ForEach(".product_image", func(i int, e *colly.HTMLElement) { W.Url = e.ChildAttr("a", "href") @@ -50,7 +51,13 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { } e.ForEach(".product_price", func(i int, e *colly.HTMLElement) { - W.Original_price, err = convert_price(e.ChildText(".product_acties")) + original_price_noisy := e.ChildText(".product_acties") + if !strings.Contains(original_price_noisy, "€") { + PrintlnOffer(W, "Drankdozijn: Original price has no € sign. Skipping!") + skip_offer = true + return + } + W.Original_price, err = convert_price(original_price_noisy) if err != nil { W.error_msg = err.Error() W.error_ctx = e.ChildText(".product_acties") @@ -66,6 +73,10 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { } }) + if skip_offer { + return + } + e.Request.Visit(W.Url) var ctx string -- cgit v1.2.3