From fc83917d623228b09191f178062e59fad0722795 Mon Sep 17 00:00:00 2001 From: horus_arch Date: Sat, 17 Feb 2018 15:07:52 +0100 Subject: Adds crawler for whiskysite.nl. (crawler) --- crawler/shop_whiskysitenl.go | 84 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 5 deletions(-) (limited to 'crawler/shop_whiskysitenl.go') diff --git a/crawler/shop_whiskysitenl.go b/crawler/shop_whiskysitenl.go index 43345b2..fb2940d 100644 --- a/crawler/shop_whiskysitenl.go +++ b/crawler/shop_whiskysitenl.go @@ -1,7 +1,6 @@ package main import ( - log "github.com/Sirupsen/logrus" "regexp" "strings" @@ -11,6 +10,8 @@ import ( func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot { Whiskys := []Angebot{} + Shop_url := "https://www.whiskysite.nl/en/specials/?limit=100" + c := colly.NewCollector( colly.AllowedDomains("whiskysite.nl"), colly.AllowedDomains("www.whiskysite.nl"), @@ -30,30 +31,103 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot { price_discount_noisy := e.ChildText(".product-block-price") r, err := regexp.Compile("[0-9]+(,[0-9]{1,2})") if err != nil { - log.Fatal(err) + Fatal(err, "Whiskysite.nl: Discounted price regex failed") } discounted_price := r.FindString(strings.Trim(strings.TrimPrefix(price_discount_noisy, regular_price), "")) W.Original_price, err = convert_price(regular_price) if err != nil { - //log.Println(W.Name, err) + W.error_msg = err.Error() + W.error_ctx = regular_price + WarnOffer(W, "Whiskysite.nl: Extracting original price failed") return } W.Discounted_price, err = convert_price(discounted_price) if err != nil { - //log.Println(W.Name, err) + W.error_msg = err.Error() + W.error_ctx = discounted_price + WarnOffer(W, "Whiskysite.nl: Extracting discounted price failed") return } W.Image_url = e.ChildAttr("img", "src") + if e.Request.Ctx.Get("volume_failed") != "" { + W.error_msg = "Whiskysite.nl: Extracting volume via Liter-Regex failed" + W.error_ctx = e.Request.Ctx.Get("volume_failed") + WarnOffer(W, "Whiskysite.nl: Extracting volume via Liter-Regex failed") + return + } + if e.Request.Ctx.Get("abv_failed") != "" { + W.error_msg = "Whiskysite.nl: Extracting abv via Abv-Regex failed" + W.error_ctx = e.Request.Ctx.Get("volume_failed") + WarnOffer(W, "Whiskysite.nl: Extracting abv via Abv-Regex failed") + return + } + + var ctx string + W.Volume, ctx = get_volume(e) + if W.Volume == 0 { + W.error_msg = "Whiskysite.nl: Extracting volume failed" + W.error_ctx = ctx + WarnOffer(W, "Whiskysite.nl: Extracting volume failed") + return + } + W.Abv, ctx = get_volume(e) + if W.Abv == 0 { + W.error_msg = "Whiskysite.nl: Extracting abv failed" + W.error_ctx = ctx + WarnOffer(W, "Whiskysite.nl: Extracting abv failed") + return + } + + // calculate base price, volume is never zero + W.Base_price = int(RoundToEven(float64(W.Discounted_price) / float64(W.Volume))) + W.Shop = shop.Id W.Spirit_type = "Whisky" Whiskys = append(Whiskys, W) }) - c.Visit("https://www.whiskysite.nl/en/specials/?limit=100") + c.OnHTML("#information", func(e *colly.HTMLElement) { + if e.Request.URL.String() == Shop_url { + return + } + text_noisy := e.Text + + // 0.70ltr. 43.00% + // 0,70 l 46% + // 1,0ltr. 43% + r_number, err := regexp.Compile("[0-9]+([.,][0-9]+)?") + if err != nil { + Fatal(err, "Whiskysite.nl: Number regex failed") + } + r_liter, err := regexp.Compile("[0-9]+([.,][0-9]+)?( )*(l|ltr)") + if err != nil { + Fatal(err, "Whiskysite.nl: Volume regex failed") + } + litre_noisy := r_liter.FindString(text_noisy) + if litre_noisy == "" { + e.Request.Ctx.Put("volume_failed", text_noisy) + return + } + e.Request.Ctx.Put("volume", r_number.FindString(litre_noisy)) + + r_abv, err := regexp.Compile("[0-9]+([.,][0-9]+)?( )*%") + if err != nil { + Fatal(err, "Whiskysite.nl: Abv regex failed") + } + abv_noisy := r_abv.FindString(text_noisy) + if abv_noisy == "" { + e.Request.Ctx.Put("abv_failed", text_noisy) + return + } + e.Request.Ctx.Put("abv", abv_noisy) + + }) + + c.Visit(Shop_url) return Whiskys } -- cgit v1.2.3