diff options
| author | Max | 2018-02-08 18:26:41 +0100 |
|---|---|---|
| committer | Max | 2018-02-08 18:26:41 +0100 |
| commit | f6904aab20e2d09255fd0adabfd246165ff3cb02 (patch) | |
| tree | f7ac27cb5dd34443640235a97ce9bde8f2a1816a /crawler/shop_whic.go | |
| parent | ae7ed42df6a55e36c82b88e7c71569951847a68c (diff) | |
| download | alkobote-f6904aab20e2d09255fd0adabfd246165ff3cb02.tar.gz | |
Crawler extracts volume, price per litre and abv. (MC Whisky, Rum & Co, Whic)
Diffstat (limited to 'crawler/shop_whic.go')
| -rw-r--r-- | crawler/shop_whic.go | 45 |
1 files changed, 44 insertions, 1 deletions
diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go index e489161..6025050 100644 --- a/crawler/shop_whic.go +++ b/crawler/shop_whic.go @@ -2,6 +2,7 @@ package main import ( "log" + "regexp" "strings" "github.com/PuerkitoBio/goquery" @@ -9,6 +10,7 @@ import ( ) func ScrapeWhic(shop Shop) []Angebot { + Shop_url := "https://whic.de/angebote" Whiskys := []Angebot{} c := colly.NewCollector( @@ -16,6 +18,11 @@ func ScrapeWhic(shop Shop) []Angebot { ) c.OnHTML("li.item", func(e *colly.HTMLElement) { + + if e.Request.URL.String() != Shop_url { + return + } + W := Angebot{} whisky_name := e.ChildAttr("a", "title") @@ -41,6 +48,12 @@ func ScrapeWhic(shop Shop) []Angebot { }) }) + base_price_noisy := e.ChildText(".base-price") + W.Base_price, err = sanitize_base_price(base_price_noisy) + if err != nil { + log.Fatal(err) + } + /* * colly does not parse a <noscript>, thus we are reading the content and parse it as html. */ @@ -52,13 +65,43 @@ func ScrapeWhic(shop Shop) []Angebot { } W.Image_url, _ = doc.Find("img").Attr("src") + e.Request.Visit(W.Url) + W.Volume = get_volume(e) + W.Abv = get_abv(e) + W.Shop = shop.Id W.Spirit_type = "Whisky" Whiskys = append(Whiskys, W) }) - c.Visit("https://whic.de/angebote") + c.OnHTML("#product-view-head-txt-extra-info", func(e *colly.HTMLElement) { + text_noisy := e.Text + + r_volume, err := regexp.Compile("Volumen: ([0-9]+([.,][0-9]+)) Liter") + if err != nil { + log.Fatal(err) + } + volume := r_volume.FindStringSubmatch(text_noisy) + if volume == nil || len(volume) < 2 { + return + } + + e.Request.Ctx.Put("volume", volume[1]+"l") + + r_abv, err := regexp.Compile("Alkoholgehalt: ([0-9]+([.,][0-9]+))%") + if err != nil { + log.Fatal(err) + } + abv := r_abv.FindStringSubmatch(text_noisy) + if abv == nil || len(abv) < 2 { + return + } + + e.Request.Ctx.Put("abv", abv[1]+"%") + }) + + c.Visit(Shop_url) return Whiskys } |
