diff options
| author | horus | 2018-02-12 22:55:47 +0100 |
|---|---|---|
| committer | horus | 2018-02-12 22:55:47 +0100 |
| commit | b6b5993e2c1215c90342398a21e6503a8c03950d (patch) | |
| tree | 60d62bb386875aa14fa4bb56625f5daeb0a64920 | |
| parent | ca5ac0bcb9206e81faab60cc8a8d6da697bdfdbe (diff) | |
| download | alkobote-b6b5993e2c1215c90342398a21e6503a8c03950d.tar.gz | |
Now crawls whiskyworld.de and whiskyzone.de. (crawler)
| -rw-r--r-- | crawler/shop_whiskyworld.go | 31 | ||||
| -rw-r--r-- | crawler/shop_whiskyzone.go | 90 |
2 files changed, 102 insertions, 19 deletions
diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go index 8e4b984..c0fb7b6 100644 --- a/crawler/shop_whiskyworld.go +++ b/crawler/shop_whiskyworld.go @@ -9,14 +9,23 @@ import ( func ScrapeWhiskyworld(shop Shop) []Angebot { + Shop_urls := []string{"https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Blended%2BMalt%2522%257D", + "https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Blended%2BWhiskies%2522%257D", + "https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Single%2BMalt%2522%257D", + } + Whiskys := []Angebot{} c := colly.NewCollector( + colly.UserAgent("friendly"), colly.AllowedDomains("whiskyworld.de"), colly.AllowedDomains("www.whiskyworld.de"), ) c.OnHTML(".product-item", func(e *colly.HTMLElement) { + if !stringInSlice(e.Request.URL.String(), Shop_urls) { + return + } W := Angebot{} @@ -25,7 +34,7 @@ func ScrapeWhiskyworld(shop Shop) []Angebot { W.Name = whisky_name_part1 + " " + whisky_name_part2 - W.Url = "https://www.whiskyworld.de/" + strings.TrimPrefix(e.ChildAttr("a", "href"), "../") + W.Url = "https://www.whiskyworld.de/" + e.ChildAttr("a", "href") regular_price_noisy := e.ChildText(".offer") regular_price := strings.TrimSuffix(strings.TrimPrefix(regular_price_noisy, "statt "), " €*") @@ -34,12 +43,14 @@ func ScrapeWhiskyworld(shop Shop) []Angebot { W.Original_price, err = convert_price(regular_price) if err != nil { + log.Println("Whisky World: Original_price failed: " + regular_price + " // " + W.Name + " // " + W.Url + " // " + e.Request.URL.String()) log.Fatal(err) return } W.Discounted_price, err = convert_price(e.ChildText(".uvp")) if err != nil { + log.Println("Whisky World: Discounted_price failed") log.Fatal(err) return } @@ -64,6 +75,7 @@ func ScrapeWhiskyworld(shop Shop) []Angebot { base_price_noisy = strings.TrimSpace(strings.SplitAfter(base_price_noisy, "Liter")[0]) W.Base_price, err = 
sanitize_base_price(base_price_noisy) if err != nil { + log.Println("Whisky World: Base_price failed") log.Fatal(err) } } @@ -75,12 +87,23 @@ func ScrapeWhiskyworld(shop Shop) []Angebot { W.Shop = shop.Id W.Spirit_type = "Whisky" + e.Request.Visit(W.Url) + W.Website = e.Request.Ctx.Get("website") + Whiskys = append(Whiskys, W) }) - c.Visit("https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Blended%2BMalt%2522%257D") - c.Visit("https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Blended%2BWhiskies%2522%257D") - c.Visit("https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Single%2BMalt%2522%257D") + c.OnHTML("body", func(e *colly.HTMLElement) { + if stringInSlice(e.Request.URL.String(), Shop_urls) { + return + } + + e.Request.Ctx.Put("website", string(e.Response.Body)) + }) + + for _, url := range Shop_urls { + c.Visit(url) + } return Whiskys } diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go index 3303b5e..5809b7e 100644 --- a/crawler/shop_whiskyzone.go +++ b/crawler/shop_whiskyzone.go @@ -2,13 +2,15 @@ package main import ( "log" - "regexp" + "strings" "github.com/gocolly/colly" ) func ScrapeWhiskyzone(shop Shop) []Angebot { + Shop_url := "https://www.whiskyzone.de/widgets/emotion/index/emotionId/248/controllerName/listing" + Whiskys := []Angebot{} c := colly.NewCollector( @@ -18,39 +20,97 @@ func ScrapeWhiskyzone(shop Shop) []Angebot { c.OnHTML(".product--info", func(e *colly.HTMLElement) { + if e.Request.URL.String() != Shop_url { + return + } + W := Angebot{} W.Name = e.ChildAttr("a", "title") W.Url = e.ChildAttr("a", "href") - price_discount_noisy := e.ChildText(".price--default") - price_regular_noisy := e.ChildText(".price--discount") - r, err := regexp.Compile("[0-9]+(,[0-9]{1,2})") + e.ForEach(".image--media", func(i int, e *colly.HTMLElement) { + W.Image_url = e.ChildAttr("img", "src") + }) + + W.Shop = shop.Id + 
W.Spirit_type = "Whisky" + + e.Request.Visit(W.Url) + + var err error + W.Discounted_price, err = convert_price(e.Request.Ctx.Get("discounted_price")) if err != nil { + log.Println("Discounted_price failed") log.Fatal(err) } - W.Discounted_price, err = convert_price(r.FindString(price_discount_noisy)) + + W.Original_price, err = convert_price(e.Request.Ctx.Get("original_price")) if err != nil { + log.Println("Original_price failed") log.Fatal(err) - return } - W.Original_price, err = convert_price(r.FindString(price_regular_noisy)) - if err != nil { - log.Fatal(err) + + W.Volume = get_volume(e) + W.Abv = get_abv(e) + + base_price := e.Request.Ctx.Get("base_price") + if base_price == "same_as_discounted_price" { + W.Base_price = W.Discounted_price + } else { + W.Base_price = get_base_price(e) + } + + W.Website = e.Request.Ctx.Get("website") + Whiskys = append(Whiskys, W) + }) + + c.OnHTML(".product--buybox", func(e *colly.HTMLElement) { + if e.Request.URL.String() == Shop_url { return } - e.ForEach(".image--media", func(i int, e *colly.HTMLElement) { - W.Image_url = e.ChildAttr("img", "src") + // Original & Discounted Price + e.ForEach(".product--price.price--default.price--discount", func(i int, e *colly.HTMLElement) { + e.Request.Ctx.Put("discounted_price", e.ChildText(".price--content.content--default")) + e.Request.Ctx.Put("original_price", e.ChildText(".price--line-through")) }) - W.Shop = shop.Id - W.Spirit_type = "Whisky" + // Volume & Base Price + e.ForEach(".product--price.price--unit", func(i int, e *colly.HTMLElement) { + text_noisy_t := e.Text + text_noisy_t = strings.Replace(text_noisy_t, "Inhalt", "", 1) + text_noisy_t = strings.Replace(text_noisy_t, ":", "", 1) - Whiskys = append(Whiskys, W) + // Containts the base price in "(" if it's not "1 Liter" + if strings.Contains(text_noisy_t, "(") { + text_noisy := strings.Split(text_noisy_t, "(") + volume_noisy := strings.Replace(text_noisy[0], "(", "", 1) + e.Request.Ctx.Put("volume", volume_noisy) + + 
base_price_noisy := strings.Replace(text_noisy[1], ")", "", 1) + e.Request.Ctx.Put("base_price", base_price_noisy) + } else { + e.Request.Ctx.Put("volume", text_noisy_t) + e.Request.Ctx.Put("base_price", "same_as_discounted_price") + } + }) + + // ABV + e.ForEach(".base-info--entry.entry-attribute", func(i int, e *colly.HTMLElement) { + text_noisy := e.ChildText(".entry--content") + + if strings.Contains(text_noisy, "Alkoholgehalt") && strings.Contains(text_noisy, "%") { + abv_noisy := strings.Replace(text_noisy, "Alkoholgehalt:", "", 1) + e.Request.Ctx.Put("abv", abv_noisy) + + } + }) + + e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit("https://www.whiskyzone.de/widgets/emotion/index/emotionId/248/controllerName/listing") + c.Visit(Shop_url) return Whiskys } |
