package main

import (
	"net/url"
	"regexp"
	"strings"

	// "github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly"
	log "github.com/sirupsen/logrus"
)

// ScrapeRumundCo crawls the rumundco.de listing pages for whisky, gin,
// tequila and rum and returns one Angebot per product teaser for which a name,
// both prices, an image, a volume and an ABV could be determined.
//
// For every teaser on a listing page the product's detail page is visited
// through the same request context; the detail-page handler stores the raw
// "volume", "abv" and "website" values in that context, from which the teaser
// handler completes the offer. Teasers that fail any extraction step are
// logged via Println and skipped. A listing page that cannot be visited is
// reported via shop.Warn and does not stop the remaining listings.
func (app *App) ScrapeRumundCo(shop Shop) []Angebot {
	const baseURL = "https://www.rumundco.de/"

	/*
	 * kf=29 means Whisky
	 * kf=63 means Gin
	 * kf=92 means Tequila
	 * kf=8 means Rum
	 */
	shopURLs := []string{
		"https://www.rumundco.de/navi.php?q=4&kf=29&kk-suesse-von=0&kk-suesse-bis=100&kk-milde-von=0&kk-milde-bis=100&kk-wuerze-von=0&kk-wuerze-bis=100&kk-frucht-von=0&kk-frucht-bis=100&kk-torf-von=0&kk-torf-bis=100&hf=0&af=90&Sortierung=11&a=350",
		"https://www.rumundco.de/navi.php?q=4&kf=63&kk-suesse-von=0&kk-suesse-bis=100&kk-milde-von=0&kk-milde-bis=100&kk-wuerze-von=0&kk-wuerze-bis=100&kk-frucht-von=0&kk-frucht-bis=100&kk-torf-von=0&kk-torf-bis=100&hf=0&af=90&Sortierung=11&a=350",
		"https://www.rumundco.de/navi.php?q=4&kf=92&kk-suesse-von=0&kk-suesse-bis=100&kk-milde-von=0&kk-milde-bis=100&kk-wuerze-von=0&kk-wuerze-bis=100&kk-frucht-von=0&kk-frucht-bis=100&kk-torf-von=0&kk-torf-bis=100&hf=0&af=90&Sortierung=11&a=350",
		"https://www.rumundco.de/navi.php?q=4&kf=8&kk-suesse-von=0&kk-suesse-bis=100&kk-milde-von=0&kk-milde-bis=100&kk-wuerze-von=0&kk-wuerze-bis=100&kk-frucht-von=0&kk-frucht-bis=100&kk-torf-von=0&kk-torf-bis=100&hf=0&af=290&Sortierung=11&a=350",
	}

	// Both patterns are constant, so compile them once per crawl instead of
	// once per product teaser.
	abvRe := regexp.MustCompile("[0-9]+([,.][0-9])?( )*(%|([vV]ol))")
	pagespeedRe := regexp.MustCompile(`jpg(\.pagespeed.+)$`)

	offers := []Angebot{}

	c := app.customCollector([]string{"rumundco.de", "www.rumundco.de"})

	c.OnHTML(".product-teaser", func(e *colly.HTMLElement) {
		// Only teasers on the listing pages are offers; this handler also
		// fires for ".product-teaser" elements on any other page the collector
		// fetches (e.g. the detail pages visited below).
		listingURL := e.Request.URL.String()
		if !stringInSlice(listingURL, shopURLs) {
			return
		}

		offer := Angebot{}
		offer.Shop = shop.Id

		// spirit type is encoded in "kf" param
		params, err := url.ParseQuery(e.Request.URL.RawQuery)
		if err != nil {
			offer.error_msg = "Rum & Co: Parsing Query from Shop-URL failed"
			offer.error_ctx = listingURL
			offer.Println("Rum & Co: Parsing Query from Shop-URL failed")
			return
		}

		// Get returns "" for a missing key instead of panicking on an index.
		kf := params.Get("kf")
		switch kf {
		case "29":
			offer.Spirit_type = "Whisky"
		case "63":
			offer.Spirit_type = "Gin"
		case "92":
			offer.Spirit_type = "Tequila"
		case "8":
			offer.Spirit_type = "Rum"
		default:
			offer.error_msg = "Rum & Co: Query parameter has unexpected value"
			offer.error_ctx = kf
			offer.Url = listingURL
			offer.Println("Rum & Co: Detecting spirit type failed")
			return
		}

		log.Debug("Rum & Co: Crawling " + offer.Spirit_type + " with param kf=" + kf)

		name := strings.TrimPrefix(e.ChildAttr("img", "alt"), "Restposten: ")
		offer.Name = name
		offer.Url = baseURL + e.ChildAttr("a", "href")

		status := e.ChildText(".delivery-status")
		if !strings.Contains(status, "verfügbar") {
			offer.error_msg = "Rum & Co: Offer not available"
			offer.error_ctx = status
			offer.Println("Rum & Co: Offer not available")
			return
		}

		// The product name often carries the ABV ("... 40% ..."); an empty
		// match falls back to the detail page further down.
		abvNoisy := abvRe.FindString(name)

		// A return inside the ForEach callbacks only leaves the callback, so
		// conversion failures are recorded here and checked afterwards.
		priceFailed := false
		e.ForEach(".price_wrapper", func(_ int, pw *colly.HTMLElement) {
			regular := pw.ChildText(".instead_of")
			if regular == "" {
				offer.error_msg = "Rum & Co: No regular price found"
				offer.error_ctx = regular
				offer.Println("Rum & Co: No regular price found")
				return
			}

			offer.Original_price, err = convert_price(regular)
			if err != nil {
				offer.error_msg = err.Error()
				offer.error_ctx = regular
				offer.Println("Rum & Co: Original price: Convert price failed")
				priceFailed = true
				return
			}

			discounted := pw.ChildText(".price-value")
			offer.Discounted_price, err = convert_price(discounted)
			if err != nil {
				offer.error_msg = err.Error()
				offer.error_ctx = discounted
				offer.Println("Rum & Co: Discounted price: Convert price failed")
				priceFailed = true
				return
			}

			pw.ForEach(".base_price", func(_ int, bp *colly.HTMLElement) {
				perLitreNoisy := bp.ChildText(".value")
				offer.Base_price, err = sanitize_base_price(perLitreNoisy)
				if err != nil {
					offer.error_msg = err.Error()
					offer.error_ctx = perLitreNoisy
					offer.Println("Rum & Co: Base price: Sanitizing base price failed")
					priceFailed = true
				}
			})
		})
		if priceFailed {
			return
		}
		// Without a regular price the teaser is not a discounted offer; stop
		// here rather than fetching a detail page whose result is discarded.
		if offer.Original_price == 0 {
			offer.Println("Rum & Co: Original price is zero")
			return
		}

		// Rum & Co uses pagespeed
		imagePath := e.ChildAttr("img", "data-src")
		if strings.Contains(imagePath, "pagespeed") {
			m := pagespeedRe.FindStringSubmatch(imagePath)
			if len(m) < 2 {
				offer.error_msg = "Rum & Co: (Pagespeed) Image URL not found"
				offer.error_ctx = imagePath
				offer.Println("Rum & Co: (Pagespeed) Image URL not found")
				return
			}
			// Strip the ".pagespeed..." suffix, keeping the plain ".jpg" path.
			imagePath = strings.Replace(imagePath, m[1], "", 1)
		}
		offer.Image_url = baseURL + imagePath

		// The detail page is fetched with this request's context, which the
		// detail-page handler fills. If the visit fails the context still
		// holds whatever an earlier product stored, so the offer is dropped
		// instead of being completed with foreign data.
		// NOTE(review): a detail page lacking one of the attribute rows would
		// likewise leave an older value in the context - confirm how
		// get_volume/get_abv deal with that.
		if err := e.Request.Visit(offer.Url); err != nil {
			offer.error_msg = err.Error()
			offer.error_ctx = offer.Url
			offer.Println("Rum & Co: Visiting detail page failed")
			return
		}

		var ctx string
		offer.Volume, ctx = get_volume(e)
		if offer.Volume == 0 {
			offer.error_msg = "Rum & Co: Volume is zero"
			offer.error_ctx = ctx
			offer.Println("Rum & Co: Volume is zero")
			return
		}

		if abvNoisy == "" {
			offer.Abv, ctx = get_abv(e)
			abvNoisy = ctx
		} else {
			offer.Abv, err = extract_abv(abvNoisy)
			if err != nil {
				offer.error_msg = err.Error()
				offer.error_ctx = abvNoisy
				offer.Println("Rum & Co: Extracting ABV failed")
				return
			}
		}

		if offer.Abv == 0 {
			offer.error_msg = "Rum & Co: Abv is zero"
			offer.error_ctx = abvNoisy
			offer.Println("Rum & Co: Abv is zero")
			return
		}

		offer.Website = e.Request.Ctx.Get("website")

		offers = append(offers, offer)
	})

	// Detail page: stash the raw attribute texts and the page body in the
	// request context for the teaser handler above.
	c.OnHTML("#table-collapse .product-attributes table", func(e *colly.HTMLElement) {
		e.ForEach("tr", func(_ int, row *colly.HTMLElement) {
			switch row.ChildText("th") {
			case "Genauer Inhalt:", "Inhalt:":
				row.Request.Ctx.Put("volume", row.ChildText("td"))
			case "Alkoholgehalt in %:":
				row.Request.Ctx.Put("abv", row.ChildText("a"))
			}
		})

		e.Request.Ctx.Put("website", string(e.Response.Body))
	})

	for _, shopURL := range shopURLs {
		if err := c.Visit(shopURL); err != nil {
			shop.error_msg = err.Error()
			shop.error_ctx = shopURL
			shop.Warn("Crawling failed")
		}
	}

	return offers
}