From 6c4de0beead82d646e743c7c0919af1f7add3b80 Mon Sep 17 00:00:00 2001 From: Max Date: Fri, 11 Jan 2019 20:28:09 +0100 Subject: Crawler for Drankdozijn now uses the official API. (crawler) --- crawler/shop_drankdozijn.go | 525 ++++++++++++++++++++++++++++++-------------- 1 file changed, 365 insertions(+), 160 deletions(-) (limited to 'crawler') diff --git a/crawler/shop_drankdozijn.go b/crawler/shop_drankdozijn.go index 782a0f5..a76148f 100644 --- a/crawler/shop_drankdozijn.go +++ b/crawler/shop_drankdozijn.go @@ -1,221 +1,426 @@ package main import ( + "encoding/json" + "io/ioutil" "net/http" - "strconv" "strings" + //"strconv" - log "github.com/Sirupsen/logrus" + //log "github.com/Sirupsen/logrus" "github.com/gocolly/colly" ) func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { - Shop_url_base := "https://drankdozijn.de/aanbiedingen/" - var Shop_url string - Async_url := "https://drankdozijn.de/async/scroll" Offers := []Angebot{} - types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac", 100: "Champagner"} - //types := map[int]string{100: "Champagner"} - var current_type string + /** + * Parse the API. + */ + API_URL := "https://api.drankdozijn.nl/sale-products?country=DE&language=de" - c := app.customCollector([]string{"drankdozijn.de", "drankdozijn.nl"}) + c := http.Client{} - c.OnHTML(".product_top", func(e *colly.HTMLElement) { + req, err := http.NewRequest(http.MethodGet, API_URL, nil) + if err != nil { + // TODO + panic(err) + } + + req.Header.Set("accept", "application/json") + req.Header.Set("User-Agent", "") + + api_resp, err := c.Do(req) + if err != nil { + // TODO + panic(err) + } - if e.Request.URL.String() != Shop_url && e.Request.URL.String() != Async_url { - //Debug(nil, "Drankdozijn.de: Request url ("+e.Request.URL.String()+") is not shop url ("+Shop_url+").") - return + api_body, err := ioutil.ReadAll(api_resp.Body) + if err != nil { + // TODO + panic(err) + } + + var tmp_api_map map[string]interface{} + + err = json.Unmarshal(api_body, tmp_api_map) + if err != nil { + // TODO + panic(err) + } + + for _, value := range tmp_api_map { + + api_data := value.(map[string]interface{}) + + if api_data["type"] != "offer" { + continue } W := Angebot{} - W.Shop = shop.Id - W.Spirit_type = current_type - var err error - var skip_offer bool + W.Name = api_data["saleDescription"].(string) - e.ForEach(".product_image", func(i int, e *colly.HTMLElement) { - W.Url = e.ChildAttr("a", "href") - W.Image_url = e.ChildAttr("img", "src") - }) - e.ForEach(".product_title", func(i int, e *colly.HTMLElement) { - W.Name = e.ChildText("a") - }) + W.Spirit_type = detect_spirit_type(api_data["description"].(string)) + + W.Original_price, err = convert_price(api_data["price"].(string)) + if err != nil { + // TODO + panic(err) + } + W.Discounted_price, err = convert_price(api_data["salePrice"].(string)) + if err != nil { + // TODO + panic(err) + } - if strings.Contains(W.Name, "+ gratis") || strings.Contains(W.Name, "& gratis") { - DebugOffer(W, "Drankdozijn: Skip Offer") - return + // Offer URL + tmp_offer_url_map := api_data["products"].(map[string]interface{}) + for _, v := range tmp_offer_url_map { + tmp_url := v.(map[string]interface{}) + W.Url = "https://drankdozijn.de/artikel/" + (tmp_url["alias"]).(string) } - e.ForEach(".product_price", func(i int, e *colly.HTMLElement) { - original_price_noisy := e.ChildText(".product_acties") - if !strings.Contains(original_price_noisy, "€") { - PrintlnOffer(W, "Drankdozijn: Original price has no € sign. Skipping!") - skip_offer = true + c := app.customCollector([]string{"drankdozijn.de", "drankdozijn.nl"}) + + err = c.Visit(W.Url) + if err != nil { + Warn(nil, shop.Name+": Error (Visit): "+err.Error()) + } + + c.OnHTML(".product_top", func(e *colly.HTMLElement) { + /* + if e.Request.URL.String() != Shop_url && e.Request.URL.String() != Async_url { + //Debug(nil, "Drankdozijn.de: Request url ("+e.Request.URL.String()+") is not shop url ("+Shop_url+").") + return + } + */ + + e.ForEach(".product_image", func(i int, e *colly.HTMLElement) { + W.Image_url = e.ChildAttr("img", "src") + }) + + if strings.Contains(W.Name, "+ gratis") || strings.Contains(W.Name, "& gratis") { + DebugOffer(W, "Drankdozijn: Skip Offer") return } - W.Original_price, err = convert_price(original_price_noisy) - if err != nil { - W.error_msg = err.Error() - W.error_ctx = e.ChildText(".product_acties") - PrintlnOffer(W, "Drankdozijn: Converting original price failed") + + e.Request.Visit(W.Url) + + var ctx string + + W.Volume, ctx = get_volume(e) + if W.Volume == 0 { + W.error_msg = e.Request.Ctx.Get("volume") + W.error_ctx = ctx + PrintlnOffer(W, "Drankdozijn: Volume is zero") + return + } + + W.Abv, ctx = get_abv(e) + if W.Abv == 0 { + W.error_msg = "Drankdozijn: Abv is zero" + W.error_ctx = ctx + PrintlnOffer(W, "Drankdozijn: abv is zero") return } - W.Discounted_price, err = convert_price(e.ChildText(".product_aanbieding_prijs")) + + base_price_noisy := e.Request.Ctx.Get("base_price") + W.Base_price, err = convert_price(base_price_noisy) if err != nil { W.error_msg = err.Error() - W.error_ctx = e.ChildText(".product_aanbieding_prijs") - PrintlnOffer(W, "Drankdozijn: Converting discounted price failed") + W.error_ctx = e.ChildText(".price_l") + PrintlnOffer(W, "Drankdozijn: Converting base price failed") return } + + if W.Spirit_type == "Cognac" { + W.Spirit_type = e.Request.Ctx.Get("spirit_type") + } + + if W.Spirit_type == "Likör" { + tmp_type := e.Request.Ctx.Get("spirit_type") + switch tmp_type { + case "Tequila": + W.Spirit_type = "Tequila" + } + } + + if W.Spirit_type == "Wein" { + tmp_type := e.Request.Ctx.Get("spirit_type") + switch tmp_type { + case "Champagner": + case "Champagne": + W.Spirit_type = "Champagner" + default: + DebugOffer(W, "Drankdozijn: Skip Offer") + return + } + } + + W.Website = e.Request.Ctx.Get("website") + + //DebugOffer(W, "DEBUG") + + Offers = append(Offers, W) }) - if skip_offer { - return - } + c.OnHTML(".main_price", func(e *colly.HTMLElement) { + //e.Request.Ctx.Put("base_price", strings.TrimPrefix(e.ChildText(".price_l"), "/L")) + e.Request.Ctx.Put("base_price", e.ChildText(".price_l")) + }) - e.Request.Visit(W.Url) + c.OnHTML(".main_description", func(e *colly.HTMLElement) { + prev := "" + count := 0 + e.ForEach(".col-xs-6", func(i int, e *colly.HTMLElement) { + if count%2 == 0 { + prev = e.Text + } else { + switch strings.TrimSpace(prev) { + case "Inhalt": + case "Inhoud": + e.Request.Ctx.Put("volume", e.Text) + case "Alkoholgehalt": + case "Alcoholpercentage": + e.Request.Ctx.Put("abv", e.Text) + case "Kategorie": + case "Categorie": + e.Request.Ctx.Put("spirit_type", e.Text) + } + + prev = "" + } + count++ + }) + }) - var ctx string + c.OnHTML("body", func(e *colly.HTMLElement) { + /* + if e.Request.URL.String() == Shop_url { + return + } + */ + e.Request.Ctx.Put("website", string(e.Response.Body)) + }) - W.Volume, ctx = get_volume(e) - if W.Volume == 0 { - W.error_msg = e.Request.Ctx.Get("volume") - W.error_ctx = ctx - PrintlnOffer(W, "Drankdozijn: Volume is zero") - return - } + } - W.Abv, ctx = get_abv(e) - if W.Abv == 0 { - W.error_msg = "Drankdozijn: Abv is zero" - W.error_ctx = ctx - PrintlnOffer(W, "Drankdozijn: abv is zero") - return - } + return Offers - base_price_noisy := e.Request.Ctx.Get("base_price") - W.Base_price, err = convert_price(base_price_noisy) - if err != nil { - W.error_msg = err.Error() - W.error_ctx = e.ChildText(".price_l") - PrintlnOffer(W, "Drankdozijn: Converting base price failed") - return - } + // ++++++++++ OLD +++++++++ - if current_type == "Cognac" { - W.Spirit_type = e.Request.Ctx.Get("spirit_type") - } + /* + Offers := []Angebot{} - if current_type == "Likör" { - tmp_type := e.Request.Ctx.Get("spirit_type") - switch tmp_type { - case "Tequila": - W.Spirit_type = "Tequila" + Shop_url_base := "https://drankdozijn.de/aanbiedingen/" + var Shop_url string + Async_url := "https://drankdozijn.de/async/scroll" + + types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac", 100: "Champagner"} + //types := map[int]string{100: "Champagner"} + var current_type string + + c := app.customCollector([]string{"drankdozijn.de", "drankdozijn.nl"}) + + c.OnHTML(".product_top", func(e *colly.HTMLElement) { + + if e.Request.URL.String() != Shop_url && e.Request.URL.String() != Async_url { + //Debug(nil, "Drankdozijn.de: Request url ("+e.Request.URL.String()+") is not shop url ("+Shop_url+").") + return } - } - if current_type == "Champagner" && (e.Request.Ctx.Get("spirit_type") != "Champagner" && e.Request.Ctx.Get("spirit_type") != "Champagne") { - DebugOffer(W, "Drankdozijn: Skip Offer") - return - } + W := Angebot{} + + W.Shop = shop.Id + W.Spirit_type = current_type + + var err error + var skip_offer bool + + e.ForEach(".product_image", func(i int, e *colly.HTMLElement) { + W.Url = e.ChildAttr("a", "href") + W.Image_url = e.ChildAttr("img", "src") + }) + e.ForEach(".product_title", func(i int, e *colly.HTMLElement) { + W.Name = e.ChildText("a") + }) + + if strings.Contains(W.Name, "+ gratis") || strings.Contains(W.Name, "& gratis") { + DebugOffer(W, "Drankdozijn: Skip Offer") + return + } + + e.ForEach(".product_price", func(i int, e *colly.HTMLElement) { + original_price_noisy := e.ChildText(".product_acties") + if !strings.Contains(original_price_noisy, "€") { + PrintlnOffer(W, "Drankdozijn: Original price has no € sign. Skipping!") + skip_offer = true + return + } + W.Original_price, err = convert_price(original_price_noisy) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".product_acties") + PrintlnOffer(W, "Drankdozijn: Converting original price failed") + return + } + W.Discounted_price, err = convert_price(e.ChildText(".product_aanbieding_prijs")) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".product_aanbieding_prijs") + PrintlnOffer(W, "Drankdozijn: Converting discounted price failed") + return + } + }) + + if skip_offer { + return + } + + e.Request.Visit(W.Url) + + var ctx string + + W.Volume, ctx = get_volume(e) + if W.Volume == 0 { + W.error_msg = e.Request.Ctx.Get("volume") + W.error_ctx = ctx + PrintlnOffer(W, "Drankdozijn: Volume is zero") + return + } + + W.Abv, ctx = get_abv(e) + if W.Abv == 0 { + W.error_msg = "Drankdozijn: Abv is zero" + W.error_ctx = ctx + PrintlnOffer(W, "Drankdozijn: abv is zero") + return + } + + base_price_noisy := e.Request.Ctx.Get("base_price") + W.Base_price, err = convert_price(base_price_noisy) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".price_l") + PrintlnOffer(W, "Drankdozijn: Converting base price failed") + return + } + + if current_type == "Cognac" { + W.Spirit_type = e.Request.Ctx.Get("spirit_type") + } - W.Website = e.Request.Ctx.Get("website") - - //DebugOffer(W, "DEBUG") - - Offers = append(Offers, W) - }) - - c.OnHTML(".main_price", func(e *colly.HTMLElement) { - //e.Request.Ctx.Put("base_price", strings.TrimPrefix(e.ChildText(".price_l"), "/L")) - e.Request.Ctx.Put("base_price", e.ChildText(".price_l")) - }) - - c.OnHTML(".main_description", func(e *colly.HTMLElement) { - prev := "" - count := 0 - e.ForEach(".col-xs-6", func(i int, e *colly.HTMLElement) { - if count%2 == 0 { - prev = e.Text - } else { - switch strings.TrimSpace(prev) { - case "Inhalt": - case "Inhoud": - e.Request.Ctx.Put("volume", e.Text) - case "Alkoholgehalt": - case "Alcoholpercentage": - e.Request.Ctx.Put("abv", e.Text) - case "Kategorie": - case "Categorie": - e.Request.Ctx.Put("spirit_type", e.Text) + if current_type == "Likör" { + tmp_type := e.Request.Ctx.Get("spirit_type") + switch tmp_type { + case "Tequila": + W.Spirit_type = "Tequila" } + } - prev = "" + if current_type == "Champagner" && (e.Request.Ctx.Get("spirit_type") != "Champagner" && e.Request.Ctx.Get("spirit_type") != "Champagne") { + DebugOffer(W, "Drankdozijn: Skip Offer") + return } - count++ + + W.Website = e.Request.Ctx.Get("website") + + //DebugOffer(W, "DEBUG") + + Offers = append(Offers, W) }) - }) - c.OnHTML("body", func(e *colly.HTMLElement) { - if e.Request.URL.String() == Shop_url { - return - } - e.Request.Ctx.Put("website", string(e.Response.Body)) - }) - - var cookie *http.Cookie - var has_cookie bool - c.OnResponse(func(r *colly.Response) { - //log.Debug("Cookies:", c.Cookies(r.Request.URL.String())) - if len(c.Cookies(r.Request.URL.String())) > 0 { - has_cookie = true - cookie = c.Cookies(r.Request.URL.String())[0] - } - }) - - for groepnr, cur_type := range types { - current_type = cur_type - switch current_type { - case "Wodka": - Shop_url = Shop_url_base + "vodka" - case "Likör": - Shop_url = Shop_url_base + "likeuren" - case "Champagner": - Shop_url = Shop_url_base + "wijn" - default: - Shop_url = Shop_url_base + current_type - } + c.OnHTML(".main_price", func(e *colly.HTMLElement) { + //e.Request.Ctx.Put("base_price", strings.TrimPrefix(e.ChildText(".price_l"), "/L")) + e.Request.Ctx.Put("base_price", e.ChildText(".price_l")) + }) - //log.Debug(Shop_url) - err := c.Visit(Shop_url) - if err != nil { - Warn(nil, shop.Name+": Error (Visit): "+err.Error()) - } + c.OnHTML(".main_description", func(e *colly.HTMLElement) { + prev := "" + count := 0 + e.ForEach(".col-xs-6", func(i int, e *colly.HTMLElement) { + if count%2 == 0 { + prev = e.Text + } else { + switch strings.TrimSpace(prev) { + case "Inhalt": + case "Inhoud": + e.Request.Ctx.Put("volume", e.Text) + case "Alkoholgehalt": + case "Alcoholpercentage": + e.Request.Ctx.Put("abv", e.Text) + case "Kategorie": + case "Categorie": + e.Request.Ctx.Put("spirit_type", e.Text) + } + + prev = "" + } + count++ + }) + }) + + c.OnHTML("body", func(e *colly.HTMLElement) { + if e.Request.URL.String() == Shop_url { + return + } + e.Request.Ctx.Put("website", string(e.Response.Body)) + }) - c.OnRequest(func(r *colly.Request) { - r.Headers.Set("X-Requested-With", "XMLHttpRequest") - r.Headers.Set("Referer", Shop_url) - if has_cookie { - //log.Debug("Setting Cookie: " + cookie.String()) - r.Headers.Set("Cookie", cookie.String()) + var cookie *http.Cookie + var has_cookie bool + c.OnResponse(func(r *colly.Response) { + //log.Debug("Cookies:", c.Cookies(r.Request.URL.String())) + if len(c.Cookies(r.Request.URL.String())) > 0 { + has_cookie = true + cookie = c.Cookies(r.Request.URL.String())[0] } }) - for i := 12; true; i = i + 12 { - log.Debug("Crawling Drankdozijn: type = " + cur_type + " items = " + strconv.Itoa(i)) - err := c.Post(Async_url, map[string]string{"items": strconv.Itoa(i), "datum": "0", "groepnr": strconv.Itoa(groepnr)}) + for groepnr, cur_type := range types { + current_type = cur_type + switch current_type { + case "Wodka": + Shop_url = Shop_url_base + "vodka" + case "Likör": + Shop_url = Shop_url_base + "likeuren" + case "Champagner": + Shop_url = Shop_url_base + "wijn" + default: + Shop_url = Shop_url_base + current_type + } + + //log.Debug(Shop_url) + err := c.Visit(Shop_url) if err != nil { - if "EOF" != err.Error() { - Warn(nil, shop.Name+": Error (Post): "+err.Error()) + Warn(nil, shop.Name+": Error (Visit): "+err.Error()) + } + + c.OnRequest(func(r *colly.Request) { + r.Headers.Set("X-Requested-With", "XMLHttpRequest") + r.Headers.Set("Referer", Shop_url) + if has_cookie { + //log.Debug("Setting Cookie: " + cookie.String()) + r.Headers.Set("Cookie", cookie.String()) + } + }) + + for i := 12; true; i = i + 12 { + log.Debug("Crawling Drankdozijn: type = " + cur_type + " items = " + strconv.Itoa(i)) + err := c.Post(Async_url, map[string]string{"items": strconv.Itoa(i), "datum": "0", "groepnr": strconv.Itoa(groepnr)}) + if err != nil { + if "EOF" != err.Error() { + Warn(nil, shop.Name+": Error (Post): "+err.Error()) + } + break } - break } } - } - return Offers + return Offers + */ } -- cgit v1.2.3