diff options
| author | horus | 2018-06-18 15:54:53 +0200 |
|---|---|---|
| committer | horus | 2018-06-18 15:54:53 +0200 |
| commit | 01e0cbe79f37b4be2fc82d31c71042b5ce4d699a (patch) | |
| tree | bb179b5c5c6349a69853c3781236b6056b7e7ea6 /crawler | |
| parent | 88a2628258eb5ea79736338637ab8b5b83680c92 (diff) | |
| parent | 8114b7b17b723a5fe0fee24470e255faf587332e (diff) | |
| download | alkobote-01e0cbe79f37b4be2fc82d31c71042b5ce4d699a.tar.gz | |
Merge branch 'master' of /home/horus/app/fk_angebote
Diffstat (limited to 'crawler')
| -rw-r--r-- | crawler/config.go | 22 | ||||
| -rw-r--r-- | crawler/init.go | 13 | ||||
| -rw-r--r-- | crawler/sanitize.go | 65 | ||||
| -rw-r--r-- | crawler/scrape.go | 34 | ||||
| -rw-r--r-- | crawler/shop_bottleworld.go | 15 | ||||
| -rw-r--r-- | crawler/shop_drankdozijn.go | 209 | ||||
| -rw-r--r-- | crawler/shop_mcwhisky.go | 10 | ||||
| -rw-r--r-- | crawler/shop_rumundco.go | 10 | ||||
| -rw-r--r-- | crawler/shop_whic.go | 9 | ||||
| -rw-r--r-- | crawler/shop_whiskyde.go | 11 | ||||
| -rw-r--r-- | crawler/shop_whiskysitenl.go | 10 | ||||
| -rw-r--r-- | crawler/shop_whiskyworld.go | 16 | ||||
| -rw-r--r-- | crawler/shop_whiskyzone.go | 10 | ||||
| -rw-r--r-- | crawler/shops.go | 8 | ||||
| -rw-r--r-- | crawler/utility.go | 28 |
15 files changed, 403 insertions, 67 deletions
diff --git a/crawler/config.go b/crawler/config.go index f89fa45..a3939c4 100644 --- a/crawler/config.go +++ b/crawler/config.go @@ -17,6 +17,10 @@ type Config struct { DBOptions string DBPath string // for sqlite + UserAgent string + Delay int + IgnoreRobotsTXT bool + DisableURLShorter bool Polr_URL string Polr_API_Key string @@ -40,6 +44,12 @@ func (c *Config) parseConfig(configFile string) { viper.SetDefault("FixDatabase", false) viper.SetDefault("DisableURLShorter", false) viper.SetDefault("ShopIDs", []string{}) + viper.SetDefault("Delay", 0) + + // needs some refactoring to truly respect robots.txt + viper.SetDefault("IgnoreRobotsTXT", true) + + viper.SetDefault("UserAgent", "colly - a friendly crawler :)") // Name of the configuration file viper.SetConfigName("config") @@ -95,10 +105,16 @@ func (c *Config) setsConfig() { c.DBDBName = viper.GetString("DB_DBName") c.DBOptions = viper.GetString("DB_Options") c.DBPath = viper.GetString("DB_Path") - c.Debug = viper.GetBool("Debug") - c.FixDatabase = viper.GetBool("FixDatabase") + + c.UserAgent = viper.GetString("UserAgent") + c.Delay = viper.GetInt("Delay") + c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT") + c.DisableURLShorter = viper.GetBool("DisableURLShorter") - c.ShopIDs = viper.GetStringSlice("ShopIDs") c.Polr_URL = viper.GetString("Polr_URL") c.Polr_API_Key = viper.GetString("Polr_API_Key") + + c.Debug = viper.GetBool("Debug") + c.FixDatabase = viper.GetBool("FixDatabase") + c.ShopIDs = viper.GetStringSlice("ShopIDs") } diff --git a/crawler/init.go b/crawler/init.go index 60f7e47..668df2d 100644 --- a/crawler/init.go +++ b/crawler/init.go @@ -23,6 +23,9 @@ func init() { loglevel_f := flag.StringP("loglevel", "l", "Warn", `sets log level, can be "Warn", "Info" or "Debug"`) flag.Bool("list-shops", false, `lists all crawlable shops`) shopids_f := flag.StringP("restrict-shops", "r", "", `comma separated list of shop ids, crawls only these`) + user_agent_f := flag.StringP("user-agent", "u", "", "sets user agent") + delay_f := flag.Int("delay", 0, "toggles random delay between crawls") + ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignores robots.txt") flag.Parse() loglevel := strings.ToLower(*loglevel_f) @@ -41,6 +44,16 @@ func init() { _conf.parseConfig(*configFile) + if *user_agent_f != "" { + _conf.UserAgent = *user_agent_f + } + if *delay_f != 0 { + _conf.Delay = *delay_f + } + if !*ignore_robots_f { + _conf.IgnoreRobotsTXT = *ignore_robots_f + } + if _conf.Debug && !*silent { log.SetLevel(log.DebugLevel) } diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 2fef9a4..d67b32b 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -2,6 +2,7 @@ package main import ( "fmt" + "net/http" "regexp" "strconv" "strings" @@ -49,6 +50,13 @@ func sanitize_offer(angebote []Angebot, shop Shop, try int) []Angebot { continue } + if err := sanitize_image_url(offer.Image_url); err != nil { + offer.error_ctx = offer.Image_url + offer.error_msg = err.Error() + WarnOffer(offer, "Sanitizer: Image-URL is not valid") + continue + } + //offer.Website = "" W = append(W, offer) @@ -74,6 +82,10 @@ func sanitize_name(name string) string { name = strings.Replace(name, "years old", "Jahre", 1) } + if strings.Contains(name, "years") { + name = strings.Replace(name, "years", "Jahre", 1) + } + if strings.Contains(name, "Years Old") { name = strings.Replace(name, "Years Old", "Jahre", 1) } @@ -100,7 +112,7 @@ func sanitize_name(name string) string { name = strings.Replace(name, age_noisy, age+" Jahre ", 1) } - r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`) + r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[cC]?[lL]((iter)|(tr))?`) if err != nil { Fatal(err, "sanitize_name: Liter-Regexp failed") } @@ -193,34 +205,38 @@ func sanitize_base_price(price_noisy string) (price int, err error) { if strings.Contains(price_noisy, "Grundpreis:") { price_noisy = strings.Replace(price_noisy, "Grundpreis", "", -1) - price_noisy = strings.TrimSpace(price_noisy) } if strings.Contains(price_noisy, "/Liter") { price_noisy = strings.Replace(price_noisy, "/Liter", "", -1) - price_noisy = strings.TrimSpace(price_noisy) } + if strings.Contains(price_noisy, "/L") { + price_noisy = strings.Replace(price_noisy, "/L", "", -1) + } + price_noisy = strings.TrimSpace(price_noisy) return convert_price(price_noisy) } func _check_abv_for_spirit_type(offer Angebot) bool { - if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") { - WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type) - return false - } + /* + if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") { + WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type) + return false + } - if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") { - WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type) - return false - } + if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") { + WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type) + return false + } - if offer.Abv < 14 && offer.Spirit_type == "Likör" { - WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type) - return false + if offer.Abv < 14 && offer.Spirit_type == "Likör" { + WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type) + return false - } + } + */ if offer.Abv == 0 { WarnOffer(offer, "Sanitizer: Abv is zero") @@ -254,3 +270,22 @@ func get_age_from_name(name string) int { } return age } + +func sanitize_image_url(url string) error { + + log.Debugf("sanitize_image_url: Making HEAD request to %s \n", url) + resp, err := http.Head(url) + if err != nil { + return fmt.Errorf("sanitize_image_url: HEAD request failed. Got error: %s \n", err.Error()) + } + + if resp.StatusCode != 200 { + return fmt.Errorf("sanitize_image_url: HEAD request failed. StatusCode not 200, got %d \n", resp.StatusCode) + } + + if !strings.HasPrefix(resp.Header.Get("Content-Type"), "image") { + return fmt.Errorf("sanitize_image_url: HEAD request failed. Got no image, content-type is %s \n", resp.Header.Get("Content-Type")) + } + + return nil +} diff --git a/crawler/scrape.go b/crawler/scrape.go index 4bc66e0..6874239 100644 --- a/crawler/scrape.go +++ b/crawler/scrape.go @@ -1,7 +1,10 @@ package main import ( + "time" + log "github.com/Sirupsen/logrus" + "github.com/gocolly/colly" ) func (app *App) ScrapeHTML(shops []Shop) { @@ -26,10 +29,6 @@ func (app *App) Scrape(shop Shop, wait chan bool) { var W []Angebot var err error - if err != nil { - Fatal(err, "scrape.go: Starting transaction failed. Shop: "+shop.Name) - } - // retry on error for i := 1; i < 4; i++ { W = app.ScrapeShop(shop) @@ -41,6 +40,13 @@ func (app *App) Scrape(shop Shop, wait chan bool) { } } + // if no results, return early + if len(W) == 0 { + wait <- true + return + + } + err = app.save_offer(W) if err != nil { Warn(err, "Saving offers failed. Shop: "+shop.Name) @@ -72,9 +78,29 @@ func (app *App) ScrapeShop(shop Shop) []Angebot { return app.ScrapeWhiskyworld(shop) case "Whiskyzone": return app.ScrapeWhiskyzone(shop) + case "Drankdozijn": + return app.ScrapeDrankdozijn(shop) default: log.Println(shop.Name + ": No Crawler") } return []Angebot{} } + +/* + * Sets the crawler config. + */ +func (app *App) customCollector(allowed_urls []string) *colly.Collector { + c := colly.NewCollector( + colly.UserAgent(app.Config.UserAgent), + colly.AllowedDomains(allowed_urls...), + ) + c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT + + c.Limit(&colly.LimitRule{ + DomainGlob: "*", + RandomDelay: time.Duration(app.Config.Delay) * time.Second, + }) + + return c +} diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go index b92896d..8722211 100644 --- a/crawler/shop_bottleworld.go +++ b/crawler/shop_bottleworld.go @@ -12,10 +12,7 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { Shop_url := "https://www.bottleworld.de/aktuelle-sonderpreise/show/all" Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("bottleworld.de"), - colly.AllowedDomains("www.bottleworld.de"), - ) + c := app.customCollector([]string{"bottleworld.de", "www.bottleworld.de"}) c.OnHTML("li.item", func(e *colly.HTMLElement) { W := Angebot{} @@ -63,7 +60,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { W.Image_url = e.ChildAttr("img", "src") - e.Request.Visit(W.Url) + erro := e.Request.Visit(W.Url) + if erro != nil { + Warn(nil, W.Url+" "+erro.Error()) + } var ctx string W.Volume, ctx = get_volume(e) @@ -112,7 +112,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { e.Request.Ctx.Put("spirit_type", detect_spirit_type(text_noisy)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_drankdozijn.go b/crawler/shop_drankdozijn.go new file mode 100644 index 0000000..96d914d --- /dev/null +++ b/crawler/shop_drankdozijn.go @@ -0,0 +1,209 @@ +package main + +import ( + "net/http" + "strconv" + "strings" + + log "github.com/Sirupsen/logrus" + "github.com/gocolly/colly" +) + +func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { + Shop_url_base := "https://drankdozijn.de/aanbiedingen/" + var Shop_url string + Async_url := "https://drankdozijn.de/async/scroll" + + Offers := []Angebot{} + + types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac", 100: "Champagner"} + //types := map[int]string{240: "Likör"} + var current_type string + + c := app.customCollector([]string{"drankdozijn.de"}) + + c.OnHTML(".product_top", func(e *colly.HTMLElement) { + + if e.Request.URL.String() != Shop_url && e.Request.URL.String() != Async_url { + //Debug(nil, "Drankdozijn.de: Request url ("+e.Request.URL.String()+") is not shop url ("+Shop_url+").") + return + } + + W := Angebot{} + + W.Shop = shop.Id + W.Spirit_type = current_type + + var err error + var skip_offer bool + + e.ForEach(".product_image", func(i int, e *colly.HTMLElement) { + W.Url = e.ChildAttr("a", "href") + W.Image_url = e.ChildAttr("img", "src") + }) + e.ForEach(".product_title", func(i int, e *colly.HTMLElement) { + W.Name = e.ChildText("a") + }) + + if strings.Contains(W.Name, "+ gratis") || strings.Contains(W.Name, "& gratis") { + DebugOffer(W, "Drankdozijn: Skip Offer") + return + } + + e.ForEach(".product_price", func(i int, e *colly.HTMLElement) { + original_price_noisy := e.ChildText(".product_acties") + if !strings.Contains(original_price_noisy, "€") { + PrintlnOffer(W, "Drankdozijn: Original price has no € sign. Skipping!") + skip_offer = true + return + } + W.Original_price, err = convert_price(original_price_noisy) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".product_acties") + PrintlnOffer(W, "Drankdozijn: Converting original price failed") + return + } + W.Discounted_price, err = convert_price(e.ChildText(".product_aanbieding_prijs")) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".product_aanbieding_prijs") + PrintlnOffer(W, "Drankdozijn: Converting discounted price failed") + return + } + }) + + if skip_offer { + return + } + + e.Request.Visit(W.Url) + + var ctx string + + W.Volume, ctx = get_volume(e) + if W.Volume == 0 { + W.error_msg = e.Request.Ctx.Get("volume") + W.error_ctx = ctx + PrintlnOffer(W, "Drankdozijn: Volume is zero") + return + } + + W.Abv, ctx = get_abv(e) + if W.Abv == 0 { + W.error_msg = "Drankdozijn: Abv is zero" + W.error_ctx = ctx + PrintlnOffer(W, "Drankdozijn: abv is zero") + return + } + + base_price_noisy := e.Request.Ctx.Get("base_price") + W.Base_price, err = convert_price(base_price_noisy) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".price_l") + PrintlnOffer(W, "Drankdozijn: Converting base price failed") + return + } + + if current_type == "Cognac" { + W.Spirit_type = e.Request.Ctx.Get("spirit_type") + } + if current_type == "Champagner" && e.Request.Ctx.Get("spirit_type") != "Champagner" { + DebugOffer(W, "Drankdozijn: Skip Offer") + return + } + + W.Website = e.Request.Ctx.Get("website") + + //DebugOffer(W, "DEBUG") + + Offers = append(Offers, W) + }) + + c.OnHTML(".main_price", func(e *colly.HTMLElement) { + //e.Request.Ctx.Put("base_price", strings.TrimPrefix(e.ChildText(".price_l"), "/L")) + e.Request.Ctx.Put("base_price", e.ChildText(".price_l")) + }) + + c.OnHTML(".main_description", func(e *colly.HTMLElement) { + prev := "" + count := 0 + e.ForEach(".col-xs-6", func(i int, e *colly.HTMLElement) { + if count%2 == 0 { + prev = e.Text + } else { + switch strings.TrimSpace(prev) { + case "Inhalt": + e.Request.Ctx.Put("volume", e.Text) + case "Alkoholgehalt": + e.Request.Ctx.Put("abv", e.Text) + case "Kategorie": + e.Request.Ctx.Put("spirit_type", e.Text) + } + + prev = "" + } + count++ + }) + }) + + c.OnHTML("body", func(e *colly.HTMLElement) { + if e.Request.URL.String() == Shop_url { + return + } + e.Request.Ctx.Put("website", string(e.Response.Body)) + }) + + var cookie *http.Cookie + var has_cookie bool + c.OnResponse(func(r *colly.Response) { + //log.Debug("Cookies:", c.Cookies(r.Request.URL.String())) + if len(c.Cookies(r.Request.URL.String())) > 0 { + has_cookie = true + cookie = c.Cookies(r.Request.URL.String())[0] + } + }) + + for groepnr, cur_type := range types { + current_type = cur_type + switch current_type { + case "Wodka": + Shop_url = Shop_url_base + "vodka" + case "Likör": + Shop_url = Shop_url_base + "likeuren" + case "Champagner": + Shop_url = Shop_url_base + "wijn" + default: + Shop_url = Shop_url_base + current_type + } + + //log.Debug(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": Error (Visit): "+err.Error()) + } + + c.OnRequest(func(r *colly.Request) { + r.Headers.Set("X-Requested-With", "XMLHttpRequest") + r.Headers.Set("Referer", Shop_url) + if has_cookie { + //log.Debug("Setting Cookie: " + cookie.String()) + r.Headers.Set("Cookie", cookie.String()) + } + }) + + for i := 12; true; i = i + 12 { + log.Debug("Crawling Drankdozijn: type = " + cur_type + " items = " + strconv.Itoa(i)) + err := c.Post(Async_url, map[string]string{"items": strconv.Itoa(i), "datum": "0", "groepnr": strconv.Itoa(groepnr)}) + if err != nil { + if "EOF" != err.Error() { + Warn(nil, shop.Name+": Error (Post): "+err.Error()) + } + break + } + } + } + + return Offers +} diff --git a/crawler/shop_mcwhisky.go b/crawler/shop_mcwhisky.go index cea020a..941f3b9 100644 --- a/crawler/shop_mcwhisky.go +++ b/crawler/shop_mcwhisky.go @@ -11,10 +11,7 @@ func (app *App) ScrapeMCWhisky(shop Shop) []Angebot { Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("mcwhisky.com"), - colly.AllowedDomains("www.mcwhisky.com"), - ) + c := app.customCollector([]string{"mcwhisky.com", "www.mcwhisky.com"}) c.OnHTML("li.item", func(e *colly.HTMLElement) { @@ -133,7 +130,10 @@ func (app *App) ScrapeMCWhisky(shop Shop) []Angebot { }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_rumundco.go b/crawler/shop_rumundco.go index 1ce202f..45069c2 100644 --- a/crawler/shop_rumundco.go +++ b/crawler/shop_rumundco.go @@ -14,10 +14,7 @@ func (app *App) ScrapeRumundCo(shop Shop) []Angebot { Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("rumundco.de"), - colly.AllowedDomains("www.rumundco.de"), - ) + c := app.customCollector([]string{"rumundco.de", "www.rumundco.de"}) c.OnHTML(".product-teaser", func(e *colly.HTMLElement) { @@ -158,7 +155,10 @@ func (app *App) ScrapeRumundCo(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go index af86bdc..93bff23 100644 --- a/crawler/shop_whic.go +++ b/crawler/shop_whic.go @@ -12,9 +12,7 @@ func (app *App) ScrapeWhic(shop Shop) []Angebot { Shop_url := "https://whic.de/angebote" Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("whic.de"), - ) + c := app.customCollector([]string{"whic.de"}) c.OnHTML("li.item", func(e *colly.HTMLElement) { @@ -129,7 +127,10 @@ func (app *App) ScrapeWhic(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whiskyde.go b/crawler/shop_whiskyde.go index 0245c85..d3087ca 100644 --- a/crawler/shop_whiskyde.go +++ b/crawler/shop_whiskyde.go @@ -11,14 +11,12 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot { Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("whisky.de"), - colly.AllowedDomains("www.whisky.de"), - ) + c := app.customCollector([]string{"whisky.de", "www.whisky.de"}) c.OnHTML(".is-buyable", func(e *colly.HTMLElement) { if e.Request.URL.String() != Shop_url { + Debug(nil, "Whisky.de: Request url ("+e.Request.URL.String()+") is not shop url ("+Shop_url+").") return } @@ -120,7 +118,10 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whiskysitenl.go b/crawler/shop_whiskysitenl.go index f1b667c..e3ae075 100644 --- a/crawler/shop_whiskysitenl.go +++ b/crawler/shop_whiskysitenl.go @@ -13,10 +13,7 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot { Shop_url := "https://www.whiskysite.nl/en/specials/?limit=100" - c := colly.NewCollector( - colly.AllowedDomains("whiskysite.nl"), - colly.AllowedDomains("www.whiskysite.nl"), - ) + c := app.customCollector([]string{"whiskysite.nl", "www.whiskysite.nl"}) c.OnHTML(".product-block", func(e *colly.HTMLElement) { @@ -141,7 +138,10 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go index af97511..3f0874d 100644 --- a/crawler/shop_whiskyworld.go +++ b/crawler/shop_whiskyworld.go @@ -15,11 +15,7 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot { Whiskys := []Angebot{} - c := colly.NewCollector( - colly.UserAgent("friendly"), - colly.AllowedDomains("whiskyworld.de"), - colly.AllowedDomains("www.whiskyworld.de"), - ) + c := app.customCollector([]string{"whiskyworld.de", "www.whiskyworld.de"}) c.OnHTML(".product-item", func(e *colly.HTMLElement) { if !stringInSlice(e.Request.URL.String(), Shop_urls) { @@ -106,7 +102,10 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot { }) - W.Image_url = "https:" + e.ChildAttr("img", "data-src") + W.Image_url = e.ChildAttr("img", "data-src") + if !strings.HasPrefix(W.Image_url, "https:") { + W.Image_url = "https:" + W.Image_url + } e.Request.Visit(W.Url) W.Website = e.Request.Ctx.Get("website") @@ -124,7 +123,10 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot { }) for _, url := range Shop_urls { - c.Visit(url) + err := c.Visit(url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } } return Whiskys diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go index 2c1fb99..dbaf0ba 100644 --- a/crawler/shop_whiskyzone.go +++ b/crawler/shop_whiskyzone.go @@ -13,10 +13,7 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot { Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("whiskyzone.de"), - colly.AllowedDomains("www.whiskyzone.de"), - ) + c := app.customCollector([]string{"whiskyzone.de", "www.whiskyzone.de"}) c.OnHTML(".product--info", func(e *colly.HTMLElement) { @@ -157,7 +154,10 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot { }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shops.go b/crawler/shops.go index d9fcc0d..79eff96 100644 --- a/crawler/shops.go +++ b/crawler/shops.go @@ -90,6 +90,14 @@ func getShopsFromStruct() []Shop { Shipping_costs: 495, Free_shipping: "75€", }) + Shops = append(Shops, Shop{ + Name: "Drankdozijn", + Url: "https://Drankdozijn.de", + Short_url: "https://l.fuselkoenig.de/drankdozijn", + Logo_url: "", + Shipping_costs: 595, + Free_shipping: "250€", + }) return Shops } diff --git a/crawler/utility.go b/crawler/utility.go index 5fa78c4..0650546 100644 --- a/crawler/utility.go +++ b/crawler/utility.go @@ -81,12 +81,29 @@ func detect_spirit_type(name string) string { } func extract_volume(volume string) (float32, error) { - r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`) + var volume_noisy string + var is_litre_instead_of_cl bool + + // difference between cl... + r_cl, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[cC][lL]`) if err != nil { - Fatal(err, "Extract volume regex failed") + Fatal(err, "Extract volume (centiliter) regex failed") + } + + volume_noisy = r_cl.FindString(volume) + + if volume_noisy == "" { + // ...and litre + is_litre_instead_of_cl = true + + r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`) + if err != nil { + Fatal(err, "Extract volume regex failed") + } + volume_noisy = r_liter.FindString(volume) } - volume_noisy := r_liter.FindString(volume) + // extract numbers r_liter2, err := regexp.Compile(`[0-9]+([,.][0-9]+)?`) if err != nil { Fatal(err, "2nd extract volume regex failed") @@ -99,6 +116,11 @@ func extract_volume(volume string) (float32, error) { return 0, err } + // converting from cl to litre + if !is_litre_instead_of_cl { + volume64 = volume64 / 100 + } + return float32(volume64), err } |
