From c59d15dfc04e0fb75c8132a3ce778dcf801645c1 Mon Sep 17 00:00:00 2001 From: horus Date: Fri, 15 Jun 2018 16:19:58 +0200 Subject: Fix because changed html. (crawler) --- crawler/shop_whiskyde.go | 1 + crawler/shop_whiskyworld.go | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/crawler/shop_whiskyde.go b/crawler/shop_whiskyde.go index 0245c85..7117d71 100644 --- a/crawler/shop_whiskyde.go +++ b/crawler/shop_whiskyde.go @@ -19,6 +19,7 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot { c.OnHTML(".is-buyable", func(e *colly.HTMLElement) { if e.Request.URL.String() != Shop_url { + Debug(nil, "Whisky.de: Request url ("+e.Request.URL.String()+") is not shop url ("+Shop_url+").") return } diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go index af97511..f617ebb 100644 --- a/crawler/shop_whiskyworld.go +++ b/crawler/shop_whiskyworld.go @@ -106,7 +106,10 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot { }) - W.Image_url = "https:" + e.ChildAttr("img", "data-src") + W.Image_url = e.ChildAttr("img", "data-src") + if !strings.HasPrefix(W.Image_url, "https:") { + W.Image_url = "https:" + W.Image_url + } e.Request.Visit(W.Url) W.Website = e.Request.Ctx.Get("website") -- cgit v1.2.3 From f37ebbb81785fb2c02f166b84581b9e92c829b2a Mon Sep 17 00:00:00 2001 From: horus Date: Fri, 15 Jun 2018 16:21:12 +0200 Subject: Tries to validate image url by making head request. (crawler) --- crawler/sanitize.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 2fef9a4..6370588 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -2,6 +2,7 @@ package main import ( "fmt" + "net/http" "regexp" "strconv" "strings" @@ -49,6 +50,13 @@ func sanitize_offer(angebote []Angebot, shop Shop, try int) []Angebot { continue } + if err := sanitize_image_url(offer.Image_url); err != nil { + offer.error_ctx = offer.Image_url + offer.error_msg = err.Error() + WarnOffer(offer, "Sanitizer: Image-URL is not valid") + continue + } + //offer.Website = "" W = append(W, offer) @@ -254,3 +262,21 @@ func get_age_from_name(name string) int { } return age } + +func sanitize_image_url(url string) error { + + resp, err := http.Head(url) + if err != nil { + return fmt.Errorf("sanitize_image_url: HEAD request failed. Got error: %s \n", err.Error()) + } + + if resp.StatusCode != 200 { + return fmt.Errorf("sanitize_image_url: HEAD request failed. StatusCode not 200, got %d \n", resp.StatusCode) + } + + if !strings.HasPrefix(resp.Header.Get("Content-Type"), "image") { + return fmt.Errorf("sanitize_image_url: HEAD request failed. Got no image, content-type is %s \n", resp.Header.Get("Content-Type")) + } + + return nil +} -- cgit v1.2.3 From b3b35a1706cd99e0978147a4d1b841381cf48348 Mon Sep 17 00:00:00 2001 From: Max Date: Fri, 15 Jun 2018 19:37:33 +0200 Subject: Improves debugging output. (crawler) --- crawler/sanitize.go | 1 + 1 file changed, 1 insertion(+) diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 6370588..960d5f6 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -265,6 +265,7 @@ func get_age_from_name(name string) int { func sanitize_image_url(url string) error { + log.Debugf("sanitize_image_url: Making HEAD request to %s \n", url) resp, err := http.Head(url) if err != nil { return fmt.Errorf("sanitize_image_url: HEAD request failed. Got error: %s \n", err.Error()) -- cgit v1.2.3 From 0026ba55f03c5378d5773459fcdd7c6931ff42a5 Mon Sep 17 00:00:00 2001 From: Max Date: Fri, 15 Jun 2018 19:38:04 +0200 Subject: Introduces central crawler config. (crawler) --- crawler/shop_bottleworld.go | 5 +---- crawler/shop_mcwhisky.go | 5 +---- crawler/shop_rumundco.go | 5 +---- crawler/shop_whic.go | 4 +--- crawler/shop_whiskyde.go | 5 +---- crawler/shop_whiskysitenl.go | 5 +---- crawler/shop_whiskyworld.go | 6 +----- crawler/shop_whiskyzone.go | 5 +---- crawler/utility.go | 6 ++++++ 9 files changed, 14 insertions(+), 32 deletions(-) diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go index b92896d..de9fe13 100644 --- a/crawler/shop_bottleworld.go +++ b/crawler/shop_bottleworld.go @@ -12,10 +12,7 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { Shop_url := "https://www.bottleworld.de/aktuelle-sonderpreise/show/all" Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("bottleworld.de"), - colly.AllowedDomains("www.bottleworld.de"), - ) + c := customCollector([]string{"bottleworld.de", "www.bottleworld.de"}) c.OnHTML("li.item", func(e *colly.HTMLElement) { W := Angebot{} diff --git a/crawler/shop_mcwhisky.go b/crawler/shop_mcwhisky.go index cea020a..ef780a9 100644 --- a/crawler/shop_mcwhisky.go +++ b/crawler/shop_mcwhisky.go @@ -11,10 +11,7 @@ func (app *App) ScrapeMCWhisky(shop Shop) []Angebot { Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("mcwhisky.com"), - colly.AllowedDomains("www.mcwhisky.com"), - ) + c := customCollector([]string{"mcwhisky.com", "www.mcwhisky.com"}) c.OnHTML("li.item", func(e *colly.HTMLElement) { diff --git a/crawler/shop_rumundco.go b/crawler/shop_rumundco.go index 1ce202f..4b72c08 100644 --- a/crawler/shop_rumundco.go +++ b/crawler/shop_rumundco.go @@ -14,10 +14,7 @@ func (app *App) ScrapeRumundCo(shop Shop) []Angebot { Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("rumundco.de"), - colly.AllowedDomains("www.rumundco.de"), - ) + c := customCollector([]string{"rumundco.de", "www.rumundco.de"}) c.OnHTML(".product-teaser", func(e *colly.HTMLElement) { diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go index af86bdc..2d0170b 100644 --- a/crawler/shop_whic.go +++ b/crawler/shop_whic.go @@ -12,9 +12,7 @@ func (app *App) ScrapeWhic(shop Shop) []Angebot { Shop_url := "https://whic.de/angebote" Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("whic.de"), - ) + c := customCollector([]string{"whic.de"}) c.OnHTML("li.item", func(e *colly.HTMLElement) { diff --git a/crawler/shop_whiskyde.go b/crawler/shop_whiskyde.go index 7117d71..9e061ac 100644 --- a/crawler/shop_whiskyde.go +++ b/crawler/shop_whiskyde.go @@ -11,10 +11,7 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot { Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("whisky.de"), - colly.AllowedDomains("www.whisky.de"), - ) + c := customCollector([]string{"whisky.de", "www.whisky.de"}) c.OnHTML(".is-buyable", func(e *colly.HTMLElement) { diff --git a/crawler/shop_whiskysitenl.go b/crawler/shop_whiskysitenl.go index f1b667c..4dad313 100644 --- a/crawler/shop_whiskysitenl.go +++ b/crawler/shop_whiskysitenl.go @@ -13,10 +13,7 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot { Shop_url := "https://www.whiskysite.nl/en/specials/?limit=100" - c := colly.NewCollector( - colly.AllowedDomains("whiskysite.nl"), - colly.AllowedDomains("www.whiskysite.nl"), - ) + c := customCollector([]string{"whiskysite.nl", "www.whiskysite.nl"}) c.OnHTML(".product-block", func(e *colly.HTMLElement) { diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go index f617ebb..7b57d37 100644 --- a/crawler/shop_whiskyworld.go +++ b/crawler/shop_whiskyworld.go @@ -15,11 +15,7 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot { Whiskys := []Angebot{} - c := colly.NewCollector( - colly.UserAgent("friendly"), - colly.AllowedDomains("whiskyworld.de"), - colly.AllowedDomains("www.whiskyworld.de"), - ) + c := customCollector([]string{"whiskyworld.de", "www.whiskyworld.de"}) c.OnHTML(".product-item", func(e *colly.HTMLElement) { if !stringInSlice(e.Request.URL.String(), Shop_urls) { diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go index 2c1fb99..4dc825a 100644 --- a/crawler/shop_whiskyzone.go +++ b/crawler/shop_whiskyzone.go @@ -13,10 +13,7 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot { Whiskys := []Angebot{} - c := colly.NewCollector( - colly.AllowedDomains("whiskyzone.de"), - colly.AllowedDomains("www.whiskyzone.de"), - ) + c := customCollector([]string{"whiskyzone.de", "www.whiskyzone.de"}) c.OnHTML(".product--info", func(e *colly.HTMLElement) { diff --git a/crawler/utility.go b/crawler/utility.go index 5fa78c4..e0acf3f 100644 --- a/crawler/utility.go +++ b/crawler/utility.go @@ -10,6 +10,12 @@ import ( "github.com/gocolly/colly" ) +func customCollector(allowed_urls []string) *colly.Collector { + return colly.NewCollector( + colly.AllowedDomains(allowed_urls...), + ) +} + func stringInSlice(a string, list []string) bool { for _, b := range list { if b == a { -- cgit v1.2.3 From 8d68ac7c900241eb8499a94c23ab1f60750e7aed Mon Sep 17 00:00:00 2001 From: horus Date: Fri, 15 Jun 2018 23:28:18 +0200 Subject: Introduces config for user agent, robots.txt and crawler delay. (crawler) --- crawler/config.go | 22 +++++++++++++++++++--- crawler/init.go | 13 +++++++++++++ crawler/scrape.go | 28 ++++++++++++++++++++++++++++ crawler/shop_bottleworld.go | 17 ++++++++++++++--- crawler/shop_mcwhisky.go | 7 +++++-- crawler/shop_rumundco.go | 7 +++++-- crawler/shop_whic.go | 7 +++++-- crawler/shop_whiskyde.go | 7 +++++-- crawler/shop_whiskysitenl.go | 7 +++++-- crawler/shop_whiskyworld.go | 7 +++++-- crawler/shop_whiskyzone.go | 7 +++++-- crawler/utility.go | 6 ------ 12 files changed, 109 insertions(+), 26 deletions(-) diff --git a/crawler/config.go b/crawler/config.go index f89fa45..a3939c4 100644 --- a/crawler/config.go +++ b/crawler/config.go @@ -17,6 +17,10 @@ type Config struct { DBOptions string DBPath string // for sqlite + UserAgent string + Delay int + IgnoreRobotsTXT bool + DisableURLShorter bool Polr_URL string Polr_API_Key string @@ -40,6 +44,12 @@ func (c *Config) parseConfig(configFile string) { viper.SetDefault("FixDatabase", false) viper.SetDefault("DisableURLShorter", false) viper.SetDefault("ShopIDs", []string{}) + viper.SetDefault("Delay", 0) + + // needs some refactoring to truly respect robots.txt + viper.SetDefault("IgnoreRobotsTXT", true) + + viper.SetDefault("UserAgent", "colly - a friendly crawler :)") // Name of the configuration file viper.SetConfigName("config") @@ -95,10 +105,16 @@ func (c *Config) setsConfig() { c.DBDBName = viper.GetString("DB_DBName") c.DBOptions = viper.GetString("DB_Options") c.DBPath = viper.GetString("DB_Path") - c.Debug = viper.GetBool("Debug") - c.FixDatabase = viper.GetBool("FixDatabase") + + c.UserAgent = viper.GetString("UserAgent") + c.Delay = viper.GetInt("Delay") + c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT") + c.DisableURLShorter = viper.GetBool("DisableURLShorter") - c.ShopIDs = viper.GetStringSlice("ShopIDs") c.Polr_URL = viper.GetString("Polr_URL") c.Polr_API_Key = viper.GetString("Polr_API_Key") + + c.Debug = viper.GetBool("Debug") + c.FixDatabase = viper.GetBool("FixDatabase") + c.ShopIDs = viper.GetStringSlice("ShopIDs") } diff --git a/crawler/init.go b/crawler/init.go index 60f7e47..668df2d 100644 --- a/crawler/init.go +++ b/crawler/init.go @@ -23,6 +23,9 @@ func init() { loglevel_f := flag.StringP("loglevel", "l", "Warn", `sets log level, can be "Warn", "Info" or "Debug"`) flag.Bool("list-shops", false, `lists all crawlable shops`) shopids_f := flag.StringP("restrict-shops", "r", "", `comma separated list of shop ids, crawls only these`) + user_agent_f := flag.StringP("user-agent", "u", "", "sets user agent") + delay_f := flag.Int("delay", 0, "toggles random delay between crawls") + ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignores robots.txt") flag.Parse() loglevel := strings.ToLower(*loglevel_f) @@ -41,6 +44,16 @@ func init() { _conf.parseConfig(*configFile) + if *user_agent_f != "" { + _conf.UserAgent = *user_agent_f + } + if *delay_f != 0 { + _conf.Delay = *delay_f + } + if !*ignore_robots_f { + _conf.IgnoreRobotsTXT = *ignore_robots_f + } + if _conf.Debug && !*silent { log.SetLevel(log.DebugLevel) } diff --git a/crawler/scrape.go b/crawler/scrape.go index 4bc66e0..f9e758d 100644 --- a/crawler/scrape.go +++ b/crawler/scrape.go @@ -1,7 +1,10 @@ package main import ( + "time" + log "github.com/Sirupsen/logrus" + "github.com/gocolly/colly" ) func (app *App) ScrapeHTML(shops []Shop) { @@ -41,6 +44,13 @@ func (app *App) Scrape(shop Shop, wait chan bool) { } } + // if no results, return early + if len(W) == 0 { + wait <- true + return + + } + err = app.save_offer(W) if err != nil { Warn(err, "Saving offers failed. Shop: "+shop.Name) @@ -78,3 +88,21 @@ func (app *App) ScrapeShop(shop Shop) []Angebot { return []Angebot{} } + +/* + * Sets the crawler config. + */ +func (app *App) customCollector(allowed_urls []string) *colly.Collector { + c := colly.NewCollector( + colly.UserAgent(app.Config.UserAgent), + colly.AllowedDomains(allowed_urls...), + ) + c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT + + c.Limit(&colly.LimitRule{ + DomainGlob: "*", + RandomDelay: time.Duration(app.Config.Delay) * time.Second, + }) + + return c +} diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go index de9fe13..d679b43 100644 --- a/crawler/shop_bottleworld.go +++ b/crawler/shop_bottleworld.go @@ -6,13 +6,18 @@ import ( // "github.com/PuerkitoBio/goquery" "github.com/gocolly/colly" + "log" + "time" ) func (app *App) ScrapeBottleWord(shop Shop) []Angebot { Shop_url := "https://www.bottleworld.de/aktuelle-sonderpreise/show/all" Whiskys := []Angebot{} - c := customCollector([]string{"bottleworld.de", "www.bottleworld.de"}) + c := app.customCollector([]string{"bottleworld.de", "www.bottleworld.de"}) + + log.Println(c.IgnoreRobotsTxt) + log.Println(time.Duration(app.Config.Delay)) c.OnHTML("li.item", func(e *colly.HTMLElement) { W := Angebot{} @@ -60,7 +65,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { W.Image_url = e.ChildAttr("img", "src") - e.Request.Visit(W.Url) + erro := e.Request.Visit(W.Url) + if erro != nil { + Warn(nil, W.Url+" "+erro.Error()) + } var ctx string W.Volume, ctx = get_volume(e) @@ -109,7 +117,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { e.Request.Ctx.Put("spirit_type", detect_spirit_type(text_noisy)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_mcwhisky.go b/crawler/shop_mcwhisky.go index ef780a9..941f3b9 100644 --- a/crawler/shop_mcwhisky.go +++ b/crawler/shop_mcwhisky.go @@ -11,7 +11,7 @@ func (app *App) ScrapeMCWhisky(shop Shop) []Angebot { Whiskys := []Angebot{} - c := customCollector([]string{"mcwhisky.com", "www.mcwhisky.com"}) + c := app.customCollector([]string{"mcwhisky.com", "www.mcwhisky.com"}) c.OnHTML("li.item", func(e *colly.HTMLElement) { @@ -130,7 +130,10 @@ func (app *App) ScrapeMCWhisky(shop Shop) []Angebot { }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_rumundco.go b/crawler/shop_rumundco.go index 4b72c08..45069c2 100644 --- a/crawler/shop_rumundco.go +++ b/crawler/shop_rumundco.go @@ -14,7 +14,7 @@ func (app *App) ScrapeRumundCo(shop Shop) []Angebot { Whiskys := []Angebot{} - c := customCollector([]string{"rumundco.de", "www.rumundco.de"}) + c := app.customCollector([]string{"rumundco.de", "www.rumundco.de"}) c.OnHTML(".product-teaser", func(e *colly.HTMLElement) { @@ -155,7 +155,10 @@ func (app *App) ScrapeRumundCo(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go index 2d0170b..93bff23 100644 --- a/crawler/shop_whic.go +++ b/crawler/shop_whic.go @@ -12,7 +12,7 @@ func (app *App) ScrapeWhic(shop Shop) []Angebot { Shop_url := "https://whic.de/angebote" Whiskys := []Angebot{} - c := customCollector([]string{"whic.de"}) + c := app.customCollector([]string{"whic.de"}) c.OnHTML("li.item", func(e *colly.HTMLElement) { @@ -127,7 +127,10 @@ func (app *App) ScrapeWhic(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whiskyde.go b/crawler/shop_whiskyde.go index 9e061ac..d3087ca 100644 --- a/crawler/shop_whiskyde.go +++ b/crawler/shop_whiskyde.go @@ -11,7 +11,7 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot { Whiskys := []Angebot{} - c := customCollector([]string{"whisky.de", "www.whisky.de"}) + c := app.customCollector([]string{"whisky.de", "www.whisky.de"}) c.OnHTML(".is-buyable", func(e *colly.HTMLElement) { @@ -118,7 +118,10 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whiskysitenl.go b/crawler/shop_whiskysitenl.go index 4dad313..e3ae075 100644 --- a/crawler/shop_whiskysitenl.go +++ b/crawler/shop_whiskysitenl.go @@ -13,7 +13,7 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot { Shop_url := "https://www.whiskysite.nl/en/specials/?limit=100" - c := customCollector([]string{"whiskysite.nl", "www.whiskysite.nl"}) + c := app.customCollector([]string{"whiskysite.nl", "www.whiskysite.nl"}) c.OnHTML(".product-block", func(e *colly.HTMLElement) { @@ -138,7 +138,10 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go index 7b57d37..3f0874d 100644 --- a/crawler/shop_whiskyworld.go +++ b/crawler/shop_whiskyworld.go @@ -15,7 +15,7 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot { Whiskys := []Angebot{} - c := customCollector([]string{"whiskyworld.de", "www.whiskyworld.de"}) + c := app.customCollector([]string{"whiskyworld.de", "www.whiskyworld.de"}) c.OnHTML(".product-item", func(e *colly.HTMLElement) { if !stringInSlice(e.Request.URL.String(), Shop_urls) { @@ -123,7 +123,10 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot { }) for _, url := range Shop_urls { - c.Visit(url) + err := c.Visit(url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } } return Whiskys diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go index 4dc825a..dbaf0ba 100644 --- a/crawler/shop_whiskyzone.go +++ b/crawler/shop_whiskyzone.go @@ -13,7 +13,7 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot { Whiskys := []Angebot{} - c := customCollector([]string{"whiskyzone.de", "www.whiskyzone.de"}) + c := app.customCollector([]string{"whiskyzone.de", "www.whiskyzone.de"}) c.OnHTML(".product--info", func(e *colly.HTMLElement) { @@ -154,7 +154,10 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot { }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/utility.go b/crawler/utility.go index e0acf3f..5fa78c4 100644 --- a/crawler/utility.go +++ b/crawler/utility.go @@ -10,12 +10,6 @@ import ( "github.com/gocolly/colly" ) -func customCollector(allowed_urls []string) *colly.Collector { - return colly.NewCollector( - colly.AllowedDomains(allowed_urls...), - ) -} - func stringInSlice(a string, list []string) bool { for _, b := range list { if b == a { -- cgit v1.2.3 From 482ac52e2db7ca3db7005dcc01d21b69da0faf89 Mon Sep 17 00:00:00 2001 From: horus Date: Fri, 15 Jun 2018 23:44:07 +0200 Subject: Removes unnecessary code. (crawler) --- crawler/scrape.go | 4 ---- crawler/shop_bottleworld.go | 5 ----- 2 files changed, 9 deletions(-) diff --git a/crawler/scrape.go b/crawler/scrape.go index f9e758d..de79813 100644 --- a/crawler/scrape.go +++ b/crawler/scrape.go @@ -29,10 +29,6 @@ func (app *App) Scrape(shop Shop, wait chan bool) { var W []Angebot var err error - if err != nil { - Fatal(err, "scrape.go: Starting transaction failed. Shop: "+shop.Name) - } - // retry on error for i := 1; i < 4; i++ { W = app.ScrapeShop(shop) diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go index d679b43..8722211 100644 --- a/crawler/shop_bottleworld.go +++ b/crawler/shop_bottleworld.go @@ -6,8 +6,6 @@ import ( // "github.com/PuerkitoBio/goquery" "github.com/gocolly/colly" - "log" - "time" ) func (app *App) ScrapeBottleWord(shop Shop) []Angebot { @@ -16,9 +14,6 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { c := app.customCollector([]string{"bottleworld.de", "www.bottleworld.de"}) - log.Println(c.IgnoreRobotsTxt) - log.Println(time.Duration(app.Config.Delay)) - c.OnHTML("li.item", func(e *colly.HTMLElement) { W := Angebot{} -- cgit v1.2.3 From f9b561c087ccf5109928371192f0f5807103e296 Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 13:52:18 +0200 Subject: Adds support for cl. (crawler) --- crawler/utility.go | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/crawler/utility.go b/crawler/utility.go index 5fa78c4..0650546 100644 --- a/crawler/utility.go +++ b/crawler/utility.go @@ -81,12 +81,29 @@ func detect_spirit_type(name string) string { } func extract_volume(volume string) (float32, error) { - r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`) + var volume_noisy string + var is_litre_instead_of_cl bool + + // difference between cl... + r_cl, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[cC][lL]`) if err != nil { - Fatal(err, "Extract volume regex failed") + Fatal(err, "Extract volume (centiliter) regex failed") + } + + volume_noisy = r_cl.FindString(volume) + + if volume_noisy == "" { + // ...and litre + is_litre_instead_of_cl = true + + r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`) + if err != nil { + Fatal(err, "Extract volume regex failed") + } + volume_noisy = r_liter.FindString(volume) } - volume_noisy := r_liter.FindString(volume) + // extract numbers r_liter2, err := regexp.Compile(`[0-9]+([,.][0-9]+)?`) if err != nil { Fatal(err, "2nd extract volume regex failed") @@ -99,6 +116,11 @@ func extract_volume(volume string) (float32, error) { return 0, err } + // converting from cl to litre + if !is_litre_instead_of_cl { + volume64 = volume64 / 100 + } + return float32(volume64), err } -- cgit v1.2.3 From a25368ce25e3de3add81e4347639a9b0401750a7 Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 13:52:50 +0200 Subject: Improves sanitizing function. (crawler) --- crawler/sanitize.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 960d5f6..4a5197b 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -82,6 +82,10 @@ func sanitize_name(name string) string { name = strings.Replace(name, "years old", "Jahre", 1) } + if strings.Contains(name, "years") { + name = strings.Replace(name, "years", "Jahre", 1) + } + if strings.Contains(name, "Years Old") { name = strings.Replace(name, "Years Old", "Jahre", 1) } @@ -201,13 +205,15 @@ func sanitize_base_price(price_noisy string) (price int, err error) { if strings.Contains(price_noisy, "Grundpreis:") { price_noisy = strings.Replace(price_noisy, "Grundpreis", "", -1) - price_noisy = strings.TrimSpace(price_noisy) } if strings.Contains(price_noisy, "/Liter") { price_noisy = strings.Replace(price_noisy, "/Liter", "", -1) - price_noisy = strings.TrimSpace(price_noisy) } + if strings.Contains(price_noisy, "/L") { + price_noisy = strings.Replace(price_noisy, "/L", "", -1) + } + price_noisy = strings.TrimSpace(price_noisy) return convert_price(price_noisy) } -- cgit v1.2.3 From db6fa4428e8b6d6c7fd845463a93d83affbf880b Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 15:15:36 +0200 Subject: Detects cl in sanitize_name(). (crawler) --- crawler/sanitize.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 4a5197b..262bfa6 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -112,7 +112,7 @@ func sanitize_name(name string) string { name = strings.Replace(name, age_noisy, age+" Jahre ", 1) } - r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`) + r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[cC]?[lL]((iter)|(tr))?`) if err != nil { Fatal(err, "sanitize_name: Liter-Regexp failed") } -- cgit v1.2.3 From 0dedda30a0cb983c41f879e9fe0be53a79ba347c Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 16:02:59 +0200 Subject: Removes validating abv based of spirit type. (crawler) --- crawler/sanitize.go | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 262bfa6..d67b32b 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -220,21 +220,23 @@ func sanitize_base_price(price_noisy string) (price int, err error) { func _check_abv_for_spirit_type(offer Angebot) bool { - if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") { - WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type) - return false - } + /* + if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") { + WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type) + return false + } - if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") { - WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type) - return false - } + if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") { + WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type) + return false + } - if offer.Abv < 14 && offer.Spirit_type == "Likör" { - WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type) - return false + if offer.Abv < 14 && offer.Spirit_type == "Likör" { + WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type) + return false - } + } + */ if offer.Abv == 0 { WarnOffer(offer, "Sanitizer: Abv is zero") -- cgit v1.2.3 From d0b2f70f278924b264fce12b3da7c4c87cbe4593 Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 16:05:06 +0200 Subject: Adds scraper for Drankdozijn. (crawler) --- crawler/scrape.go | 2 + crawler/shop_drankdozijn.go | 192 ++++++++++++++++++++++++++++++++++++++++++++ crawler/shops.go | 8 ++ 3 files changed, 202 insertions(+) create mode 100644 crawler/shop_drankdozijn.go diff --git a/crawler/scrape.go b/crawler/scrape.go index de79813..6874239 100644 --- a/crawler/scrape.go +++ b/crawler/scrape.go @@ -78,6 +78,8 @@ func (app *App) ScrapeShop(shop Shop) []Angebot { return app.ScrapeWhiskyworld(shop) case "Whiskyzone": return app.ScrapeWhiskyzone(shop) + case "Drankdozijn": + return app.ScrapeDrankdozijn(shop) default: log.Println(shop.Name + ": No Crawler") } diff --git a/crawler/shop_drankdozijn.go b/crawler/shop_drankdozijn.go new file mode 100644 index 0000000..0a5cca4 --- /dev/null +++ b/crawler/shop_drankdozijn.go @@ -0,0 +1,192 @@ +package main + +import ( + "net/http" + "strconv" + "strings" + + log "github.com/Sirupsen/logrus" + "github.com/gocolly/colly" +) + +func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { + Shop_url_base := "https://drankdozijn.de/aanbiedingen/" + var Shop_url string + Async_url := "https://drankdozijn.de/async/scroll" + + Offers := []Angebot{} + + types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac"} + //types := map[int]string{240: "Cognac"} + var current_type string + + c := app.customCollector([]string{"drankdozijn.de"}) + + c.OnHTML(".product_top", func(e *colly.HTMLElement) { + + if e.Request.URL.String() != Shop_url && e.Request.URL.String() != Async_url { + //Debug(nil, "Drankdozijn.de: Request url ("+e.Request.URL.String()+") is not shop url ("+Shop_url+").") + return + } + + W := Angebot{} + + W.Shop = shop.Id + W.Spirit_type = current_type + + var err error + + e.ForEach(".product_image", func(i int, e *colly.HTMLElement) { + W.Url = e.ChildAttr("a", "href") + W.Image_url = e.ChildAttr("img", "src") + }) + e.ForEach(".product_title", func(i int, e *colly.HTMLElement) { + W.Name = e.ChildText("a") + }) + + if strings.Contains(W.Name, "+ gratis") || strings.Contains(W.Name, "& gratis") { + DebugOffer(W, "Drankdozijn: Skip Offer") + return + } + + e.ForEach(".product_price", func(i int, e *colly.HTMLElement) { + W.Original_price, err = convert_price(e.ChildText(".product_acties")) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".product_acties") + PrintlnOffer(W, "Drankdozijn: Converting original price failed") + return + } + W.Discounted_price, err = convert_price(e.ChildText(".product_aanbieding_prijs")) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".product_aanbieding_prijs") + PrintlnOffer(W, "Drankdozijn: Converting discounted price failed") + return + } + }) + + e.Request.Visit(W.Url) + + var ctx string + + W.Volume, ctx = get_volume(e) + if W.Volume == 0 { + W.error_msg = e.Request.Ctx.Get("volume") + W.error_ctx = ctx + PrintlnOffer(W, "Drankdozijn: Volume is zero") + return + } + + W.Abv, ctx = get_abv(e) + if W.Abv == 0 { + W.error_msg = "Drankdozijn: Abv is zero" + W.error_ctx = ctx + PrintlnOffer(W, "Drankdozijn: abv is zero") + return + } + + base_price_noisy := e.Request.Ctx.Get("base_price") + W.Base_price, err = convert_price(base_price_noisy) + if err != nil { + W.error_msg = err.Error() + W.error_ctx = e.ChildText(".price_l") + PrintlnOffer(W, "Drankdozijn: Converting base price failed") + return + } + + if current_type == "Cognac" { + W.Spirit_type = e.Request.Ctx.Get("spirit_type") + } + + W.Website = e.Request.Ctx.Get("website") + + //DebugOffer(W, "DEBUG") + + Offers = append(Offers, W) + }) + + c.OnHTML(".main_price", func(e *colly.HTMLElement) { + //e.Request.Ctx.Put("base_price", strings.TrimPrefix(e.ChildText(".price_l"), "/L")) + e.Request.Ctx.Put("base_price", e.ChildText(".price_l")) + }) + + c.OnHTML(".main_description", func(e *colly.HTMLElement) { + prev := "" + count := 0 + e.ForEach(".col-xs-6", func(i int, e *colly.HTMLElement) { + if count%2 == 0 { + prev = e.Text + } else { + switch strings.TrimSpace(prev) { + case "Inhalt": + e.Request.Ctx.Put("volume", e.Text) + case "Alkoholgehalt": + e.Request.Ctx.Put("abv", e.Text) + case "Kategorie": + e.Request.Ctx.Put("spirit_type", e.Text) + } + + prev = "" + } + count++ + }) + }) + + c.OnHTML("body", func(e *colly.HTMLElement) { + if e.Request.URL.String() == Shop_url { + return + } + e.Request.Ctx.Put("website", string(e.Response.Body)) + }) + + var cookie *http.Cookie + var has_cookie bool + c.OnResponse(func(r *colly.Response) { + //log.Debug("Cookies:", c.Cookies(r.Request.URL.String())) + if len(c.Cookies(r.Request.URL.String())) > 0 { + has_cookie = true + cookie = c.Cookies(r.Request.URL.String())[0] + } + }) + + for groepnr, cur_type := range types { + current_type = cur_type + switch current_type { + case "Wodka": + Shop_url = Shop_url_base + "vodka" + case "Likör": + Shop_url = Shop_url_base + "likeuren" + default: + Shop_url = Shop_url_base + current_type + } + + //log.Debug(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": Error (Visit): "+err.Error()) + } + + c.OnRequest(func(r *colly.Request) { + r.Headers.Set("X-Requested-With", "XMLHttpRequest") + r.Headers.Set("Referer", Shop_url) + if has_cookie { + //log.Debug("Setting Cookie: " + cookie.String()) + r.Headers.Set("Cookie", cookie.String()) + } + }) + + for i := 12; true; i = i + 12 { + log.Debug("Crawling Drankdozijn: type = " + cur_type + " items = " + strconv.Itoa(i)) + err := c.Post(Async_url, map[string]string{"items": strconv.Itoa(i), "datum": "0", "groepnr": strconv.Itoa(groepnr)}) + if err != nil { + if "EOF" != err.Error() { + Warn(nil, shop.Name+": Error (Post): "+err.Error()) + } + break + } + } + } + + return Offers +} diff --git a/crawler/shops.go b/crawler/shops.go index d9fcc0d..61676ed 100644 --- a/crawler/shops.go +++ b/crawler/shops.go @@ -90,6 +90,14 @@ func getShopsFromStruct() []Shop { Shipping_costs: 495, Free_shipping: "75€", }) + Shops = append(Shops, Shop{ + Name: "Drankdozijn", + Url: "https://Drankdozijn.de", + Short_url: "https://l.fuselkoenig.de/whiskyzone", + Logo_url: "", + Shipping_costs: 595, + Free_shipping: "250€", + }) return Shops } -- cgit v1.2.3 From 2588017275b32f8f433c732fbb89100fe87c3e96 Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 16:51:51 +0200 Subject: Adds champagner / Drankdozijn. (crawler) --- crawler/shop_drankdozijn.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crawler/shop_drankdozijn.go b/crawler/shop_drankdozijn.go index 0a5cca4..adc8633 100644 --- a/crawler/shop_drankdozijn.go +++ b/crawler/shop_drankdozijn.go @@ -16,8 +16,8 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { Offers := []Angebot{} - types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac"} - //types := map[int]string{240: "Cognac"} + types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac", "Champagner": 100} + //types := map[int]string{100: "Champagner"} var current_type string c := app.customCollector([]string{"drankdozijn.de"}) @@ -98,6 +98,10 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { if current_type == "Cognac" { W.Spirit_type = e.Request.Ctx.Get("spirit_type") } + if current_type == "Champagner" && e.Request.Ctx.Get("spirit_type") != "Champagner" { + DebugOffer(W, "Drankdozijn: Skip Offer") + return + } W.Website = e.Request.Ctx.Get("website") @@ -157,6 +161,8 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { Shop_url = Shop_url_base + "vodka" case "Likör": Shop_url = Shop_url_base + "likeuren" + case "Champagner": + Shop_url = Shop_url_base + "wijn" default: Shop_url = Shop_url_base + current_type } -- cgit v1.2.3 From f61abc2069936f600c153d019b5f7a8c9a234e24 Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 17:00:43 +0200 Subject: Bugfix. (crawler) --- crawler/shop_drankdozijn.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/crawler/shop_drankdozijn.go b/crawler/shop_drankdozijn.go index adc8633..96d914d 100644 --- a/crawler/shop_drankdozijn.go +++ b/crawler/shop_drankdozijn.go @@ -16,8 +16,8 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { Offers := []Angebot{} - types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac", "Champagner": 100} - //types := map[int]string{100: "Champagner"} + types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac", 100: "Champagner"} + //types := map[int]string{240: "Likör"} var current_type string c := app.customCollector([]string{"drankdozijn.de"}) @@ -35,6 +35,7 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { W.Spirit_type = current_type var err error + var skip_offer bool e.ForEach(".product_image", func(i int, e *colly.HTMLElement) { W.Url = e.ChildAttr("a", "href") @@ -50,7 +51,13 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { } e.ForEach(".product_price", func(i int, e *colly.HTMLElement) { - W.Original_price, err = convert_price(e.ChildText(".product_acties")) + original_price_noisy := e.ChildText(".product_acties") + if !strings.Contains(original_price_noisy, "€") { + PrintlnOffer(W, "Drankdozijn: Original price has no € sign. Skipping!") + skip_offer = true + return + } + W.Original_price, err = convert_price(original_price_noisy) if err != nil { W.error_msg = err.Error() W.error_ctx = e.ChildText(".product_acties") @@ -66,6 +73,10 @@ func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot { } }) + if skip_offer { + return + } + e.Request.Visit(W.Url) var ctx string -- cgit v1.2.3 From 8114b7b17b723a5fe0fee24470e255faf587332e Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 19:00:08 +0200 Subject: Corrects short url. (crawler) --- crawler/shops.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/shops.go b/crawler/shops.go index 61676ed..79eff96 100644 --- a/crawler/shops.go +++ b/crawler/shops.go @@ -93,7 +93,7 @@ func getShopsFromStruct() []Shop { Shops = append(Shops, Shop{ Name: "Drankdozijn", Url: "https://Drankdozijn.de", - Short_url: "https://l.fuselkoenig.de/whiskyzone", + Short_url: "https://l.fuselkoenig.de/drankdozijn", Logo_url: "", Shipping_costs: 595, Free_shipping: "250€", -- cgit v1.2.3