summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: horus 2018-06-18 15:54:53 +0200
committer: horus 2018-06-18 15:54:53 +0200
commit: 01e0cbe79f37b4be2fc82d31c71042b5ce4d699a (patch)
tree: bb179b5c5c6349a69853c3781236b6056b7e7ea6
parent: 88a2628258eb5ea79736338637ab8b5b83680c92 (diff)
parent: 8114b7b17b723a5fe0fee24470e255faf587332e (diff)
download: alkobote-01e0cbe79f37b4be2fc82d31c71042b5ce4d699a.tar.gz
Merge branch 'master' of /home/horus/app/fk_angebote
-rw-r--r--  crawler/config.go  22
-rw-r--r--  crawler/init.go  13
-rw-r--r--  crawler/sanitize.go  65
-rw-r--r--  crawler/scrape.go  34
-rw-r--r--  crawler/shop_bottleworld.go  15
-rw-r--r--  crawler/shop_drankdozijn.go  209
-rw-r--r--  crawler/shop_mcwhisky.go  10
-rw-r--r--  crawler/shop_rumundco.go  10
-rw-r--r--  crawler/shop_whic.go  9
-rw-r--r--  crawler/shop_whiskyde.go  11
-rw-r--r--  crawler/shop_whiskysitenl.go  10
-rw-r--r--  crawler/shop_whiskyworld.go  16
-rw-r--r--  crawler/shop_whiskyzone.go  10
-rw-r--r--  crawler/shops.go  8
-rw-r--r--  crawler/utility.go  28
15 files changed, 403 insertions, 67 deletions
diff --git a/crawler/config.go b/crawler/config.go
index f89fa45..a3939c4 100644
--- a/crawler/config.go
+++ b/crawler/config.go
@@ -17,6 +17,10 @@ type Config struct {
DBOptions string
DBPath string // for sqlite
+ UserAgent string
+ Delay int
+ IgnoreRobotsTXT bool
+
DisableURLShorter bool
Polr_URL string
Polr_API_Key string
@@ -40,6 +44,12 @@ func (c *Config) parseConfig(configFile string) {
viper.SetDefault("FixDatabase", false)
viper.SetDefault("DisableURLShorter", false)
viper.SetDefault("ShopIDs", []string{})
+ viper.SetDefault("Delay", 0)
+
+ // needs some refactoring to truly respect robots.txt
+ viper.SetDefault("IgnoreRobotsTXT", true)
+
+ viper.SetDefault("UserAgent", "colly - a friendly crawler :)")
// Name of the configuration file
viper.SetConfigName("config")
@@ -95,10 +105,16 @@ func (c *Config) setsConfig() {
c.DBDBName = viper.GetString("DB_DBName")
c.DBOptions = viper.GetString("DB_Options")
c.DBPath = viper.GetString("DB_Path")
- c.Debug = viper.GetBool("Debug")
- c.FixDatabase = viper.GetBool("FixDatabase")
+
+ c.UserAgent = viper.GetString("UserAgent")
+ c.Delay = viper.GetInt("Delay")
+ c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT")
+
c.DisableURLShorter = viper.GetBool("DisableURLShorter")
- c.ShopIDs = viper.GetStringSlice("ShopIDs")
c.Polr_URL = viper.GetString("Polr_URL")
c.Polr_API_Key = viper.GetString("Polr_API_Key")
+
+ c.Debug = viper.GetBool("Debug")
+ c.FixDatabase = viper.GetBool("FixDatabase")
+ c.ShopIDs = viper.GetStringSlice("ShopIDs")
}
diff --git a/crawler/init.go b/crawler/init.go
index 60f7e47..668df2d 100644
--- a/crawler/init.go
+++ b/crawler/init.go
@@ -23,6 +23,9 @@ func init() {
loglevel_f := flag.StringP("loglevel", "l", "Warn", `sets log level, can be "Warn", "Info" or "Debug"`)
flag.Bool("list-shops", false, `lists all crawlable shops`)
shopids_f := flag.StringP("restrict-shops", "r", "", `comma separated list of shop ids, crawls only these`)
+ user_agent_f := flag.StringP("user-agent", "u", "", "sets user agent")
+ delay_f := flag.Int("delay", 0, "toggles random delay between crawls")
+ ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignores robots.txt")
flag.Parse()
loglevel := strings.ToLower(*loglevel_f)
@@ -41,6 +44,16 @@ func init() {
_conf.parseConfig(*configFile)
+ if *user_agent_f != "" {
+ _conf.UserAgent = *user_agent_f
+ }
+ if *delay_f != 0 {
+ _conf.Delay = *delay_f
+ }
+ if !*ignore_robots_f {
+ _conf.IgnoreRobotsTXT = *ignore_robots_f
+ }
+
if _conf.Debug && !*silent {
log.SetLevel(log.DebugLevel)
}
diff --git a/crawler/sanitize.go b/crawler/sanitize.go
index 2fef9a4..d67b32b 100644
--- a/crawler/sanitize.go
+++ b/crawler/sanitize.go
@@ -2,6 +2,7 @@ package main
import (
"fmt"
+ "net/http"
"regexp"
"strconv"
"strings"
@@ -49,6 +50,13 @@ func sanitize_offer(angebote []Angebot, shop Shop, try int) []Angebot {
continue
}
+ if err := sanitize_image_url(offer.Image_url); err != nil {
+ offer.error_ctx = offer.Image_url
+ offer.error_msg = err.Error()
+ WarnOffer(offer, "Sanitizer: Image-URL is not valid")
+ continue
+ }
+
//offer.Website = ""
W = append(W, offer)
@@ -74,6 +82,10 @@ func sanitize_name(name string) string {
name = strings.Replace(name, "years old", "Jahre", 1)
}
+ if strings.Contains(name, "years") {
+ name = strings.Replace(name, "years", "Jahre", 1)
+ }
+
if strings.Contains(name, "Years Old") {
name = strings.Replace(name, "Years Old", "Jahre", 1)
}
@@ -100,7 +112,7 @@ func sanitize_name(name string) string {
name = strings.Replace(name, age_noisy, age+" Jahre ", 1)
}
- r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`)
+ r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[cC]?[lL]((iter)|(tr))?`)
if err != nil {
Fatal(err, "sanitize_name: Liter-Regexp failed")
}
@@ -193,34 +205,38 @@ func sanitize_base_price(price_noisy string) (price int, err error) {
if strings.Contains(price_noisy, "Grundpreis:") {
price_noisy = strings.Replace(price_noisy, "Grundpreis", "", -1)
- price_noisy = strings.TrimSpace(price_noisy)
}
if strings.Contains(price_noisy, "/Liter") {
price_noisy = strings.Replace(price_noisy, "/Liter", "", -1)
- price_noisy = strings.TrimSpace(price_noisy)
}
+ if strings.Contains(price_noisy, "/L") {
+ price_noisy = strings.Replace(price_noisy, "/L", "", -1)
+ }
+ price_noisy = strings.TrimSpace(price_noisy)
return convert_price(price_noisy)
}
func _check_abv_for_spirit_type(offer Angebot) bool {
- if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") {
- WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type)
- return false
- }
+ /*
+ if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") {
+ WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type)
+ return false
+ }
- if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") {
- WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type)
- return false
- }
+ if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") {
+ WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type)
+ return false
+ }
- if offer.Abv < 14 && offer.Spirit_type == "Likör" {
- WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type)
- return false
+ if offer.Abv < 14 && offer.Spirit_type == "Likör" {
+ WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type)
+ return false
- }
+ }
+ */
if offer.Abv == 0 {
WarnOffer(offer, "Sanitizer: Abv is zero")
@@ -254,3 +270,22 @@ func get_age_from_name(name string) int {
}
return age
}
+
+func sanitize_image_url(url string) error {
+
+ log.Debugf("sanitize_image_url: Making HEAD request to %s \n", url)
+ resp, err := http.Head(url)
+ if err != nil {
+ return fmt.Errorf("sanitize_image_url: HEAD request failed. Got error: %s \n", err.Error())
+ }
+
+ if resp.StatusCode != 200 {
+ return fmt.Errorf("sanitize_image_url: HEAD request failed. StatusCode not 200, got %d \n", resp.StatusCode)
+ }
+
+ if !strings.HasPrefix(resp.Header.Get("Content-Type"), "image") {
+ return fmt.Errorf("sanitize_image_url: HEAD request failed. Got no image, content-type is %s \n", resp.Header.Get("Content-Type"))
+ }
+
+ return nil
+}
diff --git a/crawler/scrape.go b/crawler/scrape.go
index 4bc66e0..6874239 100644
--- a/crawler/scrape.go
+++ b/crawler/scrape.go
@@ -1,7 +1,10 @@
package main
import (
+ "time"
+
log "github.com/Sirupsen/logrus"
+ "github.com/gocolly/colly"
)
func (app *App) ScrapeHTML(shops []Shop) {
@@ -26,10 +29,6 @@ func (app *App) Scrape(shop Shop, wait chan bool) {
var W []Angebot
var err error
- if err != nil {
- Fatal(err, "scrape.go: Starting transaction failed. Shop: "+shop.Name)
- }
-
// retry on error
for i := 1; i < 4; i++ {
W = app.ScrapeShop(shop)
@@ -41,6 +40,13 @@ func (app *App) Scrape(shop Shop, wait chan bool) {
}
}
+ // if no results, return early
+ if len(W) == 0 {
+ wait <- true
+ return
+
+ }
+
err = app.save_offer(W)
if err != nil {
Warn(err, "Saving offers failed. Shop: "+shop.Name)
@@ -72,9 +78,29 @@ func (app *App) ScrapeShop(shop Shop) []Angebot {
return app.ScrapeWhiskyworld(shop)
case "Whiskyzone":
return app.ScrapeWhiskyzone(shop)
+ case "Drankdozijn":
+ return app.ScrapeDrankdozijn(shop)
default:
log.Println(shop.Name + ": No Crawler")
}
return []Angebot{}
}
+
+/*
+ * Sets the crawler config.
+ */
+func (app *App) customCollector(allowed_urls []string) *colly.Collector {
+ c := colly.NewCollector(
+ colly.UserAgent(app.Config.UserAgent),
+ colly.AllowedDomains(allowed_urls...),
+ )
+ c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT
+
+ c.Limit(&colly.LimitRule{
+ DomainGlob: "*",
+ RandomDelay: time.Duration(app.Config.Delay) * time.Second,
+ })
+
+ return c
+}
diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go
index b92896d..8722211 100644
--- a/crawler/shop_bottleworld.go
+++ b/crawler/shop_bottleworld.go
@@ -12,10 +12,7 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot {
Shop_url := "https://www.bottleworld.de/aktuelle-sonderpreise/show/all"
Whiskys := []Angebot{}
- c := colly.NewCollector(
- colly.AllowedDomains("bottleworld.de"),
- colly.AllowedDomains("www.bottleworld.de"),
- )
+ c := app.customCollector([]string{"bottleworld.de", "www.bottleworld.de"})
c.OnHTML("li.item", func(e *colly.HTMLElement) {
W := Angebot{}
@@ -63,7 +60,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot {
W.Image_url = e.ChildAttr("img", "src")
- e.Request.Visit(W.Url)
+ erro := e.Request.Visit(W.Url)
+ if erro != nil {
+ Warn(nil, W.Url+" "+erro.Error())
+ }
var ctx string
W.Volume, ctx = get_volume(e)
@@ -112,7 +112,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot {
e.Request.Ctx.Put("spirit_type", detect_spirit_type(text_noisy))
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_drankdozijn.go b/crawler/shop_drankdozijn.go
new file mode 100644
index 0000000..96d914d
--- /dev/null
+++ b/crawler/shop_drankdozijn.go
@@ -0,0 +1,209 @@
+package main
+
+import (
+ "net/http"
+ "strconv"
+ "strings"
+
+ log "github.com/Sirupsen/logrus"
+ "github.com/gocolly/colly"
+)
+
+func (app *App) ScrapeDrankdozijn(shop Shop) []Angebot {
+ Shop_url_base := "https://drankdozijn.de/aanbiedingen/"
+ var Shop_url string
+ Async_url := "https://drankdozijn.de/async/scroll"
+
+ Offers := []Angebot{}
+
+ types := map[int]string{230: "Whisky", 270: "Gin", 220: "Wodka", 210: "Rum", 250: "Likör", 240: "Cognac", 100: "Champagner"}
+ //types := map[int]string{240: "Likör"}
+ var current_type string
+
+ c := app.customCollector([]string{"drankdozijn.de"})
+
+ c.OnHTML(".product_top", func(e *colly.HTMLElement) {
+
+ if e.Request.URL.String() != Shop_url && e.Request.URL.String() != Async_url {
+ //Debug(nil, "Drankdozijn.de: Request url ("+e.Request.URL.String()+") is not shop url ("+Shop_url+").")
+ return
+ }
+
+ W := Angebot{}
+
+ W.Shop = shop.Id
+ W.Spirit_type = current_type
+
+ var err error
+ var skip_offer bool
+
+ e.ForEach(".product_image", func(i int, e *colly.HTMLElement) {
+ W.Url = e.ChildAttr("a", "href")
+ W.Image_url = e.ChildAttr("img", "src")
+ })
+ e.ForEach(".product_title", func(i int, e *colly.HTMLElement) {
+ W.Name = e.ChildText("a")
+ })
+
+ if strings.Contains(W.Name, "+ gratis") || strings.Contains(W.Name, "& gratis") {
+ DebugOffer(W, "Drankdozijn: Skip Offer")
+ return
+ }
+
+ e.ForEach(".product_price", func(i int, e *colly.HTMLElement) {
+ original_price_noisy := e.ChildText(".product_acties")
+ if !strings.Contains(original_price_noisy, "€") {
+ PrintlnOffer(W, "Drankdozijn: Original price has no € sign. Skipping!")
+ skip_offer = true
+ return
+ }
+ W.Original_price, err = convert_price(original_price_noisy)
+ if err != nil {
+ W.error_msg = err.Error()
+ W.error_ctx = e.ChildText(".product_acties")
+ PrintlnOffer(W, "Drankdozijn: Converting original price failed")
+ return
+ }
+ W.Discounted_price, err = convert_price(e.ChildText(".product_aanbieding_prijs"))
+ if err != nil {
+ W.error_msg = err.Error()
+ W.error_ctx = e.ChildText(".product_aanbieding_prijs")
+ PrintlnOffer(W, "Drankdozijn: Converting discounted price failed")
+ return
+ }
+ })
+
+ if skip_offer {
+ return
+ }
+
+ e.Request.Visit(W.Url)
+
+ var ctx string
+
+ W.Volume, ctx = get_volume(e)
+ if W.Volume == 0 {
+ W.error_msg = e.Request.Ctx.Get("volume")
+ W.error_ctx = ctx
+ PrintlnOffer(W, "Drankdozijn: Volume is zero")
+ return
+ }
+
+ W.Abv, ctx = get_abv(e)
+ if W.Abv == 0 {
+ W.error_msg = "Drankdozijn: Abv is zero"
+ W.error_ctx = ctx
+ PrintlnOffer(W, "Drankdozijn: abv is zero")
+ return
+ }
+
+ base_price_noisy := e.Request.Ctx.Get("base_price")
+ W.Base_price, err = convert_price(base_price_noisy)
+ if err != nil {
+ W.error_msg = err.Error()
+ W.error_ctx = e.ChildText(".price_l")
+ PrintlnOffer(W, "Drankdozijn: Converting base price failed")
+ return
+ }
+
+ if current_type == "Cognac" {
+ W.Spirit_type = e.Request.Ctx.Get("spirit_type")
+ }
+ if current_type == "Champagner" && e.Request.Ctx.Get("spirit_type") != "Champagner" {
+ DebugOffer(W, "Drankdozijn: Skip Offer")
+ return
+ }
+
+ W.Website = e.Request.Ctx.Get("website")
+
+ //DebugOffer(W, "DEBUG")
+
+ Offers = append(Offers, W)
+ })
+
+ c.OnHTML(".main_price", func(e *colly.HTMLElement) {
+ //e.Request.Ctx.Put("base_price", strings.TrimPrefix(e.ChildText(".price_l"), "/L"))
+ e.Request.Ctx.Put("base_price", e.ChildText(".price_l"))
+ })
+
+ c.OnHTML(".main_description", func(e *colly.HTMLElement) {
+ prev := ""
+ count := 0
+ e.ForEach(".col-xs-6", func(i int, e *colly.HTMLElement) {
+ if count%2 == 0 {
+ prev = e.Text
+ } else {
+ switch strings.TrimSpace(prev) {
+ case "Inhalt":
+ e.Request.Ctx.Put("volume", e.Text)
+ case "Alkoholgehalt":
+ e.Request.Ctx.Put("abv", e.Text)
+ case "Kategorie":
+ e.Request.Ctx.Put("spirit_type", e.Text)
+ }
+
+ prev = ""
+ }
+ count++
+ })
+ })
+
+ c.OnHTML("body", func(e *colly.HTMLElement) {
+ if e.Request.URL.String() == Shop_url {
+ return
+ }
+ e.Request.Ctx.Put("website", string(e.Response.Body))
+ })
+
+ var cookie *http.Cookie
+ var has_cookie bool
+ c.OnResponse(func(r *colly.Response) {
+ //log.Debug("Cookies:", c.Cookies(r.Request.URL.String()))
+ if len(c.Cookies(r.Request.URL.String())) > 0 {
+ has_cookie = true
+ cookie = c.Cookies(r.Request.URL.String())[0]
+ }
+ })
+
+ for groepnr, cur_type := range types {
+ current_type = cur_type
+ switch current_type {
+ case "Wodka":
+ Shop_url = Shop_url_base + "vodka"
+ case "Likör":
+ Shop_url = Shop_url_base + "likeuren"
+ case "Champagner":
+ Shop_url = Shop_url_base + "wijn"
+ default:
+ Shop_url = Shop_url_base + current_type
+ }
+
+ //log.Debug(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": Error (Visit): "+err.Error())
+ }
+
+ c.OnRequest(func(r *colly.Request) {
+ r.Headers.Set("X-Requested-With", "XMLHttpRequest")
+ r.Headers.Set("Referer", Shop_url)
+ if has_cookie {
+ //log.Debug("Setting Cookie: " + cookie.String())
+ r.Headers.Set("Cookie", cookie.String())
+ }
+ })
+
+ for i := 12; true; i = i + 12 {
+ log.Debug("Crawling Drankdozijn: type = " + cur_type + " items = " + strconv.Itoa(i))
+ err := c.Post(Async_url, map[string]string{"items": strconv.Itoa(i), "datum": "0", "groepnr": strconv.Itoa(groepnr)})
+ if err != nil {
+ if "EOF" != err.Error() {
+ Warn(nil, shop.Name+": Error (Post): "+err.Error())
+ }
+ break
+ }
+ }
+ }
+
+ return Offers
+}
diff --git a/crawler/shop_mcwhisky.go b/crawler/shop_mcwhisky.go
index cea020a..941f3b9 100644
--- a/crawler/shop_mcwhisky.go
+++ b/crawler/shop_mcwhisky.go
@@ -11,10 +11,7 @@ func (app *App) ScrapeMCWhisky(shop Shop) []Angebot {
Whiskys := []Angebot{}
- c := colly.NewCollector(
- colly.AllowedDomains("mcwhisky.com"),
- colly.AllowedDomains("www.mcwhisky.com"),
- )
+ c := app.customCollector([]string{"mcwhisky.com", "www.mcwhisky.com"})
c.OnHTML("li.item", func(e *colly.HTMLElement) {
@@ -133,7 +130,10 @@ func (app *App) ScrapeMCWhisky(shop Shop) []Angebot {
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_rumundco.go b/crawler/shop_rumundco.go
index 1ce202f..45069c2 100644
--- a/crawler/shop_rumundco.go
+++ b/crawler/shop_rumundco.go
@@ -14,10 +14,7 @@ func (app *App) ScrapeRumundCo(shop Shop) []Angebot {
Whiskys := []Angebot{}
- c := colly.NewCollector(
- colly.AllowedDomains("rumundco.de"),
- colly.AllowedDomains("www.rumundco.de"),
- )
+ c := app.customCollector([]string{"rumundco.de", "www.rumundco.de"})
c.OnHTML(".product-teaser", func(e *colly.HTMLElement) {
@@ -158,7 +155,10 @@ func (app *App) ScrapeRumundCo(shop Shop) []Angebot {
e.Request.Ctx.Put("website", string(e.Response.Body))
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go
index af86bdc..93bff23 100644
--- a/crawler/shop_whic.go
+++ b/crawler/shop_whic.go
@@ -12,9 +12,7 @@ func (app *App) ScrapeWhic(shop Shop) []Angebot {
Shop_url := "https://whic.de/angebote"
Whiskys := []Angebot{}
- c := colly.NewCollector(
- colly.AllowedDomains("whic.de"),
- )
+ c := app.customCollector([]string{"whic.de"})
c.OnHTML("li.item", func(e *colly.HTMLElement) {
@@ -129,7 +127,10 @@ func (app *App) ScrapeWhic(shop Shop) []Angebot {
e.Request.Ctx.Put("website", string(e.Response.Body))
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_whiskyde.go b/crawler/shop_whiskyde.go
index 0245c85..d3087ca 100644
--- a/crawler/shop_whiskyde.go
+++ b/crawler/shop_whiskyde.go
@@ -11,14 +11,12 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot {
Whiskys := []Angebot{}
- c := colly.NewCollector(
- colly.AllowedDomains("whisky.de"),
- colly.AllowedDomains("www.whisky.de"),
- )
+ c := app.customCollector([]string{"whisky.de", "www.whisky.de"})
c.OnHTML(".is-buyable", func(e *colly.HTMLElement) {
if e.Request.URL.String() != Shop_url {
+ Debug(nil, "Whisky.de: Request url ("+e.Request.URL.String()+") is not shop url ("+Shop_url+").")
return
}
@@ -120,7 +118,10 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot {
e.Request.Ctx.Put("website", string(e.Response.Body))
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_whiskysitenl.go b/crawler/shop_whiskysitenl.go
index f1b667c..e3ae075 100644
--- a/crawler/shop_whiskysitenl.go
+++ b/crawler/shop_whiskysitenl.go
@@ -13,10 +13,7 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot {
Shop_url := "https://www.whiskysite.nl/en/specials/?limit=100"
- c := colly.NewCollector(
- colly.AllowedDomains("whiskysite.nl"),
- colly.AllowedDomains("www.whiskysite.nl"),
- )
+ c := app.customCollector([]string{"whiskysite.nl", "www.whiskysite.nl"})
c.OnHTML(".product-block", func(e *colly.HTMLElement) {
@@ -141,7 +138,10 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot {
e.Request.Ctx.Put("website", string(e.Response.Body))
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go
index af97511..3f0874d 100644
--- a/crawler/shop_whiskyworld.go
+++ b/crawler/shop_whiskyworld.go
@@ -15,11 +15,7 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot {
Whiskys := []Angebot{}
- c := colly.NewCollector(
- colly.UserAgent("friendly"),
- colly.AllowedDomains("whiskyworld.de"),
- colly.AllowedDomains("www.whiskyworld.de"),
- )
+ c := app.customCollector([]string{"whiskyworld.de", "www.whiskyworld.de"})
c.OnHTML(".product-item", func(e *colly.HTMLElement) {
if !stringInSlice(e.Request.URL.String(), Shop_urls) {
@@ -106,7 +102,10 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot {
})
- W.Image_url = "https:" + e.ChildAttr("img", "data-src")
+ W.Image_url = e.ChildAttr("img", "data-src")
+ if !strings.HasPrefix(W.Image_url, "https:") {
+ W.Image_url = "https:" + W.Image_url
+ }
e.Request.Visit(W.Url)
W.Website = e.Request.Ctx.Get("website")
@@ -124,7 +123,10 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot {
})
for _, url := range Shop_urls {
- c.Visit(url)
+ err := c.Visit(url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
}
return Whiskys
diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go
index 2c1fb99..dbaf0ba 100644
--- a/crawler/shop_whiskyzone.go
+++ b/crawler/shop_whiskyzone.go
@@ -13,10 +13,7 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot {
Whiskys := []Angebot{}
- c := colly.NewCollector(
- colly.AllowedDomains("whiskyzone.de"),
- colly.AllowedDomains("www.whiskyzone.de"),
- )
+ c := app.customCollector([]string{"whiskyzone.de", "www.whiskyzone.de"})
c.OnHTML(".product--info", func(e *colly.HTMLElement) {
@@ -157,7 +154,10 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot {
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shops.go b/crawler/shops.go
index d9fcc0d..79eff96 100644
--- a/crawler/shops.go
+++ b/crawler/shops.go
@@ -90,6 +90,14 @@ func getShopsFromStruct() []Shop {
Shipping_costs: 495,
Free_shipping: "75€",
})
+ Shops = append(Shops, Shop{
+ Name: "Drankdozijn",
+ Url: "https://Drankdozijn.de",
+ Short_url: "https://l.fuselkoenig.de/drankdozijn",
+ Logo_url: "",
+ Shipping_costs: 595,
+ Free_shipping: "250€",
+ })
return Shops
}
diff --git a/crawler/utility.go b/crawler/utility.go
index 5fa78c4..0650546 100644
--- a/crawler/utility.go
+++ b/crawler/utility.go
@@ -81,12 +81,29 @@ func detect_spirit_type(name string) string {
}
func extract_volume(volume string) (float32, error) {
- r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`)
+ var volume_noisy string
+ var is_litre_instead_of_cl bool
+
+ // difference between cl...
+ r_cl, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[cC][lL]`)
if err != nil {
- Fatal(err, "Extract volume regex failed")
+ Fatal(err, "Extract volume (centiliter) regex failed")
+ }
+
+ volume_noisy = r_cl.FindString(volume)
+
+ if volume_noisy == "" {
+ // ...and litre
+ is_litre_instead_of_cl = true
+
+ r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`)
+ if err != nil {
+ Fatal(err, "Extract volume regex failed")
+ }
+ volume_noisy = r_liter.FindString(volume)
}
- volume_noisy := r_liter.FindString(volume)
+ // extract numbers
r_liter2, err := regexp.Compile(`[0-9]+([,.][0-9]+)?`)
if err != nil {
Fatal(err, "2nd extract volume regex failed")
@@ -99,6 +116,11 @@ func extract_volume(volume string) (float32, error) {
return 0, err
}
+ // converting from cl to litre
+ if !is_litre_instead_of_cl {
+ volume64 = volume64 / 100
+ }
+
return float32(volume64), err
}