summary | refs | log | tree | commit | diff
path: root/crawler
diff options
context:
space:
mode:
author horus 2018-02-16 18:06:50 +0100
committer horus 2018-02-16 18:06:50 +0100
commit bf5f6b98a1d933d5f0ffb7fe965428f4dab5e3b0 (patch)
tree c95eb6426b61965b37da2b60da36cfe2c02a92b4 /crawler
parent ed6ab4da59f80bf9fa2cbf15da5c9167dff44ea4 (diff)
download alkobote-bf5f6b98a1d933d5f0ffb7fe965428f4dab5e3b0.tar.gz
Structured logging part two. (crawler)
Diffstat (limited to 'crawler')
-rw-r--r-- crawler/sanitize.go 35
-rw-r--r-- crawler/scrape.go 2
-rw-r--r-- crawler/shop_bottleworld.go 17
-rw-r--r-- crawler/shop_mcwhisky.go 21
-rw-r--r-- crawler/shop_rumundco.go 6
-rw-r--r-- crawler/shop_whic.go 21
-rw-r--r-- crawler/shop_whiskyde.go 15
-rw-r--r-- crawler/shop_whiskyworld.go 23
-rw-r--r-- crawler/shop_whiskyzone.go 9
-rw-r--r-- crawler/utility.go 10
10 files changed, 102 insertions, 57 deletions
diff --git a/crawler/sanitize.go b/crawler/sanitize.go
index 949e0f0..4f76c69 100644
--- a/crawler/sanitize.go
+++ b/crawler/sanitize.go
@@ -1,12 +1,13 @@
package main
import (
- log "github.com/Sirupsen/logrus"
"regexp"
"strings"
+
+ log "github.com/Sirupsen/logrus"
)
-func sanitize_offer(angebote []Angebot) []Angebot {
+func sanitize_offer(angebote []Angebot, shop Shop) []Angebot {
var W []Angebot
@@ -14,23 +15,31 @@ func sanitize_offer(angebote []Angebot) []Angebot {
offer.Name = sanitize_name(offer.Name)
if offer.Abv == 0 {
- log.Println("sanitize.go: abv zero: " + offer.Name + "( " + offer.Url + ")")
+ DebugOffer(offer, "Sanitizer: Abv is zero")
continue
}
if offer.Volume == 0 {
- log.Println("sanitize.go: volume zero: " + offer.Name + "( " + offer.Url + ")")
+ DebugOffer(offer, "Sanitizer: Volume is zero")
continue
}
if offer.Discounted_price == 0 {
- log.Println("sanitize.go: discounted_price zero: " + offer.Name + "( " + offer.Url + ")")
+ DebugOffer(offer, "Sanitizer: Discounted price is zero")
continue
}
if offer.Original_price == 0 {
- log.Println("sanitize.go: original_price zero: " + offer.Name + "( " + offer.Url + ")")
+ DebugOffer(offer, "Sanitizer: Original price is zero")
continue
}
if offer.Base_price == 0 {
- log.Println("sanitize.go: base_price zero: " + offer.Name + "( " + offer.Url + ")")
+ DebugOffer(offer, "Sanitizer: Base price is zero")
+ continue
+ }
+ if offer.Url == "" {
+ DebugOffer(offer, "Sanitizer: URL is empty")
+ continue
+ }
+ if offer.Image_url == "" {
+ DebugOffer(offer, "Sanitizer: Image-URL is empty")
continue
}
@@ -39,6 +48,10 @@ func sanitize_offer(angebote []Angebot) []Angebot {
W = append(W, offer)
}
+ if len(W) < 1 {
+ log.Warn("Sanitizer: No results for shop: " + shop.Name)
+ }
+
return W
}
@@ -57,7 +70,7 @@ func sanitize_name(name string) string {
r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "sanitize_name: Liter-Regexp failed")
}
for {
name_liter := r_liter.FindString(name)
@@ -80,7 +93,7 @@ func sanitize_name(name string) string {
r_procent, err := regexp.Compile(`[0-9]+([,.][0-9]+)?\%`)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "sanitize_name: Procent-Regexp failed")
}
for {
name_procent := r_procent.FindString(name)
@@ -93,7 +106,7 @@ func sanitize_name(name string) string {
r_release, err := regexp.Compile(`Release$`)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "sanitize_name: Release-Regexp failed")
}
name_release := r_release.FindString(name)
name = strings.Replace(name, name_release, "", 1)
@@ -101,7 +114,7 @@ func sanitize_name(name string) string {
r_2x, err := regexp.Compile(`[0-9]+( )*[xX]( )`)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "sanitize_name: '2x'-Regexp failed")
}
for {
name_2x := r_2x.FindString(name)
diff --git a/crawler/scrape.go b/crawler/scrape.go
index ced1d98..f6ad80b 100644
--- a/crawler/scrape.go
+++ b/crawler/scrape.go
@@ -30,7 +30,7 @@ func (app *App) Scrape(shop Shop, wait chan bool) {
W = ScrapeShop(shop)
- W = sanitize_offer(W)
+ W = sanitize_offer(W, shop)
err = app.save_offer(W)
if err != nil {
diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go
index 2eea3e5..db45791 100644
--- a/crawler/shop_bottleworld.go
+++ b/crawler/shop_bottleworld.go
@@ -1,7 +1,6 @@
package main
import (
- log "github.com/Sirupsen/logrus"
"regexp"
"strings"
@@ -33,13 +32,13 @@ func ScrapeBottleWord(shop Shop) []Angebot {
e.ForEach(".old-price", func(i int, e *colly.HTMLElement) {
W.Original_price, err = convert_price(e.ChildText(".price"))
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Bottleworld: Converting original price failed")
}
})
e.ForEach(".special-price", func(i int, e *colly.HTMLElement) {
W.Discounted_price, err = convert_price(e.ChildText(".price"))
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Bottleworld: Converting discounted price failed")
}
})
})
@@ -47,7 +46,7 @@ func ScrapeBottleWord(shop Shop) []Angebot {
price_per_litre_noisy := e.ChildText(".price-per-liter")
price_per_litre, err := sanitize_base_price(price_per_litre_noisy)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Bottleworld: Sanitizing base price failed")
}
W.Base_price = price_per_litre
@@ -58,7 +57,15 @@ func ScrapeBottleWord(shop Shop) []Angebot {
W.Shop = shop.Id
W.Volume = get_volume(e)
+ if W.Volume == 0 {
+ DebugOffer(W, "Bottleworld: Volume is zero")
+ return
+ }
W.Abv = get_abv(e)
+ if W.Abv == 0 {
+ DebugOffer(W, "Bottleworld: Abv is zero")
+ return
+ }
W.Spirit_type = e.Request.Ctx.Get("spirit_type")
W.Website = e.Request.Ctx.Get("website")
@@ -71,7 +78,7 @@ func ScrapeBottleWord(shop Shop) []Angebot {
td_str := e.ChildText("td")
matched, err := regexp.MatchString("^[0-9]+([,.][0-9]+)? l$", td_str)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Bottleworld: Volume and ABV Regex failed")
}
if matched {
e.Request.Ctx.Put("volume", td_str)
diff --git a/crawler/shop_mcwhisky.go b/crawler/shop_mcwhisky.go
index 165d944..b423c72 100644
--- a/crawler/shop_mcwhisky.go
+++ b/crawler/shop_mcwhisky.go
@@ -1,11 +1,8 @@
package main
import (
- log "github.com/Sirupsen/logrus"
"regexp"
- // "strings"
- // "github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly"
)
@@ -38,13 +35,13 @@ func ScrapeMCWhisky(shop Shop) []Angebot {
e.ForEach(".old-price", func(i int, e *colly.HTMLElement) {
W.Original_price, err = convert_price(e.ChildText(".price"))
if err != nil {
- log.Fatal(err)
+ Fatal(err, "MC Whisky: Converting original price failed")
}
})
e.ForEach(".special-price", func(i int, e *colly.HTMLElement) {
W.Discounted_price, err = convert_price(e.ChildText(".price"))
if err != nil {
- log.Fatal(err)
+ Fatal(err, "MC Whisky: Converting discounted price failed")
}
})
})
@@ -52,7 +49,7 @@ func ScrapeMCWhisky(shop Shop) []Angebot {
price_per_litre_noisy := e.ChildText(".price-box-extended-info-ppl")
W.Base_price, err = sanitize_base_price(price_per_litre_noisy)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "MC Whisky: Sanitizing base price failed")
}
W.Image_url = e.ChildAttr("img", "src")
@@ -60,7 +57,15 @@ func ScrapeMCWhisky(shop Shop) []Angebot {
e.Request.Visit(W.Url)
W.Volume = get_volume(e)
+ if W.Volume == 0 { // guard the volume just parsed above; Abv is not assigned until the next statement
+ DebugOffer(W, "MC Whisky: Volume is zero")
+ return
+ }
W.Abv = get_abv(e)
+ if W.Abv == 0 {
+ DebugOffer(W, "MC Whisky: Abv is zero")
+ return
+ }
W.Shop = shop.Id
W.Spirit_type = "Whisky"
@@ -75,14 +80,14 @@ func ScrapeMCWhisky(shop Shop) []Angebot {
r_abv, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?%`)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "MC Whisky: ABV regex failed")
}
e.Request.Ctx.Put("abv", r_abv.FindString(text_noisy))
r_volume, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?Liter$`)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "MC Whisky: Volume regex failed")
}
e.Request.Ctx.Put("volume", r_volume.FindString(text_noisy))
diff --git a/crawler/shop_rumundco.go b/crawler/shop_rumundco.go
index e1516ba..25b89bd 100644
--- a/crawler/shop_rumundco.go
+++ b/crawler/shop_rumundco.go
@@ -95,7 +95,7 @@ func ScrapeRumundCo(shop Shop) []Angebot {
W.Volume = get_volume(e)
if W.Volume == 0 {
- PrintlnOffer(W, "Rum & Co: No Volume found")
+ DebugOffer(W, "Rum & Co: Volume is zero")
return
}
@@ -107,6 +107,10 @@ func ScrapeRumundCo(shop Shop) []Angebot {
Fatal(err, "Rum & Co: Base price: Extracting ABV failed")
}
}
+ if W.Abv == 0 {
+ DebugOffer(W, "Rum & Co: Abv is zero")
+ return
+ }
W.Shop = shop.Id
W.Spirit_type = "Whisky"
diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go
index e2bb6b9..e082ad1 100644
--- a/crawler/shop_whic.go
+++ b/crawler/shop_whic.go
@@ -1,7 +1,6 @@
package main
import (
- log "github.com/Sirupsen/logrus"
"regexp"
"strings"
@@ -37,13 +36,13 @@ func ScrapeWhic(shop Shop) []Angebot {
e.ForEach(".old-price", func(i int, e *colly.HTMLElement) {
W.Original_price, err = convert_price(e.ChildText(".price"))
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whic: Converting original price failed")
}
})
e.ForEach(".special-price", func(i int, e *colly.HTMLElement) {
W.Discounted_price, err = convert_price(e.ChildText(".price"))
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whic: Converting discounted price failed")
}
})
})
@@ -51,7 +50,7 @@ func ScrapeWhic(shop Shop) []Angebot {
base_price_noisy := e.ChildText(".base-price")
W.Base_price, err = sanitize_base_price(base_price_noisy)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whic: Sanitizing base price failed")
}
/*
@@ -61,13 +60,21 @@ func ScrapeWhic(shop Shop) []Angebot {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(img_link_noisy))
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whic: Parsing document in Goquery failed")
}
W.Image_url, _ = doc.Find("img").Attr("src")
e.Request.Visit(W.Url)
W.Volume = get_volume(e)
+ if W.Volume == 0 {
+ DebugOffer(W, "Whic: Volume is zero")
+ return
+ }
W.Abv = get_abv(e)
+ if W.Abv == 0 {
+ DebugOffer(W, "Whic: Abv is zero")
+ return
+ }
W.Shop = shop.Id
W.Spirit_type = "Whisky"
@@ -81,7 +88,7 @@ func ScrapeWhic(shop Shop) []Angebot {
r_volume, err := regexp.Compile("Volumen: ([0-9]+([.,][0-9]+)) Liter")
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whic: Volumen regex failed")
}
volume := r_volume.FindStringSubmatch(text_noisy)
if volume == nil || len(volume) < 2 {
@@ -92,7 +99,7 @@ func ScrapeWhic(shop Shop) []Angebot {
r_abv, err := regexp.Compile("Alkoholgehalt: ([0-9]+([.,][0-9]+))%")
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whic: Abv regex failed")
}
abv := r_abv.FindStringSubmatch(text_noisy)
if abv == nil || len(abv) < 2 {
diff --git a/crawler/shop_whiskyde.go b/crawler/shop_whiskyde.go
index e635546..f13190b 100644
--- a/crawler/shop_whiskyde.go
+++ b/crawler/shop_whiskyde.go
@@ -1,7 +1,6 @@
package main
import (
- log "github.com/Sirupsen/logrus"
"strings"
"github.com/gocolly/colly"
@@ -34,13 +33,13 @@ func ScrapeWhiskyde(shop Shop) []Angebot {
e.ForEach(".article-price-original", func(i int, e *colly.HTMLElement) {
W.Original_price, err = convert_price(e.ChildText("del"))
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whisky.de: Converting original price failed")
}
})
e.ForEach(".article-price", func(i int, e *colly.HTMLElement) {
W.Discounted_price, err = convert_price(e.ChildText(".article-price-default"))
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whisky.de: Converting discounted price failed")
}
})
@@ -61,19 +60,19 @@ func ScrapeWhiskyde(shop Shop) []Angebot {
abv_noisy := strings.TrimSpace(strings.SplitAfter(text_noisy, "Liter")[1])
W.Volume, err = extract_volume(text_noisy)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whisky.de: Extracting volume failed")
}
W.Abv, err = extract_abv(abv_noisy)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whisky.de: Extracting abv failed")
}
if W.Volume == 0 {
- log.Println("Whisky.de: " + W.Name + " kein Volume erkannt")
+ DebugOffer(W, "Whisky.de: Volume is zero")
return
}
if W.Abv == 0 {
- log.Println("Whisky.de: " + W.Name + " kein Abv erkannt")
+ DebugOffer(W, "Whisky.de: Abv is zero")
return
}
@@ -82,7 +81,7 @@ func ScrapeWhiskyde(shop Shop) []Angebot {
W.Base_price, err = convert_price(e.ChildText(".article-unitprice-default"))
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whisky.de: Converting base price failed")
}
e.Request.Visit(W.Url)
diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go
index 9b968a3..5235d3c 100644
--- a/crawler/shop_whiskyworld.go
+++ b/crawler/shop_whiskyworld.go
@@ -1,7 +1,6 @@
package main
import (
- log "github.com/Sirupsen/logrus"
"strings"
"github.com/gocolly/colly"
@@ -43,15 +42,13 @@ func ScrapeWhiskyworld(shop Shop) []Angebot {
W.Original_price, err = convert_price(regular_price)
if err != nil {
- log.Println("Whisky World: Original_price failed: " + regular_price + " // " + W.Name + " // " + W.Url + " // " + e.Request.URL.String())
- log.Fatal(err)
+ Fatal(err, "Whiskyworld: Converting original price failed")
return
}
W.Discounted_price, err = convert_price(e.ChildText(".uvp"))
if err != nil {
- log.Println("Whisky World: Discounted_price failed")
- log.Fatal(err)
+ Fatal(err, "Whiskyworld: Converting discounted price failed")
return
}
@@ -59,13 +56,13 @@ func ScrapeWhiskyworld(shop Shop) []Angebot {
text_noisy := e.ChildText(".item-inh")
W.Volume, err = extract_volume(text_noisy)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whiskyworld: Extracting volume failed")
}
abv_noisy := strings.TrimSpace(strings.SplitAfter(text_noisy, "Liter")[1])
abv_noisy = strings.TrimPrefix(abv_noisy, "/")
W.Abv, err = extract_abv(abv_noisy)
if err != nil {
- log.Fatal(err)
+ Fatal(err, "Whiskyworld: Extracting abv failed")
}
})
@@ -75,8 +72,7 @@ func ScrapeWhiskyworld(shop Shop) []Angebot {
base_price_noisy = strings.TrimSpace(strings.SplitAfter(base_price_noisy, "Liter")[0])
W.Base_price, err = sanitize_base_price(base_price_noisy)
if err != nil {
- log.Println("Whisky World: Base_price failed")
- log.Fatal(err)
+ Fatal(err, "Whiskyworld: Sanitizing base price failed")
}
}
@@ -87,6 +83,15 @@ func ScrapeWhiskyworld(shop Shop) []Angebot {
W.Shop = shop.Id
W.Spirit_type = "Whisky"
+ if W.Volume == 0 {
+ DebugOffer(W, "Whiskyworld: Volume is zero")
+ return
+ }
+ if W.Abv == 0 {
+ DebugOffer(W, "Whiskyworld: Abv is zero")
+ return
+ }
+
e.Request.Visit(W.Url)
W.Website = e.Request.Ctx.Get("website")
diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go
index c07c14f..8d86b8a 100644
--- a/crawler/shop_whiskyzone.go
+++ b/crawler/shop_whiskyzone.go
@@ -56,6 +56,15 @@ func ScrapeWhiskyzone(shop Shop) []Angebot {
W.Volume = get_volume(e)
W.Abv = get_abv(e)
+ if W.Volume == 0 {
+ DebugOffer(W, "Whiskyzone: Volume is zero")
+ return
+ }
+ if W.Abv == 0 {
+ DebugOffer(W, "Whiskyzone: Abv is zero")
+ return
+ }
+
base_price := e.Request.Ctx.Get("base_price")
if base_price == "same_as_discounted_price" {
W.Base_price = W.Discounted_price
diff --git a/crawler/utility.go b/crawler/utility.go
index 98587c9..3c587b9 100644
--- a/crawler/utility.go
+++ b/crawler/utility.go
@@ -138,17 +138,15 @@ func get_volume(e *colly.HTMLElement) float32 {
matched, err := regexp.MatchString(`[lL](iter)?`, volume_noisy)
if err != nil {
Fatal(err, "Get volume regex failed")
- log.Fatal(err)
}
if !matched {
- log.Println("get_volume: not matched: " + volume_noisy)
+ log.Debug("get_volume: not matched: " + volume_noisy)
return 0
}
volume, err := extract_volume(volume_noisy)
if err != nil {
- log.Println("get_volume: " + volume_noisy)
- Fatal(err, "Get Volume: Extract Volume failed")
+ Fatal(err, "Get Volume: Extract Volume failed: "+volume_noisy)
}
return volume
@@ -168,8 +166,7 @@ func get_abv(e *colly.HTMLElement) float32 {
abv, err := extract_abv(abv_noisy)
if err != nil {
- log.Println("get_abv: " + abv_noisy)
- Fatal(err, "Get ABV: Extract ABV failed")
+ Fatal(err, "Get ABV: Extract ABV failed: "+abv_noisy)
}
return abv
@@ -188,7 +185,6 @@ func get_base_price(e *colly.HTMLElement) int {
base_price, err := sanitize_base_price(base_price_noisy)
if err != nil {
- log.Println("get_base_price: " + base_price_noisy)
Fatal(err, "Get base price: sanitize base price failed")
}