diff options
Diffstat (limited to 'crawler')
| -rw-r--r-- | crawler/sanitize.go | 35 | ||||
| -rw-r--r-- | crawler/scrape.go | 2 | ||||
| -rw-r--r-- | crawler/shop_bottleworld.go | 17 | ||||
| -rw-r--r-- | crawler/shop_mcwhisky.go | 21 | ||||
| -rw-r--r-- | crawler/shop_rumundco.go | 6 | ||||
| -rw-r--r-- | crawler/shop_whic.go | 21 | ||||
| -rw-r--r-- | crawler/shop_whiskyde.go | 15 | ||||
| -rw-r--r-- | crawler/shop_whiskyworld.go | 23 | ||||
| -rw-r--r-- | crawler/shop_whiskyzone.go | 9 | ||||
| -rw-r--r-- | crawler/utility.go | 10 |
10 files changed, 102 insertions, 57 deletions
diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 949e0f0..4f76c69 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -1,12 +1,13 @@ package main import ( - log "github.com/Sirupsen/logrus" "regexp" "strings" + + log "github.com/Sirupsen/logrus" ) -func sanitize_offer(angebote []Angebot) []Angebot { +func sanitize_offer(angebote []Angebot, shop Shop) []Angebot { var W []Angebot @@ -14,23 +15,31 @@ func sanitize_offer(angebote []Angebot) []Angebot { offer.Name = sanitize_name(offer.Name) if offer.Abv == 0 { - log.Println("sanitize.go: abv zero: " + offer.Name + "( " + offer.Url + ")") + DebugOffer(offer, "Sanitizer: Abv is zero") continue } if offer.Volume == 0 { - log.Println("sanitize.go: volume zero: " + offer.Name + "( " + offer.Url + ")") + DebugOffer(offer, "Sanitizer: Volume is zero") continue } if offer.Discounted_price == 0 { - log.Println("sanitize.go: discounted_price zero: " + offer.Name + "( " + offer.Url + ")") + DebugOffer(offer, "Sanitizer: Discounted price is zero") continue } if offer.Original_price == 0 { - log.Println("sanitize.go: original_price zero: " + offer.Name + "( " + offer.Url + ")") + DebugOffer(offer, "Sanitizer: Original price is zero") continue } if offer.Base_price == 0 { - log.Println("sanitize.go: base_price zero: " + offer.Name + "( " + offer.Url + ")") + DebugOffer(offer, "Sanitizer: Base price is zero") + continue + } + if offer.Url == "" { + DebugOffer(offer, "Sanitizer: URL is empty") + continue + } + if offer.Image_url == "" { + DebugOffer(offer, "Sanitizer: Image-URL is empty") continue } @@ -39,6 +48,10 @@ func sanitize_offer(angebote []Angebot) []Angebot { W = append(W, offer) } + if len(W) < 1 { + log.Warn("Sanitizer: No results for shop: " + shop.Name) + } + return W } @@ -57,7 +70,7 @@ func sanitize_name(name string) string { r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`) if err != nil { - log.Fatal(err) + Fatal(err, "sanitize_name: Liter-Regexp failed") } for { name_liter := 
r_liter.FindString(name) @@ -80,7 +93,7 @@ func sanitize_name(name string) string { r_procent, err := regexp.Compile(`[0-9]+([,.][0-9]+)?\%`) if err != nil { - log.Fatal(err) + Fatal(err, "sanitize_name: Procent-Regexp failed") } for { name_procent := r_procent.FindString(name) @@ -93,7 +106,7 @@ func sanitize_name(name string) string { r_release, err := regexp.Compile(`Release$`) if err != nil { - log.Fatal(err) + Fatal(err, "sanitize_name: Release-Regexp failed") } name_release := r_release.FindString(name) name = strings.Replace(name, name_release, "", 1) @@ -101,7 +114,7 @@ func sanitize_name(name string) string { r_2x, err := regexp.Compile(`[0-9]+( )*[xX]( )`) if err != nil { - log.Fatal(err) + Fatal(err, "sanitize_name: '2x'-Regexp failed") } for { name_2x := r_2x.FindString(name) diff --git a/crawler/scrape.go b/crawler/scrape.go index ced1d98..f6ad80b 100644 --- a/crawler/scrape.go +++ b/crawler/scrape.go @@ -30,7 +30,7 @@ func (app *App) Scrape(shop Shop, wait chan bool) { W = ScrapeShop(shop) - W = sanitize_offer(W) + W = sanitize_offer(W, shop) err = app.save_offer(W) if err != nil { diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go index 2eea3e5..db45791 100644 --- a/crawler/shop_bottleworld.go +++ b/crawler/shop_bottleworld.go @@ -1,7 +1,6 @@ package main import ( - log "github.com/Sirupsen/logrus" "regexp" "strings" @@ -33,13 +32,13 @@ func ScrapeBottleWord(shop Shop) []Angebot { e.ForEach(".old-price", func(i int, e *colly.HTMLElement) { W.Original_price, err = convert_price(e.ChildText(".price")) if err != nil { - log.Fatal(err) + Fatal(err, "Bottleworld: Converting original price failed") } }) e.ForEach(".special-price", func(i int, e *colly.HTMLElement) { W.Discounted_price, err = convert_price(e.ChildText(".price")) if err != nil { - log.Fatal(err) + Fatal(err, "Bottleworld: Converting discounted price failed") } }) }) @@ -47,7 +46,7 @@ func ScrapeBottleWord(shop Shop) []Angebot { price_per_litre_noisy := 
e.ChildText(".price-per-liter") price_per_litre, err := sanitize_base_price(price_per_litre_noisy) if err != nil { - log.Fatal(err) + Fatal(err, "Bottleworld: Sanitizing base price failed") } W.Base_price = price_per_litre @@ -58,7 +57,15 @@ func ScrapeBottleWord(shop Shop) []Angebot { W.Shop = shop.Id W.Volume = get_volume(e) + if W.Volume == 0 { + DebugOffer(W, "Bottleworld: Volume is zero") + return + } W.Abv = get_abv(e) + if W.Abv == 0 { + DebugOffer(W, "Bottleworld: Abv is zero") + return + } W.Spirit_type = e.Request.Ctx.Get("spirit_type") W.Website = e.Request.Ctx.Get("website") @@ -71,7 +78,7 @@ func ScrapeBottleWord(shop Shop) []Angebot { td_str := e.ChildText("td") matched, err := regexp.MatchString("^[0-9]+([,.][0-9]+)? l$", td_str) if err != nil { - log.Fatal(err) + Fatal(err, "Bottleworld: Volume and ABV Regex failed") } if matched { e.Request.Ctx.Put("volume", td_str) diff --git a/crawler/shop_mcwhisky.go b/crawler/shop_mcwhisky.go index 165d944..b423c72 100644 --- a/crawler/shop_mcwhisky.go +++ b/crawler/shop_mcwhisky.go @@ -1,11 +1,8 @@ package main import ( - log "github.com/Sirupsen/logrus" "regexp" - // "strings" - // "github.com/PuerkitoBio/goquery" "github.com/gocolly/colly" ) @@ -38,13 +35,13 @@ func ScrapeMCWhisky(shop Shop) []Angebot { e.ForEach(".old-price", func(i int, e *colly.HTMLElement) { W.Original_price, err = convert_price(e.ChildText(".price")) if err != nil { - log.Fatal(err) + Fatal(err, "MC Whisky: Converting original price failed") } }) e.ForEach(".special-price", func(i int, e *colly.HTMLElement) { W.Discounted_price, err = convert_price(e.ChildText(".price")) if err != nil { - log.Fatal(err) + Fatal(err, "MC Whisky: Converting discounted price failed") } }) }) @@ -52,7 +49,7 @@ func ScrapeMCWhisky(shop Shop) []Angebot { price_per_litre_noisy := e.ChildText(".price-box-extended-info-ppl") W.Base_price, err = sanitize_base_price(price_per_litre_noisy) if err != nil { - log.Fatal(err) + Fatal(err, "MC Whisky: Sanitizing base 
price failed") } W.Image_url = e.ChildAttr("img", "src") @@ -60,7 +57,15 @@ func ScrapeMCWhisky(shop Shop) []Angebot { e.Request.Visit(W.Url) W.Volume = get_volume(e) + if W.Volume == 0 { + DebugOffer(W, "MC Whisky: Volume is zero") + return + } W.Abv = get_abv(e) + if W.Abv == 0 { + DebugOffer(W, "MC Whisky: Abv is zero") + return + } W.Shop = shop.Id W.Spirit_type = "Whisky" @@ -75,14 +80,14 @@ func ScrapeMCWhisky(shop Shop) []Angebot { r_abv, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?%`) if err != nil { - log.Fatal(err) + Fatal(err, "MC Whisky: ABV regex failed") } e.Request.Ctx.Put("abv", r_abv.FindString(text_noisy)) r_volume, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?Liter$`) if err != nil { - log.Fatal(err) + Fatal(err, "MC Whisky: Volume regex failed") } e.Request.Ctx.Put("volume", r_volume.FindString(text_noisy)) diff --git a/crawler/shop_rumundco.go b/crawler/shop_rumundco.go index e1516ba..25b89bd 100644 --- a/crawler/shop_rumundco.go +++ b/crawler/shop_rumundco.go @@ -95,7 +95,7 @@ func ScrapeRumundCo(shop Shop) []Angebot { W.Volume = get_volume(e) if W.Volume == 0 { - PrintlnOffer(W, "Rum & Co: No Volume found") + DebugOffer(W, "Rum & Co: Volume is zero") return } @@ -107,6 +107,10 @@ func ScrapeRumundCo(shop Shop) []Angebot { Fatal(err, "Rum & Co: Base price: Extracting ABV failed") } } + if W.Abv == 0 { + DebugOffer(W, "Rum & Co: Abv is zero") + return + } W.Shop = shop.Id W.Spirit_type = "Whisky" diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go index e2bb6b9..e082ad1 100644 --- a/crawler/shop_whic.go +++ b/crawler/shop_whic.go @@ -1,7 +1,6 @@ package main import ( - log "github.com/Sirupsen/logrus" "regexp" "strings" @@ -37,13 +36,13 @@ func ScrapeWhic(shop Shop) []Angebot { e.ForEach(".old-price", func(i int, e *colly.HTMLElement) { W.Original_price, err = convert_price(e.ChildText(".price")) if err != nil { - log.Fatal(err) + Fatal(err, "Whic: Converting original price failed") } }) e.ForEach(".special-price", func(i int, e 
*colly.HTMLElement) { W.Discounted_price, err = convert_price(e.ChildText(".price")) if err != nil { - log.Fatal(err) + Fatal(err, "Whic: Converting discounted price failed") } }) }) @@ -51,7 +50,7 @@ func ScrapeWhic(shop Shop) []Angebot { base_price_noisy := e.ChildText(".base-price") W.Base_price, err = sanitize_base_price(base_price_noisy) if err != nil { - log.Fatal(err) + Fatal(err, "Whic: Sanitizing base price failed") } /* @@ -61,13 +60,21 @@ func ScrapeWhic(shop Shop) []Angebot { doc, err := goquery.NewDocumentFromReader(strings.NewReader(img_link_noisy)) if err != nil { - log.Fatal(err) + Fatal(err, "Whic: Parsing document in Goquery failed") } W.Image_url, _ = doc.Find("img").Attr("src") e.Request.Visit(W.Url) W.Volume = get_volume(e) + if W.Volume == 0 { + DebugOffer(W, "Whic: Volume is zero") + return + } W.Abv = get_abv(e) + if W.Abv == 0 { + DebugOffer(W, "Whic: Abv is zero") + return + } W.Shop = shop.Id W.Spirit_type = "Whisky" @@ -81,7 +88,7 @@ func ScrapeWhic(shop Shop) []Angebot { r_volume, err := regexp.Compile("Volumen: ([0-9]+([.,][0-9]+)) Liter") if err != nil { - log.Fatal(err) + Fatal(err, "Whic: Volumen regex failed") } volume := r_volume.FindStringSubmatch(text_noisy) if volume == nil || len(volume) < 2 { @@ -92,7 +99,7 @@ func ScrapeWhic(shop Shop) []Angebot { r_abv, err := regexp.Compile("Alkoholgehalt: ([0-9]+([.,][0-9]+))%") if err != nil { - log.Fatal(err) + Fatal(err, "Whic: Abv regex failed") } abv := r_abv.FindStringSubmatch(text_noisy) if abv == nil || len(abv) < 2 { diff --git a/crawler/shop_whiskyde.go b/crawler/shop_whiskyde.go index e635546..f13190b 100644 --- a/crawler/shop_whiskyde.go +++ b/crawler/shop_whiskyde.go @@ -1,7 +1,6 @@ package main import ( - log "github.com/Sirupsen/logrus" "strings" "github.com/gocolly/colly" @@ -34,13 +33,13 @@ func ScrapeWhiskyde(shop Shop) []Angebot { e.ForEach(".article-price-original", func(i int, e *colly.HTMLElement) { W.Original_price, err = convert_price(e.ChildText("del")) if err != 
nil { - log.Fatal(err) + Fatal(err, "Whisky.de: Converting original price failed") } }) e.ForEach(".article-price", func(i int, e *colly.HTMLElement) { W.Discounted_price, err = convert_price(e.ChildText(".article-price-default")) if err != nil { - log.Fatal(err) + Fatal(err, "Whisky.de: Converting discounted price failed") } }) @@ -61,19 +60,19 @@ func ScrapeWhiskyde(shop Shop) []Angebot { abv_noisy := strings.TrimSpace(strings.SplitAfter(text_noisy, "Liter")[1]) W.Volume, err = extract_volume(text_noisy) if err != nil { - log.Fatal(err) + Fatal(err, "Whisky.de: Extracting volume failed") } W.Abv, err = extract_abv(abv_noisy) if err != nil { - log.Fatal(err) + Fatal(err, "Whisky.de: Extracting abv failed") } if W.Volume == 0 { - log.Println("Whisky.de: " + W.Name + " kein Volume erkannt") + DebugOffer(W, "Whisky.de: Volume is zero") return } if W.Abv == 0 { - log.Println("Whisky.de: " + W.Name + " kein Abv erkannt") + DebugOffer(W, "Whisky.de: Abv is zero") return } @@ -82,7 +81,7 @@ func ScrapeWhiskyde(shop Shop) []Angebot { W.Base_price, err = convert_price(e.ChildText(".article-unitprice-default")) if err != nil { - log.Fatal(err) + Fatal(err, "Whisky.de: Converting base price failed") } e.Request.Visit(W.Url) diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go index 9b968a3..5235d3c 100644 --- a/crawler/shop_whiskyworld.go +++ b/crawler/shop_whiskyworld.go @@ -1,7 +1,6 @@ package main import ( - log "github.com/Sirupsen/logrus" "strings" "github.com/gocolly/colly" @@ -43,15 +42,13 @@ func ScrapeWhiskyworld(shop Shop) []Angebot { W.Original_price, err = convert_price(regular_price) if err != nil { - log.Println("Whisky World: Original_price failed: " + regular_price + " // " + W.Name + " // " + W.Url + " // " + e.Request.URL.String()) - log.Fatal(err) + Fatal(err, "Whiskyworld: Converting original price failed") return } W.Discounted_price, err = convert_price(e.ChildText(".uvp")) if err != nil { - log.Println("Whisky World: Discounted_price 
failed") - log.Fatal(err) + Fatal(err, "Whiskyworld: Converting discounted price failed") return } @@ -59,13 +56,13 @@ func ScrapeWhiskyworld(shop Shop) []Angebot { text_noisy := e.ChildText(".item-inh") W.Volume, err = extract_volume(text_noisy) if err != nil { - log.Fatal(err) + Fatal(err, "Whiskyworld: Extracting volume failed") } abv_noisy := strings.TrimSpace(strings.SplitAfter(text_noisy, "Liter")[1]) abv_noisy = strings.TrimPrefix(abv_noisy, "/") W.Abv, err = extract_abv(abv_noisy) if err != nil { - log.Fatal(err) + Fatal(err, "Whiskyworld: Extracting abv failed") } }) @@ -75,8 +72,7 @@ func ScrapeWhiskyworld(shop Shop) []Angebot { base_price_noisy = strings.TrimSpace(strings.SplitAfter(base_price_noisy, "Liter")[0]) W.Base_price, err = sanitize_base_price(base_price_noisy) if err != nil { - log.Println("Whisky World: Base_price failed") - log.Fatal(err) + Fatal(err, "Whiskyworld: Sanitizing base price failed") } } @@ -87,6 +83,15 @@ func ScrapeWhiskyworld(shop Shop) []Angebot { W.Shop = shop.Id W.Spirit_type = "Whisky" + if W.Volume == 0 { + DebugOffer(W, "Whiskyworld: Volume is zero") + return + } + if W.Abv == 0 { + DebugOffer(W, "Whiskyworld: Abv is zero") + return + } + e.Request.Visit(W.Url) W.Website = e.Request.Ctx.Get("website") diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go index c07c14f..8d86b8a 100644 --- a/crawler/shop_whiskyzone.go +++ b/crawler/shop_whiskyzone.go @@ -56,6 +56,15 @@ func ScrapeWhiskyzone(shop Shop) []Angebot { W.Volume = get_volume(e) W.Abv = get_abv(e) + if W.Volume == 0 { + DebugOffer(W, "Whiskyzone: Volume is zero") + return + } + if W.Abv == 0 { + DebugOffer(W, "Whiskyzone: Abv is zero") + return + } + base_price := e.Request.Ctx.Get("base_price") if base_price == "same_as_discounted_price" { W.Base_price = W.Discounted_price diff --git a/crawler/utility.go b/crawler/utility.go index 98587c9..3c587b9 100644 --- a/crawler/utility.go +++ b/crawler/utility.go @@ -138,17 +138,15 @@ func get_volume(e 
*colly.HTMLElement) float32 { matched, err := regexp.MatchString(`[lL](iter)?`, volume_noisy) if err != nil { Fatal(err, "Get volume regex failed") - log.Fatal(err) } if !matched { - log.Println("get_volume: not matched: " + volume_noisy) + log.Debug("get_volume: not matched: " + volume_noisy) return 0 } volume, err := extract_volume(volume_noisy) if err != nil { - log.Println("get_volume: " + volume_noisy) - Fatal(err, "Get Volume: Extract Volume failed") + Fatal(err, "Get Volume: Extract Volume failed: "+volume_noisy) } return volume @@ -168,8 +166,7 @@ func get_abv(e *colly.HTMLElement) float32 { abv, err := extract_abv(abv_noisy) if err != nil { - log.Println("get_abv: " + abv_noisy) - Fatal(err, "Get ABV: Extract ABV failed") + Fatal(err, "Get ABV: Extract ABV failed: "+abv_noisy) } return abv @@ -188,7 +185,6 @@ func get_base_price(e *colly.HTMLElement) int { base_price, err := sanitize_base_price(base_price_noisy) if err != nil { - log.Println("get_base_price: " + base_price_noisy) Fatal(err, "Get base price: sanitize base price failed") } |
