summary | refs | log | tree | commit | diff
path: root/crawler/sanitize.go
diff options
context:
space:
mode:
author horus 2018-06-18 15:54:53 +0200
committer horus 2018-06-18 15:54:53 +0200
commit 01e0cbe79f37b4be2fc82d31c71042b5ce4d699a (patch)
tree bb179b5c5c6349a69853c3781236b6056b7e7ea6 /crawler/sanitize.go
parent 88a2628258eb5ea79736338637ab8b5b83680c92 (diff)
parent 8114b7b17b723a5fe0fee24470e255faf587332e (diff)
download alkobote-01e0cbe79f37b4be2fc82d31c71042b5ce4d699a.tar.gz
Merge branch 'master' of /home/horus/app/fk_angebote
Diffstat (limited to 'crawler/sanitize.go')
-rw-r--r-- crawler/sanitize.go 65
1 files changed, 50 insertions, 15 deletions
diff --git a/crawler/sanitize.go b/crawler/sanitize.go
index 2fef9a4..d67b32b 100644
--- a/crawler/sanitize.go
+++ b/crawler/sanitize.go
@@ -2,6 +2,7 @@ package main
import (
"fmt"
+ "net/http"
"regexp"
"strconv"
"strings"
@@ -49,6 +50,13 @@ func sanitize_offer(angebote []Angebot, shop Shop, try int) []Angebot {
continue
}
+ if err := sanitize_image_url(offer.Image_url); err != nil {
+ offer.error_ctx = offer.Image_url
+ offer.error_msg = err.Error()
+ WarnOffer(offer, "Sanitizer: Image-URL is not valid")
+ continue
+ }
+
//offer.Website = ""
W = append(W, offer)
@@ -74,6 +82,10 @@ func sanitize_name(name string) string {
name = strings.Replace(name, "years old", "Jahre", 1)
}
+ if strings.Contains(name, "years") {
+ name = strings.Replace(name, "years", "Jahre", 1)
+ }
+
if strings.Contains(name, "Years Old") {
name = strings.Replace(name, "Years Old", "Jahre", 1)
}
@@ -100,7 +112,7 @@ func sanitize_name(name string) string {
name = strings.Replace(name, age_noisy, age+" Jahre ", 1)
}
- r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`)
+ r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[cC]?[lL]((iter)|(tr))?`)
if err != nil {
Fatal(err, "sanitize_name: Liter-Regexp failed")
}
@@ -193,34 +205,38 @@ func sanitize_base_price(price_noisy string) (price int, err error) {
if strings.Contains(price_noisy, "Grundpreis:") {
price_noisy = strings.Replace(price_noisy, "Grundpreis", "", -1)
- price_noisy = strings.TrimSpace(price_noisy)
}
if strings.Contains(price_noisy, "/Liter") {
price_noisy = strings.Replace(price_noisy, "/Liter", "", -1)
- price_noisy = strings.TrimSpace(price_noisy)
}
+ if strings.Contains(price_noisy, "/L") {
+ price_noisy = strings.Replace(price_noisy, "/L", "", -1)
+ }
+ price_noisy = strings.TrimSpace(price_noisy)
return convert_price(price_noisy)
}
func _check_abv_for_spirit_type(offer Angebot) bool {
- if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") {
- WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type)
- return false
- }
+ /*
+ if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") {
+ WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type)
+ return false
+ }
- if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") {
- WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type)
- return false
- }
+ if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") {
+ WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type)
+ return false
+ }
- if offer.Abv < 14 && offer.Spirit_type == "Likör" {
- WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type)
- return false
+ if offer.Abv < 14 && offer.Spirit_type == "Likör" {
+ WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type)
+ return false
- }
+ }
+ */
if offer.Abv == 0 {
WarnOffer(offer, "Sanitizer: Abv is zero")
@@ -254,3 +270,22 @@ func get_age_from_name(name string) int {
}
return age
}
+
+// sanitize_image_url sends an HTTP HEAD request to url and returns an error unless the
+// response has status 200 and a Content-Type starting with "image"; nil means it passed.
+func sanitize_image_url(url string) error {
+	log.Debugf("sanitize_image_url: Making HEAD request to %s \n", url)
+	resp, err := http.Head(url)
+	if err != nil {
+		return fmt.Errorf("sanitize_image_url: HEAD request failed. Got error: %s \n", err.Error())
+	}
+	defer resp.Body.Close() // net/http requires the caller to close the body, even for HEAD
+
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("sanitize_image_url: HEAD request failed. StatusCode not 200, got %d \n", resp.StatusCode)
+	}
+	if !strings.HasPrefix(resp.Header.Get("Content-Type"), "image") {
+		return fmt.Errorf("sanitize_image_url: HEAD request failed. Got no image, content-type is %s \n", resp.Header.Get("Content-Type"))
+	}
+	return nil
+}