diff options
| author | horus | 2018-06-18 15:54:53 +0200 |
|---|---|---|
| committer | horus | 2018-06-18 15:54:53 +0200 |
| commit | 01e0cbe79f37b4be2fc82d31c71042b5ce4d699a (patch) | |
| tree | bb179b5c5c6349a69853c3781236b6056b7e7ea6 /crawler/sanitize.go | |
| parent | 88a2628258eb5ea79736338637ab8b5b83680c92 (diff) | |
| parent | 8114b7b17b723a5fe0fee24470e255faf587332e (diff) | |
| download | alkobote-01e0cbe79f37b4be2fc82d31c71042b5ce4d699a.tar.gz | |
Merge branch 'master' of /home/horus/app/fk_angebote
Diffstat (limited to 'crawler/sanitize.go')
| -rw-r--r-- | crawler/sanitize.go | 65 |
1 file changed, 50 insertions, 15 deletions
```diff
diff --git a/crawler/sanitize.go b/crawler/sanitize.go
index 2fef9a4..d67b32b 100644
--- a/crawler/sanitize.go
+++ b/crawler/sanitize.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"fmt"
+	"net/http"
 	"regexp"
 	"strconv"
 	"strings"
@@ -49,6 +50,13 @@ func sanitize_offer(angebote []Angebot, shop Shop, try int) []Angebot {
 			continue
 		}
 
+		if err := sanitize_image_url(offer.Image_url); err != nil {
+			offer.error_ctx = offer.Image_url
+			offer.error_msg = err.Error()
+			WarnOffer(offer, "Sanitizer: Image-URL is not valid")
+			continue
+		}
+
 		//offer.Website = ""
 
 		W = append(W, offer)
@@ -74,6 +82,10 @@ func sanitize_name(name string) string {
 		name = strings.Replace(name, "years old", "Jahre", 1)
 	}
 
+	if strings.Contains(name, "years") {
+		name = strings.Replace(name, "years", "Jahre", 1)
+	}
+
 	if strings.Contains(name, "Years Old") {
 		name = strings.Replace(name, "Years Old", "Jahre", 1)
 	}
@@ -100,7 +112,7 @@ func sanitize_name(name string) string {
 		name = strings.Replace(name, age_noisy, age+" Jahre ", 1)
 	}
 
-	r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`)
+	r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[cC]?[lL]((iter)|(tr))?`)
 	if err != nil {
 		Fatal(err, "sanitize_name: Liter-Regexp failed")
 	}
@@ -193,34 +205,38 @@ func sanitize_base_price(price_noisy string) (price int, err error) {
 
 	if strings.Contains(price_noisy, "Grundpreis:") {
 		price_noisy = strings.Replace(price_noisy, "Grundpreis", "", -1)
-		price_noisy = strings.TrimSpace(price_noisy)
 	}
 
 	if strings.Contains(price_noisy, "/Liter") {
 		price_noisy = strings.Replace(price_noisy, "/Liter", "", -1)
-		price_noisy = strings.TrimSpace(price_noisy)
 	}
+	if strings.Contains(price_noisy, "/L") {
+		price_noisy = strings.Replace(price_noisy, "/L", "", -1)
+	}
+	price_noisy = strings.TrimSpace(price_noisy)
 
 	return convert_price(price_noisy)
 }
 
 func _check_abv_for_spirit_type(offer Angebot) bool {
 
-	if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") {
-		WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type)
-		return false
-	}
+	/*
+		if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") {
+			WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type)
+			return false
+		}
 
-	if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") {
-		WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type)
-		return false
-	}
+		if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") {
+			WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type)
+			return false
+		}
 
-	if offer.Abv < 14 && offer.Spirit_type == "Likör" {
-		WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type)
-		return false
+		if offer.Abv < 14 && offer.Spirit_type == "Likör" {
+			WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type)
+			return false
 
-	}
+		}
+	*/
 
 	if offer.Abv == 0 {
 		WarnOffer(offer, "Sanitizer: Abv is zero")
@@ -254,3 +270,22 @@ func get_age_from_name(name string) int {
 	}
 	return age
 }
+
+func sanitize_image_url(url string) error {
+
+	log.Debugf("sanitize_image_url: Making HEAD request to %s \n", url)
+	resp, err := http.Head(url)
+	if err != nil {
+		return fmt.Errorf("sanitize_image_url: HEAD request failed. Got error: %s \n", err.Error())
+	}
+
+	if resp.StatusCode != 200 {
+		return fmt.Errorf("sanitize_image_url: HEAD request failed. StatusCode not 200, got %d \n", resp.StatusCode)
+	}
+
+	if !strings.HasPrefix(resp.Header.Get("Content-Type"), "image") {
+		return fmt.Errorf("sanitize_image_url: HEAD request failed. Got no image, content-type is %s \n", resp.Header.Get("Content-Type"))
+	}
+
+	return nil
+}
```
