From f37ebbb81785fb2c02f166b84581b9e92c829b2a Mon Sep 17 00:00:00 2001 From: horus Date: Fri, 15 Jun 2018 16:21:12 +0200 Subject: Tries to validate image url by making head request. (crawler) --- crawler/sanitize.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'crawler/sanitize.go') diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 2fef9a4..6370588 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -2,6 +2,7 @@ package main import ( "fmt" + "net/http" "regexp" "strconv" "strings" @@ -49,6 +50,13 @@ func sanitize_offer(angebote []Angebot, shop Shop, try int) []Angebot { continue } + if err := sanitize_image_url(offer.Image_url); err != nil { + offer.error_ctx = offer.Image_url + offer.error_msg = err.Error() + WarnOffer(offer, "Sanitizer: Image-URL is not valid") + continue + } + //offer.Website = "" W = append(W, offer) @@ -254,3 +262,21 @@ func get_age_from_name(name string) int { } return age } + +func sanitize_image_url(url string) error { + + resp, err := http.Head(url) + if err != nil { + return fmt.Errorf("sanitize_image_url: HEAD request failed. Got error: %s \n", err.Error()) + } + + if resp.StatusCode != 200 { + return fmt.Errorf("sanitize_image_url: HEAD request failed. StatusCode not 200, got %d \n", resp.StatusCode) + } + + if !strings.HasPrefix(resp.Header.Get("Content-Type"), "image") { + return fmt.Errorf("sanitize_image_url: HEAD request failed. Got no image, content-type is %s \n", resp.Header.Get("Content-Type")) + } + + return nil +} -- cgit v1.2.3 From b3b35a1706cd99e0978147a4d1b841381cf48348 Mon Sep 17 00:00:00 2001 From: Max Date: Fri, 15 Jun 2018 19:37:33 +0200 Subject: Improves debugging output. (crawler) --- crawler/sanitize.go | 1 + 1 file changed, 1 insertion(+) (limited to 'crawler/sanitize.go') diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 6370588..960d5f6 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -265,6 +265,7 @@ func get_age_from_name(name string) int { func sanitize_image_url(url string) error { + log.Debugf("sanitize_image_url: Making HEAD request to %s \n", url) resp, err := http.Head(url) if err != nil { return fmt.Errorf("sanitize_image_url: HEAD request failed. Got error: %s \n", err.Error()) -- cgit v1.2.3 From a25368ce25e3de3add81e4347639a9b0401750a7 Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 13:52:50 +0200 Subject: Improves sanitizing function. (crawler) --- crawler/sanitize.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'crawler/sanitize.go') diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 960d5f6..4a5197b 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -82,6 +82,10 @@ func sanitize_name(name string) string { name = strings.Replace(name, "years old", "Jahre", 1) } + if strings.Contains(name, "years") { + name = strings.Replace(name, "years", "Jahre", 1) + } + if strings.Contains(name, "Years Old") { name = strings.Replace(name, "Years Old", "Jahre", 1) } @@ -201,13 +205,15 @@ func sanitize_base_price(price_noisy string) (price int, err error) { if strings.Contains(price_noisy, "Grundpreis:") { price_noisy = strings.Replace(price_noisy, "Grundpreis", "", -1) - price_noisy = strings.TrimSpace(price_noisy) } if strings.Contains(price_noisy, "/Liter") { price_noisy = strings.Replace(price_noisy, "/Liter", "", -1) - price_noisy = strings.TrimSpace(price_noisy) } + if strings.Contains(price_noisy, "/L") { + price_noisy = strings.Replace(price_noisy, "/L", "", -1) + } + price_noisy = strings.TrimSpace(price_noisy) return convert_price(price_noisy) } -- cgit v1.2.3 From db6fa4428e8b6d6c7fd845463a93d83affbf880b Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 15:15:36 +0200 Subject: Detects cl in sanitize_name(). (crawler) --- crawler/sanitize.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'crawler/sanitize.go') diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 4a5197b..262bfa6 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -112,7 +112,7 @@ func sanitize_name(name string) string { name = strings.Replace(name, age_noisy, age+" Jahre ", 1) } - r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`) + r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[cC]?[lL]((iter)|(tr))?`) if err != nil { Fatal(err, "sanitize_name: Liter-Regexp failed") } -- cgit v1.2.3 From 0dedda30a0cb983c41f879e9fe0be53a79ba347c Mon Sep 17 00:00:00 2001 From: horus Date: Sat, 16 Jun 2018 16:02:59 +0200 Subject: Removes validating abv based of spirit type. (crawler) --- crawler/sanitize.go | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'crawler/sanitize.go') diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 262bfa6..d67b32b 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -220,21 +220,23 @@ func sanitize_base_price(price_noisy string) (price int, err error) { func _check_abv_for_spirit_type(offer Angebot) bool { - if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") { - WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type) - return false - } + /* + if offer.Abv < 40 && (offer.Spirit_type == "Whisky" || offer.Spirit_type == "Cognac") { + WarnOffer(offer, "Sanitizer: Abv below 40% for "+offer.Spirit_type) + return false + } - if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") { - WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type) - return false - } + if offer.Abv < 37.5 && (offer.Spirit_type == "Rum" || offer.Spirit_type == "Gin" || offer.Spirit_type == "Wodka" || offer.Spirit_type == "Grappa") { + WarnOffer(offer, "Sanitizer: Abv below 37,5% for "+offer.Spirit_type) + return false + } - if offer.Abv < 14 && offer.Spirit_type == "Likör" { - WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type) - return false + if offer.Abv < 14 && offer.Spirit_type == "Likör" { + WarnOffer(offer, "Sanitizer: Abv below 14% for "+offer.Spirit_type) + return false - } + } + */ if offer.Abv == 0 { WarnOffer(offer, "Sanitizer: Abv is zero") -- cgit v1.2.3