From 14a89c34096d2ddb2a7750feda143207110e838b Mon Sep 17 00:00:00 2001 From: horus_arch Date: Wed, 7 Feb 2018 19:09:22 +0100 Subject: Adds sanitizer. --- crawler/convert_price.go | 103 +++++++++++++++++++++++++++++++++++++++++++ crawler/sanitize.go | 30 +++++++++++++ crawler/sanitize_name.go | 13 ------ crawler/sanitize_price.go | 103 ------------------------------------------- crawler/scrape.go | 2 + crawler/shop_bottleworld.go | 4 +- crawler/shop_mcwhisky.go | 6 +-- crawler/shop_rumundco.go | 4 +- crawler/shop_whic.go | 4 +- crawler/shop_whiskyde.go | 4 +- crawler/shop_whiskysitenl.go | 4 +- crawler/shop_whiskyworld.go | 4 +- crawler/shop_whiskyzone.go | 4 +- 13 files changed, 152 insertions(+), 133 deletions(-) create mode 100644 crawler/convert_price.go create mode 100644 crawler/sanitize.go delete mode 100644 crawler/sanitize_name.go delete mode 100644 crawler/sanitize_price.go (limited to 'crawler') diff --git a/crawler/convert_price.go b/crawler/convert_price.go new file mode 100644 index 0000000..54386d6 --- /dev/null +++ b/crawler/convert_price.go @@ -0,0 +1,103 @@ +package main + +import ( + "errors" + "strconv" + "strings" +) + +func convert_price(price string) (int, error) { + if "" == price { + return 0, errors.New("Empty string") + } + + multiply_by_10 := false + multiply_by_100 := true + + price = strings.TrimSpace(price) + + price = strings.TrimPrefix(price, "€") + price = strings.TrimSpace(price) + + price = strings.TrimSuffix(price, "€") + price = strings.TrimSpace(price) + + price = strings.TrimSuffix(strings.ToLower(price), "eur") + price = strings.TrimSpace(price) + + price = strings.TrimSuffix(strings.ToLower(price), "euro") + price = strings.TrimSpace(price) + + if len(price) < 2 { + price = "0" + price + } else if len(price) < 3 { + price = "00" + price + } + + c := string(price[len(price)-2:]) + c = string(c[0:1]) + + /* + Extracts the second last char and checks if it's a "." or a ",". + */ + if "," == c { + if strings.Count(price, ",") > 1 { + return 0, errors.New("Invalid format") + } + + multiply_by_10 = true + multiply_by_100 = false + + } else if "." == c { + if strings.Count(price, ".") > 1 { + return 0, errors.New("Invalid format") + } + + multiply_by_10 = true + multiply_by_100 = false + + } + + c = string(price[len(price)-3:]) + c = string(c[0:1]) + + /* + Extracts the third last char and checks if it's a "." or a ",". + */ + if "," == c { + if strings.Count(price, ",") > 1 { + return 0, errors.New("Invalid format") + } + + multiply_by_10 = false + multiply_by_100 = false + + } else if "." == c { + if strings.Count(price, ".") > 1 { + return 0, errors.New("Invalid format") + } + + multiply_by_10 = false + multiply_by_100 = false + + } + + price = strings.Replace(price, ",", "", -1) + price = strings.Replace(price, ".", "", -1) + + /* + Casts the price to integer in cents (not euro!). + */ + price_int, err := strconv.Atoi(price) + if err != nil { + return 0, err + } + + if multiply_by_10 { + price_int = price_int * 10 + } else if multiply_by_100 { + price_int = price_int * 100 + } + + return price_int, nil +} diff --git a/crawler/sanitize.go b/crawler/sanitize.go new file mode 100644 index 0000000..ddcd4f6 --- /dev/null +++ b/crawler/sanitize.go @@ -0,0 +1,30 @@ +package main + +import ( + "log" + "regexp" + "strings" +) + +func sanitize_offer(angebote []Angebot) []Angebot { + + for _, offer := range angebote { + offer.Name = sanitize_name(offer.Name) + } + + return angebote +} + +func sanitize_name(name string) string { + if strings.Contains(name, "y.o.") { + name = strings.Replace(name, "y.o.", "Jahre", 1) + } + r_liter, err := regexp.Compile("[0-9]+([,.][0-9](([lL])| ([Ll]iter))?") + if err != nil { + log.Fatal(err) + } + name_liter := r_liter.FindString(name) + name = strings.Replace(name, name_liter, "", 1) + + return name +} diff --git a/crawler/sanitize_name.go b/crawler/sanitize_name.go deleted file mode 100644 index 73b2714..0000000 --- a/crawler/sanitize_name.go +++ /dev/null @@ -1,13 +0,0 @@ -package main - -import ( - "strings" -) - -func sanitize_name(name string) string { - if strings.Contains(name, "y.o.") { - name = strings.Replace(name, "y.o.", "Jahre", 1) - } - - return name -} diff --git a/crawler/sanitize_price.go b/crawler/sanitize_price.go deleted file mode 100644 index 2052842..0000000 --- a/crawler/sanitize_price.go +++ /dev/null @@ -1,103 +0,0 @@ -package main - -import ( - "errors" - "strconv" - "strings" -) - -func sanitize_price(price string) (int, error) { - if "" == price { - return 0, errors.New("Empty string") - } - - multiply_by_10 := false - multiply_by_100 := true - - price = strings.TrimSpace(price) - - price = strings.TrimPrefix(price, "€") - price = strings.TrimSpace(price) - - price = strings.TrimSuffix(price, "€") - price = strings.TrimSpace(price) - - price = strings.TrimSuffix(strings.ToLower(price), "eur") - price = strings.TrimSpace(price) - - price = strings.TrimSuffix(strings.ToLower(price), "euro") - price = strings.TrimSpace(price) - - if len(price) < 2 { - price = "0" + price - } else if len(price) < 3 { - price = "00" + price - } - - c := string(price[len(price)-2:]) - c = string(c[0:1]) - - /* - Extracts the second last char and checks if it's a "." or a ",". - */ - if "," == c { - if strings.Count(price, ",") > 1 { - return 0, errors.New("Invalid format") - } - - multiply_by_10 = true - multiply_by_100 = false - - } else if "." == c { - if strings.Count(price, ".") > 1 { - return 0, errors.New("Invalid format") - } - - multiply_by_10 = true - multiply_by_100 = false - - } - - c = string(price[len(price)-3:]) - c = string(c[0:1]) - - /* - Extracts the third last char and checks if it's a "." or a ",". - */ - if "," == c { - if strings.Count(price, ",") > 1 { - return 0, errors.New("Invalid format") - } - - multiply_by_10 = false - multiply_by_100 = false - - } else if "." == c { - if strings.Count(price, ".") > 1 { - return 0, errors.New("Invalid format") - } - - multiply_by_10 = false - multiply_by_100 = false - - } - - price = strings.Replace(price, ",", "", -1) - price = strings.Replace(price, ".", "", -1) - - /* - Casts the price to integer in cents (not euro!). - */ - price_int, err := strconv.Atoi(price) - if err != nil { - return 0, err - } - - if multiply_by_10 { - price_int = price_int * 10 - } else if multiply_by_100 { - price_int = price_int * 100 - } - - return price_int, nil -} diff --git a/crawler/scrape.go b/crawler/scrape.go index 0595240..31b3618 100644 --- a/crawler/scrape.go +++ b/crawler/scrape.go @@ -17,6 +17,8 @@ func (app *App) ScrapeHTML(shops []Shop) { W = ScrapeShop(shop) + W = sanitize_offer(W) + err = app.save_offer(W) if err != nil { log.Fatal(err) diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go index 3a3c631..b6af7e0 100644 --- a/crawler/shop_bottleworld.go +++ b/crawler/shop_bottleworld.go @@ -39,13 +39,13 @@ func ScrapeBottleWord(shop Shop) []Angebot { e.ForEach(".price-box", func(i int, e *colly.HTMLElement) { e.ForEach(".old-price", func(i int, e *colly.HTMLElement) { - W.Original_price, err = sanitize_price(e.ChildText(".price")) + W.Original_price, err = convert_price(e.ChildText(".price")) if err != nil { log.Fatal(err) } }) e.ForEach(".special-price", func(i int, e *colly.HTMLElement) { - W.Discounted_price, err = sanitize_price(e.ChildText(".price")) + W.Discounted_price, err = convert_price(e.ChildText(".price")) if err != nil { log.Fatal(err) } diff --git a/crawler/shop_mcwhisky.go b/crawler/shop_mcwhisky.go index e45e740..e0c1ab8 100644 --- a/crawler/shop_mcwhisky.go +++ b/crawler/shop_mcwhisky.go @@ -19,7 +19,7 @@ func ScrapeMCWhisky(shop Shop) []Angebot { c.OnHTML("li.item", func(e *colly.HTMLElement) { W := Angebot{} - whisky_name := sanitize_name(e.ChildAttr("a", "title")) + whisky_name := e.ChildAttr("a", "title") whisky_url := e.ChildAttr("a", "href") W.Name = whisky_name W.Url = whisky_url @@ -28,13 +28,13 @@ func ScrapeMCWhisky(shop Shop) []Angebot { e.ForEach(".price-box", func(i int, e *colly.HTMLElement) { e.ForEach(".old-price", func(i int, e *colly.HTMLElement) { - W.Original_price, err = sanitize_price(e.ChildText(".price")) + W.Original_price, err = convert_price(e.ChildText(".price")) if err != nil { log.Fatal(err) } }) e.ForEach(".special-price", func(i int, e *colly.HTMLElement) { - W.Discounted_price, err = sanitize_price(e.ChildText(".price")) + W.Discounted_price, err = convert_price(e.ChildText(".price")) if err != nil { log.Fatal(err) } diff --git a/crawler/shop_rumundco.go b/crawler/shop_rumundco.go index ae349f3..d62250e 100644 --- a/crawler/shop_rumundco.go +++ b/crawler/shop_rumundco.go @@ -39,11 +39,11 @@ func ScrapeRumundCo(shop Shop) []Angebot { if "" == regular_price { return } - W.Original_price, err = sanitize_price(regular_price) + W.Original_price, err = convert_price(regular_price) if err != nil { log.Fatal(err) } - W.Discounted_price, err = sanitize_price(e.ChildText(".price-value")) + W.Discounted_price, err = convert_price(e.ChildText(".price-value")) if err != nil { log.Fatal(err) } diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go index 896b1fb..e489161 100644 --- a/crawler/shop_whic.go +++ b/crawler/shop_whic.go @@ -28,13 +28,13 @@ func ScrapeWhic(shop Shop) []Angebot { e.ForEach(".price-box", func(i int, e *colly.HTMLElement) { e.ForEach(".old-price", func(i int, e *colly.HTMLElement) { - W.Original_price, err = sanitize_price(e.ChildText(".price")) + W.Original_price, err = convert_price(e.ChildText(".price")) if err != nil { log.Fatal(err) } }) e.ForEach(".special-price", func(i int, e *colly.HTMLElement) { - W.Discounted_price, err = sanitize_price(e.ChildText(".price")) + W.Discounted_price, err = convert_price(e.ChildText(".price")) if err != nil { log.Fatal(err) } diff --git a/crawler/shop_whiskyde.go b/crawler/shop_whiskyde.go index 657bfe0..d90e0cc 100644 --- a/crawler/shop_whiskyde.go +++ b/crawler/shop_whiskyde.go @@ -26,13 +26,13 @@ func ScrapeWhiskyde(shop Shop) []Angebot { var err error e.ForEach(".article-price-original", func(i int, e *colly.HTMLElement) { - W.Original_price, err = sanitize_price(e.ChildText("del")) + W.Original_price, err = convert_price(e.ChildText("del")) if err != nil { log.Fatal(err) } }) e.ForEach(".article-price", func(i int, e *colly.HTMLElement) { - W.Discounted_price, err = sanitize_price(e.ChildText(".article-price-default")) + W.Discounted_price, err = convert_price(e.ChildText(".article-price-default")) if err != nil { log.Fatal(err) } diff --git a/crawler/shop_whiskysitenl.go b/crawler/shop_whiskysitenl.go index c8b35a2..656cf18 100644 --- a/crawler/shop_whiskysitenl.go +++ b/crawler/shop_whiskysitenl.go @@ -34,12 +34,12 @@ func ScrapeWhiskysitenl(shop Shop) []Angebot { } discounted_price := r.FindString(strings.Trim(strings.TrimPrefix(price_discount_noisy, regular_price), "")) - W.Original_price, err = sanitize_price(regular_price) + W.Original_price, err = convert_price(regular_price) if err != nil { //log.Println(W.Name, err) return } - W.Discounted_price, err = sanitize_price(discounted_price) + W.Discounted_price, err = convert_price(discounted_price) if err != nil { //log.Println(W.Name, err) return diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go index e07c42f..36b144e 100644 --- a/crawler/shop_whiskyworld.go +++ b/crawler/shop_whiskyworld.go @@ -32,13 +32,13 @@ func ScrapeWhiskyworld(shop Shop) []Angebot { var err error - W.Original_price, err = sanitize_price(regular_price) + W.Original_price, err = convert_price(regular_price) if err != nil { log.Fatal(err) return } - W.Discounted_price, err = sanitize_price(e.ChildText(".uvp")) + W.Discounted_price, err = convert_price(e.ChildText(".uvp")) if err != nil { log.Fatal(err) return diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go index a9e73d0..3303b5e 100644 --- a/crawler/shop_whiskyzone.go +++ b/crawler/shop_whiskyzone.go @@ -29,12 +29,12 @@ func ScrapeWhiskyzone(shop Shop) []Angebot { if err != nil { log.Fatal(err) } - W.Discounted_price, err = sanitize_price(r.FindString(price_discount_noisy)) + W.Discounted_price, err = convert_price(r.FindString(price_discount_noisy)) if err != nil { log.Fatal(err) return } - W.Original_price, err = sanitize_price(r.FindString(price_regular_noisy)) + W.Original_price, err = convert_price(r.FindString(price_regular_noisy)) if err != nil { log.Fatal(err) return -- cgit v1.2.3