summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- crawler/convert_price.go 9
-rw-r--r-- crawler/database.go 10
-rw-r--r-- crawler/main.go 2
-rw-r--r-- crawler/sanitize.go 24
-rw-r--r-- crawler/shop_bottleworld.go 19
-rw-r--r-- crawler/shop_mcwhisky.go 41
-rw-r--r-- crawler/shop_rumundco.go 46
-rw-r--r-- crawler/shop_whic.go 45
-rw-r--r-- crawler/utility.go 49
9 files changed, 221 insertions, 24 deletions
diff --git a/crawler/convert_price.go b/crawler/convert_price.go
index d9725a0..a76c067 100644
--- a/crawler/convert_price.go
+++ b/crawler/convert_price.go
@@ -2,6 +2,8 @@ package main
import (
"errors"
+ "log"
+ "regexp"
"strconv"
"strings"
)
@@ -28,6 +30,12 @@ func convert_price(price string) (int, error) {
price = strings.TrimSuffix(strings.ToLower(price), "euro")
price = strings.TrimSpace(price)
+ r, err := regexp.Compile(`[0-9]+([.,][0-9]+)?`)
+ if err != nil {
+ return 0, err
+ }
+ price = r.FindString(price)
+
if len(price) < 2 {
price = "00" + price
} else if len(price) < 3 {
@@ -90,6 +98,7 @@ func convert_price(price string) (int, error) {
*/
price_int, err := strconv.Atoi(price)
if err != nil {
+ log.Println(price)
return 0, err
}
diff --git a/crawler/database.go b/crawler/database.go
index b1d4e5e..085f210 100644
--- a/crawler/database.go
+++ b/crawler/database.go
@@ -29,7 +29,7 @@ func (app *App) createTables() error {
volume FLOAT,
original_price INT,
discounted_price INT,
- price_per_litre INT,
+ base_price INT,
image_url TEXT,
spirit_type TEXT,
valid_until INT DEFAULT NULL,
@@ -43,7 +43,7 @@ func (app *App) createTables() error {
query3 := `CREATE OR REPLACE VIEW whisky_view AS
SELECT
- angebot.id, angebot.name, angebot.abv, angebot.volume, angebot.url,original_price, discounted_price, angebot.price_per_litre, image_url,
+ angebot.id, angebot.name, angebot.abv, angebot.volume, angebot.url,original_price, discounted_price, angebot.base_price, image_url,
shop.name as shop, shop.url as shop_url, (original_price/discounted_price) AS quotient
FROM angebot
JOIN shop ON angebot.shop = shop.id
@@ -56,7 +56,7 @@ func (app *App) createTables() error {
func (app *App) save_offer(W []Angebot) error {
- query := `INSERT INTO angebot (shop, name, url, abv, volume, original_price, discounted_price, price_per_litre, valid_until, image_url, spirit_type, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
+ query := `INSERT INTO angebot (shop, name, url, abv, volume, original_price, discounted_price, base_price, valid_until, image_url, spirit_type, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
stmt, err := app.DB.Prepare(query)
if err != nil {
@@ -80,9 +80,9 @@ func (app *App) save_offer(W []Angebot) error {
if err == sql.ErrNoRows {
if 0 == o.Valid_until {
- _, err = stmt.Exec(o.Shop, o.Name, o.Url, o.Abv, o.Volume, o.Original_price, o.Discounted_price, o.Price_per_litre, sql.NullInt64{}, o.Image_url, o.Spirit_type, app.Now)
+ _, err = stmt.Exec(o.Shop, o.Name, o.Url, o.Abv, o.Volume, o.Original_price, o.Discounted_price, o.Base_price, sql.NullInt64{}, o.Image_url, o.Spirit_type, app.Now)
} else {
- _, err = stmt.Exec(o.Shop, o.Name, o.Url, o.Abv, o.Volume, o.Original_price, o.Discounted_price, o.Price_per_litre, o.Valid_until, o.Image_url, o.Spirit_type, app.Now)
+ _, err = stmt.Exec(o.Shop, o.Name, o.Url, o.Abv, o.Volume, o.Original_price, o.Discounted_price, o.Base_price, o.Valid_until, o.Image_url, o.Spirit_type, app.Now)
}
if err != nil {
return err
diff --git a/crawler/main.go b/crawler/main.go
index a25feaf..8a3556a 100644
--- a/crawler/main.go
+++ b/crawler/main.go
@@ -31,7 +31,7 @@ type Angebot struct {
Url string
Original_price int
Discounted_price int
- Price_per_litre int
+ Base_price int
Image_url string
Spirit_type string
Valid_until int
diff --git a/crawler/sanitize.go b/crawler/sanitize.go
index fc4ee81..a40745a 100644
--- a/crawler/sanitize.go
+++ b/crawler/sanitize.go
@@ -92,10 +92,32 @@ func sanitize_name(name string) string {
return name
}
-func sanitize_price_per(price_noisy string) (price int, err error) {
+func sanitize_base_price(price_noisy string) (price int, err error) {
if strings.Contains(price_noisy, "Preis pro Liter") {
price_noisy = strings.Replace(price_noisy, "Preis pro Liter", "", -1)
}
+ if strings.Contains(price_noisy, " pro 1 l") {
+ price_noisy = strings.Replace(price_noisy, " pro 1 l", "", -1)
+ }
+
+ if strings.Contains(price_noisy, " pro 1 stück") {
+ price_noisy = strings.Replace(price_noisy, " pro 1 stück", "", -1)
+ }
+
+ if strings.Contains(price_noisy, " pro 1 Stück") {
+ price_noisy = strings.Replace(price_noisy, " pro 1 Stück", "", -1)
+ }
+
+ if strings.Contains(price_noisy, "Grundpreis:") {
+ price_noisy = strings.Replace(price_noisy, "Grundpreis", "", -1)
+ price_noisy = strings.TrimSpace(price_noisy)
+ }
+
+ if strings.Contains(price_noisy, "/Liter") {
+ price_noisy = strings.Replace(price_noisy, "/Liter", "", -1)
+ price_noisy = strings.TrimSpace(price_noisy)
+ }
+
return convert_price(price_noisy)
}
diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go
index a3eae35..fdf1cd8 100644
--- a/crawler/shop_bottleworld.go
+++ b/crawler/shop_bottleworld.go
@@ -52,11 +52,11 @@ func ScrapeBottleWord(shop Shop) []Angebot {
})
price_per_litre_noisy := e.ChildText(".price-per-liter")
- price_per_litre, err := sanitize_price_per(price_per_litre_noisy)
+ price_per_litre, err := sanitize_base_price(price_per_litre_noisy)
if err != nil {
log.Fatal(err)
}
- W.Price_per_litre = price_per_litre
+ W.Base_price = price_per_litre
W.Image_url = e.ChildAttr("img", "src")
@@ -64,17 +64,8 @@ func ScrapeBottleWord(shop Shop) []Angebot {
W.Shop = shop.Id
- volume_noisy := e.Request.Ctx.Get("volume")
- W.Volume, err = extract_volume(volume_noisy)
- if err != nil {
- log.Fatal(err)
- }
-
- abv_noisy := e.Request.Ctx.Get("abv")
- W.Abv, err = extract_abv(abv_noisy)
- if err != nil {
- log.Fatal(err)
- }
+ W.Volume = get_volume(e)
+ W.Abv = get_abv(e)
Whiskys = append(Whiskys, W)
})
@@ -82,7 +73,7 @@ func ScrapeBottleWord(shop Shop) []Angebot {
c.OnHTML("#product-attribute-specs-table", func(e *colly.HTMLElement) {
e.ForEach("tr", func(i int, e *colly.HTMLElement) {
td_str := e.ChildText("td")
- matched, err := regexp.MatchString("[0-9]+([,.][0-9]+)? l$", td_str)
+ matched, err := regexp.MatchString("^[0-9]+([,.][0-9]+)? l$", td_str)
if err != nil {
log.Fatal(err)
}
diff --git a/crawler/shop_mcwhisky.go b/crawler/shop_mcwhisky.go
index e0c1ab8..b44e892 100644
--- a/crawler/shop_mcwhisky.go
+++ b/crawler/shop_mcwhisky.go
@@ -2,6 +2,7 @@ package main
import (
"log"
+ "regexp"
// "strings"
// "github.com/PuerkitoBio/goquery"
@@ -9,6 +10,8 @@ import (
)
func ScrapeMCWhisky(shop Shop) []Angebot {
+ Shop_url := "https://www.mcwhisky.com/whisky/whisky-sonderangebote.html"
+
Whiskys := []Angebot{}
c := colly.NewCollector(
@@ -17,6 +20,11 @@ func ScrapeMCWhisky(shop Shop) []Angebot {
)
c.OnHTML("li.item", func(e *colly.HTMLElement) {
+
+ if e.Request.URL.String() != Shop_url {
+ return
+ }
+
W := Angebot{}
whisky_name := e.ChildAttr("a", "title")
@@ -40,15 +48,46 @@ func ScrapeMCWhisky(shop Shop) []Angebot {
}
})
})
+
+ price_per_litre_noisy := e.ChildText(".price-box-extended-info-ppl")
+ W.Base_price, err = sanitize_base_price(price_per_litre_noisy)
+ if err != nil {
+ log.Fatal(err)
+ }
+
W.Image_url = e.ChildAttr("img", "src")
+ e.Request.Visit(W.Url)
+
+ W.Volume = get_volume(e)
+ W.Abv = get_abv(e)
+
W.Shop = shop.Id
W.Spirit_type = "Whisky"
Whiskys = append(Whiskys, W)
})
- c.Visit("https://www.mcwhisky.com/whisky/whisky-sonderangebote.html")
+ c.OnHTML(".products-attributes-alcohol", func(e *colly.HTMLElement) {
+ text_noisy := e.Text
+
+ r_abv, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?%`)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ e.Request.Ctx.Put("abv", r_abv.FindString(text_noisy))
+
+ r_volume, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?Liter$`)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ e.Request.Ctx.Put("volume", r_volume.FindString(text_noisy))
+
+ })
+
+ c.Visit(Shop_url)
return Whiskys
}
diff --git a/crawler/shop_rumundco.go b/crawler/shop_rumundco.go
index d62250e..ef98e90 100644
--- a/crawler/shop_rumundco.go
+++ b/crawler/shop_rumundco.go
@@ -10,6 +10,9 @@ import (
)
func ScrapeRumundCo(shop Shop) []Angebot {
+
+ Shop_url := "https://www.rumundco.de/navi.php?q=4&kf=29&kk-suesse-von=0&kk-suesse-bis=100&kk-milde-von=0&kk-milde-bis=100&kk-wuerze-von=0&kk-wuerze-bis=100&kk-frucht-von=0&kk-frucht-bis=100&kk-torf-von=0&kk-torf-bis=100&hf=0&af=90&Sortierung=11&a=350"
+
Whiskys := []Angebot{}
c := colly.NewCollector(
@@ -18,6 +21,11 @@ func ScrapeRumundCo(shop Shop) []Angebot {
)
c.OnHTML(".product-teaser", func(e *colly.HTMLElement) {
+
+ if e.Request.URL.String() != Shop_url {
+ return
+ }
+
W := Angebot{}
whisky_name := strings.TrimPrefix(e.ChildAttr("img", "alt"), "Restposten: ")
@@ -47,16 +55,52 @@ func ScrapeRumundCo(shop Shop) []Angebot {
if err != nil {
log.Fatal(err)
}
+
+ e.ForEach(".base_price", func(i int, e *colly.HTMLElement) {
+ price_per_litre_noisy := e.ChildText(".value")
+ W.Base_price, err = sanitize_base_price(price_per_litre_noisy)
+ if err != nil {
+ log.Fatal(err)
+ }
+ })
+
})
+
W.Image_url = "https://www.rumundco.de/" + e.ChildAttr("img", "src")
+ e.Request.Visit(W.Url)
+
+ W.Volume = get_volume(e)
+
+ if W.Volume == 0 {
+ return
+ }
+
+ W.Abv = get_abv(e)
+
W.Shop = shop.Id
W.Spirit_type = "Whisky"
Whiskys = append(Whiskys, W)
})
- c.Visit("https://www.rumundco.de/navi.php?q=4&kf=29&kk-suesse-von=0&kk-suesse-bis=100&kk-milde-von=0&kk-milde-bis=100&kk-wuerze-von=0&kk-wuerze-bis=100&kk-frucht-von=0&kk-frucht-bis=100&kk-torf-von=0&kk-torf-bis=100&hf=0&af=90&Sortierung=11&a=350")
+ c.OnHTML("#table-collapse .product-attributes table", func(e *colly.HTMLElement) {
+
+ e.ForEach("tr", func(i int, e *colly.HTMLElement) {
+ text_noisy := e.ChildText("th")
+
+ //log.Println("Visiting (" + e.Request.URL.String() + "). Found: " + text_noisy + " END")
+
+ if strings.Contains(text_noisy, "Genauer Inhalt:") {
+ //log.Println("Visiting (" + e.Request.URL.String() + "). Found (V): " + e.ChildText("td") + " END")
+ e.Request.Ctx.Put("volume", e.ChildText("td"))
+ } else if strings.Contains(text_noisy, "Alkoholgehalt in %:") {
+ e.Request.Ctx.Put("abv", e.ChildText("a"))
+ }
+ })
+ })
+
+ c.Visit(Shop_url)
return Whiskys
}
diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go
index e489161..6025050 100644
--- a/crawler/shop_whic.go
+++ b/crawler/shop_whic.go
@@ -2,6 +2,7 @@ package main
import (
"log"
+ "regexp"
"strings"
"github.com/PuerkitoBio/goquery"
@@ -9,6 +10,7 @@ import (
)
func ScrapeWhic(shop Shop) []Angebot {
+ Shop_url := "https://whic.de/angebote"
Whiskys := []Angebot{}
c := colly.NewCollector(
@@ -16,6 +18,11 @@ func ScrapeWhic(shop Shop) []Angebot {
)
c.OnHTML("li.item", func(e *colly.HTMLElement) {
+
+ if e.Request.URL.String() != Shop_url {
+ return
+ }
+
W := Angebot{}
whisky_name := e.ChildAttr("a", "title")
@@ -41,6 +48,12 @@ func ScrapeWhic(shop Shop) []Angebot {
})
})
+ base_price_noisy := e.ChildText(".base-price")
+ W.Base_price, err = sanitize_base_price(base_price_noisy)
+ if err != nil {
+ log.Fatal(err)
+ }
+
/*
* colly does not parse a <noscript>, thus we are reading the content and parse it as html.
*/
@@ -52,13 +65,43 @@ func ScrapeWhic(shop Shop) []Angebot {
}
W.Image_url, _ = doc.Find("img").Attr("src")
+ e.Request.Visit(W.Url)
+ W.Volume = get_volume(e)
+ W.Abv = get_abv(e)
+
W.Shop = shop.Id
W.Spirit_type = "Whisky"
Whiskys = append(Whiskys, W)
})
- c.Visit("https://whic.de/angebote")
+ c.OnHTML("#product-view-head-txt-extra-info", func(e *colly.HTMLElement) {
+ text_noisy := e.Text
+
+ r_volume, err := regexp.Compile("Volumen: ([0-9]+([.,][0-9]+)) Liter")
+ if err != nil {
+ log.Fatal(err)
+ }
+ volume := r_volume.FindStringSubmatch(text_noisy)
+ if volume == nil || len(volume) < 2 {
+ return
+ }
+
+ e.Request.Ctx.Put("volume", volume[1]+"l")
+
+ r_abv, err := regexp.Compile("Alkoholgehalt: ([0-9]+([.,][0-9]+))%")
+ if err != nil {
+ log.Fatal(err)
+ }
+ abv := r_abv.FindStringSubmatch(text_noisy)
+ if abv == nil || len(abv) < 2 {
+ return
+ }
+
+ e.Request.Ctx.Put("abv", abv[1]+"%")
+ })
+
+ c.Visit(Shop_url)
return Whiskys
}
diff --git a/crawler/utility.go b/crawler/utility.go
index a794c4b..9de7845 100644
--- a/crawler/utility.go
+++ b/crawler/utility.go
@@ -5,6 +5,8 @@ import (
"regexp"
"strconv"
"strings"
+
+ "github.com/gocolly/colly"
)
func detect_spirit_type(name string) string {
@@ -47,6 +49,7 @@ func extract_volume(volume string) (float32, error) {
if err != nil {
return 0, err
}
+
return float32(volume64), err
}
@@ -61,5 +64,51 @@ func extract_abv(abv_noisy string) (float32, error) {
if err != nil {
return 0, err
}
+
return float32(abv64), nil
}
+
+/*
+ * In litre, but float.
+ */
+func get_volume(e *colly.HTMLElement) float32 {
+
+ volume_noisy := e.Request.Ctx.Get("volume")
+
+ matched, err := regexp.MatchString(`[lL](iter)?`, volume_noisy)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if !matched {
+ log.Println("get_volume: not matched: " + volume_noisy)
+ return 0
+ }
+
+ volume, err := extract_volume(volume_noisy)
+ if err != nil {
+ log.Println("get_volume: " + volume_noisy)
+ log.Fatal(err)
+ }
+
+ return volume
+}
+
+/*
+ * In percent. (float)
+ */
+func get_abv(e *colly.HTMLElement) float32 {
+
+ abv_noisy := e.Request.Ctx.Get("abv")
+
+ if abv_noisy == "" {
+ return 0
+ }
+
+ abv, err := extract_abv(abv_noisy)
+ if err != nil {
+ log.Println("get_abv: " + abv_noisy)
+ log.Fatal(err)
+ }
+
+ return abv
+}