diff options
| author | Max | 2018-02-08 18:26:41 +0100 |
|---|---|---|
| committer | Max | 2018-02-08 18:26:41 +0100 |
| commit | f6904aab20e2d09255fd0adabfd246165ff3cb02 (patch) | |
| tree | f7ac27cb5dd34443640235a97ce9bde8f2a1816a /crawler | |
| parent | ae7ed42df6a55e36c82b88e7c71569951847a68c (diff) | |
| download | alkobote-f6904aab20e2d09255fd0adabfd246165ff3cb02.tar.gz | |
Crawler extracts volume, price per litre and abv. (MC Whisky, Rum & Co, Whic)
Diffstat (limited to 'crawler')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | crawler/convert_price.go | 9 |
| -rw-r--r-- | crawler/database.go | 10 |
| -rw-r--r-- | crawler/main.go | 2 |
| -rw-r--r-- | crawler/sanitize.go | 24 |
| -rw-r--r-- | crawler/shop_bottleworld.go | 19 |
| -rw-r--r-- | crawler/shop_mcwhisky.go | 41 |
| -rw-r--r-- | crawler/shop_rumundco.go | 46 |
| -rw-r--r-- | crawler/shop_whic.go | 45 |
| -rw-r--r-- | crawler/utility.go | 49 |
9 files changed, 221 insertions, 24 deletions
diff --git a/crawler/convert_price.go b/crawler/convert_price.go index d9725a0..a76c067 100644 --- a/crawler/convert_price.go +++ b/crawler/convert_price.go @@ -2,6 +2,8 @@ package main import ( "errors" + "log" + "regexp" "strconv" "strings" ) @@ -28,6 +30,12 @@ func convert_price(price string) (int, error) { price = strings.TrimSuffix(strings.ToLower(price), "euro") price = strings.TrimSpace(price) + r, err := regexp.Compile(`[0-9]+([.,][0-9]+)?`) + if err != nil { + return 0, err + } + price = r.FindString(price) + if len(price) < 2 { price = "00" + price } else if len(price) < 3 { @@ -90,6 +98,7 @@ func convert_price(price string) (int, error) { */ price_int, err := strconv.Atoi(price) if err != nil { + log.Println(price) return 0, err } diff --git a/crawler/database.go b/crawler/database.go index b1d4e5e..085f210 100644 --- a/crawler/database.go +++ b/crawler/database.go @@ -29,7 +29,7 @@ func (app *App) createTables() error { volume FLOAT, original_price INT, discounted_price INT, - price_per_litre INT, + base_price INT, image_url TEXT, spirit_type TEXT, valid_until INT DEFAULT NULL, @@ -43,7 +43,7 @@ func (app *App) createTables() error { query3 := `CREATE OR REPLACE VIEW whisky_view AS SELECT - angebot.id, angebot.name, angebot.abv, angebot.volume, angebot.url,original_price, discounted_price, angebot.price_per_litre, image_url, + angebot.id, angebot.name, angebot.abv, angebot.volume, angebot.url,original_price, discounted_price, angebot.base_price, image_url, shop.name as shop, shop.url as shop_url, (original_price/discounted_price) AS quotient FROM angebot JOIN shop ON angebot.shop = shop.id @@ -56,7 +56,7 @@ func (app *App) createTables() error { func (app *App) save_offer(W []Angebot) error { - query := `INSERT INTO angebot (shop, name, url, abv, volume, original_price, discounted_price, price_per_litre, valid_until, image_url, spirit_type, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` + query := `INSERT INTO angebot (shop, name, url, abv, 
volume, original_price, discounted_price, base_price, valid_until, image_url, spirit_type, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` stmt, err := app.DB.Prepare(query) if err != nil { @@ -80,9 +80,9 @@ func (app *App) save_offer(W []Angebot) error { if err == sql.ErrNoRows { if 0 == o.Valid_until { - _, err = stmt.Exec(o.Shop, o.Name, o.Url, o.Abv, o.Volume, o.Original_price, o.Discounted_price, o.Price_per_litre, sql.NullInt64{}, o.Image_url, o.Spirit_type, app.Now) + _, err = stmt.Exec(o.Shop, o.Name, o.Url, o.Abv, o.Volume, o.Original_price, o.Discounted_price, o.Base_price, sql.NullInt64{}, o.Image_url, o.Spirit_type, app.Now) } else { - _, err = stmt.Exec(o.Shop, o.Name, o.Url, o.Abv, o.Volume, o.Original_price, o.Discounted_price, o.Price_per_litre, o.Valid_until, o.Image_url, o.Spirit_type, app.Now) + _, err = stmt.Exec(o.Shop, o.Name, o.Url, o.Abv, o.Volume, o.Original_price, o.Discounted_price, o.Base_price, o.Valid_until, o.Image_url, o.Spirit_type, app.Now) } if err != nil { return err diff --git a/crawler/main.go b/crawler/main.go index a25feaf..8a3556a 100644 --- a/crawler/main.go +++ b/crawler/main.go @@ -31,7 +31,7 @@ type Angebot struct { Url string Original_price int Discounted_price int - Price_per_litre int + Base_price int Image_url string Spirit_type string Valid_until int diff --git a/crawler/sanitize.go b/crawler/sanitize.go index fc4ee81..a40745a 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -92,10 +92,32 @@ func sanitize_name(name string) string { return name } -func sanitize_price_per(price_noisy string) (price int, err error) { +func sanitize_base_price(price_noisy string) (price int, err error) { if strings.Contains(price_noisy, "Preis pro Liter") { price_noisy = strings.Replace(price_noisy, "Preis pro Liter", "", -1) } + if strings.Contains(price_noisy, " pro 1 l") { + price_noisy = strings.Replace(price_noisy, " pro 1 l", "", -1) + } + + if strings.Contains(price_noisy, " pro 1 stück") { + price_noisy = 
strings.Replace(price_noisy, " pro 1 stück", "", -1) + } + + if strings.Contains(price_noisy, " pro 1 Stück") { + price_noisy = strings.Replace(price_noisy, " pro 1 Stück", "", -1) + } + + if strings.Contains(price_noisy, "Grundpreis:") { + price_noisy = strings.Replace(price_noisy, "Grundpreis", "", -1) + price_noisy = strings.TrimSpace(price_noisy) + } + + if strings.Contains(price_noisy, "/Liter") { + price_noisy = strings.Replace(price_noisy, "/Liter", "", -1) + price_noisy = strings.TrimSpace(price_noisy) + } + return convert_price(price_noisy) } diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go index a3eae35..fdf1cd8 100644 --- a/crawler/shop_bottleworld.go +++ b/crawler/shop_bottleworld.go @@ -52,11 +52,11 @@ func ScrapeBottleWord(shop Shop) []Angebot { }) price_per_litre_noisy := e.ChildText(".price-per-liter") - price_per_litre, err := sanitize_price_per(price_per_litre_noisy) + price_per_litre, err := sanitize_base_price(price_per_litre_noisy) if err != nil { log.Fatal(err) } - W.Price_per_litre = price_per_litre + W.Base_price = price_per_litre W.Image_url = e.ChildAttr("img", "src") @@ -64,17 +64,8 @@ func ScrapeBottleWord(shop Shop) []Angebot { W.Shop = shop.Id - volume_noisy := e.Request.Ctx.Get("volume") - W.Volume, err = extract_volume(volume_noisy) - if err != nil { - log.Fatal(err) - } - - abv_noisy := e.Request.Ctx.Get("abv") - W.Abv, err = extract_abv(abv_noisy) - if err != nil { - log.Fatal(err) - } + W.Volume = get_volume(e) + W.Abv = get_abv(e) Whiskys = append(Whiskys, W) }) @@ -82,7 +73,7 @@ func ScrapeBottleWord(shop Shop) []Angebot { c.OnHTML("#product-attribute-specs-table", func(e *colly.HTMLElement) { e.ForEach("tr", func(i int, e *colly.HTMLElement) { td_str := e.ChildText("td") - matched, err := regexp.MatchString("[0-9]+([,.][0-9]+)? l$", td_str) + matched, err := regexp.MatchString("^[0-9]+([,.][0-9]+)? 
l$", td_str) if err != nil { log.Fatal(err) } diff --git a/crawler/shop_mcwhisky.go b/crawler/shop_mcwhisky.go index e0c1ab8..b44e892 100644 --- a/crawler/shop_mcwhisky.go +++ b/crawler/shop_mcwhisky.go @@ -2,6 +2,7 @@ package main import ( "log" + "regexp" // "strings" // "github.com/PuerkitoBio/goquery" @@ -9,6 +10,8 @@ import ( ) func ScrapeMCWhisky(shop Shop) []Angebot { + Shop_url := "https://www.mcwhisky.com/whisky/whisky-sonderangebote.html" + Whiskys := []Angebot{} c := colly.NewCollector( @@ -17,6 +20,11 @@ func ScrapeMCWhisky(shop Shop) []Angebot { ) c.OnHTML("li.item", func(e *colly.HTMLElement) { + + if e.Request.URL.String() != Shop_url { + return + } + W := Angebot{} whisky_name := e.ChildAttr("a", "title") @@ -40,15 +48,46 @@ func ScrapeMCWhisky(shop Shop) []Angebot { } }) }) + + price_per_litre_noisy := e.ChildText(".price-box-extended-info-ppl") + W.Base_price, err = sanitize_base_price(price_per_litre_noisy) + if err != nil { + log.Fatal(err) + } + W.Image_url = e.ChildAttr("img", "src") + e.Request.Visit(W.Url) + + W.Volume = get_volume(e) + W.Abv = get_abv(e) + W.Shop = shop.Id W.Spirit_type = "Whisky" Whiskys = append(Whiskys, W) }) - c.Visit("https://www.mcwhisky.com/whisky/whisky-sonderangebote.html") + c.OnHTML(".products-attributes-alcohol", func(e *colly.HTMLElement) { + text_noisy := e.Text + + r_abv, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?%`) + if err != nil { + log.Fatal(err) + } + + e.Request.Ctx.Put("abv", r_abv.FindString(text_noisy)) + + r_volume, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?Liter$`) + if err != nil { + log.Fatal(err) + } + + e.Request.Ctx.Put("volume", r_volume.FindString(text_noisy)) + + }) + + c.Visit(Shop_url) return Whiskys } diff --git a/crawler/shop_rumundco.go b/crawler/shop_rumundco.go index d62250e..ef98e90 100644 --- a/crawler/shop_rumundco.go +++ b/crawler/shop_rumundco.go @@ -10,6 +10,9 @@ import ( ) func ScrapeRumundCo(shop Shop) []Angebot { + + Shop_url := 
"https://www.rumundco.de/navi.php?q=4&kf=29&kk-suesse-von=0&kk-suesse-bis=100&kk-milde-von=0&kk-milde-bis=100&kk-wuerze-von=0&kk-wuerze-bis=100&kk-frucht-von=0&kk-frucht-bis=100&kk-torf-von=0&kk-torf-bis=100&hf=0&af=90&Sortierung=11&a=350" + Whiskys := []Angebot{} c := colly.NewCollector( @@ -18,6 +21,11 @@ func ScrapeRumundCo(shop Shop) []Angebot { ) c.OnHTML(".product-teaser", func(e *colly.HTMLElement) { + + if e.Request.URL.String() != Shop_url { + return + } + W := Angebot{} whisky_name := strings.TrimPrefix(e.ChildAttr("img", "alt"), "Restposten: ") @@ -47,16 +55,52 @@ func ScrapeRumundCo(shop Shop) []Angebot { if err != nil { log.Fatal(err) } + + e.ForEach(".base_price", func(i int, e *colly.HTMLElement) { + price_per_litre_noisy := e.ChildText(".value") + W.Base_price, err = sanitize_base_price(price_per_litre_noisy) + if err != nil { + log.Fatal(err) + } + }) + }) + W.Image_url = "https://www.rumundco.de/" + e.ChildAttr("img", "src") + e.Request.Visit(W.Url) + + W.Volume = get_volume(e) + + if W.Volume == 0 { + return + } + + W.Abv = get_abv(e) + W.Shop = shop.Id W.Spirit_type = "Whisky" Whiskys = append(Whiskys, W) }) - c.Visit("https://www.rumundco.de/navi.php?q=4&kf=29&kk-suesse-von=0&kk-suesse-bis=100&kk-milde-von=0&kk-milde-bis=100&kk-wuerze-von=0&kk-wuerze-bis=100&kk-frucht-von=0&kk-frucht-bis=100&kk-torf-von=0&kk-torf-bis=100&hf=0&af=90&Sortierung=11&a=350") + c.OnHTML("#table-collapse .product-attributes table", func(e *colly.HTMLElement) { + + e.ForEach("tr", func(i int, e *colly.HTMLElement) { + text_noisy := e.ChildText("th") + + //log.Println("Visiting (" + e.Request.URL.String() + "). Found: " + text_noisy + " END") + + if strings.Contains(text_noisy, "Genauer Inhalt:") { + //log.Println("Visiting (" + e.Request.URL.String() + "). 
Found (V): " + e.ChildText("td") + " END") + e.Request.Ctx.Put("volume", e.ChildText("td")) + } else if strings.Contains(text_noisy, "Alkoholgehalt in %:") { + e.Request.Ctx.Put("abv", e.ChildText("a")) + } + }) + }) + + c.Visit(Shop_url) return Whiskys } diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go index e489161..6025050 100644 --- a/crawler/shop_whic.go +++ b/crawler/shop_whic.go @@ -2,6 +2,7 @@ package main import ( "log" + "regexp" "strings" "github.com/PuerkitoBio/goquery" @@ -9,6 +10,7 @@ import ( ) func ScrapeWhic(shop Shop) []Angebot { + Shop_url := "https://whic.de/angebote" Whiskys := []Angebot{} c := colly.NewCollector( @@ -16,6 +18,11 @@ func ScrapeWhic(shop Shop) []Angebot { ) c.OnHTML("li.item", func(e *colly.HTMLElement) { + + if e.Request.URL.String() != Shop_url { + return + } + W := Angebot{} whisky_name := e.ChildAttr("a", "title") @@ -41,6 +48,12 @@ func ScrapeWhic(shop Shop) []Angebot { }) }) + base_price_noisy := e.ChildText(".base-price") + W.Base_price, err = sanitize_base_price(base_price_noisy) + if err != nil { + log.Fatal(err) + } + /* * colly does not parse a <noscript>, thus we are reading the content and parse it as html. 
*/ @@ -52,13 +65,43 @@ func ScrapeWhic(shop Shop) []Angebot { } W.Image_url, _ = doc.Find("img").Attr("src") + e.Request.Visit(W.Url) + W.Volume = get_volume(e) + W.Abv = get_abv(e) + W.Shop = shop.Id W.Spirit_type = "Whisky" Whiskys = append(Whiskys, W) }) - c.Visit("https://whic.de/angebote") + c.OnHTML("#product-view-head-txt-extra-info", func(e *colly.HTMLElement) { + text_noisy := e.Text + + r_volume, err := regexp.Compile("Volumen: ([0-9]+([.,][0-9]+)) Liter") + if err != nil { + log.Fatal(err) + } + volume := r_volume.FindStringSubmatch(text_noisy) + if volume == nil || len(volume) < 2 { + return + } + + e.Request.Ctx.Put("volume", volume[1]+"l") + + r_abv, err := regexp.Compile("Alkoholgehalt: ([0-9]+([.,][0-9]+))%") + if err != nil { + log.Fatal(err) + } + abv := r_abv.FindStringSubmatch(text_noisy) + if abv == nil || len(abv) < 2 { + return + } + + e.Request.Ctx.Put("abv", abv[1]+"%") + }) + + c.Visit(Shop_url) return Whiskys } diff --git a/crawler/utility.go b/crawler/utility.go index a794c4b..9de7845 100644 --- a/crawler/utility.go +++ b/crawler/utility.go @@ -5,6 +5,8 @@ import ( "regexp" "strconv" "strings" + + "github.com/gocolly/colly" ) func detect_spirit_type(name string) string { @@ -47,6 +49,7 @@ func extract_volume(volume string) (float32, error) { if err != nil { return 0, err } + return float32(volume64), err } @@ -61,5 +64,51 @@ func extract_abv(abv_noisy string) (float32, error) { if err != nil { return 0, err } + return float32(abv64), nil } + +/* + * In litre, but float. + */ +func get_volume(e *colly.HTMLElement) float32 { + + volume_noisy := e.Request.Ctx.Get("volume") + + matched, err := regexp.MatchString(`[lL](iter)?`, volume_noisy) + if err != nil { + log.Fatal(err) + } + if !matched { + log.Println("get_volume: not matched: " + volume_noisy) + return 0 + } + + volume, err := extract_volume(volume_noisy) + if err != nil { + log.Println("get_volume: " + volume_noisy) + log.Fatal(err) + } + + return volume +} + +/* + * In procent. 
(float) + */ +func get_abv(e *colly.HTMLElement) float32 { + + abv_noisy := e.Request.Ctx.Get("abv") + + if abv_noisy == "" { + return 0 + } + + abv, err := extract_abv(abv_noisy) + if err != nil { + log.Println("get_abv: " + abv_noisy) + log.Fatal(err) + } + + return abv +} |
