summary | refs | log | tree | commit | diff
path: root/crawler
diff options
context:
space:
mode:
author: horus, 2018-05-14 16:40:03 +0200
committer: horus, 2018-05-14 16:40:03 +0200
commit: d2e65d7a6d51c030ebc87b660bf482ae2ad024f3 (patch)
tree: fce56a19f25d0118600f38c1c1d94575c3c1f663 /crawler
parent: 006efbf61b28febfb79e93f6476e0552bbcc08bc (diff)
download: alkobote-d2e65d7a6d51c030ebc87b660bf482ae2ad024f3.tar.gz
Various fixes, e.g. repairing wrong image URLs. (crawler)
Diffstat (limited to 'crawler')
-rw-r--r-- crawler/database.go 29
-rw-r--r-- crawler/main.go 9
-rw-r--r-- crawler/post_process.go 8
-rw-r--r-- crawler/sanitize.go 6
-rw-r--r-- crawler/scrape.go 2
-rw-r--r-- crawler/shop_whiskyworld.go 5
-rw-r--r-- crawler/shop_whiskyzone.go 10
7 files changed, 49 insertions, 20 deletions
diff --git a/crawler/database.go b/crawler/database.go
index 6e08b78..45f7fda 100644
--- a/crawler/database.go
+++ b/crawler/database.go
@@ -35,8 +35,8 @@ func (app *App) createTables() error {
base_price INT NOT NULL,
image_url TEXT,
spirit_type TEXT NOT NULL,
- website TEXT,
- website_raw TEXT NOT NULL,
+ website LONGTEXT,
+ website_raw LONGTEXT NOT NULL,
valid_until INT NOT NULL DEFAULT 0,
created_at INT,
FOREIGN KEY(shop) REFERENCES shop(id),
@@ -76,7 +76,7 @@ func (app *App) createTables() error {
}
}
- view_query := `CREATE OR REPLACE VIEW _intern_view AS SELECT angebot.id, angebot.name, angebot.abv, angebot.volume, angebot.url as long_url, angebot.short_url as url, spirit_type, original_price, discounted_price, base_price, shop.name as shop, shop.id as shop_id, shop.short_url as shop_url, created_at, valid_until
+ view_query := `CREATE OR REPLACE VIEW _intern_view AS SELECT angebot.id, angebot.name, angebot.abv, angebot.volume, angebot.url as long_url, angebot.short_url as url, spirit_type, original_price, discounted_price, base_price, image_url, shop.name as shop, shop.id as shop_id, shop.short_url as shop_url, website_raw, created_at, valid_until
FROM angebot
JOIN shop ON angebot.shop = shop.id
WHERE (valid_until = 0 OR valid_until > (SELECT UNIX_TIMESTAMP()))`
@@ -131,11 +131,24 @@ func (app *App) save_offer(W []Angebot) error {
o.error_msg = err.Error()
o.error_ctx = fmt.Sprintf(strings.Replace(detect_duplicate_query, "?", `"%s"`, 1), o.Name)
WarnOffer(o, "database.go: Duplicate query failed")
- } /* else {
- o.error_msg = "database.go: Duplicate detected"
- o.error_ctx = fmt.Sprintf(strings.Replace(detect_duplicate_query, "?", `"%s"`, 1), o.Name)
- DebugOffer(o, "database.go: Duplicate detected")
- }*/
+ } else {
+ /*
+ * If everything went right we update the image url to reflect new changes.
+ */
+ update_img_query := fmt.Sprintf(`UPDATE _intern_view SET image_url = ?, website_raw = ? WHERE name = ? AND shop_id = %d AND volume = %4.2f AND abv = %4.2f AND original_price = %d AND discounted_price = %d AND valid_until = %d`, o.Shop, o.Volume, o.Abv, o.Original_price, o.Discounted_price, o.Valid_until)
+ update_img_stmt, err := app.DB.Prepare(update_img_query)
+ if err != nil {
+ o.error_msg = err.Error()
+ o.error_ctx = fmt.Sprintf(`UPDATE _intern_view SET image_url = %s, website_raw = %s WHERE name = %s AND shop_id = %d AND volume = %4.2f AND abv = %4.2f AND original_price = %d AND discounted_price = %d AND valid_until = %d`, o.Image_url, "redacted", o.Name, o.Shop, o.Volume, o.Abv, o.Original_price, o.Discounted_price, o.Valid_until)
+ WarnOffer(o, "database.go: Preparing update_img_query failed")
+ }
+ _, err = update_img_stmt.Exec(o.Image_url, o.Website, o.Name)
+ if err != nil {
+ o.error_msg = err.Error()
+ o.error_ctx = fmt.Sprintf(`UPDATE _intern_view SET image_url = %s WHERE name = %s AND shop_id = %d AND volume = %4.2f AND abv = %4.2f AND original_price = %d AND discounted_price = %d AND valid_until = %d`, o.Image_url, o.Name, o.Shop, o.Volume, o.Abv, o.Original_price, o.Discounted_price, o.Valid_until)
+ WarnOffer(o, "database.go: Executing update_img_query failed")
+ }
+ }
}
return nil
diff --git a/crawler/main.go b/crawler/main.go
index 8370db5..034c588 100644
--- a/crawler/main.go
+++ b/crawler/main.go
@@ -65,6 +65,15 @@ func main() {
Fatal(err, "Getting shops failed")
}
+ // reruns sanitizing functions over database
+ if app.Config.FixDatabase {
+ err := app.fix_db()
+ if err != nil {
+ Fatal(err, "Fix: Fixing database failed")
+ }
+ return
+ }
+
app.ScrapeHTML(shops)
// short url
diff --git a/crawler/post_process.go b/crawler/post_process.go
index ea4426c..8ef4dce 100644
--- a/crawler/post_process.go
+++ b/crawler/post_process.go
@@ -20,14 +20,6 @@ func (app *App) post_process() error {
}
}
- // reruns sanitizing functions over database
- if app.Config.FixDatabase {
- err := app.fix_db()
- if err != nil {
- return err
- }
- }
-
return nil
}
diff --git a/crawler/sanitize.go b/crawler/sanitize.go
index 7bfbc03..e1c6e84 100644
--- a/crawler/sanitize.go
+++ b/crawler/sanitize.go
@@ -49,7 +49,7 @@ func sanitize_offer(angebote []Angebot, shop Shop, try int) []Angebot {
continue
}
- offer.Website = ""
+ //offer.Website = ""
W = append(W, offer)
}
@@ -62,6 +62,10 @@ func sanitize_offer(angebote []Angebot, shop Shop, try int) []Angebot {
}
func sanitize_name(name string) string {
+ if strings.Contains(name, "Literflasche") {
+ name = strings.Replace(name, "Literflasche", "", 1)
+ }
+
if strings.Contains(name, "y.o.") {
name = strings.Replace(name, "y.o.", "Jahre", 1)
}
diff --git a/crawler/scrape.go b/crawler/scrape.go
index d7797ca..4bc66e0 100644
--- a/crawler/scrape.go
+++ b/crawler/scrape.go
@@ -66,7 +66,7 @@ func (app *App) ScrapeShop(shop Shop) []Angebot {
return app.ScrapeWhic(shop)
case "Whisky.de":
return app.ScrapeWhiskyde(shop)
- case "Whiskysite.nl":
+ case "Whiskysite":
return app.ScrapeWhiskysitenl(shop)
case "Whisky World":
return app.ScrapeWhiskyworld(shop)
diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go
index 0f39db7..af97511 100644
--- a/crawler/shop_whiskyworld.go
+++ b/crawler/shop_whiskyworld.go
@@ -31,7 +31,7 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot {
W.Shop = shop.Id
W.Spirit_type = "Whisky"
- whisky_name_part1 := e.ChildText("h3")
+ whisky_name_part1 := e.ChildText(".item-brand")
whisky_name_part2 := e.ChildText(".item-description")
W.Name = whisky_name_part1 + " " + whisky_name_part2
@@ -106,10 +106,11 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot {
})
- W.Image_url = "https:" + e.ChildAttr("img", "src")
+ W.Image_url = "https:" + e.ChildAttr("img", "data-src")
e.Request.Visit(W.Url)
W.Website = e.Request.Ctx.Get("website")
+ //Debug(nil, W.Website)
Whiskys = append(Whiskys, W)
})
diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go
index eae0852..2c1fb99 100644
--- a/crawler/shop_whiskyzone.go
+++ b/crawler/shop_whiskyzone.go
@@ -1,6 +1,7 @@
package main
import (
+ "errors"
"strings"
"github.com/gocolly/colly"
@@ -35,6 +36,7 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot {
W.Shop = shop.Id
W.Spirit_type = "Whisky"
+ e.Request.Ctx.Put("offer_url", W.Url)
e.Request.Visit(W.Url)
if "sold_out" == e.Request.Ctx.Get("sold_out") {
@@ -93,12 +95,20 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot {
})
c.OnHTML(".product--buybox", func(e *colly.HTMLElement) {
+ offer_url := e.Request.Ctx.Get("offer_url")
+ Debug(nil, "Visiting: "+e.Request.URL.String()+" with offer_url: "+offer_url)
+
+ if e.Request.URL.String() != offer_url {
+ return
+ }
+
if e.Request.URL.String() == Shop_url {
return
}
// Original & Discounted Price
e.ForEach(".product--price.price--default.price--discount", func(i int, e *colly.HTMLElement) {
+ Debug(errors.New("Discount: "+e.ChildText(".price--content.content--default")), "Whiskyzone: Original:"+e.ChildText(".price--line-through"))
e.Request.Ctx.Put("discounted_price", e.ChildText(".price--content.content--default"))
e.Request.Ctx.Put("original_price", e.ChildText(".price--line-through"))
})