diff options
| author | horus | 2018-05-14 16:40:03 +0200 |
|---|---|---|
| committer | horus | 2018-05-14 16:40:03 +0200 |
| commit | d2e65d7a6d51c030ebc87b660bf482ae2ad024f3 (patch) | |
| tree | fce56a19f25d0118600f38c1c1d94575c3c1f663 | |
| parent | 006efbf61b28febfb79e93f6476e0552bbcc08bc (diff) | |
| download | alkobote-d2e65d7a6d51c030ebc87b660bf482ae2ad024f3.tar.gz | |
Various fixes, e.g. repairing wrong image URLs. (crawler)
| -rw-r--r-- | crawler/database.go | 29 | ||||
| -rw-r--r-- | crawler/main.go | 9 | ||||
| -rw-r--r-- | crawler/post_process.go | 8 | ||||
| -rw-r--r-- | crawler/sanitize.go | 6 | ||||
| -rw-r--r-- | crawler/scrape.go | 2 | ||||
| -rw-r--r-- | crawler/shop_whiskyworld.go | 5 | ||||
| -rw-r--r-- | crawler/shop_whiskyzone.go | 10 |
7 files changed, 49 insertions, 20 deletions
diff --git a/crawler/database.go b/crawler/database.go index 6e08b78..45f7fda 100644 --- a/crawler/database.go +++ b/crawler/database.go @@ -35,8 +35,8 @@ func (app *App) createTables() error { base_price INT NOT NULL, image_url TEXT, spirit_type TEXT NOT NULL, - website TEXT, - website_raw TEXT NOT NULL, + website LONGTEXT, + website_raw LONGTEXT NOT NULL, valid_until INT NOT NULL DEFAULT 0, created_at INT, FOREIGN KEY(shop) REFERENCES shop(id), @@ -76,7 +76,7 @@ func (app *App) createTables() error { } } - view_query := `CREATE OR REPLACE VIEW _intern_view AS SELECT angebot.id, angebot.name, angebot.abv, angebot.volume, angebot.url as long_url, angebot.short_url as url, spirit_type, original_price, discounted_price, base_price, shop.name as shop, shop.id as shop_id, shop.short_url as shop_url, created_at, valid_until + view_query := `CREATE OR REPLACE VIEW _intern_view AS SELECT angebot.id, angebot.name, angebot.abv, angebot.volume, angebot.url as long_url, angebot.short_url as url, spirit_type, original_price, discounted_price, base_price, image_url, shop.name as shop, shop.id as shop_id, shop.short_url as shop_url, website_raw, created_at, valid_until FROM angebot JOIN shop ON angebot.shop = shop.id WHERE (valid_until = 0 OR valid_until > (SELECT UNIX_TIMESTAMP()))` @@ -131,11 +131,24 @@ func (app *App) save_offer(W []Angebot) error { o.error_msg = err.Error() o.error_ctx = fmt.Sprintf(strings.Replace(detect_duplicate_query, "?", `"%s"`, 1), o.Name) WarnOffer(o, "database.go: Duplicate query failed") - } /* else { - o.error_msg = "database.go: Duplicate detected" - o.error_ctx = fmt.Sprintf(strings.Replace(detect_duplicate_query, "?", `"%s"`, 1), o.Name) - DebugOffer(o, "database.go: Duplicate detected") - }*/ + } else { + /* + * If everything went right we update the image url to reflect new changes. + */ + update_img_query := fmt.Sprintf(`UPDATE _intern_view SET image_url = ?, website_raw = ? WHERE name = ? 
AND shop_id = %d AND volume = %4.2f AND abv = %4.2f AND original_price = %d AND discounted_price = %d AND valid_until = %d`, o.Shop, o.Volume, o.Abv, o.Original_price, o.Discounted_price, o.Valid_until) + update_img_stmt, err := app.DB.Prepare(update_img_query) + if err != nil { + o.error_msg = err.Error() + o.error_ctx = fmt.Sprintf(`UPDATE _intern_view SET image_url = %s, website_raw = %s WHERE name = %s AND shop_id = %d AND volume = %4.2f AND abv = %4.2f AND original_price = %d AND discounted_price = %d AND valid_until = %d`, o.Image_url, "redacted", o.Name, o.Shop, o.Volume, o.Abv, o.Original_price, o.Discounted_price, o.Valid_until) + WarnOffer(o, "database.go: Preparing update_img_query failed") + } + _, err = update_img_stmt.Exec(o.Image_url, o.Website, o.Name) + if err != nil { + o.error_msg = err.Error() + o.error_ctx = fmt.Sprintf(`UPDATE _intern_view SET image_url = %s WHERE name = %s AND shop_id = %d AND volume = %4.2f AND abv = %4.2f AND original_price = %d AND discounted_price = %d AND valid_until = %d`, o.Image_url, o.Name, o.Shop, o.Volume, o.Abv, o.Original_price, o.Discounted_price, o.Valid_until) + WarnOffer(o, "database.go: Executing update_img_query failed") + } + } } return nil diff --git a/crawler/main.go b/crawler/main.go index 8370db5..034c588 100644 --- a/crawler/main.go +++ b/crawler/main.go @@ -65,6 +65,15 @@ func main() { Fatal(err, "Getting shops failed") } + // reruns sanitizing functions over database + if app.Config.FixDatabase { + err := app.fix_db() + if err != nil { + Fatal(err, "Fix: Fixing database failed") + } + return + } + app.ScrapeHTML(shops) // short url diff --git a/crawler/post_process.go b/crawler/post_process.go index ea4426c..8ef4dce 100644 --- a/crawler/post_process.go +++ b/crawler/post_process.go @@ -20,14 +20,6 @@ func (app *App) post_process() error { } } - // reruns sanitizing functions over database - if app.Config.FixDatabase { - err := app.fix_db() - if err != nil { - return err - } - } - return nil } diff 
--git a/crawler/sanitize.go b/crawler/sanitize.go index 7bfbc03..e1c6e84 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -49,7 +49,7 @@ func sanitize_offer(angebote []Angebot, shop Shop, try int) []Angebot { continue } - offer.Website = "" + //offer.Website = "" W = append(W, offer) } @@ -62,6 +62,10 @@ func sanitize_offer(angebote []Angebot, shop Shop, try int) []Angebot { } func sanitize_name(name string) string { + if strings.Contains(name, "Literflasche") { + name = strings.Replace(name, "Literflasche", "", 1) + } + if strings.Contains(name, "y.o.") { name = strings.Replace(name, "y.o.", "Jahre", 1) } diff --git a/crawler/scrape.go b/crawler/scrape.go index d7797ca..4bc66e0 100644 --- a/crawler/scrape.go +++ b/crawler/scrape.go @@ -66,7 +66,7 @@ func (app *App) ScrapeShop(shop Shop) []Angebot { return app.ScrapeWhic(shop) case "Whisky.de": return app.ScrapeWhiskyde(shop) - case "Whiskysite.nl": + case "Whiskysite": return app.ScrapeWhiskysitenl(shop) case "Whisky World": return app.ScrapeWhiskyworld(shop) diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go index 0f39db7..af97511 100644 --- a/crawler/shop_whiskyworld.go +++ b/crawler/shop_whiskyworld.go @@ -31,7 +31,7 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot { W.Shop = shop.Id W.Spirit_type = "Whisky" - whisky_name_part1 := e.ChildText("h3") + whisky_name_part1 := e.ChildText(".item-brand") whisky_name_part2 := e.ChildText(".item-description") W.Name = whisky_name_part1 + " " + whisky_name_part2 @@ -106,10 +106,11 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot { }) - W.Image_url = "https:" + e.ChildAttr("img", "src") + W.Image_url = "https:" + e.ChildAttr("img", "data-src") e.Request.Visit(W.Url) W.Website = e.Request.Ctx.Get("website") + //Debug(nil, W.Website) Whiskys = append(Whiskys, W) }) diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go index eae0852..2c1fb99 100644 --- a/crawler/shop_whiskyzone.go +++ 
b/crawler/shop_whiskyzone.go @@ -1,6 +1,7 @@ package main import ( + "errors" "strings" "github.com/gocolly/colly" @@ -35,6 +36,7 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot { W.Shop = shop.Id W.Spirit_type = "Whisky" + e.Request.Ctx.Put("offer_url", W.Url) e.Request.Visit(W.Url) if "sold_out" == e.Request.Ctx.Get("sold_out") { @@ -93,12 +95,20 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot { }) c.OnHTML(".product--buybox", func(e *colly.HTMLElement) { + offer_url := e.Request.Ctx.Get("offer_url") + Debug(nil, "Visiting: "+e.Request.URL.String()+" with offer_url: "+offer_url) + + if e.Request.URL.String() != offer_url { + return + } + if e.Request.URL.String() == Shop_url { return } // Original & Discounted Price e.ForEach(".product--price.price--default.price--discount", func(i int, e *colly.HTMLElement) { + Debug(errors.New("Discount: "+e.ChildText(".price--content.content--default")), "Whiskyzone: Original:"+e.ChildText(".price--line-through")) e.Request.Ctx.Put("discounted_price", e.ChildText(".price--content.content--default")) e.Request.Ctx.Put("original_price", e.ChildText(".price--line-through")) }) |
