summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMax2018-02-01 16:13:56 +0100
committerMax2018-02-01 16:13:56 +0100
commit35882837a2821749f3a2b1dfa23f19c4168004d3 (patch)
tree5d6ac5078aebac93db47e507f564842d120bd1a3
parenta5bda60647639e787a777446dce693ac330fe940 (diff)
downloadalkobote-35882837a2821749f3a2b1dfa23f19c4168004d3.tar.gz
Crawled the first seven shops.
-rw-r--r--.gitignore4
-rw-r--r--bottleworld.go52
-rw-r--r--mcwhisky.go42
-rw-r--r--whic.go51
-rw-r--r--whiskyde.go46
-rw-r--r--whiskysitenl.go44
-rw-r--r--whiskyworld.go47
-rw-r--r--whiskyzone.go45
8 files changed, 331 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8dc6b3e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.test
+*.html
+*~
+*.swp
diff --git a/bottleworld.go b/bottleworld.go
new file mode 100644
index 0000000..2f5991a
--- /dev/null
+++ b/bottleworld.go
@@ -0,0 +1,52 @@
+package main
+
+import (
+ "fmt"
+ "log"
+ "regexp"
+ // "strings"
+
+ // "github.com/PuerkitoBio/goquery"
+ "github.com/gocolly/colly"
+)
+
+// ScrapeBottleWord visits the bottleworld.de special-offer listing and logs,
+// for every "li.item" element whose title matches "Whiske?y", the product
+// name, its URL, the old and special prices and the image source. A blank
+// line is printed to stdout after each item.
+func ScrapeBottleWord() {
+	// AllowedDomains replaces the whitelist each time the option is applied,
+	// so both host names must be handed over in one call.
+	c := colly.NewCollector(
+		colly.AllowedDomains("bottleworld.de", "www.bottleworld.de"),
+	)
+
+	// Compiled once up front instead of once per matched element.
+	whiskyRe := regexp.MustCompile("Whiske?y")
+
+	c.OnHTML("li.item", func(e *colly.HTMLElement) {
+		name := e.ChildText("h2 > a")
+
+		// The listing also contains other products; skip them.
+		if !whiskyRe.MatchString(name) {
+			return
+		}
+
+		log.Println(name)
+		log.Println(e.ChildAttr("a", "href"))
+
+		e.ForEach(".price-box", func(_ int, box *colly.HTMLElement) {
+			box.ForEach(".old-price", func(_ int, p *colly.HTMLElement) {
+				log.Println(p.ChildText(".price"))
+			})
+			box.ForEach(".special-price", func(_ int, p *colly.HTMLElement) {
+				log.Println(p.ChildText(".price"))
+			})
+		})
+		log.Println(e.ChildAttr("img", "src"))
+
+		fmt.Println("")
+	})
+
+	const url = "https://www.bottleworld.de/aktuelle-sonderpreise/show/all"
+	if err := c.Visit(url); err != nil {
+		log.Printf("visiting %s: %v", url, err)
+	}
+}
+
+// main runs the bottleworld.de scraper once.
+func main() {
+ ScrapeBottleWord()
+}
diff --git a/mcwhisky.go b/mcwhisky.go
new file mode 100644
index 0000000..a70750f
--- /dev/null
+++ b/mcwhisky.go
@@ -0,0 +1,42 @@
+package main
+
+import (
+ "fmt"
+ "log"
+ // "strings"
+
+ // "github.com/PuerkitoBio/goquery"
+ "github.com/gocolly/colly"
+)
+
+// ScrapeMCWhisky visits the mcwhisky.com special-offer page and logs, for
+// every "li.item" element, the product name, its URL, the old and special
+// prices and the image source. A blank line is printed to stdout after each
+// item.
+func ScrapeMCWhisky() {
+	// AllowedDomains replaces the whitelist each time the option is applied,
+	// so both host names must be handed over in one call.
+	c := colly.NewCollector(
+		colly.AllowedDomains("mcwhisky.com", "www.mcwhisky.com"),
+	)
+
+	c.OnHTML("li.item", func(e *colly.HTMLElement) {
+		log.Println(e.ChildAttr("a", "title"))
+		log.Println(e.ChildAttr("a", "href"))
+
+		e.ForEach(".price-box", func(_ int, box *colly.HTMLElement) {
+			box.ForEach(".old-price", func(_ int, p *colly.HTMLElement) {
+				log.Println(p.ChildText(".price"))
+			})
+			box.ForEach(".special-price", func(_ int, p *colly.HTMLElement) {
+				log.Println(p.ChildText(".price"))
+			})
+		})
+		log.Println(e.ChildAttr("img", "src"))
+
+		fmt.Println("")
+	})
+
+	const url = "https://www.mcwhisky.com/whisky/whisky-sonderangebote.html"
+	if err := c.Visit(url); err != nil {
+		log.Printf("visiting %s: %v", url, err)
+	}
+}
+
+// main runs the mcwhisky.com scraper once.
+func main() {
+ ScrapeMCWhisky()
+}
diff --git a/whic.go b/whic.go
new file mode 100644
index 0000000..e8090b0
--- /dev/null
+++ b/whic.go
@@ -0,0 +1,51 @@
+package main
+
+import (
+ "fmt"
+ "log"
+ "strings"
+
+ "github.com/PuerkitoBio/goquery"
+ "github.com/gocolly/colly"
+)
+
+// ScrapeWhic visits the whic.de offers page and logs, for every "li.item"
+// element, the product name, its URL, the old and special prices and the
+// image source. A blank line is printed to stdout after each item.
+func ScrapeWhic() {
+	c := colly.NewCollector(
+		colly.AllowedDomains("whic.de"),
+	)
+
+	c.OnHTML("li.item", func(e *colly.HTMLElement) {
+		log.Println(e.ChildAttr("a", "title"))
+		log.Println(e.ChildAttr("a", "href"))
+
+		e.ForEach(".price-box", func(_ int, box *colly.HTMLElement) {
+			box.ForEach(".old-price", func(_ int, p *colly.HTMLElement) {
+				log.Println(p.ChildText(".price"))
+			})
+			box.ForEach(".special-price", func(_ int, p *colly.HTMLElement) {
+				log.Println(p.ChildText(".price"))
+			})
+		})
+
+		// colly does not parse the content of a <noscript>, so the text of
+		// the image container is read and parsed as HTML a second time.
+		imgMarkup := e.ChildText(".product-image")
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(imgMarkup))
+		if err != nil {
+			// One unparsable item should not abort the whole crawl.
+			log.Printf("parsing image markup: %v", err)
+		} else {
+			// Attr returns (value, exists); only the value belongs in the
+			// log. A missing attribute logs an empty line.
+			src, _ := doc.Find("img").Attr("src")
+			log.Println(src)
+		}
+
+		fmt.Println("")
+	})
+
+	const url = "https://whic.de/angebote"
+	if err := c.Visit(url); err != nil {
+		log.Printf("visiting %s: %v", url, err)
+	}
+}
+
+// main runs the whic.de scraper once.
+func main() {
+ ScrapeWhic()
+}
diff --git a/whiskyde.go b/whiskyde.go
new file mode 100644
index 0000000..aa00279
--- /dev/null
+++ b/whiskyde.go
@@ -0,0 +1,46 @@
+package main
+
+import (
+ "fmt"
+ "log"
+
+ "github.com/gocolly/colly"
+)
+
+// ScrapeWhiskyde visits the whisky.de special-offer page and logs, for every
+// ".is-buyable" element, the product name, its URL, the struck-through
+// original price, the default price, the thumbnail's "data-src" attribute
+// and the special price. A blank line is printed to stdout after each item.
+func ScrapeWhiskyde() {
+	// AllowedDomains replaces the whitelist each time the option is applied,
+	// so both host names must be handed over in one call.
+	c := colly.NewCollector(
+		colly.AllowedDomains("whisky.de", "www.whisky.de"),
+	)
+
+	c.OnHTML(".is-buyable", func(e *colly.HTMLElement) {
+		log.Println(e.ChildAttr("a", "title"))
+		log.Println(e.ChildAttr("a", "href"))
+
+		e.ForEach(".article-price-original", func(_ int, el *colly.HTMLElement) {
+			log.Println(el.ChildText("del"))
+		})
+		e.ForEach(".article-price", func(_ int, el *colly.HTMLElement) {
+			log.Println(el.ChildText(".article-price-default"))
+		})
+
+		// The image URL is read from "data-src" rather than "src".
+		e.ForEach(".article-thumbnail", func(_ int, el *colly.HTMLElement) {
+			log.Println(el.ChildAttr("img", "data-src"))
+		})
+
+		e.ForEach(".article-price-prefix", func(_ int, el *colly.HTMLElement) {
+			log.Println(el.ChildText(".article-price-special"))
+		})
+
+		fmt.Println("")
+	})
+
+	const url = "https://www.whisky.de/shop/Aktuell/Sonderangebote/"
+	if err := c.Visit(url); err != nil {
+		log.Printf("visiting %s: %v", url, err)
+	}
+}
+
+// main runs the whisky.de scraper once.
+func main() {
+ ScrapeWhiskyde()
+}
diff --git a/whiskysitenl.go b/whiskysitenl.go
new file mode 100644
index 0000000..30af0ef
--- /dev/null
+++ b/whiskysitenl.go
@@ -0,0 +1,44 @@
+package main
+
+import (
+ "fmt"
+ "log"
+ "regexp"
+ "strings"
+
+ "github.com/gocolly/colly"
+)
+
+// ScrapeWhiskysitenl visits the whiskysite.nl specials page and logs, for
+// every ".product-block" element, the product name (taken from the image's
+// alt text), its URL, the discounted price, the regular price and the image
+// source. A blank line is printed to stdout after each item.
+func ScrapeWhiskysitenl() {
+	// AllowedDomains replaces the whitelist each time the option is applied,
+	// so both host names must be handed over in one call.
+	c := colly.NewCollector(
+		colly.AllowedDomains("whiskysite.nl", "www.whiskysite.nl"),
+	)
+
+	// Matches a number with a comma and one or two decimals, e.g. "49,95".
+	// Compiled once up front instead of once per matched element.
+	priceRe := regexp.MustCompile("[0-9]+(,[0-9]{1,2})")
+
+	c.OnHTML(".product-block", func(e *colly.HTMLElement) {
+		log.Println(e.ChildAttr("img", "alt"))
+		log.Println(e.ChildAttr("a", "href"))
+
+		regularPrice := e.ChildText(".price-old")
+		priceBlock := e.ChildText(".product-block-price")
+
+		// Strip the leading old price from the price block so that the
+		// first number found in the remainder is the discounted price.
+		rest := strings.TrimSpace(strings.TrimPrefix(priceBlock, regularPrice))
+		discountedPrice := priceRe.FindString(rest)
+
+		log.Println(discountedPrice + "€")
+		log.Println(strings.TrimPrefix(regularPrice, "€") + "€")
+
+		log.Println(e.ChildAttr("img", "src"))
+
+		fmt.Println("")
+	})
+
+	const url = "https://www.whiskysite.nl/en/specials/?limit=100"
+	if err := c.Visit(url); err != nil {
+		log.Printf("visiting %s: %v", url, err)
+	}
+}
+
+// main runs the whiskysite.nl scraper once.
+func main() {
+ ScrapeWhiskysitenl()
+}
diff --git a/whiskyworld.go b/whiskyworld.go
new file mode 100644
index 0000000..58735b9
--- /dev/null
+++ b/whiskyworld.go
@@ -0,0 +1,47 @@
+package main
+
+import (
+ "fmt"
+ "log"
+ "strings"
+
+ "github.com/gocolly/colly"
+)
+
+// ScrapeWhiskyworld visits three filtered whiskyworld.de special-offer
+// listings and logs, for every ".product-item" element, the product name,
+// its absolute URL, the discounted price, the regular price and the image
+// URL. A blank line is printed to stdout after each item.
+func ScrapeWhiskyworld() {
+	// AllowedDomains replaces the whitelist each time the option is applied,
+	// so both host names must be handed over in one call.
+	c := colly.NewCollector(
+		colly.AllowedDomains("whiskyworld.de", "www.whiskyworld.de"),
+	)
+
+	c.OnHTML(".product-item", func(e *colly.HTMLElement) {
+		// The name is the heading joined with the item description.
+		name := e.ChildText("h3") + " " + e.ChildText(".item-description")
+
+		// Links are relative ("../..."); rebuild an absolute URL.
+		url := "https://www.whiskyworld.de/" + strings.TrimPrefix(e.ChildAttr("a", "href"), "../")
+		log.Println(name)
+		log.Println(url)
+
+		// The ".offer" text is wrapped as "statt <price> €*".
+		regularPrice := strings.TrimPrefix(e.ChildText(".offer"), "statt ")
+		regularPrice = strings.TrimSuffix(regularPrice, " €*")
+
+		discountedPrice := e.ChildText(".uvp")
+		log.Println(strings.TrimSuffix(discountedPrice, " €") + "€")
+		log.Println(regularPrice + "€")
+
+		// The image source lacks a scheme, so "https:" is prepended.
+		log.Println("https:" + e.ChildAttr("img", "src"))
+
+		fmt.Println("")
+	})
+
+	// Each URL carries a different "produkt_kategorie" filter value.
+	urls := []string{
+		"https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Blended%2BMalt%2522%257D",
+		"https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Blended%2BWhiskies%2522%257D",
+		"https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Single%2BMalt%2522%257D",
+	}
+	for _, u := range urls {
+		// A failure on one listing should not prevent visiting the others.
+		if err := c.Visit(u); err != nil {
+			log.Printf("visiting %s: %v", u, err)
+		}
+	}
+}
+
+// main runs the whiskyworld.de scraper once.
+func main() {
+ ScrapeWhiskyworld()
+}
diff --git a/whiskyzone.go b/whiskyzone.go
new file mode 100644
index 0000000..10b996b
--- /dev/null
+++ b/whiskyzone.go
@@ -0,0 +1,45 @@
+package main
+
+import (
+ "fmt"
+ "log"
+ "regexp"
+
+ "github.com/gocolly/colly"
+)
+
+// ScrapeWhiskyzone visits a whiskyzone.de listing widget and logs, for every
+// ".product--info" element, the product name, its URL, the price found in
+// ".price--default", the price found in ".price--discount" and the image
+// source. A blank line is printed to stdout after each item.
+func ScrapeWhiskyzone() {
+	// AllowedDomains replaces the whitelist each time the option is applied,
+	// so both host names must be handed over in one call.
+	c := colly.NewCollector(
+		colly.AllowedDomains("whiskyzone.de", "www.whiskyzone.de"),
+	)
+
+	// Matches a number with a comma and one or two decimals, e.g. "49,95".
+	// Compiled once up front instead of once per matched element.
+	priceRe := regexp.MustCompile("[0-9]+(,[0-9]{1,2})")
+
+	c.OnHTML(".product--info", func(e *colly.HTMLElement) {
+		log.Println(e.ChildAttr("a", "title"))
+		log.Println(e.ChildAttr("a", "href"))
+
+		// Both texts contain more than the bare number; only the first
+		// price-shaped substring of each is logged.
+		discountedNoisy := e.ChildText(".price--default")
+		regularNoisy := e.ChildText(".price--discount")
+
+		log.Println(priceRe.FindString(discountedNoisy) + "€")
+		log.Println(priceRe.FindString(regularNoisy) + "€")
+
+		e.ForEach(".image--media", func(_ int, el *colly.HTMLElement) {
+			log.Println(el.ChildAttr("img", "src"))
+		})
+
+		fmt.Println("")
+	})
+
+	const url = "https://www.whiskyzone.de/widgets/emotion/index/emotionId/248/controllerName/listing"
+	if err := c.Visit(url); err != nil {
+		log.Printf("visiting %s: %v", url, err)
+	}
+}
+
+// main runs the whiskyzone.de scraper once.
+func main() {
+ ScrapeWhiskyzone()
+}