From 35882837a2821749f3a2b1dfa23f19c4168004d3 Mon Sep 17 00:00:00 2001 From: Max Date: Thu, 1 Feb 2018 16:13:56 +0100 Subject: Crawled the first seven shops. --- .gitignore | 4 ++++ bottleworld.go | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mcwhisky.go | 42 ++++++++++++++++++++++++++++++++++++++++++ whic.go | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ whiskyde.go | 46 ++++++++++++++++++++++++++++++++++++++++++++++ whiskysitenl.go | 44 ++++++++++++++++++++++++++++++++++++++++++++ whiskyworld.go | 47 +++++++++++++++++++++++++++++++++++++++++++++++ whiskyzone.go | 45 +++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 331 insertions(+) create mode 100644 .gitignore create mode 100644 bottleworld.go create mode 100644 mcwhisky.go create mode 100644 whic.go create mode 100644 whiskyde.go create mode 100644 whiskysitenl.go create mode 100644 whiskyworld.go create mode 100644 whiskyzone.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8dc6b3e --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.test +*.html +*~ +*.swp diff --git a/bottleworld.go b/bottleworld.go new file mode 100644 index 0000000..2f5991a --- /dev/null +++ b/bottleworld.go @@ -0,0 +1,52 @@ +package main + +import ( + "fmt" + "log" + "regexp" + // "strings" + + // "github.com/PuerkitoBio/goquery" + "github.com/gocolly/colly" +) + +func ScrapeBottleWord() { + c := colly.NewCollector( + colly.AllowedDomains("bottleworld.de"), + colly.AllowedDomains("www.bottleworld.de"), + ) + + c.OnHTML("li.item", func(e *colly.HTMLElement) { + whisky_name := e.ChildText("h2 > a") + + matched, err := regexp.MatchString("Whiske?y", whisky_name) + if err != nil { + log.Fatal(err) + } + if !matched { + return + } + + whisky_url := e.ChildAttr("a", "href") + log.Println(whisky_name) + log.Println(whisky_url) + + e.ForEach(".price-box", func(i int, e *colly.HTMLElement) { + e.ForEach(".old-price", func(i int, e *colly.HTMLElement) { + log.Println(e.ChildText(".price")) + }) + e.ForEach(".special-price", func(i int, e *colly.HTMLElement) { + log.Println(e.ChildText(".price")) + }) + }) + log.Println(e.ChildAttr("img", "src")) + + fmt.Println("") + }) + + c.Visit("https://www.bottleworld.de/aktuelle-sonderpreise/show/all") +} + +func main() { + ScrapeBottleWord() +} diff --git a/mcwhisky.go b/mcwhisky.go new file mode 100644 index 0000000..a70750f --- /dev/null +++ b/mcwhisky.go @@ -0,0 +1,42 @@ +package main + +import ( + "fmt" + "log" + // "strings" + + // "github.com/PuerkitoBio/goquery" + "github.com/gocolly/colly" +) + +func ScrapeMCWhisky() { + c := colly.NewCollector( + colly.AllowedDomains("mcwhisky.com"), + colly.AllowedDomains("www.mcwhisky.com"), + ) + + c.OnHTML("li.item", func(e *colly.HTMLElement) { + whisky_name := e.ChildAttr("a", "title") + whisky_url := e.ChildAttr("a", "href") + log.Println(whisky_name) + log.Println(whisky_url) + + e.ForEach(".price-box", func(i int, e *colly.HTMLElement) { + e.ForEach(".old-price", func(i int, e *colly.HTMLElement) { + log.Println(e.ChildText(".price")) + }) + e.ForEach(".special-price", func(i int, e *colly.HTMLElement) { + log.Println(e.ChildText(".price")) + }) + }) + log.Println(e.ChildAttr("img", "src")) + + fmt.Println("") + }) + + c.Visit("https://www.mcwhisky.com/whisky/whisky-sonderangebote.html") +} + +func main() { + ScrapeMCWhisky() +} diff --git a/whic.go b/whic.go new file mode 100644 index 0000000..e8090b0 --- /dev/null +++ b/whic.go @@ -0,0 +1,51 @@ +package main + +import ( + "fmt" + "log" + "strings" + + "github.com/PuerkitoBio/goquery" + "github.com/gocolly/colly" +) + +func ScrapeWhic() { + c := colly.NewCollector( + colly.AllowedDomains("whic.de"), + ) + + c.OnHTML("li.item", func(e *colly.HTMLElement) { + whisky_name := e.ChildAttr("a", "title") + whisky_url := e.ChildAttr("a", "href") + log.Println(whisky_name) + log.Println(whisky_url) + + e.ForEach(".price-box", func(i int, e *colly.HTMLElement) { + e.ForEach(".old-price", func(i int, e *colly.HTMLElement) { + log.Println(e.ChildText(".price")) + }) + e.ForEach(".special-price", func(i int, e *colly.HTMLElement) { + log.Println(e.ChildText(".price")) + }) + }) + + /* + * colly does not parse a