diff options
| author | Max | 2018-02-01 16:13:56 +0100 |
|---|---|---|
| committer | Max | 2018-02-01 16:13:56 +0100 |
| commit | 35882837a2821749f3a2b1dfa23f19c4168004d3 (patch) | |
| tree | 5d6ac5078aebac93db47e507f564842d120bd1a3 | |
| parent | a5bda60647639e787a777446dce693ac330fe940 (diff) | |
| download | alkobote-35882837a2821749f3a2b1dfa23f19c4168004d3.tar.gz | |
Crawled the first seven shops.
| -rw-r--r-- | .gitignore | 4 | ||||
| -rw-r--r-- | bottleworld.go | 52 | ||||
| -rw-r--r-- | mcwhisky.go | 42 | ||||
| -rw-r--r-- | whic.go | 51 | ||||
| -rw-r--r-- | whiskyde.go | 46 | ||||
| -rw-r--r-- | whiskysitenl.go | 44 | ||||
| -rw-r--r-- | whiskyworld.go | 47 | ||||
| -rw-r--r-- | whiskyzone.go | 45 |
8 files changed, 331 insertions, 0 deletions
```diff
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8dc6b3e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.test
+*.html
+*~
+*.swp
diff --git a/bottleworld.go b/bottleworld.go
new file mode 100644
index 0000000..2f5991a
--- /dev/null
+++ b/bottleworld.go
@@ -0,0 +1,52 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	"regexp"
+	// "strings"
+
+	// "github.com/PuerkitoBio/goquery"
+	"github.com/gocolly/colly"
+)
+
+func ScrapeBottleWord() {
+	c := colly.NewCollector(
+		colly.AllowedDomains("bottleworld.de"),
+		colly.AllowedDomains("www.bottleworld.de"),
+	)
+
+	c.OnHTML("li.item", func(e *colly.HTMLElement) {
+		whisky_name := e.ChildText("h2 > a")
+
+		matched, err := regexp.MatchString("Whiske?y", whisky_name)
+		if err != nil {
+			log.Fatal(err)
+		}
+		if !matched {
+			return
+		}
+
+		whisky_url := e.ChildAttr("a", "href")
+		log.Println(whisky_name)
+		log.Println(whisky_url)
+
+		e.ForEach(".price-box", func(i int, e *colly.HTMLElement) {
+			e.ForEach(".old-price", func(i int, e *colly.HTMLElement) {
+				log.Println(e.ChildText(".price"))
+			})
+			e.ForEach(".special-price", func(i int, e *colly.HTMLElement) {
+				log.Println(e.ChildText(".price"))
+			})
+		})
+		log.Println(e.ChildAttr("img", "src"))
+
+		fmt.Println("")
+	})
+
+	c.Visit("https://www.bottleworld.de/aktuelle-sonderpreise/show/all")
+}
+
+func main() {
+	ScrapeBottleWord()
+}
diff --git a/mcwhisky.go b/mcwhisky.go
new file mode 100644
index 0000000..a70750f
--- /dev/null
+++ b/mcwhisky.go
@@ -0,0 +1,42 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	// "strings"
+
+	// "github.com/PuerkitoBio/goquery"
+	"github.com/gocolly/colly"
+)
+
+func ScrapeMCWhisky() {
+	c := colly.NewCollector(
+		colly.AllowedDomains("mcwhisky.com"),
+		colly.AllowedDomains("www.mcwhisky.com"),
+	)
+
+	c.OnHTML("li.item", func(e *colly.HTMLElement) {
+		whisky_name := e.ChildAttr("a", "title")
+		whisky_url := e.ChildAttr("a", "href")
+		log.Println(whisky_name)
+		log.Println(whisky_url)
+
+		e.ForEach(".price-box", func(i int, e *colly.HTMLElement) {
+			e.ForEach(".old-price", func(i int, e *colly.HTMLElement) {
+				log.Println(e.ChildText(".price"))
+			})
+			e.ForEach(".special-price", func(i int, e *colly.HTMLElement) {
+				log.Println(e.ChildText(".price"))
+			})
+		})
+		log.Println(e.ChildAttr("img", "src"))
+
+		fmt.Println("")
+	})
+
+	c.Visit("https://www.mcwhisky.com/whisky/whisky-sonderangebote.html")
+}
+
+func main() {
+	ScrapeMCWhisky()
+}
diff --git a/whic.go b/whic.go
new file mode 100644
--- /dev/null
+++ b/whic.go
@@ -0,0 +1,51 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+	"github.com/gocolly/colly"
+)
+
+func ScrapeWhic() {
+	c := colly.NewCollector(
+		colly.AllowedDomains("whic.de"),
+	)
+
+	c.OnHTML("li.item", func(e *colly.HTMLElement) {
+		whisky_name := e.ChildAttr("a", "title")
+		whisky_url := e.ChildAttr("a", "href")
+		log.Println(whisky_name)
+		log.Println(whisky_url)
+
+		e.ForEach(".price-box", func(i int, e *colly.HTMLElement) {
+			e.ForEach(".old-price", func(i int, e *colly.HTMLElement) {
+				log.Println(e.ChildText(".price"))
+			})
+			e.ForEach(".special-price", func(i int, e *colly.HTMLElement) {
+				log.Println(e.ChildText(".price"))
+			})
+		})
+
+		/*
+		 * colly does not parse a <noscript>, thus we are reading the content and parse it as html.
+		 */
+		img_link_noisy := e.ChildText(".product-image")
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(img_link_noisy))
+		if err != nil {
+			log.Fatal(err)
+		}
+		log.Println(doc.Find("img").Attr("src"))
+
+		fmt.Println("")
+	})
+
+	c.Visit("https://whic.de/angebote")
+}
+
+func main() {
+	ScrapeWhic()
+}
diff --git a/whiskyde.go b/whiskyde.go
new file mode 100644
index 0000000..aa00279
--- /dev/null
+++ b/whiskyde.go
@@ -0,0 +1,46 @@
+package main
+
+import (
+	"fmt"
+	"log"
+
+	"github.com/gocolly/colly"
+)
+
+func ScrapeWhiskyde() {
+	c := colly.NewCollector(
+		colly.AllowedDomains("whisky.de"),
+		colly.AllowedDomains("www.whisky.de"),
+	)
+
+	c.OnHTML(".is-buyable", func(e *colly.HTMLElement) {
+
+		whisky_name := e.ChildAttr("a", "title")
+		whisky_url := e.ChildAttr("a", "href")
+		log.Println(whisky_name)
+		log.Println(whisky_url)
+
+		e.ForEach(".article-price-original", func(i int, e *colly.HTMLElement) {
+			log.Println(e.ChildText("del"))
+		})
+		e.ForEach(".article-price", func(i int, e *colly.HTMLElement) {
+			log.Println(e.ChildText(".article-price-default"))
+		})
+
+		e.ForEach(".article-thumbnail", func(i int, e *colly.HTMLElement) {
+			log.Println(e.ChildAttr("img", "data-src"))
+		})
+
+		e.ForEach(".article-price-prefix", func(i int, e *colly.HTMLElement) {
+			log.Println(e.ChildText(".article-price-special"))
+		})
+
+		fmt.Println("")
+	})
+
+	c.Visit("https://www.whisky.de/shop/Aktuell/Sonderangebote/")
+}
+
+func main() {
+	ScrapeWhiskyde()
+}
diff --git a/whiskysitenl.go b/whiskysitenl.go
new file mode 100644
index 0000000..30af0ef
--- /dev/null
+++ b/whiskysitenl.go
@@ -0,0 +1,44 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	"regexp"
+	"strings"
+
+	"github.com/gocolly/colly"
+)
+
+func ScrapeWhiskysitenl() {
+	c := colly.NewCollector(
+		colly.AllowedDomains("whiskysite.nl"),
+		colly.AllowedDomains("www.whiskysite.nl"),
+	)
+
+	c.OnHTML(".product-block", func(e *colly.HTMLElement) {
+
+		whisky_name := e.ChildAttr("img", "alt")
+		whisky_url := e.ChildAttr("a", "href")
+		log.Println(whisky_name)
+		log.Println(whisky_url)
+		regular_price := e.ChildText(".price-old")
+		price_discount_noisy := e.ChildText(".product-block-price")
+		r, err := regexp.Compile("[0-9]+(,[0-9]{1,2})")
+		if err != nil {
+			log.Fatal(err)
+		}
+		discounted_price := r.FindString(strings.Trim(strings.TrimPrefix(price_discount_noisy, regular_price), ""))
+		log.Println(discounted_price + "€")
+		log.Println(strings.TrimPrefix(regular_price, "€") + "€")
+
+		log.Println(e.ChildAttr("img", "src"))
+
+		fmt.Println("")
+	})
+
+	c.Visit("https://www.whiskysite.nl/en/specials/?limit=100")
+}
+
+func main() {
+	ScrapeWhiskysitenl()
+}
diff --git a/whiskyworld.go b/whiskyworld.go
new file mode 100644
index 0000000..58735b9
--- /dev/null
+++ b/whiskyworld.go
@@ -0,0 +1,47 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	"strings"
+
+	"github.com/gocolly/colly"
+)
+
+func ScrapeWhiskyworld() {
+	c := colly.NewCollector(
+		colly.AllowedDomains("whiskyworld.de"),
+		colly.AllowedDomains("www.whiskyworld.de"),
+	)
+
+	c.OnHTML(".product-item", func(e *colly.HTMLElement) {
+
+		whisky_name_part1 := e.ChildText("h3")
+		whisky_name_part2 := e.ChildText(".item-description")
+
+		whisky_name := whisky_name_part1 + " " + whisky_name_part2
+
+		whisky_url := "https://www.whiskyworld.de/" + strings.TrimPrefix(e.ChildAttr("a", "href"), "../")
+		log.Println(whisky_name)
+		log.Println(whisky_url)
+
+		regular_price_noisy := e.ChildText(".offer")
+		regular_price := strings.TrimSuffix(strings.TrimPrefix(regular_price_noisy, "statt "), " €*")
+
+		discounted_price := e.ChildText(".uvp")
+		log.Println(strings.TrimSuffix(discounted_price, " €") + "€")
+		log.Println(regular_price + "€")
+
+		log.Println("https:" + e.ChildAttr("img", "src"))
+
+		fmt.Println("")
+	})
+
+	c.Visit("https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Blended%2BMalt%2522%257D")
+	c.Visit("https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Blended%2BWhiskies%2522%257D")
+	c.Visit("https://www.whiskyworld.de/themen/sonderangebote?ft=%257B%2522produkt_kategorie%2522:%2522Single%2BMalt%2522%257D")
+}
+
+func main() {
+	ScrapeWhiskyworld()
+}
diff --git a/whiskyzone.go b/whiskyzone.go
new file mode 100644
index 0000000..10b996b
--- /dev/null
+++ b/whiskyzone.go
@@ -0,0 +1,45 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	"regexp"
+
+	"github.com/gocolly/colly"
+)
+
+func ScrapeWhiskyzone() {
+	c := colly.NewCollector(
+		colly.AllowedDomains("whiskyzone.de"),
+		colly.AllowedDomains("www.whiskyzone.de"),
+	)
+
+	c.OnHTML(".product--info", func(e *colly.HTMLElement) {
+
+		whisky_name := e.ChildAttr("a", "title")
+		whisky_url := e.ChildAttr("a", "href")
+		log.Println(whisky_name)
+		log.Println(whisky_url)
+		price_discount_noisy := e.ChildText(".price--default")
+		price_regular_noisy := e.ChildText(".price--discount")
+
+		r, err := regexp.Compile("[0-9]+(,[0-9]{1,2})")
+		if err != nil {
+			log.Fatal(err)
+		}
+		log.Println(r.FindString(price_discount_noisy) + "€")
+		log.Println(r.FindString(price_regular_noisy) + "€")
+
+		e.ForEach(".image--media", func(i int, e *colly.HTMLElement) {
+			log.Println(e.ChildAttr("img", "src"))
+		})
+
+		fmt.Println("")
+	})
+
+	c.Visit("https://www.whiskyzone.de/widgets/emotion/index/emotionId/248/controllerName/listing")
+}
+
+func main() {
+	ScrapeWhiskyzone()
+}
```
