summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorhorus2018-06-15 23:28:18 +0200
committerhorus2018-06-15 23:28:18 +0200
commit8d68ac7c900241eb8499a94c23ab1f60750e7aed (patch)
tree3a5d444f866383d5cdefc512242dc2afa236641e
parent0026ba55f03c5378d5773459fcdd7c6931ff42a5 (diff)
downloadalkobote-8d68ac7c900241eb8499a94c23ab1f60750e7aed.tar.gz
Introduces config for user agent, robots.txt and crawler delay. (crawler)
-rw-r--r--crawler/config.go22
-rw-r--r--crawler/init.go13
-rw-r--r--crawler/scrape.go28
-rw-r--r--crawler/shop_bottleworld.go17
-rw-r--r--crawler/shop_mcwhisky.go7
-rw-r--r--crawler/shop_rumundco.go7
-rw-r--r--crawler/shop_whic.go7
-rw-r--r--crawler/shop_whiskyde.go7
-rw-r--r--crawler/shop_whiskysitenl.go7
-rw-r--r--crawler/shop_whiskyworld.go7
-rw-r--r--crawler/shop_whiskyzone.go7
-rw-r--r--crawler/utility.go6
12 files changed, 109 insertions, 26 deletions
diff --git a/crawler/config.go b/crawler/config.go
index f89fa45..a3939c4 100644
--- a/crawler/config.go
+++ b/crawler/config.go
@@ -17,6 +17,10 @@ type Config struct {
DBOptions string
DBPath string // for sqlite
+ UserAgent string
+ Delay int
+ IgnoreRobotsTXT bool
+
DisableURLShorter bool
Polr_URL string
Polr_API_Key string
@@ -40,6 +44,12 @@ func (c *Config) parseConfig(configFile string) {
viper.SetDefault("FixDatabase", false)
viper.SetDefault("DisableURLShorter", false)
viper.SetDefault("ShopIDs", []string{})
+ viper.SetDefault("Delay", 0)
+
+ // needs some refactoring to truly respect robots.txt
+ viper.SetDefault("IgnoreRobotsTXT", true)
+
+ viper.SetDefault("UserAgent", "colly - a friendly crawler :)")
// Name of the configuration file
viper.SetConfigName("config")
@@ -95,10 +105,16 @@ func (c *Config) setsConfig() {
c.DBDBName = viper.GetString("DB_DBName")
c.DBOptions = viper.GetString("DB_Options")
c.DBPath = viper.GetString("DB_Path")
- c.Debug = viper.GetBool("Debug")
- c.FixDatabase = viper.GetBool("FixDatabase")
+
+ c.UserAgent = viper.GetString("UserAgent")
+ c.Delay = viper.GetInt("Delay")
+ c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT")
+
c.DisableURLShorter = viper.GetBool("DisableURLShorter")
- c.ShopIDs = viper.GetStringSlice("ShopIDs")
c.Polr_URL = viper.GetString("Polr_URL")
c.Polr_API_Key = viper.GetString("Polr_API_Key")
+
+ c.Debug = viper.GetBool("Debug")
+ c.FixDatabase = viper.GetBool("FixDatabase")
+ c.ShopIDs = viper.GetStringSlice("ShopIDs")
}
diff --git a/crawler/init.go b/crawler/init.go
index 60f7e47..668df2d 100644
--- a/crawler/init.go
+++ b/crawler/init.go
@@ -23,6 +23,9 @@ func init() {
loglevel_f := flag.StringP("loglevel", "l", "Warn", `sets log level, can be "Warn", "Info" or "Debug"`)
flag.Bool("list-shops", false, `lists all crawlable shops`)
shopids_f := flag.StringP("restrict-shops", "r", "", `comma separated list of shop ids, crawls only these`)
+ user_agent_f := flag.StringP("user-agent", "u", "", "sets user agent")
+ delay_f := flag.Int("delay", 0, "toggles random delay between crawls")
+ ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignores robots.txt")
flag.Parse()
loglevel := strings.ToLower(*loglevel_f)
@@ -41,6 +44,16 @@ func init() {
_conf.parseConfig(*configFile)
+ if *user_agent_f != "" {
+ _conf.UserAgent = *user_agent_f
+ }
+ if *delay_f != 0 {
+ _conf.Delay = *delay_f
+ }
+ if !*ignore_robots_f {
+ _conf.IgnoreRobotsTXT = *ignore_robots_f
+ }
+
if _conf.Debug && !*silent {
log.SetLevel(log.DebugLevel)
}
diff --git a/crawler/scrape.go b/crawler/scrape.go
index 4bc66e0..f9e758d 100644
--- a/crawler/scrape.go
+++ b/crawler/scrape.go
@@ -1,7 +1,10 @@
package main
import (
+ "time"
+
log "github.com/Sirupsen/logrus"
+ "github.com/gocolly/colly"
)
func (app *App) ScrapeHTML(shops []Shop) {
@@ -41,6 +44,13 @@ func (app *App) Scrape(shop Shop, wait chan bool) {
}
}
+ // if no results, return early
+ if len(W) == 0 {
+ wait <- true
+ return
+
+ }
+
err = app.save_offer(W)
if err != nil {
Warn(err, "Saving offers failed. Shop: "+shop.Name)
@@ -78,3 +88,21 @@ func (app *App) ScrapeShop(shop Shop) []Angebot {
return []Angebot{}
}
+
+// customCollector returns a colly collector restricted to allowed_urls and
+// configured from app.Config: the user agent, whether robots.txt is ignored,
+// and a RandomDelay limit rule derived from Config.Delay.
+func (app *App) customCollector(allowed_urls []string) *colly.Collector {
+ c := colly.NewCollector(
+ colly.UserAgent(app.Config.UserAgent),
+ colly.AllowedDomains(allowed_urls...),
+ )
+ c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT
+
+ c.Limit(&colly.LimitRule{
+ DomainGlob: "*", // glob that matches every domain
+ RandomDelay: time.Duration(app.Config.Delay) * time.Second, // Config.Delay is interpreted as seconds
+ })
+
+ return c
+}
diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go
index de9fe13..d679b43 100644
--- a/crawler/shop_bottleworld.go
+++ b/crawler/shop_bottleworld.go
@@ -6,13 +6,18 @@ import (
// "github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly"
+ "log"
+ "time"
)
func (app *App) ScrapeBottleWord(shop Shop) []Angebot {
Shop_url := "https://www.bottleworld.de/aktuelle-sonderpreise/show/all"
Whiskys := []Angebot{}
- c := customCollector([]string{"bottleworld.de", "www.bottleworld.de"})
+ c := app.customCollector([]string{"bottleworld.de", "www.bottleworld.de"})
+
+ log.Println(c.IgnoreRobotsTxt)
+ log.Println(time.Duration(app.Config.Delay))
c.OnHTML("li.item", func(e *colly.HTMLElement) {
W := Angebot{}
@@ -60,7 +65,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot {
W.Image_url = e.ChildAttr("img", "src")
- e.Request.Visit(W.Url)
+ erro := e.Request.Visit(W.Url)
+ if erro != nil {
+ Warn(nil, W.Url+" "+erro.Error())
+ }
var ctx string
W.Volume, ctx = get_volume(e)
@@ -109,7 +117,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot {
e.Request.Ctx.Put("spirit_type", detect_spirit_type(text_noisy))
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_mcwhisky.go b/crawler/shop_mcwhisky.go
index ef780a9..941f3b9 100644
--- a/crawler/shop_mcwhisky.go
+++ b/crawler/shop_mcwhisky.go
@@ -11,7 +11,7 @@ func (app *App) ScrapeMCWhisky(shop Shop) []Angebot {
Whiskys := []Angebot{}
- c := customCollector([]string{"mcwhisky.com", "www.mcwhisky.com"})
+ c := app.customCollector([]string{"mcwhisky.com", "www.mcwhisky.com"})
c.OnHTML("li.item", func(e *colly.HTMLElement) {
@@ -130,7 +130,10 @@ func (app *App) ScrapeMCWhisky(shop Shop) []Angebot {
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_rumundco.go b/crawler/shop_rumundco.go
index 4b72c08..45069c2 100644
--- a/crawler/shop_rumundco.go
+++ b/crawler/shop_rumundco.go
@@ -14,7 +14,7 @@ func (app *App) ScrapeRumundCo(shop Shop) []Angebot {
Whiskys := []Angebot{}
- c := customCollector([]string{"rumundco.de", "www.rumundco.de"})
+ c := app.customCollector([]string{"rumundco.de", "www.rumundco.de"})
c.OnHTML(".product-teaser", func(e *colly.HTMLElement) {
@@ -155,7 +155,10 @@ func (app *App) ScrapeRumundCo(shop Shop) []Angebot {
e.Request.Ctx.Put("website", string(e.Response.Body))
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go
index 2d0170b..93bff23 100644
--- a/crawler/shop_whic.go
+++ b/crawler/shop_whic.go
@@ -12,7 +12,7 @@ func (app *App) ScrapeWhic(shop Shop) []Angebot {
Shop_url := "https://whic.de/angebote"
Whiskys := []Angebot{}
- c := customCollector([]string{"whic.de"})
+ c := app.customCollector([]string{"whic.de"})
c.OnHTML("li.item", func(e *colly.HTMLElement) {
@@ -127,7 +127,10 @@ func (app *App) ScrapeWhic(shop Shop) []Angebot {
e.Request.Ctx.Put("website", string(e.Response.Body))
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_whiskyde.go b/crawler/shop_whiskyde.go
index 9e061ac..d3087ca 100644
--- a/crawler/shop_whiskyde.go
+++ b/crawler/shop_whiskyde.go
@@ -11,7 +11,7 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot {
Whiskys := []Angebot{}
- c := customCollector([]string{"whisky.de", "www.whisky.de"})
+ c := app.customCollector([]string{"whisky.de", "www.whisky.de"})
c.OnHTML(".is-buyable", func(e *colly.HTMLElement) {
@@ -118,7 +118,10 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot {
e.Request.Ctx.Put("website", string(e.Response.Body))
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_whiskysitenl.go b/crawler/shop_whiskysitenl.go
index 4dad313..e3ae075 100644
--- a/crawler/shop_whiskysitenl.go
+++ b/crawler/shop_whiskysitenl.go
@@ -13,7 +13,7 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot {
Shop_url := "https://www.whiskysite.nl/en/specials/?limit=100"
- c := customCollector([]string{"whiskysite.nl", "www.whiskysite.nl"})
+ c := app.customCollector([]string{"whiskysite.nl", "www.whiskysite.nl"})
c.OnHTML(".product-block", func(e *colly.HTMLElement) {
@@ -138,7 +138,10 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot {
e.Request.Ctx.Put("website", string(e.Response.Body))
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go
index 7b57d37..3f0874d 100644
--- a/crawler/shop_whiskyworld.go
+++ b/crawler/shop_whiskyworld.go
@@ -15,7 +15,7 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot {
Whiskys := []Angebot{}
- c := customCollector([]string{"whiskyworld.de", "www.whiskyworld.de"})
+ c := app.customCollector([]string{"whiskyworld.de", "www.whiskyworld.de"})
c.OnHTML(".product-item", func(e *colly.HTMLElement) {
if !stringInSlice(e.Request.URL.String(), Shop_urls) {
@@ -123,7 +123,10 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot {
})
for _, url := range Shop_urls {
- c.Visit(url)
+ err := c.Visit(url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
}
return Whiskys
diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go
index 4dc825a..dbaf0ba 100644
--- a/crawler/shop_whiskyzone.go
+++ b/crawler/shop_whiskyzone.go
@@ -13,7 +13,7 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot {
Whiskys := []Angebot{}
- c := customCollector([]string{"whiskyzone.de", "www.whiskyzone.de"})
+ c := app.customCollector([]string{"whiskyzone.de", "www.whiskyzone.de"})
c.OnHTML(".product--info", func(e *colly.HTMLElement) {
@@ -154,7 +154,10 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot {
})
- c.Visit(Shop_url)
+ err := c.Visit(Shop_url)
+ if err != nil {
+ Warn(nil, shop.Name+": "+err.Error())
+ }
return Whiskys
}
diff --git a/crawler/utility.go b/crawler/utility.go
index e0acf3f..5fa78c4 100644
--- a/crawler/utility.go
+++ b/crawler/utility.go
@@ -10,12 +10,6 @@ import (
"github.com/gocolly/colly"
)
-func customCollector(allowed_urls []string) *colly.Collector {
- return colly.NewCollector(
- colly.AllowedDomains(allowed_urls...),
- )
-}
-
func stringInSlice(a string, list []string) bool {
for _, b := range list {
if b == a {