diff options
| -rw-r--r-- | crawler/config.go | 22 | ||||
| -rw-r--r-- | crawler/init.go | 13 | ||||
| -rw-r--r-- | crawler/scrape.go | 28 | ||||
| -rw-r--r-- | crawler/shop_bottleworld.go | 17 | ||||
| -rw-r--r-- | crawler/shop_mcwhisky.go | 7 | ||||
| -rw-r--r-- | crawler/shop_rumundco.go | 7 | ||||
| -rw-r--r-- | crawler/shop_whic.go | 7 | ||||
| -rw-r--r-- | crawler/shop_whiskyde.go | 7 | ||||
| -rw-r--r-- | crawler/shop_whiskysitenl.go | 7 | ||||
| -rw-r--r-- | crawler/shop_whiskyworld.go | 7 | ||||
| -rw-r--r-- | crawler/shop_whiskyzone.go | 7 | ||||
| -rw-r--r-- | crawler/utility.go | 6 |
12 files changed, 109 insertions, 26 deletions
diff --git a/crawler/config.go b/crawler/config.go index f89fa45..a3939c4 100644 --- a/crawler/config.go +++ b/crawler/config.go @@ -17,6 +17,10 @@ type Config struct { DBOptions string DBPath string // for sqlite + UserAgent string + Delay int + IgnoreRobotsTXT bool + DisableURLShorter bool Polr_URL string Polr_API_Key string @@ -40,6 +44,12 @@ func (c *Config) parseConfig(configFile string) { viper.SetDefault("FixDatabase", false) viper.SetDefault("DisableURLShorter", false) viper.SetDefault("ShopIDs", []string{}) + viper.SetDefault("Delay", 0) + + // needs some refactoring to truly respect robots.txt + viper.SetDefault("IgnoreRobotsTXT", true) + + viper.SetDefault("UserAgent", "colly - a friendly crawler :)") // Name of the configuration file viper.SetConfigName("config") @@ -95,10 +105,16 @@ func (c *Config) setsConfig() { c.DBDBName = viper.GetString("DB_DBName") c.DBOptions = viper.GetString("DB_Options") c.DBPath = viper.GetString("DB_Path") - c.Debug = viper.GetBool("Debug") - c.FixDatabase = viper.GetBool("FixDatabase") + + c.UserAgent = viper.GetString("UserAgent") + c.Delay = viper.GetInt("Delay") + c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT") + c.DisableURLShorter = viper.GetBool("DisableURLShorter") - c.ShopIDs = viper.GetStringSlice("ShopIDs") c.Polr_URL = viper.GetString("Polr_URL") c.Polr_API_Key = viper.GetString("Polr_API_Key") + + c.Debug = viper.GetBool("Debug") + c.FixDatabase = viper.GetBool("FixDatabase") + c.ShopIDs = viper.GetStringSlice("ShopIDs") } diff --git a/crawler/init.go b/crawler/init.go index 60f7e47..668df2d 100644 --- a/crawler/init.go +++ b/crawler/init.go @@ -23,6 +23,9 @@ func init() { loglevel_f := flag.StringP("loglevel", "l", "Warn", `sets log level, can be "Warn", "Info" or "Debug"`) flag.Bool("list-shops", false, `lists all crawlable shops`) shopids_f := flag.StringP("restrict-shops", "r", "", `comma separated list of shop ids, crawls only these`) + user_agent_f := flag.StringP("user-agent", "u", "", "sets user agent") + delay_f := flag.Int("delay", 0, "toggles random delay between crawls") + ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignores robots.txt") flag.Parse() loglevel := strings.ToLower(*loglevel_f) @@ -41,6 +44,16 @@ func init() { _conf.parseConfig(*configFile) + if *user_agent_f != "" { + _conf.UserAgent = *user_agent_f + } + if *delay_f != 0 { + _conf.Delay = *delay_f + } + if !*ignore_robots_f { + _conf.IgnoreRobotsTXT = *ignore_robots_f + } + if _conf.Debug && !*silent { log.SetLevel(log.DebugLevel) } diff --git a/crawler/scrape.go b/crawler/scrape.go index 4bc66e0..f9e758d 100644 --- a/crawler/scrape.go +++ b/crawler/scrape.go @@ -1,7 +1,10 @@ package main import ( + "time" + log "github.com/Sirupsen/logrus" + "github.com/gocolly/colly" ) func (app *App) ScrapeHTML(shops []Shop) { @@ -41,6 +44,13 @@ func (app *App) Scrape(shop Shop, wait chan bool) { } } + // if no results, return early + if len(W) == 0 { + wait <- true + return + + } + err = app.save_offer(W) if err != nil { Warn(err, "Saving offers failed. Shop: "+shop.Name) @@ -78,3 +88,21 @@ func (app *App) ScrapeShop(shop Shop) []Angebot { return []Angebot{} } + +/* + * Sets the crawler config. + */ +func (app *App) customCollector(allowed_urls []string) *colly.Collector { + c := colly.NewCollector( + colly.UserAgent(app.Config.UserAgent), + colly.AllowedDomains(allowed_urls...), + ) + c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT + + c.Limit(&colly.LimitRule{ + DomainGlob: "*", + RandomDelay: time.Duration(app.Config.Delay) * time.Second, + }) + + return c +} diff --git a/crawler/shop_bottleworld.go b/crawler/shop_bottleworld.go index de9fe13..d679b43 100644 --- a/crawler/shop_bottleworld.go +++ b/crawler/shop_bottleworld.go @@ -6,13 +6,18 @@ import ( // "github.com/PuerkitoBio/goquery" "github.com/gocolly/colly" + "log" + "time" ) func (app *App) ScrapeBottleWord(shop Shop) []Angebot { Shop_url := "https://www.bottleworld.de/aktuelle-sonderpreise/show/all" Whiskys := []Angebot{} - c := customCollector([]string{"bottleworld.de", "www.bottleworld.de"}) + c := app.customCollector([]string{"bottleworld.de", "www.bottleworld.de"}) + + log.Println(c.IgnoreRobotsTxt) + log.Println(time.Duration(app.Config.Delay)) c.OnHTML("li.item", func(e *colly.HTMLElement) { W := Angebot{} @@ -60,7 +65,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { W.Image_url = e.ChildAttr("img", "src") - e.Request.Visit(W.Url) + erro := e.Request.Visit(W.Url) + if erro != nil { + Warn(nil, W.Url+" "+erro.Error()) + } var ctx string W.Volume, ctx = get_volume(e) @@ -109,7 +117,10 @@ func (app *App) ScrapeBottleWord(shop Shop) []Angebot { e.Request.Ctx.Put("spirit_type", detect_spirit_type(text_noisy)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_mcwhisky.go b/crawler/shop_mcwhisky.go index ef780a9..941f3b9 100644 --- a/crawler/shop_mcwhisky.go +++ b/crawler/shop_mcwhisky.go @@ -11,7 +11,7 @@ func (app *App) ScrapeMCWhisky(shop Shop) []Angebot { Whiskys := []Angebot{} - c := customCollector([]string{"mcwhisky.com", "www.mcwhisky.com"}) + c := app.customCollector([]string{"mcwhisky.com", "www.mcwhisky.com"}) c.OnHTML("li.item", func(e *colly.HTMLElement) { @@ -130,7 +130,10 @@ func (app *App) ScrapeMCWhisky(shop Shop) []Angebot { }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_rumundco.go b/crawler/shop_rumundco.go index 4b72c08..45069c2 100644 --- a/crawler/shop_rumundco.go +++ b/crawler/shop_rumundco.go @@ -14,7 +14,7 @@ func (app *App) ScrapeRumundCo(shop Shop) []Angebot { Whiskys := []Angebot{} - c := customCollector([]string{"rumundco.de", "www.rumundco.de"}) + c := app.customCollector([]string{"rumundco.de", "www.rumundco.de"}) c.OnHTML(".product-teaser", func(e *colly.HTMLElement) { @@ -155,7 +155,10 @@ func (app *App) ScrapeRumundCo(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whic.go b/crawler/shop_whic.go index 2d0170b..93bff23 100644 --- a/crawler/shop_whic.go +++ b/crawler/shop_whic.go @@ -12,7 +12,7 @@ func (app *App) ScrapeWhic(shop Shop) []Angebot { Shop_url := "https://whic.de/angebote" Whiskys := []Angebot{} - c := customCollector([]string{"whic.de"}) + c := app.customCollector([]string{"whic.de"}) c.OnHTML("li.item", func(e *colly.HTMLElement) { @@ -127,7 +127,10 @@ func (app *App) ScrapeWhic(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whiskyde.go b/crawler/shop_whiskyde.go index 9e061ac..d3087ca 100644 --- a/crawler/shop_whiskyde.go +++ b/crawler/shop_whiskyde.go @@ -11,7 +11,7 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot { Whiskys := []Angebot{} - c := customCollector([]string{"whisky.de", "www.whisky.de"}) + c := app.customCollector([]string{"whisky.de", "www.whisky.de"}) c.OnHTML(".is-buyable", func(e *colly.HTMLElement) { @@ -118,7 +118,10 @@ func (app *App) ScrapeWhiskyde(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whiskysitenl.go b/crawler/shop_whiskysitenl.go index 4dad313..e3ae075 100644 --- a/crawler/shop_whiskysitenl.go +++ b/crawler/shop_whiskysitenl.go @@ -13,7 +13,7 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot { Shop_url := "https://www.whiskysite.nl/en/specials/?limit=100" - c := customCollector([]string{"whiskysite.nl", "www.whiskysite.nl"}) + c := app.customCollector([]string{"whiskysite.nl", "www.whiskysite.nl"}) c.OnHTML(".product-block", func(e *colly.HTMLElement) { @@ -138,7 +138,10 @@ func (app *App) ScrapeWhiskysitenl(shop Shop) []Angebot { e.Request.Ctx.Put("website", string(e.Response.Body)) }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/shop_whiskyworld.go b/crawler/shop_whiskyworld.go index 7b57d37..3f0874d 100644 --- a/crawler/shop_whiskyworld.go +++ b/crawler/shop_whiskyworld.go @@ -15,7 +15,7 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot { Whiskys := []Angebot{} - c := customCollector([]string{"whiskyworld.de", "www.whiskyworld.de"}) + c := app.customCollector([]string{"whiskyworld.de", "www.whiskyworld.de"}) c.OnHTML(".product-item", func(e *colly.HTMLElement) { if !stringInSlice(e.Request.URL.String(), Shop_urls) { @@ -123,7 +123,10 @@ func (app *App) ScrapeWhiskyworld(shop Shop) []Angebot { }) for _, url := range Shop_urls { - c.Visit(url) + err := c.Visit(url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } } return Whiskys diff --git a/crawler/shop_whiskyzone.go b/crawler/shop_whiskyzone.go index 4dc825a..dbaf0ba 100644 --- a/crawler/shop_whiskyzone.go +++ b/crawler/shop_whiskyzone.go @@ -13,7 +13,7 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot { Whiskys := []Angebot{} - c := customCollector([]string{"whiskyzone.de", "www.whiskyzone.de"}) + c := app.customCollector([]string{"whiskyzone.de", "www.whiskyzone.de"}) c.OnHTML(".product--info", func(e *colly.HTMLElement) { @@ -154,7 +154,10 @@ func (app *App) ScrapeWhiskyzone(shop Shop) []Angebot { }) - c.Visit(Shop_url) + err := c.Visit(Shop_url) + if err != nil { + Warn(nil, shop.Name+": "+err.Error()) + } return Whiskys } diff --git a/crawler/utility.go b/crawler/utility.go index e0acf3f..5fa78c4 100644 --- a/crawler/utility.go +++ b/crawler/utility.go @@ -10,12 +10,6 @@ import ( "github.com/gocolly/colly" ) -func customCollector(allowed_urls []string) *colly.Collector { - return colly.NewCollector( - colly.AllowedDomains(allowed_urls...), - ) -} - func stringInSlice(a string, list []string) bool { for _, b := range list { if b == a { |
