 crawler/config.go |  9 ++++++---
 crawler/init.go   | 26 +++++++++++++++++++---------
 crawler/shops.go  |  5 +++++
 3 files changed, 28 insertions(+), 12 deletions(-)
diff --git a/crawler/config.go b/crawler/config.go
index a3939c4..ba3c92f 100644
--- a/crawler/config.go
+++ b/crawler/config.go
@@ -25,9 +25,10 @@ type Config struct {
Polr_URL string
Polr_API_Key string
- Debug bool // sets log level to debug
- FixDatabase bool // reruns some sanitizing functions over the db
- ShopIDs []string // limits which shops to crawl, wants shop_id
+ Debug bool // sets log level to debug
+ FixDatabase bool // reruns some sanitizing functions over the db
+ ShopIDs []string // limits which shops to crawl, wants shop_id
+ ExcludeShopIDs []string // excludes shops from being crawled, wants shop_id
}
// Parses the configuration and sets the configuration struct.
@@ -44,6 +45,7 @@ func (c *Config) parseConfig(configFile string) {
viper.SetDefault("FixDatabase", false)
viper.SetDefault("DisableURLShorter", false)
viper.SetDefault("ShopIDs", []string{})
+ viper.SetDefault("ExcludeShopIDs", []string{})
viper.SetDefault("Delay", 0)
// needs some refactoring to truly respect robots.txt
@@ -117,4 +119,5 @@ func (c *Config) setsConfig() {
c.Debug = viper.GetBool("Debug")
c.FixDatabase = viper.GetBool("FixDatabase")
c.ShopIDs = viper.GetStringSlice("ShopIDs")
+ c.ExcludeShopIDs = viper.GetStringSlice("ExcludeShopIDs")
}
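
A minimal, standalone sketch of how the new ExcludeShopIDs setting flows from a viper-backed config file into the Config struct. The package main wrapper, the config.yaml file name, and the trimmed-down Config struct are assumptions for illustration and are not part of the patch.

package main

import (
	"fmt"

	"github.com/spf13/viper"
)

// Trimmed-down Config with only the fields relevant to shop filtering.
type Config struct {
	ShopIDs        []string
	ExcludeShopIDs []string
}

func main() {
	// Hypothetical config file, e.g. containing: ExcludeShopIDs: ["3", "7"]
	viper.SetConfigFile("config.yaml")
	viper.SetDefault("ShopIDs", []string{})
	viper.SetDefault("ExcludeShopIDs", []string{})
	if err := viper.ReadInConfig(); err != nil {
		fmt.Println("falling back to defaults:", err)
	}

	c := Config{
		ShopIDs:        viper.GetStringSlice("ShopIDs"),
		ExcludeShopIDs: viper.GetStringSlice("ExcludeShopIDs"),
	}
	fmt.Println("excluded shop ids:", c.ExcludeShopIDs)
}
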
diff --git a/crawler/init.go b/crawler/init.go
index 668df2d..34e440e 100644
--- a/crawler/init.go
+++ b/crawler/init.go
@@ -17,15 +17,16 @@ func init() {
// we need to parse the config because of log level setting
configFile := flag.StringP("config", "c", "", "path to config file")
- debug := flag.BoolP("debug", "d", false, "debug outputs")
- verbose := flag.BoolP("verbose", "v", false, "same as --debug")
- silent := flag.BoolP("silent", "s", false, "suppress outputs except warnings")
- loglevel_f := flag.StringP("loglevel", "l", "Warn", `sets log level, can be "Warn", "Info" or "Debug"`)
- flag.Bool("list-shops", false, `lists all crawlable shops`)
- shopids_f := flag.StringP("restrict-shops", "r", "", `comma separated list of shop ids, crawls only these`)
- user_agent_f := flag.StringP("user-agent", "u", "", "sets user agent")
- delay_f := flag.Int("delay", 0, "toggles random delay between crawls")
- ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignores robots.txt")
+ debug := flag.BoolP("debug", "d", false, "set log level to \"Debug\"")
+ verbose := flag.BoolP("verbose", "v", false, "set log level to \"Debug\", same as --debug")
+ silent := flag.BoolP("silent", "s", false, "suppress output except warnings")
+ loglevel_f := flag.String("loglevel", "Warn", `set log level, can be "Warn", "Info" or "Debug"`)
+ flag.BoolP("list-shops", "l", false, `list all crawlable shops`)
+ shopids_f := flag.StringP("only-shop", "o", "", `comma separated list of shop ids, crawl only these`)
+ not_shopids_f := flag.StringP("exclude-shop", "x", "", `comma separated list of shop ids, DO NOT crawl these`)
+ user_agent_f := flag.StringP("user-agent", "u", "", "set user agent")
+ delay_f := flag.Int("delay", 0, "enable and set delay in seconds between crawls (default 0)")
+ ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignore robots.txt")
flag.Parse()
loglevel := strings.ToLower(*loglevel_f)
@@ -61,4 +62,11 @@ func init() {
if "" != *shopids_f {
_conf.ShopIDs = strings.Split(*shopids_f, ",")
}
+ if "" != *not_shopids_f {
+ _conf.ExcludeShopIDs = strings.Split(*not_shopids_f, ",")
+ }
+
+ if "" != *shopids_f && "" != *not_shopids_f {
+ log.Fatal("init.go: Config error: Cannot use both flags --exclude-shop and --only-shop at the same time.")
+ }
}
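
A minimal sketch of the new --only-shop / --exclude-shop handling added above, using the same spf13/pflag calls. The standalone main, the stdlib log package, and the final Printf are assumptions; the real init() writes into the package-level _conf and uses the project's logger.

package main

import (
	"log"
	"strings"

	flag "github.com/spf13/pflag"
)

func main() {
	shopids_f := flag.StringP("only-shop", "o", "", `comma separated list of shop ids, crawl only these`)
	not_shopids_f := flag.StringP("exclude-shop", "x", "", `comma separated list of shop ids, DO NOT crawl these`)
	flag.Parse()

	var shopIDs, excludeShopIDs []string
	if *shopids_f != "" {
		shopIDs = strings.Split(*shopids_f, ",")
	}
	if *not_shopids_f != "" {
		excludeShopIDs = strings.Split(*not_shopids_f, ",")
	}

	// Both flags together are a configuration error; log.Fatal exits the
	// process, so it does not matter that the slices were already filled.
	if *shopids_f != "" && *not_shopids_f != "" {
		log.Fatal("Cannot use both flags --exclude-shop and --only-shop at the same time.")
	}

	log.Printf("only: %v, exclude: %v", shopIDs, excludeShopIDs)
}

Invoked as, for example, `crawler --exclude-shop 3,7`, this leaves the include list empty and fills the exclude list.
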
diff --git a/crawler/shops.go b/crawler/shops.go
index 79eff96..224ae96 100644
--- a/crawler/shops.go
+++ b/crawler/shops.go
@@ -113,6 +113,11 @@ func (app *App) getShops() ([]Shop, error) {
shop_query = " WHERE id IN (" + shopIDs + ")"
}
+ } else if len(app.Config.ExcludeShopIDs) > 0 {
+ excludeShopIDs := strings.Join(app.Config.ExcludeShopIDs, `, `)
+ if excludeShopIDs != "" {
+ shop_query = " WHERE id NOT IN (" + excludeShopIDs + ")"
+ }
}
query := `SELECT id,name,short_url,url,logo_url,shipping_costs,free_shipping FROM shop ` + shop_query
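
A minimal sketch of the filter that the shops.go hunk adds to getShops. The standalone function name buildShopQuery is an assumption, but the precedence matches the patch: an explicit ShopIDs list wins, otherwise ExcludeShopIDs becomes a NOT IN clause, otherwise no WHERE clause is appended.

package main

import (
	"fmt"
	"strings"
)

// buildShopQuery mirrors the id filtering in getShops: include-list first,
// then exclude-list, then no filter at all.
func buildShopQuery(shopIDs, excludeShopIDs []string) string {
	shop_query := ""
	if len(shopIDs) > 0 {
		shop_query = " WHERE id IN (" + strings.Join(shopIDs, ", ") + ")"
	} else if len(excludeShopIDs) > 0 {
		shop_query = " WHERE id NOT IN (" + strings.Join(excludeShopIDs, ", ") + ")"
	}
	return `SELECT id,name,short_url,url,logo_url,shipping_costs,free_shipping FROM shop` + shop_query
}

func main() {
	fmt.Println(buildShopQuery(nil, []string{"3", "7"}))
	// prints: SELECT ... FROM shop WHERE id NOT IN (3, 7)
}

Since the ids are concatenated directly into the SQL string, both branches assume numeric shop ids coming from the operator's own configuration, as the existing ShopIDs branch already does.
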