diff options
| author | horus | 2018-09-16 17:17:04 +0200 |
|---|---|---|
| committer | horus | 2018-09-16 17:17:04 +0200 |
| commit | d1ce36763bb1f5dc3d4f58b59a20cffc2b03a3a4 (patch) | |
| tree | b87dd41ba45240c746ee1cea823f918a3f2be209 | |
| parent | 03682685add64a9dd307a3d99b49717446b11b9d (diff) | |
| download | alkobote-d1ce36763bb1f5dc3d4f58b59a20cffc2b03a3a4.tar.gz | |
Rename flags. Add new flag to exclude shops. (crawler)
| -rw-r--r-- | crawler/config.go | 9 | ||||
| -rw-r--r-- | crawler/init.go | 26 | ||||
| -rw-r--r-- | crawler/shops.go | 5 |
3 files changed, 28 insertions, 12 deletions
diff --git a/crawler/config.go b/crawler/config.go index a3939c4..ba3c92f 100644 --- a/crawler/config.go +++ b/crawler/config.go @@ -25,9 +25,10 @@ type Config struct { Polr_URL string Polr_API_Key string - Debug bool // sets log level to debug - FixDatabase bool // reruns some sanitizing functions over the db - ShopIDs []string // limits which shops to crawl, wants shop_id + Debug bool // sets log level to debug + FixDatabase bool // reruns some sanitizing functions over the db + ShopIDs []string // limits which shops to crawl, wants shop_id + ExcludeShopIDs []string // excludes shops from being crawled, wants shop_id } // Parses the configuration and sets the configuration struct. @@ -44,6 +45,7 @@ func (c *Config) parseConfig(configFile string) { viper.SetDefault("FixDatabase", false) viper.SetDefault("DisableURLShorter", false) viper.SetDefault("ShopIDs", []string{}) + viper.SetDefault("ExcludeShopIDs", []string{}) viper.SetDefault("Delay", 0) // needs some refactoring to truly respect robots.txt @@ -117,4 +119,5 @@ func (c *Config) setsConfig() { c.Debug = viper.GetBool("Debug") c.FixDatabase = viper.GetBool("FixDatabase") c.ShopIDs = viper.GetStringSlice("ShopIDs") + c.ExcludeShopIDs = viper.GetStringSlice("ExcludeShopIDs") } diff --git a/crawler/init.go b/crawler/init.go index 668df2d..34e440e 100644 --- a/crawler/init.go +++ b/crawler/init.go @@ -17,15 +17,16 @@ func init() { // we need to parse the config because of log level setting configFile := flag.StringP("config", "c", "", "path to config file") - debug := flag.BoolP("debug", "d", false, "debug outputs") - verbose := flag.BoolP("verbose", "v", false, "same as --debug") - silent := flag.BoolP("silent", "s", false, "suppress outputs except warnings") - loglevel_f := flag.StringP("loglevel", "l", "Warn", `sets log level, can be "Warn", "Info" or "Debug"`) - flag.Bool("list-shops", false, `lists all crawlable shops`) - shopids_f := flag.StringP("restrict-shops", "r", "", `comma separated list of shop 
ids, crawls only these`) - user_agent_f := flag.StringP("user-agent", "u", "", "sets user agent") - delay_f := flag.Int("delay", 0, "toggles random delay between crawls") - ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignores robots.txt") + debug := flag.BoolP("debug", "d", false, "set log level to \"Debug\"") + verbose := flag.BoolP("verbose", "v", false, "set log level to \"Debug\", same as --debug") + silent := flag.BoolP("silent", "s", false, "suppress output except warnings") + loglevel_f := flag.String("loglevel", "Warn", `set log level, can be "Warn", "Info" or "Debug"`) + flag.BoolP("list-shops", "l", false, `list all crawlable shops`) + shopids_f := flag.StringP("only-shop", "o", "", `comma separated list of shop ids, crawl only these`) + not_shopids_f := flag.StringP("exclude-shop", "x", "", `comma separated list of shop ids, DO NOT crawl these`) + user_agent_f := flag.StringP("user-agent", "u", "", "set user agent") + delay_f := flag.Int("delay", 0, "enable and set delay in seconds between crawls (default 0)") + ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignore robots.txt") flag.Parse() loglevel := strings.ToLower(*loglevel_f) @@ -61,4 +62,11 @@ func init() { if "" != *shopids_f { _conf.ShopIDs = strings.Split(*shopids_f, ",") } + if "" != *not_shopids_f { + _conf.ExcludeShopIDs = strings.Split(*not_shopids_f, ",") + } + + if "" != *shopids_f && "" != *not_shopids_f { + log.Fatal("init.go: Config error: Cannot use both flags --exclude-shop and --only-shop at the same time.") + } } diff --git a/crawler/shops.go b/crawler/shops.go index 79eff96..224ae96 100644 --- a/crawler/shops.go +++ b/crawler/shops.go @@ -113,6 +113,11 @@ func (app *App) getShops() ([]Shop, error) { shop_query = " WHERE id IN (" + shopIDs + ")" } + } else if len(app.Config.ExcludeShopIDs) > 0 { + excludeShopIDs := strings.Join(app.Config.ExcludeShopIDs, `, `) + if excludeShopIDs != "" { + shop_query = " WHERE id NOT IN (" + excludeShopIDs + ")" + } } query := 
`SELECT id,name,short_url,url,logo_url,shipping_costs,free_shipping FROM shop ` + shop_query