diff options
| -rw-r--r-- | .gitignore | 7 | ||||
| -rw-r--r-- | Makefile | 20 | ||||
| -rw-r--r-- | config.go | 102 | ||||
| -rw-r--r-- | database.go | 200 | ||||
| -rw-r--r-- | getdata.go | 76 | ||||
| -rw-r--r-- | github.go | 5 | ||||
| -rw-r--r-- | init.go | 58 | ||||
| -rw-r--r-- | log.go | 53 | ||||
| -rw-r--r-- | main.go | 47 | ||||
| -rw-r--r-- | schema.sql | 40 | ||||
| -rw-r--r-- | scrape.go | 84 | ||||
| -rw-r--r-- | struct.go | 41 |
12 files changed, 733 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4e89156 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.test +*.html +*~ +*.swp +*.db +ghrss +config.json diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7126e13 --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +BINARY := $(notdir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST)))))) +MAINFILE := +SOURCEDIR := . +SOURCES := $(shell find $(SOURCEDIR) -name '*.go') + +all: build run + +build: $(BINARY) + +$(BINARY): $(SOURCES) + go build -o $(BINARY) + +run: build + ./$(BINARY) + +clean: + $(RM) $(RMFLAGS) $(BINARY) + +edit: + $(EDITOR) $(MAINFILE) *.go diff --git a/config.go b/config.go new file mode 100644 index 0000000..adbcb14 --- /dev/null +++ b/config.go @@ -0,0 +1,102 @@ +package main + +import ( + "os" + + log "github.com/Sirupsen/logrus" + "github.com/spf13/viper" +) + +type Config struct { + DBDriver string + DBDBName string + DBHost string + DBPort string + DBUser string + DBPassword string + DBOptions string + + UserAgent string + Delay int + IgnoreRobotsTXT bool + + Debug bool // sets log level to debug +} + +// Parses the configuration and sets the configuration struct. +func (c *Config) parseConfig(configFile string) { + + viper.SetDefault("DB_Driver", "mysql") + viper.SetDefault("DB_DBName", "alkobote") + viper.SetDefault("DB_Host", "localhost") + viper.SetDefault("DB_Port", "3306") + + viper.SetDefault("Debug", false) + viper.SetDefault("Delay", 0) + + // needs some refactoring to truly respect robots.txt + viper.SetDefault("IgnoreRobotsTXT", true) + + viper.SetDefault("UserAgent", "colly - a friendly crawler :)") + + // Name of the configuration file + viper.SetConfigName("config") + + // Where to find the config file + if configFile == "" { + viper.AddConfigPath("/etc/ghrss/") + viper.AddConfigPath(".") + viper.AddConfigPath("$HOME/app/ghrss/") + viper.AddConfigPath("$HOME/.config/ghrss/") + viper.AddConfigPath("$HOME/ghrss/") + } else { + stat, err := os.Stat(configFile) + if os.IsNotExist(err) { + // provided config file does not exist, so we add the path instead + viper.AddConfigPath(configFile) + } else if err == nil && stat.IsDir() { + // adds the path to look for the config file + viper.AddConfigPath(configFile) + } else if err == nil { + // directly sets the config file + viper.SetConfigFile(configFile) + } else { + // if we are here something went wrong + Warn(err, "config.go: os.Stat("+configFile+") error") + // adding the path nonetheless because it's not hurting + viper.AddConfigPath(configFile) + } + } + + // Env variables need to be prefixed with "ALKOBOTE_" + viper.SetEnvPrefix("ALKOBOTE") + + // Parses automatic the matching env variables + viper.AutomaticEnv() + + // Reads the config + err := viper.ReadInConfig() + if err != nil { + Fatal(err, "Config: Error parsing config file.") + } + log.Debug("Config: Config file used: " + viper.ConfigFileUsed()) + + c.setsConfig() +} + +// Actually sets the config struct +func (c *Config) setsConfig() { + c.DBDriver = viper.GetString("DB_Driver") + c.DBHost = viper.GetString("DB_Host") + c.DBPort = viper.GetString("DB_Port") + c.DBUser = viper.GetString("DB_User") + c.DBPassword = viper.GetString("DB_Password") + c.DBDBName = viper.GetString("DB_DBName") + c.DBOptions = viper.GetString("DB_Options") + + c.UserAgent = viper.GetString("UserAgent") + c.Delay = viper.GetInt("Delay") + c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT") + + c.Debug = viper.GetBool("Debug") +} diff --git a/database.go b/database.go new file mode 100644 index 0000000..044fa08 --- /dev/null +++ b/database.go @@ -0,0 +1,200 @@ +package main + +import ( +/* + "database/sql" + "fmt" + "strings" +*/ +) + +func (app *App) createTables() error { + /** + * Copied from schema.sql + * TODO: Load this from the file itself. + */ + query := ` +CREATE TABLE IF NOT EXISTS platform ( + id INT PRIMARY KEY AUTO_INCREMENT, + name VARCHAR(255) UNIQUE NOT NULL, + url VARCHAR(255) UNIQUE NOT NULL +) CHARSET=utf8; + +CREATE TABLE IF NOT EXISTS language ( + id INT PRIMARY KEY AUTO_INCREMENT, + name VARCHAR(255) UNIQUE NOT NULL +) CHARSET=utf8; + +CREATE TABLE IF NOT EXISTS update_period ( + id INT PRIMARY KEY AUTO_INCREMENT, + name VARCHAR(255) UNIQUE NOT NULL +) CHARSET=utf8; + +CREATE TABLE IF NOT EXISTS owner ( + id INT PRIMARY KEY AUTO_INCREMENT, + name VARCHAR(255) NOT NULL, + url VARCHAR(255) NOT NULL UNIQUE, + platform INT NOT NULL, + CONSTRAINT fk_owner_platform FOREIGN KEY (platform) REFERENCES platform(id) +) CHARSET=utf8; + +CREATE TABLE IF NOT EXISTS entry ( + id INT PRIMARY KEY AUTO_INCREMENT, + title VARCHAR(255) NOT NULL, + synopsis VARCHAR(255) NOT NULL, + owner INT NOT NULL, + platform INT NOT NULL, + url VARCHAR(255) UNIQUE NOT NULL, + language INT NOT NULL, + stars VARCHAR(255) NOT NULL, + update_period INT NOT NULL, + created_at TIMESTAMP NOT NULL, + CONSTRAINT fk_entry_owner FOREIGN KEY (owner) REFERENCES owner(id), + CONSTRAINT fk_entry_platform FOREIGN KEY (platform) REFERENCES platform(id), + CONSTRAINT fk_entry_language FOREIGN KEY (language) REFERENCES language(id), + CONSTRAINT fk_entry_period FOREIGN KEY (update_period) REFERENCES update_period(id) +) CHARSET=utf8; + ` + + _, err := app.DB.Exec(query) + if err != nil { + return err + } + + /** + * Populates database with first entries. + * TODO: Make this customizable. + */ + init_platform_query := ` + INSERT IGNORE INTO platform (id, name, url) VALUES ( + NULL, + "Github", + "https://github.com/trending" + ); + ` + _, err = app.DB.Exec(init_platform_query) + if err != nil { + return err + } + + init_language_query := ` + INSERT IGNORE INTO language (name) VALUES ( + "Go" + ),( + "PHP" + ), ( + "Javascript" + ), ( + "CSS" + ), ( + "HTML" + ), ( + "Java" + ), ( + "SQL" + ), ( + "Python" + ); + ` + _, err = app.DB.Exec(init_language_query) + if err != nil { + return err + } + + init_period_query := ` + INSERT IGNORE INTO update_period (name) VALUES ( + "Daily" + ),( + "Weekly" + ), ( + "Monthly" + ); + ` + _, err = app.DB.Exec(init_period_query) + if err != nil { + return err + } + + return nil +} + +func (app *App) SaveEntries(entries []Entry) error { + query := ` + INSERT IGNORE INTO entry ( + id, + title, + synopsis, + owner, + platform, + url, + language, + stars, + update_period, + created_at + ) VALUES ( + NULL, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ); + ` + stmt, err := app.DB.Prepare(query) + if err != nil { + Warn(err, "SaveEntries: Preparing query failed") + return err + } + defer stmt.Close() + + for _, e := range entries { + + err = app.SaveOwner(*e.Owner) + if err != nil { + continue + } + _, err = stmt.Exec(e.Title, e.Synopsis, e.Owner.ID, e.Platform.ID, e.URL, e.Language.ID, e.Stars, e.UpdatePeriod.ID, app.Now) + + if err != nil { + Warn(err, "SaveEntries: Statement execution failed") + return err + } + + } + + return nil +} + +func (app *App) SaveOwner(owner Owner) error { + query := ` + INSERT IGNORE owner ( + id, + name, + url, + platform + ) VALUES ( + NULL, + ?, + ?, + ?, + );` + + stmt, err := app.DB.Prepare(query) + if err != nil { + Warn(err, "SaveOwner: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(owner.Name, owner.URL, owner.Platform.ID) + if err != nil { + Warn(err, "SaveOwner: Statement execution failed") + return err + } + + return nil +} diff --git a/getdata.go b/getdata.go new file mode 100644 index 0000000..f9e5844 --- /dev/null +++ b/getdata.go @@ -0,0 +1,76 @@ +package main + +func (app *App) GetPlatforms() []Platform { + + platforms := []Platform{} + + query := ` + SELECT id, name, FROM Platform; + ` + + rows, err := app.DB.Queryx(query) + if err != nil { + Fatal(err, "GetPlatforms: Query failed") + } + for rows.Next() { + var p Platform + err = rows.StructScan(&p) + if err != nil { + Fatal(err, "GetPlatforms: StructScan failed") + } + + platforms = append(platforms, p) + } + + return platforms +} + +func (app *App) GetLanguages() []Language { + + languages := []Language{} + + query := ` + SELECT id, name, FROM language; + ` + + rows, err := app.DB.Queryx(query) + if err != nil { + Fatal(err, "GetLanguages: Query failed") + } + for rows.Next() { + var l Language + err = rows.StructScan(&l) + if err != nil { + Fatal(err, "GetLanguages: StructScan failed") + } + + languages = append(languages, l) + } + + return languages +} + +func (app *App) GetUpdatePeriods() []UpdatePeriod { + + periods := []UpdatePeriod{} + + query := ` + SELECT id, name, FROM update_period; + ` + + rows, err := app.DB.Queryx(query) + if err != nil { + Fatal(err, "GetUpdatePeriods: Query failed") + } + for rows.Next() { + var p UpdatePeriod + err = rows.StructScan(&p) + if err != nil { + Fatal(err, "GetUpdatePeriods: StructScan failed") + } + + periods = append(periods, p) + } + + return periods +} diff --git a/github.go b/github.go new file mode 100644 index 0000000..f673d1d --- /dev/null +++ b/github.go @@ -0,0 +1,5 @@ +package main + +func (app *App) ScrapeGithub(platform Platform) []Entry { + return []Entry{} +} @@ -0,0 +1,58 @@ +package main + +import ( + "errors" + "strings" + + log "github.com/Sirupsen/logrus" + flag "github.com/spf13/pflag" +) + +// global config, gets overwritten by main +var _conf Config + +func init() { + // overwrites unhelpful error message + flag.ErrHelp = errors.New("") + + // we need to parse the config because of log level setting + configFile := flag.StringP("config", "c", "", "path to config file") + debug := flag.BoolP("debug", "d", false, "set log level to \"Debug\"") + verbose := flag.BoolP("verbose", "v", false, "set log level to \"Debug\", same as --debug") + silent := flag.BoolP("silent", "s", false, "suppress output except warnings") + loglevel_f := flag.String("loglevel", "Warn", `set log level, can be "Warn", "Info" or "Debug"`) + user_agent_f := flag.StringP("user-agent", "u", "", "set user agent") + delay_f := flag.Int("delay", 0, "enable and set delay in seconds between crawls (default 0)") + ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignore robots.txt") + + flag.Parse() + loglevel := strings.ToLower(*loglevel_f) + + if *debug || *verbose || loglevel == "debug" { + log.SetLevel(log.DebugLevel) + } else if loglevel == "info" { + log.SetLevel(log.InfoLevel) + } else { + log.SetLevel(log.WarnLevel) + } + + if *silent { + log.SetLevel(log.WarnLevel) + } + + _conf.parseConfig(*configFile) + + if *user_agent_f != "" { + _conf.UserAgent = *user_agent_f + } + if *delay_f != 0 { + _conf.Delay = *delay_f + } + if !*ignore_robots_f { + _conf.IgnoreRobotsTXT = *ignore_robots_f + } + + if _conf.Debug && !*silent { + log.SetLevel(log.DebugLevel) + } +} @@ -0,0 +1,53 @@ +package main + +import ( + log "github.com/Sirupsen/logrus" +) + +func Fatal(err error, msg string) { + if err != nil { + log.WithFields( + log.Fields{ + "error": err.Error(), + }, + ).Fatal(msg) + } else { + log.Fatal(msg) + } +} + +func Println(err error, msg string) { + if err != nil { + log.WithFields( + log.Fields{ + "error": err.Error(), + }, + ).Println(msg) + } else { + log.Println(msg) + } +} + +func Debug(err error, msg string) { + if err != nil { + log.WithFields( + log.Fields{ + "error": err.Error(), + }, + ).Debug(msg) + } else { + log.Debug(msg) + } +} + +func Warn(err error, msg string) { + if err != nil { + log.WithFields( + log.Fields{ + "error": err.Error(), + }, + ).Warn(msg) + } else { + log.Warn(msg) + } +} @@ -0,0 +1,47 @@ +package main + +import ( + "fmt" + "time" + + log "github.com/Sirupsen/logrus" + //"github.com/gocolly/colly" + "github.com/jmoiron/sqlx" +) + +type App struct { + Config *Config + DB *sqlx.DB + Now int64 + Debug bool +} + +func main() { + var err error + + // copy global config to avoid woring with globals + _own_config := _conf + app := App{Config: &_own_config} + // overwrite the global + _conf = Config{} + + app.Now = time.Now().Unix() + + log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)) + + app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions) + if err != nil { + Fatal(err, "Cannot connect to database") + } + + if err = app.DB.Ping(); err != nil { + Fatal(err, "No connection to database") + } + defer app.DB.Close() + + err = app.createTables() + if err != nil { + Fatal(err, "Creating table failed") + } + +} diff --git a/schema.sql b/schema.sql new file mode 100644 index 0000000..7048798 --- /dev/null +++ b/schema.sql @@ -0,0 +1,40 @@ +CREATE TABLE IF NOT EXISTS platform ( + id INT PRIMARY KEY AUTO_INCREMENT, + name VARCHAR(255) UNIQUE NOT NULL, + url VARCHAR(255) UNIQUE NOT NULL +) CHARSET=utf8; + +CREATE TABLE IF NOT EXISTS language ( + id INT PRIMARY KEY AUTO_INCREMENT, + name VARCHAR(255) UNIQUE NOT NULL +) CHARSET=utf8; + +CREATE TABLE IF NOT EXISTS update_period ( + id INT PRIMARY KEY AUTO_INCREMENT, + name VARCHAR(255) UNIQUE NOT NULL +) CHARSET=utf8; + +CREATE TABLE IF NOT EXISTS owner ( + id INT PRIMARY KEY AUTO_INCREMENT, + name VARCHAR(255) NOT NULL, + url VARCHAR(255) NOT NULL UNIQUE, + platform INT NOT NULL, + CONSTRAINT fk_owner_platform FOREIGN KEY (platform) REFERENCES platform(id) +) CHARSET=utf8; + +CREATE TABLE IF NOT EXISTS entry ( + id INT PRIMARY KEY AUTO_INCREMENT, + title VARCHAR(255) NOT NULL, + synopsis VARCHAR(255) NOT NULL, + owner INT NOT NULL, + platform INT NOT NULL, + url VARCHAR(255) UNIQUE NOT NULL, + language INT NOT NULL, + stars VARCHAR(255) NOT NULL, + update_period INT NOT NULL, + created_at TIMESTAMP NOT NULL, + CONSTRAINT fk_entry_owner FOREIGN KEY (owner) REFERENCES owner(id), + CONSTRAINT fk_entry_platform FOREIGN KEY (platform) REFERENCES platform(id), + CONSTRAINT fk_entry_language FOREIGN KEY (language) REFERENCES language(id), + CONSTRAINT fk_entry_period FOREIGN KEY (update_period) REFERENCES update_period(id) +) CHARSET=utf8; diff --git a/scrape.go b/scrape.go new file mode 100644 index 0000000..ebb02fa --- /dev/null +++ b/scrape.go @@ -0,0 +1,84 @@ +package main + +import ( + "time" + + log "github.com/Sirupsen/logrus" + "github.com/gocolly/colly" +) + +func (app *App) ScrapeHTML(platforms []Platform) { + + wait := make(chan bool) + count := 0 + + for _, platform := range platforms { + + go app.Scrape(platform, wait) + count++ + + } + + // Wait until all go routines finished + for i := 0; i < count; i++ { + <-wait + } +} + +func (app *App) Scrape(platform Platform, wait chan bool) { + var Entries []Entry + var err error + + // retry on error + for i := 1; i < 4; i++ { + Entries = app.ScrapePlatform(platform) + + if len(Entries) >= 1 { + break + } + } + + // if no results, return early + if len(Entries) == 0 { + wait <- true + return + + } + + err = app.SaveEntries(Entries) + if err != nil { + Warn(err, "Saving entries failed. Platform: "+platform.Name) + } + + wait <- true +} + +func (app *App) ScrapePlatform(platform Platform) []Entry { + + switch platform.Name { + case "Github": + return app.ScrapeGithub(platform) + default: + log.Println(platform.Name + ": No Crawler") + } + + return []Entry{} +} + +/* + * Sets the crawler config. + */ +func (app *App) customCollector(allowed_urls []string) *colly.Collector { + c := colly.NewCollector( + colly.UserAgent(app.Config.UserAgent), + colly.AllowedDomains(allowed_urls...), + ) + c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT + + c.Limit(&colly.LimitRule{ + DomainGlob: "*", + RandomDelay: time.Duration(app.Config.Delay) * time.Second, + }) + + return c +} diff --git a/struct.go b/struct.go new file mode 100644 index 0000000..ccd2dc3 --- /dev/null +++ b/struct.go @@ -0,0 +1,41 @@ +package main + +import ( + "time" +) + +type Platform struct { + ID int + Name string + URL string +} + +type Language struct { + ID int + Name string +} + +type UpdatePeriod struct { + ID int + Name string +} + +type Owner struct { + ID int + Name string + URL string + Platform *Platform +} + +type Entry struct { + ID int + Title string + Synopsis string + Owner *Owner + Platform *Platform + URL string + Language *Language + Stars int + UpdatePeriod *UpdatePeriod + Created_At time.Time +} |
