From 4dc18e3691127e058833fd9c7a5bbee333c3a66c Mon Sep 17 00:00:00 2001
From: Max
Date: Wed, 17 Apr 2019 09:44:56 +0200
Subject: Initial commit.

---
Note: reviewed revision of the patch. The blob ids on the "index" lines are
those of the original commit and do not describe the revised file contents.

 .gitignore  |   7 +++
 Makefile    |  22 +++++++
 config.go   | 108 ++++++++++++++++++++++++++++++++
 database.go | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 getdata.go  |  79 ++++++++++++++++++++++++
 github.go   |   6 ++
 init.go     |  61 ++++++++++++++++++
 log.go      |  45 ++++++++++++++
 main.go     |  51 +++++++++++++++
 schema.sql  |  40 ++++++++++++
 scrape.go   |  82 ++++++++++++++++++++++++
 struct.go   |  46 ++++++++++++++
 12 files changed, 749 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 config.go
 create mode 100644 database.go
 create mode 100644 getdata.go
 create mode 100644 github.go
 create mode 100644 init.go
 create mode 100644 log.go
 create mode 100644 main.go
 create mode 100644 schema.sql
 create mode 100644 scrape.go
 create mode 100644 struct.go

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4e89156
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+.test
+*.html
+*~
+*.swp
+*.db
+ghrss
+config.json
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7126e13
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,22 @@
+BINARY := $(notdir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
+MAINFILE :=
+SOURCEDIR := .
+SOURCES := $(shell find $(SOURCEDIR) -name '*.go')
+
+.PHONY: all build run clean edit
+
+all: build run
+
+build: $(BINARY)
+
+$(BINARY): $(SOURCES)
+	go build -o $(BINARY)
+
+run: build
+	./$(BINARY)
+
+clean:
+	$(RM) $(RMFLAGS) $(BINARY)
+
+edit:
+	$(EDITOR) $(MAINFILE) *.go
diff --git a/config.go b/config.go
new file mode 100644
index 0000000..adbcb14
--- /dev/null
+++ b/config.go
@@ -0,0 +1,108 @@
+package main
+
+import (
+	"os"
+
+	log "github.com/Sirupsen/logrus"
+	"github.com/spf13/viper"
+)
+
+// Config holds all runtime settings. parseConfig fills it from defaults,
+// the config file and environment variables.
+type Config struct {
+	DBDriver   string
+	DBDBName   string
+	DBHost     string
+	DBPort     string
+	DBUser     string
+	DBPassword string
+	DBOptions  string
+
+	UserAgent       string
+	Delay           int
+	IgnoreRobotsTXT bool
+
+	Debug bool // sets log level to debug
+}
+
+// parseConfig registers the defaults, reads the config file and copies the
+// result into c. configFile may be empty (the default directories are
+// searched), a directory to search or the path of the config file itself.
+// A config file that cannot be read is fatal.
+func (c *Config) parseConfig(configFile string) {
+	viper.SetDefault("DB_Driver", "mysql")
+	viper.SetDefault("DB_DBName", "alkobote")
+	viper.SetDefault("DB_Host", "localhost")
+	viper.SetDefault("DB_Port", "3306")
+
+	viper.SetDefault("Debug", false)
+	viper.SetDefault("Delay", 0)
+
+	// needs some refactoring to truly respect robots.txt
+	viper.SetDefault("IgnoreRobotsTXT", true)
+
+	viper.SetDefault("UserAgent", "colly - a friendly crawler :)")
+
+	// Name of the configuration file
+	viper.SetConfigName("config")
+
+	// Where to find the config file
+	if configFile == "" {
+		viper.AddConfigPath("/etc/ghrss/")
+		viper.AddConfigPath(".")
+		viper.AddConfigPath("$HOME/app/ghrss/")
+		viper.AddConfigPath("$HOME/.config/ghrss/")
+		viper.AddConfigPath("$HOME/ghrss/")
+	} else {
+		stat, err := os.Stat(configFile)
+		if os.IsNotExist(err) {
+			// provided config file does not exist, so we add the path instead
+			viper.AddConfigPath(configFile)
+		} else if err == nil && stat.IsDir() {
+			// adds the path to look for the config file
+			viper.AddConfigPath(configFile)
+		} else if err == nil {
+			// directly sets the config file
+			viper.SetConfigFile(configFile)
+		} else {
+			// if we are here something went wrong
+			Warn(err, "config.go: os.Stat("+configFile+") error")
+			// adding the path nonetheless because it's not hurting
+			viper.AddConfigPath(configFile)
+		}
+	}
+
+	// Env variables need to be prefixed with "ALKOBOTE_"
+	// NOTE(review): the prefix and the "alkobote" default database name do
+	// not match the "ghrss" config paths above — confirm they are intended.
+	viper.SetEnvPrefix("ALKOBOTE")
+
+	// Automatically picks up the matching env variables
+	viper.AutomaticEnv()
+
+	// Reads the config
+	err := viper.ReadInConfig()
+	if err != nil {
+		Fatal(err, "Config: Error parsing config file.")
+	}
+	log.Debug("Config: Config file used: " + viper.ConfigFileUsed())
+
+	c.setsConfig()
+}
+
+// setsConfig copies the values resolved by viper into the struct.
+func (c *Config) setsConfig() {
+	c.DBDriver = viper.GetString("DB_Driver")
+	c.DBHost = viper.GetString("DB_Host")
+	c.DBPort = viper.GetString("DB_Port")
+	c.DBUser = viper.GetString("DB_User")
+	c.DBPassword = viper.GetString("DB_Password")
+	c.DBDBName = viper.GetString("DB_DBName")
+	c.DBOptions = viper.GetString("DB_Options")
+
+	c.UserAgent = viper.GetString("UserAgent")
+	c.Delay = viper.GetInt("Delay")
+	c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT")
+
+	c.Debug = viper.GetBool("Debug")
+}
diff --git a/database.go b/database.go
new file mode 100644
index 0000000..044fa08
--- /dev/null
+++ b/database.go
@@ -0,0 +1,202 @@
+package main
+
+import (
+	"errors"
+	/*
+		"database/sql"
+		"fmt"
+		"strings"
+	*/
+)
+
+// createTables creates the schema if it is missing and seeds the lookup
+// tables.
+//
+// Every statement is sent on its own: go-sql-driver/mysql, for instance,
+// only accepts several statements per query when the DSN sets
+// multiStatements=true, which DB_Options is not guaranteed to contain.
+func (app *App) createTables() error {
+	statements := []string{
+		/**
+		 * Copied from schema.sql
+		 * TODO: Load this from the file itself.
+		 */
+		`CREATE TABLE IF NOT EXISTS platform (
+  id INT PRIMARY KEY AUTO_INCREMENT,
+  name VARCHAR(255) UNIQUE NOT NULL,
+  url VARCHAR(255) UNIQUE NOT NULL
+) CHARSET=utf8;`,
+
+		`CREATE TABLE IF NOT EXISTS language (
+  id INT PRIMARY KEY AUTO_INCREMENT,
+  name VARCHAR(255) UNIQUE NOT NULL
+) CHARSET=utf8;`,
+
+		`CREATE TABLE IF NOT EXISTS update_period (
+  id INT PRIMARY KEY AUTO_INCREMENT,
+  name VARCHAR(255) UNIQUE NOT NULL
+) CHARSET=utf8;`,
+
+		`CREATE TABLE IF NOT EXISTS owner (
+  id INT PRIMARY KEY AUTO_INCREMENT,
+  name VARCHAR(255) NOT NULL,
+  url VARCHAR(255) NOT NULL UNIQUE,
+  platform INT NOT NULL,
+  CONSTRAINT fk_owner_platform FOREIGN KEY (platform) REFERENCES platform(id)
+) CHARSET=utf8;`,
+
+		`CREATE TABLE IF NOT EXISTS entry (
+  id INT PRIMARY KEY AUTO_INCREMENT,
+  title VARCHAR(255) NOT NULL,
+  synopsis VARCHAR(255) NOT NULL,
+  owner INT NOT NULL,
+  platform INT NOT NULL,
+  url VARCHAR(255) UNIQUE NOT NULL,
+  language INT NOT NULL,
+  stars VARCHAR(255) NOT NULL,
+  update_period INT NOT NULL,
+  created_at TIMESTAMP NOT NULL,
+  CONSTRAINT fk_entry_owner FOREIGN KEY (owner) REFERENCES owner(id),
+  CONSTRAINT fk_entry_platform FOREIGN KEY (platform) REFERENCES platform(id),
+  CONSTRAINT fk_entry_language FOREIGN KEY (language) REFERENCES language(id),
+  CONSTRAINT fk_entry_period FOREIGN KEY (update_period) REFERENCES update_period(id)
+) CHARSET=utf8;`,
+
+		/**
+		 * Populates database with first entries.
+		 * TODO: Make this customizable.
+		 */
+		`INSERT IGNORE INTO platform (id, name, url) VALUES (
+  NULL,
+  "Github",
+  "https://github.com/trending"
+);`,
+
+		`INSERT IGNORE INTO language (name) VALUES
+  ("Go"),
+  ("PHP"),
+  ("Javascript"),
+  ("CSS"),
+  ("HTML"),
+  ("Java"),
+  ("SQL"),
+  ("Python");`,
+
+		`INSERT IGNORE INTO update_period (name) VALUES
+  ("Daily"),
+  ("Weekly"),
+  ("Monthly");`,
+	}
+
+	for _, statement := range statements {
+		if _, err := app.DB.Exec(statement); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// SaveEntries stores the owner of every entry, then the entry itself.
+// Rows that collide with an existing one are ignored (INSERT IGNORE).
+//
+// Entries without owner, platform, language or update period are skipped
+// with a warning, as are entries whose owner cannot be saved. A failure to
+// prepare or execute the entry statement aborts and is returned.
+func (app *App) SaveEntries(entries []Entry) error {
+	// created_at is a TIMESTAMP column while app.Now holds Unix seconds,
+	// hence the conversion inside the statement.
+	query := `
+	INSERT IGNORE INTO entry (
+		id,
+		title,
+		synopsis,
+		owner,
+		platform,
+		url,
+		language,
+		stars,
+		update_period,
+		created_at
+	) VALUES (
+		NULL,
+		?,
+		?,
+		?,
+		?,
+		?,
+		?,
+		?,
+		?,
+		FROM_UNIXTIME(?)
+	);
+	`
+	stmt, err := app.DB.Prepare(query)
+	if err != nil {
+		Warn(err, "SaveEntries: Preparing query failed")
+		return err
+	}
+	defer stmt.Close()
+
+	for _, e := range entries {
+		if e.Owner == nil || e.Platform == nil || e.Language == nil || e.UpdatePeriod == nil {
+			Warn(nil, "SaveEntries: Skipping incomplete entry: "+e.URL)
+			continue
+		}
+
+		// SaveOwner logs its own failures; one bad owner must not stop
+		// the remaining entries from being saved.
+		// NOTE(review): SaveOwner does not hand back the id of the stored
+		// row, so e.Owner.ID has to be set by the caller — confirm.
+		if err = app.SaveOwner(*e.Owner); err != nil {
+			continue
+		}
+
+		_, err = stmt.Exec(e.Title, e.Synopsis, e.Owner.ID, e.Platform.ID, e.URL, e.Language.ID, e.Stars, e.UpdatePeriod.ID, app.Now)
+		if err != nil {
+			Warn(err, "SaveEntries: Statement execution failed")
+			return err
+		}
+	}
+
+	return nil
+}
+
+// SaveOwner stores owner; a row that collides with an existing one is
+// ignored (INSERT IGNORE). It fails when the owner has no platform or when
+// the statement cannot be prepared or executed.
+func (app *App) SaveOwner(owner Owner) error {
+	if owner.Platform == nil {
+		err := errors.New("owner has no platform")
+		Warn(err, "SaveOwner: Incomplete owner: "+owner.URL)
+		return err
+	}
+
+	query := `
+	INSERT IGNORE INTO owner (
+		id,
+		name,
+		url,
+		platform
+	) VALUES (
+		NULL,
+		?,
+		?,
+		?
+	);`
+
+	stmt, err := app.DB.Prepare(query)
+	if err != nil {
+		Warn(err, "SaveOwner: Preparing query failed")
+		return err
+	}
+	defer stmt.Close()
+
+	_, err = stmt.Exec(owner.Name, owner.URL, owner.Platform.ID)
+	if err != nil {
+		Warn(err, "SaveOwner: Statement execution failed")
+		return err
+	}
+
+	return nil
+}
diff --git a/getdata.go b/getdata.go
new file mode 100644
index 0000000..f9e5844
--- /dev/null
+++ b/getdata.go
@@ -0,0 +1,79 @@
+package main
+
+// GetPlatforms returns all rows of the platform table.
+// Any database error is fatal.
+func (app *App) GetPlatforms() []Platform {
+	platforms := []Platform{}
+
+	rows, err := app.DB.Queryx(`SELECT id, name, url FROM platform;`)
+	if err != nil {
+		Fatal(err, "GetPlatforms: Query failed")
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		var p Platform
+		if err = rows.StructScan(&p); err != nil {
+			Fatal(err, "GetPlatforms: StructScan failed")
+		}
+		platforms = append(platforms, p)
+	}
+	// Next also returns false when the iteration stopped on an error.
+	if err = rows.Err(); err != nil {
+		Fatal(err, "GetPlatforms: Reading rows failed")
+	}
+
+	return platforms
+}
+
+// GetLanguages returns all rows of the language table.
+// Any database error is fatal.
+func (app *App) GetLanguages() []Language {
+	languages := []Language{}
+
+	rows, err := app.DB.Queryx(`SELECT id, name FROM language;`)
+	if err != nil {
+		Fatal(err, "GetLanguages: Query failed")
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		var l Language
+		if err = rows.StructScan(&l); err != nil {
+			Fatal(err, "GetLanguages: StructScan failed")
+		}
+		languages = append(languages, l)
+	}
+	// Next also returns false when the iteration stopped on an error.
+	if err = rows.Err(); err != nil {
+		Fatal(err, "GetLanguages: Reading rows failed")
+	}
+
+	return languages
+}
+
+// GetUpdatePeriods returns all rows of the update_period table.
+// Any database error is fatal.
+func (app *App) GetUpdatePeriods() []UpdatePeriod {
+	periods := []UpdatePeriod{}
+
+	rows, err := app.DB.Queryx(`SELECT id, name FROM update_period;`)
+	if err != nil {
+		Fatal(err, "GetUpdatePeriods: Query failed")
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		var p UpdatePeriod
+		if err = rows.StructScan(&p); err != nil {
+			Fatal(err, "GetUpdatePeriods: StructScan failed")
+		}
+		periods = append(periods, p)
+	}
+	// Next also returns false when the iteration stopped on an error.
+	if err = rows.Err(); err != nil {
+		Fatal(err, "GetUpdatePeriods: Reading rows failed")
+	}
+
+	return periods
+}
diff --git a/github.go b/github.go
new file mode 100644
index 0000000..f673d1d
--- /dev/null
+++ b/github.go
@@ -0,0 +1,6 @@
+package main
+
+// ScrapeGithub is a stub for now: it always returns an empty list.
+func (app *App) ScrapeGithub(platform Platform) []Entry {
+	return []Entry{}
+}
diff --git a/init.go b/init.go
new file mode 100644
index 0000000..7ce40c5
--- /dev/null
+++ b/init.go
@@ -0,0 +1,61 @@
+package main
+
+import (
+	"errors"
+	"strings"
+
+	log "github.com/Sirupsen/logrus"
+	flag "github.com/spf13/pflag"
+)
+
+// global config, gets overwritten by main
+var _conf Config
+
+// init parses the command line, sets the log level and loads the
+// configuration into _conf; some flags then override the loaded values.
+func init() {
+	// overwrites unhelpful error message
+	flag.ErrHelp = errors.New("")
+
+	// we need to parse the config because of log level setting
+	configFile := flag.StringP("config", "c", "", "path to config file")
+	debug := flag.BoolP("debug", "d", false, "set log level to \"Debug\"")
+	verbose := flag.BoolP("verbose", "v", false, "set log level to \"Debug\", same as --debug")
+	silent := flag.BoolP("silent", "s", false, "suppress output except warnings")
+	logLevelFlag := flag.String("loglevel", "Warn", `set log level, can be "Warn", "Info" or "Debug"`)
+	userAgentFlag := flag.StringP("user-agent", "u", "", "set user agent")
+	delayFlag := flag.Int("delay", 0, "enable and set delay in seconds between crawls (default 0)")
+	ignoreRobotsFlag := flag.Bool("ignore-robots-txt", true, "ignore robots.txt")
+
+	flag.Parse()
+	logLevel := strings.ToLower(*logLevelFlag)
+
+	// --silent wins over every other log level switch
+	switch {
+	case *silent:
+		log.SetLevel(log.WarnLevel)
+	case *debug || *verbose || logLevel == "debug":
+		log.SetLevel(log.DebugLevel)
+	case logLevel == "info":
+		log.SetLevel(log.InfoLevel)
+	default:
+		log.SetLevel(log.WarnLevel)
+	}
+
+	_conf.parseConfig(*configFile)
+
+	if *userAgentFlag != "" {
+		_conf.UserAgent = *userAgentFlag
+	}
+	if *delayFlag != 0 {
+		_conf.Delay = *delayFlag
+	}
+	if !*ignoreRobotsFlag {
+		_conf.IgnoreRobotsTXT = *ignoreRobotsFlag
+	}
+
+	// the config file may enable debug logging, too
+	if _conf.Debug && !*silent {
+		log.SetLevel(log.DebugLevel)
+	}
+}
diff --git a/log.go b/log.go
new file mode 100644
index 0000000..a367d3d
--- /dev/null
+++ b/log.go
@@ -0,0 +1,45 @@
+package main
+
+import (
+	log "github.com/Sirupsen/logrus"
+)
+
+// Fatal logs msg at fatal level, adding err as the "error" field when it is
+// not nil. logrus exits the program afterwards.
+func Fatal(err error, msg string) {
+	if err == nil {
+		log.Fatal(msg)
+		return
+	}
+	log.WithFields(log.Fields{"error": err.Error()}).Fatal(msg)
+}
+
+// Println logs msg through logrus' Println, adding err as the "error" field
+// when it is not nil.
+func Println(err error, msg string) {
+	if err == nil {
+		log.Println(msg)
+		return
+	}
+	log.WithFields(log.Fields{"error": err.Error()}).Println(msg)
+}
+
+// Debug logs msg at debug level, adding err as the "error" field when it is
+// not nil.
+func Debug(err error, msg string) {
+	if err == nil {
+		log.Debug(msg)
+		return
+	}
+	log.WithFields(log.Fields{"error": err.Error()}).Debug(msg)
+}
+
+// Warn logs msg at warning level, adding err as the "error" field when it is
+// not nil.
+func Warn(err error, msg string) {
+	if err == nil {
+		log.Warn(msg)
+		return
+	}
+	log.WithFields(log.Fields{"error": err.Error()}).Warn(msg)
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..39fbe49
--- /dev/null
+++ b/main.go
@@ -0,0 +1,51 @@
+package main
+
+import (
+	"fmt"
+	"time"
+
+	log "github.com/Sirupsen/logrus"
+	//"github.com/gocolly/colly"
+	"github.com/jmoiron/sqlx"
+)
+
+// App bundles the state shared by all parts of the program.
+type App struct {
+	Config *Config
+	DB     *sqlx.DB
+	Now    int64 // program start in Unix seconds, stored as created_at
+	Debug  bool
+}
+
+// main connects to the database and makes sure the schema exists.
+func main() {
+	var err error
+
+	// copy global config to avoid working with globals
+	ownConfig := _conf
+	app := App{Config: &ownConfig}
+	// overwrite the global
+	_conf = Config{}
+
+	app.Now = time.Now().Unix()
+
+	log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions))
+
+	// NOTE(review): no file of this commit imports a database/sql driver;
+	// sqlx.Connect only works if one is registered under DB_Driver —
+	// confirm that the driver import is added elsewhere.
+	app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions)
+	if err != nil {
+		Fatal(err, "Cannot connect to database")
+	}
+	defer app.DB.Close()
+
+	if err = app.DB.Ping(); err != nil {
+		Fatal(err, "No connection to database")
+	}
+
+	err = app.createTables()
+	if err != nil {
+		Fatal(err, "Creating table failed")
+	}
+}
diff --git a/schema.sql b/schema.sql
new file mode 100644
index 0000000..7048798
--- /dev/null
+++ b/schema.sql
@@ -0,0 +1,40 @@
+CREATE TABLE IF NOT EXISTS platform (
+  id INT PRIMARY KEY AUTO_INCREMENT,
+  name VARCHAR(255) UNIQUE NOT NULL,
+  url VARCHAR(255) UNIQUE NOT NULL
+) CHARSET=utf8;
+
+CREATE TABLE IF NOT EXISTS language (
+  id INT PRIMARY KEY AUTO_INCREMENT,
+  name VARCHAR(255) UNIQUE NOT NULL
+) CHARSET=utf8;
+
+CREATE TABLE IF NOT EXISTS update_period (
+  id INT PRIMARY KEY AUTO_INCREMENT,
+  name VARCHAR(255) UNIQUE NOT NULL
+) CHARSET=utf8;
+
+CREATE TABLE IF NOT EXISTS owner (
+  id INT PRIMARY KEY AUTO_INCREMENT,
+  name VARCHAR(255) NOT NULL,
+  url VARCHAR(255) NOT NULL UNIQUE,
+  platform INT NOT NULL,
+  CONSTRAINT fk_owner_platform FOREIGN KEY (platform) REFERENCES platform(id)
+) CHARSET=utf8;
+
+CREATE TABLE IF NOT EXISTS entry (
+  id INT PRIMARY KEY AUTO_INCREMENT,
+  title VARCHAR(255) NOT NULL,
+  synopsis VARCHAR(255) NOT NULL,
+  owner INT NOT NULL,
+  platform INT NOT NULL,
+  url VARCHAR(255) UNIQUE NOT NULL,
+  language INT NOT NULL,
+  stars VARCHAR(255) NOT NULL,
+  update_period INT NOT NULL,
+  created_at TIMESTAMP NOT NULL,
+  CONSTRAINT fk_entry_owner FOREIGN KEY (owner) REFERENCES owner(id),
+  CONSTRAINT fk_entry_platform FOREIGN KEY (platform) REFERENCES platform(id),
+  CONSTRAINT fk_entry_language FOREIGN KEY (language) REFERENCES language(id),
+  CONSTRAINT fk_entry_period FOREIGN KEY (update_period) REFERENCES update_period(id)
+) CHARSET=utf8;
diff --git a/scrape.go b/scrape.go
new file mode 100644
index 0000000..ebb02fa
--- /dev/null
+++ b/scrape.go
@@ -0,0 +1,82 @@
+package main
+
+import (
+	"time"
+
+	log "github.com/Sirupsen/logrus"
+	"github.com/gocolly/colly"
+)
+
+// ScrapeHTML scrapes all platforms concurrently and returns once every
+// scraper has finished.
+func (app *App) ScrapeHTML(platforms []Platform) {
+	wait := make(chan bool)
+
+	for _, platform := range platforms {
+		go app.Scrape(platform, wait)
+	}
+
+	// Wait until all go routines finished
+	for range platforms {
+		<-wait
+	}
+}
+
+// Scrape scrapes a single platform, trying up to three times until entries
+// are found, and saves them. It sends exactly one value on wait when done.
+func (app *App) Scrape(platform Platform, wait chan bool) {
+	// signals completion on every return path
+	defer func() { wait <- true }()
+
+	var entries []Entry
+
+	// retry on error
+	for attempt := 1; attempt <= 3; attempt++ {
+		entries = app.ScrapePlatform(platform)
+		if len(entries) > 0 {
+			break
+		}
+	}
+
+	// if no results, return early
+	if len(entries) == 0 {
+		return
+	}
+
+	if err := app.SaveEntries(entries); err != nil {
+		Warn(err, "Saving entries failed. Platform: "+platform.Name)
+	}
+}
+
+// ScrapePlatform runs the scraper registered for platform.Name. Platforms
+// without a scraper are logged and yield an empty list.
+func (app *App) ScrapePlatform(platform Platform) []Entry {
+	switch platform.Name {
+	case "Github":
+		return app.ScrapeGithub(platform)
+	default:
+		log.Println(platform.Name + ": No Crawler")
+	}
+
+	return []Entry{}
+}
+
+// customCollector returns a collector restricted to allowed_urls that uses
+// the configured user agent, robots.txt policy and random delay.
+func (app *App) customCollector(allowed_urls []string) *colly.Collector {
+	c := colly.NewCollector(
+		colly.UserAgent(app.Config.UserAgent),
+		colly.AllowedDomains(allowed_urls...),
+	)
+	c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT
+
+	rule := &colly.LimitRule{
+		DomainGlob:  "*",
+		RandomDelay: time.Duration(app.Config.Delay) * time.Second,
+	}
+	if err := c.Limit(rule); err != nil {
+		Warn(err, "customCollector: Setting the limit rule failed")
+	}
+
+	return c
+}
diff --git a/struct.go b/struct.go
new file mode 100644
index 0000000..ccd2dc3
--- /dev/null
+++ b/struct.go
@@ -0,0 +1,46 @@
+package main
+
+import (
+	"time"
+)
+
+// Platform is a site that gets scraped (table "platform").
+type Platform struct {
+	ID   int
+	Name string
+	URL  string
+}
+
+// Language is a row of the "language" table, e.g. "Go".
+type Language struct {
+	ID   int
+	Name string
+}
+
+// UpdatePeriod is a row of the "update_period" table, e.g. "Daily".
+type UpdatePeriod struct {
+	ID   int
+	Name string
+}
+
+// Owner is the owner of an entry (table "owner").
+type Owner struct {
+	ID       int
+	Name     string
+	URL      string
+	Platform *Platform
+}
+
+// Entry is one scraped item (table "entry").
+type Entry struct {
+	ID           int
+	Title        string
+	Synopsis     string
+	Owner        *Owner
+	Platform     *Platform
+	URL          string
+	Language     *Language
+	Stars        int // NOTE(review): the stars column is VARCHAR(255) — confirm
+	UpdatePeriod *UpdatePeriod
+	Created_At   time.Time
+}
-- 
cgit v1.2.3