summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMax2019-04-17 09:44:56 +0200
committerMax2019-04-17 09:44:56 +0200
commit4dc18e3691127e058833fd9c7a5bbee333c3a66c (patch)
treecfc6829bea0bf05854beb5a6a4b78eb1899c0b59
downloadghrss-4dc18e3691127e058833fd9c7a5bbee333c3a66c.tar.gz
Initial commit.
-rw-r--r--.gitignore7
-rw-r--r--Makefile20
-rw-r--r--config.go102
-rw-r--r--database.go200
-rw-r--r--getdata.go76
-rw-r--r--github.go5
-rw-r--r--init.go58
-rw-r--r--log.go53
-rw-r--r--main.go47
-rw-r--r--schema.sql40
-rw-r--r--scrape.go84
-rw-r--r--struct.go41
12 files changed, 733 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4e89156
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+.test
+*.html
+*~
+*.swp
+*.db
+ghrss
+config.json
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7126e13
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+# The binary is named after the directory that contains this Makefile.
+BINARY := $(notdir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
+# Optional extra file passed to the editor by the `edit` target (empty by default).
+MAINFILE :=
+SOURCEDIR := .
+# Every Go file below SOURCEDIR is a prerequisite of the binary.
+SOURCES := $(shell find $(SOURCEDIR) -name '*.go')
+
+# These targets do not produce a file of the same name; declaring them phony
+# keeps them working even if a file called e.g. "build" or "clean" exists.
+.PHONY: all build run clean edit
+
+all: build run
+
+build: $(BINARY)
+
+# Rebuild the binary whenever one of the Go sources changed.
+$(BINARY): $(SOURCES)
+	go build -o $(BINARY)
+
+run: build
+	./$(BINARY)
+
+clean:
+	$(RM) $(RMFLAGS) $(BINARY)
+
+edit:
+	$(EDITOR) $(MAINFILE) *.go
diff --git a/config.go b/config.go
new file mode 100644
index 0000000..adbcb14
--- /dev/null
+++ b/config.go
@@ -0,0 +1,102 @@
+package main
+
+import (
+ "os"
+
+ log "github.com/Sirupsen/logrus"
+ "github.com/spf13/viper"
+)
+
+// Config holds the runtime settings of the application. setsConfig fills
+// every field from viper; init.go may afterwards override UserAgent, Delay
+// and IgnoreRobotsTXT from command line flags.
+type Config struct {
+ // Database connection settings, read from the DB_* configuration keys.
+ DBDriver string
+ DBDBName string
+ DBHost string
+ DBPort string
+ DBUser string
+ DBPassword string
+ DBOptions string
+
+ // Crawler settings.
+ UserAgent string
+ Delay int // seconds; converted with time.Second in customCollector
+ IgnoreRobotsTXT bool
+
+ Debug bool // sets log level to debug
+}
+
+// parseConfig registers the default settings, locates and reads the
+// configuration file through viper and finally copies the values into c.
+//
+// configFile may be empty (a fixed list of directories is searched for a file
+// named "config"), a directory to search, or the path of the configuration
+// file itself. A configuration that cannot be read is reported through Fatal.
+func (c *Config) parseConfig(configFile string) {
+	// Defaults, used when neither the config file nor the environment sets a key.
+	// NOTE(review): the "alkobote" database name (and the ALKOBOTE env prefix
+	// further down) look like leftovers from another project -- confirm.
+	viper.SetDefault("DB_Driver", "mysql")
+	viper.SetDefault("DB_DBName", "alkobote")
+	viper.SetDefault("DB_Host", "localhost")
+	viper.SetDefault("DB_Port", "3306")
+	viper.SetDefault("Debug", false)
+	viper.SetDefault("Delay", 0)
+	// needs some refactoring to truly respect robots.txt
+	viper.SetDefault("IgnoreRobotsTXT", true)
+	viper.SetDefault("UserAgent", "colly - a friendly crawler :)")
+
+	// Base name of the configuration file (viper appends the extension).
+	viper.SetConfigName("config")
+
+	if configFile == "" {
+		// No explicit location given: fall back to the well-known directories.
+		searchPaths := []string{
+			"/etc/ghrss/",
+			".",
+			"$HOME/app/ghrss/",
+			"$HOME/.config/ghrss/",
+			"$HOME/ghrss/",
+		}
+		for _, dir := range searchPaths {
+			viper.AddConfigPath(dir)
+		}
+	} else {
+		info, statErr := os.Stat(configFile)
+		switch {
+		case os.IsNotExist(statErr):
+			// The given file does not exist, so treat the value as a search path.
+			viper.AddConfigPath(configFile)
+		case statErr != nil:
+			// Any other stat failure is only logged; adding the path does no harm.
+			Warn(statErr, "config.go: os.Stat("+configFile+") error")
+			viper.AddConfigPath(configFile)
+		case info.IsDir():
+			// A directory is searched for the config file.
+			viper.AddConfigPath(configFile)
+		default:
+			// A regular file is used directly.
+			viper.SetConfigFile(configFile)
+		}
+	}
+
+	// Environment variables must carry the "ALKOBOTE_" prefix; matching ones
+	// are picked up automatically.
+	viper.SetEnvPrefix("ALKOBOTE")
+	viper.AutomaticEnv()
+
+	if err := viper.ReadInConfig(); err != nil {
+		Fatal(err, "Config: Error parsing config file.")
+	}
+	log.Debug("Config: Config file used: " + viper.ConfigFileUsed())
+
+	c.setsConfig()
+}
+
+// setsConfig copies the values currently held by viper into the struct.
+// Every field of Config is overwritten.
+func (c *Config) setsConfig() {
+	*c = Config{
+		// database
+		DBDriver:   viper.GetString("DB_Driver"),
+		DBHost:     viper.GetString("DB_Host"),
+		DBPort:     viper.GetString("DB_Port"),
+		DBUser:     viper.GetString("DB_User"),
+		DBPassword: viper.GetString("DB_Password"),
+		DBDBName:   viper.GetString("DB_DBName"),
+		DBOptions:  viper.GetString("DB_Options"),
+
+		// crawler
+		UserAgent:       viper.GetString("UserAgent"),
+		Delay:           viper.GetInt("Delay"),
+		IgnoreRobotsTXT: viper.GetBool("IgnoreRobotsTXT"),
+
+		// logging
+		Debug: viper.GetBool("Debug"),
+	}
+}
diff --git a/database.go b/database.go
new file mode 100644
index 0000000..044fa08
--- /dev/null
+++ b/database.go
@@ -0,0 +1,200 @@
+package main
+
+import (
+/*
+ "database/sql"
+ "fmt"
+ "strings"
+*/
+)
+
+// createTables creates the schema if it does not exist yet and seeds the
+// lookup tables (platform, language, update_period) with their first rows.
+//
+// Each statement is executed on its own. A single Exec containing several
+// ";"-separated statements only works when the connection explicitly allows
+// it (for go-sql-driver/mysql -- presumably the driver in use, confirm --
+// that requires multiStatements=true in the DSN options), so the statements
+// are not bundled.
+//
+// It returns the first error reported by the database, or nil on success.
+func (app *App) createTables() error {
+	statements := []string{
+		// Schema, copied from schema.sql. Referenced tables are created before
+		// the tables that point at them with foreign keys.
+		// TODO: Load this from the file itself.
+		`CREATE TABLE IF NOT EXISTS platform (
+			id INT PRIMARY KEY AUTO_INCREMENT,
+			name VARCHAR(255) UNIQUE NOT NULL,
+			url VARCHAR(255) UNIQUE NOT NULL
+		) CHARSET=utf8`,
+
+		`CREATE TABLE IF NOT EXISTS language (
+			id INT PRIMARY KEY AUTO_INCREMENT,
+			name VARCHAR(255) UNIQUE NOT NULL
+		) CHARSET=utf8`,
+
+		`CREATE TABLE IF NOT EXISTS update_period (
+			id INT PRIMARY KEY AUTO_INCREMENT,
+			name VARCHAR(255) UNIQUE NOT NULL
+		) CHARSET=utf8`,
+
+		`CREATE TABLE IF NOT EXISTS owner (
+			id INT PRIMARY KEY AUTO_INCREMENT,
+			name VARCHAR(255) NOT NULL,
+			url VARCHAR(255) NOT NULL UNIQUE,
+			platform INT NOT NULL,
+			CONSTRAINT fk_owner_platform FOREIGN KEY (platform) REFERENCES platform(id)
+		) CHARSET=utf8`,
+
+		`CREATE TABLE IF NOT EXISTS entry (
+			id INT PRIMARY KEY AUTO_INCREMENT,
+			title VARCHAR(255) NOT NULL,
+			synopsis VARCHAR(255) NOT NULL,
+			owner INT NOT NULL,
+			platform INT NOT NULL,
+			url VARCHAR(255) UNIQUE NOT NULL,
+			language INT NOT NULL,
+			stars VARCHAR(255) NOT NULL,
+			update_period INT NOT NULL,
+			created_at TIMESTAMP NOT NULL,
+			CONSTRAINT fk_entry_owner FOREIGN KEY (owner) REFERENCES owner(id),
+			CONSTRAINT fk_entry_platform FOREIGN KEY (platform) REFERENCES platform(id),
+			CONSTRAINT fk_entry_language FOREIGN KEY (language) REFERENCES language(id),
+			CONSTRAINT fk_entry_period FOREIGN KEY (update_period) REFERENCES update_period(id)
+		) CHARSET=utf8`,
+
+		// Initial rows. INSERT IGNORE together with the UNIQUE columns keeps
+		// repeated runs from adding duplicates.
+		// TODO: Make this customizable.
+		`INSERT IGNORE INTO platform (id, name, url) VALUES (
+			NULL,
+			"Github",
+			"https://github.com/trending"
+		)`,
+
+		`INSERT IGNORE INTO language (name) VALUES
+			("Go"),
+			("PHP"),
+			("Javascript"),
+			("CSS"),
+			("HTML"),
+			("Java"),
+			("SQL"),
+			("Python")`,
+
+		`INSERT IGNORE INTO update_period (name) VALUES
+			("Daily"),
+			("Weekly"),
+			("Monthly")`,
+	}
+
+	for _, statement := range statements {
+		if _, err := app.DB.Exec(statement); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// SaveEntries stores the given entries, saving each entry's owner first.
+//
+// Entries whose owner could not be saved, or that lack one of the referenced
+// objects (owner, platform, language, update period), are skipped. Rows that
+// collide with an existing unique key are ignored by the database
+// (INSERT IGNORE). The first failing insert aborts the run and its error is
+// returned; a failure to prepare the statement is returned as well.
+//
+// NOTE(review): SaveOwner receives a copy of the owner and does not report
+// the generated id, so e.Owner.ID is whatever the caller put there. Confirm
+// that the scraper fills it in, otherwise the owner foreign key cannot match.
+func (app *App) SaveEntries(entries []Entry) error {
+	// The placeholder list must not end with a comma before ")", which would
+	// be a syntax error. app.Now holds Unix seconds, so it is converted with
+	// FROM_UNIXTIME before it is stored in the TIMESTAMP column.
+	query := `
+		INSERT IGNORE INTO entry (
+			id,
+			title,
+			synopsis,
+			owner,
+			platform,
+			url,
+			language,
+			stars,
+			update_period,
+			created_at
+		) VALUES (
+			NULL,
+			?,
+			?,
+			?,
+			?,
+			?,
+			?,
+			?,
+			?,
+			FROM_UNIXTIME(?)
+		);
+	`
+	stmt, err := app.DB.Prepare(query)
+	if err != nil {
+		Warn(err, "SaveEntries: Preparing query failed")
+		return err
+	}
+	defer stmt.Close()
+
+	for _, e := range entries {
+		// All four references are dereferenced below; an incomplete entry
+		// would otherwise panic.
+		if e.Owner == nil || e.Platform == nil || e.Language == nil || e.UpdatePeriod == nil {
+			Warn(nil, "SaveEntries: Skipping incomplete entry: "+e.URL)
+			continue
+		}
+
+		// SaveOwner already logs its own failures, so the entry is just skipped.
+		if err = app.SaveOwner(*e.Owner); err != nil {
+			continue
+		}
+
+		_, err = stmt.Exec(e.Title, e.Synopsis, e.Owner.ID, e.Platform.ID, e.URL, e.Language.ID, e.Stars, e.UpdatePeriod.ID, app.Now)
+		if err != nil {
+			Warn(err, "SaveEntries: Statement execution failed")
+			return err
+		}
+	}
+
+	return nil
+}
+
+// SaveOwner inserts the owner into the owner table. An owner whose URL is
+// already stored is ignored by the database (INSERT IGNORE on the unique url
+// column).
+//
+// It logs and returns the error when preparing or executing the statement
+// fails. The generated id is not written back to the caller.
+func (app *App) SaveOwner(owner Owner) error {
+	// The placeholder list must not end with a comma before ")", which would
+	// be a syntax error.
+	query := `
+		INSERT IGNORE INTO owner (
+			id,
+			name,
+			url,
+			platform
+		) VALUES (
+			NULL,
+			?,
+			?,
+			?
+		);`
+
+	stmt, err := app.DB.Prepare(query)
+	if err != nil {
+		Warn(err, "SaveOwner: Preparing query failed")
+		return err
+	}
+	defer stmt.Close()
+
+	// NOTE(review): owner.Platform is dereferenced without a nil check;
+	// callers must set it.
+	_, err = stmt.Exec(owner.Name, owner.URL, owner.Platform.ID)
+	if err != nil {
+		Warn(err, "SaveOwner: Statement execution failed")
+		return err
+	}
+
+	return nil
+}
diff --git a/getdata.go b/getdata.go
new file mode 100644
index 0000000..f9e5844
--- /dev/null
+++ b/getdata.go
@@ -0,0 +1,76 @@
+package main
+
+// GetPlatforms loads all rows of the platform table.
+//
+// It returns an empty (non-nil) slice when the table has no rows. Any
+// database error is reported through Fatal.
+func (app *App) GetPlatforms() []Platform {
+	platforms := []Platform{}
+
+	// The table name is written exactly as in schema.sql, since table names
+	// can be case-sensitive. url is selected as well so that Platform.URL is
+	// populated.
+	query := `
+		SELECT id, name, url FROM platform;
+	`
+
+	rows, err := app.DB.Queryx(query)
+	if err != nil {
+		Fatal(err, "GetPlatforms: Query failed")
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		var p Platform
+		if err = rows.StructScan(&p); err != nil {
+			Fatal(err, "GetPlatforms: StructScan failed")
+		}
+
+		platforms = append(platforms, p)
+	}
+	// Next returns false on errors too, so check why the loop ended.
+	if err = rows.Err(); err != nil {
+		Fatal(err, "GetPlatforms: Reading rows failed")
+	}
+
+	return platforms
+}
+
+// GetLanguages loads all rows of the language table.
+//
+// It returns an empty (non-nil) slice when the table has no rows. Any
+// database error is reported through Fatal.
+func (app *App) GetLanguages() []Language {
+	languages := []Language{}
+
+	// No comma may follow the last selected column.
+	query := `
+		SELECT id, name FROM language;
+	`
+
+	rows, err := app.DB.Queryx(query)
+	if err != nil {
+		Fatal(err, "GetLanguages: Query failed")
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		var l Language
+		if err = rows.StructScan(&l); err != nil {
+			Fatal(err, "GetLanguages: StructScan failed")
+		}
+
+		languages = append(languages, l)
+	}
+	// Next returns false on errors too, so check why the loop ended.
+	if err = rows.Err(); err != nil {
+		Fatal(err, "GetLanguages: Reading rows failed")
+	}
+
+	return languages
+}
+
+// GetUpdatePeriods loads all rows of the update_period table.
+//
+// It returns an empty (non-nil) slice when the table has no rows. Any
+// database error is reported through Fatal.
+func (app *App) GetUpdatePeriods() []UpdatePeriod {
+	periods := []UpdatePeriod{}
+
+	// No comma may follow the last selected column.
+	query := `
+		SELECT id, name FROM update_period;
+	`
+
+	rows, err := app.DB.Queryx(query)
+	if err != nil {
+		Fatal(err, "GetUpdatePeriods: Query failed")
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		var p UpdatePeriod
+		if err = rows.StructScan(&p); err != nil {
+			Fatal(err, "GetUpdatePeriods: StructScan failed")
+		}
+
+		periods = append(periods, p)
+	}
+	// Next returns false on errors too, so check why the loop ended.
+	if err = rows.Err(); err != nil {
+		Fatal(err, "GetUpdatePeriods: Reading rows failed")
+	}
+
+	return periods
+}
diff --git a/github.go b/github.go
new file mode 100644
index 0000000..f673d1d
--- /dev/null
+++ b/github.go
@@ -0,0 +1,5 @@
+package main
+
+// ScrapeGithub is the crawler entry point for the "Github" platform.
+// It is still a stub: the platform is not used and an empty, non-nil slice
+// is returned.
+func (app *App) ScrapeGithub(platform Platform) []Entry {
+	entries := make([]Entry, 0)
+	return entries
+}
diff --git a/init.go b/init.go
new file mode 100644
index 0000000..7ce40c5
--- /dev/null
+++ b/init.go
@@ -0,0 +1,58 @@
+package main
+
+import (
+ "errors"
+ "strings"
+
+ log "github.com/Sirupsen/logrus"
+ flag "github.com/spf13/pflag"
+)
+
+// _conf is the package-level configuration. init fills it from the config
+// file and the command line flags; main copies it and then resets it.
+var _conf Config
+
+func init() {
+ // overwrites unhelpful error message
+ flag.ErrHelp = errors.New("")
+
+ // we need to parse the config because of log level setting
+ configFile := flag.StringP("config", "c", "", "path to config file")
+ debug := flag.BoolP("debug", "d", false, "set log level to \"Debug\"")
+ verbose := flag.BoolP("verbose", "v", false, "set log level to \"Debug\", same as --debug")
+ silent := flag.BoolP("silent", "s", false, "suppress output except warnings")
+ loglevel_f := flag.String("loglevel", "Warn", `set log level, can be "Warn", "Info" or "Debug"`)
+ user_agent_f := flag.StringP("user-agent", "u", "", "set user agent")
+ delay_f := flag.Int("delay", 0, "enable and set delay in seconds between crawls (default 0)")
+ ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignore robots.txt")
+
+ flag.Parse()
+ loglevel := strings.ToLower(*loglevel_f)
+
+ if *debug || *verbose || loglevel == "debug" {
+ log.SetLevel(log.DebugLevel)
+ } else if loglevel == "info" {
+ log.SetLevel(log.InfoLevel)
+ } else {
+ log.SetLevel(log.WarnLevel)
+ }
+
+ if *silent {
+ log.SetLevel(log.WarnLevel)
+ }
+
+ _conf.parseConfig(*configFile)
+
+ if *user_agent_f != "" {
+ _conf.UserAgent = *user_agent_f
+ }
+ if *delay_f != 0 {
+ _conf.Delay = *delay_f
+ }
+ if !*ignore_robots_f {
+ _conf.IgnoreRobotsTXT = *ignore_robots_f
+ }
+
+ if _conf.Debug && !*silent {
+ log.SetLevel(log.DebugLevel)
+ }
+}
diff --git a/log.go b/log.go
new file mode 100644
index 0000000..a367d3d
--- /dev/null
+++ b/log.go
@@ -0,0 +1,53 @@
+package main
+
+import (
+ log "github.com/Sirupsen/logrus"
+)
+
+// Fatal logs msg at fatal level. A non-nil err is attached to the log entry
+// as the "error" field.
+func Fatal(err error, msg string) {
+	if err == nil {
+		log.Fatal(msg)
+		return
+	}
+
+	fields := log.Fields{"error": err.Error()}
+	log.WithFields(fields).Fatal(msg)
+}
+
+// Println logs msg through the logger's Println. A non-nil err is attached
+// to the log entry as the "error" field.
+func Println(err error, msg string) {
+	if err == nil {
+		log.Println(msg)
+		return
+	}
+
+	fields := log.Fields{"error": err.Error()}
+	log.WithFields(fields).Println(msg)
+}
+
+// Debug logs msg at debug level. A non-nil err is attached to the log entry
+// as the "error" field.
+func Debug(err error, msg string) {
+	if err == nil {
+		log.Debug(msg)
+		return
+	}
+
+	fields := log.Fields{"error": err.Error()}
+	log.WithFields(fields).Debug(msg)
+}
+
+// Warn logs msg at warning level. A non-nil err is attached to the log entry
+// as the "error" field.
+func Warn(err error, msg string) {
+	if err == nil {
+		log.Warn(msg)
+		return
+	}
+
+	fields := log.Fields{"error": err.Error()}
+	log.WithFields(fields).Warn(msg)
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..39fbe49
--- /dev/null
+++ b/main.go
@@ -0,0 +1,47 @@
+package main
+
+import (
+ "fmt"
+ "time"
+
+ log "github.com/Sirupsen/logrus"
+ //"github.com/gocolly/colly"
+ "github.com/jmoiron/sqlx"
+)
+
+// App bundles the state shared by the methods of the application.
+type App struct {
+ Config *Config // private copy of the parsed configuration, set in main
+ DB *sqlx.DB // database handle opened in main
+ Now int64 // start time as Unix seconds; bound to created_at in SaveEntries
+ Debug bool // not set anywhere in this commit
+}
+
+func main() {
+ var err error
+
+ // copy global config to avoid woring with globals
+ _own_config := _conf
+ app := App{Config: &_own_config}
+ // overwrite the global
+ _conf = Config{}
+
+ app.Now = time.Now().Unix()
+
+ log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions))
+
+ app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions)
+ if err != nil {
+ Fatal(err, "Cannot connect to database")
+ }
+
+ if err = app.DB.Ping(); err != nil {
+ Fatal(err, "No connection to database")
+ }
+ defer app.DB.Close()
+
+ err = app.createTables()
+ if err != nil {
+ Fatal(err, "Creating table failed")
+ }
+
+}
diff --git a/schema.sql b/schema.sql
new file mode 100644
index 0000000..7048798
--- /dev/null
+++ b/schema.sql
@@ -0,0 +1,40 @@
+-- Database schema. The same statements are embedded in createTables
+-- (database.go), so both places have to be kept in sync.
+
+-- A site that is scraped; createTables seeds it with "Github".
+CREATE TABLE IF NOT EXISTS platform (
+ id INT PRIMARY KEY AUTO_INCREMENT,
+ name VARCHAR(255) UNIQUE NOT NULL,
+ url VARCHAR(255) UNIQUE NOT NULL
+) CHARSET=utf8;
+
+-- Lookup table of languages, referenced by entry.language.
+CREATE TABLE IF NOT EXISTS language (
+ id INT PRIMARY KEY AUTO_INCREMENT,
+ name VARCHAR(255) UNIQUE NOT NULL
+) CHARSET=utf8;
+
+-- Lookup table of update periods, referenced by entry.update_period.
+CREATE TABLE IF NOT EXISTS update_period (
+ id INT PRIMARY KEY AUTO_INCREMENT,
+ name VARCHAR(255) UNIQUE NOT NULL
+) CHARSET=utf8;
+
+-- Owner of an entry; unique per url and tied to one platform.
+CREATE TABLE IF NOT EXISTS owner (
+ id INT PRIMARY KEY AUTO_INCREMENT,
+ name VARCHAR(255) NOT NULL,
+ url VARCHAR(255) NOT NULL UNIQUE,
+ platform INT NOT NULL,
+ CONSTRAINT fk_owner_platform FOREIGN KEY (platform) REFERENCES platform(id)
+) CHARSET=utf8;
+
+-- One stored entry; unique per url. Must be created after the four tables
+-- it references.
+-- NOTE(review): stars is VARCHAR here while Entry.Stars in struct.go is an
+-- int -- confirm which one is intended.
+CREATE TABLE IF NOT EXISTS entry (
+ id INT PRIMARY KEY AUTO_INCREMENT,
+ title VARCHAR(255) NOT NULL,
+ synopsis VARCHAR(255) NOT NULL,
+ owner INT NOT NULL,
+ platform INT NOT NULL,
+ url VARCHAR(255) UNIQUE NOT NULL,
+ language INT NOT NULL,
+ stars VARCHAR(255) NOT NULL,
+ update_period INT NOT NULL,
+ created_at TIMESTAMP NOT NULL,
+ CONSTRAINT fk_entry_owner FOREIGN KEY (owner) REFERENCES owner(id),
+ CONSTRAINT fk_entry_platform FOREIGN KEY (platform) REFERENCES platform(id),
+ CONSTRAINT fk_entry_language FOREIGN KEY (language) REFERENCES language(id),
+ CONSTRAINT fk_entry_period FOREIGN KEY (update_period) REFERENCES update_period(id)
+) CHARSET=utf8;
diff --git a/scrape.go b/scrape.go
new file mode 100644
index 0000000..ebb02fa
--- /dev/null
+++ b/scrape.go
@@ -0,0 +1,84 @@
+package main
+
+import (
+ "time"
+
+ log "github.com/Sirupsen/logrus"
+ "github.com/gocolly/colly"
+)
+
+// ScrapeHTML scrapes all given platforms concurrently, one goroutine per
+// platform, and returns once every one of them has reported completion.
+func (app *App) ScrapeHTML(platforms []Platform) {
+	done := make(chan bool)
+
+	for _, p := range platforms {
+		go app.Scrape(p, done)
+	}
+
+	// Scrape sends exactly one value per call, so collect one per platform.
+	for range platforms {
+		<-done
+	}
+}
+
+// Scrape fetches the entries of one platform, trying up to three times
+// until at least one entry comes back, and stores the result. A failure to
+// store is only logged. Exactly one value is sent on wait when the work is
+// finished, whether or not anything was found.
+func (app *App) Scrape(platform Platform, wait chan bool) {
+	var found []Entry
+
+	// Retry while the crawler returns nothing.
+	for attempt := 0; attempt < 3 && len(found) == 0; attempt++ {
+		found = app.ScrapePlatform(platform)
+	}
+
+	// Only touch the database when there is something to store.
+	if len(found) > 0 {
+		if err := app.SaveEntries(found); err != nil {
+			Warn(err, "Saving entries failed. Platform: "+platform.Name)
+		}
+	}
+
+	wait <- true
+}
+
+// ScrapePlatform dispatches to the crawler that belongs to the platform's
+// name. A platform without a crawler is logged and yields an empty slice.
+func (app *App) ScrapePlatform(platform Platform) []Entry {
+	if platform.Name == "Github" {
+		return app.ScrapeGithub(platform)
+	}
+
+	log.Println(platform.Name + ": No Crawler")
+	return []Entry{}
+}
+
+// customCollector builds a colly collector from the application config:
+// it uses the configured user agent, is restricted to the given domains,
+// follows the IgnoreRobotsTXT setting and gets a limit rule for all domains
+// whose RandomDelay is Config.Delay seconds.
+func (app *App) customCollector(allowed_urls []string) *colly.Collector {
+	cfg := app.Config
+
+	collector := colly.NewCollector(
+		colly.UserAgent(cfg.UserAgent),
+		colly.AllowedDomains(allowed_urls...),
+	)
+	collector.IgnoreRobotsTxt = cfg.IgnoreRobotsTXT
+
+	rule := colly.LimitRule{
+		DomainGlob:  "*",
+		RandomDelay: time.Duration(cfg.Delay) * time.Second,
+	}
+	collector.Limit(&rule)
+
+	return collector
+}
diff --git a/struct.go b/struct.go
new file mode 100644
index 0000000..ccd2dc3
--- /dev/null
+++ b/struct.go
@@ -0,0 +1,41 @@
+package main
+
+import (
+ "time"
+)
+
+// Platform is one row of the platform table: a site that can be scraped.
+// ScrapePlatform selects the crawler by Name.
+type Platform struct {
+ ID int
+ Name string
+ URL string
+}
+
+// Language is one row of the language lookup table.
+type Language struct {
+ ID int
+ Name string
+}
+
+// UpdatePeriod is one row of the update_period lookup table.
+type UpdatePeriod struct {
+ ID int
+ Name string
+}
+
+// Owner is the owner of an entry, stored in the owner table.
+type Owner struct {
+ ID int
+ Name string
+ URL string
+ Platform *Platform // must be non-nil when passed to SaveOwner, which reads Platform.ID
+}
+
+// Entry is one scraped item as stored in the entry table. SaveEntries reads
+// the ID of Owner, Platform, Language and UpdatePeriod, so these pointers
+// must be set before an entry is saved.
+type Entry struct {
+ ID int
+ Title string
+ Synopsis string
+ Owner *Owner
+ Platform *Platform
+ URL string
+ Language *Language
+ Stars int // NOTE(review): the stars column in schema.sql is VARCHAR -- confirm the intended type
+ UpdatePeriod *UpdatePeriod
+ Created_At time.Time // not read by SaveEntries, which binds App.Now to created_at instead
+}