From 6b091dc7ab2c4fdaed0675ab57ea05e4ddb81e5b Mon Sep 17 00:00:00 2001 From: admin Date: Wed, 11 Sep 2024 11:02:32 +0200 Subject: init --- Makefile | 18 +++ config.go | 104 ++++++++++++++ database.go | 426 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ go.mod | 50 +++++++ go.sum | 154 ++++++++++++++++++++ helper.go | 168 ++++++++++++++++++++++ init.go | 58 ++++++++ main.go | 462 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ sql.sql | 3 + struct.go | 26 ++++ 10 files changed, 1469 insertions(+) create mode 100644 Makefile create mode 100644 config.go create mode 100644 database.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 helper.go create mode 100644 init.go create mode 100644 main.go create mode 100644 sql.sql create mode 100644 struct.go diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e8115cd --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ +BINARY=hn-crawler + +all: build run + +build: + go build -o $(BINARY) + +run: + ./$(BINARY) + +info: build + ./$(BINARY) --loglevel=info + +debug: build + ./$(BINARY) -d + +clean: + rm $(BINARY) diff --git a/config.go b/config.go new file mode 100644 index 0000000..7c36c53 --- /dev/null +++ b/config.go @@ -0,0 +1,104 @@ +package main + +import ( + "os" + + log "github.com/sirupsen/logrus" + "github.com/spf13/viper" +) + +type Config struct { + DBDriver string + DBDBName string + DBHost string + DBPort string + DBUser string + DBPassword string + DBOptions string + + UserAgent string + Delay int + IgnoreRobotsTXT bool + + BasicAuthUsername string + BasicAuthPassword string + + Debug bool // sets log level to debug +} + +// Parses the configuration and sets the configuration struct. +func (c *Config) parseConfig(configFile string) { + + viper.SetDefault("DB_Driver", "mysql") + viper.SetDefault("DB_DBName", "hncrawler") + viper.SetDefault("DB_Host", "localhost") + viper.SetDefault("DB_Port", "3306") + + viper.SetDefault("Debug", false) + viper.SetDefault("Delay", 0) + + // needs some refactoring to truly respect robots.txt + viper.SetDefault("IgnoreRobotsTXT", true) + + viper.SetDefault("UserAgent", "colly - a friendly crawler :)") + + // Name of the configuration file + viper.SetConfigName("config") + + // Where to find the config file + if configFile == "" { + viper.AddConfigPath(".") + } else { + stat, err := os.Stat(configFile) + if os.IsNotExist(err) { + // provided config file does not exist, so we add the path instead + viper.AddConfigPath(configFile) + } else if err == nil && stat.IsDir() { + // adds the path to look for the config file + viper.AddConfigPath(configFile) + } else if err == nil { + // directly sets the config file + viper.SetConfigFile(configFile) + } else { + // if we are here something went wrong + log.Warn(err, "config.go: os.Stat("+configFile+") error") + // adding the path nonetheless because it's not hurting + viper.AddConfigPath(configFile) + } + } + + // Env variables need to be prefixed with "ALKOBOTE_" + viper.SetEnvPrefix("DISCUSS_") + + // Parses automatic the matching env variables + viper.AutomaticEnv() + + // Reads the config + err := viper.ReadInConfig() + if err != nil { + log.Fatal(err, "Config: Error parsing config file.") + } + log.Debug("Config: Config file used: " + viper.ConfigFileUsed()) + + c.setsConfig() +} + +// Actually sets the config struct +func (c *Config) setsConfig() { + c.DBDriver = viper.GetString("DB_Driver") + c.DBHost = viper.GetString("DB_Host") + c.DBPort = viper.GetString("DB_Port") + c.DBUser = viper.GetString("DB_User") + c.DBPassword = viper.GetString("DB_Password") + c.DBDBName = viper.GetString("DB_DBName") + c.DBOptions = viper.GetString("DB_Options") + + c.UserAgent = viper.GetString("UserAgent") + c.Delay = viper.GetInt("Delay") + c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT") + + c.BasicAuthUsername = viper.GetString("BasicAuthUsername") + c.BasicAuthPassword = viper.GetString("BasicAuthPassword") + + c.Debug = viper.GetBool("Debug") +} diff --git a/database.go b/database.go new file mode 100644 index 0000000..e3ba060 --- /dev/null +++ b/database.go @@ -0,0 +1,426 @@ +package main + +import ( + log "github.com/sirupsen/logrus" + "strconv" + + "database/sql" + _ "github.com/go-sql-driver/mysql" +) + +func (app *App) saveStory(s Story) error { + query := ` + INSERT IGNORE story ( + id, + story_id, + created_at, + updated_at, + type, + title, + text, + descendants, + time, + poster + ) VALUES ( + NULL, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ? + ); + ` + + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveStory: Preparing query failed") + return err + } + defer stmt.Close() + + res, err := stmt.Exec(s.Id, app.Now, app.Now, s.Type, s.Title, s.Text, s.Score, s.Time, s.By) + if err != nil { + log.Warn("saveStory: Statement execution failed") + return err + } + lid, err := res.LastInsertId() + if err != nil { + log.Warn("saveStory: lastInsertId() failed") + return err + } + + log.Debugf("saveStory: Successfull insert for item %d\n", s.Id) + + for _, l := range s.Links { + query = ` + INSERT IGNORE links( + id, + created_at, + updated_at, + story_id, + url, + field + ) VALUES ( + NULL, + ?, + ?, + ?, + ?, + ? + ); + ` + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveStory: InsertLinks: Preparing query failed") + return err + } + defer stmt2.Close() + + _, err = stmt2.Exec(app.Now, app.Now, lid, l.Url, l.Field) + if err != nil { + log.Warn("saveStory: InsertLinks: Statement execution failed") + return err + } + } + + return nil + + /* + query = ` + INSERT IGNORE discussion ( + id, + created_at, + updated_at, + article_id, + title, + source, + item_id, + source_url, + posted_on, + comments, + upvotes + ) VALUES ( + NULL, + ?, + ?, + (SELECT id FROM article WHERE url = ?), + ?, + ?, + ?, + ?, + ?, + ?, + ? + ); + ` + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveStory: Preparing second query failed") + return err + } + defer stmt2.Close() + + _, err = stmt2.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score) + if err != nil { + log.Warn("saveStory: Statement execution failed") + return err + } + */ + + return nil +} + +func (app *App) saveCode(s Story) error { + query := ` + INSERT IGNORE code( + id, + created_at, + updated_at, + url, + title, + source, + item_id, + source_url, + posted_on, + comments, + upvotes + ) VALUES ( + NULL, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ? + ); + ` + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveCode: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score) + if err != nil { + log.Warn("saveCode: Statement execution failed") + return err + } + + return nil +} + +func (app *App) updateDiscussion(story Story) error { + + query := ` + UPDATE discussion + set updated_at = ?, + comments = ?, + upvotes = ? + WHERE item_id = ?; + ` + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("updateDiscussion: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, story.Descendants, story.Score, story.Id) + if err != nil { + log.Warnf("updateDiscussion: Statement execution failed") + return err + } + log.Debugf("updateDiscussion: Successful update of %d with new Score: %d, Comments: %d\n", story.Id, story.Score, story.Descendants) + + return nil +} + +func (app *App) updateArticleUrl(id int, url string) error { + query := ` + UPDATE article + set updated_at = ?, + url = ? + WHERE id = ? + ` + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("updateArticleUrl: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, url, id) + if err != nil { + log.Warnf("updateArticleUrl: Statement execution failed") + return err + } + log.Debugf("updateArticleUrl: Successful update new url: %s\n", url) + + return nil + +} + +func (app *App) mergeArticles(id_to_delete int, correct_url string) error { + query := "SELECT id FROM discussion WHERE article_id = ?" + row := app.DB.QueryRow(query, id_to_delete) + var disc_id int + err := row.Scan(&disc_id) + if err != nil { + log.Warnf("mergeArticles: Query first row failed. id: %d url: %s", id_to_delete, correct_url) + return err + } + query = "SELECT id FROM article WHERE url = ?" + row = app.DB.QueryRow(query, correct_url) + var article_id int + err = row.Scan(&article_id) + if err != nil { + log.Warn("mergeArticles: Query second row failed") + return err + } + + query = "UPDATE discussion SET article_id = ?, updated_at = ? WHERE id = ?;" + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("mergeArticles: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(article_id, app.Now, disc_id) + if err != nil { + log.Warn("mergeArticles: Update discussion failed") + return err + } + + query = "UPDATE article_category SET article_id = ? WHERE id = ?;" + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Warn("mergeArticles: Preparing article_category query failed") + return err + } + defer stmt2.Close() + + _, err = stmt2.Exec(article_id, id_to_delete) + if err != nil { + log.Warn("mergeArticles: Update article_category failed") + return err + } + + return nil +} + +func (app *App) deleteOrphanedArticles() error { + query := ` + SELECT a.id FROM + article AS a + LEFT JOIN + discussion AS d ON a.id = d.article_id + WHERE d.id IS NULL;` + row := app.DB.QueryRow(query) + + var article_id int + err := row.Scan(&article_id) + + if err != nil { + if err == sql.ErrNoRows { + return nil + } else { + log.Warnf("deleteOrphanedArticles: Executing query failed: %s", err.Error()) + return err + } + } + + query = ` + DELETE FROM article_category WHERE article_id = ? + ` + _, err = app.DB.Exec(query, article_id) + if err != nil { + log.Warnf("deleteOrphanedArticles: Delete from article_category query failed: %s", err.Error()) + return err + } + + query = ` + DELETE FROM article WHERE id = ? + + ` + _, err = app.DB.Exec(query, article_id) + if err != nil { + log.Warnf("deleteOrphanedArticles: Delete from article query failed: %s", err.Error()) + return err + } + + return nil +} + +func (app *App) saveCategory(article_id int, categories []string) { + + for _, category := range categories { + if "" == category { + log.Warnf("saveCategory: category is empty for article_id: %d", article_id) + continue + } + + query := "SELECT id FROM category WHERE name = ?" + row := app.DB.QueryRow(query, category) + var category_id int + err := row.Scan(&category_id) + + if err != nil { + if err != sql.ErrNoRows { + log.Warn("saveCategory: Selecting category id failed") + log.Fatal(err) + } + } + + if err == sql.ErrNoRows { + query = ` + INSERT INTO category ( + id, + created_at, + updated_at, + name + ) VALUES ( + null, + ?, + ?, + ? + )` + + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Fatal(err) + } + defer stmt.Close() + + result, err := stmt.Exec(app.Now, app.Now, category) + if err != nil { + log.Fatal(err) + } + + category_id64, err := result.LastInsertId() + category_id = int(category_id64) + if err != nil { + log.Fatal(err) + } + } + + query = ` + INSERT IGNORE article_category ( + id, + article_id, + category_id + ) VALUES ( + null, + ?, + ? + ) + ` + + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Fatal(err) + } + _, err = stmt2.Exec(article_id, category_id) + if err != nil { + log.Fatal(err) + } + } +} + +func (app *App) getArticleIdFromUrl(wiki_url string) int { + row := app.DB.QueryRow("SELECT id FROM article WHERE url = ?", wiki_url) + var article_id int + err := row.Scan(&article_id) + if err != nil { + log.Warnf("getArticleIdFromUrl: Query or scanning failed for: %s", wiki_url) + log.Fatal(err) + } + return article_id +} + +func (app *App) getAllArticles() { + rows, err := app.DB.Query("SELECT DISTINCT article_id FROM discussion;") + if err != nil { + log.Fatal(err) + } + + for rows.Next() { + var article_id int + + err = rows.Scan(&article_id) + if err != nil { + log.Fatal(err) + } + + log.Println(article_id) + } +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..85bdde6 --- /dev/null +++ b/go.mod @@ -0,0 +1,50 @@ +module hn-crawler + +go 1.19 + +require ( + github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde + github.com/go-sql-driver/mysql v1.7.1 + github.com/gocolly/colly v1.2.0 + github.com/jmoiron/sqlx v1.3.5 + github.com/sirupsen/logrus v1.9.3 + github.com/spf13/pflag v1.0.5 + github.com/spf13/viper v1.18.2 +) + +require ( + github.com/PuerkitoBio/goquery v1.8.1 // indirect + github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/antchfx/htmlquery v1.3.0 // indirect + github.com/antchfx/xmlquery v1.3.18 // indirect + github.com/antchfx/xpath v1.2.4 // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/gobwas/glob v0.2.3 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/golang/protobuf v1.5.3 // indirect + github.com/hashicorp/hcl v1.0.0 // indirect + github.com/kennygrant/sanitize v1.2.4 // indirect + github.com/magiconair/properties v1.8.7 // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/mvdan/xurls v1.1.0 // indirect + github.com/pelletier/go-toml/v2 v2.1.0 // indirect + github.com/sagikazarmark/locafero v0.4.0 // indirect + github.com/sagikazarmark/slog-shim v0.1.0 // indirect + github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect + github.com/sourcegraph/conc v0.3.0 // indirect + github.com/spf13/afero v1.11.0 // indirect + github.com/spf13/cast v1.6.0 // indirect + github.com/subosito/gotenv v1.6.0 // indirect + github.com/temoto/robotstxt v1.1.2 // indirect + go.uber.org/atomic v1.9.0 // indirect + go.uber.org/multierr v1.9.0 // indirect + golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect + golang.org/x/net v0.19.0 // indirect + golang.org/x/sys v0.15.0 // indirect + golang.org/x/text v0.14.0 // indirect + google.golang.org/appengine v1.6.7 // indirect + google.golang.org/protobuf v1.31.0 // indirect + gopkg.in/ini.v1 v1.67.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + mvdan.cc/xurls/v2 v2.5.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..5fb070d --- /dev/null +++ b/go.sum @@ -0,0 +1,154 @@ +github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= +github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= +github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde h1:xcvoK8AzKQi2TR/lgV5lcG3PcDU4T3F8hN75Ou3KZ6w= +github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde/go.mod h1:CZ177vKofY/zZG0s1KUJQflzzEWlceyyqW8RRpyMqfs= +github.com/antchfx/htmlquery v1.3.0 h1:5I5yNFOVI+egyia5F2s/5Do2nFWxJz41Tr3DyfKD25E= +github.com/antchfx/htmlquery v1.3.0/go.mod h1:zKPDVTMhfOmcwxheXUsx4rKJy8KEY/PU6eXr/2SebQ8= +github.com/antchfx/xmlquery v1.3.18 h1:FSQ3wMuphnPPGJOFhvc+cRQ2CT/rUj4cyQXkJcjOwz0= +github.com/antchfx/xmlquery v1.3.18/go.mod h1:Afkq4JIeXut75taLSuI31ISJ/zeq+3jG7TunF7noreA= +github.com/antchfx/xpath v1.2.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/antchfx/xpath v1.2.4 h1:dW1HB/JxKvGtJ9WyVGJ0sIoEcqftV3SqIstujI+B9XY= +github.com/antchfx/xpath v1.2.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= +github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= +github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= +github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= +github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/jmoiron/sqlx v1.3.5 h1:vFFPA71p1o5gAeqtEAwLU4dnX2napprKtHr7PYIcN3g= +github.com/jmoiron/sqlx v1.3.5/go.mod h1:nRVWtLre0KfCLJvgxzCsLVMogSvQ1zNJtpYr2Ccp0mQ= +github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= +github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/lib/pq v1.2.0 h1:LXpIM/LZ5xGFhOpXAQUIMM1HdyqzVYM13zNdjCEEcA0= +github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= +github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= +github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg= +github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/mvdan/xurls v1.1.0 h1:OpuDelGQ1R1ueQ6sSryzi6P+1RtBpfQHM8fJwlE45ww= +github.com/mvdan/xurls v1.1.0/go.mod h1:tQlNn3BED8bE/15hnSL2HLkDeLWpNPAwtw7wkEq44oU= +github.com/pelletier/go-toml/v2 v2.1.0 h1:FnwAJ4oYMvbT/34k9zzHuZNrhlz48GB3/s6at6/MHO4= +github.com/pelletier/go-toml/v2 v2.1.0/go.mod h1:tJU2Z3ZkXwnxa4DPO899bsyIoywizdUvyaeZurnPPDc= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ= +github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4= +github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= +github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= +github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= +github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= +github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= +github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0= +github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/viper v1.18.2 h1:LUXCnvUvSM6FXAsj6nnfc8Q2tp1dIgUfY9Kc8GsSOiQ= +github.com/spf13/viper v1.18.2/go.mod h1:EKmWIqdnk5lOcmR72yw6hS+8OPYcwD0jteitLMVB+yk= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= +github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= +go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= +go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= +golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= +gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8= +mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE= diff --git a/helper.go b/helper.go new file mode 100644 index 0000000..af5f4c1 --- /dev/null +++ b/helper.go @@ -0,0 +1,168 @@ +package main + +import ( + _url "net/url" + "strings" + log "github.com/sirupsen/logrus" + "regexp" +) + +func stripHNPrefix(title string) string { + title = strings.TrimPrefix(title, "Ask HN:") + title = strings.TrimPrefix(title, "Show HN:") + title = strings.TrimPrefix(title, "Tell HN:") + title = strings.TrimPrefix(title, "Experiment HN:") + title = strings.TrimPrefix(title, "Launch HN:") + + return strings.TrimSpace(title) +} + +/** + * removes given param from URL + */ +func _removeParam(url, key string) string { + u, err := _url.Parse(url) + if err != nil { + log.Fatal(err) + } + q := u.Query() + q.Del(key) + u.RawQuery = q.Encode() + return u.String() +} + +func normalizeUrl(url string) string { + match, err := regexp.MatchString("^http://", url) + if err != nil { + log.Fatal(err) + } + if match { + log.Debug("normalize: ", "http:// ", url) + r := regexp.MustCompile("^http://") + url = r.ReplaceAllString(url, "https://") + } + + // add missing https:// if no scheme + u, err := _url.Parse(url) + if err != nil { + log.Fatal(err) + } + + if "" == u.Scheme { + if strings.HasPrefix(url, "/") { + url = "https:" + url + } else { + url = "https://" + url + } + } + + + match, err = regexp.MatchString("youtube://", url) + if err != nil { + log.Fatal(err) + } + if match { + r := regexp.MustCompile("youtube://") + url = r.ReplaceAllString(url, "https://") + } + + match, err = regexp.MatchString("youtu.be/", url) + if err != nil { + log.Fatal(err) + } + if match { + log.Debug("normalize: ", "youtu.be ", url) + + /** + * remove tracking param "si" + */ + url = _removeParam(url, "si") + url = _removeParam(url, "feature") + + u, err := _url.Parse(url) + if err != nil { + log.Fatal(err) + } + q := u.Query() + q.Add("v", strings.TrimLeft(u.Path, "/")) + + u.Host = "www.youtube.com" + u.Path = "watch" + + u.RawQuery = q.Encode() + url = u.String() + + //r := regexp.MustCompile("youtu.be/") + //url = r.ReplaceAllString(url, "youtube.com/watch?v=") + } + + match, err = regexp.MatchString("/m.youtube.com/", url) + if err != nil { + log.Fatal(err) + } + if match { + log.Debug("normalize: ", "m.youtube.com ", url) + + /** + * remove tracking param "si" + */ + url = _removeParam(url, "si") + url = _removeParam(url, "feature") + + r := regexp.MustCompile("/m.youtube.com/") + url = r.ReplaceAllString(url, "/www.youtube.com/") + } + + match, err = regexp.MatchString("/m.imdb.com/", url) + if err != nil { + log.Fatal(err) + } + if match { + log.Debug("normalize: ", "m.imdb.com ", url) + + /** + * remove tracking param "si" + */ + url = _removeParam(url, "si") + url = _removeParam(url, "feature") + + r := regexp.MustCompile("/m.imdb.com/") + url = r.ReplaceAllString(url, "/www.imdb.com") + } + + /* + match, err = regexp.MatchString("m.wikipedia.org", url) + if err != nil { + log.Fatal(err) + } + if match { + r := regexp.MustCompile("m.wikipedia.org") + url = r.ReplaceAllString(url, "wikipedia.org") + } + */ + + /** + * remove tracking utm_ params + */ + url = _removeParam(url, "utm_source") + url = _removeParam(url, "utm_medium") + url = _removeParam(url, "utm_campaign") + url = _removeParam(url, "utm_term") + url = _removeParam(url, "utm_content") + + u, err = _url.Parse(url) + if err != nil { + log.Fatal(err) + } + + /** + * Append www. to normalize URL. exclude relative URLs starting with // since this is not recognized by Go + * Screw that, wierd edge case. Someone pasted a + */ + if ! strings.HasPrefix(u.Host, "www.") { + u.Host = "www." + u.Host + } + url = u.String() + + return url +} diff --git a/init.go b/init.go new file mode 100644 index 0000000..9127c5d --- /dev/null +++ b/init.go @@ -0,0 +1,58 @@ +package main + +import ( + "errors" + "strings" + + log "github.com/sirupsen/logrus" + flag "github.com/spf13/pflag" +) + +// global config, gets overwritten by main +var _conf Config + +func init() { + // overwrites unhelpful error message + flag.ErrHelp = errors.New("") + + // we need to parse the config because of log level setting + configFile := flag.StringP("config", "c", "", "path to config file") + debug := flag.BoolP("debug", "d", false, "set log level to \"Debug\"") + verbose := flag.BoolP("verbose", "v", false, "set log level to \"Debug\", same as --debug") + silent := flag.BoolP("silent", "s", false, "suppress output except warnings") + loglevel_f := flag.String("loglevel", "Warn", `set log level, can be "Warn", "Info" or "Debug"`) + user_agent_f := flag.StringP("user-agent", "u", "", "set user agent") + delay_f := flag.Int("delay", 0, "enable and set delay in seconds between crawls (default 0)") + ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignore robots.txt") + + flag.Parse() + loglevel := strings.ToLower(*loglevel_f) + + if *debug || *verbose || loglevel == "debug" { + log.SetLevel(log.DebugLevel) + } else if loglevel == "info" { + log.SetLevel(log.InfoLevel) + } else { + log.SetLevel(log.WarnLevel) + } + + if *silent { + log.SetLevel(log.WarnLevel) + } + + _conf.parseConfig(*configFile) + + if *user_agent_f != "" { + _conf.UserAgent = *user_agent_f + } + if *delay_f != 0 { + _conf.Delay = *delay_f + } + if !*ignore_robots_f { + _conf.IgnoreRobotsTXT = *ignore_robots_f + } + + if _conf.Debug && !*silent { + log.SetLevel(log.DebugLevel) + } +} diff --git a/main.go b/main.go new file mode 100644 index 0000000..8940afc --- /dev/null +++ b/main.go @@ -0,0 +1,462 @@ +package main + +import ( + "html" + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "net/url" + "regexp" + "strconv" + "strings" + "time" + + "github.com/anikhasibul/queue" + "github.com/jmoiron/sqlx" + log "github.com/sirupsen/logrus" + "mvdan.cc/xurls/v2" +) + +type App struct { + Config *Config + DB *sqlx.DB + Now time.Time +} + +func main() { + var err error + _own_conf := _conf + app := App{Config: &_own_conf} + _conf = Config{} + + app.Now = time.Now() + + log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)) + + app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions) + if err != nil { + log.Fatal(err, "Cannot connect to database") + } + + if err = app.DB.Ping(); err != nil { + log.Fatal(err, "No connection to database") + } + defer app.DB.Close() + + /* + app.deleteOrphanedArticles() + app.topStories() + app.deleteOrphanedArticles() + app.updateAllDiscussions() + */ + app.walkDown() + + /** + * Resolve redirects on stored urls. + */ + //return +} + +func (app *App) walkDown() { + + //var err error + + //max_item := getMaxItem() + //max_item := 41495306 + //max_item := 36128477 + max_item := 32670334 + //max_item := 41231601 + //max_item := 41165987 + //max_item := 41136898 + //max_item := 22554000 + //max_item := 22494596 + //max_item := 22354383 + //max_item := 18984000 + //max_item := 18732000 + //max_item := 16017000 + //max_item := 15494000 + //max_item := 15038031 + //max_item := 14450000 + + const maxRoutines = 200 + + q := queue.New(maxRoutines) + defer q.Close() + //for i := max_item; i > 22600000; i-- { + for i := max_item; i > 0; i-- { + q.Add() + go func(i int) { + defer q.Done() + + Story, ok := getStory(i) + if ok { + if len(Story.Links) > 0 { + //log.Debugf("%+v\n", Story) + //log.Debugf("%+v\n", Story.Links) + } + err := app.saveStory(Story) + if err != nil { + log.Fatal(err) + } + /* + */ + } + + /* + * Prints status update every 1000th entry + */ + if i%1000 == 0 { + log.Infof("%s: Getting item %d\n", time.Now(), i) + } + }(i) + } + q.Wait() +} + +func getMaxItem() int { + response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json") + if err != nil { + panic(err) + } + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + max_item, err := strconv.Atoi(string(data)) + if err != nil { + panic(err) + } + + return max_item +} + +func (app *App) topStories() { + var err error + + data1 := strings.TrimSuffix(string(getTopStories()), "]") + data2 := strings.TrimPrefix(string(getBestStories()), "[") + + data1 = data1 + "," + data := data1 + data2 + + var story_ids []int + err = json.Unmarshal([]byte(data), &story_ids) + if err != nil { + log.Warn("topStories: Unmarshaling json failed") + panic(err) + } + + const maxRoutines = 20 + + q := queue.New(maxRoutines) + defer q.Close() + for _, id := range story_ids { + q.Add() + go func(id int) { + Story, ok := getStory(id) + defer q.Done() + if ok { + log.Infof("%+v\n", Story) + err = app.saveStory(Story) + if err != nil { + log.Fatal(err) + } + + } + }(id) + } + q.Wait() +} + +func getStory(id int) (Story, bool) { + Story := getDetail(id) + if Story.Dead { + return Story, false + } + if Story.Type == "Story" && Story.Score < 10 && Story.Descendants < 10 { + return Story, false + } + var duplicates = make(map[string]bool) + /* + if (time.Now().Unix() - 3456000) > int64(Story.Time) { + } + */ + + Story.Title = stripHNPrefix(Story.Title) + + u, err := url.Parse(Story.Url) + if err != nil { + log.Warnf("getStory: Parsing URL failed: %s\n", err.Error()) + return Story, false + } + + /** + * Check if story links to Youtube + */ + is_video, err := regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_video { + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 2 + Story.Links = append(Story.Links, link) + + log.Info("match youtube host") + log.Infof("%+v\n", Story) + + duplicates[link.Url] = true + } + + /** + * Check if story links to movie platform + */ + is_movie, err := regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_movie { + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 1 + Story.Links = append(Story.Links, link) + + log.Info("match moview platform url") + log.Infof("%+v\n", Story) + + duplicates[link.Url] = true + } + + /** + * Check for (Video) in title + */ + is_video, err = regexp.MatchString("(?i)(\\(video\\))|(\\[video\\])", Story.Title) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_video { + if ! duplicates[Story.Url] { + + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 2 + Story.Links = append(Story.Links, link) + + log.Info("match video title") + log.Infof("%+v\n", Story) + + duplicates[Story.Url] = true + } + + } + + /** + * Check if story links to movie platform + */ + is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_movie { + if ! duplicates[Story.Url] { + + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 1 + Story.Links = append(Story.Links, link) + + log.Info("match moview platform url") + log.Infof("%+v\n", Story) + + duplicates[Story.Url] = true + } + + } + + /** + * Parse all URLs in Story.Text + */ + rxRelaxed := xurls.Relaxed() + rxLinks := rxRelaxed.FindAllString(html.UnescapeString(Story.Text), -1) + + for _, rxLink := range rxLinks { + + /** + * Check for Youtube in text field + */ + is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", rxLink) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_video { + if ! duplicates[rxLink] { + + var link Link + link.Url = normalizeUrl(rxLink) + link.Field = 2 + Story.Links = append(Story.Links, link) + + log.Info("match youtube text") + log.Infof("%+v\n", Story) + + duplicates[rxLink] = true + } + + } + + /** + * Check for movie platforms in text field + */ + is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", rxLink) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_movie { + if ! duplicates[rxLink] { + + var link Link + link.Url = normalizeUrl(rxLink) + link.Field = 1 + Story.Links = append(Story.Links, link) + + log.Info("match moview platform text") + log.Infof("%+v\n", Story) + + duplicates[rxLink] = true + } + + } + } + + //Story.Url = normalizeUrl(Story.Url) + + if len(Story.Links) > 0 { + return Story, true + } else { + return Story, false + } +} + +func getResponse(url string) *http.Response { + var err error + var response *http.Response + + response, err = http.Get(url) + if err != nil { + for i := 0; i < 4; i++ { + if i == 0 { + log.Debug("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i)) + } else { + log.Warn("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i)) + } + resp2, err2 := http.Get(url) + if err2 == nil { + return resp2 + } + } + panic(err) + } + return response +} + +func getBestResponse() *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/beststories.json" + return getResponse(_url) +} + +func getTopResponse() *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/topstories.json" + return getResponse(_url) +} + +func getStoryResponse(item_id string) *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json" + return getResponse(_url) +} + +func getDetail(id int) Story { + response := getStoryResponse(strconv.Itoa(id)) + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + var story Story + err = json.Unmarshal(data, &story) + if err != nil { + log.Warn("getDetail: Unmarshaling json failed ", data) + panic(err) + } + //log.Debug("%+v\n", Story) + + story.Text = html.UnescapeString(story.Text) + + return story +} + +func getTopStories() []byte { + response := getTopResponse() + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + return data +} + +func getBestStories() []byte { + response := getBestResponse() + + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + return data +} + +func (app *App) updateAllDiscussions() { + const maxRoutines = 20 + var item_ids []int + + app.DB.Select(&item_ids, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) order by posted_on") + + q := queue.New(maxRoutines) + defer q.Close() + + for _, item_id := range item_ids { + q.Add() + go func(item_id int) { + defer q.Done() + Story, ok := getStory(item_id) + if !ok { + /** + * Check if we got a network error or a dead story. + */ + if 0 == Story.Id { + log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", item_id) + } else if Story.Descendants > 10 || Story.Score > 10 { + log.Infof(` + updateAllDiscussions: There is a bug. Can't update discussion with id %d. + NOTE: If this is happening again, probably the url was changed from Wikipedia to a different source. + %+v\n + `, item_id, Story) + } + return + } + err := app.updateDiscussion(Story) + if err != nil { + log.Warn(err) + return + } + }(item_id) + } + q.Wait() +} diff --git a/sql.sql b/sql.sql new file mode 100644 index 0000000..3a5d8da --- /dev/null +++ b/sql.sql @@ -0,0 +1,3 @@ +create table story( id int primary key auto_increment, story_id int not null unique, created_at timestamp, updated_at timestamp, type varchar(255) not null, title varchar(255) not null, text text, score int not null, descendants int not null, time int not null, poster varchar(255) not null); + +create table links (id int primary key auto_increment, created_at timestamp, updated_at timestamp, story_id int not null, url varchar(255) not null, field int not null, foreign key(story_id) references story(id)); diff --git a/struct.go b/struct.go new file mode 100644 index 0000000..a20c244 --- /dev/null +++ b/struct.go @@ -0,0 +1,26 @@ +package main + +type Story struct { + Id int + //Deleted bool + Type string /* story, comment (or job, poll, pollopt) */ + Title string /* title (only story) */ + Text string /* comment text or possible text on story (HTML) */ + Dead bool + Url string /* verbatim parsed URL */ + //NormalizedUrl string /* normalized */ + Score int /* only story */ + Descendants int /* comments on score or kids on comments */ + //Kids []int /* id of the item's comments */ + Time int /* posted at */ + By string /* hn commenter */ + Links []Link /* matched urls */ +} + +type Link struct { + Url string + Field int /* 2 = video, 1 = movies, 0 = bug */ +} + +type URL struct { +} -- cgit v1.2.3