diff options
| -rw-r--r-- | Makefile | 18 | ||||
| -rw-r--r-- | config.go | 104 | ||||
| -rw-r--r-- | database.go | 426 | ||||
| -rw-r--r-- | go.mod | 50 | ||||
| -rw-r--r-- | go.sum | 154 | ||||
| -rw-r--r-- | helper.go | 168 | ||||
| -rw-r--r-- | init.go | 58 | ||||
| -rw-r--r-- | main.go | 462 | ||||
| -rw-r--r-- | sql.sql | 3 | ||||
| -rw-r--r-- | struct.go | 26 |
10 files changed, 1469 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e8115cd --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ +BINARY=hn-crawler + +all: build run + +build: + go build -o $(BINARY) + +run: + ./$(BINARY) + +info: build + ./$(BINARY) --loglevel=info + +debug: build + ./$(BINARY) -d + +clean: + rm $(BINARY) diff --git a/config.go b/config.go new file mode 100644 index 0000000..7c36c53 --- /dev/null +++ b/config.go @@ -0,0 +1,104 @@ +package main + +import ( + "os" + + log "github.com/sirupsen/logrus" + "github.com/spf13/viper" +) + +type Config struct { + DBDriver string + DBDBName string + DBHost string + DBPort string + DBUser string + DBPassword string + DBOptions string + + UserAgent string + Delay int + IgnoreRobotsTXT bool + + BasicAuthUsername string + BasicAuthPassword string + + Debug bool // sets log level to debug +} + +// Parses the configuration and sets the configuration struct. +func (c *Config) parseConfig(configFile string) { + + viper.SetDefault("DB_Driver", "mysql") + viper.SetDefault("DB_DBName", "hncrawler") + viper.SetDefault("DB_Host", "localhost") + viper.SetDefault("DB_Port", "3306") + + viper.SetDefault("Debug", false) + viper.SetDefault("Delay", 0) + + // needs some refactoring to truly respect robots.txt + viper.SetDefault("IgnoreRobotsTXT", true) + + viper.SetDefault("UserAgent", "colly - a friendly crawler :)") + + // Name of the configuration file + viper.SetConfigName("config") + + // Where to find the config file + if configFile == "" { + viper.AddConfigPath(".") + } else { + stat, err := os.Stat(configFile) + if os.IsNotExist(err) { + // provided config file does not exist, so we add the path instead + viper.AddConfigPath(configFile) + } else if err == nil && stat.IsDir() { + // adds the path to look for the config file + viper.AddConfigPath(configFile) + } else if err == nil { + // directly sets the config file + viper.SetConfigFile(configFile) + } else { + // if we are here something went wrong + log.Warn(err, 
"config.go: os.Stat("+configFile+") error") + // adding the path nonetheless because it's not hurting + viper.AddConfigPath(configFile) + } + } + + // Env variables need to be prefixed with "ALKOBOTE_" + viper.SetEnvPrefix("DISCUSS_") + + // Parses automatic the matching env variables + viper.AutomaticEnv() + + // Reads the config + err := viper.ReadInConfig() + if err != nil { + log.Fatal(err, "Config: Error parsing config file.") + } + log.Debug("Config: Config file used: " + viper.ConfigFileUsed()) + + c.setsConfig() +} + +// Actually sets the config struct +func (c *Config) setsConfig() { + c.DBDriver = viper.GetString("DB_Driver") + c.DBHost = viper.GetString("DB_Host") + c.DBPort = viper.GetString("DB_Port") + c.DBUser = viper.GetString("DB_User") + c.DBPassword = viper.GetString("DB_Password") + c.DBDBName = viper.GetString("DB_DBName") + c.DBOptions = viper.GetString("DB_Options") + + c.UserAgent = viper.GetString("UserAgent") + c.Delay = viper.GetInt("Delay") + c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT") + + c.BasicAuthUsername = viper.GetString("BasicAuthUsername") + c.BasicAuthPassword = viper.GetString("BasicAuthPassword") + + c.Debug = viper.GetBool("Debug") +} diff --git a/database.go b/database.go new file mode 100644 index 0000000..e3ba060 --- /dev/null +++ b/database.go @@ -0,0 +1,426 @@ +package main + +import ( + log "github.com/sirupsen/logrus" + "strconv" + + "database/sql" + _ "github.com/go-sql-driver/mysql" +) + +func (app *App) saveStory(s Story) error { + query := ` + INSERT IGNORE story ( + id, + story_id, + created_at, + updated_at, + type, + title, + text, + descendants, + time, + poster + ) VALUES ( + NULL, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ? 
+ ); + ` + + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveStory: Preparing query failed") + return err + } + defer stmt.Close() + + res, err := stmt.Exec(s.Id, app.Now, app.Now, s.Type, s.Title, s.Text, s.Score, s.Time, s.By) + if err != nil { + log.Warn("saveStory: Statement execution failed") + return err + } + lid, err := res.LastInsertId() + if err != nil { + log.Warn("saveStory: lastInsertId() failed") + return err + } + + log.Debugf("saveStory: Successfull insert for item %d\n", s.Id) + + for _, l := range s.Links { + query = ` + INSERT IGNORE links( + id, + created_at, + updated_at, + story_id, + url, + field + ) VALUES ( + NULL, + ?, + ?, + ?, + ?, + ? + ); + ` + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveStory: InsertLinks: Preparing query failed") + return err + } + defer stmt2.Close() + + _, err = stmt2.Exec(app.Now, app.Now, lid, l.Url, l.Field) + if err != nil { + log.Warn("saveStory: InsertLinks: Statement execution failed") + return err + } + } + + return nil + + /* + query = ` + INSERT IGNORE discussion ( + id, + created_at, + updated_at, + article_id, + title, + source, + item_id, + source_url, + posted_on, + comments, + upvotes + ) VALUES ( + NULL, + ?, + ?, + (SELECT id FROM article WHERE url = ?), + ?, + ?, + ?, + ?, + ?, + ?, + ? 
+ ); + ` + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveStory: Preparing second query failed") + return err + } + defer stmt2.Close() + + _, err = stmt2.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score) + if err != nil { + log.Warn("saveStory: Statement execution failed") + return err + } + */ + + return nil +} + +func (app *App) saveCode(s Story) error { + query := ` + INSERT IGNORE code( + id, + created_at, + updated_at, + url, + title, + source, + item_id, + source_url, + posted_on, + comments, + upvotes + ) VALUES ( + NULL, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ? + ); + ` + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveCode: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score) + if err != nil { + log.Warn("saveCode: Statement execution failed") + return err + } + + return nil +} + +func (app *App) updateDiscussion(story Story) error { + + query := ` + UPDATE discussion + set updated_at = ?, + comments = ?, + upvotes = ? + WHERE item_id = ?; + ` + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("updateDiscussion: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, story.Descendants, story.Score, story.Id) + if err != nil { + log.Warnf("updateDiscussion: Statement execution failed") + return err + } + log.Debugf("updateDiscussion: Successful update of %d with new Score: %d, Comments: %d\n", story.Id, story.Score, story.Descendants) + + return nil +} + +func (app *App) updateArticleUrl(id int, url string) error { + query := ` + UPDATE article + set updated_at = ?, + url = ? + WHERE id = ? 
+ ` + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("updateArticleUrl: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, url, id) + if err != nil { + log.Warnf("updateArticleUrl: Statement execution failed") + return err + } + log.Debugf("updateArticleUrl: Successful update new url: %s\n", url) + + return nil + +} + +func (app *App) mergeArticles(id_to_delete int, correct_url string) error { + query := "SELECT id FROM discussion WHERE article_id = ?" + row := app.DB.QueryRow(query, id_to_delete) + var disc_id int + err := row.Scan(&disc_id) + if err != nil { + log.Warnf("mergeArticles: Query first row failed. id: %d url: %s", id_to_delete, correct_url) + return err + } + query = "SELECT id FROM article WHERE url = ?" + row = app.DB.QueryRow(query, correct_url) + var article_id int + err = row.Scan(&article_id) + if err != nil { + log.Warn("mergeArticles: Query second row failed") + return err + } + + query = "UPDATE discussion SET article_id = ?, updated_at = ? WHERE id = ?;" + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("mergeArticles: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(article_id, app.Now, disc_id) + if err != nil { + log.Warn("mergeArticles: Update discussion failed") + return err + } + + query = "UPDATE article_category SET article_id = ? 
WHERE id = ?;" + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Warn("mergeArticles: Preparing article_category query failed") + return err + } + defer stmt2.Close() + + _, err = stmt2.Exec(article_id, id_to_delete) + if err != nil { + log.Warn("mergeArticles: Update article_category failed") + return err + } + + return nil +} + +func (app *App) deleteOrphanedArticles() error { + query := ` + SELECT a.id FROM + article AS a + LEFT JOIN + discussion AS d ON a.id = d.article_id + WHERE d.id IS NULL;` + row := app.DB.QueryRow(query) + + var article_id int + err := row.Scan(&article_id) + + if err != nil { + if err == sql.ErrNoRows { + return nil + } else { + log.Warnf("deleteOrphanedArticles: Executing query failed: %s", err.Error()) + return err + } + } + + query = ` + DELETE FROM article_category WHERE article_id = ? + ` + _, err = app.DB.Exec(query, article_id) + if err != nil { + log.Warnf("deleteOrphanedArticles: Delete from article_category query failed: %s", err.Error()) + return err + } + + query = ` + DELETE FROM article WHERE id = ? + + ` + _, err = app.DB.Exec(query, article_id) + if err != nil { + log.Warnf("deleteOrphanedArticles: Delete from article query failed: %s", err.Error()) + return err + } + + return nil +} + +func (app *App) saveCategory(article_id int, categories []string) { + + for _, category := range categories { + if "" == category { + log.Warnf("saveCategory: category is empty for article_id: %d", article_id) + continue + } + + query := "SELECT id FROM category WHERE name = ?" + row := app.DB.QueryRow(query, category) + var category_id int + err := row.Scan(&category_id) + + if err != nil { + if err != sql.ErrNoRows { + log.Warn("saveCategory: Selecting category id failed") + log.Fatal(err) + } + } + + if err == sql.ErrNoRows { + query = ` + INSERT INTO category ( + id, + created_at, + updated_at, + name + ) VALUES ( + null, + ?, + ?, + ? 
+ )` + + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Fatal(err) + } + defer stmt.Close() + + result, err := stmt.Exec(app.Now, app.Now, category) + if err != nil { + log.Fatal(err) + } + + category_id64, err := result.LastInsertId() + category_id = int(category_id64) + if err != nil { + log.Fatal(err) + } + } + + query = ` + INSERT IGNORE article_category ( + id, + article_id, + category_id + ) VALUES ( + null, + ?, + ? + ) + ` + + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Fatal(err) + } + _, err = stmt2.Exec(article_id, category_id) + if err != nil { + log.Fatal(err) + } + } +} + +func (app *App) getArticleIdFromUrl(wiki_url string) int { + row := app.DB.QueryRow("SELECT id FROM article WHERE url = ?", wiki_url) + var article_id int + err := row.Scan(&article_id) + if err != nil { + log.Warnf("getArticleIdFromUrl: Query or scanning failed for: %s", wiki_url) + log.Fatal(err) + } + return article_id +} + +func (app *App) getAllArticles() { + rows, err := app.DB.Query("SELECT DISTINCT article_id FROM discussion;") + if err != nil { + log.Fatal(err) + } + + for rows.Next() { + var article_id int + + err = rows.Scan(&article_id) + if err != nil { + log.Fatal(err) + } + + log.Println(article_id) + } +} @@ -0,0 +1,50 @@ +module hn-crawler + +go 1.19 + +require ( + github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde + github.com/go-sql-driver/mysql v1.7.1 + github.com/gocolly/colly v1.2.0 + github.com/jmoiron/sqlx v1.3.5 + github.com/sirupsen/logrus v1.9.3 + github.com/spf13/pflag v1.0.5 + github.com/spf13/viper v1.18.2 +) + +require ( + github.com/PuerkitoBio/goquery v1.8.1 // indirect + github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/antchfx/htmlquery v1.3.0 // indirect + github.com/antchfx/xmlquery v1.3.18 // indirect + github.com/antchfx/xpath v1.2.4 // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/gobwas/glob v0.2.3 // indirect + github.com/golang/groupcache 
v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/golang/protobuf v1.5.3 // indirect + github.com/hashicorp/hcl v1.0.0 // indirect + github.com/kennygrant/sanitize v1.2.4 // indirect + github.com/magiconair/properties v1.8.7 // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/mvdan/xurls v1.1.0 // indirect + github.com/pelletier/go-toml/v2 v2.1.0 // indirect + github.com/sagikazarmark/locafero v0.4.0 // indirect + github.com/sagikazarmark/slog-shim v0.1.0 // indirect + github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect + github.com/sourcegraph/conc v0.3.0 // indirect + github.com/spf13/afero v1.11.0 // indirect + github.com/spf13/cast v1.6.0 // indirect + github.com/subosito/gotenv v1.6.0 // indirect + github.com/temoto/robotstxt v1.1.2 // indirect + go.uber.org/atomic v1.9.0 // indirect + go.uber.org/multierr v1.9.0 // indirect + golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect + golang.org/x/net v0.19.0 // indirect + golang.org/x/sys v0.15.0 // indirect + golang.org/x/text v0.14.0 // indirect + google.golang.org/appengine v1.6.7 // indirect + google.golang.org/protobuf v1.31.0 // indirect + gopkg.in/ini.v1 v1.67.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + mvdan.cc/xurls/v2 v2.5.0 // indirect +) @@ -0,0 +1,154 @@ +github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= +github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= +github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde h1:xcvoK8AzKQi2TR/lgV5lcG3PcDU4T3F8hN75Ou3KZ6w= +github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde/go.mod h1:CZ177vKofY/zZG0s1KUJQflzzEWlceyyqW8RRpyMqfs= +github.com/antchfx/htmlquery v1.3.0 
h1:5I5yNFOVI+egyia5F2s/5Do2nFWxJz41Tr3DyfKD25E= +github.com/antchfx/htmlquery v1.3.0/go.mod h1:zKPDVTMhfOmcwxheXUsx4rKJy8KEY/PU6eXr/2SebQ8= +github.com/antchfx/xmlquery v1.3.18 h1:FSQ3wMuphnPPGJOFhvc+cRQ2CT/rUj4cyQXkJcjOwz0= +github.com/antchfx/xmlquery v1.3.18/go.mod h1:Afkq4JIeXut75taLSuI31ISJ/zeq+3jG7TunF7noreA= +github.com/antchfx/xpath v1.2.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/antchfx/xpath v1.2.4 h1:dW1HB/JxKvGtJ9WyVGJ0sIoEcqftV3SqIstujI+B9XY= +github.com/antchfx/xpath v1.2.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= +github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= +github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= +github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= +github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= 
+github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/jmoiron/sqlx v1.3.5 h1:vFFPA71p1o5gAeqtEAwLU4dnX2napprKtHr7PYIcN3g= +github.com/jmoiron/sqlx v1.3.5/go.mod h1:nRVWtLre0KfCLJvgxzCsLVMogSvQ1zNJtpYr2Ccp0mQ= +github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= +github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/lib/pq v1.2.0 h1:LXpIM/LZ5xGFhOpXAQUIMM1HdyqzVYM13zNdjCEEcA0= +github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= +github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= +github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg= +github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/mvdan/xurls v1.1.0 h1:OpuDelGQ1R1ueQ6sSryzi6P+1RtBpfQHM8fJwlE45ww= +github.com/mvdan/xurls 
v1.1.0/go.mod h1:tQlNn3BED8bE/15hnSL2HLkDeLWpNPAwtw7wkEq44oU= +github.com/pelletier/go-toml/v2 v2.1.0 h1:FnwAJ4oYMvbT/34k9zzHuZNrhlz48GB3/s6at6/MHO4= +github.com/pelletier/go-toml/v2 v2.1.0/go.mod h1:tJU2Z3ZkXwnxa4DPO899bsyIoywizdUvyaeZurnPPDc= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ= +github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4= +github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= +github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= +github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= +github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= +github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= +github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0= +github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= 
+github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/viper v1.18.2 h1:LUXCnvUvSM6FXAsj6nnfc8Q2tp1dIgUfY9Kc8GsSOiQ= +github.com/spf13/viper v1.18.2/go.mod h1:EKmWIqdnk5lOcmR72yw6hS+8OPYcwD0jteitLMVB+yk= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= +github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= +go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= +go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod 
h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= +golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 
+golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors 
v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= +gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8= +mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE= diff --git a/helper.go b/helper.go new file mode 100644 index 0000000..af5f4c1 --- /dev/null +++ b/helper.go @@ -0,0 +1,168 @@ +package main + +import ( + _url "net/url" + "strings" + log "github.com/sirupsen/logrus" + "regexp" +) + +func stripHNPrefix(title string) string { + title = strings.TrimPrefix(title, "Ask HN:") + title = strings.TrimPrefix(title, "Show HN:") + title = strings.TrimPrefix(title, "Tell HN:") + title = strings.TrimPrefix(title, "Experiment HN:") + title 
= strings.TrimPrefix(title, "Launch HN:") + + return strings.TrimSpace(title) +} + +/** + * removes given param from URL + */ +func _removeParam(url, key string) string { + u, err := _url.Parse(url) + if err != nil { + log.Fatal(err) + } + q := u.Query() + q.Del(key) + u.RawQuery = q.Encode() + return u.String() +} + +func normalizeUrl(url string) string { + match, err := regexp.MatchString("^http://", url) + if err != nil { + log.Fatal(err) + } + if match { + log.Debug("normalize: ", "http:// ", url) + r := regexp.MustCompile("^http://") + url = r.ReplaceAllString(url, "https://") + } + + // add missing https:// if no scheme + u, err := _url.Parse(url) + if err != nil { + log.Fatal(err) + } + + if "" == u.Scheme { + if strings.HasPrefix(url, "/") { + url = "https:" + url + } else { + url = "https://" + url + } + } + + + match, err = regexp.MatchString("youtube://", url) + if err != nil { + log.Fatal(err) + } + if match { + r := regexp.MustCompile("youtube://") + url = r.ReplaceAllString(url, "https://") + } + + match, err = regexp.MatchString("youtu.be/", url) + if err != nil { + log.Fatal(err) + } + if match { + log.Debug("normalize: ", "youtu.be ", url) + + /** + * remove tracking param "si" + */ + url = _removeParam(url, "si") + url = _removeParam(url, "feature") + + u, err := _url.Parse(url) + if err != nil { + log.Fatal(err) + } + q := u.Query() + q.Add("v", strings.TrimLeft(u.Path, "/")) + + u.Host = "www.youtube.com" + u.Path = "watch" + + u.RawQuery = q.Encode() + url = u.String() + + //r := regexp.MustCompile("youtu.be/") + //url = r.ReplaceAllString(url, "youtube.com/watch?v=") + } + + match, err = regexp.MatchString("/m.youtube.com/", url) + if err != nil { + log.Fatal(err) + } + if match { + log.Debug("normalize: ", "m.youtube.com ", url) + + /** + * remove tracking param "si" + */ + url = _removeParam(url, "si") + url = _removeParam(url, "feature") + + r := regexp.MustCompile("/m.youtube.com/") + url = r.ReplaceAllString(url, "/www.youtube.com/") + } 
+ + match, err = regexp.MatchString("/m.imdb.com/", url) + if err != nil { + log.Fatal(err) + } + if match { + log.Debug("normalize: ", "m.imdb.com ", url) + + /** + * remove tracking param "si" + */ + url = _removeParam(url, "si") + url = _removeParam(url, "feature") + + r := regexp.MustCompile("/m.imdb.com/") + url = r.ReplaceAllString(url, "/www.imdb.com") + } + + /* + match, err = regexp.MatchString("m.wikipedia.org", url) + if err != nil { + log.Fatal(err) + } + if match { + r := regexp.MustCompile("m.wikipedia.org") + url = r.ReplaceAllString(url, "wikipedia.org") + } + */ + + /** + * remove tracking utm_ params + */ + url = _removeParam(url, "utm_source") + url = _removeParam(url, "utm_medium") + url = _removeParam(url, "utm_campaign") + url = _removeParam(url, "utm_term") + url = _removeParam(url, "utm_content") + + u, err = _url.Parse(url) + if err != nil { + log.Fatal(err) + } + + /** + * Append www. to normalize URL. exclude relative URLs starting with // since this is not recognized by Go + * Screw that, wierd edge case. Someone pasted a + */ + if ! strings.HasPrefix(u.Host, "www.") { + u.Host = "www." 
+ u.Host + } + url = u.String() + + return url +} @@ -0,0 +1,58 @@ +package main + +import ( + "errors" + "strings" + + log "github.com/sirupsen/logrus" + flag "github.com/spf13/pflag" +) + +// global config, gets overwritten by main +var _conf Config + +func init() { + // overwrites unhelpful error message + flag.ErrHelp = errors.New("") + + // we need to parse the config because of log level setting + configFile := flag.StringP("config", "c", "", "path to config file") + debug := flag.BoolP("debug", "d", false, "set log level to \"Debug\"") + verbose := flag.BoolP("verbose", "v", false, "set log level to \"Debug\", same as --debug") + silent := flag.BoolP("silent", "s", false, "suppress output except warnings") + loglevel_f := flag.String("loglevel", "Warn", `set log level, can be "Warn", "Info" or "Debug"`) + user_agent_f := flag.StringP("user-agent", "u", "", "set user agent") + delay_f := flag.Int("delay", 0, "enable and set delay in seconds between crawls (default 0)") + ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignore robots.txt") + + flag.Parse() + loglevel := strings.ToLower(*loglevel_f) + + if *debug || *verbose || loglevel == "debug" { + log.SetLevel(log.DebugLevel) + } else if loglevel == "info" { + log.SetLevel(log.InfoLevel) + } else { + log.SetLevel(log.WarnLevel) + } + + if *silent { + log.SetLevel(log.WarnLevel) + } + + _conf.parseConfig(*configFile) + + if *user_agent_f != "" { + _conf.UserAgent = *user_agent_f + } + if *delay_f != 0 { + _conf.Delay = *delay_f + } + if !*ignore_robots_f { + _conf.IgnoreRobotsTXT = *ignore_robots_f + } + + if _conf.Debug && !*silent { + log.SetLevel(log.DebugLevel) + } +} @@ -0,0 +1,462 @@ +package main + +import ( + "html" + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "net/url" + "regexp" + "strconv" + "strings" + "time" + + "github.com/anikhasibul/queue" + "github.com/jmoiron/sqlx" + log "github.com/sirupsen/logrus" + "mvdan.cc/xurls/v2" +) + +type App struct { + Config *Config + DB 
*sqlx.DB + Now time.Time +} + +func main() { + var err error + _own_conf := _conf + app := App{Config: &_own_conf} + _conf = Config{} + + app.Now = time.Now() + + log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)) + + app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions) + if err != nil { + log.Fatal(err, "Cannot connect to database") + } + + if err = app.DB.Ping(); err != nil { + log.Fatal(err, "No connection to database") + } + defer app.DB.Close() + + /* + app.deleteOrphanedArticles() + app.topStories() + app.deleteOrphanedArticles() + app.updateAllDiscussions() + */ + app.walkDown() + + /** + * Resolve redirects on stored urls. + */ + //return +} + +func (app *App) walkDown() { + + //var err error + + //max_item := getMaxItem() + //max_item := 41495306 + //max_item := 36128477 + max_item := 32670334 + //max_item := 41231601 + //max_item := 41165987 + //max_item := 41136898 + //max_item := 22554000 + //max_item := 22494596 + //max_item := 22354383 + //max_item := 18984000 + //max_item := 18732000 + //max_item := 16017000 + //max_item := 15494000 + //max_item := 15038031 + //max_item := 14450000 + + const maxRoutines = 200 + + q := queue.New(maxRoutines) + defer q.Close() + //for i := max_item; i > 22600000; i-- { + for i := max_item; i > 0; i-- { + q.Add() + go func(i int) { + defer q.Done() + + Story, ok := getStory(i) + if ok { + if len(Story.Links) > 0 { + //log.Debugf("%+v\n", Story) + //log.Debugf("%+v\n", Story.Links) + } + err := app.saveStory(Story) + if err != nil { + log.Fatal(err) + } + /* + */ + } + + /* + * Prints status update every 1000th entry + */ + if i%1000 == 0 { + log.Infof("%s: Getting item %d\n", time.Now(), i) + } + }(i) + } 
+ q.Wait() +} + +func getMaxItem() int { + response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json") + if err != nil { + panic(err) + } + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + max_item, err := strconv.Atoi(string(data)) + if err != nil { + panic(err) + } + + return max_item +} + +func (app *App) topStories() { + var err error + + data1 := strings.TrimSuffix(string(getTopStories()), "]") + data2 := strings.TrimPrefix(string(getBestStories()), "[") + + data1 = data1 + "," + data := data1 + data2 + + var story_ids []int + err = json.Unmarshal([]byte(data), &story_ids) + if err != nil { + log.Warn("topStories: Unmarshaling json failed") + panic(err) + } + + const maxRoutines = 20 + + q := queue.New(maxRoutines) + defer q.Close() + for _, id := range story_ids { + q.Add() + go func(id int) { + Story, ok := getStory(id) + defer q.Done() + if ok { + log.Infof("%+v\n", Story) + err = app.saveStory(Story) + if err != nil { + log.Fatal(err) + } + + } + }(id) + } + q.Wait() +} + +func getStory(id int) (Story, bool) { + Story := getDetail(id) + if Story.Dead { + return Story, false + } + if Story.Type == "Story" && Story.Score < 10 && Story.Descendants < 10 { + return Story, false + } + var duplicates = make(map[string]bool) + /* + if (time.Now().Unix() - 3456000) > int64(Story.Time) { + } + */ + + Story.Title = stripHNPrefix(Story.Title) + + u, err := url.Parse(Story.Url) + if err != nil { + log.Warnf("getStory: Parsing URL failed: %s\n", err.Error()) + return Story, false + } + + /** + * Check if story links to Youtube + */ + is_video, err := regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_video { + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 2 + Story.Links = append(Story.Links, link) + + log.Info("match youtube host") + log.Infof("%+v\n", Story) + + 
duplicates[link.Url] = true + } + + /** + * Check if story links to movie platform + */ + is_movie, err := regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_movie { + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 1 + Story.Links = append(Story.Links, link) + + log.Info("match moview platform url") + log.Infof("%+v\n", Story) + + duplicates[link.Url] = true + } + + /** + * Check for (Video) in title + */ + is_video, err = regexp.MatchString("(?i)(\\(video\\))|(\\[video\\])", Story.Title) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_video { + if ! duplicates[Story.Url] { + + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 2 + Story.Links = append(Story.Links, link) + + log.Info("match video title") + log.Infof("%+v\n", Story) + + duplicates[Story.Url] = true + } + + } + + /** + * Check if story links to movie platform + */ + is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_movie { + if ! 
duplicates[Story.Url] { + + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 1 + Story.Links = append(Story.Links, link) + + log.Info("match moview platform url") + log.Infof("%+v\n", Story) + + duplicates[Story.Url] = true + } + + } + + /** + * Parse all URLs in Story.Text + */ + rxRelaxed := xurls.Relaxed() + rxLinks := rxRelaxed.FindAllString(html.UnescapeString(Story.Text), -1) + + for _, rxLink := range rxLinks { + + /** + * Check for Youtube in text field + */ + is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", rxLink) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_video { + if ! duplicates[rxLink] { + + var link Link + link.Url = normalizeUrl(rxLink) + link.Field = 2 + Story.Links = append(Story.Links, link) + + log.Info("match youtube text") + log.Infof("%+v\n", Story) + + duplicates[rxLink] = true + } + + } + + /** + * Check for movie platforms in text field + */ + is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", rxLink) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_movie { + if ! duplicates[rxLink] { + + var link Link + link.Url = normalizeUrl(rxLink) + link.Field = 1 + Story.Links = append(Story.Links, link) + + log.Info("match moview platform text") + log.Infof("%+v\n", Story) + + duplicates[rxLink] = true + } + + } + } + + //Story.Url = normalizeUrl(Story.Url) + + if len(Story.Links) > 0 { + return Story, true + } else { + return Story, false + } +} + +func getResponse(url string) *http.Response { + var err error + var response *http.Response + + response, err = http.Get(url) + if err != nil { + for i := 0; i < 4; i++ { + if i == 0 { + log.Debug("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i)) + } else { + log.Warn("getResponse: Got error connecting to firebase/wikipedia. 
Retry: " + strconv.Itoa(i)) + } + resp2, err2 := http.Get(url) + if err2 == nil { + return resp2 + } + } + panic(err) + } + return response +} + +func getBestResponse() *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/beststories.json" + return getResponse(_url) +} + +func getTopResponse() *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/topstories.json" + return getResponse(_url) +} + +func getStoryResponse(item_id string) *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json" + return getResponse(_url) +} + +func getDetail(id int) Story { + response := getStoryResponse(strconv.Itoa(id)) + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + var story Story + err = json.Unmarshal(data, &story) + if err != nil { + log.Warn("getDetail: Unmarshaling json failed ", data) + panic(err) + } + //log.Debug("%+v\n", Story) + + story.Text = html.UnescapeString(story.Text) + + return story +} + +func getTopStories() []byte { + response := getTopResponse() + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + return data +} + +func getBestStories() []byte { + response := getBestResponse() + + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + return data +} + +func (app *App) updateAllDiscussions() { + const maxRoutines = 20 + var item_ids []int + + app.DB.Select(&item_ids, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) order by posted_on") + + q := queue.New(maxRoutines) + defer q.Close() + + for _, item_id := range item_ids { + q.Add() + go func(item_id int) { + defer q.Done() + Story, ok := getStory(item_id) + if !ok { + /** + * Check if we got a network error or a dead story. 
// Story is one Hacker News item as decoded by getDetail, together with the
// links the crawler extracted from it.
//
// The fields carry no JSON tags, so decoding relies on encoding/json matching
// the API's lower-case keys to the field names case-insensitively.
type Story struct {
	Id int // item id; updateAllDiscussions treats 0 as "item could not be fetched"
	//Deleted bool
	Type  string /* story, comment (or job, poll, pollopt) */
	Title string /* title (only story) */
	Text  string /* comment text or possible text on story (HTML) */
	Dead  bool   // getStory skips items that have this set
	Url   string /* verbatim parsed URL */
	//NormalizedUrl string /* normalized */
	Score       int /* only story */
	Descendants int /* comments on score or kids on comments */
	//Kids []int /* id of the item's comments */
	Time  int    /* posted at; presumably Unix seconds - confirm against the API */
	By    string /* hn commenter */
	Links []Link /* matched urls, filled in by getStory (not part of the API payload) */
}

// Link is one URL extracted from a story, classified by the kind of site it
// points to.
type Link struct {
	Url   string // normalized URL (result of normalizeUrl)
	Field int    /* 2 = video, 1 = movies, 0 = bug */
}

// URL has no fields and is not referenced by the code visible in this commit.
type URL struct {
}
