summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoradmin2024-09-11 11:02:32 +0200
committeradmin2024-09-11 11:02:32 +0200
commit6b091dc7ab2c4fdaed0675ab57ea05e4ddb81e5b (patch)
tree0745ad06f9a5e61c3e24f97cdd8c3636f76c1e42
downloadhncrawler-6b091dc7ab2c4fdaed0675ab57ea05e4ddb81e5b.tar.gz
init
-rw-r--r--Makefile18
-rw-r--r--config.go104
-rw-r--r--database.go426
-rw-r--r--go.mod50
-rw-r--r--go.sum154
-rw-r--r--helper.go168
-rw-r--r--init.go58
-rw-r--r--main.go462
-rw-r--r--sql.sql3
-rw-r--r--struct.go26
10 files changed, 1469 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e8115cd
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,18 @@
+# Name of the binary produced by "make build".
+BINARY=hn-crawler
+
+# None of these targets produce a file named after themselves.
+.PHONY: all build run info debug clean
+
+all: build run
+
+build:
+	go build -o $(BINARY)
+
+run:
+	./$(BINARY)
+
+info: build
+	./$(BINARY) --loglevel=info
+
+debug: build
+	./$(BINARY) -d
+
+# -f keeps "make clean" from failing when the binary was never built.
+clean:
+	rm -f $(BINARY)
diff --git a/config.go b/config.go
new file mode 100644
index 0000000..7c36c53
--- /dev/null
+++ b/config.go
@@ -0,0 +1,104 @@
+package main
+
+import (
+ "os"
+
+ log "github.com/sirupsen/logrus"
+ "github.com/spf13/viper"
+)
+
+// Config holds the crawler settings. The fields are filled by setsConfig
+// from the values viper resolved in parseConfig.
+type Config struct {
+ // Database connection settings (keys DB_Driver, DB_DBName, DB_Host,
+ // DB_Port, DB_User, DB_Password, DB_Options).
+ DBDriver string
+ DBDBName string
+ DBHost string
+ DBPort string
+ DBUser string
+ DBPassword string
+ DBOptions string
+
+ // Crawl behaviour.
+ UserAgent string
+ Delay int // described by the --delay flag as seconds between crawls
+ IgnoreRobotsTXT bool
+
+ // NOTE(review): presumably HTTP basic-auth credentials; their use is
+ // not visible in this file.
+ BasicAuthUsername string
+ BasicAuthPassword string
+
+ Debug bool // sets log level to debug
+}
+
+// parseConfig registers the defaults, locates and reads the config file
+// through viper and then copies the resulting values into c.
+//
+// configFile selects where to look:
+//   - "" searches the current directory for a file named "config";
+//   - an existing regular file is used directly;
+//   - anything else (a directory, or a path that does not exist or cannot
+//     be inspected) is added as a search path.
+//
+// The process is terminated with log.Fatal when no config file can be read.
+func (c *Config) parseConfig(configFile string) {
+	viper.SetDefault("DB_Driver", "mysql")
+	viper.SetDefault("DB_DBName", "hncrawler")
+	viper.SetDefault("DB_Host", "localhost")
+	viper.SetDefault("DB_Port", "3306")
+
+	viper.SetDefault("Debug", false)
+	viper.SetDefault("Delay", 0)
+
+	// needs some refactoring to truly respect robots.txt
+	viper.SetDefault("IgnoreRobotsTXT", true)
+
+	viper.SetDefault("UserAgent", "colly - a friendly crawler :)")
+
+	// Name of the configuration file (without extension).
+	viper.SetConfigName("config")
+
+	// Where to find the config file.
+	if configFile == "" {
+		viper.AddConfigPath(".")
+	} else {
+		stat, err := os.Stat(configFile)
+		switch {
+		case err == nil && !stat.IsDir():
+			// An existing file: use it directly.
+			viper.SetConfigFile(configFile)
+		case err == nil || os.IsNotExist(err):
+			// A directory, or a path that does not exist: treat it as a
+			// search path.
+			viper.AddConfigPath(configFile)
+		default:
+			// Stat failed for another reason; adding the path does no harm.
+			log.Warnf("config.go: os.Stat(%s) error: %v", configFile, err)
+			viper.AddConfigPath(configFile)
+		}
+	}
+
+	// Prefix for environment variables.
+	// NOTE(review): viper inserts its own "_" between prefix and key, so
+	// with this value variables are presumably looked up as
+	// DISCUSS__<KEY>; confirm whether the trailing underscore is intended.
+	viper.SetEnvPrefix("DISCUSS_")
+
+	// Let viper check matching environment variables automatically.
+	viper.AutomaticEnv()
+
+	// Read the config file.
+	if err := viper.ReadInConfig(); err != nil {
+		log.Fatalf("Config: Error parsing config file: %v", err)
+	}
+	log.Debug("Config: Config file used: " + viper.ConfigFileUsed())
+
+	c.setsConfig()
+}
+
+// setsConfig copies the values currently held by viper into the fields of
+// c. It only reads from viper; parseConfig calls it after the config file
+// has been read.
+func (c *Config) setsConfig() {
+ // database settings
+ c.DBDriver = viper.GetString("DB_Driver")
+ c.DBHost = viper.GetString("DB_Host")
+ c.DBPort = viper.GetString("DB_Port")
+ c.DBUser = viper.GetString("DB_User")
+ c.DBPassword = viper.GetString("DB_Password")
+ c.DBDBName = viper.GetString("DB_DBName")
+ c.DBOptions = viper.GetString("DB_Options")
+
+ // crawler settings
+ c.UserAgent = viper.GetString("UserAgent")
+ c.Delay = viper.GetInt("Delay")
+ c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT")
+
+ c.BasicAuthUsername = viper.GetString("BasicAuthUsername")
+ c.BasicAuthPassword = viper.GetString("BasicAuthPassword")
+
+ c.Debug = viper.GetBool("Debug")
+}
diff --git a/database.go b/database.go
new file mode 100644
index 0000000..e3ba060
--- /dev/null
+++ b/database.go
@@ -0,0 +1,426 @@
+package main
+
+import (
+ log "github.com/sirupsen/logrus"
+ "strconv"
+
+ "database/sql"
+ _ "github.com/go-sql-driver/mysql"
+)
+
+// saveStory stores a story and the links extracted from it.
+//
+// The story row is written with INSERT IGNORE. When a row was actually
+// inserted, every entry of s.Links is written to the links table with the
+// auto-increment id of the new story row as story_id. The first database
+// error is logged and returned.
+func (app *App) saveStory(s Story) error {
+	query := `
+ INSERT IGNORE story (
+ id,
+ story_id,
+ created_at,
+ updated_at,
+ type,
+ title,
+ text,
+ descendants,
+ time,
+ poster
+ ) VALUES (
+ NULL,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?
+ );
+ `
+
+	stmt, err := app.DB.Prepare(query)
+	if err != nil {
+		log.Warn("saveStory: Preparing query failed")
+		return err
+	}
+	defer stmt.Close()
+
+	// NOTE(review): s.Score is bound to the "descendants" column here,
+	// while saveCode binds s.Descendants to comments and s.Score to
+	// upvotes. Confirm against the schema whether this is intended.
+	res, err := stmt.Exec(s.Id, app.Now, app.Now, s.Type, s.Title, s.Text, s.Score, s.Time, s.By)
+	if err != nil {
+		log.Warn("saveStory: Statement execution failed")
+		return err
+	}
+
+	// INSERT IGNORE skips rows that violate a unique key. Then no row was
+	// written and LastInsertId does not identify a new story row, so there
+	// is nothing the links could be attached to.
+	affected, err := res.RowsAffected()
+	if err != nil {
+		log.Warn("saveStory: RowsAffected() failed")
+		return err
+	}
+	if affected == 0 {
+		log.Debugf("saveStory: item %d was not inserted (ignored), skipping links\n", s.Id)
+		return nil
+	}
+
+	lid, err := res.LastInsertId()
+	if err != nil {
+		log.Warn("saveStory: lastInsertId() failed")
+		return err
+	}
+
+	log.Debugf("saveStory: Successfull insert for item %d\n", s.Id)
+
+	if len(s.Links) == 0 {
+		return nil
+	}
+
+	// Prepare the link insert once and reuse it for every link, instead of
+	// preparing (and deferring the close of) one statement per iteration.
+	query = `
+ INSERT IGNORE links(
+ id,
+ created_at,
+ updated_at,
+ story_id,
+ url,
+ field
+ ) VALUES (
+ NULL,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?
+ );
+ `
+	linkStmt, err := app.DB.Prepare(query)
+	if err != nil {
+		log.Warn("saveStory: InsertLinks: Preparing query failed")
+		return err
+	}
+	defer linkStmt.Close()
+
+	for _, l := range s.Links {
+		if _, err := linkStmt.Exec(app.Now, app.Now, lid, l.Url, l.Field); err != nil {
+			log.Warn("saveStory: InsertLinks: Statement execution failed")
+			return err
+		}
+	}
+
+	/*
+		Disabled: insert of the matching row into the discussion table.
+
+		query = `
+		INSERT IGNORE discussion (
+			id,
+			created_at,
+			updated_at,
+			article_id,
+			title,
+			source,
+			item_id,
+			source_url,
+			posted_on,
+			comments,
+			upvotes
+		) VALUES (
+			NULL,
+			?,
+			?,
+			(SELECT id FROM article WHERE url = ?),
+			?,
+			?,
+			?,
+			?,
+			?,
+			?,
+			?
+		);
+		`
+		stmt2, err := app.DB.Prepare(query)
+		if err != nil {
+			log.Warn("saveStory: Preparing second query failed")
+			return err
+		}
+		defer stmt2.Close()
+
+		_, err = stmt2.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score)
+		if err != nil {
+			log.Warn("saveStory: Statement execution failed")
+			return err
+		}
+	*/
+
+	return nil
+}
+
+// saveCode writes the story as one row of the code table (INSERT IGNORE),
+// tagged with source "HN" and a link back to its Hacker News item page.
+// A failing prepare or execute is logged and returned.
+func (app *App) saveCode(s Story) error {
+	const insertCode = `
+ INSERT IGNORE code(
+ id,
+ created_at,
+ updated_at,
+ url,
+ title,
+ source,
+ item_id,
+ source_url,
+ posted_on,
+ comments,
+ upvotes
+ ) VALUES (
+ NULL,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?
+ );
+ `
+	insert, prepErr := app.DB.Prepare(insertCode)
+	if prepErr != nil {
+		log.Warn("saveCode: Preparing query failed")
+		return prepErr
+	}
+	defer insert.Close()
+
+	// Link to the item's page on Hacker News.
+	sourceURL := "https://news.ycombinator.com/item?id=" + strconv.Itoa(s.Id)
+
+	if _, execErr := insert.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, sourceURL, s.Time, s.Descendants, s.Score); execErr != nil {
+		log.Warn("saveCode: Statement execution failed")
+		return execErr
+	}
+	return nil
+}
+
+// updateDiscussion refreshes updated_at, comments and upvotes of the
+// discussion row whose item_id equals story.Id. Database errors are logged
+// and returned.
+func (app *App) updateDiscussion(story Story) error {
+	const updateQuery = `
+ UPDATE discussion
+ set updated_at = ?,
+ comments = ?,
+ upvotes = ?
+ WHERE item_id = ?;
+ `
+	update, prepErr := app.DB.Prepare(updateQuery)
+	if prepErr != nil {
+		log.Warn("updateDiscussion: Preparing query failed")
+		return prepErr
+	}
+	defer update.Close()
+
+	if _, execErr := update.Exec(app.Now, story.Descendants, story.Score, story.Id); execErr != nil {
+		log.Warnf("updateDiscussion: Statement execution failed")
+		return execErr
+	}
+
+	log.Debugf("updateDiscussion: Successful update of %d with new Score: %d, Comments: %d\n", story.Id, story.Score, story.Descendants)
+	return nil
+}
+
+// updateArticleUrl sets url (and updated_at) on the article row with the
+// given id. Database errors are logged and returned.
+func (app *App) updateArticleUrl(id int, url string) error {
+	const updateQuery = `
+ UPDATE article
+ set updated_at = ?,
+ url = ?
+ WHERE id = ?
+ `
+	update, prepErr := app.DB.Prepare(updateQuery)
+	if prepErr != nil {
+		log.Warn("updateArticleUrl: Preparing query failed")
+		return prepErr
+	}
+	defer update.Close()
+
+	if _, execErr := update.Exec(app.Now, url, id); execErr != nil {
+		log.Warnf("updateArticleUrl: Statement execution failed")
+		return execErr
+	}
+
+	log.Debugf("updateArticleUrl: Successful update new url: %s\n", url)
+	return nil
+}
+
+// mergeArticles re-points the data of article id_to_delete to the article
+// stored under correct_url.
+//
+// It looks up one discussion of id_to_delete and the id of the article with
+// correct_url, moves that discussion to the target article and then moves
+// the article_category rows of id_to_delete as well. The article row itself
+// is not deleted here. Errors (including "no rows" from either lookup) are
+// logged and returned.
+func (app *App) mergeArticles(id_to_delete int, correct_url string) error {
+	var disc_id int
+	err := app.DB.QueryRow("SELECT id FROM discussion WHERE article_id = ?", id_to_delete).Scan(&disc_id)
+	if err != nil {
+		log.Warnf("mergeArticles: Query first row failed. id: %d url: %s", id_to_delete, correct_url)
+		return err
+	}
+
+	var article_id int
+	err = app.DB.QueryRow("SELECT id FROM article WHERE url = ?", correct_url).Scan(&article_id)
+	if err != nil {
+		log.Warn("mergeArticles: Query second row failed")
+		return err
+	}
+
+	stmt, err := app.DB.Prepare("UPDATE discussion SET article_id = ?, updated_at = ? WHERE id = ?;")
+	if err != nil {
+		log.Warn("mergeArticles: Preparing query failed")
+		return err
+	}
+	defer stmt.Close()
+
+	if _, err = stmt.Exec(article_id, app.Now, disc_id); err != nil {
+		log.Warn("mergeArticles: Update discussion failed")
+		return err
+	}
+
+	// Select the category links by article_id. The previous filter compared
+	// the link row's own primary key ("id") with an article id.
+	// IGNORE leaves a link untouched when the target article already has
+	// that category instead of aborting the merge.
+	// NOTE(review): assumes a unique key on (article_id, category_id), as
+	// the INSERT IGNORE in saveCategory suggests; confirm against the schema.
+	stmt2, err := app.DB.Prepare("UPDATE IGNORE article_category SET article_id = ? WHERE article_id = ?;")
+	if err != nil {
+		log.Warn("mergeArticles: Preparing article_category query failed")
+		return err
+	}
+	defer stmt2.Close()
+
+	if _, err = stmt2.Exec(article_id, id_to_delete); err != nil {
+		log.Warn("mergeArticles: Update article_category failed")
+		return err
+	}
+
+	return nil
+}
+
+// deleteOrphanedArticles looks up one article that no discussion refers to
+// and deletes its article_category rows followed by the article itself.
+// It returns nil when no such article exists; other database errors are
+// logged and returned. Each call handles at most one article.
+func (app *App) deleteOrphanedArticles() error {
+	const findOrphan = `
+ SELECT a.id FROM
+ article AS a
+ LEFT JOIN
+ discussion AS d ON a.id = d.article_id
+ WHERE d.id IS NULL;`
+	const deleteCategories = `
+ DELETE FROM article_category WHERE article_id = ?
+ `
+	const deleteArticle = `
+ DELETE FROM article WHERE id = ?
+
+ `
+
+	var orphanID int
+	if err := app.DB.QueryRow(findOrphan).Scan(&orphanID); err != nil {
+		if err == sql.ErrNoRows {
+			// nothing to clean up
+			return nil
+		}
+		log.Warnf("deleteOrphanedArticles: Executing query failed: %s", err.Error())
+		return err
+	}
+
+	if _, err := app.DB.Exec(deleteCategories, orphanID); err != nil {
+		log.Warnf("deleteOrphanedArticles: Delete from article_category query failed: %s", err.Error())
+		return err
+	}
+
+	if _, err := app.DB.Exec(deleteArticle, orphanID); err != nil {
+		log.Warnf("deleteOrphanedArticles: Delete from article query failed: %s", err.Error())
+		return err
+	}
+
+	return nil
+}
+
+// saveCategory links the article to each of the given categories.
+//
+// For every non-empty name the category id is looked up, the category is
+// created when it does not exist yet, and an article_category row is
+// written with INSERT IGNORE. Empty names are logged and skipped. Any
+// database error terminates the process via log.Fatal.
+func (app *App) saveCategory(article_id int, categories []string) {
+	const insertCategory = `
+		INSERT INTO category (
+			id,
+			created_at,
+			updated_at,
+			name
+		) VALUES (
+			null,
+			?,
+			?,
+			?
+		)`
+	const insertLink = `
+		INSERT IGNORE article_category (
+			id,
+			article_id,
+			category_id
+		) VALUES (
+			null,
+			?,
+			?
+		)
+		`
+
+	for _, category := range categories {
+		if category == "" {
+			log.Warnf("saveCategory: category is empty for article_id: %d", article_id)
+			continue
+		}
+
+		var category_id int
+		err := app.DB.QueryRow("SELECT id FROM category WHERE name = ?", category).Scan(&category_id)
+		switch {
+		case err == sql.ErrNoRows:
+			// Unknown category: create it and use the new id.
+			result, err := app.DB.Exec(insertCategory, app.Now, app.Now, category)
+			if err != nil {
+				log.Fatal(err)
+			}
+			newID, err := result.LastInsertId()
+			if err != nil {
+				log.Fatal(err)
+			}
+			category_id = int(newID)
+		case err != nil:
+			log.Warn("saveCategory: Selecting category id failed")
+			log.Fatal(err)
+		}
+
+		// Exec on the DB handle leaves no prepared statement behind; the
+		// earlier per-iteration statement here was never closed.
+		if _, err := app.DB.Exec(insertLink, article_id, category_id); err != nil {
+			log.Fatal(err)
+		}
+	}
+}
+
+// getArticleIdFromUrl returns the id of the article stored under wiki_url.
+// If the lookup fails (including when no such row exists) the failure is
+// logged and the process exits via log.Fatal.
+func (app *App) getArticleIdFromUrl(wiki_url string) int {
+	var id int
+	if err := app.DB.QueryRow("SELECT id FROM article WHERE url = ?", wiki_url).Scan(&id); err != nil {
+		log.Warnf("getArticleIdFromUrl: Query or scanning failed for: %s", wiki_url)
+		log.Fatal(err)
+	}
+	return id
+}
+
+// getAllArticles logs every distinct article_id referenced by the
+// discussion table. Any database error terminates the process via
+// log.Fatal.
+func (app *App) getAllArticles() {
+	rows, err := app.DB.Query("SELECT DISTINCT article_id FROM discussion;")
+	if err != nil {
+		log.Fatal(err)
+	}
+	// Release the result set (and its connection) when done.
+	defer rows.Close()
+
+	for rows.Next() {
+		var article_id int
+		if err := rows.Scan(&article_id); err != nil {
+			log.Fatal(err)
+		}
+
+		log.Println(article_id)
+	}
+
+	// Next returning false can also mean the iteration failed.
+	if err := rows.Err(); err != nil {
+		log.Fatal(err)
+	}
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..85bdde6
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,50 @@
+module hn-crawler
+
+go 1.19
+
+require (
+ github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde
+ github.com/go-sql-driver/mysql v1.7.1
+ github.com/gocolly/colly v1.2.0
+ github.com/jmoiron/sqlx v1.3.5
+ github.com/sirupsen/logrus v1.9.3
+ github.com/spf13/pflag v1.0.5
+ github.com/spf13/viper v1.18.2
+)
+
+require (
+ github.com/PuerkitoBio/goquery v1.8.1 // indirect
+ github.com/andybalholm/cascadia v1.3.1 // indirect
+ github.com/antchfx/htmlquery v1.3.0 // indirect
+ github.com/antchfx/xmlquery v1.3.18 // indirect
+ github.com/antchfx/xpath v1.2.4 // indirect
+ github.com/fsnotify/fsnotify v1.7.0 // indirect
+ github.com/gobwas/glob v0.2.3 // indirect
+ github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
+ github.com/golang/protobuf v1.5.3 // indirect
+ github.com/hashicorp/hcl v1.0.0 // indirect
+ github.com/kennygrant/sanitize v1.2.4 // indirect
+ github.com/magiconair/properties v1.8.7 // indirect
+ github.com/mitchellh/mapstructure v1.5.0 // indirect
+ github.com/mvdan/xurls v1.1.0 // indirect
+ github.com/pelletier/go-toml/v2 v2.1.0 // indirect
+ github.com/sagikazarmark/locafero v0.4.0 // indirect
+ github.com/sagikazarmark/slog-shim v0.1.0 // indirect
+ github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
+ github.com/sourcegraph/conc v0.3.0 // indirect
+ github.com/spf13/afero v1.11.0 // indirect
+ github.com/spf13/cast v1.6.0 // indirect
+ github.com/subosito/gotenv v1.6.0 // indirect
+ github.com/temoto/robotstxt v1.1.2 // indirect
+ go.uber.org/atomic v1.9.0 // indirect
+ go.uber.org/multierr v1.9.0 // indirect
+ golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
+ golang.org/x/net v0.19.0 // indirect
+ golang.org/x/sys v0.15.0 // indirect
+ golang.org/x/text v0.14.0 // indirect
+ google.golang.org/appengine v1.6.7 // indirect
+ google.golang.org/protobuf v1.31.0 // indirect
+ gopkg.in/ini.v1 v1.67.0 // indirect
+ gopkg.in/yaml.v3 v3.0.1 // indirect
+ mvdan.cc/xurls/v2 v2.5.0 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..5fb070d
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,154 @@
+github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
+github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
+github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
+github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
+github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde h1:xcvoK8AzKQi2TR/lgV5lcG3PcDU4T3F8hN75Ou3KZ6w=
+github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde/go.mod h1:CZ177vKofY/zZG0s1KUJQflzzEWlceyyqW8RRpyMqfs=
+github.com/antchfx/htmlquery v1.3.0 h1:5I5yNFOVI+egyia5F2s/5Do2nFWxJz41Tr3DyfKD25E=
+github.com/antchfx/htmlquery v1.3.0/go.mod h1:zKPDVTMhfOmcwxheXUsx4rKJy8KEY/PU6eXr/2SebQ8=
+github.com/antchfx/xmlquery v1.3.18 h1:FSQ3wMuphnPPGJOFhvc+cRQ2CT/rUj4cyQXkJcjOwz0=
+github.com/antchfx/xmlquery v1.3.18/go.mod h1:Afkq4JIeXut75taLSuI31ISJ/zeq+3jG7TunF7noreA=
+github.com/antchfx/xpath v1.2.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
+github.com/antchfx/xpath v1.2.4 h1:dW1HB/JxKvGtJ9WyVGJ0sIoEcqftV3SqIstujI+B9XY=
+github.com/antchfx/xpath v1.2.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
+github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
+github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
+github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
+github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
+github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI=
+github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI=
+github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
+github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
+github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
+github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
+github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
+github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
+github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
+github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
+github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
+github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
+github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
+github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
+github.com/jmoiron/sqlx v1.3.5 h1:vFFPA71p1o5gAeqtEAwLU4dnX2napprKtHr7PYIcN3g=
+github.com/jmoiron/sqlx v1.3.5/go.mod h1:nRVWtLre0KfCLJvgxzCsLVMogSvQ1zNJtpYr2Ccp0mQ=
+github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
+github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/lib/pq v1.2.0 h1:LXpIM/LZ5xGFhOpXAQUIMM1HdyqzVYM13zNdjCEEcA0=
+github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
+github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY=
+github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0=
+github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg=
+github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
+github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
+github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
+github.com/mvdan/xurls v1.1.0 h1:OpuDelGQ1R1ueQ6sSryzi6P+1RtBpfQHM8fJwlE45ww=
+github.com/mvdan/xurls v1.1.0/go.mod h1:tQlNn3BED8bE/15hnSL2HLkDeLWpNPAwtw7wkEq44oU=
+github.com/pelletier/go-toml/v2 v2.1.0 h1:FnwAJ4oYMvbT/34k9zzHuZNrhlz48GB3/s6at6/MHO4=
+github.com/pelletier/go-toml/v2 v2.1.0/go.mod h1:tJU2Z3ZkXwnxa4DPO899bsyIoywizdUvyaeZurnPPDc=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
+github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
+github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
+github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ=
+github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4=
+github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE=
+github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ=
+github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
+github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
+github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
+github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
+github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo=
+github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0=
+github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8=
+github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY=
+github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0=
+github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
+github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
+github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/spf13/viper v1.18.2 h1:LUXCnvUvSM6FXAsj6nnfc8Q2tp1dIgUfY9Kc8GsSOiQ=
+github.com/spf13/viper v1.18.2/go.mod h1:EKmWIqdnk5lOcmR72yw6hS+8OPYcwD0jteitLMVB+yk=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8=
+github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU=
+github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
+github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE=
+go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
+go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI=
+go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g=
+golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws=
+golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c=
+golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc=
+golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
+golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c=
+google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
+google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
+google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
+google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
+google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
+gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA=
+gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8=
+mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE=
diff --git a/helper.go b/helper.go
new file mode 100644
index 0000000..af5f4c1
--- /dev/null
+++ b/helper.go
@@ -0,0 +1,168 @@
+package main
+
+import (
+ _url "net/url"
+ "strings"
+ log "github.com/sirupsen/logrus"
+ "regexp"
+)
+
+// stripHNPrefix removes the Hacker News title markers ("Ask HN:",
+// "Show HN:", "Tell HN:", "Experiment HN:", "Launch HN:") from the start of
+// title and trims surrounding whitespace. The markers are tried once each,
+// in the order listed.
+func stripHNPrefix(title string) string {
+	prefixes := []string{
+		"Ask HN:",
+		"Show HN:",
+		"Tell HN:",
+		"Experiment HN:",
+		"Launch HN:",
+	}
+
+	stripped := title
+	for _, prefix := range prefixes {
+		stripped = strings.TrimPrefix(stripped, prefix)
+	}
+	return strings.TrimSpace(stripped)
+}
+
+// _removeParam returns url with every occurrence of the query parameter
+// key removed. The query string is re-encoded in the process. A url that
+// cannot be parsed terminates the process via log.Fatal.
+func _removeParam(url, key string) string {
+	parsed, err := _url.Parse(url)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	params := parsed.Query()
+	params.Del(key)
+	parsed.RawQuery = params.Encode()
+
+	return parsed.String()
+}
+
+// Patterns used by normalizeUrl, compiled once instead of on every call.
+// Dots are escaped so that they only match a literal ".".
+var (
+	normHTTPSchemeRe    = regexp.MustCompile(`^http://`)
+	normYoutubeSchemeRe = regexp.MustCompile(`youtube://`)
+	normYoutuBeRe       = regexp.MustCompile(`youtu\.be/`)
+	normMobileYoutubeRe = regexp.MustCompile(`/m\.youtube\.com/`)
+	normMobileImdbRe    = regexp.MustCompile(`/m\.imdb\.com/`)
+)
+
+// normalizeUrl brings url into the form used for storing and comparing
+// links:
+//
+//   - "http://" is upgraded to "https://", a missing scheme becomes https;
+//   - "youtube://" is replaced by "https://";
+//   - youtu.be short links become www.youtube.com/watch?v=<id>;
+//   - m.youtube.com and m.imdb.com are mapped to their www. hosts;
+//   - the tracking parameters si/feature (for the sites above) and utm_*
+//     are removed;
+//   - the host is prefixed with "www." unless it already starts with it.
+//
+// A url that cannot be parsed terminates the process via log.Fatal.
+func normalizeUrl(url string) string {
+	if normHTTPSchemeRe.MatchString(url) {
+		log.Debug("normalize: ", "http:// ", url)
+		url = normHTTPSchemeRe.ReplaceAllString(url, "https://")
+	}
+
+	// add missing https:// if no scheme
+	u, err := _url.Parse(url)
+	if err != nil {
+		log.Fatal(err)
+	}
+	if u.Scheme == "" {
+		if strings.HasPrefix(url, "/") {
+			url = "https:" + url
+		} else {
+			url = "https://" + url
+		}
+	}
+
+	url = normYoutubeSchemeRe.ReplaceAllString(url, "https://")
+
+	if normYoutuBeRe.MatchString(url) {
+		log.Debug("normalize: ", "youtu.be ", url)
+
+		// remove tracking params
+		url = _removeParam(url, "si")
+		url = _removeParam(url, "feature")
+
+		u, err := _url.Parse(url)
+		if err != nil {
+			log.Fatal(err)
+		}
+
+		// The path of a short link is the video id; move it into "v".
+		q := u.Query()
+		q.Add("v", strings.TrimLeft(u.Path, "/"))
+
+		u.Host = "www.youtube.com"
+		u.Path = "watch"
+
+		u.RawQuery = q.Encode()
+		url = u.String()
+	}
+
+	if normMobileYoutubeRe.MatchString(url) {
+		log.Debug("normalize: ", "m.youtube.com ", url)
+
+		// remove tracking params
+		url = _removeParam(url, "si")
+		url = _removeParam(url, "feature")
+
+		url = normMobileYoutubeRe.ReplaceAllString(url, "/www.youtube.com/")
+	}
+
+	if normMobileImdbRe.MatchString(url) {
+		log.Debug("normalize: ", "m.imdb.com ", url)
+
+		// remove tracking params
+		url = _removeParam(url, "si")
+		url = _removeParam(url, "feature")
+
+		// Keep the slash after the host; without it the first path segment
+		// is glued onto the host name ("www.imdb.comtitle/...").
+		url = normMobileImdbRe.ReplaceAllString(url, "/www.imdb.com/")
+	}
+
+	/*
+		match, err = regexp.MatchString("m.wikipedia.org", url)
+		if err != nil {
+			log.Fatal(err)
+		}
+		if match {
+			r := regexp.MustCompile("m.wikipedia.org")
+			url = r.ReplaceAllString(url, "wikipedia.org")
+		}
+	*/
+
+	// remove tracking utm_ params
+	for _, param := range []string{"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content"} {
+		url = _removeParam(url, param)
+	}
+
+	u, err = _url.Parse(url)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Prefix the host with "www." to normalize the URL. This is applied to
+	// every host that does not already start with "www.".
+	if !strings.HasPrefix(u.Host, "www.") {
+		u.Host = "www." + u.Host
+	}
+
+	return u.String()
+}
diff --git a/init.go b/init.go
new file mode 100644
index 0000000..9127c5d
--- /dev/null
+++ b/init.go
@@ -0,0 +1,58 @@
+package main
+
+import (
+ "errors"
+ "strings"
+
+ log "github.com/sirupsen/logrus"
+ flag "github.com/spf13/pflag"
+)
+
+// _conf is the package-level configuration: init() fills it from the config file and flags; main copies it into App.Config and then resets it to an empty Config.
+var _conf Config
+
+// init declares and parses the command-line flags, chooses the log level,
+// loads the configuration file into the global _conf and then lets the
+// command-line values override what the file provided.
+func init() {
+	// replace pflag's ErrHelp with an error whose message is empty
+	flag.ErrHelp = errors.New("")
+
+	// the flags are parsed before the config file because the log level has to be known first
+	cfgPath := flag.StringP("config", "c", "", "path to config file")
+	wantDebug := flag.BoolP("debug", "d", false, "set log level to \"Debug\"")
+	wantVerbose := flag.BoolP("verbose", "v", false, "set log level to \"Debug\", same as --debug")
+	wantSilent := flag.BoolP("silent", "s", false, "suppress output except warnings")
+	levelName := flag.String("loglevel", "Warn", `set log level, can be "Warn", "Info" or "Debug"`)
+	userAgent := flag.StringP("user-agent", "u", "", "set user agent")
+	delaySecs := flag.Int("delay", 0, "enable and set delay in seconds between crawls (default 0)")
+	ignoreRobots := flag.Bool("ignore-robots-txt", true, "ignore robots.txt")
+
+	flag.Parse()
+
+	// Anything that is neither "debug" nor "info" ends up at warn level;
+	// --silent forces warn level regardless of the other switches.
+	level := log.WarnLevel
+	switch requested := strings.ToLower(*levelName); {
+	case *wantDebug || *wantVerbose || requested == "debug":
+		level = log.DebugLevel
+	case requested == "info":
+		level = log.InfoLevel
+	}
+	if *wantSilent {
+		level = log.WarnLevel
+	}
+	log.SetLevel(level)
+
+	_conf.parseConfig(*cfgPath)
+
+	// command-line values win over the config file, but only when they were set to a non-default value
+	if ua := *userAgent; ua != "" {
+		_conf.UserAgent = ua
+	}
+	if secs := *delaySecs; secs != 0 {
+		_conf.Delay = secs
+	}
+	if !*ignoreRobots {
+		_conf.IgnoreRobotsTXT = false
+	}
+
+	// the Debug setting from the config file raises the level unless --silent was given
+	if _conf.Debug && !*wantSilent {
+		log.SetLevel(log.DebugLevel)
+	}
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..8940afc
--- /dev/null
+++ b/main.go
@@ -0,0 +1,462 @@
+package main
+
+import (
+ "html"
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "net/http"
+ "net/url"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/anikhasibul/queue"
+ "github.com/jmoiron/sqlx"
+ log "github.com/sirupsen/logrus"
+ "mvdan.cc/xurls/v2"
+)
+
+type App struct { // App bundles the state shared by the crawler's methods.
+ Config *Config // configuration; main points this at its own copy of the global _conf
+ DB *sqlx.DB // database handle opened in main with sqlx.Connect
+ Now time.Time // set to time.Now() in main before the database connection is opened
+}
+
+func main() {
+ var err error
+ _own_conf := _conf
+ app := App{Config: &_own_conf}
+ _conf = Config{}
+
+ app.Now = time.Now()
+
+ log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions))
+
+ app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions)
+ if err != nil {
+ log.Fatal(err, "Cannot connect to database")
+ }
+
+ if err = app.DB.Ping(); err != nil {
+ log.Fatal(err, "No connection to database")
+ }
+ defer app.DB.Close()
+
+ /*
+ app.deleteOrphanedArticles()
+ app.topStories()
+ app.deleteOrphanedArticles()
+ app.updateAllDiscussions()
+ */
+ app.walkDown()
+
+ /**
+ * Resolve redirects on stored urls.
+ */
+ //return
+}
+
+// walkDown visits every item id from a hard-coded starting id down to 1.
+// Each id is handled in its own goroutine, throttled through a queue created
+// with maxRoutines slots (presumably the concurrency limit of the queue
+// package; confirm against its documentation). Items for which getStory
+// reports ok are stored with saveStory; a failing save terminates the process.
+func (app *App) walkDown() {
+	const maxRoutines = 200
+
+	// The starting id is hard-coded; getMaxItem() would deliver the newest id instead.
+	// Earlier runs used other starting points (e.g. 41495306, 36128477, 22554000, 14450000).
+	startID := 32670334
+
+	q := queue.New(maxRoutines)
+	defer q.Close()
+
+	crawl := func(id int) {
+		defer q.Done()
+
+		if story, ok := getStory(id); ok {
+			if err := app.saveStory(story); err != nil {
+				log.Fatal(err)
+			}
+		}
+
+		// status update on every 1000th id
+		if id%1000 == 0 {
+			log.Infof("%s: Getting item %d\n", time.Now(), id)
+		}
+	}
+
+	for id := startID; id > 0; id-- {
+		q.Add()
+		go crawl(id)
+	}
+	q.Wait()
+}
+
+// getMaxItem asks the Hacker News API for the largest item id currently in
+// use and returns it. It panics when the request fails, the body cannot be
+// read, or the body is not a decimal integer.
+func getMaxItem() int {
+	response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json")
+	if err != nil {
+		panic(err)
+	}
+	// release the connection once the body has been consumed
+	defer response.Body.Close()
+
+	data, err := ioutil.ReadAll(response.Body)
+	if err != nil {
+		panic(err)
+	}
+
+	// tolerate surrounding whitespace such as a trailing newline
+	maxItem, err := strconv.Atoi(strings.TrimSpace(string(data)))
+	if err != nil {
+		panic(err)
+	}
+
+	return maxItem
+}
+
+// topStories downloads the "top" and "best" story id lists, merges them and
+// fetches every distinct id concurrently. Stories for which getStory reports
+// ok are logged and stored with saveStory; a failing save terminates the
+// process. It panics when the merged id list is not valid JSON.
+func (app *App) topStories() {
+	// Merge both lists into one JSON array. The surrounding brackets are
+	// stripped from each response and only non-empty bodies are joined, so an
+	// empty ("[]") or missing ("null") list no longer yields invalid JSON
+	// such as "[,1,2]".
+	parts := make([]string, 0, 2)
+	for _, raw := range [][]byte{getTopStories(), getBestStories()} {
+		inner := strings.TrimSpace(string(raw))
+		inner = strings.TrimSuffix(strings.TrimPrefix(inner, "["), "]")
+		inner = strings.TrimSpace(inner)
+		if inner != "" && inner != "null" {
+			parts = append(parts, inner)
+		}
+	}
+	data := "[" + strings.Join(parts, ",") + "]"
+
+	var storyIDs []int
+	if err := json.Unmarshal([]byte(data), &storyIDs); err != nil {
+		log.Warn("topStories: Unmarshaling json failed")
+		panic(err)
+	}
+
+	const maxRoutines = 20
+
+	q := queue.New(maxRoutines)
+	defer q.Close()
+
+	// The two lists overlap; handle every id only once so the same story is
+	// not fetched and saved twice at the same time.
+	seen := make(map[int]bool, len(storyIDs))
+	for _, id := range storyIDs {
+		if seen[id] {
+			continue
+		}
+		seen[id] = true
+
+		q.Add()
+		go func(id int) {
+			defer q.Done()
+
+			story, ok := getStory(id)
+			if !ok {
+				return
+			}
+			log.Infof("%+v\n", story)
+			// err is local to the goroutine; sharing one variable between
+			// all goroutines would be a data race
+			if err := app.saveStory(story); err != nil {
+				log.Fatal(err)
+			}
+		}(id)
+	}
+	q.Wait()
+}
+
+// Patterns used by getStory, compiled once instead of on every call.
+var (
+	rxStoryVideoHost  = regexp.MustCompile(`(?i)(youtube\.com)|(youtu\.be)|(vimeo\.com)`)
+	rxStoryMovieHost  = regexp.MustCompile(`(?i)(imdb\.com)|(rottentomatoes\.com)|(metacritic\.com)`)
+	rxStoryVideoTitle = regexp.MustCompile(`(?i)(\(video\))|(\[video\])`)
+)
+
+// Values stored in Link.Field.
+const (
+	linkFieldMovie = 1
+	linkFieldVideo = 2
+)
+
+// addStoryLink normalizes rawURL and appends it to story.Links with the given
+// field value, unless the same normalized URL is already recorded in seen.
+// reason is the message logged when a link is added.
+func addStoryLink(story *Story, seen map[string]bool, rawURL string, field int, reason string) {
+	normalized := normalizeUrl(rawURL)
+	if seen[normalized] {
+		return
+	}
+	seen[normalized] = true
+	story.Links = append(story.Links, Link{Url: normalized, Field: field})
+
+	log.Info(reason)
+	log.Infof("%+v\n", *story)
+}
+
+// getStory fetches item id and collects the video and movie-platform links
+// found in its URL, its title and its text.
+//
+// The returned bool is true only when at least one link was collected. It is
+// false for dead items, for stories whose score and comment count are both
+// below 10, and when the item's URL cannot be parsed.
+func getStory(id int) (Story, bool) {
+	story := getDetail(id)
+	if story.Dead {
+		return story, false
+	}
+	// Item types are lowercase ("story", "comment", ...), so comparing
+	// against "Story" with == never matched; compare case-insensitively.
+	if strings.EqualFold(story.Type, "story") && story.Score < 10 && story.Descendants < 10 {
+		return story, false
+	}
+
+	story.Title = stripHNPrefix(story.Title)
+
+	u, err := url.Parse(story.Url)
+	if err != nil {
+		log.Warnf("getStory: Parsing URL failed: %s\n", err.Error())
+		return story, false
+	}
+
+	// normalized URLs that are already part of story.Links
+	seen := make(map[string]bool)
+
+	// story URL pointing at a video host
+	if rxStoryVideoHost.MatchString(u.Host) {
+		addStoryLink(&story, seen, story.Url, linkFieldVideo, "match youtube host")
+	}
+
+	// story URL pointing at a movie platform
+	if rxStoryMovieHost.MatchString(u.Host) {
+		addStoryLink(&story, seen, story.Url, linkFieldMovie, "match moview platform url")
+	}
+
+	// "(video)" or "[video]" in the title; skipped when the story has no URL,
+	// because there is nothing meaningful to store as a link then
+	if story.Url != "" && rxStoryVideoTitle.MatchString(story.Title) {
+		addStoryLink(&story, seen, story.Url, linkFieldVideo, "match video title")
+	}
+
+	// every URL found in the item text
+	for _, found := range xurls.Relaxed().FindAllString(html.UnescapeString(story.Text), -1) {
+		if rxStoryVideoHost.MatchString(found) {
+			addStoryLink(&story, seen, found, linkFieldVideo, "match youtube text")
+		}
+		if rxStoryMovieHost.MatchString(found) {
+			addStoryLink(&story, seen, found, linkFieldMovie, "match moview platform text")
+		}
+	}
+
+	return story, len(story.Links) > 0
+}
+
+func getResponse(url string) *http.Response {
+ var err error
+ var response *http.Response
+
+ response, err = http.Get(url)
+ if err != nil {
+ for i := 0; i < 4; i++ {
+ if i == 0 {
+ log.Debug("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
+ } else {
+ log.Warn("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
+ }
+ resp2, err2 := http.Get(url)
+ if err2 == nil {
+ return resp2
+ }
+ }
+ panic(err)
+ }
+ return response
+}
+
+// getBestResponse requests the list of best stories from the Hacker News API.
+func getBestResponse() *http.Response {
+	return getResponse("https://hacker-news.firebaseio.com/v0/beststories.json")
+}
+
+// getTopResponse requests the list of top stories from the Hacker News API.
+func getTopResponse() *http.Response {
+	return getResponse("https://hacker-news.firebaseio.com/v0/topstories.json")
+}
+
+// getStoryResponse requests the JSON document of the item with the given id
+// (passed as a decimal string) from the Hacker News API.
+func getStoryResponse(item_id string) *http.Response {
+	return getResponse("https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json")
+}
+
+// getDetail downloads item id and decodes its JSON into a Story. The Text
+// field is HTML-unescaped before the story is returned. It panics when the
+// body cannot be read or is not valid JSON for a Story.
+func getDetail(id int) Story {
+	response := getStoryResponse(strconv.Itoa(id))
+	// release the connection once the body has been consumed
+	defer response.Body.Close()
+
+	data, err := ioutil.ReadAll(response.Body)
+	if err != nil {
+		panic(err)
+	}
+
+	var story Story
+	if err = json.Unmarshal(data, &story); err != nil {
+		// convert to string so the log shows the payload instead of a list of byte values
+		log.Warn("getDetail: Unmarshaling json failed ", string(data))
+		panic(err)
+	}
+
+	story.Text = html.UnescapeString(story.Text)
+
+	return story
+}
+
+// getTopStories returns the raw response body of the top stories request.
+// It panics when the body cannot be read.
+func getTopStories() []byte {
+	response := getTopResponse()
+	// release the connection once the body has been consumed
+	defer response.Body.Close()
+
+	data, err := ioutil.ReadAll(response.Body)
+	if err != nil {
+		panic(err)
+	}
+
+	return data
+}
+
+// getBestStories returns the raw response body of the best stories request.
+// It panics when the body cannot be read.
+func getBestStories() []byte {
+	response := getBestResponse()
+	// release the connection once the body has been consumed
+	defer response.Body.Close()
+
+	data, err := ioutil.ReadAll(response.Body)
+	if err != nil {
+		panic(err)
+	}
+
+	return data
+}
+
+// updateAllDiscussions re-fetches every discussion posted within the last
+// 3456000 seconds (40 days) and passes each story that getStory accepts to
+// updateDiscussion. Failures for single items are logged and skipped. When
+// the list of discussions cannot be read, a warning is logged and nothing is
+// updated.
+func (app *App) updateAllDiscussions() {
+	const maxRoutines = 20
+	var itemIDs []int
+
+	err := app.DB.Select(&itemIDs, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) order by posted_on")
+	if err != nil {
+		// previously ignored: a failing query looked like "nothing to update"
+		log.Warnf("updateAllDiscussions: Selecting discussions failed: %s\n", err.Error())
+		return
+	}
+
+	q := queue.New(maxRoutines)
+	defer q.Close()
+
+	for _, itemID := range itemIDs {
+		q.Add()
+		go func(itemID int) {
+			defer q.Done()
+
+			story, ok := getStory(itemID)
+			if !ok {
+				// Distinguish an item that could not be loaded at all (Id is
+				// still 0) from one that was loaded but rejected although it
+				// has enough score or comments.
+				switch {
+				case story.Id == 0:
+					log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", itemID)
+				case story.Descendants > 10 || story.Score > 10:
+					// interpreted string: the former raw string printed a literal
+					// backslash-n and the indentation of the source file
+					log.Infof("updateAllDiscussions: There is a bug. Can't update discussion with id %d. NOTE: If this is happening again, probably the url was changed from Wikipedia to a different source. %+v\n", itemID, story)
+				}
+				return
+			}
+
+			if err := app.updateDiscussion(story); err != nil {
+				log.Warn(err)
+			}
+		}(itemID)
+	}
+	q.Wait()
+}
diff --git a/sql.sql b/sql.sql
new file mode 100644
index 0000000..3a5d8da
--- /dev/null
+++ b/sql.sql
@@ -0,0 +1,3 @@
+-- story: the crawled items; story_id is unique, id is the surrogate key that links.story_id references.
+CREATE TABLE story (
+	id          INT PRIMARY KEY AUTO_INCREMENT,
+	story_id    INT NOT NULL UNIQUE,
+	created_at  TIMESTAMP,
+	updated_at  TIMESTAMP,
+	type        VARCHAR(255) NOT NULL,
+	title       VARCHAR(255) NOT NULL,
+	text        TEXT,
+	score       INT NOT NULL,
+	descendants INT NOT NULL,
+	time        INT NOT NULL,
+	poster      VARCHAR(255) NOT NULL
+);
+
+-- links: the URLs belonging to a story; story_id references story.id (not story.story_id).
+CREATE TABLE links (
+	id         INT PRIMARY KEY AUTO_INCREMENT,
+	created_at TIMESTAMP,
+	updated_at TIMESTAMP,
+	story_id   INT NOT NULL,
+	url        VARCHAR(255) NOT NULL,
+	field      INT NOT NULL,
+	FOREIGN KEY (story_id) REFERENCES story (id)
+);
diff --git a/struct.go b/struct.go
new file mode 100644
index 0000000..a20c244
--- /dev/null
+++ b/struct.go
@@ -0,0 +1,26 @@
+package main
+
+type Story struct { // Story is one Hacker News item as decoded from JSON by getDetail, plus the links collected by getStory.
+ Id int /* item id; updateAllDiscussions treats 0 as "item could not be loaded" */
+ //Deleted bool
+ Type string /* item type: story, comment (or job, poll, pollopt) */
+ Title string /* title (stories only); getStory passes it through stripHNPrefix */
+ Text string /* comment text or optional text of a story (HTML); getDetail HTML-unescapes it */
+ Dead bool /* getStory rejects items that have this set */
+ Url string /* URL exactly as delivered, not normalized */
+ //NormalizedUrl string /* normalized */
+ Score int /* score (stories only) */
+ Descendants int /* number of comments on a story, or kids on a comment */
+ //Kids []int /* id of the item's comments */
+ Time int /* time of posting; presumably Unix seconds - TODO confirm */
+ By string /* HN user name of the author (presumably stored as "poster" in the story table - verify) */
+ Links []Link /* video and movie links collected by getStory */
+}
+
+type Link struct { // Link is a single URL collected for a Story.
+ Url string /* getStory stores the result of normalizeUrl here */
+ Field int /* category of the link: 2 = video, 1 = movies, 0 = bug (unset) */
+}
+
+type URL struct { // URL has no fields; nothing in the visible part of the code uses it.
+}