author    horus    2020-04-02 21:53:30 +0200
committer horus    2020-04-02 21:53:30 +0200
commit    0b90b7a3b0f38f0babf4d788f4d7dd5e43253341 (patch)
tree      a5492cf5246522a5dd0e201be3ae988ae7e6245c
download  curious-crawler-0b90b7a3b0f38f0babf4d788f4d7dd5e43253341.tar.gz
Initial commit.
-rw-r--r--  categories.go   145
-rw-r--r--  config.go       104
-rw-r--r--  database.go     425
-rw-r--r--  helper.go        15
-rw-r--r--  init.go          58
-rw-r--r--  main.go         322
-rw-r--r--  struct.go        20
-rw-r--r--  wikipedia.go    272
8 files changed, 1361 insertions(+), 0 deletions(-)
diff --git a/categories.go b/categories.go
new file mode 100644
index 0000000..2ee3448
--- /dev/null
+++ b/categories.go
@@ -0,0 +1,145 @@
+package main
+
+import (
+ "encoding/json"
+ "io/ioutil"
+ "strings"
+
+ log "github.com/Sirupsen/logrus"
+ "github.com/gocolly/colly"
+)
+
+func (app *App) crawlForCategories(wiki_url string) ([]string, bool) {
+ return app.queryWMLabs(wiki_url)
+}
+
+func (app *App) queryWMLabs(wiki_url string) ([]string, bool) {
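+ // a panic recovered here makes the function return (nil, false)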
+ defer func() { recover() }()
+
+ var categories []string
+
+ title, hostname := getWikipediaTitle(wiki_url)
+ if "" == title || "/" == title {
+ return []string{}, false
+ }
+ wm_url := "https://xtools.wmflabs.org/api/page/assessments/" + hostname + "/" + title
+
+ response := getResponse(wm_url)
+ defer response.Body.Close()
+ resp_data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ log.Warnf("queryWMLabs: Reading response data failed for %s", wm_url)
+ panic(err)
+ }
+
+ var data map[string]interface{}
+ if err = json.Unmarshal(resp_data, &data); err != nil {
+ log.Warnf("queryWMLabs: Decoding JSON failed for: %s", wm_url)
+ panic(err)
+ }
+
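+ /*
+ * Sketch of the response shape this loop assumes (reconstructed from
+ * the code below, not from the xtools documentation):
+ *
+ *   {
+ *     "project": "en.wikipedia.org",
+ *     "elapsed_time": 0.1,
+ *     "Some_Title": { "wikiprojects": { "Computing": {...}, ... }, ... }
+ *   }
+ *
+ * Every top-level key except "project" and "elapsed_time" is treated
+ * as a page whose "wikiprojects" keys are the category names.
+ */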
+ for k, v := range data {
+ if "project" != k && "elapsed_time" != k {
+ wp := v.(map[string]interface{})
+ for k2, v2 := range wp {
+ if k2 == "wikiprojects" {
+ list := v2.(map[string]interface{})
+ for k3 := range list {
+ cat := normalizeCategory(k3)
+ if "" != cat {
+ categories = append(categories, cat)
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if len(categories) > 0 {
+ return categories, true
+ }
+ return categories, false
+}
+
+func (app *App) crawlWMLabs(wiki_url string) (Category, bool) {
+ var err error
+
+ title, hostname := getWikipediaTitle(wiki_url)
+ if "" == title || "/" == title {
+ return Category{}, false
+ }
+ wm_url := "https://xtools.wmflabs.org/articleinfo/" + hostname + "/" + title
+
+ var category Category
+ c := colly.NewCollector()
+
+ c.OnHTML(".sort-entry--wikiproject", func(e *colly.HTMLElement) {
+ category.Name = strings.TrimSpace(e.Text)
+ category.Url = strings.TrimSpace(e.Attr("href"))
+ })
+
+ err = c.Visit(wm_url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ if category.Name == "" || category.Url == "" {
+ log.Warnf("title: %s WM URL: %s \tWiki Url: %s", title, wm_url, wiki_url)
+ } else {
+ log.Warnf("crawler: %+v", category)
+ }
+ return category, true
+}
+
+func (app *App) saveAllCategories() {
+ rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL;")
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for rows.Next() {
+ var article_id int
+ var wiki_url string
+
+ err = rows.Scan(&article_id, &wiki_url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ //category, ok := app.crawlWMLabs(wiki_url)
+ categories, ok := app.queryWMLabs(wiki_url)
+ if ok {
+ app.saveCategory(article_id, categories)
+ } else {
+ log.Debug("saveAllCategories: No categories for " + wiki_url)
+ }
+ }
+}
+
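+// normalizeCategory cleans up a raw WikiProject name. Illustrative
+// examples (derived from the rules below):
+//
+//	"Military history task force" -> "Military history"
+//	"Wikipedia essays"            -> "" (dropped)
+//	"Wikipedia"                   -> "Wikipedia" (kept as-is)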
+func normalizeCategory(s string) string {
+ cat := strings.TrimSpace(s)
+ cat = strings.TrimSuffix(cat, "task force")
+ cat = strings.TrimSuffix(cat, "taskforce")
+ cat = strings.TrimSuffix(cat, "Taskforce")
+ cat = strings.TrimSuffix(cat, "Task Force")
+
+ if strings.Contains(strings.ToLower(cat), "articles") {
+ return ""
+ }
+ /**
+ * The category "Wikipedia" is very useful, but other occurrences are not.
+ */
+ if strings.Contains(strings.ToLower(cat), "wikipedia") && strings.ToLower(cat) != "wikipedia" {
+ return ""
+ }
+ if strings.Contains(strings.ToLower(cat), "/wikiproject") {
+ cat = strings.ReplaceAll(cat, "WikiProject ", "")
+ cat = strings.ReplaceAll(cat, "wikiproject ", "")
+ cat = strings.ReplaceAll(cat, "Wikiproject ", "")
+ }
+ cat = strings.TrimSpace(cat)
+
+ return cat
+}
diff --git a/config.go b/config.go
new file mode 100644
index 0000000..14c75ee
--- /dev/null
+++ b/config.go
@@ -0,0 +1,104 @@
+package main
+
+import (
+ "os"
+
+ log "github.com/Sirupsen/logrus"
+ "github.com/spf13/viper"
+)
+
+type Config struct {
+ DBDriver string
+ DBDBName string
+ DBHost string
+ DBPort string
+ DBUser string
+ DBPassword string
+ DBOptions string
+
+ UserAgent string
+ Delay int
+ IgnoreRobotsTXT bool
+
+ BasicAuthUsername string
+ BasicAuthPassword string
+
+ Debug bool // sets log level to debug
+}
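+
+/*
+A minimal example config.yaml (hypothetical values; viper also accepts
+other formats such as JSON or TOML, and keys are matched
+case-insensitively):
+
+	DB_User: crawler
+	DB_Password: secret
+	DB_Host: localhost
+	DB_DBName: ghrss
+	UserAgent: "curious-crawler/0.1"
+	Delay: 2
+*/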
+
+// Parses the configuration and sets the configuration struct.
+func (c *Config) parseConfig(configFile string) {
+
+ viper.SetDefault("DB_Driver", "mysql")
+ viper.SetDefault("DB_DBName", "ghrss")
+ viper.SetDefault("DB_Host", "localhost")
+ viper.SetDefault("DB_Port", "3306")
+
+ viper.SetDefault("Debug", false)
+ viper.SetDefault("Delay", 0)
+
+ // needs some refactoring to truly respect robots.txt
+ viper.SetDefault("IgnoreRobotsTXT", true)
+
+ viper.SetDefault("UserAgent", "colly - a friendly crawler :)")
+
+ // Name of the configuration file
+ viper.SetConfigName("config")
+
+ // Where to find the config file
+ if configFile == "" {
+ viper.AddConfigPath(".")
+ } else {
+ stat, err := os.Stat(configFile)
+ if os.IsNotExist(err) {
+ // provided config file does not exist, so we add the path instead
+ viper.AddConfigPath(configFile)
+ } else if err == nil && stat.IsDir() {
+ // adds the path to look for the config file
+ viper.AddConfigPath(configFile)
+ } else if err == nil {
+ // directly sets the config file
+ viper.SetConfigFile(configFile)
+ } else {
+ // if we are here something went wrong
+ log.Warn(err, "config.go: os.Stat("+configFile+") error")
+ // adding the path nonetheless because it's not hurting
+ viper.AddConfigPath(configFile)
+ }
+ }
+
+ // Env variables need to be prefixed with "DISCUSS_"
+ // (viper itself appends the "_" separator to the prefix)
+ viper.SetEnvPrefix("discuss")
+
+ // Automatically binds matching env variables
+ viper.AutomaticEnv()
+
+ // Reads the config
+ err := viper.ReadInConfig()
+ if err != nil {
+ log.Fatal(err, "Config: Error parsing config file.")
+ }
+ log.Debug("Config: Config file used: " + viper.ConfigFileUsed())
+
+ c.setConfig()
+}
+
+// setConfig copies the parsed values from viper into the Config struct.
+func (c *Config) setConfig() {
+ c.DBDriver = viper.GetString("DB_Driver")
+ c.DBHost = viper.GetString("DB_Host")
+ c.DBPort = viper.GetString("DB_Port")
+ c.DBUser = viper.GetString("DB_User")
+ c.DBPassword = viper.GetString("DB_Password")
+ c.DBDBName = viper.GetString("DB_DBName")
+ c.DBOptions = viper.GetString("DB_Options")
+
+ c.UserAgent = viper.GetString("UserAgent")
+ c.Delay = viper.GetInt("Delay")
+ c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT")
+
+ c.BasicAuthUsername = viper.GetString("BasicAuthUsername")
+ c.BasicAuthPassword = viper.GetString("BasicAuthPassword")
+
+ c.Debug = viper.GetBool("Debug")
+}
diff --git a/database.go b/database.go
new file mode 100644
index 0000000..e40279e
--- /dev/null
+++ b/database.go
@@ -0,0 +1,425 @@
+package main
+
+import (
+ "database/sql"
+ "regexp"
+ "strconv"
+
+ log "github.com/Sirupsen/logrus"
+ _ "github.com/go-sql-driver/mysql"
+)
+
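+/*
+ * The schema implied by the queries in this file (a sketch reconstructed
+ * from the statements below, not an authoritative DDL):
+ *
+ *   article(id, created_at, updated_at, url, title, excerpt_html)
+ *   discussion(id, created_at, updated_at, article_id, title, source,
+ *              item_id, source_url, posted_on, comments, upvotes)
+ *   code(id, created_at, updated_at, url, title, source, item_id,
+ *        source_url, posted_on, comments, upvotes)
+ *   category(id, created_at, updated_at, name)
+ *   article_category(id, article_id, category_id)
+ */
+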
+func (app *App) saveStory(s Story) error {
+ match, err := regexp.MatchString(`github\.com($|/)`, s.Url)
+ if err != nil {
+ log.Warn("Failed to parse and match regex")
+ return err
+ }
+ if match {
+ /**
+ * Special handling for github stories.
+ */
+ return app.saveCode(s)
+ }
+
+ query := `
+ INSERT IGNORE article (
+ id,
+ created_at,
+ updated_at,
+ url,
+ title
+ ) VALUES (
+ NULL,
+ ?,
+ ?,
+ ?,
+ ?
+ );
+ `
+
+ stmt, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Warn("saveStory: Preparing query failed")
+ return err
+ }
+ defer stmt.Close()
+
+ _, err = stmt.Exec(app.Now, app.Now, s.Url, s.Title)
+ if err != nil {
+ log.Warn("saveStory: Statement execution failed")
+ return err
+ }
+ log.Debugf("saveStory: Successfull insert for item %d\n", s.Id)
+
+ query = `
+ INSERT IGNORE discussion (
+ id,
+ created_at,
+ updated_at,
+ article_id,
+ title,
+ source,
+ item_id,
+ source_url,
+ posted_on,
+ comments,
+ upvotes
+ ) VALUES (
+ NULL,
+ ?,
+ ?,
+ (SELECT id FROM article WHERE url = ?),
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?
+ );
+ `
+ stmt2, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Warn("saveStory: Preparing second query failed")
+ return err
+ }
+ defer stmt2.Close()
+
+ _, err = stmt2.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score)
+ if err != nil {
+ log.Warn("saveStory: Statement execution failed")
+ return err
+ }
+
+ return nil
+}
+
+func (app *App) saveCode(s Story) error {
+ query := `
+ INSERT IGNORE code(
+ id,
+ created_at,
+ updated_at,
+ url,
+ title,
+ source,
+ item_id,
+ source_url,
+ posted_on,
+ comments,
+ upvotes
+ ) VALUES (
+ NULL,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?
+ );
+ `
+ stmt, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Warn("saveCode: Preparing query failed")
+ return err
+ }
+ defer stmt.Close()
+
+ _, err = stmt.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score)
+ if err != nil {
+ log.Warn("saveCode: Statement execution failed")
+ return err
+ }
+
+ return nil
+}
+
+func (app *App) updateDiscussion(story Story) error {
+
+ query := `
+ UPDATE discussion
+ set updated_at = ?,
+ comments = ?,
+ upvotes = ?
+ WHERE item_id = ?;
+ `
+ stmt, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Warn("updateDiscussion: Preparing query failed")
+ return err
+ }
+ defer stmt.Close()
+
+ _, err = stmt.Exec(app.Now, story.Descendants, story.Score, story.Id)
+ if err != nil {
+ log.Warnf("updateDiscussion: Statement execution failed")
+ return err
+ }
+ log.Debugf("updateDiscussion: Successful update of %d with new Score: %d, Comments: %d\n", story.Id, story.Score, story.Descendants)
+
+ return nil
+}
+
+func (app *App) updateArticleUrl(id int, url string) error {
+ query := `
+ UPDATE article
+ set updated_at = ?,
+ url = ?
+ WHERE id = ?
+ `
+ stmt, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Warn("updateArticleUrl: Preparing query failed")
+ return err
+ }
+ defer stmt.Close()
+
+ _, err = stmt.Exec(app.Now, url, id)
+ if err != nil {
+ log.Warnf("updateArticleUrl: Statement execution failed")
+ return err
+ }
+ log.Debugf("updateArticleUrl: Successful update new url: %s\n", url)
+
+ return nil
+
+}
+
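+// wikipediaMergeArticles repoints the discussion and article_category rows
+// of a duplicate article at the article that already owns the canonical
+// url. Note that the QueryRow only moves a single discussion per call; the
+// emptied article row is later removed by deleteOrphanedArticles.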
+func (app *App) wikipediaMergeArticles(id_to_delete int, correct_url string) error {
+ query := "SELECT id FROM discussion WHERE article_id = ?"
+ row := app.DB.QueryRow(query, id_to_delete)
+ var disc_id int
+ err := row.Scan(&disc_id)
+ if err != nil {
+ log.Warnf("wikipediaMergeArticles: Query first row failed. id: %d url: %s", id_to_delete, correct_url)
+ return err
+ }
+ query = "SELECT id FROM article WHERE url = ?"
+ row = app.DB.QueryRow(query, correct_url)
+ var article_id int
+ err = row.Scan(&article_id)
+ if err != nil {
+ log.Warn("wikipediaMergeArticles: Query second row failed")
+ return err
+ }
+
+ query = "UPDATE discussion SET article_id = ?, updated_at = ? WHERE id = ?;"
+ stmt, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Warn("wikipediaMergeArticles: Preparing query failed")
+ return err
+ }
+ defer stmt.Close()
+
+ _, err = stmt.Exec(article_id, app.Now, disc_id)
+ if err != nil {
+ log.Warn("wikipediaMergeArticles: Update discussion failed")
+ return err
+ }
+
+ query = "UPDATE article_category SET article_id = ? WHERE id = ?;"
+ stmt2, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Warn("wikipediaMergeArticles: Preparing article_category query failed")
+ return err
+ }
+ defer stmt2.Close()
+
+ _, err = stmt2.Exec(article_id, id_to_delete)
+ if err != nil {
+ log.Warn("wikipediaMergeArticles: Update article_category failed")
+ return err
+ }
+
+ return nil
+}
+
+func (app *App) deleteOrphanedArticles() error {
+ query := `
+ DELETE a FROM
+ article AS a
+ LEFT JOIN
+ discussion AS d ON a.id = d.article_id
+ WHERE d.id IS NULL;`
+ _, err := app.DB.Exec(query)
+ if err != nil {
+ log.Warnf("deleteOrphanedArticles: Executing query failed: %s", err.Error())
+ return err
+ }
+
+ return nil
+}
+
+func (app *App) saveCategory(article_id int, categories []string) {
+
+ for _, category := range categories {
+ query := "SELECT id FROM category WHERE name = ?"
+ row := app.DB.QueryRow(query, category)
+ var category_id int
+ err := row.Scan(&category_id)
+
+ if err != nil {
+ if err != sql.ErrNoRows {
+ log.Warn("saveCategory: Selecting category id failed")
+ log.Fatal(err)
+ }
+ }
+
+ if err == sql.ErrNoRows {
+ query = `
+ INSERT INTO category (
+ id,
+ created_at,
+ updated_at,
+ name
+ ) VALUES (
+ null,
+ ?,
+ ?,
+ ?
+ )`
+
+ stmt, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ result, err := stmt.Exec(app.Now, app.Now, category)
+ stmt.Close() // explicit close: a defer would pile up inside the loop
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ category_id64, err := result.LastInsertId()
+ if err != nil {
+ log.Fatal(err)
+ }
+ category_id = int(category_id64)
+ }
+
+ query = `
+ INSERT IGNORE article_category (
+ id,
+ article_id,
+ category_id
+ ) VALUES (
+ null,
+ ?,
+ ?
+ )
+ `
+
+ stmt2, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Fatal(err)
+ }
+ _, err = stmt2.Exec(article_id, category_id)
+ stmt2.Close() // stmt2 was never closed; close it right after use
+ if err != nil {
+ log.Fatal(err)
+ }
+ }
+}
+
+func (app *App) updateWikipediaUrls() {
+ rows, err := app.DB.Query("SELECT DISTINCT id, url FROM article;")
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for rows.Next() {
+ var wiki_url string
+ var article_id int
+
+ err = rows.Scan(&article_id, &wiki_url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ real_url := wikipediaRealUrl(wiki_url)
+ if real_url != wiki_url && "" != real_url {
+
+ /**
+ * Check if we already have the canonical url and merge if necessary.
+ */
+ row := app.DB.QueryRow("SELECT count(*) FROM article WHERE url = ?", real_url)
+ var count int
+ err = row.Scan(&count)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if 0 < count {
+ err = app.wikipediaMergeArticles(article_id, real_url)
+ if err != nil {
+ log.Fatal(err)
+ }
+ continue
+ }
+
+ stmt, err := app.DB.Prepare("UPDATE article SET url = ? WHERE id = ?")
+ if err != nil {
+ log.Warnf("updateWikipediaUrls: Preparing query failed for: (%d) %s", article_id, wiki_url)
+ log.Fatal(err)
+ }
+ defer stmt.Close()
+
+ _, err = stmt.Exec(real_url, article_id)
+ if err != nil {
+ log.Warnf("updateWikipediaUrls: Executing statement failed for: (%d) %s", article_id, wiki_url)
+ log.Fatal(err)
+ }
+ log.Debugf("(%d) Updated from %s to %s", article_id, wiki_url, real_url)
+ }
+ }
+}
+
+func (app *App) getArticleIdFromUrl(wiki_url string) int {
+ row := app.DB.QueryRow("SELECT id FROM article WHERE url = ?", wiki_url)
+ var article_id int
+ err := row.Scan(&article_id)
+ if err != nil {
+ log.Warnf("getArticleIdFromUrl: Query or scanning failed for: %s", wiki_url)
+ log.Fatal(err)
+ }
+ return article_id
+}
+
+func (app *App) fixAllCategories() {
+ rows, err := app.DB.Query("SELECT id, name FROM category;")
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for rows.Next() {
+ var category_id int
+ var category_name string
+
+ err = rows.Scan(&category_id, &category_name)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ category_fixed := normalizeCategory(category_name)
+ if category_fixed != category_name {
+ log.Warn(category_fixed)
+ stmt, err := app.DB.Prepare("UPDATE category SET name = ? WHERE id = ? AND name = ?")
+ if err != nil {
+ log.Warnf("fixAllCategories: Preparing query failed for: (%d) %s", category_id, category_fixed)
+ log.Fatal(err)
+ }
+ defer stmt.Close()
+
+ _, err = stmt.Exec(category_fixed, category_id, category_name)
+ if err != nil {
+ log.Warnf("fixAllCategories: Exec stmt failed for: (%d) %s", category_id, category_fixed)
+ log.Fatal(err)
+ }
+ }
+
+ }
+}
diff --git a/helper.go b/helper.go
new file mode 100644
index 0000000..649c2d4
--- /dev/null
+++ b/helper.go
@@ -0,0 +1,15 @@
+package main
+
+import (
+ "strings"
+)
+
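+// stripHNPrefix removes the "Ask HN:"/"Show HN:"-style prefixes, e.g.
+// stripHNPrefix("Show HN: My crawler") == "My crawler".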
+func stripHNPrefix(title string) string {
+ title = strings.TrimPrefix(title, "Ask HN:")
+ title = strings.TrimPrefix(title, "Show HN:")
+ title = strings.TrimPrefix(title, "Tell HN:")
+ title = strings.TrimPrefix(title, "Experiment HN:")
+ title = strings.TrimPrefix(title, "Launch HN:")
+
+ return strings.TrimSpace(title)
+}
diff --git a/init.go b/init.go
new file mode 100644
index 0000000..7ce40c5
--- /dev/null
+++ b/init.go
@@ -0,0 +1,58 @@
+package main
+
+import (
+ "errors"
+ "strings"
+
+ log "github.com/Sirupsen/logrus"
+ flag "github.com/spf13/pflag"
+)
+
+// global config, gets overwritten by main
+var _conf Config
+
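+// Example invocation (the binary name is illustrative):
+//
+//	./curious-crawler -c /etc/curious-crawler --loglevel Info --delay 2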
+func init() {
+ // overwrites unhelpful error message
+ flag.ErrHelp = errors.New("")
+
+ // the config is parsed here because it can set the log level
+ configFile := flag.StringP("config", "c", "", "path to config file")
+ debug := flag.BoolP("debug", "d", false, "set log level to \"Debug\"")
+ verbose := flag.BoolP("verbose", "v", false, "set log level to \"Debug\", same as --debug")
+ silent := flag.BoolP("silent", "s", false, "suppress output except warnings")
+ loglevel_f := flag.String("loglevel", "Warn", `set log level, can be "Warn", "Info" or "Debug"`)
+ user_agent_f := flag.StringP("user-agent", "u", "", "set user agent")
+ delay_f := flag.Int("delay", 0, "enable and set delay in seconds between crawls (default 0)")
+ ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignore robots.txt")
+
+ flag.Parse()
+ loglevel := strings.ToLower(*loglevel_f)
+
+ if *debug || *verbose || loglevel == "debug" {
+ log.SetLevel(log.DebugLevel)
+ } else if loglevel == "info" {
+ log.SetLevel(log.InfoLevel)
+ } else {
+ log.SetLevel(log.WarnLevel)
+ }
+
+ if *silent {
+ log.SetLevel(log.WarnLevel)
+ }
+
+ _conf.parseConfig(*configFile)
+
+ if *user_agent_f != "" {
+ _conf.UserAgent = *user_agent_f
+ }
+ if *delay_f != 0 {
+ _conf.Delay = *delay_f
+ }
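+ // the flag defaults to true, so only an explicit
+ // --ignore-robots-txt=false can override the config value here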
+ if !*ignore_robots_f {
+ _conf.IgnoreRobotsTXT = *ignore_robots_f
+ }
+
+ if _conf.Debug && !*silent {
+ log.SetLevel(log.DebugLevel)
+ }
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..1548776
--- /dev/null
+++ b/main.go
@@ -0,0 +1,322 @@
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "net/http"
+ "net/url"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/AnikHasibul/queue"
+ log "github.com/Sirupsen/logrus"
+ "github.com/jmoiron/sqlx"
+)
+
+type App struct {
+ Config *Config
+ DB *sqlx.DB
+ Now time.Time
+}
+
+func main() {
+ var err error
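+ // copy the global config into the app and clear the global so only the
+ // app's copy can be used from here on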
+ _own_conf := _conf
+ app := App{Config: &_own_conf}
+ _conf = Config{}
+
+ app.Now = time.Now()
+
+ log.Debugf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)
+
+ app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions)
+ if err != nil {
+ log.Fatal(err, "Cannot connect to database")
+ }
+
+ if err = app.DB.Ping(); err != nil {
+ log.Fatal(err, "No connection to database")
+ }
+ defer app.DB.Close()
+
+ //app.fixAllCategories()
+
+ app.deleteOrphanedArticles()
+ app.topStories()
+ app.wikipediaFixAllUrls()
+ app.deleteOrphanedArticles()
+ app.saveExcerpts()
+ //app.saveAllCategories()
+ app.updateAllDiscussions()
+ //app.walkDown()
+
+ /**
+ * Resolve redirects on stored urls.
+ */
+ //app.updateWikipediaUrls()
+ //app.saveAllCategories()
+ //return
+}
+
+func (app *App) walkDown() {
+
+ max_item := getMaxItem()
+ //max_item := 22554000
+ //max_item := 22494596
+ //max_item := 22354383
+ //max_item := 18984000
+ //max_item := 18732000
+ //max_item := 16017000
+ //max_item := 15494000
+ //max_item := 15038031
+ //max_item := 14450000
+
+ const maxRoutines = 20
+
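+ // AnikHasibul/queue works here as a bounded-concurrency gate: Add blocks
+ // while maxRoutines goroutines are in flight, Done frees a slot and Wait
+ // blocks until all of them have finished (semantics assumed from usage).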
+ q := queue.New(maxRoutines)
+ defer q.Close()
+ for i := max_item; i > 22600000; i-- {
+ q.Add()
+ go func(i int) {
+ defer q.Done()
+
+ Story, ok := getStory(i)
+ if ok {
+ log.Infof("%+v\n", Story)
+ // a local err avoids a data race between the goroutines
+ if err := app.saveStory(Story); err != nil {
+ log.Fatal(err)
+ }
+ }
+
+ /*
+ * Prints status update every 1000th entry
+ */
+ if i%1000 == 0 {
+ log.Debugf("%s: Getting item %d\n", time.Now(), i)
+ }
+ }(i)
+ }
+ q.Wait()
+}
+
+func getMaxItem() int {
+ response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json")
+ if err != nil {
+ panic(err)
+ }
+ data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+ max_item, err := strconv.Atoi(string(data))
+ if err != nil {
+ panic(err)
+ }
+
+ return max_item
+}
+
+func (app *App) topStories() {
+ var err error
+
+ // fetch the "top" and "best" id lists and merge them; duplicate ids are
+ // harmless because saveStory uses INSERT IGNORE
+ var story_ids, best_ids []int
+ if err = json.Unmarshal(getTopStories(), &story_ids); err != nil {
+ log.Warn("topStories: Unmarshaling top stories json failed")
+ panic(err)
+ }
+ if err = json.Unmarshal(getBestStories(), &best_ids); err != nil {
+ log.Warn("topStories: Unmarshaling best stories json failed")
+ panic(err)
+ }
+ story_ids = append(story_ids, best_ids...)
+
+ const maxRoutines = 20
+
+ q := queue.New(maxRoutines)
+ defer q.Close()
+ for _, id := range story_ids {
+ q.Add()
+ go func(id int) {
+ defer q.Done()
+ Story, ok := getStory(id)
+ if ok {
+ log.Infof("%+v\n", Story)
+ // a local err avoids a data race between the goroutines
+ if err := app.saveStory(Story); err != nil {
+ log.Fatal(err)
+ }
+
+ categories, ok := app.crawlForCategories(Story.Url)
+ if ok {
+ article_id := app.getArticleIdFromUrl(Story.Url)
+ app.saveCategory(article_id, categories)
+ }
+
+ }
+ }(id)
+ }
+ q.Wait()
+}
+
+func getStory(id int) (Story, bool) {
+ Story := getDetail(id)
+ if Story.Dead || Story.Deleted {
+ return Story, false
+ }
+ if Story.Score < 10 && Story.Descendants < 10 {
+ return Story, false
+ }
+ /*
+ if (time.Now().Unix() - 3456000) > int64(Story.Time) {
+ }
+ */
+
+ Story.Title = stripHNPrefix(Story.Title)
+
+ u, err := url.Parse(Story.Url)
+ if err != nil {
+ log.Warnf("getStory: Parsing URL failed: %s\n", err.Error())
+ return Story, false
+ }
+ is_gh, err := regexp.MatchString(`github\.com($|/)`, u.Host)
+ if err != nil {
+ log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ return Story, false
+ }
+ is_wiki, err := regexp.MatchString(`wikipedia\.org($|/)`, u.Host)
+ if err != nil {
+ log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ return Story, false
+ }
+ if is_gh {
+ return Story, true
+ }
+ if is_wiki {
+ Story.Url = wikipediaNormalizeUrl(Story.Url)
+ Story.Url = wikipediaRealUrl(Story.Url)
+ return Story, true
+ }
+ return Story, false
+}
+
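+// getResponse fetches url, retrying up to four times on connection errors
+// before panicking. Callers are responsible for closing the body.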
+func getResponse(url string) *http.Response {
+ response, err := http.Get(url)
+ if err != nil {
+ for i := 0; i < 4; i++ {
+ log.Warn("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
+ resp2, err2 := http.Get(url)
+ if err2 == nil {
+ return resp2
+ }
+ }
+ panic(err)
+ }
+ return response
+}
+
+func getBestResponse() *http.Response {
+ _url := "https://hacker-news.firebaseio.com/v0/beststories.json"
+ return getResponse(_url)
+}
+
+func getTopResponse() *http.Response {
+ _url := "https://hacker-news.firebaseio.com/v0/topstories.json"
+ return getResponse(_url)
+}
+
+func getWikipediaResponse(title string) *http.Response {
+ _url := "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=" + title
+ return getResponse(_url)
+}
+
+func getWikipediaRedirectResponse(hostname, title string) *http.Response {
+ _url := "https://" + hostname + "/w/api.php?action=query&prop=info&format=json&redirects=1&inprop=url&titles=" + title
+ return getResponse(_url)
+}
+
+func getStoryResponse(item_id string) *http.Response {
+ _url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json"
+ return getResponse(_url)
+}
+
+func getDetail(id int) Story {
+ response := getStoryResponse(strconv.Itoa(id))
+ defer response.Body.Close()
+ data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+ var story Story
+ err = json.Unmarshal(data, &story)
+ if err != nil {
+ log.Warn("getDetail: Unmarshaling json failed")
+ panic(err)
+ }
+ return story
+}
+
+func getTopStories() []byte {
+ response := getTopResponse()
+ defer response.Body.Close()
+ data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+
+ return data
+}
+
+func getBestStories() []byte {
+ response := getBestResponse()
+ defer response.Body.Close()
+ data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+
+ return data
+}
+
+func (app *App) updateAllDiscussions() {
+ const maxRoutines = 20
+ var item_ids []int
+
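+ // 3456000 seconds = 40 days: only refresh reasonably recent discussions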
+ err := app.DB.Select(&item_ids, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) ORDER BY posted_on")
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ q := queue.New(maxRoutines)
+ defer q.Close()
+
+ for _, item_id := range item_ids {
+ q.Add()
+ go func(item_id int) {
+ defer q.Done()
+ Story, ok := getStory(item_id)
+ if !ok {
+ /**
+ * Check if we got a network error or a dead story.
+ */
+ if 0 == Story.Id {
+ log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", item_id)
+ } else if Story.Descendants > 10 || Story.Score > 10 {
+ log.Warnf("%+v\n", Story)
+ }
+ return
+ }
+ err := app.updateDiscussion(Story)
+ if err != nil {
+ log.Warn(err)
+ return
+ }
+ }(item_id)
+ }
+ q.Wait()
+}
diff --git a/struct.go b/struct.go
new file mode 100644
index 0000000..f5b8f7c
--- /dev/null
+++ b/struct.go
@@ -0,0 +1,20 @@
+package main
+
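+// Story mirrors a Hacker News item as served by the Firebase API
+// (https://hacker-news.firebaseio.com/v0/item/<id>.json). encoding/json
+// matches the lowercase JSON keys to these exported fields
+// case-insensitively, so no struct tags are needed. Descendants is the
+// total comment count; Time is the creation time as a Unix timestamp.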
+type Story struct {
+ Id int
+ Deleted bool
+ Type string
+ Time int
+ Text string
+ Dead bool
+ Url string
+ Score int
+ Title string
+ Descendants int
+}
+
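+// Category is a WikiProject label as scraped from xtools.wmflabs.org.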
+type Category struct {
+ ID int
+ Name string
+ Url string
+}
diff --git a/wikipedia.go b/wikipedia.go
new file mode 100644
index 0000000..338881a
--- /dev/null
+++ b/wikipedia.go
@@ -0,0 +1,272 @@
+package main
+
+import (
+ "encoding/json"
+ "io/ioutil"
+ "net/url"
+ "regexp"
+ "strings"
+
+ log "github.com/Sirupsen/logrus"
+ "github.com/gocolly/colly"
+)
+
+func (app *App) crawlWikipedia(url string) {
+ c := colly.NewCollector()
+
+ c.OnHTML("#mw-normal-catlinks", func(e *colly.HTMLElement) {
+ e.ForEach("ul > li > a", func(i int, e *colly.HTMLElement) {
+ log.Debug("Text: " + e.Text + " Title: " + e.Attr("title") + " Url: " + e.Attr("href"))
+ })
+ })
+ c.OnHTML("#firstHeading", func(e *colly.HTMLElement) {
+ log.Debug("Title: " + e.Text)
+ })
+
+ err := c.Visit(url)
+ if err != nil {
+ log.Fatal(err)
+ }
+}
+
+func (app *App) getAllArticles() {
+ rows, err := app.DB.Query("SELECT DISTINCT article_id FROM discussion;")
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for rows.Next() {
+ var article_id int
+
+ err = rows.Scan(&article_id)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ log.Println(article_id)
+ }
+}
+
+func (app *App) wikipediaFixAllUrls() {
+ rows, err := app.DB.Query("SELECT id, url FROM article WHERE (url LIKE '%m.wiki%' OR url like 'http:%');")
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for rows.Next() {
+ var id int
+ var url string
+
+ err = rows.Scan(&id, &url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ //log.Debug(id, url)
+
+ url = wikipediaNormalizeUrl(url)
+
+ row := app.DB.QueryRow("SELECT count(*) FROM article WHERE url = ?", url)
+ var count int
+ err = row.Scan(&count)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if 0 < count {
+ err = app.wikipediaMergeArticles(id, url)
+ if err != nil {
+ log.Fatal(err)
+ }
+ continue
+ }
+ err = app.updateArticleUrl(id, url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ //log.Debug("UPDATE article SET url = " + url + " WHERE id = " + strconv.Itoa(id))
+ }
+}
+
+func wikipediaNormalizeUrl(url string) string {
+ // force https and strip the mobile subdomain, e.g.
+ // http://en.m.wikipedia.org/wiki/Foo -> https://en.wikipedia.org/wiki/Foo
+ url = regexp.MustCompile(`^http://`).ReplaceAllString(url, "https://")
+ url = regexp.MustCompile(`m\.wikipedia\.org`).ReplaceAllString(url, "wikipedia.org")
+ return url
+}
+
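+// _changeTitle picks the longer of the two stripped titles but currently
+// only logs its decision; the actual UPDATE is not implemented yet.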
+func (app *App) _changeTitle(id_to_delete int, correct_url string) {
+ var new_title string
+
+ query := `
+ SELECT * FROM
+ (SELECT title as old_title FROM article WHERE id = ?) as t1
+ JOIN
+ (SELECT title as cur_title FROM article WHERE url = ?) as t2
+ ;
+ `
+
+ row := app.DB.QueryRow(query, id_to_delete, correct_url)
+ var old_title string
+ var cur_title string
+ err := row.Scan(&old_title, &cur_title)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ old_title = stripHNPrefix(old_title)
+ cur_title = stripHNPrefix(cur_title)
+
+ if len(old_title) > len(cur_title) {
+ new_title = old_title
+ } else {
+ new_title = cur_title
+ }
+
+ log.Printf("new_title: %s, old_title: %s, cur_title: %s \n", new_title, old_title, cur_title)
+}
+
+func getWikipediaExcerpt(title string) string {
+ response := getWikipediaResponse(title)
+ defer response.Body.Close()
+ resp_data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+
+ var data map[string]interface{}
+ if err = json.Unmarshal(resp_data, &data); err != nil {
+ log.Warn("getWikipediaExcerpt: Unmarshaling json failed")
+ log.Fatal(err)
+ }
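+ // a panic recovered here (unexpected JSON shape) makes the function
+ // return ""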
+ defer func() { recover() }()
+ query := (data["query"]).(map[string]interface{})
+ pages := query["pages"].(map[string]interface{})
+ for _, page := range pages {
+ extract := page.(map[string]interface{})
+ excerpt := (extract["extract"]).(string)
+ excerpt = strings.TrimSpace(excerpt)
+ if "" != excerpt {
+ return excerpt
+ }
+ }
+ return ""
+}
+
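+// getWikipediaTitle splits a Wikipedia url into title and hostname, e.g.
+// "https://en.wikipedia.org/wiki/Static_program_analysis" yields
+// ("Static_program_analysis", "en.wikipedia.org").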
+func getWikipediaTitle(full_url string) (string, string) {
+ u, err := url.Parse(full_url)
+ if err != nil {
+ log.Fatal(err)
+ }
+ title := u.Query().Get("title")
+ if "" == title {
+ title = strings.TrimPrefix(u.EscapedPath(), "/wiki/")
+ }
+ title = strings.TrimSpace(title)
+ return title, u.Hostname()
+}
+
+func (app *App) saveExcerpts() error {
+ query := "SELECT id, url FROM article WHERE excerpt_html IS NULL;"
+
+ rows, err := app.DB.Query(query)
+ if err != nil {
+ return err
+ }
+ defer rows.Close()
+
+ for rows.Next() {
+ var id int
+ var url string
+
+ err = rows.Scan(&id, &url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ title, _ := getWikipediaTitle(url)
+ excerpt := getWikipediaExcerpt(title)
+
+ query = "UPDATE article SET excerpt_html = ? WHERE id = ?"
+ stmt, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Warn("saveExcerpts: Preparing query failed")
+ return err
+ }
+ defer stmt.Close()
+
+ _, err = stmt.Exec(excerpt, id)
+ if err != nil {
+ log.Warn("saveExcerpts: Executing stmt failed")
+ return err
+ }
+
+ }
+ return nil
+}
+
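+// wikipediaRealUrl resolves redirects through the MediaWiki info API
+// (action=query&prop=info&inprop=url&redirects=1) and returns the reported
+// canonicalurl with any #fragment re-attached; when the title is empty or
+// the response cannot be used, it falls back to the input url.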
+func wikipediaRealUrl(wiki_url string) (real_url string) {
+ /**
+ * We don't change urls with parameters, because we would lose the context.
+ */
+ if strings.Contains(wiki_url, "&") {
+ return wiki_url
+ }
+
+ var fragment string
+
+ u, err := url.Parse(wiki_url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ if u.Fragment != "" {
+ fragment = "#" + u.Fragment
+ }
+
+ title, hostname := getWikipediaTitle(wiki_url)
+
+ if title == "/" || title == "" {
+ return wiki_url
+ }
+
+ response := getWikipediaRedirectResponse(hostname, title)
+ defer response.Body.Close()
+ resp_data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+
+ var data map[string]interface{}
+ if err = json.Unmarshal(resp_data, &data); err != nil {
+ log.Warn("wikipediaRealUrl: Unmarshaling json failed ", string(resp_data))
+ log.Fatal(err)
+ }
+ // a panicking type assertion below means the JSON had an unexpected
+ // shape; recover and fall back to the original url
+ defer func() {
+ if recover() != nil {
+ real_url = wiki_url
+ }
+ }()
+ query := (data["query"]).(map[string]interface{})
+ pages := query["pages"].(map[string]interface{})
+ for _, page := range pages {
+ info := page.(map[string]interface{})
+ canonical_url := info["canonicalurl"].(string)
+ if "" != canonical_url {
+ return canonical_url + fragment
+ }
+ }
+ return wiki_url
+}