From 0b90b7a3b0f38f0babf4d788f4d7dd5e43253341 Mon Sep 17 00:00:00 2001
From: horus
Date: Thu, 2 Apr 2020 21:53:30 +0200
Subject: Initial commit.

---
 categories.go | 145 ++++++++++++++++++++
 config.go     | 104 ++++++++++++++
 database.go   | 425 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 helper.go     |  15 +++
 init.go       |  58 ++++++++
 main.go       | 322 ++++++++++++++++++++++++++++++++++++++++++++
 struct.go     |  20 +++
 wikipedia.go  | 272 +++++++++++++++++++++++++++++++++++++
 8 files changed, 1361 insertions(+)
 create mode 100644 categories.go
 create mode 100644 config.go
 create mode 100644 database.go
 create mode 100644 helper.go
 create mode 100644 init.go
 create mode 100644 main.go
 create mode 100644 struct.go
 create mode 100644 wikipedia.go

diff --git a/categories.go b/categories.go
new file mode 100644
index 0000000..2ee3448
--- /dev/null
+++ b/categories.go
+package main
+
+import (
+    "encoding/json"
+    "io/ioutil"
+    "strings"
+
+    log "github.com/Sirupsen/logrus"
+    "github.com/gocolly/colly"
+)
+
+func (app *App) crawlForCategories(wiki_url string) ([]string, bool) {
+    return app.queryWMLabs(wiki_url)
+}
+
+func (app *App) queryWMLabs(wiki_url string) ([]string, bool) {
+    // Recovers from panics below; the zero values (nil, false) are
+    // returned in that case.
+    defer func() { recover() }()
+
+    var categories []string
+
+    title, hostname := getWikipediaTitle(wiki_url)
+    wm_url := "https://xtools.wmflabs.org/api/page/assessments/" + hostname + "/" + title
+
+    if "" == title || "/" == title {
+        return []string{}, false
+    }
+
+    response := getResponse(wm_url)
+    resp_data, err := ioutil.ReadAll(response.Body)
+    if err != nil {
+        log.Warnf("queryWMLabs: Reading response data failed for %s", wm_url)
+        panic(err)
+    }
+
+    var data map[string]interface{}
+    if err = json.Unmarshal(resp_data, &data); err != nil {
+        log.Warnf("queryWMLabs: Decoding JSON failed for: %s", wm_url)
+        panic(err)
+    }
+
+    for k, v := range data {
+        if "project" != k && "elapsed_time" != k {
+            wp := v.(map[string]interface{})
+            for k2, v2 := range wp {
+                if k2 == "wikiprojects" {
+                    list := v2.(map[string]interface{})
+                    for k3 := range list {
+                        cat := normalizeCategory(k3)
+                        if "" != cat {
+                            categories = append(categories, cat)
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if len(categories) > 0 {
+        return categories, true
+    }
+    return categories, false
+}
+
+func (app *App) crawlWMLabs(wiki_url string) (Category, bool) {
+    var err error
+
+    //path := strings.TrimPrefix(u.EscapedPath(), "/wiki/")
+    title, hostname := getWikipediaTitle(wiki_url)
+    wm_url := "https://xtools.wmflabs.org/articleinfo/" + hostname + "/" + title
+
+    if "" == title || "/" == title {
+        return Category{}, false
+    }
+
+    var category Category
+    c := colly.NewCollector()
+
+    c.OnHTML(".sort-entry--wikiproject", func(e *colly.HTMLElement) {
+        category.Name = strings.TrimSpace(e.Text)
+        category.Url = strings.TrimSpace(e.Attr("href"))
+    })
+
+    err = c.Visit(wm_url)
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    if category.Name == "" || category.Url == "" {
+        log.Warnf("title: %s WM URL: %s \tWiki Url: %s", title, wm_url, wiki_url)
+    } else {
+        log.Warnf("crawler: %+v", category)
+    }
+    return category, true
+}
+
+func (app *App) saveAllCategories() {
+    rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL;")
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    for rows.Next() {
+        var article_id int
+        var wiki_url string
+
+        err = rows.Scan(&article_id, &wiki_url)
+        if err != nil {
+            log.Fatal(err)
+        }
+
+        //category, ok := app.crawlWMLabs(wiki_url)
+        categories, ok := app.queryWMLabs(wiki_url)
+        if ok {
+            app.saveCategory(article_id, categories)
+        } else {
+            log.Debug("saveAllCategories: No categories for " + wiki_url)
+        }
+    }
+}
+
+func normalizeCategory(s string) string {
+    cat := strings.TrimSpace(s)
+    cat = strings.TrimSuffix(cat, "task force")
+    cat = strings.TrimSuffix(cat, "taskforce")
+    cat = strings.TrimSuffix(cat, "Taskforce")
+    cat = strings.TrimSuffix(cat, "Task Force")
+
+    if strings.Contains(strings.ToLower(cat), "articles") {
+        return ""
+    }
+    /**
+     * The category "Wikipedia" is very useful, but other occurrences are not.
+     */
+    if strings.Contains(strings.ToLower(cat), "wikipedia") && strings.ToLower(cat) != "wikipedia" {
+        return ""
+    }
+    if strings.Contains(strings.ToLower(cat), "/wikiproject") {
+        cat = strings.ReplaceAll(cat, "WikiProject ", "")
+        cat = strings.ReplaceAll(cat, "wikiproject ", "")
+        cat = strings.ReplaceAll(cat, "Wikiproject ", "")
+    }
+    cat = strings.TrimSpace(cat)
+
+    return cat
+}
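For orientation, queryWMLabs above expects an xtools page-assessments payload of roughly the following shape. This is a trimmed, hypothetical sample reconstructed from the parsing code, not the documented API schema; the page title and project names are invented:

    // Hypothetical sample of the xtools assessments response. Top-level keys
    // other than "project" and "elapsed_time" are page titles; the keys of
    // the nested "wikiprojects" object are what normalizeCategory receives.
    const sampleAssessments = `{
        "project": "en.wikipedia.org",
        "elapsed_time": 0.05,
        "Example_page": {
            "wikiprojects": {
                "WikiProject Computing": {"class": "B", "importance": "Mid"},
                "WikiProject History": {"class": "C", "importance": "Low"}
            }
        }
    }`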
diff --git a/config.go b/config.go
new file mode 100644
index 0000000..14c75ee
--- /dev/null
+++ b/config.go
+package main
+
+import (
+    "os"
+
+    log "github.com/Sirupsen/logrus"
+    "github.com/spf13/viper"
+)
+
+type Config struct {
+    DBDriver   string
+    DBDBName   string
+    DBHost     string
+    DBPort     string
+    DBUser     string
+    DBPassword string
+    DBOptions  string
+
+    UserAgent       string
+    Delay           int
+    IgnoreRobotsTXT bool
+
+    BasicAuthUsername string
+    BasicAuthPassword string
+
+    Debug bool // sets log level to debug
+}
+
+// Parses the configuration and fills the configuration struct.
+func (c *Config) parseConfig(configFile string) {
+
+    viper.SetDefault("DB_Driver", "mysql")
+    viper.SetDefault("DB_DBName", "ghrss")
+    viper.SetDefault("DB_Host", "localhost")
+    viper.SetDefault("DB_Port", "3306")
+
+    viper.SetDefault("Debug", false)
+    viper.SetDefault("Delay", 0)
+
+    // needs some refactoring to truly respect robots.txt
+    viper.SetDefault("IgnoreRobotsTXT", true)
+
+    viper.SetDefault("UserAgent", "colly - a friendly crawler :)")
+
+    // Name of the configuration file
+    viper.SetConfigName("config")
+
+    // Where to find the config file
+    if configFile == "" {
+        viper.AddConfigPath(".")
+    } else {
+        stat, err := os.Stat(configFile)
+        if os.IsNotExist(err) {
+            // provided config file does not exist, so we add the path instead
+            viper.AddConfigPath(configFile)
+        } else if err == nil && stat.IsDir() {
+            // adds the path to look for the config file
+            viper.AddConfigPath(configFile)
+        } else if err == nil {
+            // directly sets the config file
+            viper.SetConfigFile(configFile)
+        } else {
+            // if we are here something went wrong
+            log.Warn(err, "config.go: os.Stat("+configFile+") error")
+            // adding the path nonetheless because it's not hurting
+            viper.AddConfigPath(configFile)
+        }
+    }
+
+    // Env variables need to be prefixed with "DISCUSS_"; viper appends the
+    // underscore to the prefix itself.
+    viper.SetEnvPrefix("DISCUSS")
+
+    // Automatically binds matching env variables
+    viper.AutomaticEnv()
+
+    // Reads the config
+    err := viper.ReadInConfig()
+    if err != nil {
+        log.Fatal(err, "Config: Error parsing config file.")
+    }
+    log.Debug("Config: Config file used: " + viper.ConfigFileUsed())
+
+    c.setsConfig()
+}
+
+// Actually sets the config struct from the parsed values
+func (c *Config) setsConfig() {
+    c.DBDriver = viper.GetString("DB_Driver")
+    c.DBHost = viper.GetString("DB_Host")
+    c.DBPort = viper.GetString("DB_Port")
+    c.DBUser = viper.GetString("DB_User")
+    c.DBPassword = viper.GetString("DB_Password")
+    c.DBDBName = viper.GetString("DB_DBName")
+    c.DBOptions = viper.GetString("DB_Options")
+
+    c.UserAgent = viper.GetString("UserAgent")
+    c.Delay = viper.GetInt("Delay")
+    c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT")
+
+    c.BasicAuthUsername = viper.GetString("BasicAuthUsername")
+    c.BasicAuthPassword = viper.GetString("BasicAuthPassword")
+
+    c.Debug = viper.GetBool("Debug")
+}
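As a usage sketch of the resulting precedence (hypothetical values; parseConfig still insists on finding a config file): viper defaults are overridden by the file, and the file by DISCUSS_-prefixed environment variables:

    package main

    import (
        "fmt"
        "os"
    )

    func main() {
        // Hypothetical override: viper upper-cases the key and prepends the
        // "DISCUSS" prefix, so DB_Password resolves from DISCUSS_DB_PASSWORD.
        os.Setenv("DISCUSS_DB_PASSWORD", "s3cret")

        var c Config
        c.parseConfig("")         // reads ./config.<ext>; fatal if none exists
        fmt.Println(c.DBPassword) // "s3cret", regardless of the file's value
    }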
viper.GetString("UserAgent") + c.Delay = viper.GetInt("Delay") + c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT") + + c.BasicAuthUsername = viper.GetString("BasicAuthUsername") + c.BasicAuthPassword = viper.GetString("BasicAuthPassword") + + c.Debug = viper.GetBool("Debug") +} diff --git a/database.go b/database.go new file mode 100644 index 0000000..e40279e --- /dev/null +++ b/database.go @@ -0,0 +1,425 @@ +package main + +import ( + log "github.com/Sirupsen/logrus" + "regexp" + "strconv" + + "database/sql" + _ "github.com/go-sql-driver/mysql" +) + +func (app *App) saveStory(s Story) error { + match, err := regexp.MatchString("github.com($|/)", s.Url) + if err != nil { + log.Warn("Failed to parse and match regex") + return err + + } + if match { + /** + * Special handling for github stories. + */ + return app.saveCode(s) + } + + query := ` + INSERT IGNORE article ( + id, + created_at, + updated_at, + url, + title + ) VALUES ( + NULL, + ?, + ?, + ?, + ? + ); + ` + + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveStory: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, app.Now, s.Url, s.Title) + if err != nil { + log.Warn("saveStory: Statement execution failed") + return err + } + log.Debugf("saveStory: Successfull insert for item %d\n", s.Id) + + query = ` + INSERT IGNORE discussion ( + id, + created_at, + updated_at, + article_id, + title, + source, + item_id, + source_url, + posted_on, + comments, + upvotes + ) VALUES ( + NULL, + ?, + ?, + (SELECT id FROM article WHERE url = ?), + ?, + ?, + ?, + ?, + ?, + ?, + ? + ); + ` + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveStory: Preparing second query failed") + return err + } + defer stmt2.Close() + + _, err = stmt2.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score) + if err != nil { + log.Warn("saveStory: Statement execution failed") + return err + } + + return nil +} + +func (app *App) saveCode(s Story) error { + query := ` + INSERT IGNORE code( + id, + created_at, + updated_at, + url, + title, + source, + item_id, + source_url, + posted_on, + comments, + upvotes + ) VALUES ( + NULL, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ? + ); + ` + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveCode: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score) + if err != nil { + log.Warn("saveCode: Statement execution failed") + return err + } + + return nil +} + +func (app *App) updateDiscussion(story Story) error { + + query := ` + UPDATE discussion + set updated_at = ?, + comments = ?, + upvotes = ? + WHERE item_id = ?; + ` + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("updateDiscussion: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, story.Descendants, story.Score, story.Id) + if err != nil { + log.Warnf("updateDiscussion: Statement execution failed") + return err + } + log.Debugf("updateDiscussion: Successful update of %d with new Score: %d, Comments: %d\n", story.Id, story.Score, story.Descendants) + + return nil +} + +func (app *App) updateArticleUrl(id int, url string) error { + query := ` + UPDATE article + set updated_at = ?, + url = ? + WHERE id = ? 
+func (app *App) updateDiscussion(story Story) error {
+
+    query := `
+    UPDATE discussion
+    SET updated_at = ?,
+        comments = ?,
+        upvotes = ?
+    WHERE item_id = ?;
+    `
+    stmt, err := app.DB.Prepare(query)
+    if err != nil {
+        log.Warn("updateDiscussion: Preparing query failed")
+        return err
+    }
+    defer stmt.Close()
+
+    _, err = stmt.Exec(app.Now, story.Descendants, story.Score, story.Id)
+    if err != nil {
+        log.Warn("updateDiscussion: Statement execution failed")
+        return err
+    }
+    log.Debugf("updateDiscussion: Successful update of %d with new Score: %d, Comments: %d\n", story.Id, story.Score, story.Descendants)
+
+    return nil
+}
+
+func (app *App) updateArticleUrl(id int, url string) error {
+    query := `
+    UPDATE article
+    SET updated_at = ?,
+        url = ?
+    WHERE id = ?
+    `
+    stmt, err := app.DB.Prepare(query)
+    if err != nil {
+        log.Warn("updateArticleUrl: Preparing query failed")
+        return err
+    }
+    defer stmt.Close()
+
+    _, err = stmt.Exec(app.Now, url, id)
+    if err != nil {
+        log.Warn("updateArticleUrl: Statement execution failed")
+        return err
+    }
+    log.Debugf("updateArticleUrl: Successful update, new url: %s\n", url)
+
+    return nil
+}
+
+func (app *App) wikipediaMergeArticles(id_to_delete int, correct_url string) error {
+    query := "SELECT id FROM discussion WHERE article_id = ?"
+    row := app.DB.QueryRow(query, id_to_delete)
+    var disc_id int
+    err := row.Scan(&disc_id)
+    if err != nil {
+        log.Warnf("wikipediaMergeArticles: Query first row failed. id: %d url: %s", id_to_delete, correct_url)
+        return err
+    }
+    query = "SELECT id FROM article WHERE url = ?"
+    row = app.DB.QueryRow(query, correct_url)
+    var article_id int
+    err = row.Scan(&article_id)
+    if err != nil {
+        log.Warn("wikipediaMergeArticles: Query second row failed")
+        return err
+    }
+
+    query = "UPDATE discussion SET article_id = ?, updated_at = ? WHERE id = ?;"
+    stmt, err := app.DB.Prepare(query)
+    if err != nil {
+        log.Warn("wikipediaMergeArticles: Preparing query failed")
+        return err
+    }
+    defer stmt.Close()
+
+    _, err = stmt.Exec(article_id, app.Now, disc_id)
+    if err != nil {
+        log.Warn("wikipediaMergeArticles: Update discussion failed")
+        return err
+    }
+
+    // Re-point the category links of the article that is merged away.
+    query = "UPDATE article_category SET article_id = ? WHERE article_id = ?;"
+    stmt2, err := app.DB.Prepare(query)
+    if err != nil {
+        log.Warn("wikipediaMergeArticles: Preparing article_category query failed")
+        return err
+    }
+    defer stmt2.Close()
+
+    _, err = stmt2.Exec(article_id, id_to_delete)
+    if err != nil {
+        log.Warn("wikipediaMergeArticles: Update article_category failed")
+        return err
+    }
+
+    return nil
+}
+
+func (app *App) deleteOrphanedArticles() error {
+    query := `
+    DELETE a FROM
+        article AS a
+    LEFT JOIN
+        discussion AS d ON a.id = d.article_id
+    WHERE d.id IS NULL;`
+    _, err := app.DB.Exec(query)
+    if err != nil {
+        log.Warnf("deleteOrphanedArticles: Executing query failed: %s", err.Error())
+        return err
+    }
+
+    return nil
+}
+func (app *App) saveCategory(article_id int, categories []string) {
+
+    for _, category := range categories {
+        query := "SELECT id FROM category WHERE name = ?"
+        row := app.DB.QueryRow(query, category)
+        var category_id int
+        err := row.Scan(&category_id)
+
+        if err != nil && err != sql.ErrNoRows {
+            log.Warn("saveCategory: Selecting category id failed")
+            log.Fatal(err)
+        }
+
+        if err == sql.ErrNoRows {
+            query = `
+            INSERT INTO category (
+                id,
+                created_at,
+                updated_at,
+                name
+            ) VALUES (
+                null,
+                ?,
+                ?,
+                ?
+            )`
+
+            stmt, err := app.DB.Prepare(query)
+            if err != nil {
+                log.Fatal(err)
+            }
+            defer stmt.Close()
+
+            result, err := stmt.Exec(app.Now, app.Now, category)
+            if err != nil {
+                log.Fatal(err)
+            }
+
+            category_id64, err := result.LastInsertId()
+            if err != nil {
+                log.Fatal(err)
+            }
+            category_id = int(category_id64)
+        }
+
+        query = `
+        INSERT IGNORE article_category (
+            id,
+            article_id,
+            category_id
+        ) VALUES (
+            null,
+            ?,
+            ?
+        )
+        `
+
+        stmt2, err := app.DB.Prepare(query)
+        if err != nil {
+            log.Fatal(err)
+        }
+        defer stmt2.Close()
+
+        _, err = stmt2.Exec(article_id, category_id)
+        if err != nil {
+            log.Fatal(err)
+        }
+    }
+}
+
+func (app *App) updateWikipediaUrls() {
+    rows, err := app.DB.Query("SELECT DISTINCT id, url FROM article;")
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    for rows.Next() {
+        var wiki_url string
+        var article_id int
+
+        err = rows.Scan(&article_id, &wiki_url)
+        if err != nil {
+            log.Fatal(err)
+        }
+
+        real_url := wikipediaRealUrl(wiki_url)
+        if real_url != wiki_url && "" != real_url {
+
+            /**
+             * Check if we already have the canonical url and merge if necessary.
+             */
+            row := app.DB.QueryRow("SELECT count(*) FROM article WHERE url = ?", real_url)
+            var count int
+            err = row.Scan(&count)
+            if err != nil {
+                log.Fatal(err)
+            }
+            if 0 < count {
+                err = app.wikipediaMergeArticles(article_id, real_url)
+                if err != nil {
+                    log.Fatal(err)
+                }
+                continue
+            }
+
+            stmt, err := app.DB.Prepare("UPDATE article SET url = ? WHERE id = ?")
+            if err != nil {
+                log.Warnf("updateWikipediaUrls: Preparing query failed for: (%d) %s", article_id, wiki_url)
+                log.Fatal(err)
+            }
+            defer stmt.Close()
+
+            _, err = stmt.Exec(real_url, article_id)
+            if err != nil {
+                log.Warnf("updateWikipediaUrls: Executing statement failed for: (%d) %s", article_id, wiki_url)
+                log.Fatal(err)
+            }
+            log.Debugf("(%d) Updated from %s to %s", article_id, wiki_url, real_url)
+        }
+    }
+}
+
+func (app *App) getArticleIdFromUrl(wiki_url string) int {
+    row := app.DB.QueryRow("SELECT id FROM article WHERE url = ?", wiki_url)
+    var article_id int
+    err := row.Scan(&article_id)
+    if err != nil {
+        log.Warnf("getArticleIdFromUrl: Query or scanning failed for: %s", wiki_url)
+        log.Fatal(err)
+    }
+    return article_id
+}
+
+func (app *App) fixAllCategories() {
+    rows, err := app.DB.Query("SELECT id, name FROM category;")
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    for rows.Next() {
+        var category_id int
+        var category_name string
+
+        err = rows.Scan(&category_id, &category_name)
+        if err != nil {
+            log.Fatal(err)
+        }
+
+        category_fixed := normalizeCategory(category_name)
+        if category_fixed != category_name {
+            log.Warn(category_fixed)
+            stmt, err := app.DB.Prepare("UPDATE category SET name = ? WHERE id = ? AND name = ?")
+            if err != nil {
+                log.Warnf("fixAllCategories: Preparing query failed for: (%d) %s", category_id, category_fixed)
+                log.Fatal(err)
+            }
+            defer stmt.Close()
+
+            _, err = stmt.Exec(category_fixed, category_id, category_name)
+            if err != nil {
+                log.Warnf("fixAllCategories: Exec stmt failed for: (%d) %s", category_id, category_fixed)
+                log.Fatal(err)
+            }
+        }
+    }
+}
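database.go assumes a pre-existing schema that is not part of this commit. Below is a minimal sketch inferred from the queries above; column types and key constraints are guesses. In particular, the INSERT IGNORE statements only deduplicate if article.url, discussion.item_id, category.name and the article_category pair carry unique keys:

    // Schema sketch inferred from the queries in database.go; this is an
    // assumption for illustration, not the project's actual migrations.
    const schemaSketch = `
    CREATE TABLE article (
        id           INT AUTO_INCREMENT PRIMARY KEY,
        created_at   DATETIME,
        updated_at   DATETIME,
        url          VARCHAR(512) UNIQUE,
        title        TEXT,
        excerpt_html TEXT NULL
    );

    CREATE TABLE discussion (
        id         INT AUTO_INCREMENT PRIMARY KEY,
        created_at DATETIME,
        updated_at DATETIME,
        article_id INT,
        title      TEXT,
        source     VARCHAR(16),
        item_id    INT UNIQUE,
        source_url VARCHAR(512),
        posted_on  INT,
        comments   INT,
        upvotes    INT
    );

    -- "code" mirrors discussion but keeps url/title inline (see saveCode)
    CREATE TABLE code (
        id         INT AUTO_INCREMENT PRIMARY KEY,
        created_at DATETIME,
        updated_at DATETIME,
        url        VARCHAR(512),
        title      TEXT,
        source     VARCHAR(16),
        item_id    INT UNIQUE,
        source_url VARCHAR(512),
        posted_on  INT,
        comments   INT,
        upvotes    INT
    );

    CREATE TABLE category (
        id         INT AUTO_INCREMENT PRIMARY KEY,
        created_at DATETIME,
        updated_at DATETIME,
        name       VARCHAR(190) UNIQUE
    );

    CREATE TABLE article_category (
        id          INT AUTO_INCREMENT PRIMARY KEY,
        article_id  INT,
        category_id INT,
        UNIQUE KEY (article_id, category_id)
    );
    `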
diff --git a/helper.go b/helper.go
new file mode 100644
index 0000000..649c2d4
--- /dev/null
+++ b/helper.go
+package main
+
+import (
+    "strings"
+)
+
+func stripHNPrefix(title string) string {
+    title = strings.TrimPrefix(title, "Ask HN:")
+    title = strings.TrimPrefix(title, "Show HN:")
+    title = strings.TrimPrefix(title, "Tell HN:")
+    title = strings.TrimPrefix(title, "Experiment HN:")
+    title = strings.TrimPrefix(title, "Launch HN:")
+
+    return strings.TrimSpace(title)
+}
diff --git a/init.go b/init.go
new file mode 100644
index 0000000..7ce40c5
--- /dev/null
+++ b/init.go
+package main
+
+import (
+    "errors"
+    "strings"
+
+    log "github.com/Sirupsen/logrus"
+    flag "github.com/spf13/pflag"
+)
+
+// global config, gets overwritten by main
+var _conf Config
+
+func init() {
+    // overwrites unhelpful error message
+    flag.ErrHelp = errors.New("")
+
+    // we need to parse the config because of log level setting
+    configFile := flag.StringP("config", "c", "", "path to config file")
+    debug := flag.BoolP("debug", "d", false, "set log level to \"Debug\"")
+    verbose := flag.BoolP("verbose", "v", false, "set log level to \"Debug\", same as --debug")
+    silent := flag.BoolP("silent", "s", false, "suppress output except warnings")
+    loglevel_f := flag.String("loglevel", "Warn", `set log level, can be "Warn", "Info" or "Debug"`)
+    user_agent_f := flag.StringP("user-agent", "u", "", "set user agent")
+    delay_f := flag.Int("delay", 0, "enable and set delay in seconds between crawls (default 0)")
+    ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignore robots.txt")
+
+    flag.Parse()
+    loglevel := strings.ToLower(*loglevel_f)
+
+    if *debug || *verbose || loglevel == "debug" {
+        log.SetLevel(log.DebugLevel)
+    } else if loglevel == "info" {
+        log.SetLevel(log.InfoLevel)
+    } else {
+        log.SetLevel(log.WarnLevel)
+    }
+
+    if *silent {
+        log.SetLevel(log.WarnLevel)
+    }
+
+    _conf.parseConfig(*configFile)
+
+    if *user_agent_f != "" {
+        _conf.UserAgent = *user_agent_f
+    }
+    if *delay_f != 0 {
+        _conf.Delay = *delay_f
+    }
+    if !*ignore_robots_f {
+        _conf.IgnoreRobotsTXT = *ignore_robots_f
+    }
+
+    if _conf.Debug && !*silent {
+        log.SetLevel(log.DebugLevel)
+    }
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..1548776
--- /dev/null
+++ b/main.go
+package main
+
+import (
+    "encoding/json"
+    "fmt"
+    "io/ioutil"
+    "net/http"
+    "net/url"
+    "regexp"
+    "strconv"
+    "strings"
+    "time"
+
+    "github.com/AnikHasibul/queue"
+    log "github.com/Sirupsen/logrus"
+    "github.com/jmoiron/sqlx"
+)
+
+type App struct {
+    Config *Config
+    DB     *sqlx.DB
+    Now    time.Time
+}
+
+func main() {
+    var err error
+    _own_conf := _conf
+    app := App{Config: &_own_conf}
+    _conf = Config{}
+
+    app.Now = time.Now()
+
+    log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions))
+
+    app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions)
+    if err != nil {
+        log.Fatal(err, "Cannot connect to database")
+    }
+
+    if err = app.DB.Ping(); err != nil {
+        log.Fatal(err, "No connection to database")
+    }
+    defer app.DB.Close()
+
+    //app.fixAllCategories()
+
+    app.deleteOrphanedArticles()
+    app.topStories()
+    app.wikipediaFixAllUrls()
+    app.deleteOrphanedArticles()
+    app.saveExcerpts()
+    //app.saveAllCategories()
+    app.updateAllDiscussions()
+    //app.walkDown()
+
+    /**
+     * Resolve redirects on stored urls.
+     */
+    //app.updateWikipediaUrls()
+    //app.saveAllCategories()
+    //return
+}
+
+func (app *App) walkDown() {
+    max_item := getMaxItem()
+    //max_item := 22554000
+    //max_item := 22494596
+    //max_item := 22354383
+    //max_item := 18984000
+    //max_item := 18732000
+    //max_item := 16017000
+    //max_item := 15494000
+    //max_item := 15038031
+    //max_item := 14450000
+
+    const maxRoutines = 20
+
+    q := queue.New(maxRoutines)
+    defer q.Close()
+    for i := max_item; i > 22600000; i-- {
+        q.Add()
+        go func(i int) {
+            defer q.Done()
+
+            Story, ok := getStory(i)
+            if ok {
+                log.Infof("%+v\n", Story)
+                // err stays scoped to this goroutine to avoid a data race
+                if err := app.saveStory(Story); err != nil {
+                    log.Fatal(err)
+                }
+            }
+
+            /*
+             * Prints status update every 1000th entry
+             */
+            if i%1000 == 0 {
+                log.Debugf("%s: Getting item %d\n", time.Now(), i)
+            }
+        }(i)
+    }
+    q.Wait()
+}
+
+func getMaxItem() int {
+    response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json")
+    if err != nil {
+        panic(err)
+    }
+    data, err := ioutil.ReadAll(response.Body)
+    if err != nil {
+        panic(err)
+    }
+    max_item, err := strconv.Atoi(string(data))
+    if err != nil {
+        panic(err)
+    }
+
+    return max_item
+}
+
+func (app *App) topStories() {
+    // Joins the two JSON arrays into one: "[a,b" + "," + "c,d]"
+    data1 := strings.TrimSuffix(string(getTopStories()), "]")
+    data2 := strings.TrimPrefix(string(getBestStories()), "[")
+
+    data1 = data1 + ","
+    data := data1 + data2
+
+    var story_ids []int
+    err := json.Unmarshal([]byte(data), &story_ids)
+    if err != nil {
+        log.Warn("topStories: Unmarshaling json failed")
+        panic(err)
+    }
+
+    const maxRoutines = 20
+
+    q := queue.New(maxRoutines)
+    defer q.Close()
+    for _, id := range story_ids {
+        q.Add()
+        go func(id int) {
+            defer q.Done()
+            Story, ok := getStory(id)
+            if ok {
+                log.Infof("%+v\n", Story)
+                if err := app.saveStory(Story); err != nil {
+                    log.Fatal(err)
+                }
+
+                categories, ok := app.crawlForCategories(Story.Url)
+                if ok {
+                    article_id := app.getArticleIdFromUrl(Story.Url)
+                    app.saveCategory(article_id, categories)
+                }
+            }
+        }(id)
+    }
+    q.Wait()
+}
+
+func getStory(id int) (Story, bool) {
+    Story := getDetail(id)
+    if Story.Dead || Story.Deleted {
+        return Story, false
+    }
+    if Story.Score < 10 && Story.Descendants < 10 {
+        return Story, false
+    }
+    /*
+        if (time.Now().Unix() - 3456000) > int64(Story.Time) {
+        }
+    */
+
+    Story.Title = stripHNPrefix(Story.Title)
+
+    u, err := url.Parse(Story.Url)
+    if err != nil {
+        log.Warnf("getStory: Parsing URL failed: %s\n", err.Error())
+        return Story, false
+    }
+    is_gh, err := regexp.MatchString("(github.com)($|/)", u.Host)
+    if err != nil {
+        log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+        return Story, false
+    }
+    is_wiki, err := regexp.MatchString("wikipedia.org($|/)", u.Host)
+    if err != nil {
+        log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+        return Story, false
+    }
+    if is_gh {
+        return Story, true
+    }
+    if is_wiki {
+        Story.Url = wikipediaNormalizeUrl(Story.Url)
+        Story.Url = wikipediaRealUrl(Story.Url)
+        return Story, true
+    }
+    return Story, false
+}
+
+func getResponse(url string) *http.Response {
+    response, err := http.Get(url)
+    if err != nil {
+        for i := 0; i < 4; i++ {
+            log.Warn("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
+            resp2, err2 := http.Get(url)
+            if err2 == nil {
+                return resp2
+            }
+        }
+        panic(err)
+    }
+    return response
+}
+
+func getBestResponse() *http.Response {
+    _url := "https://hacker-news.firebaseio.com/v0/beststories.json"
+    return getResponse(_url)
+}
+
+func getTopResponse() *http.Response {
+    _url := "https://hacker-news.firebaseio.com/v0/topstories.json"
+    return getResponse(_url)
+}
+
+func getWikipediaResponse(title string) *http.Response {
+    _url := "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=" + title
+    return getResponse(_url)
+}
+
+func getWikipediaRedirectResponse(hostname, title string) *http.Response {
+    _url := "https://" + hostname + "/w/api.php?action=query&prop=info&format=json&redirects=1&inprop=url&titles=" + title
+    return getResponse(_url)
+}
+
+func getStoryResponse(item_id string) *http.Response {
+    _url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json"
+    return getResponse(_url)
+}
+
+func getDetail(id int) Story {
+    response := getStoryResponse(strconv.Itoa(id))
+    data, err := ioutil.ReadAll(response.Body)
+    if err != nil {
+        panic(err)
+    }
+    var story Story
+    err = json.Unmarshal(data, &story)
+    if err != nil {
+        log.Warn("getDetail: Unmarshaling json failed")
+        panic(err)
+    }
+    return story
+}
+
+func getTopStories() []byte {
+    response := getTopResponse()
+    data, err := ioutil.ReadAll(response.Body)
+    if err != nil {
+        panic(err)
+    }
+
+    return data
+}
+
+func getBestStories() []byte {
+    response := getBestResponse()
+
+    data, err := ioutil.ReadAll(response.Body)
+    if err != nil {
+        panic(err)
+    }
+
+    return data
+}
+
+func (app *App) updateAllDiscussions() {
+    const maxRoutines = 20
+    var item_ids []int
+
+    err := app.DB.Select(&item_ids, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) order by posted_on")
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    q := queue.New(maxRoutines)
+    defer q.Close()
+
+    for _, item_id := range item_ids {
+        q.Add()
+        go func(item_id int) {
+            defer q.Done()
+            Story, ok := getStory(item_id)
+            if !ok {
+                /**
+                 * Check if we got a network error or a dead story.
+                 */
+                if 0 == Story.Id {
+                    log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", item_id)
+                } else if Story.Descendants > 10 || Story.Score > 10 {
+                    log.Warnf("%+v\n", Story)
+                }
+                return
+            }
+            err := app.updateDiscussion(Story)
+            if err != nil {
+                log.Warn(err)
+                return
+            }
+        }(item_id)
+    }
+    q.Wait()
+}
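main.go bounds its goroutine fan-out with AnikHasibul/queue, using only the calls seen above (New, Add, Done, Wait, Close). A minimal standalone sketch of the pattern, assuming Add blocks while the queue is full and with a placeholder workload:

    package main

    import (
        "fmt"

        "github.com/AnikHasibul/queue"
    )

    func main() {
        // At most 20 workers run concurrently; Add reserves a slot and
        // Wait blocks until every Done has arrived.
        q := queue.New(20)
        defer q.Close()

        for i := 0; i < 100; i++ {
            q.Add()
            go func(i int) {
                defer q.Done()
                fmt.Println("item", i) // placeholder workload
            }(i)
        }
        q.Wait()
    }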
diff --git a/struct.go b/struct.go
new file mode 100644
index 0000000..f5b8f7c
--- /dev/null
+++ b/struct.go
+package main
+
+type Story struct {
+    Id          int
+    Deleted     bool
+    Type        string
+    Time        int
+    Text        string
+    Dead        bool
+    Url         string
+    Score       int
+    Title       string
+    Descendants int
+}
+
+type Category struct {
+    ID   int
+    Name string
+    Url  string
+}
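Story carries no json struct tags; it relies on encoding/json matching the lowercase Hacker News field names to the exported struct fields case-insensitively. A standalone sketch, assuming it runs inside this package next to the Story definition (the item values are illustrative):

    package main

    import (
        "encoding/json"
        "fmt"
    )

    func main() {
        // Trimmed, illustrative HN item: "id" binds to Id, "descendants"
        // to Descendants, and so on, via case-insensitive field matching.
        raw := []byte(`{"id": 8863, "type": "story", "score": 104,
            "descendants": 71, "time": 1175714200,
            "title": "Example story", "url": "https://example.com/"}`)

        var s Story
        if err := json.Unmarshal(raw, &s); err != nil {
            panic(err)
        }
        fmt.Printf("%+v\n", s)
    }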
diff --git a/wikipedia.go b/wikipedia.go
new file mode 100644
index 0000000..338881a
--- /dev/null
+++ b/wikipedia.go
+package main
+
+import (
+    "encoding/json"
+    "regexp"
+    "strings"
+    //"strconv"
+    "io/ioutil"
+    "net/url"
+
+    log "github.com/Sirupsen/logrus"
+    "github.com/gocolly/colly"
+)
+
+func (app *App) crawlWikipedia(url string) {
+    c := colly.NewCollector()
+
+    c.OnHTML("#mw-normal-catlinks", func(e *colly.HTMLElement) {
+        e.ForEach("ul > li > a", func(i int, e *colly.HTMLElement) {
+            log.Debug("Text: " + e.Text + " Title: " + e.Attr("title") + " Url: " + e.Attr("href"))
+        })
+    })
+    c.OnHTML("#firstHeading", func(e *colly.HTMLElement) {
+        log.Debug("Title: " + e.Text)
+    })
+
+    err := c.Visit(url)
+    if err != nil {
+        log.Fatal(err)
+    }
+}
+
+func (app *App) getAllArticles() {
+    rows, err := app.DB.Query("SELECT DISTINCT article_id FROM discussion;")
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    for rows.Next() {
+        var article_id int
+
+        err = rows.Scan(&article_id)
+        if err != nil {
+            log.Fatal(err)
+        }
+
+        log.Println(article_id)
+    }
+}
+
+func (app *App) wikipediaFixAllUrls() {
+    rows, err := app.DB.Query("SELECT id, url FROM article WHERE (url LIKE '%m.wiki%' OR url like 'http:%');")
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    for rows.Next() {
+        var id int
+        var url string
+
+        err = rows.Scan(&id, &url)
+        if err != nil {
+            log.Fatal(err)
+        }
+
+        //log.Debug(id, url)
+
+        url = wikipediaNormalizeUrl(url)
+
+        row := app.DB.QueryRow("SELECT count(*) FROM article WHERE url = ?", url)
+        var count int
+        err = row.Scan(&count)
+        if err != nil {
+            log.Fatal(err)
+        }
+        if 0 < count {
+            err = app.wikipediaMergeArticles(id, url)
+            if err != nil {
+                log.Fatal(err)
+            }
+            continue
+        }
+        err = app.updateArticleUrl(id, url)
+        if err != nil {
+            log.Fatal(err)
+        }
+
+        //log.Debug("UPDATE article SET url = " + url + " WHERE id = " + strconv.Itoa(id))
+    }
+}
+
+func wikipediaNormalizeUrl(url string) string {
+    match, err := regexp.MatchString("^http://", url)
+    if err != nil {
+        log.Fatal(err)
+    }
+    if match {
+        r := regexp.MustCompile("^http://")
+        url = r.ReplaceAllString(url, "https://")
+    }
+
+    match, err = regexp.MatchString("m.wikipedia.org", url)
+    if err != nil {
+        log.Fatal(err)
+    }
+    if match {
+        r := regexp.MustCompile("m.wikipedia.org")
+        url = r.ReplaceAllString(url, "wikipedia.org")
+    }
+    return url
+}
+
+func (app *App) _changeTitle(id_to_delete int, correct_url string) {
+    var new_title string
+
+    query := `
+    SELECT * FROM
+        (SELECT title as old_title FROM article WHERE id = ?) as t1
+    JOIN
+        (SELECT title as cur_title FROM article WHERE url = ?) as t2
+    ;
+    `
+
+    row := app.DB.QueryRow(query, id_to_delete, correct_url)
+    var old_title string
+    var cur_title string
+    err := row.Scan(&old_title, &cur_title)
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    old_title = stripHNPrefix(old_title)
+    cur_title = stripHNPrefix(cur_title)
+
+    if len(old_title) > len(cur_title) {
+        new_title = old_title
+    } else {
+        new_title = cur_title
+    }
+
+    log.Printf("new_title: %s, old_title: %s, cur_title: %s \n", new_title, old_title, cur_title)
+}
+
+func getWikipediaExcerpt(title string) string {
+    response := getWikipediaResponse(title)
+    resp_data, err := ioutil.ReadAll(response.Body)
+    if err != nil {
+        panic(err)
+    }
+
+    var data map[string]interface{}
+    if err = json.Unmarshal(resp_data, &data); err != nil {
+        log.Warn("getWikipediaExcerpt: Unmarshaling json failed")
+        log.Fatal(err)
+    }
+    // Recovers from failing type assertions below; "" is returned then.
+    defer func() { recover() }()
+    query := (data["query"]).(map[string]interface{})
+    pages := query["pages"].(map[string]interface{})
+    for _, site_id := range pages {
+        extract := site_id.(map[string]interface{})
+        excerpt := (extract["extract"]).(string)
+        excerpt = strings.TrimSpace(excerpt)
+        if "" != excerpt {
+            return excerpt
+        }
+    }
+    return ""
+}
+
+func getWikipediaTitle(full_url string) (string, string) {
+    u, err := url.Parse(full_url)
+    if err != nil {
+        log.Fatal(err)
+    }
+    var title string
+    title = u.Query().Get("title")
+    if "" == title {
+        title = strings.TrimPrefix(u.EscapedPath(), "/wiki/")
+    }
+    title = strings.TrimSpace(title)
+    return title, u.Hostname()
+}
+
+func (app *App) saveExcerpts() error {
+    query := "SELECT id, url FROM article WHERE excerpt_html IS NULL;"
+
+    rows, err := app.DB.Query(query)
+    if err != nil {
+        return err
+    }
+
+    for rows.Next() {
+        var id int
+        var url string
+
+        err = rows.Scan(&id, &url)
+        if err != nil {
+            log.Fatal(err)
+        }
+
+        title, _ := getWikipediaTitle(url)
+        excerpt := getWikipediaExcerpt(title)
+
+        query = "UPDATE article SET excerpt_html = ? WHERE id = ?"
+        stmt, err := app.DB.Prepare(query)
+        if err != nil {
+            log.Warn("saveExcerpts: Preparing query failed")
+            return err
+        }
+        defer stmt.Close()
+
+        _, err = stmt.Exec(excerpt, id)
+        if err != nil {
+            log.Warn("saveExcerpts: Executing stmt failed")
+            return err
+        }
+    }
+    return nil
+}
+
+func wikipediaRealUrl(wiki_url string) string {
+    /**
+     * We don't change URLs with query parameters, because we would lose the context.
+     */
+    if strings.Contains(wiki_url, "&") {
+        return wiki_url
+    }
+
+    var fragment string
+
+    u, err := url.Parse(wiki_url)
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    if u.Fragment != "" {
+        fragment = "#" + u.Fragment
+    }
+
+    title, hostname := getWikipediaTitle(wiki_url)
+
+    if title == "/" || title == "" {
+        return wiki_url
+    }
+
+    response := getWikipediaRedirectResponse(hostname, title)
+    resp_data, err := ioutil.ReadAll(response.Body)
+    if err != nil {
+        panic(err)
+    }
+
+    var data map[string]interface{}
+    if err = json.Unmarshal(resp_data, &data); err != nil {
+        log.Warn("wikipediaRealUrl: Unmarshaling json failed ", string(resp_data))
+        log.Fatal(err)
+    }
+    // Recovers from failing type assertions below; the original URL is
+    // returned in that case.
+    defer func() { recover() }()
+    query := (data["query"]).(map[string]interface{})
+    pages := query["pages"].(map[string]interface{})
+    for _, site_id := range pages {
+        key := site_id.(map[string]interface{})
+        canonical_url := key["canonicalurl"].(string)
+        if "" != canonical_url {
+            return canonical_url + fragment
+        }
+    }
+    return wiki_url
+}
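wikipediaRealUrl walks the MediaWiki query/info response for the canonical URL. A trimmed, hypothetical sample of the shape it expects (the page key and values are invented; only "canonicalurl" is actually read):

    // Hypothetical sample of the action=query&prop=info&inprop=url response;
    // the page keys under "pages" vary, which is why the code ranges over them.
    const sampleRedirectInfo = `{
        "query": {
            "pages": {
                "736": {
                    "pageid": 736,
                    "title": "Example page",
                    "canonicalurl": "https://en.wikipedia.org/wiki/Example_page"
                }
            }
        }
    }`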