| | | |
|---|---|---|
| author | horus | 2020-04-02 21:53:30 +0200 |
| committer | horus | 2020-04-02 21:53:30 +0200 |
| commit | 0b90b7a3b0f38f0babf4d788f4d7dd5e43253341 (patch) | |
| tree | a5492cf5246522a5dd0e201be3ae988ae7e6245c | |
| download | curious-crawler-0b90b7a3b0f38f0babf4d788f4d7dd5e43253341.tar.gz | |
Initial commit.
| Mode | File | Lines |
|---|---|---|
| -rw-r--r-- | categories.go | 145 |
| -rw-r--r-- | config.go | 104 |
| -rw-r--r-- | database.go | 425 |
| -rw-r--r-- | helper.go | 15 |
| -rw-r--r-- | init.go | 58 |
| -rw-r--r-- | main.go | 322 |
| -rw-r--r-- | struct.go | 20 |
| -rw-r--r-- | wikipedia.go | 272 |
8 files changed, 1361 insertions, 0 deletions
diff --git a/categories.go b/categories.go new file mode 100644 index 0000000..2ee3448 --- /dev/null +++ b/categories.go @@ -0,0 +1,145 @@ +package main + +import ( + "encoding/json" + "io/ioutil" + "strings" + + log "github.com/Sirupsen/logrus" + "github.com/gocolly/colly" +) + +func (app *App) crawlForCategories(wiki_url string) ([]string, bool) { + return app.queryWMLabs(wiki_url) +} + +func (app *App) queryWMLabs(wiki_url string) ([]string, bool) { + defer func() { recover() }() + + var categories []string + + title, hostname := getWikipediaTitle(wiki_url) + wm_url := ("https://xtools.wmflabs.org/api/page/assessments/" + hostname + "/" + title) + + if "" == title || "/" == title { + return []string{}, false + } + + response := getResponse(wm_url) + resp_data, err := ioutil.ReadAll(response.Body) + if err != nil { + log.Warnf("queryWMLabs: Reading response data failed for %s", wm_url) + panic(err) + } + + var data map[string]interface{} + if err = json.Unmarshal(resp_data, &data); err != nil { + log.Warnf("queryWMLabs: Decoding JSON failed for: %s", wm_url) + panic(err) + } + + for k, v := range data { + if "project" != k && "elapsed_time" != k { + wp := v.(map[string]interface{}) + for k2, v2 := range wp { + if k2 == "wikiprojects" { + list := v2.(map[string]interface{}) + for k3, _ := range list { + cat := normalizeCategory(k3) + if "" != cat { + categories = append(categories, cat) + } + } + } + } + } + } + + if len(categories) > 0 { + return categories, true + } + return categories, false +} + +func (app *App) crawlWMLabs(wiki_url string) (Category, bool) { + var err error + + //path := strings.TrimPrefix(u.EscapedPath(), "/wiki/") + title, hostname := getWikipediaTitle(wiki_url) + wm_url := ("https://xtools.wmflabs.org/articleinfo/" + hostname + "/" + title) + + if "" == title || "/" == title { + return Category{}, false + } + + var category Category + c := colly.NewCollector() + + c.OnHTML(".sort-entry--wikiproject", func(e *colly.HTMLElement) { + category.Name = strings.TrimSpace(e.Text) + category.Url = strings.TrimSpace(e.Attr("href")) + }) + + err = c.Visit(wm_url) + if err != nil { + log.Fatal(err) + } + + if category.Name == "" || category.Url == "" { + log.Warnf("title: %s WM URL: %s \tWiki Url: %s", title, wm_url, wiki_url) + } else { + log.Warnf("crawler: %+v", category) + } + return category, true +} + +func (app *App) saveAllCategories() { + rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL;") + if err != nil { + log.Fatal(err) + } + + for rows.Next() { + var article_id int + var wiki_url string + + err = rows.Scan(&article_id, &wiki_url) + if err != nil { + log.Fatal(err) + } + + //category, ok := app.crawlWMLabs(wiki_url) + categories, ok := app.queryWMLabs(wiki_url) + if ok { + app.saveCategory(article_id, categories) + } else { + log.Debug("saveAllCategories: No categories for " + wiki_url) + } + } +} + +func normalizeCategory(s string) string { + cat := strings.TrimSpace(s) + cat = strings.TrimSuffix(s, "task force") + cat = strings.TrimSuffix(s, "taskforce") + cat = strings.TrimSuffix(s, "Taskforce") + cat = strings.TrimSuffix(s, "Task Force") + + if strings.Contains(strings.ToLower(cat), "articles") { + return "" + } + /** + * The category "Wikipedia" is very useful, but other occurrences are not. 
+ */ + if strings.Contains(strings.ToLower(cat), "wikipedia") && cat != strings.ToLower("wikipedia") { + return "" + } + if strings.Contains(strings.ToLower(cat), "/wikiproject") { + cat = strings.ReplaceAll(cat, "WikiProject ", "") + cat = strings.ReplaceAll(cat, "wikiproject ", "") + cat = strings.ReplaceAll(cat, "Wikiproject ", "") + } + cat = strings.TrimSpace(cat) + + return cat +} diff --git a/config.go b/config.go new file mode 100644 index 0000000..14c75ee --- /dev/null +++ b/config.go @@ -0,0 +1,104 @@ +package main + +import ( + "os" + + log "github.com/Sirupsen/logrus" + "github.com/spf13/viper" +) + +type Config struct { + DBDriver string + DBDBName string + DBHost string + DBPort string + DBUser string + DBPassword string + DBOptions string + + UserAgent string + Delay int + IgnoreRobotsTXT bool + + BasicAuthUsername string + BasicAuthPassword string + + Debug bool // sets log level to debug +} + +// Parses the configuration and sets the configuration struct. +func (c *Config) parseConfig(configFile string) { + + viper.SetDefault("DB_Driver", "mysql") + viper.SetDefault("DB_DBName", "ghrss") + viper.SetDefault("DB_Host", "localhost") + viper.SetDefault("DB_Port", "3306") + + viper.SetDefault("Debug", false) + viper.SetDefault("Delay", 0) + + // needs some refactoring to truly respect robots.txt + viper.SetDefault("IgnoreRobotsTXT", true) + + viper.SetDefault("UserAgent", "colly - a friendly crawler :)") + + // Name of the configuration file + viper.SetConfigName("config") + + // Where to find the config file + if configFile == "" { + viper.AddConfigPath(".") + } else { + stat, err := os.Stat(configFile) + if os.IsNotExist(err) { + // provided config file does not exist, so we add the path instead + viper.AddConfigPath(configFile) + } else if err == nil && stat.IsDir() { + // adds the path to look for the config file + viper.AddConfigPath(configFile) + } else if err == nil { + // directly sets the config file + viper.SetConfigFile(configFile) + } else { + // if we are here something went wrong + log.Warn(err, "config.go: os.Stat("+configFile+") error") + // adding the path nonetheless because it's not hurting + viper.AddConfigPath(configFile) + } + } + + // Env variables need to be prefixed with "ALKOBOTE_" + viper.SetEnvPrefix("DISCUSS_") + + // Parses automatic the matching env variables + viper.AutomaticEnv() + + // Reads the config + err := viper.ReadInConfig() + if err != nil { + log.Fatal(err, "Config: Error parsing config file.") + } + log.Debug("Config: Config file used: " + viper.ConfigFileUsed()) + + c.setsConfig() +} + +// Actually sets the config struct +func (c *Config) setsConfig() { + c.DBDriver = viper.GetString("DB_Driver") + c.DBHost = viper.GetString("DB_Host") + c.DBPort = viper.GetString("DB_Port") + c.DBUser = viper.GetString("DB_User") + c.DBPassword = viper.GetString("DB_Password") + c.DBDBName = viper.GetString("DB_DBName") + c.DBOptions = viper.GetString("DB_Options") + + c.UserAgent = viper.GetString("UserAgent") + c.Delay = viper.GetInt("Delay") + c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT") + + c.BasicAuthUsername = viper.GetString("BasicAuthUsername") + c.BasicAuthPassword = viper.GetString("BasicAuthPassword") + + c.Debug = viper.GetBool("Debug") +} diff --git a/database.go b/database.go new file mode 100644 index 0000000..e40279e --- /dev/null +++ b/database.go @@ -0,0 +1,425 @@ +package main + +import ( + log "github.com/Sirupsen/logrus" + "regexp" + "strconv" + + "database/sql" + _ "github.com/go-sql-driver/mysql" +) + +func (app *App) 
saveStory(s Story) error { + match, err := regexp.MatchString("github.com($|/)", s.Url) + if err != nil { + log.Warn("Failed to parse and match regex") + return err + + } + if match { + /** + * Special handling for github stories. + */ + return app.saveCode(s) + } + + query := ` + INSERT IGNORE article ( + id, + created_at, + updated_at, + url, + title + ) VALUES ( + NULL, + ?, + ?, + ?, + ? + ); + ` + + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveStory: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, app.Now, s.Url, s.Title) + if err != nil { + log.Warn("saveStory: Statement execution failed") + return err + } + log.Debugf("saveStory: Successfull insert for item %d\n", s.Id) + + query = ` + INSERT IGNORE discussion ( + id, + created_at, + updated_at, + article_id, + title, + source, + item_id, + source_url, + posted_on, + comments, + upvotes + ) VALUES ( + NULL, + ?, + ?, + (SELECT id FROM article WHERE url = ?), + ?, + ?, + ?, + ?, + ?, + ?, + ? + ); + ` + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveStory: Preparing second query failed") + return err + } + defer stmt2.Close() + + _, err = stmt2.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score) + if err != nil { + log.Warn("saveStory: Statement execution failed") + return err + } + + return nil +} + +func (app *App) saveCode(s Story) error { + query := ` + INSERT IGNORE code( + id, + created_at, + updated_at, + url, + title, + source, + item_id, + source_url, + posted_on, + comments, + upvotes + ) VALUES ( + NULL, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ? + ); + ` + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveCode: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score) + if err != nil { + log.Warn("saveCode: Statement execution failed") + return err + } + + return nil +} + +func (app *App) updateDiscussion(story Story) error { + + query := ` + UPDATE discussion + set updated_at = ?, + comments = ?, + upvotes = ? + WHERE item_id = ?; + ` + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("updateDiscussion: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, story.Descendants, story.Score, story.Id) + if err != nil { + log.Warnf("updateDiscussion: Statement execution failed") + return err + } + log.Debugf("updateDiscussion: Successful update of %d with new Score: %d, Comments: %d\n", story.Id, story.Score, story.Descendants) + + return nil +} + +func (app *App) updateArticleUrl(id int, url string) error { + query := ` + UPDATE article + set updated_at = ?, + url = ? + WHERE id = ? + ` + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("updateArticleUrl: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(app.Now, url, id) + if err != nil { + log.Warnf("updateArticleUrl: Statement execution failed") + return err + } + log.Debugf("updateArticleUrl: Successful update new url: %s\n", url) + + return nil + +} + +func (app *App) wikipediaMergeArticles(id_to_delete int, correct_url string) error { + query := "SELECT id FROM discussion WHERE article_id = ?" 
+ row := app.DB.QueryRow(query, id_to_delete) + var disc_id int + err := row.Scan(&disc_id) + if err != nil { + log.Warnf("wikipediaMergeArticles: Query first row failed. id: %d url: %s", id_to_delete, correct_url) + return err + } + query = "SELECT id FROM article WHERE url = ?" + row = app.DB.QueryRow(query, correct_url) + var article_id int + err = row.Scan(&article_id) + if err != nil { + log.Warn("wikipediaMergeArticles: Query second row failed") + return err + } + + query = "UPDATE discussion SET article_id = ?, updated_at = ? WHERE id = ?;" + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("wikipediaMergeArticles: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(article_id, app.Now, disc_id) + if err != nil { + log.Warn("wikipediaMergeArticles: Update discussion failed") + return err + } + + query = "UPDATE article_category SET article_id = ? WHERE id = ?;" + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Warn("wikipediaMergeArticles: Preparing article_category query failed") + return err + } + defer stmt2.Close() + + _, err = stmt2.Exec(article_id, id_to_delete) + if err != nil { + log.Warn("wikipediaMergeArticles: Update article_category failed") + return err + } + + return nil +} + +func (app *App) deleteOrphanedArticles() error { + query := ` + DELETE a FROM + article AS a + LEFT JOIN + discussion AS d ON a.id = d.article_id + WHERE d.id IS NULL;` + _, err := app.DB.Exec(query) + if err != nil { + log.Warnf("deleteOrphanedArticles: Executing query failed: %s", err.Error()) + return err + } + + return nil +} + +func (app *App) saveCategory(article_id int, categories []string) { + + for _, category := range categories { + query := "SELECT id FROM category WHERE name = ?" + row := app.DB.QueryRow(query, category) + var category_id int + err := row.Scan(&category_id) + + if err != nil { + if err != sql.ErrNoRows { + log.Warn("saveCategory: Selecting category id failed") + log.Fatal(err) + } + } + + if err == sql.ErrNoRows { + query = ` + INSERT INTO category ( + id, + created_at, + updated_at, + name + ) VALUES ( + null, + ?, + ?, + ? + )` + + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Fatal(err) + } + defer stmt.Close() + + result, err := stmt.Exec(app.Now, app.Now, category) + if err != nil { + log.Fatal(err) + } + + category_id64, err := result.LastInsertId() + category_id = int(category_id64) + if err != nil { + log.Fatal(err) + } + } + + query = ` + INSERT IGNORE article_category ( + id, + article_id, + category_id + ) VALUES ( + null, + ?, + ? + ) + ` + + stmt2, err := app.DB.Prepare(query) + if err != nil { + log.Fatal(err) + } + _, err = stmt2.Exec(article_id, category_id) + if err != nil { + log.Fatal(err) + } + } +} + +func (app *App) updateWikipediaUrls() { + rows, err := app.DB.Query("SELECT DISTINCT id, url FROM article;") + if err != nil { + log.Fatal(err) + } + + for rows.Next() { + var wiki_url string + var article_id int + + err = rows.Scan(&article_id, &wiki_url) + if err != nil { + log.Fatal(err) + } + + real_url := wikipediaRealUrl(wiki_url) + if real_url != wiki_url && "" != real_url { + + /** + * Check if we already have the canonical url and merge if necessary. 
+ */ + row := app.DB.QueryRow("SELECT count(*) FROM article WHERE url = ?", real_url) + var count int + err = row.Scan(&count) + if err != nil { + log.Fatal(err) + } + if 0 < count { + err = app.wikipediaMergeArticles(article_id, real_url) + if err != nil { + log.Fatal(err) + } + continue + } + + stmt, err := app.DB.Prepare("UPDATE article SET url = ? WHERE id = ?") + if err != nil { + log.Warnf("updateWikipediaUrls: Preparing query failed for: (%d) %s", article_id, wiki_url) + log.Fatal(err) + } + defer stmt.Close() + + _, err = stmt.Exec(real_url, article_id) + if err != nil { + log.Warnf("updateWikipediaUrls: Executing statement failed for: (%d) %s", article_id, wiki_url) + log.Fatal(err) + } + log.Debugf("(%d) Updated from %s to %s", article_id, wiki_url, real_url) + } + } +} + +func (app *App) getArticleIdFromUrl(wiki_url string) int { + row := app.DB.QueryRow("SELECT id FROM article WHERE url = ?", wiki_url) + var article_id int + err := row.Scan(&article_id) + if err != nil { + log.Warnf("getArticleIdFromUrl: Query or scanning failed for: %s", wiki_url) + log.Fatal(err) + } + return article_id +} + +func (app *App) fixAllCategories() { + rows, err := app.DB.Query("SELECT id, name FROM category;") + if err != nil { + log.Fatal(err) + } + + for rows.Next() { + var category_id int + var category_name string + + err = rows.Scan(&category_id, &category_name) + if err != nil { + log.Fatal(err) + } + + category_fixed := normalizeCategory(category_name) + if category_fixed != category_name { + log.Warn(category_fixed) + stmt, err := app.DB.Prepare("UPDATE category SET name = ? WHERE id = ? AND name = ?") + if err != nil { + log.Warnf("fixAllCategories: Preparing query failed for: (%d) %s", category_id, category_fixed) + log.Fatal(err) + } + defer stmt.Close() + + _, err = stmt.Exec(category_fixed, category_id, category_name) + if err != nil { + log.Warnf("fixAllCategories: Exec stmt failed for: (%d) %s", category_id, category_fixed) + log.Fatal(err) + } + } + + } +} diff --git a/helper.go b/helper.go new file mode 100644 index 0000000..649c2d4 --- /dev/null +++ b/helper.go @@ -0,0 +1,15 @@ +package main + +import ( + "strings" +) + +func stripHNPrefix(title string) string { + title = strings.TrimPrefix(title, "Ask HN:") + title = strings.TrimPrefix(title, "Show HN:") + title = strings.TrimPrefix(title, "Tell HN:") + title = strings.TrimPrefix(title, "Experiment HN:") + title = strings.TrimPrefix(title, "Launch HN:") + + return strings.TrimSpace(title) +} @@ -0,0 +1,58 @@ +package main + +import ( + "errors" + "strings" + + log "github.com/Sirupsen/logrus" + flag "github.com/spf13/pflag" +) + +// global config, gets overwritten by main +var _conf Config + +func init() { + // overwrites unhelpful error message + flag.ErrHelp = errors.New("") + + // we need to parse the config because of log level setting + configFile := flag.StringP("config", "c", "", "path to config file") + debug := flag.BoolP("debug", "d", false, "set log level to \"Debug\"") + verbose := flag.BoolP("verbose", "v", false, "set log level to \"Debug\", same as --debug") + silent := flag.BoolP("silent", "s", false, "suppress output except warnings") + loglevel_f := flag.String("loglevel", "Warn", `set log level, can be "Warn", "Info" or "Debug"`) + user_agent_f := flag.StringP("user-agent", "u", "", "set user agent") + delay_f := flag.Int("delay", 0, "enable and set delay in seconds between crawls (default 0)") + ignore_robots_f := flag.Bool("ignore-robots-txt", true, "ignore robots.txt") + + flag.Parse() + loglevel := 
strings.ToLower(*loglevel_f) + + if *debug || *verbose || loglevel == "debug" { + log.SetLevel(log.DebugLevel) + } else if loglevel == "info" { + log.SetLevel(log.InfoLevel) + } else { + log.SetLevel(log.WarnLevel) + } + + if *silent { + log.SetLevel(log.WarnLevel) + } + + _conf.parseConfig(*configFile) + + if *user_agent_f != "" { + _conf.UserAgent = *user_agent_f + } + if *delay_f != 0 { + _conf.Delay = *delay_f + } + if !*ignore_robots_f { + _conf.IgnoreRobotsTXT = *ignore_robots_f + } + + if _conf.Debug && !*silent { + log.SetLevel(log.DebugLevel) + } +} @@ -0,0 +1,322 @@ +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "net/url" + "regexp" + "strconv" + "strings" + "time" + + "github.com/AnikHasibul/queue" + log "github.com/Sirupsen/logrus" + "github.com/jmoiron/sqlx" +) + +type App struct { + Config *Config + DB *sqlx.DB + Now time.Time +} + +func main() { + var err error + _own_conf := _conf + app := App{Config: &_own_conf} + _conf = Config{} + + app.Now = time.Now() + + log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)) + + app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions) + if err != nil { + log.Fatal(err, "Cannot connect to database") + } + + if err = app.DB.Ping(); err != nil { + log.Fatal(err, "No connection to database") + } + defer app.DB.Close() + + //app.fixAllCategories() + + app.deleteOrphanedArticles() + app.topStories() + app.wikipediaFixAllUrls() + app.deleteOrphanedArticles() + app.saveExcerpts() + //app.saveAllCategories() + app.updateAllDiscussions() + //app.walkDown() + + /** + * Resolve redirects on stored urls. 
+ */ + //app.updateWikipediaUrls() + //app.saveAllCategories() + //return +} + +func (app *App) walkDown() { + + var err error + + max_item := getMaxItem() + //max_item := 22554000 + //max_item := 22494596 + //max_item := 22354383 + //max_item := 18984000 + //max_item := 18732000 + //max_item := 16017000 + //max_item := 15494000 + //max_item := 15038031 + //max_item := 14450000 + + const maxRoutines = 20 + + q := queue.New(maxRoutines) + defer q.Close() + for i := max_item; i > 22600000; i-- { + q.Add() + go func(i int) { + defer q.Done() + + Story, ok := getStory(i) + if ok { + log.Infof("%+v\n", Story) + err = app.saveStory(Story) + if err != nil { + log.Fatal(err) + } + } + + /* + * Prints status update every 1000th entry + */ + if i%1000 == 0 { + log.Debugf("%s: Getting item %d\n", time.Now(), i) + } + }(i) + } + q.Wait() +} + +func getMaxItem() int { + response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json") + if err != nil { + panic(err) + } + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + max_item, err := strconv.Atoi(string(data)) + if err != nil { + panic(err) + } + + return max_item +} + +func (app *App) topStories() { + var err error + + data1 := strings.TrimSuffix(string(getTopStories()), "]") + data2 := strings.TrimPrefix(string(getBestStories()), "[") + + data1 = data1 + "," + data := data1 + data2 + + var story_ids []int + err = json.Unmarshal([]byte(data), &story_ids) + if err != nil { + log.Warn("topStories: Unmarshaling json failed") + panic(err) + } + + const maxRoutines = 20 + + q := queue.New(maxRoutines) + defer q.Close() + for _, id := range story_ids { + q.Add() + go func(id int) { + Story, ok := getStory(id) + defer q.Done() + if ok { + log.Infof("%+v\n", Story) + err = app.saveStory(Story) + if err != nil { + log.Fatal(err) + } + + categories, ok := app.crawlForCategories(Story.Url) + if ok { + article_id := app.getArticleIdFromUrl(Story.Url) + app.saveCategory(article_id, categories) + } + + } + }(id) + } + q.Wait() +} + +func getStory(id int) (Story, bool) { + Story := getDetail(id) + if Story.Dead || Story.Deleted { + return Story, false + } + if Story.Score < 10 && Story.Descendants < 10 { + return Story, false + } + /* + if (time.Now().Unix() - 3456000) > int64(Story.Time) { + } + */ + + Story.Title = stripHNPrefix(Story.Title) + + u, err := url.Parse(Story.Url) + if err != nil { + log.Warnf("getStory: Parsing URL failed: %s\n", err.Error()) + return Story, false + } + is_gh, err := regexp.MatchString("(github.com)($|/)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + is_wiki, err := regexp.MatchString("wikipedia.org($|/)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_gh { + return Story, true + } + if is_wiki { + Story.Url = wikipediaNormalizeUrl(Story.Url) + Story.Url = wikipediaRealUrl(Story.Url) + return Story, true + } + return Story, false +} + +func getResponse(url string) *http.Response { + var err error + var response *http.Response + + response, err = http.Get(url) + if err != nil { + for i := 0; i < 4; i++ { + log.Warn("getDetail: Got error connecting to firebase/wikipedia. 
Retry: " + strconv.Itoa(i)) + resp2, err2 := http.Get(url) + if err2 == nil { + return resp2 + } + } + panic(err) + } + return response +} + +func getBestResponse() *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/beststories.json" + return getResponse(_url) +} + +func getTopResponse() *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/topstories.json" + return getResponse(_url) +} + +func getWikipediaResponse(title string) *http.Response { + _url := "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=" + title + return getResponse(_url) +} + +func getWikipediaRedirectResponse(hostname, title string) *http.Response { + _url := "https://" + hostname + "/w/api.php?action=query&prop=info&format=json&redirects=1&inprop=url&titles=" + title + return getResponse(_url) +} + +func getStoryResponse(item_id string) *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json" + return getResponse(_url) +} + +func getDetail(id int) Story { + response := getStoryResponse(strconv.Itoa(id)) + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + var story Story + err = json.Unmarshal(data, &story) + if err != nil { + log.Warn("getDetail: Unmarshaling json failed") + panic(err) + } + return story +} + +func getTopStories() []byte { + response := getTopResponse() + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + return data +} + +func getBestStories() []byte { + response := getBestResponse() + + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + return data +} + +func (app *App) updateAllDiscussions() { + const maxRoutines = 20 + var item_ids []int + + app.DB.Select(&item_ids, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) order by posted_on") + + q := queue.New(maxRoutines) + defer q.Close() + + for _, item_id := range item_ids { + q.Add() + go func(item_id int) { + defer q.Done() + Story, ok := getStory(item_id) + if !ok { + /** + * Check if we got a network error or a dead story. 
+ */ + if 0 == Story.Id { + log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", item_id) + } else if Story.Descendants > 10 || Story.Score > 10 { + log.Warnf("%+v\n", Story) + } + return + } + err := app.updateDiscussion(Story) + if err != nil { + log.Warn(err) + return + } + }(item_id) + } + q.Wait() +} diff --git a/struct.go b/struct.go new file mode 100644 index 0000000..f5b8f7c --- /dev/null +++ b/struct.go @@ -0,0 +1,20 @@ +package main + +type Story struct { + Id int + Deleted bool + Type string + Time int + Text string + Dead bool + Url string + Score int + Title string + Descendants int +} + +type Category struct { + ID int + Name string + Url string +} diff --git a/wikipedia.go b/wikipedia.go new file mode 100644 index 0000000..338881a --- /dev/null +++ b/wikipedia.go @@ -0,0 +1,272 @@ +package main + +import ( + "encoding/json" + "regexp" + "strings" + //"strconv" + "io/ioutil" + "net/url" + + log "github.com/Sirupsen/logrus" + "github.com/gocolly/colly" +) + +func (app *App) crawlWikipedia(url string) { + c := colly.NewCollector() + + c.OnHTML("#mw-normal-catlinks", func(e *colly.HTMLElement) { + e.ForEach("ul > li > a", func(i int, e *colly.HTMLElement) { + log.Debug("Text: " + e.Text + " Title: " + e.Attr("title") + " Url: " + e.Attr("href")) + }) + }) + c.OnHTML("#firstHeading", func(e *colly.HTMLElement) { + log.Debug("Title: " + e.Text) + }) + + err := c.Visit(url) + if err != nil { + log.Fatal(err) + } +} + +func (app *App) getAllArticles() { + rows, err := app.DB.Query("SELECT DISTINCT article_id FROM discussion;") + if err != nil { + log.Fatal(err) + } + + for rows.Next() { + var article_id int + + err = rows.Scan(&article_id) + if err != nil { + log.Fatal(err) + } + + log.Println(article_id) + } +} + +func (app *App) wikipediaFixAllUrls() { + rows, err := app.DB.Query("SELECT id, url FROM article WHERE (url LIKE '%m.wiki%' OR url like 'http:%');") + if err != nil { + log.Fatal(err) + } + + for rows.Next() { + var id int + var url string + + err = rows.Scan(&id, &url) + if err != nil { + log.Fatal(err) + } + + //log.Debug(id, url) + + url = wikipediaNormalizeUrl(url) + + row := app.DB.QueryRow("SELECT count(*) FROM article WHERE url = ?", url) + var count int + err = row.Scan(&count) + if err != nil { + log.Fatal(err) + } + if 0 < count { + err = app.wikipediaMergeArticles(id, url) + if err != nil { + log.Fatal(err) + } + continue + } + err = app.updateArticleUrl(id, url) + if err != nil { + log.Fatal(err) + } + + //log.Debug("UPDATE article SET url = " + url + " WHERE id = " + strconv.Itoa(id)) + } +} + +func wikipediaNormalizeUrl(url string) string { + match, err := regexp.MatchString("^http://", url) + if err != nil { + log.Fatal(err) + } + if match { + r := regexp.MustCompile("^http://") + url = r.ReplaceAllString(url, "https://") + } + + match, err = regexp.MatchString("m.wikipedia.org", url) + if err != nil { + log.Fatal(err) + } + if match { + r := regexp.MustCompile("m.wikipedia.org") + url = r.ReplaceAllString(url, "wikipedia.org") + } + return url +} + +func (app *App) _changeTitle(id_to_delete int, correct_url string) { + var new_title string + + query := ` + SELECT * FROM + (SELECT title as old_title FROM article WHERE id = ?) as t1 + JOIN + (SELECT title as cur_title FROM article WHERE url = ?) 
as t2 + ; + ` + + row := app.DB.QueryRow(query, id_to_delete, correct_url) + var old_title string + var cur_title string + err := row.Scan(&old_title, &cur_title) + if err != nil { + log.Fatal(err) + } + + old_title = stripHNPrefix(old_title) + cur_title = stripHNPrefix(cur_title) + + if len(old_title) > len(cur_title) { + new_title = old_title + } else { + new_title = cur_title + } + + log.Printf("new_title: %s, old_title: %s, cur_title: %s \n", new_title, old_title, cur_title) +} + +func getWikipediaExcerpt(title string) string { + var err error + + response := getWikipediaResponse(title) + resp_data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + var data map[string]interface{} + if err = json.Unmarshal(resp_data, &data); err != nil { + log.Warn("getWikipediaExcerpt: Unmarshaling json failed") + log.Fatal(err) + } + defer func() { recover() }() + query := (data["query"]).(map[string]interface{}) + pages := query["pages"].(map[string]interface{}) + for _, site_id := range pages { + extract := site_id.(map[string]interface{}) + excerpt := (extract["extract"]).(string) + excerpt = strings.TrimSpace(excerpt) + if "" != excerpt { + return excerpt + } + } + return "" +} + +func getWikipediaTitle(full_url string) (string, string) { + u, err := url.Parse(full_url) + if err != nil { + log.Fatal(err) + } + var title string + title = u.Query().Get("title") + if "" == title { + title = strings.TrimPrefix(u.EscapedPath(), "/wiki/") + } + title = strings.TrimSpace(title) + return title, u.Hostname() +} + +func (app *App) saveExcerpts() error { + query := "SELECT id, url FROM article WHERE excerpt_html IS NULL;" + + rows, err := app.DB.Query(query) + if err != nil { + return err + } + + for rows.Next() { + var id int + var url string + + err = rows.Scan(&id, &url) + if err != nil { + log.Fatal(err) + } + + title, _ := getWikipediaTitle(url) + excerpt := getWikipediaExcerpt(title) + + query = "UPDATE article SET excerpt_html = ? WHERE id = ?" + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveExcerpts: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(excerpt, id) + if err != nil { + log.Warn("saveExcerpts: Executing stmt failed") + return err + } + + } + return nil +} + +func wikipediaRealUrl(wiki_url string) string { + /** + * We don't change urls with parameters, because we would loose the context. + */ + if strings.Contains(wiki_url, "&") { + return wiki_url + } + + var err error + var fragment string + + u, err := url.Parse(wiki_url) + if err != nil { + log.Fatal(err) + } + + if u.Fragment != "" { + fragment = "#" + u.Fragment + } + + title, hostname := getWikipediaTitle(wiki_url) + + if title == "/" || title == "" { + return wiki_url + } + + response := getWikipediaRedirectResponse(hostname, title) + resp_data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + var data map[string]interface{} + if err = json.Unmarshal(resp_data, &data); err != nil { + log.Warn("wikipediaRealUrl: Unmarshaling json failed ", string(resp_data)) + log.Fatal(err) + } + defer func() { recover() }() + query := (data["query"]).(map[string]interface{}) + pages := query["pages"].(map[string]interface{}) + for _, site_id := range pages { + key := site_id.(map[string]interface{}) + canonical_url := key["canonicalurl"].(string) + if "" != canonical_url { + return canonical_url + fragment + } + } + return wiki_url +} |
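The committed config.go loads its settings through viper with SetConfigName("config"), looking in the working directory or at a path passed via --config/-c. Below is a minimal sketch of what such a file could look like, assuming YAML (viper also accepts JSON or TOML): the key names are the ones read in setsConfig(), while the format, the credentials, and the DB_Options value are illustrative placeholders that are not part of this commit.

```yaml
# Hypothetical config.yaml — key names match the viper lookups in config.go;
# the YAML format and all values below are assumptions, not shipped with this commit.
DB_Driver: mysql                 # default set in parseConfig
DB_DBName: ghrss                 # default set in parseConfig
DB_Host: localhost
DB_Port: "3306"
DB_User: crawler                 # placeholder
DB_Password: secret              # placeholder
DB_Options: parseTime=true       # placeholder; appended to the DSN after "?"
UserAgent: "colly - a friendly crawler :)"
Delay: 0                         # seconds between crawls
IgnoreRobotsTXT: true
Debug: false
```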
