package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/anikhasibul/queue"
	"github.com/jmoiron/sqlx"
	log "github.com/sirupsen/logrus"
)

// App bundles the configuration and database handle shared by all crawler steps.
type App struct {
	Config *Config
	DB     *sqlx.DB
	Now    time.Time
}

func main() {
	var err error
	_own_conf := _conf
	app := App{Config: &_own_conf}
	_conf = Config{}
	app.Now = time.Now()

	log.Debugf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`,
		app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)
	app.DB, err = sqlx.Connect(app.Config.DBDriver,
		app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions)
	if err != nil {
		log.Fatal("Cannot connect to database: ", err)
	}
	if err = app.DB.Ping(); err != nil {
		log.Fatal("No connection to database: ", err)
	}
	defer app.DB.Close()

	//app.fixAllCategories()
	//return
	app.deleteOrphanedArticles()
	app.topStories()
	app.wikipediaFixAllUrls()
	app.deleteOrphanedArticles()
	app.saveExcerpts()
	//app.saveAllCategories()
	app.updateAllDiscussions()
	//app.walkDown()

	/**
	 * Resolve redirects on stored urls.
	 */
	//app.updateWikipediaUrls()
	//app.saveAllCategories()
	//return
}

// walkDown crawls items backwards from the current maximum item id.
func (app *App) walkDown() {
	max_item := getMaxItem()
	//max_item := 22554000
	//max_item := 22494596
	//max_item := 22354383
	//max_item := 18984000
	//max_item := 18732000
	//max_item := 16017000
	//max_item := 15494000
	//max_item := 15038031
	//max_item := 14450000

	const maxRoutines = 20
	q := queue.New(maxRoutines)
	defer q.Close()

	for i := max_item; i > 22600000; i-- {
		q.Add()
		go func(i int) {
			defer q.Done()
			story, ok := getStory(i)
			if ok {
				log.Infof("%+v\n", story)
				if err := app.saveStory(story); err != nil {
					log.Fatal(err)
				}
			}
			/*
			 * Print a status update for every 1000th entry.
			 */
			if i%1000 == 0 {
				log.Debugf("%s: Getting item %d\n", time.Now(), i)
			}
		}(i)
	}
	q.Wait()
}

// getMaxItem returns the current largest item id known to the Hacker News API.
func getMaxItem() int {
	response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json")
	if err != nil {
		panic(err)
	}
	defer response.Body.Close()

	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	max_item, err := strconv.Atoi(string(data))
	if err != nil {
		panic(err)
	}
	return max_item
}

// topStories fetches the current top and best stories and stores the ones worth keeping.
func (app *App) topStories() {
	// Join the two JSON arrays (e.g. "[1,2]" and "[3,4]") into a single array "[1,2,3,4]".
	data1 := strings.TrimSuffix(string(getTopStories()), "]")
	data2 := strings.TrimPrefix(string(getBestStories()), "[")
	data1 = data1 + ","
	data := data1 + data2

	var story_ids []int
	if err := json.Unmarshal([]byte(data), &story_ids); err != nil {
		log.Warn("topStories: Unmarshaling json failed")
		panic(err)
	}

	const maxRoutines = 20
	q := queue.New(maxRoutines)
	defer q.Close()

	for _, id := range story_ids {
		q.Add()
		go func(id int) {
			defer q.Done()
			story, ok := getStory(id)
			if !ok {
				return
			}
			log.Infof("%+v\n", story)
			if err := app.saveStory(story); err != nil {
				log.Fatal(err)
			}
			log.Debug("topStories: crawling for Categories")
			categories, ok := app.crawlForCategories(story.Url)
			if ok {
				article_id := app.getArticleIdFromUrl(story.Url)
				app.saveCategory(article_id, categories)
			} else {
				log.Warn("topStories: crawling for Categories: not ok")
			}
		}(id)
	}
	q.Wait()
}

// getStory fetches an item and reports whether it is a live GitHub or Wikipedia
// story with enough points or comments to be worth keeping.
func getStory(id int) (Story, bool) {
	story := getDetail(id)
	if story.Dead || story.Deleted {
		return story, false
	}
	if story.Score < 10 && story.Descendants < 10 {
		return story, false
	}
	/*
		if (time.Now().Unix() - 3456000) > int64(story.Time) {
		}
	*/
	story.Title = stripHNPrefix(story.Title)

	u, err := url.Parse(story.Url)
	if err != nil {
		log.Warnf("getStory: Parsing URL failed: %s\n", err.Error())
		return story, false
	}
	is_gh, err := regexp.MatchString(`(^|\.)github\.com$`, u.Host)
	if err != nil {
		log.Errorf("Failed to parse and match regex: %s\n", err.Error())
		return story, false
	}
	is_wiki, err := regexp.MatchString(`(^|\.)wikipedia\.org$`, u.Host)
	if err != nil {
		log.Errorf("Failed to parse and match regex: %s\n", err.Error())
		return story, false
	}
	if is_gh {
		return story, true
	}
	if is_wiki {
		story.Url = wikipediaNormalizeUrl(story.Url)
		story.Url = wikipediaRealUrl(story.Url)
		return story, true
	}
	return story, false
}

// getResponse performs an HTTP GET and retries a few times on network errors
// before giving up.
func getResponse(rawURL string) *http.Response {
	response, err := http.Get(rawURL)
	if err != nil {
		for i := 0; i < 4; i++ {
			if i == 0 {
				log.Debug("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
			} else {
				log.Warn("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
			}
			resp2, err2 := http.Get(rawURL)
			if err2 == nil {
				return resp2
			}
		}
		panic(err)
	}
	return response
}

func getBestResponse() *http.Response {
	_url := "https://hacker-news.firebaseio.com/v0/beststories.json"
	return getResponse(_url)
}

func getTopResponse() *http.Response {
	_url := "https://hacker-news.firebaseio.com/v0/topstories.json"
	return getResponse(_url)
}

func getWikipediaResponse(title string) *http.Response {
	_url := "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=" + title
	return getResponse(_url)
}

func getWikipediaRedirectResponse(hostname, title string) *http.Response {
	_url := "https://" + hostname + "/w/api.php?action=query&prop=info&format=json&redirects=1&inprop=url&titles=" + title
	return getResponse(_url)
}

func getStoryResponse(item_id string) *http.Response {
	_url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json"
	return getResponse(_url)
}

// getDetail fetches a single Hacker News item and unmarshals it into a Story.
func getDetail(id int) Story {
	response := getStoryResponse(strconv.Itoa(id))
	defer response.Body.Close()

	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	var story Story
	if err := json.Unmarshal(data, &story); err != nil {
		log.Warn("getDetail: Unmarshaling json failed")
		panic(err)
	}
	return story
}

func getTopStories() []byte {
	response := getTopResponse()
	defer response.Body.Close()

	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	return data
}

func getBestStories() []byte {
	response := getBestResponse()
	defer response.Body.Close()

	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	return data
}

// updateAllDiscussions re-fetches every discussion posted within roughly the
// last 40 days (3456000 seconds) and updates it in the database.
func (app *App) updateAllDiscussions() {
	const maxRoutines = 20

	var item_ids []int
	err := app.DB.Select(&item_ids, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) ORDER BY posted_on")
	if err != nil {
		log.Error("updateAllDiscussions: ", err)
		return
	}

	q := queue.New(maxRoutines)
	defer q.Close()

	for _, item_id := range item_ids {
		q.Add()
		go func(item_id int) {
			defer q.Done()
			story, ok := getStory(item_id)
			if !ok {
				/**
				 * Check if we got a network error or a dead story.
				 */
				if story.Id == 0 {
					log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", item_id)
				} else if story.Descendants > 10 || story.Score > 10 {
					log.Infof(`
updateAllDiscussions: There is a bug. Can't update discussion with id %d.
NOTE: If this is happening again, probably the url was changed from Wikipedia to a different source.
%+v
`, item_id, story)
				}
				return
			}
			if err := app.updateDiscussion(story); err != nil {
				log.Warn(err)
			}
		}(item_id)
	}
	q.Wait()
}
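
// The Config and Story types, the package-level _conf value, and the helpers
// used above (saveStory, updateDiscussion, crawlForCategories, saveCategory,
// getArticleIdFromUrl, saveExcerpts, deleteOrphanedArticles, wikipediaFixAllUrls,
// wikipediaNormalizeUrl, wikipediaRealUrl, stripHNPrefix, ...) appear to be
// defined in other files of this package. As a rough, non-authoritative sketch
// inferred only from how their fields are used in this file, the two types
// would need to look something like:
//
//	type Config struct {
//		DBDriver, DBUser, DBPassword, DBHost, DBPort, DBDBName, DBOptions string
//	}
//
//	type Story struct {
//		Id, Score, Descendants, Time int
//		Title, Url                   string
//		Dead, Deleted                bool
//	}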