diff options
Diffstat (limited to 'main.go')
| -rw-r--r-- | main.go | 322 |
1 files changed, 322 insertions, 0 deletions
@@ -0,0 +1,322 @@ +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "net/url" + "regexp" + "strconv" + "strings" + "time" + + "github.com/AnikHasibul/queue" + log "github.com/Sirupsen/logrus" + "github.com/jmoiron/sqlx" +) + +type App struct { + Config *Config + DB *sqlx.DB + Now time.Time +} + +func main() { + var err error + _own_conf := _conf + app := App{Config: &_own_conf} + _conf = Config{} + + app.Now = time.Now() + + log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)) + + app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions) + if err != nil { + log.Fatal(err, "Cannot connect to database") + } + + if err = app.DB.Ping(); err != nil { + log.Fatal(err, "No connection to database") + } + defer app.DB.Close() + + //app.fixAllCategories() + + app.deleteOrphanedArticles() + app.topStories() + app.wikipediaFixAllUrls() + app.deleteOrphanedArticles() + app.saveExcerpts() + //app.saveAllCategories() + app.updateAllDiscussions() + //app.walkDown() + + /** + * Resolve redirects on stored urls. + */ + //app.updateWikipediaUrls() + //app.saveAllCategories() + //return +} + +func (app *App) walkDown() { + + var err error + + max_item := getMaxItem() + //max_item := 22554000 + //max_item := 22494596 + //max_item := 22354383 + //max_item := 18984000 + //max_item := 18732000 + //max_item := 16017000 + //max_item := 15494000 + //max_item := 15038031 + //max_item := 14450000 + + const maxRoutines = 20 + + q := queue.New(maxRoutines) + defer q.Close() + for i := max_item; i > 22600000; i-- { + q.Add() + go func(i int) { + defer q.Done() + + Story, ok := getStory(i) + if ok { + log.Infof("%+v\n", Story) + err = app.saveStory(Story) + if err != nil { + log.Fatal(err) + } + } + + /* + * Prints status update every 1000th entry + */ + if i%1000 == 0 { + log.Debugf("%s: Getting item %d\n", time.Now(), i) + } + }(i) + } + q.Wait() +} + +func getMaxItem() int { + response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json") + if err != nil { + panic(err) + } + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + max_item, err := strconv.Atoi(string(data)) + if err != nil { + panic(err) + } + + return max_item +} + +func (app *App) topStories() { + var err error + + data1 := strings.TrimSuffix(string(getTopStories()), "]") + data2 := strings.TrimPrefix(string(getBestStories()), "[") + + data1 = data1 + "," + data := data1 + data2 + + var story_ids []int + err = json.Unmarshal([]byte(data), &story_ids) + if err != nil { + log.Warn("topStories: Unmarshaling json failed") + panic(err) + } + + const maxRoutines = 20 + + q := queue.New(maxRoutines) + defer q.Close() + for _, id := range story_ids { + q.Add() + go func(id int) { + Story, ok := getStory(id) + defer q.Done() + if ok { + log.Infof("%+v\n", Story) + err = app.saveStory(Story) + if err != nil { + log.Fatal(err) + } + + categories, ok := app.crawlForCategories(Story.Url) + if ok { + article_id := app.getArticleIdFromUrl(Story.Url) + app.saveCategory(article_id, categories) + } + + } + }(id) + } + q.Wait() +} + +func getStory(id int) (Story, bool) { + Story := getDetail(id) + if Story.Dead || Story.Deleted { + return Story, false + } + if Story.Score < 10 && Story.Descendants < 10 { + return Story, false + } + /* + if (time.Now().Unix() - 3456000) > int64(Story.Time) { + } + */ + + Story.Title = stripHNPrefix(Story.Title) + + u, err := url.Parse(Story.Url) + if err != nil { + log.Warnf("getStory: Parsing URL failed: %s\n", err.Error()) + return Story, false + } + is_gh, err := regexp.MatchString("(github.com)($|/)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + is_wiki, err := regexp.MatchString("wikipedia.org($|/)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_gh { + return Story, true + } + if is_wiki { + Story.Url = wikipediaNormalizeUrl(Story.Url) + Story.Url = wikipediaRealUrl(Story.Url) + return Story, true + } + return Story, false +} + +func getResponse(url string) *http.Response { + var err error + var response *http.Response + + response, err = http.Get(url) + if err != nil { + for i := 0; i < 4; i++ { + log.Warn("getDetail: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i)) + resp2, err2 := http.Get(url) + if err2 == nil { + return resp2 + } + } + panic(err) + } + return response +} + +func getBestResponse() *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/beststories.json" + return getResponse(_url) +} + +func getTopResponse() *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/topstories.json" + return getResponse(_url) +} + +func getWikipediaResponse(title string) *http.Response { + _url := "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=" + title + return getResponse(_url) +} + +func getWikipediaRedirectResponse(hostname, title string) *http.Response { + _url := "https://" + hostname + "/w/api.php?action=query&prop=info&format=json&redirects=1&inprop=url&titles=" + title + return getResponse(_url) +} + +func getStoryResponse(item_id string) *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json" + return getResponse(_url) +} + +func getDetail(id int) Story { + response := getStoryResponse(strconv.Itoa(id)) + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + var story Story + err = json.Unmarshal(data, &story) + if err != nil { + log.Warn("getDetail: Unmarshaling json failed") + panic(err) + } + return story +} + +func getTopStories() []byte { + response := getTopResponse() + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + return data +} + +func getBestStories() []byte { + response := getBestResponse() + + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + return data +} + +func (app *App) updateAllDiscussions() { + const maxRoutines = 20 + var item_ids []int + + app.DB.Select(&item_ids, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) order by posted_on") + + q := queue.New(maxRoutines) + defer q.Close() + + for _, item_id := range item_ids { + q.Add() + go func(item_id int) { + defer q.Done() + Story, ok := getStory(item_id) + if !ok { + /** + * Check if we got a network error or a dead story. + */ + if 0 == Story.Id { + log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", item_id) + } else if Story.Descendants > 10 || Story.Score > 10 { + log.Warnf("%+v\n", Story) + } + return + } + err := app.updateDiscussion(Story) + if err != nil { + log.Warn(err) + return + } + }(item_id) + } + q.Wait() +} |
