package main

import (
	"encoding/json"
	"fmt"
	"html"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/anikhasibul/queue"
	"github.com/jmoiron/sqlx"
	log "github.com/sirupsen/logrus"
)

// Link-classification patterns, compiled once at startup instead of on
// every call to getStory (which previously recompiled them per item and
// had to handle an impossible compile error each time). Dots are escaped
// so e.g. "youtubeXcom" no longer matches.
var (
	videoHostRe  = regexp.MustCompile(`(?i)(youtube\.com)|(youtu\.be)|(vimeo\.com)`)
	movieHostRe  = regexp.MustCompile(`(?i)(imdb\.com)|(rottentomatoes\.com)|(metacritic\.com)`)
	videoTitleRe = regexp.MustCompile(`(?i)(\(video\))|(\[video\])`)
)

// httpClient is shared by all requests and enforces a timeout so that a
// stalled connection cannot hang a worker goroutine forever.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// App carries the application-wide dependencies: configuration, the
// database handle, and the process start time.
type App struct {
	Config *Config
	DB     *sqlx.DB
	Now    time.Time
}

func main() {
	var err error

	// Take a private copy of the global config and blank the global so
	// nothing else accidentally reads credentials from it afterwards.
	ownConf := _conf
	app := App{Config: &ownConf}
	_conf = Config{}
	app.Now = time.Now()

	log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`,
		app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser,
		app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions))

	dsn := fmt.Sprintf("%s:%s@tcp(%s:%s)/%s?%s",
		app.Config.DBUser, app.Config.DBPassword,
		app.Config.DBHost, app.Config.DBPort,
		app.Config.DBDBName, app.Config.DBOptions)
	app.DB, err = sqlx.Connect(app.Config.DBDriver, dsn)
	if err != nil {
		log.Fatal(err, "Cannot connect to database")
	}
	if err = app.DB.Ping(); err != nil {
		log.Fatal(err, "No connection to database")
	}
	defer app.DB.Close()

	app.walkDown()
}

// walkDown crawls HN items from the current maximum item id down to a
// floor (0, or the highest already-stored item when the config requests
// updates only), saving every story that carries interesting links.
func (app *App) walkDown() {
	maxItem := getMaxItem()
	minItem := 0

	var newMax syncMaxItem

	if app.Config.OnlyUpdateStories {
		minItem = app.getMaxStoredItem()
	}
	log.Infof("walkDown: max_item: %d; min_item: %d\n", maxItem, minItem)

	// Bound the number of concurrent fetches.
	const maxRoutines = 400
	q := queue.New(maxRoutines)
	defer q.Close()

	for i := maxItem; i > minItem; i-- {
		q.Add()
		go func(i int, newMax *syncMaxItem) {
			defer q.Done()
			story, ok := getStory(i)
			if ok {
				if err := app.saveStory(story); err != nil {
					log.Fatal(err)
				}
				// Track the highest id that was actually saved.
				newMax.mu.Lock()
				if story.Id > newMax.max_item {
					newMax.max_item = story.Id
				}
				newMax.mu.Unlock()
			}
			// Progress output on every 1000th item.
			if i%1000 == 0 {
				log.Infof("%s: Getting item %d\n", time.Now(), i)
			}
		}(i, &newMax)
	}
	q.Wait()

	// Persist the new high-water mark: create it on a full crawl,
	// update it on an incremental one (when anything new was saved).
	if minItem == 0 {
		if err := app.createMaxStoredItem(newMax.max_item); err != nil {
			log.Fatal(err)
		}
	} else if minItem != newMax.max_item && newMax.max_item != 0 {
		if err := app.updateNewMaxStoredItem(newMax.max_item); err != nil {
			log.Fatal(err)
		}
	}
}

// getMaxItem returns the largest item id currently known to the HN API.
func getMaxItem() int {
	response, err := httpClient.Get("https://hacker-news.firebaseio.com/v0/maxitem.json")
	if err != nil {
		panic(err)
	}
	defer response.Body.Close()
	data, err := io.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	maxItem, err := strconv.Atoi(string(data))
	if err != nil {
		panic(err)
	}
	return maxItem
}

// topStories fetches the current "top" and "best" story id lists, merges
// them into one JSON array, and saves every story that qualifies.
func (app *App) topStories() {
	// Join the two JSON arrays textually: strip the closing "]" from
	// one, the opening "[" from the other, and splice with a comma.
	data1 := strings.TrimSuffix(string(getTopStories()), "]") + ","
	data2 := strings.TrimPrefix(string(getBestStories()), "[")
	data := data1 + data2

	var storyIDs []int
	if err := json.Unmarshal([]byte(data), &storyIDs); err != nil {
		log.Warn("topStories: Unmarshaling json failed")
		panic(err)
	}

	const maxRoutines = 20
	q := queue.New(maxRoutines)
	defer q.Close()
	for _, id := range storyIDs {
		q.Add()
		go func(id int) {
			defer q.Done()
			story, ok := getStory(id)
			if ok {
				log.Infof("%+v\n", story)
				// Goroutine-local err: the original wrote to a shared
				// outer `err`, which is a data race.
				if err := app.saveStory(story); err != nil {
					log.Fatal(err)
				}
			}
		}(id)
	}
	q.Wait()
}

// addLink appends rawURL (normalized) to story.Links with the given
// field tag, unless its normalized form was already collected. Keying
// the duplicate set on the normalized URL fixes the original behavior
// of sometimes keying on the raw URL, which let one link be stored
// twice under two spellings.
func addLink(story *Story, dupes map[string]bool, rawURL string, field int, matchMsg string) {
	normalized := normalizeUrl(rawURL)
	if dupes[normalized] {
		return
	}
	var link Link
	link.Url = normalized
	link.Field = field
	story.Links = append(story.Links, link)
	log.Info(matchMsg)
	log.Infof("%+v\n", *story)
	dupes[normalized] = true
}

// getStory fetches item id and decides whether it is worth storing.
// It returns the story plus true only when at least one video/movie
// link was found in its url, title, or text.
func getStory(id int) (Story, bool) {
	story := getDetail(id)
	if story.Dead {
		return story, false
	}
	// Skip low-engagement stories.
	// NOTE(review): the HN API reports the type lowercase ("story") but
	// this compares against "Story" — confirm which casing getDetail
	// yields; as written this filter may never trigger.
	if story.Type == "Story" && story.Score < 10 && story.Descendants < 10 {
		return story, false
	}

	duplicates := make(map[string]bool)

	story.Title = stripHNPrefix(story.Title)

	u, err := url.Parse(story.Url)
	if err != nil {
		log.Warnf("getStory: Parsing URL failed: %s\n", err.Error())
		return story, false
	}

	// Story url points at a video host.
	if videoHostRe.MatchString(u.Host) {
		addLink(&story, duplicates, story.Url, 2, "match youtube host")
	}
	// Story url points at a movie platform. (The original ran this
	// exact host check twice; with normalized dedup, once is enough.)
	if movieHostRe.MatchString(u.Host) {
		addLink(&story, duplicates, story.Url, 1, "match moview platform url")
	}
	// "(Video)" / "[video]" marker in the title.
	if videoTitleRe.MatchString(story.Title) {
		addLink(&story, duplicates, story.Url, 2, "match video title")
	}

	// Scan the story text for anchors, first removing <code> blocks —
	// a <code> comment once broke the parser:
	// https://news.ycombinator.com/item?id=27351340
	tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader(story.Text))
	if err != nil {
		log.Errorf("Failed to parse html: %s\n", err.Error())
		return story, false
	}
	sel := tmpdoc.Find("html")
	sel.Find("code").Each(func(i int, s *goquery.Selection) {
		RemoveNode(sel.Get(0), s.Get(0))
	})
	tmphtml, err := sel.Html()
	if err != nil {
		log.Warn("Failed to generate html from selection: ", err.Error())
	}
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(tmphtml))
	if err != nil {
		log.Errorf("Failed to parse html: %s\n", err.Error())
		return story, false
	}
	doc.Find("a").Each(func(i int, s *goquery.Selection) {
		href, ok := s.Attr("href")
		if !ok {
			return
		}
		// Video / movie links inside the text body.
		if videoHostRe.MatchString(href) {
			addLink(&story, duplicates, href, 2, "match youtube text")
		}
		if movieHostRe.MatchString(href) {
			addLink(&story, duplicates, href, 1, "match moview platform text")
		}
	})

	return story, len(story.Links) > 0
}

// getResponse GETs url, retrying up to four more times before giving up
// with a panic. A short pause between attempts gives transient network
// failures a chance to clear.
func getResponse(url string) *http.Response {
	response, err := httpClient.Get(url)
	if err == nil {
		return response
	}
	for i := 0; i < 4; i++ {
		if i == 0 {
			log.Debug("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
		} else {
			log.Warn("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
		}
		resp2, err2 := httpClient.Get(url)
		if err2 == nil {
			return resp2
		}
		time.Sleep(time.Second)
	}
	panic(err)
}

// getBestResponse fetches the "best stories" id list.
func getBestResponse() *http.Response {
	return getResponse("https://hacker-news.firebaseio.com/v0/beststories.json")
}

// getTopResponse fetches the "top stories" id list.
func getTopResponse() *http.Response {
	return getResponse("https://hacker-news.firebaseio.com/v0/topstories.json")
}

// getStoryResponse fetches one item by its id string.
func getStoryResponse(itemID string) *http.Response {
	return getResponse("https://hacker-news.firebaseio.com/v0/item/" + itemID + ".json")
}

// getDetail fetches and decodes a single HN item. The HTML-escaped text
// field is unescaped before returning.
func getDetail(id int) Story {
	response := getStoryResponse(strconv.Itoa(id))
	defer response.Body.Close()
	data, err := io.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	var story Story
	if err := json.Unmarshal(data, &story); err != nil {
		log.Warn("getDetail: Unmarshaling json failed ", data)
		panic(err)
	}
	story.Text = html.UnescapeString(story.Text)
	log.Tracef("StoryID: %d\n", story.Id)
	return story
}

// getTopStories returns the raw JSON body of the top-stories list.
func getTopStories() []byte {
	response := getTopResponse()
	defer response.Body.Close()
	data, err := io.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	return data
}

// getBestStories returns the raw JSON body of the best-stories list.
func getBestStories() []byte {
	response := getBestResponse()
	defer response.Body.Close()
	data, err := io.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	return data
}

// updateAllDiscussions refreshes score/comment data for every stored
// discussion posted within the last 40 days (3456000 seconds).
func (app *App) updateAllDiscussions() {
	const maxRoutines = 20
	var itemIDs []int
	// The original discarded this error; a failed query now aborts
	// instead of silently iterating an empty slice.
	err := app.DB.Select(&itemIDs, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) order by posted_on")
	if err != nil {
		log.Warn("updateAllDiscussions: selecting item ids failed: ", err)
		return
	}
	q := queue.New(maxRoutines)
	defer q.Close()
	for _, itemID := range itemIDs {
		q.Add()
		go func(itemID int) {
			defer q.Done()
			story, ok := getStory(itemID)
			if !ok {
				// Distinguish a fetch failure (zero id) from a story
				// that no longer qualifies for storage.
				if story.Id == 0 {
					log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", itemID)
				} else if story.Descendants > 10 || story.Score > 10 {
					log.Infof(`
updateAllDiscussions: There is a bug. Can't update discussion with id %d.
NOTE: If this is happening again, probably the url was changed from Wikipedia to a different source.
%+v\n
`, itemID, story)
				}
				return
			}
			if err := app.updateDiscussion(story); err != nil {
				log.Warn(err)
				return
			}
		}(itemID)
	}
	q.Wait()
}