diff options
Diffstat (limited to 'main.go')
| -rw-r--r-- | main.go | 462 |
1 file changed, 462 insertions, 0 deletions
@@ -0,0 +1,462 @@ +package main + +import ( + "html" + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "net/url" + "regexp" + "strconv" + "strings" + "time" + + "github.com/anikhasibul/queue" + "github.com/jmoiron/sqlx" + log "github.com/sirupsen/logrus" + "mvdan.cc/xurls/v2" +) + +type App struct { + Config *Config + DB *sqlx.DB + Now time.Time +} + +func main() { + var err error + _own_conf := _conf + app := App{Config: &_own_conf} + _conf = Config{} + + app.Now = time.Now() + + log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)) + + app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions) + if err != nil { + log.Fatal(err, "Cannot connect to database") + } + + if err = app.DB.Ping(); err != nil { + log.Fatal(err, "No connection to database") + } + defer app.DB.Close() + + /* + app.deleteOrphanedArticles() + app.topStories() + app.deleteOrphanedArticles() + app.updateAllDiscussions() + */ + app.walkDown() + + /** + * Resolve redirects on stored urls. 
+ */ + //return +} + +func (app *App) walkDown() { + + //var err error + + //max_item := getMaxItem() + //max_item := 41495306 + //max_item := 36128477 + max_item := 32670334 + //max_item := 41231601 + //max_item := 41165987 + //max_item := 41136898 + //max_item := 22554000 + //max_item := 22494596 + //max_item := 22354383 + //max_item := 18984000 + //max_item := 18732000 + //max_item := 16017000 + //max_item := 15494000 + //max_item := 15038031 + //max_item := 14450000 + + const maxRoutines = 200 + + q := queue.New(maxRoutines) + defer q.Close() + //for i := max_item; i > 22600000; i-- { + for i := max_item; i > 0; i-- { + q.Add() + go func(i int) { + defer q.Done() + + Story, ok := getStory(i) + if ok { + if len(Story.Links) > 0 { + //log.Debugf("%+v\n", Story) + //log.Debugf("%+v\n", Story.Links) + } + err := app.saveStory(Story) + if err != nil { + log.Fatal(err) + } + /* + */ + } + + /* + * Prints status update every 1000th entry + */ + if i%1000 == 0 { + log.Infof("%s: Getting item %d\n", time.Now(), i) + } + }(i) + } + q.Wait() +} + +func getMaxItem() int { + response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json") + if err != nil { + panic(err) + } + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + max_item, err := strconv.Atoi(string(data)) + if err != nil { + panic(err) + } + + return max_item +} + +func (app *App) topStories() { + var err error + + data1 := strings.TrimSuffix(string(getTopStories()), "]") + data2 := strings.TrimPrefix(string(getBestStories()), "[") + + data1 = data1 + "," + data := data1 + data2 + + var story_ids []int + err = json.Unmarshal([]byte(data), &story_ids) + if err != nil { + log.Warn("topStories: Unmarshaling json failed") + panic(err) + } + + const maxRoutines = 20 + + q := queue.New(maxRoutines) + defer q.Close() + for _, id := range story_ids { + q.Add() + go func(id int) { + Story, ok := getStory(id) + defer q.Done() + if ok { + log.Infof("%+v\n", Story) + err = 
app.saveStory(Story) + if err != nil { + log.Fatal(err) + } + + } + }(id) + } + q.Wait() +} + +func getStory(id int) (Story, bool) { + Story := getDetail(id) + if Story.Dead { + return Story, false + } + if Story.Type == "Story" && Story.Score < 10 && Story.Descendants < 10 { + return Story, false + } + var duplicates = make(map[string]bool) + /* + if (time.Now().Unix() - 3456000) > int64(Story.Time) { + } + */ + + Story.Title = stripHNPrefix(Story.Title) + + u, err := url.Parse(Story.Url) + if err != nil { + log.Warnf("getStory: Parsing URL failed: %s\n", err.Error()) + return Story, false + } + + /** + * Check if story links to Youtube + */ + is_video, err := regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_video { + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 2 + Story.Links = append(Story.Links, link) + + log.Info("match youtube host") + log.Infof("%+v\n", Story) + + duplicates[link.Url] = true + } + + /** + * Check if story links to movie platform + */ + is_movie, err := regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_movie { + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 1 + Story.Links = append(Story.Links, link) + + log.Info("match moview platform url") + log.Infof("%+v\n", Story) + + duplicates[link.Url] = true + } + + /** + * Check for (Video) in title + */ + is_video, err = regexp.MatchString("(?i)(\\(video\\))|(\\[video\\])", Story.Title) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_video { + if ! 
duplicates[Story.Url] { + + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 2 + Story.Links = append(Story.Links, link) + + log.Info("match video title") + log.Infof("%+v\n", Story) + + duplicates[Story.Url] = true + } + + } + + /** + * Check if story links to movie platform + */ + is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_movie { + if ! duplicates[Story.Url] { + + var link Link + link.Url = normalizeUrl(Story.Url) + link.Field = 1 + Story.Links = append(Story.Links, link) + + log.Info("match moview platform url") + log.Infof("%+v\n", Story) + + duplicates[Story.Url] = true + } + + } + + /** + * Parse all URLs in Story.Text + */ + rxRelaxed := xurls.Relaxed() + rxLinks := rxRelaxed.FindAllString(html.UnescapeString(Story.Text), -1) + + for _, rxLink := range rxLinks { + + /** + * Check for Youtube in text field + */ + is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", rxLink) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_video { + if ! duplicates[rxLink] { + + var link Link + link.Url = normalizeUrl(rxLink) + link.Field = 2 + Story.Links = append(Story.Links, link) + + log.Info("match youtube text") + log.Infof("%+v\n", Story) + + duplicates[rxLink] = true + } + + } + + /** + * Check for movie platforms in text field + */ + is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", rxLink) + if err != nil { + log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + return Story, false + } + if is_movie { + if ! 
duplicates[rxLink] { + + var link Link + link.Url = normalizeUrl(rxLink) + link.Field = 1 + Story.Links = append(Story.Links, link) + + log.Info("match moview platform text") + log.Infof("%+v\n", Story) + + duplicates[rxLink] = true + } + + } + } + + //Story.Url = normalizeUrl(Story.Url) + + if len(Story.Links) > 0 { + return Story, true + } else { + return Story, false + } +} + +func getResponse(url string) *http.Response { + var err error + var response *http.Response + + response, err = http.Get(url) + if err != nil { + for i := 0; i < 4; i++ { + if i == 0 { + log.Debug("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i)) + } else { + log.Warn("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i)) + } + resp2, err2 := http.Get(url) + if err2 == nil { + return resp2 + } + } + panic(err) + } + return response +} + +func getBestResponse() *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/beststories.json" + return getResponse(_url) +} + +func getTopResponse() *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/topstories.json" + return getResponse(_url) +} + +func getStoryResponse(item_id string) *http.Response { + _url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json" + return getResponse(_url) +} + +func getDetail(id int) Story { + response := getStoryResponse(strconv.Itoa(id)) + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + var story Story + err = json.Unmarshal(data, &story) + if err != nil { + log.Warn("getDetail: Unmarshaling json failed ", data) + panic(err) + } + //log.Debug("%+v\n", Story) + + story.Text = html.UnescapeString(story.Text) + + return story +} + +func getTopStories() []byte { + response := getTopResponse() + data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + return data +} + +func getBestStories() []byte { + response := getBestResponse() + + data, err := 
ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + return data +} + +func (app *App) updateAllDiscussions() { + const maxRoutines = 20 + var item_ids []int + + app.DB.Select(&item_ids, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) order by posted_on") + + q := queue.New(maxRoutines) + defer q.Close() + + for _, item_id := range item_ids { + q.Add() + go func(item_id int) { + defer q.Done() + Story, ok := getStory(item_id) + if !ok { + /** + * Check if we got a network error or a dead story. + */ + if 0 == Story.Id { + log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", item_id) + } else if Story.Descendants > 10 || Story.Score > 10 { + log.Infof(` + updateAllDiscussions: There is a bug. Can't update discussion with id %d. + NOTE: If this is happening again, probably the url was changed from Wikipedia to a different source. + %+v\n + `, item_id, Story) + } + return + } + err := app.updateDiscussion(Story) + if err != nil { + log.Warn(err) + return + } + }(item_id) + } + q.Wait() +} |
