summaryrefslogtreecommitdiff
path: root/main.go
diff options
context:
space:
mode:
Diffstat (limited to 'main.go')
-rw-r--r--main.go462
1 files changed, 462 insertions, 0 deletions
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..8940afc
--- /dev/null
+++ b/main.go
@@ -0,0 +1,462 @@
+package main
+
import (
	"encoding/json"
	"fmt"
	"html"
	"io/ioutil"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/anikhasibul/queue"
	"github.com/jmoiron/sqlx"
	log "github.com/sirupsen/logrus"
	"mvdan.cc/xurls/v2"
)
+
// App bundles the application's shared dependencies: the loaded
// configuration, the open database handle, and the start-of-run timestamp.
type App struct {
	Config *Config   // connection settings etc.; Config is declared elsewhere in this package
	DB     *sqlx.DB  // open database connection pool
	Now    time.Time // time the current run started
}
+
// main wires the application together: it adopts the package-level
// configuration, connects to the database it describes, and runs the
// item crawler (walkDown).
func main() {
	var err error
	// Copy the global config into a local value and blank the global, so the
	// rest of the program can only reach settings through app.Config.
	// NOTE(review): _conf and Config are declared elsewhere in this package.
	_own_conf := _conf
	app := App{Config: &_own_conf}
	_conf = Config{}

	app.Now = time.Now()

	log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions))

	// DSN uses the MySQL driver's "user:pass@tcp(host:port)/db?opts" form.
	app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions)
	if err != nil {
		log.Fatal(err, "Cannot connect to database")
	}

	// Explicit liveness check on top of the Connect above.
	if err = app.DB.Ping(); err != nil {
		log.Fatal(err, "No connection to database")
	}
	defer app.DB.Close()

	// Alternative entry points kept for manual runs:
	/*
		app.deleteOrphanedArticles()
		app.topStories()
		app.deleteOrphanedArticles()
		app.updateAllDiscussions()
	*/
	app.walkDown()

	/**
	 * Resolve redirects on stored urls.
	 */
	//return
}
+
+func (app *App) walkDown() {
+
+ //var err error
+
+ //max_item := getMaxItem()
+ //max_item := 41495306
+ //max_item := 36128477
+ max_item := 32670334
+ //max_item := 41231601
+ //max_item := 41165987
+ //max_item := 41136898
+ //max_item := 22554000
+ //max_item := 22494596
+ //max_item := 22354383
+ //max_item := 18984000
+ //max_item := 18732000
+ //max_item := 16017000
+ //max_item := 15494000
+ //max_item := 15038031
+ //max_item := 14450000
+
+ const maxRoutines = 200
+
+ q := queue.New(maxRoutines)
+ defer q.Close()
+ //for i := max_item; i > 22600000; i-- {
+ for i := max_item; i > 0; i-- {
+ q.Add()
+ go func(i int) {
+ defer q.Done()
+
+ Story, ok := getStory(i)
+ if ok {
+ if len(Story.Links) > 0 {
+ //log.Debugf("%+v\n", Story)
+ //log.Debugf("%+v\n", Story.Links)
+ }
+ err := app.saveStory(Story)
+ if err != nil {
+ log.Fatal(err)
+ }
+ /*
+ */
+ }
+
+ /*
+ * Prints status update every 1000th entry
+ */
+ if i%1000 == 0 {
+ log.Infof("%s: Getting item %d\n", time.Now(), i)
+ }
+ }(i)
+ }
+ q.Wait()
+}
+
// getMaxItem returns the largest item ID currently known to the Hacker News
// API. It panics on any network, read, or parse failure.
func getMaxItem() int {
	response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json")
	if err != nil {
		panic(err)
	}
	// The original never closed the body, leaking the connection.
	defer response.Body.Close()

	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	// Trim whitespace defensively: a trailing newline would make Atoi fail.
	maxItem, err := strconv.Atoi(strings.TrimSpace(string(data)))
	if err != nil {
		panic(err)
	}
	return maxItem
}
+
+func (app *App) topStories() {
+ var err error
+
+ data1 := strings.TrimSuffix(string(getTopStories()), "]")
+ data2 := strings.TrimPrefix(string(getBestStories()), "[")
+
+ data1 = data1 + ","
+ data := data1 + data2
+
+ var story_ids []int
+ err = json.Unmarshal([]byte(data), &story_ids)
+ if err != nil {
+ log.Warn("topStories: Unmarshaling json failed")
+ panic(err)
+ }
+
+ const maxRoutines = 20
+
+ q := queue.New(maxRoutines)
+ defer q.Close()
+ for _, id := range story_ids {
+ q.Add()
+ go func(id int) {
+ Story, ok := getStory(id)
+ defer q.Done()
+ if ok {
+ log.Infof("%+v\n", Story)
+ err = app.saveStory(Story)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ }
+ }(id)
+ }
+ q.Wait()
+}
+
+func getStory(id int) (Story, bool) {
+ Story := getDetail(id)
+ if Story.Dead {
+ return Story, false
+ }
+ if Story.Type == "Story" && Story.Score < 10 && Story.Descendants < 10 {
+ return Story, false
+ }
+ var duplicates = make(map[string]bool)
+ /*
+ if (time.Now().Unix() - 3456000) > int64(Story.Time) {
+ }
+ */
+
+ Story.Title = stripHNPrefix(Story.Title)
+
+ u, err := url.Parse(Story.Url)
+ if err != nil {
+ log.Warnf("getStory: Parsing URL failed: %s\n", err.Error())
+ return Story, false
+ }
+
+ /**
+ * Check if story links to Youtube
+ */
+ is_video, err := regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", u.Host)
+ if err != nil {
+ log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ return Story, false
+ }
+ if is_video {
+ var link Link
+ link.Url = normalizeUrl(Story.Url)
+ link.Field = 2
+ Story.Links = append(Story.Links, link)
+
+ log.Info("match youtube host")
+ log.Infof("%+v\n", Story)
+
+ duplicates[link.Url] = true
+ }
+
+ /**
+ * Check if story links to movie platform
+ */
+ is_movie, err := regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host)
+ if err != nil {
+ log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ return Story, false
+ }
+ if is_movie {
+ var link Link
+ link.Url = normalizeUrl(Story.Url)
+ link.Field = 1
+ Story.Links = append(Story.Links, link)
+
+ log.Info("match moview platform url")
+ log.Infof("%+v\n", Story)
+
+ duplicates[link.Url] = true
+ }
+
+ /**
+ * Check for (Video) in title
+ */
+ is_video, err = regexp.MatchString("(?i)(\\(video\\))|(\\[video\\])", Story.Title)
+ if err != nil {
+ log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ return Story, false
+ }
+ if is_video {
+ if ! duplicates[Story.Url] {
+
+ var link Link
+ link.Url = normalizeUrl(Story.Url)
+ link.Field = 2
+ Story.Links = append(Story.Links, link)
+
+ log.Info("match video title")
+ log.Infof("%+v\n", Story)
+
+ duplicates[Story.Url] = true
+ }
+
+ }
+
+ /**
+ * Check if story links to movie platform
+ */
+ is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host)
+ if err != nil {
+ log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ return Story, false
+ }
+ if is_movie {
+ if ! duplicates[Story.Url] {
+
+ var link Link
+ link.Url = normalizeUrl(Story.Url)
+ link.Field = 1
+ Story.Links = append(Story.Links, link)
+
+ log.Info("match moview platform url")
+ log.Infof("%+v\n", Story)
+
+ duplicates[Story.Url] = true
+ }
+
+ }
+
+ /**
+ * Parse all URLs in Story.Text
+ */
+ rxRelaxed := xurls.Relaxed()
+ rxLinks := rxRelaxed.FindAllString(html.UnescapeString(Story.Text), -1)
+
+ for _, rxLink := range rxLinks {
+
+ /**
+ * Check for Youtube in text field
+ */
+ is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", rxLink)
+ if err != nil {
+ log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ return Story, false
+ }
+ if is_video {
+ if ! duplicates[rxLink] {
+
+ var link Link
+ link.Url = normalizeUrl(rxLink)
+ link.Field = 2
+ Story.Links = append(Story.Links, link)
+
+ log.Info("match youtube text")
+ log.Infof("%+v\n", Story)
+
+ duplicates[rxLink] = true
+ }
+
+ }
+
+ /**
+ * Check for movie platforms in text field
+ */
+ is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", rxLink)
+ if err != nil {
+ log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ return Story, false
+ }
+ if is_movie {
+ if ! duplicates[rxLink] {
+
+ var link Link
+ link.Url = normalizeUrl(rxLink)
+ link.Field = 1
+ Story.Links = append(Story.Links, link)
+
+ log.Info("match moview platform text")
+ log.Infof("%+v\n", Story)
+
+ duplicates[rxLink] = true
+ }
+
+ }
+ }
+
+ //Story.Url = normalizeUrl(Story.Url)
+
+ if len(Story.Links) > 0 {
+ return Story, true
+ } else {
+ return Story, false
+ }
+}
+
+func getResponse(url string) *http.Response {
+ var err error
+ var response *http.Response
+
+ response, err = http.Get(url)
+ if err != nil {
+ for i := 0; i < 4; i++ {
+ if i == 0 {
+ log.Debug("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
+ } else {
+ log.Warn("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
+ }
+ resp2, err2 := http.Get(url)
+ if err2 == nil {
+ return resp2
+ }
+ }
+ panic(err)
+ }
+ return response
+}
+
+func getBestResponse() *http.Response {
+ _url := "https://hacker-news.firebaseio.com/v0/beststories.json"
+ return getResponse(_url)
+}
+
+func getTopResponse() *http.Response {
+ _url := "https://hacker-news.firebaseio.com/v0/topstories.json"
+ return getResponse(_url)
+}
+
+func getStoryResponse(item_id string) *http.Response {
+ _url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json"
+ return getResponse(_url)
+}
+
+func getDetail(id int) Story {
+ response := getStoryResponse(strconv.Itoa(id))
+ data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+ var story Story
+ err = json.Unmarshal(data, &story)
+ if err != nil {
+ log.Warn("getDetail: Unmarshaling json failed ", data)
+ panic(err)
+ }
+ //log.Debug("%+v\n", Story)
+
+ story.Text = html.UnescapeString(story.Text)
+
+ return story
+}
+
+func getTopStories() []byte {
+ response := getTopResponse()
+ data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+
+ return data
+}
+
+func getBestStories() []byte {
+ response := getBestResponse()
+
+ data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+
+ return data
+}
+
+func (app *App) updateAllDiscussions() {
+ const maxRoutines = 20
+ var item_ids []int
+
+ app.DB.Select(&item_ids, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) order by posted_on")
+
+ q := queue.New(maxRoutines)
+ defer q.Close()
+
+ for _, item_id := range item_ids {
+ q.Add()
+ go func(item_id int) {
+ defer q.Done()
+ Story, ok := getStory(item_id)
+ if !ok {
+ /**
+ * Check if we got a network error or a dead story.
+ */
+ if 0 == Story.Id {
+ log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", item_id)
+ } else if Story.Descendants > 10 || Story.Score > 10 {
+ log.Infof(`
+ updateAllDiscussions: There is a bug. Can't update discussion with id %d.
+ NOTE: If this is happening again, probably the url was changed from Wikipedia to a different source.
+ %+v\n
+ `, item_id, Story)
+ }
+ return
+ }
+ err := app.updateDiscussion(Story)
+ if err != nil {
+ log.Warn(err)
+ return
+ }
+ }(item_id)
+ }
+ q.Wait()
+}