From 0b8d19eeae79030458b3e1492dd6b0c5a5bf2d75 Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 23 Sep 2024 15:33:44 +0200 Subject: saves last seen max_story_id and stops walk down there --- main.go | 151 +++++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 88 insertions(+), 63 deletions(-) (limited to 'main.go') diff --git a/main.go b/main.go index 83a35ff..75e823d 100644 --- a/main.go +++ b/main.go @@ -1,9 +1,9 @@ package main import ( - "html" "encoding/json" "fmt" + "html" "io/ioutil" "net/http" "net/url" @@ -12,10 +12,10 @@ import ( "strings" "time" + "github.com/PuerkitoBio/goquery" "github.com/anikhasibul/queue" "github.com/jmoiron/sqlx" log "github.com/sirupsen/logrus" - "github.com/PuerkitoBio/goquery" ) type App struct { @@ -45,10 +45,10 @@ func main() { defer app.DB.Close() /* - app.deleteOrphanedArticles() - app.topStories() - app.deleteOrphanedArticles() - app.updateAllDiscussions() + app.deleteOrphanedArticles() + app.topStories() + app.deleteOrphanedArticles() + app.updateAllDiscussions() */ app.walkDown() @@ -62,9 +62,9 @@ func (app *App) walkDown() { //var err error - //max_item := getMaxItem() + max_item := getMaxItem() //max_item := 27351341 - max_item := 27262623 + //max_item := 27262623 //max_item := 41495306 //max_item := 36128477 //max_item := 32670334 @@ -81,29 +81,43 @@ func (app *App) walkDown() { //max_item := 15038031 //max_item := 14450000 + min_item := 0 + var new_max_item syncMaxItem + + if app.Config.OnlyUpdateStories { + min_item = app.getMaxStoredItem() + } + log.Infof("walkDown: max_item: %d; min_item: %d\n", max_item, min_item) + const maxRoutines = 400 //const maxRoutines = 1 q := queue.New(maxRoutines) defer q.Close() //for i := max_item; i > 22600000; i-- { - for i := max_item; i > 0; i-- { + for i := max_item; i > min_item; i-- { q.Add() - go func(i int) { + go func(i int, new_max_item *syncMaxItem) { defer q.Done() Story, ok := getStory(i) if ok { - if len(Story.Links) > 0 { - //log.Debugf("%+v\n", Story) - //log.Debugf("%+v\n", Story.Links) + if len(Story.Links) > 0 { + //log.Debugf("%+v\n", Story) + //log.Debugf("%+v\n", Story.Links) } err := app.saveStory(Story) if err != nil { log.Fatal(err) + } else { + new_max_item.mu.Lock() + if Story.Id > new_max_item.max_item { + new_max_item.max_item = Story.Id + } + new_max_item.mu.Unlock() } /* - */ + */ } /* @@ -112,9 +126,21 @@ func (app *App) walkDown() { if i%1000 == 0 { log.Infof("%s: Getting item %d\n", time.Now(), i) } - }(i) + }(i, &new_max_item) } q.Wait() + + if min_item == 0 { + err := app.createMaxStoredItem(new_max_item.max_item) + if err != nil { + log.Fatal(err) + } + } else if min_item != new_max_item.max_item && new_max_item.max_item != 0 { + err := app.updateNewMaxStoredItem(new_max_item.max_item) + if err != nil { + log.Fatal(err) + } + } } func getMaxItem() int { @@ -215,7 +241,7 @@ func getStory(id int) (Story, bool) { } /** - * Check if story links to movie platform + * Check if story links to movie platform */ is_movie, err := regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host) if err != nil { @@ -243,7 +269,7 @@ func getStory(id int) (Story, bool) { return Story, false } if is_video { - if ! duplicates[Story.Url] { + if !duplicates[Story.Url] { var link Link link.Url = normalizeUrl(Story.Url) @@ -259,7 +285,7 @@ func getStory(id int) (Story, bool) { } /** - * Check if story links to movie platform + * Check if story links to movie platform */ is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host) if err != nil { @@ -267,7 +293,7 @@ func getStory(id int) (Story, bool) { return Story, false } if is_movie { - if ! duplicates[Story.Url] { + if !duplicates[Story.Url] { var link Link link.Url = normalizeUrl(Story.Url) @@ -287,13 +313,13 @@ func getStory(id int) (Story, bool) { log.Debugf("StoryID: %d\n", Story.Id) log.Debugf("StoryID: %d\n", Story.Text) - */ + */ /** * This comment broke my code: * https://news.ycombinator.com/item?id=27351340 */ - tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader(""+Story.Text+"")) + tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("" + Story.Text + "")) if err != nil { log.Errorf("Failed to parse html: %s\n", err.Error()) return Story, false @@ -320,62 +346,61 @@ func getStory(id int) (Story, bool) { doc.Find("a").Each(func(i int, s *goquery.Selection) { - l, ok := s.Attr("href") + l, ok := s.Attr("href") - if ok { + if ok { - /** - * Check for Youtube in text field - */ - is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l) - if err != nil { - log.Fatal("Failed to parse and match regex: %s\n", err.Error()) - //log.Errorf("Failed to parse and match regex: %s\n", err.Error()) - //return Story, false - } - if is_video { - if ! duplicates[l] { + /** + * Check for Youtube in text field + */ + is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l) + if err != nil { + log.Fatal("Failed to parse and match regex: %s\n", err.Error()) + //log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + //return Story, false + } + if is_video { + if !duplicates[l] { - var link Link - link.Url = normalizeUrl(l) - link.Field = 2 - Story.Links = append(Story.Links, link) + var link Link + link.Url = normalizeUrl(l) + link.Field = 2 + Story.Links = append(Story.Links, link) - log.Info("match youtube text") - log.Infof("%+v\n", Story) + log.Info("match youtube text") + log.Infof("%+v\n", Story) - duplicates[l] = true - } + duplicates[l] = true + } - } + } - /** - * Check for movie platforms in text field - */ - is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l) - if err != nil { - log.Fatal("Failed to parse and match regex: %s\n", err.Error()) - //log.Errorf("Failed to parse and match regex: %s\n", err.Error()) - //return Story, false - } - if is_movie { - if ! duplicates[l] { + /** + * Check for movie platforms in text field + */ + is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l) + if err != nil { + log.Fatal("Failed to parse and match regex: %s\n", err.Error()) + //log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + //return Story, false + } + if is_movie { + if !duplicates[l] { - var link Link - link.Url = normalizeUrl(l) - link.Field = 1 - Story.Links = append(Story.Links, link) + var link Link + link.Url = normalizeUrl(l) + link.Field = 1 + Story.Links = append(Story.Links, link) - log.Info("match moview platform text") - log.Infof("%+v\n", Story) + log.Info("match moview platform text") + log.Infof("%+v\n", Story) - duplicates[l] = true + duplicates[l] = true + } } } - } }) - //Story.Url = normalizeUrl(Story.Url) if len(Story.Links) > 0 { -- cgit v1.2.3