From 0b8d19eeae79030458b3e1492dd6b0c5a5bf2d75 Mon Sep 17 00:00:00 2001
From: admin
Date: Mon, 23 Sep 2024 15:33:44 +0200
Subject: Save the last seen max_story_id and stop walkDown there

---
 main.go | 151 +++++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 88 insertions(+), 63 deletions(-)

(limited to 'main.go')

diff --git a/main.go b/main.go
index 83a35ff..75e823d 100644
--- a/main.go
+++ b/main.go
@@ -1,9 +1,9 @@
 package main
 
 import (
-	"html"
 	"encoding/json"
 	"fmt"
+	"html"
 	"io/ioutil"
 	"net/http"
 	"net/url"
@@ -12,10 +12,10 @@ import (
 	"strings"
 	"time"
 
+	"github.com/PuerkitoBio/goquery"
 	"github.com/anikhasibul/queue"
 	"github.com/jmoiron/sqlx"
 	log "github.com/sirupsen/logrus"
-	"github.com/PuerkitoBio/goquery"
 )
 
 type App struct {
@@ -45,10 +45,10 @@ func main() {
 	defer app.DB.Close()
 
 	/*
-	app.deleteOrphanedArticles()
-	app.topStories()
-	app.deleteOrphanedArticles()
-	app.updateAllDiscussions()
+		app.deleteOrphanedArticles()
+		app.topStories()
+		app.deleteOrphanedArticles()
+		app.updateAllDiscussions()
 	*/
 	app.walkDown()
 
@@ -62,9 +62,9 @@ func (app *App) walkDown() {
 
 	//var err error
 
-	//max_item := getMaxItem()
+	max_item := getMaxItem()
 	//max_item := 27351341
-	max_item := 27262623
+	//max_item := 27262623
 	//max_item := 41495306
 	//max_item := 36128477
 	//max_item := 32670334
@@ -81,29 +81,43 @@ func (app *App) walkDown() {
 	//max_item := 15038031
 	//max_item := 14450000
 
+	min_item := 0
+	var new_max_item syncMaxItem
+
+	if app.Config.OnlyUpdateStories {
+		min_item = app.getMaxStoredItem()
+	}
+	log.Infof("walkDown: max_item: %d; min_item: %d\n", max_item, min_item)
+
 	const maxRoutines = 400
 	//const maxRoutines = 1
 
 	q := queue.New(maxRoutines)
 	defer q.Close()
 	//for i := max_item; i > 22600000; i-- {
-	for i := max_item; i > 0; i-- {
+	for i := max_item; i > min_item; i-- {
 		q.Add()
-		go func(i int) {
+		go func(i int, new_max_item *syncMaxItem) {
 			defer q.Done()
 
 			Story, ok := getStory(i)
 			if ok {
-				if  len(Story.Links) > 0  {
-				//log.Debugf("%+v\n", Story)
-				//log.Debugf("%+v\n", Story.Links)
+				if len(Story.Links) > 0 {
+					//log.Debugf("%+v\n", Story)
+					//log.Debugf("%+v\n", Story.Links)
 				}
 				err := app.saveStory(Story)
 				if err != nil {
 					log.Fatal(err)
+				} else {
+					new_max_item.mu.Lock()
+					if Story.Id > new_max_item.max_item {
+						new_max_item.max_item = Story.Id
+					}
+					new_max_item.mu.Unlock()
 				}
 				/*
-				*/
+				 */
 			}
 
 			/*
@@ -112,9 +126,21 @@ func (app *App) walkDown() {
 			if i%1000 == 0 {
 				log.Infof("%s: Getting item %d\n", time.Now(), i)
 			}
-		}(i)
+		}(i, &new_max_item)
 	}
 	q.Wait()
+
+	if min_item == 0 {
+		err := app.createMaxStoredItem(new_max_item.max_item)
+		if err != nil {
+			log.Fatal(err)
+		}
+	} else if min_item != new_max_item.max_item && new_max_item.max_item != 0 {
+		err := app.updateNewMaxStoredItem(new_max_item.max_item)
+		if err != nil {
+			log.Fatal(err)
+		}
+	}
 }
 
 func getMaxItem() int {
@@ -215,7 +241,7 @@ func getStory(id int) (Story, bool) {
 	}
 
 	/**
-	 * Check if story links to movie platform 
+	 * Check if story links to movie platform
 	 */
 	is_movie, err := regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host)
 	if err != nil {
@@ -243,7 +269,7 @@ func getStory(id int) (Story, bool) {
 		return Story, false
 	}
 	if is_video {
-		if ! duplicates[Story.Url] {
+		if !duplicates[Story.Url] {
 
 			var link Link
 			link.Url = normalizeUrl(Story.Url)
@@ -259,7 +285,7 @@ func getStory(id int) (Story, bool) {
 	}
 
 	/**
-	 * Check if story links to movie platform 
+	 * Check if story links to movie platform
 	 */
 	is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host)
 	if err != nil {
@@ -267,7 +293,7 @@ func getStory(id int) (Story, bool) {
 		return Story, false
 	}
 	if is_movie {
-		if ! duplicates[Story.Url] {
+		if !duplicates[Story.Url] {
 
 			var link Link
 			link.Url = normalizeUrl(Story.Url)
@@ -287,13 +313,13 @@ func getStory(id int) (Story, bool) {
 
 	log.Debugf("StoryID: %d\n", Story.Id)
 	log.Debugf("StoryID: %d\n", Story.Text)
-	 */
+	*/
 
 	/**
 	 * This comment broke my code:
 	 * https://news.ycombinator.com/item?id=27351340
 	 */
-	tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>"+Story.Text+"</html>"))
+	tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>" + Story.Text + "</html>"))
 	if err != nil {
 		log.Errorf("Failed to parse html: %s\n", err.Error())
 		return Story, false
@@ -320,62 +346,61 @@ func getStory(id int) (Story, bool) {
 
 	doc.Find("a").Each(func(i int, s *goquery.Selection) {
 
-	    l, ok := s.Attr("href")
+		l, ok := s.Attr("href")
 
-	    if ok {
+		if ok {
 
-		/**
-		 * Check for Youtube in text field
-		 */
-		is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l)
-		if err != nil {
-			log.Fatal("Failed to parse and match regex: %s\n", err.Error())
-			//log.Errorf("Failed to parse and match regex: %s\n", err.Error())
-			//return Story, false
-		}
-		if is_video {
-			if ! duplicates[l] {
+			/**
+			 * Check for Youtube in text field
+			 */
+			is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l)
+			if err != nil {
+				log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+				//log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+				//return Story, false
+			}
+			if is_video {
+				if !duplicates[l] {
 
-				var link Link
-				link.Url = normalizeUrl(l)
-				link.Field = 2
-				Story.Links = append(Story.Links, link)
+					var link Link
+					link.Url = normalizeUrl(l)
+					link.Field = 2
+					Story.Links = append(Story.Links, link)
 
-				log.Info("match youtube text")
-				log.Infof("%+v\n", Story)
+					log.Info("match youtube text")
+					log.Infof("%+v\n", Story)
 
-				duplicates[l] = true
-			}
+					duplicates[l] = true
+				}
 
-		}
+			}
 
-		/**
-		 * Check for movie platforms in text field
-		 */
-		is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l)
-		if err != nil {
-			log.Fatal("Failed to parse and match regex: %s\n", err.Error())
-			//log.Errorf("Failed to parse and match regex: %s\n", err.Error())
-			//return Story, false
-		}
-		if is_movie {
-			if ! duplicates[l] {
+			/**
+			 * Check for movie platforms in text field
+			 */
+			is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l)
+			if err != nil {
+				log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+				//log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+				//return Story, false
+			}
+			if is_movie {
+				if !duplicates[l] {
 
-				var link Link
-				link.Url = normalizeUrl(l)
-				link.Field = 1
-				Story.Links = append(Story.Links, link)
+					var link Link
+					link.Url = normalizeUrl(l)
+					link.Field = 1
+					Story.Links = append(Story.Links, link)
 
-				log.Info("match moview platform text")
-				log.Infof("%+v\n", Story)
+					log.Info("match moview platform text")
+					log.Infof("%+v\n", Story)
 
-				duplicates[l] = true
+					duplicates[l] = true
+				}
 			}
 		}
-	    }
 	})
 
-
 	//Story.Url = normalizeUrl(Story.Url)
 
 	if len(Story.Links) > 0 {
-- 
cgit v1.2.3