summary | refs | log | tree | commit | diff
path: root/main.go
diff options
context:
space:
mode:
author: admin, 2024-09-23 15:33:44 +0200
committer: admin, 2024-09-23 15:33:44 +0200
commit: 0b8d19eeae79030458b3e1492dd6b0c5a5bf2d75 (patch)
tree: baa63f7d8c862c696affe5913553493c1f3e61c1 /main.go
parent: b190512e951efdd1ed4642eed8726bf7bdf2c022 (diff)
download: hncrawler-0b8d19eeae79030458b3e1492dd6b0c5a5bf2d75.tar.gz
saves last seen max_story_id and stops walk down there
Diffstat (limited to 'main.go')
-rw-r--r-- main.go 151
1 files changed, 88 insertions, 63 deletions
diff --git a/main.go b/main.go
index 83a35ff..75e823d 100644
--- a/main.go
+++ b/main.go
@@ -1,9 +1,9 @@
package main
import (
- "html"
"encoding/json"
"fmt"
+ "html"
"io/ioutil"
"net/http"
"net/url"
@@ -12,10 +12,10 @@ import (
"strings"
"time"
+ "github.com/PuerkitoBio/goquery"
"github.com/anikhasibul/queue"
"github.com/jmoiron/sqlx"
log "github.com/sirupsen/logrus"
- "github.com/PuerkitoBio/goquery"
)
type App struct {
@@ -45,10 +45,10 @@ func main() {
defer app.DB.Close()
/*
- app.deleteOrphanedArticles()
- app.topStories()
- app.deleteOrphanedArticles()
- app.updateAllDiscussions()
+ app.deleteOrphanedArticles()
+ app.topStories()
+ app.deleteOrphanedArticles()
+ app.updateAllDiscussions()
*/
app.walkDown()
@@ -62,9 +62,9 @@ func (app *App) walkDown() {
//var err error
- //max_item := getMaxItem()
+ max_item := getMaxItem()
//max_item := 27351341
- max_item := 27262623
+ //max_item := 27262623
//max_item := 41495306
//max_item := 36128477
//max_item := 32670334
@@ -81,29 +81,43 @@ func (app *App) walkDown() {
//max_item := 15038031
//max_item := 14450000
+ min_item := 0
+ var new_max_item syncMaxItem
+
+ if app.Config.OnlyUpdateStories {
+ min_item = app.getMaxStoredItem()
+ }
+ log.Infof("walkDown: max_item: %d; min_item: %d\n", max_item, min_item)
+
const maxRoutines = 400
//const maxRoutines = 1
q := queue.New(maxRoutines)
defer q.Close()
//for i := max_item; i > 22600000; i-- {
- for i := max_item; i > 0; i-- {
+ for i := max_item; i > min_item; i-- {
q.Add()
- go func(i int) {
+ go func(i int, new_max_item *syncMaxItem) {
defer q.Done()
Story, ok := getStory(i)
if ok {
- if len(Story.Links) > 0 {
- //log.Debugf("%+v\n", Story)
- //log.Debugf("%+v\n", Story.Links)
+ if len(Story.Links) > 0 {
+ //log.Debugf("%+v\n", Story)
+ //log.Debugf("%+v\n", Story.Links)
}
err := app.saveStory(Story)
if err != nil {
log.Fatal(err)
+ } else {
+ new_max_item.mu.Lock()
+ if Story.Id > new_max_item.max_item {
+ new_max_item.max_item = Story.Id
+ }
+ new_max_item.mu.Unlock()
}
/*
- */
+ */
}
/*
@@ -112,9 +126,21 @@ func (app *App) walkDown() {
if i%1000 == 0 {
log.Infof("%s: Getting item %d\n", time.Now(), i)
}
- }(i)
+ }(i, &new_max_item)
}
q.Wait()
+
+ if min_item == 0 {
+ err := app.createMaxStoredItem(new_max_item.max_item)
+ if err != nil {
+ log.Fatal(err)
+ }
+ } else if min_item != new_max_item.max_item && new_max_item.max_item != 0 {
+ err := app.updateNewMaxStoredItem(new_max_item.max_item)
+ if err != nil {
+ log.Fatal(err)
+ }
+ }
}
func getMaxItem() int {
@@ -215,7 +241,7 @@ func getStory(id int) (Story, bool) {
}
/**
- * Check if story links to movie platform
+ * Check if story links to movie platform
*/
is_movie, err := regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host)
if err != nil {
@@ -243,7 +269,7 @@ func getStory(id int) (Story, bool) {
return Story, false
}
if is_video {
- if ! duplicates[Story.Url] {
+ if !duplicates[Story.Url] {
var link Link
link.Url = normalizeUrl(Story.Url)
@@ -259,7 +285,7 @@ func getStory(id int) (Story, bool) {
}
/**
- * Check if story links to movie platform
+ * Check if story links to movie platform
*/
is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host)
if err != nil {
@@ -267,7 +293,7 @@ func getStory(id int) (Story, bool) {
return Story, false
}
if is_movie {
- if ! duplicates[Story.Url] {
+ if !duplicates[Story.Url] {
var link Link
link.Url = normalizeUrl(Story.Url)
@@ -287,13 +313,13 @@ func getStory(id int) (Story, bool) {
log.Debugf("StoryID: %d\n", Story.Id)
log.Debugf("StoryID: %d\n", Story.Text)
- */
+ */
/**
* This comment broke my code:
* https://news.ycombinator.com/item?id=27351340
*/
- tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>"+Story.Text+"</html>"))
+ tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>" + Story.Text + "</html>"))
if err != nil {
log.Errorf("Failed to parse html: %s\n", err.Error())
return Story, false
@@ -320,62 +346,61 @@ func getStory(id int) (Story, bool) {
doc.Find("a").Each(func(i int, s *goquery.Selection) {
- l, ok := s.Attr("href")
+ l, ok := s.Attr("href")
- if ok {
+ if ok {
- /**
- * Check for Youtube in text field
- */
- is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l)
- if err != nil {
- log.Fatal("Failed to parse and match regex: %s\n", err.Error())
- //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
- //return Story, false
- }
- if is_video {
- if ! duplicates[l] {
+ /**
+ * Check for Youtube in text field
+ */
+ is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l)
+ if err != nil {
+ log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+ //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ //return Story, false
+ }
+ if is_video {
+ if !duplicates[l] {
- var link Link
- link.Url = normalizeUrl(l)
- link.Field = 2
- Story.Links = append(Story.Links, link)
+ var link Link
+ link.Url = normalizeUrl(l)
+ link.Field = 2
+ Story.Links = append(Story.Links, link)
- log.Info("match youtube text")
- log.Infof("%+v\n", Story)
+ log.Info("match youtube text")
+ log.Infof("%+v\n", Story)
- duplicates[l] = true
- }
+ duplicates[l] = true
+ }
- }
+ }
- /**
- * Check for movie platforms in text field
- */
- is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l)
- if err != nil {
- log.Fatal("Failed to parse and match regex: %s\n", err.Error())
- //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
- //return Story, false
- }
- if is_movie {
- if ! duplicates[l] {
+ /**
+ * Check for movie platforms in text field
+ */
+ is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l)
+ if err != nil {
+ log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+ //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ //return Story, false
+ }
+ if is_movie {
+ if !duplicates[l] {
- var link Link
- link.Url = normalizeUrl(l)
- link.Field = 1
- Story.Links = append(Story.Links, link)
+ var link Link
+ link.Url = normalizeUrl(l)
+ link.Field = 1
+ Story.Links = append(Story.Links, link)
- log.Info("match moview platform text")
- log.Infof("%+v\n", Story)
+ log.Info("match moview platform text")
+ log.Infof("%+v\n", Story)
- duplicates[l] = true
+ duplicates[l] = true
+ }
}
}
- }
})
-
//Story.Url = normalizeUrl(Story.Url)
if len(Story.Links) > 0 {