summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoradmin2024-09-23 15:33:44 +0200
committeradmin2024-09-23 15:33:44 +0200
commit0b8d19eeae79030458b3e1492dd6b0c5a5bf2d75 (patch)
treebaa63f7d8c862c696affe5913553493c1f3e61c1
parentb190512e951efdd1ed4642eed8726bf7bdf2c022 (diff)
downloadhncrawler-0b8d19eeae79030458b3e1492dd6b0c5a5bf2d75.tar.gz
Saves the last seen max_story_id and stops the walk-down there
-rw-r--r--config.go3
-rw-r--r--database.go134
-rw-r--r--main.go151
-rw-r--r--sql.sql2
-rw-r--r--struct.go9
5 files changed, 197 insertions, 102 deletions
diff --git a/config.go b/config.go
index 7c36c53..0da0dde 100644
--- a/config.go
+++ b/config.go
@@ -24,6 +24,7 @@ type Config struct {
BasicAuthPassword string
Debug bool // sets log level to debug
+ OnlyUpdateStories bool
}
// Parses the configuration and sets the configuration struct.
@@ -35,6 +36,7 @@ func (c *Config) parseConfig(configFile string) {
viper.SetDefault("DB_Port", "3306")
viper.SetDefault("Debug", false)
+ viper.SetDefault("OnlyUpdateStories", false)
viper.SetDefault("Delay", 0)
// needs some refactoring to truly respect robots.txt
@@ -101,4 +103,5 @@ func (c *Config) setsConfig() {
c.BasicAuthPassword = viper.GetString("BasicAuthPassword")
c.Debug = viper.GetBool("Debug")
+ c.OnlyUpdateStories = viper.GetBool("OnlyUpdateStories")
}
diff --git a/database.go b/database.go
index e3ba060..31594b4 100644
--- a/database.go
+++ b/database.go
@@ -1,10 +1,12 @@
package main
import (
- log "github.com/sirupsen/logrus"
"strconv"
+ log "github.com/sirupsen/logrus"
+
"database/sql"
+
_ "github.com/go-sql-driver/mysql"
)
@@ -90,45 +92,45 @@ func (app *App) saveStory(s Story) error {
return nil
/*
- query = `
- INSERT IGNORE discussion (
- id,
- created_at,
- updated_at,
- article_id,
- title,
- source,
- item_id,
- source_url,
- posted_on,
- comments,
- upvotes
- ) VALUES (
- NULL,
- ?,
- ?,
- (SELECT id FROM article WHERE url = ?),
- ?,
- ?,
- ?,
- ?,
- ?,
- ?,
- ?
- );
- `
- stmt2, err := app.DB.Prepare(query)
- if err != nil {
- log.Warn("saveStory: Preparing second query failed")
- return err
- }
- defer stmt2.Close()
+ query = `
+ INSERT IGNORE discussion (
+ id,
+ created_at,
+ updated_at,
+ article_id,
+ title,
+ source,
+ item_id,
+ source_url,
+ posted_on,
+ comments,
+ upvotes
+ ) VALUES (
+ NULL,
+ ?,
+ ?,
+ (SELECT id FROM article WHERE url = ?),
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?,
+ ?
+ );
+ `
+ stmt2, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Warn("saveStory: Preparing second query failed")
+ return err
+ }
+ defer stmt2.Close()
- _, err = stmt2.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score)
- if err != nil {
- log.Warn("saveStory: Statement execution failed")
- return err
- }
+ _, err = stmt2.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score)
+ if err != nil {
+ log.Warn("saveStory: Statement execution failed")
+ return err
+ }
*/
return nil
@@ -424,3 +426,57 @@ func (app *App) getAllArticles() {
log.Println(article_id)
}
}
+
+func (app *App) getMaxStoredItem() int {
+ row := app.DB.QueryRow("SELECT max_story_id FROM max_item ORDER BY id ASC LIMIT 1;")
+ var max_id int
+ err := row.Scan(&max_id)
+ if err == sql.ErrNoRows {
+ return 0
+ } else if err != nil {
+ log.Warnf("getMaxStoredId failed\n")
+ log.Fatal(err)
+ }
+ return max_id
+}
+
+func (app *App) updateNewMaxStoredItem(new_max_item int) error {
+ query := "UPDATE max_item SET max_story_id = ?, updated_at = ?;"
+ stmt, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Warn("updateNewMaxStoredItem: Preparing query failed")
+ return err
+ }
+ defer stmt.Close()
+
+ _, err = stmt.Exec(new_max_item, app.Now)
+ if err != nil {
+ log.Warnf("updateNewMaxStoredItem: Statement execution failed: %d\n", new_max_item)
+ return err
+ }
+ log.Infof("updateNewMaxStoredItem: updated max_story_id to: %d\n", new_max_item)
+
+ return nil
+}
+
+/**
+ * Creates new max_story_id
+ */
+func (app *App) createMaxStoredItem(new_max_item int) error {
+ stmt, err := app.DB.Prepare("INSERT INTO max_item VALUES (null, ?, ?, ?);")
+ if err != nil {
+ log.Warn("getMaxStoredItem: Preparing query failed")
+ return err
+ }
+ defer stmt.Close()
+
+ _, err = stmt.Exec(new_max_item, app.Now, app.Now)
+ if err != nil {
+ log.Warnf("getMaxStoredItem: Executing query failed with new_max_item: %d\n", new_max_item)
+ return err
+ }
+
+ log.Info("getMaxStoredItem: creatd MaxStoredItem for the first time")
+
+ return nil
+}
diff --git a/main.go b/main.go
index 83a35ff..75e823d 100644
--- a/main.go
+++ b/main.go
@@ -1,9 +1,9 @@
package main
import (
- "html"
"encoding/json"
"fmt"
+ "html"
"io/ioutil"
"net/http"
"net/url"
@@ -12,10 +12,10 @@ import (
"strings"
"time"
+ "github.com/PuerkitoBio/goquery"
"github.com/anikhasibul/queue"
"github.com/jmoiron/sqlx"
log "github.com/sirupsen/logrus"
- "github.com/PuerkitoBio/goquery"
)
type App struct {
@@ -45,10 +45,10 @@ func main() {
defer app.DB.Close()
/*
- app.deleteOrphanedArticles()
- app.topStories()
- app.deleteOrphanedArticles()
- app.updateAllDiscussions()
+ app.deleteOrphanedArticles()
+ app.topStories()
+ app.deleteOrphanedArticles()
+ app.updateAllDiscussions()
*/
app.walkDown()
@@ -62,9 +62,9 @@ func (app *App) walkDown() {
//var err error
- //max_item := getMaxItem()
+ max_item := getMaxItem()
//max_item := 27351341
- max_item := 27262623
+ //max_item := 27262623
//max_item := 41495306
//max_item := 36128477
//max_item := 32670334
@@ -81,29 +81,43 @@ func (app *App) walkDown() {
//max_item := 15038031
//max_item := 14450000
+ min_item := 0
+ var new_max_item syncMaxItem
+
+ if app.Config.OnlyUpdateStories {
+ min_item = app.getMaxStoredItem()
+ }
+ log.Infof("walkDown: max_item: %d; min_item: %d\n", max_item, min_item)
+
const maxRoutines = 400
//const maxRoutines = 1
q := queue.New(maxRoutines)
defer q.Close()
//for i := max_item; i > 22600000; i-- {
- for i := max_item; i > 0; i-- {
+ for i := max_item; i > min_item; i-- {
q.Add()
- go func(i int) {
+ go func(i int, new_max_item *syncMaxItem) {
defer q.Done()
Story, ok := getStory(i)
if ok {
- if len(Story.Links) > 0 {
- //log.Debugf("%+v\n", Story)
- //log.Debugf("%+v\n", Story.Links)
+ if len(Story.Links) > 0 {
+ //log.Debugf("%+v\n", Story)
+ //log.Debugf("%+v\n", Story.Links)
}
err := app.saveStory(Story)
if err != nil {
log.Fatal(err)
+ } else {
+ new_max_item.mu.Lock()
+ if Story.Id > new_max_item.max_item {
+ new_max_item.max_item = Story.Id
+ }
+ new_max_item.mu.Unlock()
}
/*
- */
+ */
}
/*
@@ -112,9 +126,21 @@ func (app *App) walkDown() {
if i%1000 == 0 {
log.Infof("%s: Getting item %d\n", time.Now(), i)
}
- }(i)
+ }(i, &new_max_item)
}
q.Wait()
+
+ if min_item == 0 {
+ err := app.createMaxStoredItem(new_max_item.max_item)
+ if err != nil {
+ log.Fatal(err)
+ }
+ } else if min_item != new_max_item.max_item && new_max_item.max_item != 0 {
+ err := app.updateNewMaxStoredItem(new_max_item.max_item)
+ if err != nil {
+ log.Fatal(err)
+ }
+ }
}
func getMaxItem() int {
@@ -215,7 +241,7 @@ func getStory(id int) (Story, bool) {
}
/**
- * Check if story links to movie platform
+ * Check if story links to movie platform
*/
is_movie, err := regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host)
if err != nil {
@@ -243,7 +269,7 @@ func getStory(id int) (Story, bool) {
return Story, false
}
if is_video {
- if ! duplicates[Story.Url] {
+ if !duplicates[Story.Url] {
var link Link
link.Url = normalizeUrl(Story.Url)
@@ -259,7 +285,7 @@ func getStory(id int) (Story, bool) {
}
/**
- * Check if story links to movie platform
+ * Check if story links to movie platform
*/
is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host)
if err != nil {
@@ -267,7 +293,7 @@ func getStory(id int) (Story, bool) {
return Story, false
}
if is_movie {
- if ! duplicates[Story.Url] {
+ if !duplicates[Story.Url] {
var link Link
link.Url = normalizeUrl(Story.Url)
@@ -287,13 +313,13 @@ func getStory(id int) (Story, bool) {
log.Debugf("StoryID: %d\n", Story.Id)
log.Debugf("StoryID: %d\n", Story.Text)
- */
+ */
/**
* This comment broke my code:
* https://news.ycombinator.com/item?id=27351340
*/
- tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>"+Story.Text+"</html>"))
+ tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>" + Story.Text + "</html>"))
if err != nil {
log.Errorf("Failed to parse html: %s\n", err.Error())
return Story, false
@@ -320,62 +346,61 @@ func getStory(id int) (Story, bool) {
doc.Find("a").Each(func(i int, s *goquery.Selection) {
- l, ok := s.Attr("href")
+ l, ok := s.Attr("href")
- if ok {
+ if ok {
- /**
- * Check for Youtube in text field
- */
- is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l)
- if err != nil {
- log.Fatal("Failed to parse and match regex: %s\n", err.Error())
- //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
- //return Story, false
- }
- if is_video {
- if ! duplicates[l] {
+ /**
+ * Check for Youtube in text field
+ */
+ is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l)
+ if err != nil {
+ log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+ //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ //return Story, false
+ }
+ if is_video {
+ if !duplicates[l] {
- var link Link
- link.Url = normalizeUrl(l)
- link.Field = 2
- Story.Links = append(Story.Links, link)
+ var link Link
+ link.Url = normalizeUrl(l)
+ link.Field = 2
+ Story.Links = append(Story.Links, link)
- log.Info("match youtube text")
- log.Infof("%+v\n", Story)
+ log.Info("match youtube text")
+ log.Infof("%+v\n", Story)
- duplicates[l] = true
- }
+ duplicates[l] = true
+ }
- }
+ }
- /**
- * Check for movie platforms in text field
- */
- is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l)
- if err != nil {
- log.Fatal("Failed to parse and match regex: %s\n", err.Error())
- //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
- //return Story, false
- }
- if is_movie {
- if ! duplicates[l] {
+ /**
+ * Check for movie platforms in text field
+ */
+ is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l)
+ if err != nil {
+ log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+ //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ //return Story, false
+ }
+ if is_movie {
+ if !duplicates[l] {
- var link Link
- link.Url = normalizeUrl(l)
- link.Field = 1
- Story.Links = append(Story.Links, link)
+ var link Link
+ link.Url = normalizeUrl(l)
+ link.Field = 1
+ Story.Links = append(Story.Links, link)
- log.Info("match moview platform text")
- log.Infof("%+v\n", Story)
+ log.Info("match moview platform text")
+ log.Infof("%+v\n", Story)
- duplicates[l] = true
+ duplicates[l] = true
+ }
}
}
- }
})
-
//Story.Url = normalizeUrl(Story.Url)
if len(Story.Links) > 0 {
diff --git a/sql.sql b/sql.sql
index 3a5d8da..c409e23 100644
--- a/sql.sql
+++ b/sql.sql
@@ -1,3 +1,5 @@
create table story( id int primary key auto_increment, story_id int not null unique, created_at timestamp, updated_at timestamp, type varchar(255) not null, title varchar(255) not null, text text, score int not null, descendants int not null, time int not null, poster varchar(255) not null);
create table links (id int primary key auto_increment, created_at timestamp, updated_at timestamp, story_id int not null, url varchar(255) not null, field int not null, foreign key(story_id) references story(id));
+
+create table max_item(id int primary key auto_increment, max_story_id int not null unique, created_at timestamp, updated_at timestamp, foreign key(max_story_id) references story(story_id));
diff --git a/struct.go b/struct.go
index a20c244..2b7538a 100644
--- a/struct.go
+++ b/struct.go
@@ -1,5 +1,9 @@
package main
+import (
+ "sync"
+)
+
type Story struct {
Id int
//Deleted bool
@@ -24,3 +28,8 @@ type Link struct {
type URL struct {
}
+
// syncMaxItem tracks the highest successfully-saved story id across the
// walkDown worker goroutines. It contains a Mutex, so it must be shared
// by pointer and never copied.
type syncMaxItem struct {
	max_item int // highest Story.Id saved so far; guarded by mu
	mu sync.Mutex // protects max_item
}