| author | admin | 2024-09-23 15:33:44 +0200 |
|---|---|---|
| committer | admin | 2024-09-23 15:33:44 +0200 |
| commit | 0b8d19eeae79030458b3e1492dd6b0c5a5bf2d75 (patch) | |
| tree | baa63f7d8c862c696affe5913553493c1f3e61c1 | |
| parent | b190512e951efdd1ed4642eed8726bf7bdf2c022 (diff) | |
| download | hncrawler-0b8d19eeae79030458b3e1492dd6b0c5a5bf2d75.tar.gz | |
saves the last seen max_story_id and stops the walk down there
| -rw-r--r-- | config.go | 3 |
|---|---|---|
| -rw-r--r-- | database.go | 134 |
| -rw-r--r-- | main.go | 151 |
| -rw-r--r-- | sql.sql | 2 |
| -rw-r--r-- | struct.go | 9 |
5 files changed, 197 insertions, 102 deletions
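
This commit makes the crawl resumable: each run records the highest story id it managed to store in a new `max_item` table, and when the new `OnlyUpdateStories` flag is set the next walk descends only to that checkpoint instead of all the way to item 0. A toy sketch of the control flow, using the names from the diff (`getMaxItem`, `getMaxStoredItem`) with stand-in bodies for the HN maxitem endpoint and the database read:

```go
package main

import "fmt"

// Stand-ins: the real getMaxItem queries the HN maxitem endpoint and the
// real getMaxStoredItem reads the checkpoint from the max_item table.
func getMaxItem() int       { return 100 } // newest item id upstream
func getMaxStoredItem() int { return 90 }  // checkpoint from the last run

func main() {
	onlyUpdateStories := true // the new OnlyUpdateStories config flag

	min_item := 0
	if onlyUpdateStories {
		min_item = getMaxStoredItem()
	}
	// Walk newest-to-oldest, stopping at the checkpoint instead of item 0.
	for i := getMaxItem(); i > min_item; i-- {
		fmt.Println("would fetch item", i)
	}
}
```

The per-file diffs follow.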
```diff
diff --git a/config.go b/config.go
--- a/config.go
+++ b/config.go
@@ -24,6 +24,7 @@ type Config struct {
 	BasicAuthPassword string
 	Debug             bool // sets log level to debug
+	OnlyUpdateStories bool
 }
 
 // Parses the configuration and sets the configuration struct.
@@ -35,6 +36,7 @@ func (c *Config) parseConfig(configFile string) {
 	viper.SetDefault("DB_Port", "3306")
 	viper.SetDefault("Debug", false)
+	viper.SetDefault("OnlyUpdateStories", false)
 	viper.SetDefault("Delay", 0) // needs some refactoring to truly respect robots.txt
@@ -101,4 +103,5 @@ func (c *Config) setsConfig() {
 	c.BasicAuthPassword = viper.GetString("BasicAuthPassword")
 	c.Debug = viper.GetBool("Debug")
+	c.OnlyUpdateStories = viper.GetBool("OnlyUpdateStories")
 }
```

```diff
diff --git a/database.go b/database.go
index e3ba060..31594b4 100644
--- a/database.go
+++ b/database.go
@@ -1,10 +1,12 @@
 package main
 
 import (
-	log "github.com/sirupsen/logrus"
 	"strconv"
+	log "github.com/sirupsen/logrus"
+	"database/sql"
+	_ "github.com/go-sql-driver/mysql"
 )
@@ -90,45 +92,45 @@ func (app *App) saveStory(s Story) error {
 	return nil
 
 	/*
-	query = `
-	INSERT IGNORE discussion (
-		id,
-		created_at,
-		updated_at,
-		article_id,
-		title,
-		source,
-		item_id,
-		source_url,
-		posted_on,
-		comments,
-		upvotes
-	) VALUES (
-		NULL,
-		?,
-		?,
-		(SELECT id FROM article WHERE url = ?),
-		?,
-		?,
-		?,
-		?,
-		?,
-		?,
-		?
-	);
-	`
-	stmt2, err := app.DB.Prepare(query)
-	if err != nil {
-		log.Warn("saveStory: Preparing second query failed")
-		return err
-	}
-	defer stmt2.Close()
+		query = `
+		INSERT IGNORE discussion (
+			id,
+			created_at,
+			updated_at,
+			article_id,
+			title,
+			source,
+			item_id,
+			source_url,
+			posted_on,
+			comments,
+			upvotes
+		) VALUES (
+			NULL,
+			?,
+			?,
+			(SELECT id FROM article WHERE url = ?),
+			?,
+			?,
+			?,
+			?,
+			?,
+			?,
+			?
+		);
+		`
+		stmt2, err := app.DB.Prepare(query)
+		if err != nil {
+			log.Warn("saveStory: Preparing second query failed")
+			return err
+		}
+		defer stmt2.Close()
 
-	_, err = stmt2.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score)
-	if err != nil {
-		log.Warn("saveStory: Statement execution failed")
-		return err
-	}
+		_, err = stmt2.Exec(app.Now, app.Now, s.Url, s.Title, "HN", s.Id, "https://news.ycombinator.com/item?id="+strconv.Itoa(s.Id), s.Time, s.Descendants, s.Score)
+		if err != nil {
+			log.Warn("saveStory: Statement execution failed")
+			return err
+		}
 	*/
 
 	return nil
@@ -424,3 +426,57 @@ func (app *App) getAllArticles() {
 		log.Println(article_id)
 	}
 }
+
+func (app *App) getMaxStoredItem() int {
+	row := app.DB.QueryRow("SELECT max_story_id FROM max_item ORDER BY id ASC LIMIT 1;")
+	var max_id int
+	err := row.Scan(&max_id)
+	if err == sql.ErrNoRows {
+		return 0
+	} else if err != nil {
+		log.Warnf("getMaxStoredItem failed\n")
+		log.Fatal(err)
+	}
+	return max_id
+}
+
+func (app *App) updateNewMaxStoredItem(new_max_item int) error {
+	query := "UPDATE max_item SET max_story_id = ?, updated_at = ?;"
+	stmt, err := app.DB.Prepare(query)
+	if err != nil {
+		log.Warn("updateNewMaxStoredItem: Preparing query failed")
+		return err
+	}
+	defer stmt.Close()
+
+	_, err = stmt.Exec(new_max_item, app.Now)
+	if err != nil {
+		log.Warnf("updateNewMaxStoredItem: Statement execution failed: %d\n", new_max_item)
+		return err
+	}
+	log.Infof("updateNewMaxStoredItem: updated max_story_id to: %d\n", new_max_item)
+
+	return nil
+}
+
+/**
+ * Creates new max_story_id
+ */
+func (app *App) createMaxStoredItem(new_max_item int) error {
+	stmt, err := app.DB.Prepare("INSERT INTO max_item VALUES (null, ?, ?, ?);")
+	if err != nil {
+		log.Warn("createMaxStoredItem: Preparing query failed")
+		return err
+	}
+	defer stmt.Close()
+
+	_, err = stmt.Exec(new_max_item, app.Now, app.Now)
+	if err != nil {
+		log.Warnf("createMaxStoredItem: Executing query failed with new_max_item: %d\n", new_max_item)
+		return err
+	}
+
+	log.Info("createMaxStoredItem: created MaxStoredItem for the first time")
+
+	return nil
+}
```
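
The three new helpers implement a single-row checkpoint: `getMaxStoredItem` returns 0 while the table is empty, `createMaxStoredItem` seeds the row after the first full walk, and `updateNewMaxStoredItem` overwrites it on later runs (note its `UPDATE` carries no `WHERE` clause, so it only behaves as intended while `max_item` holds exactly one row). A hedged alternative sketch, not part of the commit, collapses both write paths into one MySQL upsert by pinning the row to primary key 1:

```go
package checkpoint

import (
	"database/sql"
	"time"

	_ "github.com/go-sql-driver/mysql"
)

// saveCheckpoint is a sketch of an alternative to the commit's separate
// createMaxStoredItem/updateNewMaxStoredItem paths: pin the checkpoint row
// to primary key 1 and let MySQL upsert it. Assumes the max_item schema
// from sql.sql below.
func saveCheckpoint(db *sql.DB, maxItem int, now time.Time) error {
	_, err := db.Exec(
		`INSERT INTO max_item (id, max_story_id, created_at, updated_at)
		 VALUES (1, ?, ?, ?)
		 ON DUPLICATE KEY UPDATE max_story_id = ?, updated_at = ?`,
		maxItem, now, now, maxItem, now,
	)
	return err
}
```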
```diff
diff --git a/main.go b/main.go
--- a/main.go
+++ b/main.go
@@ -1,9 +1,9 @@
 package main
 
 import (
-	"html"
 	"encoding/json"
 	"fmt"
+	"html"
 	"io/ioutil"
 	"net/http"
 	"net/url"
@@ -12,10 +12,10 @@ import (
 	"strings"
 	"time"
 
+	"github.com/PuerkitoBio/goquery"
 	"github.com/anikhasibul/queue"
 	"github.com/jmoiron/sqlx"
 	log "github.com/sirupsen/logrus"
-	"github.com/PuerkitoBio/goquery"
 )
 
 type App struct {
@@ -45,10 +45,10 @@ func main() {
 	defer app.DB.Close()
 
 	/*
-	app.deleteOrphanedArticles()
-	app.topStories()
-	app.deleteOrphanedArticles()
-	app.updateAllDiscussions()
+		app.deleteOrphanedArticles()
+		app.topStories()
+		app.deleteOrphanedArticles()
+		app.updateAllDiscussions()
 	*/
 
 	app.walkDown()
@@ -62,9 +62,9 @@ func (app *App) walkDown() {
 	//var err error
-	//max_item := getMaxItem()
+	max_item := getMaxItem()
 	//max_item := 27351341
-	max_item := 27262623
+	//max_item := 27262623
 	//max_item := 41495306
 	//max_item := 36128477
 	//max_item := 32670334
@@ -81,29 +81,43 @@ func (app *App) walkDown() {
 	//max_item := 15038031
 	//max_item := 14450000
 
+	min_item := 0
+	var new_max_item syncMaxItem
+
+	if app.Config.OnlyUpdateStories {
+		min_item = app.getMaxStoredItem()
+	}
+	log.Infof("walkDown: max_item: %d; min_item: %d\n", max_item, min_item)
+
 	const maxRoutines = 400
 	//const maxRoutines = 1
 	q := queue.New(maxRoutines)
 	defer q.Close()
 
 	//for i := max_item; i > 22600000; i-- {
-	for i := max_item; i > 0; i-- {
+	for i := max_item; i > min_item; i-- {
 		q.Add()
-		go func(i int) {
+		go func(i int, new_max_item *syncMaxItem) {
 			defer q.Done()
 			Story, ok := getStory(i)
 			if ok {
-			if len(Story.Links) > 0 {
-				//log.Debugf("%+v\n", Story)
-				//log.Debugf("%+v\n", Story.Links)
+				if len(Story.Links) > 0 {
+					//log.Debugf("%+v\n", Story)
+					//log.Debugf("%+v\n", Story.Links)
 				}
 				err := app.saveStory(Story)
 				if err != nil {
 					log.Fatal(err)
+				} else {
+					new_max_item.mu.Lock()
+					if Story.Id > new_max_item.max_item {
+						new_max_item.max_item = Story.Id
+					}
+					new_max_item.mu.Unlock()
 				}
 				/*
-			*/
+				*/
 			}
 
 			/*
@@ -112,9 +126,21 @@ func (app *App) walkDown() {
 			if i%1000 == 0 {
 				log.Infof("%s: Getting item %d\n", time.Now(), i)
 			}
-		}(i)
+		}(i, &new_max_item)
 	}
 	q.Wait()
+
+	if min_item == 0 {
+		err := app.createMaxStoredItem(new_max_item.max_item)
+		if err != nil {
+			log.Fatal(err)
+		}
+	} else if min_item != new_max_item.max_item && new_max_item.max_item != 0 {
+		err := app.updateNewMaxStoredItem(new_max_item.max_item)
+		if err != nil {
+			log.Fatal(err)
+		}
+	}
 }
 
 func getMaxItem() int {
@@ -215,7 +241,7 @@ func getStory(id int) (Story, bool) {
 	}
 
 	/**
-	* Check if story links to movie platform
+	 * Check if story links to movie platform
 	 */
 	is_movie, err := regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host)
 	if err != nil {
@@ -243,7 +269,7 @@ func getStory(id int) (Story, bool) {
 		return Story, false
 	}
 	if is_video {
-		if ! duplicates[Story.Url] {
+		if !duplicates[Story.Url] {
 
 			var link Link
 			link.Url = normalizeUrl(Story.Url)
@@ -259,7 +285,7 @@ func getStory(id int) (Story, bool) {
 	}
 
 	/**
-	* Check if story links to movie platform
+	 * Check if story links to movie platform
 	 */
 	is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", u.Host)
 	if err != nil {
@@ -267,7 +293,7 @@ func getStory(id int) (Story, bool) {
 		return Story, false
 	}
 	if is_movie {
-		if ! duplicates[Story.Url] {
+		if !duplicates[Story.Url] {
 
 			var link Link
 			link.Url = normalizeUrl(Story.Url)
@@ -287,13 +313,13 @@ func getStory(id int) (Story, bool) {
 		log.Debugf("StoryID: %d\n", Story.Id)
 		log.Debugf("StoryID: %d\n", Story.Text)
-	*/
+	 */
 
 	/**
 	 * This comment broke my code:
 	 * https://news.ycombinator.com/item?id=27351340
 	 */
-	tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>"+Story.Text+"</html>"))
+	tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>" + Story.Text + "</html>"))
 	if err != nil {
 		log.Errorf("Failed to parse html: %s\n", err.Error())
 		return Story, false
@@ -320,62 +346,61 @@ func getStory(id int) (Story, bool) {
 
 	doc.Find("a").Each(func(i int, s *goquery.Selection) {
-	l, ok := s.Attr("href")
+		l, ok := s.Attr("href")
 
-	if ok {
+		if ok {
 
-	/**
-	* Check for Youtube in text field
-	*/
-	is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l)
-	if err != nil {
-		log.Fatal("Failed to parse and match regex: %s\n", err.Error())
-		//log.Errorf("Failed to parse and match regex: %s\n", err.Error())
-		//return Story, false
-	}
-	if is_video {
-		if ! duplicates[l] {
+			/**
+			 * Check for Youtube in text field
+			 */
+			is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l)
+			if err != nil {
+				log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+				//log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+				//return Story, false
+			}
+			if is_video {
+				if !duplicates[l] {
 
-			var link Link
-			link.Url = normalizeUrl(l)
-			link.Field = 2
-			Story.Links = append(Story.Links, link)
+					var link Link
+					link.Url = normalizeUrl(l)
+					link.Field = 2
+					Story.Links = append(Story.Links, link)
 
-			log.Info("match youtube text")
-			log.Infof("%+v\n", Story)
+					log.Info("match youtube text")
+					log.Infof("%+v\n", Story)
 
-			duplicates[l] = true
-		}
+					duplicates[l] = true
+				}
 
-	}
+			}
 
-	/**
-	* Check for movie platforms in text field
-	*/
-	is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l)
-	if err != nil {
-		log.Fatal("Failed to parse and match regex: %s\n", err.Error())
-		//log.Errorf("Failed to parse and match regex: %s\n", err.Error())
-		//return Story, false
-	}
-	if is_movie {
-		if ! duplicates[l] {
+			/**
+			 * Check for movie platforms in text field
+			 */
+			is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l)
+			if err != nil {
+				log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+				//log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+				//return Story, false
+			}
+			if is_movie {
+				if !duplicates[l] {
 
-			var link Link
-			link.Url = normalizeUrl(l)
-			link.Field = 1
-			Story.Links = append(Story.Links, link)
+					var link Link
+					link.Url = normalizeUrl(l)
+					link.Field = 1
+					Story.Links = append(Story.Links, link)
 
-			log.Info("match movie platform text")
-			log.Infof("%+v\n", Story)
+					log.Info("match movie platform text")
+					log.Infof("%+v\n", Story)
 
-			duplicates[l] = true
+					duplicates[l] = true
+				}
 			}
 		}
-	}
 	})
 
-	//Story.Url = normalizeUrl(Story.Url)
 
 	if len(Story.Links) > 0 {
```
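
Inside `walkDown`, every worker goroutine now reports the highest successfully saved story id through the mutex-guarded `syncMaxItem` (defined in struct.go below); reading `new_max_item.max_item` after `q.Wait()` needs no lock because all writers have finished by then. A minimal self-contained sketch of that pattern:

```go
package main

import (
	"fmt"
	"sync"
)

// Mirrors the commit's syncMaxItem: a mutex-guarded running maximum that
// many goroutines can update safely. Field names follow struct.go.
type syncMaxItem struct {
	max_item int
	mu       sync.Mutex
}

func (s *syncMaxItem) update(id int) {
	s.mu.Lock()
	if id > s.max_item {
		s.max_item = id
	}
	s.mu.Unlock()
}

func main() {
	var tracker syncMaxItem
	var wg sync.WaitGroup
	for i := 1; i <= 1000; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			tracker.update(id) // stands in for "story saved successfully"
		}(i)
	}
	wg.Wait()
	fmt.Println(tracker.max_item) // always 1000, regardless of scheduling
}
```

The result is 1000 independent of goroutine interleaving, which is exactly the property the checkpoint relies on.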
```diff
diff --git a/sql.sql b/sql.sql
--- a/sql.sql
+++ b/sql.sql
@@ -1,3 +1,5 @@
 create table story( id int primary key auto_increment, story_id int not null unique, created_at timestamp, updated_at timestamp, type varchar(255) not null, title varchar(255) not null, text text, score int not null, descendants int not null, time int not null, poster varchar(255) not null);
 
 create table links (id int primary key auto_increment, created_at timestamp, updated_at timestamp, story_id int not null, url varchar(255) not null, field int not null, foreign key(story_id) references story(id));
+
+create table max_item(id int primary key auto_increment, max_story_id int not null unique, created_at timestamp, updated_at timestamp, foreign key(max_story_id) references story(story_id));
```
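
The new `max_item` table is intended to hold a single checkpoint row, and its `max_story_id` must reference an existing `story.story_id`. Since the commit's `getMaxStoredItem` has to special-case `sql.ErrNoRows` before the first checkpoint exists, a hedged alternative (again not in the commit) lets SQL supply the zero default:

```go
package checkpoint

import "database/sql"

// getMaxStoredItemOrZero is a sketch of an alternative to the commit's
// getMaxStoredItem: COALESCE folds the "no checkpoint yet" case into 0,
// so the caller needs no sql.ErrNoRows branch.
func getMaxStoredItemOrZero(db *sql.DB) (int, error) {
	var maxID int
	err := db.QueryRow("SELECT COALESCE(MAX(max_story_id), 0) FROM max_item;").Scan(&maxID)
	return maxID, err
}
```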
```diff
diff --git a/struct.go b/struct.go
--- a/struct.go
+++ b/struct.go
@@ -1,5 +1,9 @@
 package main
 
+import (
+	"sync"
+)
+
 type Story struct {
 	Id int
 	//Deleted bool
@@ -24,3 +28,8 @@ type Link struct {
 
 type URL struct {
 }
+
+type syncMaxItem struct {
+	max_item int
+	mu       sync.Mutex
+}
```
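
For completeness, the same running maximum could be kept lock-free with `sync/atomic` (Go 1.19+). This is an alternative sketch, not what the commit does; at this contention level the mutex version in struct.go is entirely adequate:

```go
package maxtrack

import "sync/atomic"

// atomicMaxItem keeps a running maximum without a mutex by retrying a
// compare-and-swap until the stored value is at least id.
type atomicMaxItem struct{ v atomic.Int64 }

func (a *atomicMaxItem) update(id int64) {
	for {
		cur := a.v.Load()
		if id <= cur || a.v.CompareAndSwap(cur, id) {
			return // already at least id, or we successfully raised it
		}
	}
}
```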
