summary refs log tree commit diff
path: root/main.go
diff options
context:
space:
mode:
author admin 2026-03-29 18:53:12 +0200
committer admin 2026-03-29 18:53:12 +0200
commit 9cc3133c40439075233470ede48c5a8d7d68669f (patch)
tree 6925fbc2e95c1e5425e335dfe97cff0a8011c9d2 /main.go
parent f3300bec030793d40115a08f46a7cbf49f06c2fd (diff)
download curious-crawler-9cc3133c40439075233470ede48c5a8d7d68669f.tar.gz
synchronous crawling for categories
Diffstat (limited to 'main.go')
-rw-r--r-- main.go 33
1 files changed, 20 insertions, 13 deletions
diff --git a/main.go b/main.go
index feecf27..d19598b 100644
--- a/main.go
+++ b/main.go
@@ -43,8 +43,9 @@ func main() {
defer app.DB.Close()
//app.fixAllCategories()
- //return
+ //app.saveAllCategories()
+ //return
app.deleteOrphanedArticles()
app.topStories()
app.wikipediaFixAllUrls()
@@ -80,7 +81,7 @@ func (app *App) walkDown() {
//max_item := 47528683
//max_item := 46750000
- const maxRoutines = 10
+ const maxRoutines = 20
q := queue.New(maxRoutines)
defer q.Close()
@@ -144,6 +145,7 @@ func (app *App) topStories() {
}
const maxRoutines = 20
+ storyChannel := make(chan Story, len(story_ids))
q := queue.New(maxRoutines)
defer q.Close()
@@ -154,27 +156,32 @@ func (app *App) topStories() {
defer q.Done()
if ok {
log.Infof("%+v\n", Story)
+
err = app.saveStory(Story)
if err != nil {
log.Fatal(err)
}
- /*
- log.Debug("topStories: crawling for Categories")
- categories, ok := app.crawlForCategories(Story.Url)
- if ok {
- article_id := app.getArticleIdFromUrl(Story.Url)
- app.saveCategory(article_id, categories)
- } else {
- log.Warn("topStories: crawling for Categories: not ok")
- time.Sleep(time.Duration(app.Config.Delay) * time.Second)
- }
- */
+ log.Debugf("sending Story to channel: %+v\n", Story)
+ storyChannel <- Story
}
}(id)
}
q.Wait()
+ close(storyChannel)
+
+ for story := range storyChannel {
+ log.Debug("topStories: crawling for Categories")
+ categories, ok := app.crawlForCategories(story.Url)
+ if ok {
+ article_id := app.getArticleIdFromUrl(story.Url)
+ app.saveCategory(article_id, categories)
+ } else {
+ log.Info("topStories: crawling for Categories: not ok. Check previous log output.")
+ }
+ time.Sleep(time.Duration(app.Config.Delay) * time.Second)
+ }
}
func (app *App) getStory(id int) (Story, bool) {