summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author admin 2026-03-29 18:53:12 +0200
committer admin 2026-03-29 18:53:12 +0200
commit 9cc3133c40439075233470ede48c5a8d7d68669f (patch)
tree 6925fbc2e95c1e5425e335dfe97cff0a8011c9d2
parent f3300bec030793d40115a08f46a7cbf49f06c2fd (diff)
download curious-crawler-9cc3133c40439075233470ede48c5a8d7d68669f.tar.gz
synchronous crawling for categories
-rw-r--r-- categories.go 13
-rw-r--r-- main.go 33
2 files changed, 29 insertions, 17 deletions
diff --git a/categories.go b/categories.go
index a1bd394..20ab0ee 100644
--- a/categories.go
+++ b/categories.go
@@ -4,6 +4,7 @@ import (
"encoding/json"
"io/ioutil"
"strings"
+ "time"
"github.com/gocolly/colly"
log "github.com/sirupsen/logrus"
@@ -16,17 +17,19 @@ func (app *App) crawlForCategories(wiki_url string) ([]string, bool) {
func (app *App) queryWMLabs(wiki_url string) ([]string, bool) {
defer func() { recover() }()
+ log.Debug("queryWMLabs: getting categories for: " + wiki_url)
+
var categories []string
title, hostname := getWikipediaTitle(wiki_url)
wm_url := ("https://xtools.wmcloud.org/api/page/assessments/" + hostname + "/" + title)
if "" == title || "/" == title {
- log.Debug("queryWMLabs: empty title supplied. returning false")
+ log.Info("queryWMLabs: empty title supplied. returning false. wm_url: " + wiki_url)
return []string{}, false
}
if "github.com" == hostname {
- log.Debug("queryWMLabs: hostname == github.com, not wikipedia. returning false")
+ log.Info("queryWMLabs: hostname == github.com, not wikipedia. returning false")
return []string{}, false
}
log.Debugf("queryWMLabs: wm_url: %s", wm_url)
@@ -78,7 +81,7 @@ func (app *App) queryWMLabs(wiki_url string) ([]string, bool) {
if len(categories) > 0 {
return categories, true
}
- log.Debug("queryWMLabs: len(categories) == 0. returning false")
+ log.Info("queryWMLabs: len(categories) == 0. returning false")
return categories, false
}
@@ -118,7 +121,7 @@ func (app *App) crawlWMLabs(wiki_url string) (Category, bool) {
}
func (app *App) saveAllCategories() {
- rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL;")
+ rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL ORDER BY a.created_at DESC;")
if err != nil {
log.Fatal(err)
}
@@ -139,6 +142,8 @@ func (app *App) saveAllCategories() {
} else {
log.Debug("saveAllCategories: No categories for " + wiki_url)
}
+ // Delay to not get blocked
+ time.Sleep(time.Duration(app.Config.Delay) * time.Second)
}
}
diff --git a/main.go b/main.go
index feecf27..d19598b 100644
--- a/main.go
+++ b/main.go
@@ -43,8 +43,9 @@ func main() {
defer app.DB.Close()
//app.fixAllCategories()
- //return
+ //app.saveAllCategories()
+ //return
app.deleteOrphanedArticles()
app.topStories()
app.wikipediaFixAllUrls()
@@ -80,7 +81,7 @@ func (app *App) walkDown() {
//max_item := 47528683
//max_item := 46750000
- const maxRoutines = 10
+ const maxRoutines = 20
q := queue.New(maxRoutines)
defer q.Close()
@@ -144,6 +145,7 @@ func (app *App) topStories() {
}
const maxRoutines = 20
+ storyChannel := make(chan Story, len(story_ids))
q := queue.New(maxRoutines)
defer q.Close()
@@ -154,27 +156,32 @@ func (app *App) topStories() {
defer q.Done()
if ok {
log.Infof("%+v\n", Story)
+
err = app.saveStory(Story)
if err != nil {
log.Fatal(err)
}
- /*
- log.Debug("topStories: crawling for Categories")
- categories, ok := app.crawlForCategories(Story.Url)
- if ok {
- article_id := app.getArticleIdFromUrl(Story.Url)
- app.saveCategory(article_id, categories)
- } else {
- log.Warn("topStories: crawling for Categories: not ok")
- time.Sleep(time.Duration(app.Config.Delay) * time.Second)
- }
- */
+ log.Debugf("sending Story to channel: %+v\n", Story)
+ storyChannel <- Story
}
}(id)
}
q.Wait()
+ close(storyChannel)
+
+ for story := range storyChannel {
+ log.Debug("topStories: crawling for Categories")
+ categories, ok := app.crawlForCategories(story.Url)
+ if ok {
+ article_id := app.getArticleIdFromUrl(story.Url)
+ app.saveCategory(article_id, categories)
+ } else {
+ log.Info("topStories: crawling for Categories: not ok. Check previous log output.")
+ }
+ time.Sleep(time.Duration(app.Config.Delay) * time.Second)
+ }
}
func (app *App) getStory(id int) (Story, bool) {