summary refs log tree commit diff
path: root/categories.go
diff options
context:
space:
mode:
author admin 2026-03-29 18:53:12 +0200
committer admin 2026-03-29 18:53:12 +0200
commit 9cc3133c40439075233470ede48c5a8d7d68669f (patch)
tree 6925fbc2e95c1e5425e335dfe97cff0a8011c9d2 /categories.go
parent f3300bec030793d40115a08f46a7cbf49f06c2fd (diff)
download curious-crawler-9cc3133c40439075233470ede48c5a8d7d68669f.tar.gz
synchronous crawling for categories
Diffstat (limited to 'categories.go')
-rw-r--r-- categories.go 13
1 files changed, 9 insertions, 4 deletions
diff --git a/categories.go b/categories.go
index a1bd394..20ab0ee 100644
--- a/categories.go
+++ b/categories.go
@@ -4,6 +4,7 @@ import (
"encoding/json"
"io/ioutil"
"strings"
+ "time"
"github.com/gocolly/colly"
log "github.com/sirupsen/logrus"
@@ -16,17 +17,19 @@ func (app *App) crawlForCategories(wiki_url string) ([]string, bool) {
func (app *App) queryWMLabs(wiki_url string) ([]string, bool) {
defer func() { recover() }()
+ log.Debug("queryWMLabs: getting categories for: " + wiki_url)
+
var categories []string
title, hostname := getWikipediaTitle(wiki_url)
wm_url := ("https://xtools.wmcloud.org/api/page/assessments/" + hostname + "/" + title)
if "" == title || "/" == title {
- log.Debug("queryWMLabs: empty title supplied. returning false")
+ log.Info("queryWMLabs: empty title supplied. returning false. wm_url: " + wiki_url)
return []string{}, false
}
if "github.com" == hostname {
- log.Debug("queryWMLabs: hostname == github.com, not wikipedia. returning false")
+ log.Info("queryWMLabs: hostname == github.com, not wikipedia. returning false")
return []string{}, false
}
log.Debugf("queryWMLabs: wm_url: %s", wm_url)
@@ -78,7 +81,7 @@ func (app *App) queryWMLabs(wiki_url string) ([]string, bool) {
if len(categories) > 0 {
return categories, true
}
- log.Debug("queryWMLabs: len(categories) == 0. returning false")
+ log.Info("queryWMLabs: len(categories) == 0. returning false")
return categories, false
}
@@ -118,7 +121,7 @@ func (app *App) crawlWMLabs(wiki_url string) (Category, bool) {
}
func (app *App) saveAllCategories() {
- rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL;")
+ rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL ORDER BY a.created_at DESC;")
if err != nil {
log.Fatal(err)
}
@@ -139,6 +142,8 @@ func (app *App) saveAllCategories() {
} else {
log.Debug("saveAllCategories: No categories for " + wiki_url)
}
+ // Delay to not get blocked
+ time.Sleep(time.Duration(app.Config.Delay) * time.Second)
}
}