From 9cc3133c40439075233470ede48c5a8d7d68669f Mon Sep 17 00:00:00 2001 From: admin Date: Sun, 29 Mar 2026 18:53:12 +0200 Subject: synchronous crawling for categories --- categories.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'categories.go') diff --git a/categories.go b/categories.go index a1bd394..20ab0ee 100644 --- a/categories.go +++ b/categories.go @@ -4,6 +4,7 @@ import ( "encoding/json" "io/ioutil" "strings" + "time" "github.com/gocolly/colly" log "github.com/sirupsen/logrus" @@ -16,17 +17,19 @@ func (app *App) crawlForCategories(wiki_url string) ([]string, bool) { func (app *App) queryWMLabs(wiki_url string) ([]string, bool) { defer func() { recover() }() + log.Debug("queryWMLabs: getting categories for: " + wiki_url) + var categories []string title, hostname := getWikipediaTitle(wiki_url) wm_url := ("https://xtools.wmcloud.org/api/page/assessments/" + hostname + "/" + title) if "" == title || "/" == title { - log.Debug("queryWMLabs: empty title supplied. returning false") + log.Info("queryWMLabs: empty title supplied. returning false. wm_url: " + wiki_url) return []string{}, false } if "github.com" == hostname { - log.Debug("queryWMLabs: hostname == github.com, not wikipedia. returning false") + log.Info("queryWMLabs: hostname == github.com, not wikipedia. returning false") return []string{}, false } log.Debugf("queryWMLabs: wm_url: %s", wm_url) @@ -78,7 +81,7 @@ func (app *App) queryWMLabs(wiki_url string) ([]string, bool) { if len(categories) > 0 { return categories, true } - log.Debug("queryWMLabs: len(categories) == 0. returning false") + log.Info("queryWMLabs: len(categories) == 0. returning false") return categories, false } @@ -118,7 +121,7 @@ func (app *App) crawlWMLabs(wiki_url string) (Category, bool) { } func (app *App) saveAllCategories() { - rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL;") + rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL ORDER BY a.created_at DESC;") if err != nil { log.Fatal(err) } @@ -139,6 +142,8 @@ func (app *App) saveAllCategories() { } else { log.Debug("saveAllCategories: No categories for " + wiki_url) } + // Delay to not get blocked + time.Sleep(time.Duration(app.Config.Delay) * time.Second) } } -- cgit v1.2.3