diff options
| -rw-r--r-- | categories.go | 13 |
| -rw-r--r-- | main.go | 33 |
2 files changed, 29 insertions, 17 deletions
diff --git a/categories.go b/categories.go index a1bd394..20ab0ee 100644 --- a/categories.go +++ b/categories.go @@ -4,6 +4,7 @@ import ( "encoding/json" "io/ioutil" "strings" + "time" "github.com/gocolly/colly" log "github.com/sirupsen/logrus" @@ -16,17 +17,19 @@ func (app *App) crawlForCategories(wiki_url string) ([]string, bool) { func (app *App) queryWMLabs(wiki_url string) ([]string, bool) { defer func() { recover() }() + log.Debug("queryWMLabs: getting categories for: " + wiki_url) + var categories []string title, hostname := getWikipediaTitle(wiki_url) wm_url := ("https://xtools.wmcloud.org/api/page/assessments/" + hostname + "/" + title) if "" == title || "/" == title { - log.Debug("queryWMLabs: empty title supplied. returning false") + log.Info("queryWMLabs: empty title supplied. returning false. wm_url: " + wiki_url) return []string{}, false } if "github.com" == hostname { - log.Debug("queryWMLabs: hostname == github.com, not wikipedia. returning false") + log.Info("queryWMLabs: hostname == github.com, not wikipedia. returning false") return []string{}, false } log.Debugf("queryWMLabs: wm_url: %s", wm_url) @@ -78,7 +81,7 @@ func (app *App) queryWMLabs(wiki_url string) ([]string, bool) { if len(categories) > 0 { return categories, true } - log.Debug("queryWMLabs: len(categories) == 0. returning false") + log.Info("queryWMLabs: len(categories) == 0. 
returning false") return categories, false } @@ -118,7 +121,7 @@ func (app *App) crawlWMLabs(wiki_url string) (Category, bool) { } func (app *App) saveAllCategories() { - rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL;") + rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL ORDER BY a.created_at DESC;") if err != nil { log.Fatal(err) } @@ -139,6 +142,8 @@ func (app *App) saveAllCategories() { } else { log.Debug("saveAllCategories: No categories for " + wiki_url) } + // Delay to not get blocked + time.Sleep(time.Duration(app.Config.Delay) * time.Second) } } @@ -43,8 +43,9 @@ func main() { defer app.DB.Close() //app.fixAllCategories() - //return + //app.saveAllCategories() + //return app.deleteOrphanedArticles() app.topStories() app.wikipediaFixAllUrls() @@ -80,7 +81,7 @@ func (app *App) walkDown() { //max_item := 47528683 //max_item := 46750000 - const maxRoutines = 10 + const maxRoutines = 20 q := queue.New(maxRoutines) defer q.Close() @@ -144,6 +145,7 @@ func (app *App) topStories() { } const maxRoutines = 20 + storyChannel := make(chan Story, len(story_ids)) q := queue.New(maxRoutines) defer q.Close() @@ -154,27 +156,32 @@ func (app *App) topStories() { defer q.Done() if ok { log.Infof("%+v\n", Story) + err = app.saveStory(Story) if err != nil { log.Fatal(err) } - /* - log.Debug("topStories: crawling for Categories") - categories, ok := app.crawlForCategories(Story.Url) - if ok { - article_id := app.getArticleIdFromUrl(Story.Url) - app.saveCategory(article_id, categories) - } else { - log.Warn("topStories: crawling for Categories: not ok") - time.Sleep(time.Duration(app.Config.Delay) * time.Second) - } - */ + log.Debugf("sending Story to channel: %+v\n", Story) + storyChannel <- Story } }(id) } q.Wait() + close(storyChannel) + + for story := range storyChannel { + log.Debug("topStories: 
crawling for Categories") + categories, ok := app.crawlForCategories(story.Url) + if ok { + article_id := app.getArticleIdFromUrl(story.Url) + app.saveCategory(article_id, categories) + } else { + log.Info("topStories: crawling for Categories: not ok. Check previous log output.") + } + time.Sleep(time.Duration(app.Config.Delay) * time.Second) + } } func (app *App) getStory(id int) (Story, bool) { |
