diff options
Diffstat (limited to 'categories.go')
| -rw-r--r-- | categories.go | 145 |
1 files changed, 145 insertions, 0 deletions
diff --git a/categories.go b/categories.go new file mode 100644 index 0000000..2ee3448 --- /dev/null +++ b/categories.go @@ -0,0 +1,145 @@ +package main + +import ( + "encoding/json" + "io/ioutil" + "strings" + + log "github.com/Sirupsen/logrus" + "github.com/gocolly/colly" +) + +func (app *App) crawlForCategories(wiki_url string) ([]string, bool) { + return app.queryWMLabs(wiki_url) +} + +func (app *App) queryWMLabs(wiki_url string) ([]string, bool) { + defer func() { recover() }() + + var categories []string + + title, hostname := getWikipediaTitle(wiki_url) + wm_url := ("https://xtools.wmflabs.org/api/page/assessments/" + hostname + "/" + title) + + if "" == title || "/" == title { + return []string{}, false + } + + response := getResponse(wm_url) + resp_data, err := ioutil.ReadAll(response.Body) + if err != nil { + log.Warnf("queryWMLabs: Reading response data failed for %s", wm_url) + panic(err) + } + + var data map[string]interface{} + if err = json.Unmarshal(resp_data, &data); err != nil { + log.Warnf("queryWMLabs: Decoding JSON failed for: %s", wm_url) + panic(err) + } + + for k, v := range data { + if "project" != k && "elapsed_time" != k { + wp := v.(map[string]interface{}) + for k2, v2 := range wp { + if k2 == "wikiprojects" { + list := v2.(map[string]interface{}) + for k3, _ := range list { + cat := normalizeCategory(k3) + if "" != cat { + categories = append(categories, cat) + } + } + } + } + } + } + + if len(categories) > 0 { + return categories, true + } + return categories, false +} + +func (app *App) crawlWMLabs(wiki_url string) (Category, bool) { + var err error + + //path := strings.TrimPrefix(u.EscapedPath(), "/wiki/") + title, hostname := getWikipediaTitle(wiki_url) + wm_url := ("https://xtools.wmflabs.org/articleinfo/" + hostname + "/" + title) + + if "" == title || "/" == title { + return Category{}, false + } + + var category Category + c := colly.NewCollector() + + c.OnHTML(".sort-entry--wikiproject", func(e *colly.HTMLElement) { + category.Name = strings.TrimSpace(e.Text) + category.Url = strings.TrimSpace(e.Attr("href")) + }) + + err = c.Visit(wm_url) + if err != nil { + log.Fatal(err) + } + + if category.Name == "" || category.Url == "" { + log.Warnf("title: %s WM URL: %s \tWiki Url: %s", title, wm_url, wiki_url) + } else { + log.Warnf("crawler: %+v", category) + } + return category, true +} + +func (app *App) saveAllCategories() { + rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL;") + if err != nil { + log.Fatal(err) + } + + for rows.Next() { + var article_id int + var wiki_url string + + err = rows.Scan(&article_id, &wiki_url) + if err != nil { + log.Fatal(err) + } + + //category, ok := app.crawlWMLabs(wiki_url) + categories, ok := app.queryWMLabs(wiki_url) + if ok { + app.saveCategory(article_id, categories) + } else { + log.Debug("saveAllCategories: No categories for " + wiki_url) + } + } +} + +func normalizeCategory(s string) string { + cat := strings.TrimSpace(s) + cat = strings.TrimSuffix(s, "task force") + cat = strings.TrimSuffix(s, "taskforce") + cat = strings.TrimSuffix(s, "Taskforce") + cat = strings.TrimSuffix(s, "Task Force") + + if strings.Contains(strings.ToLower(cat), "articles") { + return "" + } + /** + * The category "Wikipedia" is very useful, but other occurrences are not. + */ + if strings.Contains(strings.ToLower(cat), "wikipedia") && cat != strings.ToLower("wikipedia") { + return "" + } + if strings.Contains(strings.ToLower(cat), "/wikiproject") { + cat = strings.ReplaceAll(cat, "WikiProject ", "") + cat = strings.ReplaceAll(cat, "wikiproject ", "") + cat = strings.ReplaceAll(cat, "Wikiproject ", "") + } + cat = strings.TrimSpace(cat) + + return cat +} |
