package main

import (
	"encoding/json"
	"io/ioutil"
	"strings"

	log "github.com/Sirupsen/logrus"
	"github.com/gocolly/colly"
)

// crawlForCategories returns the categories for the article at wiki_url.
// It delegates to queryWMLabs and shares its return contract.
func (app *App) crawlForCategories(wiki_url string) ([]string, bool) {
	return app.queryWMLabs(wiki_url)
}

// queryWMLabs asks the xtools page-assessments API for the article behind
// wiki_url and collects the normalized names found under each page's
// "wikiprojects" object.
//
// The boolean is true only when at least one non-empty category was found.
// Every failure (unusable title, unreadable body, undecodable JSON, or a
// panic in a helper) yields false; the lookup is best-effort and never
// propagates an error. The order of the returned categories is unspecified
// because it follows map iteration.
func (app *App) queryWMLabs(wiki_url string) ([]string, bool) {
	// Safety net: getResponse is defined elsewhere and may hand back an
	// unusable response. A recovered panic makes the function return its zero
	// values (nil, false), but is logged instead of being dropped silently.
	defer func() {
		if r := recover(); r != nil {
			log.Warnf("queryWMLabs: recovered from panic for %s: %v", wiki_url, r)
		}
	}()

	title, hostname := getWikipediaTitle(wiki_url)
	if title == "" || title == "/" {
		return []string{}, false
	}
	wmURL := "https://xtools.wmflabs.org/api/page/assessments/" + hostname + "/" + title

	response := getResponse(wmURL)
	// NOTE(review): assumes getResponse returns an *http.Response-like value
	// whose Body is an io.ReadCloser; confirm against its definition.
	defer response.Body.Close()

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Warnf("queryWMLabs: Reading response data failed for %s: %v", wmURL, err)
		return nil, false
	}

	var data map[string]interface{}
	if err := json.Unmarshal(body, &data); err != nil {
		log.Warnf("queryWMLabs: Decoding JSON failed for: %s: %v", wmURL, err)
		return nil, false
	}

	var categories []string
	for key, value := range data {
		// "project" and "elapsed_time" are metadata, not page entries.
		if key == "project" || key == "elapsed_time" {
			continue
		}
		// Skip entries of an unexpected shape instead of aborting the whole
		// lookup on a failed type assertion.
		page, isObject := value.(map[string]interface{})
		if !isObject {
			continue
		}
		projects, isObject := page["wikiprojects"].(map[string]interface{})
		if !isObject {
			continue
		}
		for name := range projects {
			if cat := normalizeCategory(name); cat != "" {
				categories = append(categories, cat)
			}
		}
	}

	return categories, len(categories) > 0
}

// crawlWMLabs scrapes the xtools "articleinfo" page for the article behind
// wiki_url and returns the WikiProject entry found there.
//
// Each element matching ".sort-entry--wikiproject" overwrites the previous
// one, so the last match wins. The boolean is true only when both a name and
// a URL were scraped; an unusable title, a failed visit, or an empty result
// returns the zero Category and false. saveAllCategories currently uses
// queryWMLabs rather than this scraper.
func (app *App) crawlWMLabs(wiki_url string) (Category, bool) {
	title, hostname := getWikipediaTitle(wiki_url)
	if title == "" || title == "/" {
		return Category{}, false
	}
	wmURL := "https://xtools.wmflabs.org/articleinfo/" + hostname + "/" + title

	var category Category
	c := colly.NewCollector()
	c.OnHTML(".sort-entry--wikiproject", func(e *colly.HTMLElement) {
		category.Name = strings.TrimSpace(e.Text)
		category.Url = strings.TrimSpace(e.Attr("href"))
	})

	// A single failed page must not terminate the whole process; report it
	// and let the caller move on, as queryWMLabs does.
	if err := c.Visit(wmURL); err != nil {
		log.Warnf("crawlWMLabs: visiting %s failed: %v", wmURL, err)
		return Category{}, false
	}

	if category.Name == "" || category.Url == "" {
		log.Warnf("title: %s WM URL: %s \tWiki Url: %s", title, wmURL, wiki_url)
		return Category{}, false
	}

	log.Debugf("crawler: %+v", category)
	return category, true
}

// saveAllCategories looks up and stores categories for every article that has
// no row in article_category yet. Database errors are fatal; articles for
// which no category could be found are logged at debug level and skipped.
func (app *App) saveAllCategories() {
	rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL;")
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	for rows.Next() {
		var articleID int
		var wikiURL string
		if err := rows.Scan(&articleID, &wikiURL); err != nil {
			log.Fatal(err)
		}

		categories, ok := app.queryWMLabs(wikiURL)
		if !ok {
			log.Debug("saveAllCategories: No categories for " + wikiURL)
			continue
		}
		app.saveCategory(articleID, categories)
	}

	// Next returning false can also mean the iteration broke off early.
	if err := rows.Err(); err != nil {
		log.Fatal(err)
	}
}

// normalizeCategory cleans up a raw WikiProject name.
//
// It trims surrounding whitespace and a trailing task-force / work-group
// qualifier, and returns "" for names that should be ignored: anything
// containing "articles", and anything containing "wikipedia" other than the
// bare category "Wikipedia" in any letter case. For names containing
// "/wikiproject" (case-insensitive) the "WikiProject " marker is removed.
func normalizeCategory(s string) string {
	cat := strings.TrimSpace(s)
	for _, suffix := range []string{"task force", "taskforce", "Taskforce", "Task Force", "work group"} {
		cat = strings.TrimSuffix(cat, suffix)
	}

	// cat is not modified by the checks below until the final replacement, so
	// one lower-cased copy serves all of them.
	lower := strings.ToLower(cat)

	if strings.Contains(lower, "articles") {
		return ""
	}

	// The category "Wikipedia" is very useful, but other occurrences are not.
	// Compare the lower-cased name so "Wikipedia" is kept regardless of case.
	if strings.Contains(lower, "wikipedia") && lower != "wikipedia" {
		return ""
	}

	if strings.Contains(lower, "/wikiproject") {
		for _, marker := range []string{"WikiProject ", "wikiproject ", "Wikiproject "} {
			cat = strings.ReplaceAll(cat, marker, "")
		}
	}

	return strings.TrimSpace(cat)
}