summary | refs | log | tree | commit | diff
path: root/categories.go
diff options
context:
space:
mode:
Diffstat (limited to 'categories.go')
-rw-r--r-- categories.go 145
1 files changed, 145 insertions, 0 deletions
diff --git a/categories.go b/categories.go
new file mode 100644
index 0000000..2ee3448
--- /dev/null
+++ b/categories.go
@@ -0,0 +1,145 @@
+package main
+
+import (
+ "encoding/json"
+ "io/ioutil"
+ "strings"
+
+ log "github.com/Sirupsen/logrus"
+ "github.com/gocolly/colly"
+)
+
+// crawlForCategories looks up the categories for the Wikipedia article at
+// wiki_url. It delegates to queryWMLabs and passes both of its results
+// through unchanged.
+func (app *App) crawlForCategories(wiki_url string) ([]string, bool) {
+	categories, found := app.queryWMLabs(wiki_url)
+	return categories, found
+}
+
+// queryWMLabs asks the XTools page-assessments API which WikiProjects the
+// article at wiki_url is assessed under and returns their normalized names.
+//
+// The boolean result is true only when at least one category was found. Any
+// failure (empty title, unreadable response, malformed JSON, or an unexpected
+// panic) is logged and reported as false instead of propagating.
+func (app *App) queryWMLabs(wiki_url string) ([]string, bool) {
+	// Last-resort safety net, e.g. for a nil response from getResponse. The
+	// results are unnamed, so a recovered panic returns (nil, false).
+	defer func() {
+		if r := recover(); r != nil {
+			log.Warnf("queryWMLabs: recovered from panic for %s: %v", wiki_url, r)
+		}
+	}()
+
+	title, hostname := getWikipediaTitle(wiki_url)
+	if title == "" || title == "/" {
+		return []string{}, false
+	}
+	wm_url := "https://xtools.wmflabs.org/api/page/assessments/" + hostname + "/" + title
+
+	response := getResponse(wm_url)
+	// NOTE(review): assumes response.Body is an io.ReadCloser (as on
+	// *http.Response) - confirm against getResponse. Closing it releases the
+	// underlying connection.
+	defer response.Body.Close()
+
+	resp_data, err := ioutil.ReadAll(response.Body)
+	if err != nil {
+		log.Warnf("queryWMLabs: Reading response data failed for %s: %v", wm_url, err)
+		return nil, false
+	}
+
+	var data map[string]interface{}
+	if err := json.Unmarshal(resp_data, &data); err != nil {
+		log.Warnf("queryWMLabs: Decoding JSON failed for: %s: %v", wm_url, err)
+		return nil, false
+	}
+
+	categories := wikiProjectCategories(data)
+	return categories, len(categories) > 0
+}
+
+// wikiProjectCategories collects the normalized WikiProject names from a
+// decoded assessments response. Every top-level entry except "project" and
+// "elapsed_time" is treated as a page; entries that do not have the expected
+// object shape are skipped rather than aborting the whole lookup. The order
+// of the result follows map iteration and is therefore unspecified.
+func wikiProjectCategories(data map[string]interface{}) []string {
+	var categories []string
+	for key, value := range data {
+		if key == "project" || key == "elapsed_time" {
+			continue
+		}
+		page, ok := value.(map[string]interface{})
+		if !ok {
+			continue
+		}
+		projects, ok := page["wikiprojects"].(map[string]interface{})
+		if !ok {
+			continue
+		}
+		for name := range projects {
+			if cat := normalizeCategory(name); cat != "" {
+				categories = append(categories, cat)
+			}
+		}
+	}
+	return categories
+}
+
+// crawlWMLabs scrapes the XTools article-info page for the article at
+// wiki_url and returns the WikiProject entry found there.
+//
+// The boolean result is true only when both a name and a link were scraped.
+// A failed page visit is logged and reported as false rather than terminating
+// the process, matching how queryWMLabs reports failures.
+func (app *App) crawlWMLabs(wiki_url string) (Category, bool) {
+	title, hostname := getWikipediaTitle(wiki_url)
+	if title == "" || title == "/" {
+		return Category{}, false
+	}
+	wm_url := "https://xtools.wmflabs.org/articleinfo/" + hostname + "/" + title
+
+	var category Category
+	c := colly.NewCollector()
+
+	// Each matching element overwrites the previous values, so when several
+	// elements match, the last one handled is the one returned.
+	c.OnHTML(".sort-entry--wikiproject", func(e *colly.HTMLElement) {
+		category.Name = strings.TrimSpace(e.Text)
+		category.Url = strings.TrimSpace(e.Attr("href"))
+	})
+
+	if err := c.Visit(wm_url); err != nil {
+		log.Warnf("crawlWMLabs: visiting %s failed: %v", wm_url, err)
+		return Category{}, false
+	}
+
+	if category.Name == "" || category.Url == "" {
+		log.Warnf("title: %s WM URL: %s \tWiki Url: %s", title, wm_url, wiki_url)
+		return category, false
+	}
+	log.Warnf("crawler: %+v", category)
+	return category, true
+}
+
+// saveAllCategories looks up and stores categories for every article that has
+// no matching row in article_category yet.
+//
+// Database errors are fatal (log.Fatal). Articles for which no categories are
+// found are logged at debug level and skipped.
+func (app *App) saveAllCategories() {
+	rows, err := app.DB.Query("SELECT a.id, url FROM article AS a LEFT JOIN article_category AS j ON a.id = j.article_id WHERE j.id IS NULL;")
+	if err != nil {
+		log.Fatal(err)
+	}
+	// NOTE(review): assumes app.DB.Query returns a database/sql-style Rows -
+	// confirm. Release the result set on normal return; the log.Fatal paths
+	// exit the process, so they need no cleanup.
+	defer rows.Close()
+
+	for rows.Next() {
+		var article_id int
+		var wiki_url string
+
+		if err := rows.Scan(&article_id, &wiki_url); err != nil {
+			log.Fatal(err)
+		}
+
+		categories, ok := app.queryWMLabs(wiki_url)
+		if !ok {
+			log.Debug("saveAllCategories: No categories for " + wiki_url)
+			continue
+		}
+		app.saveCategory(article_id, categories)
+	}
+
+	// Next returns false both when the rows are exhausted and when iteration
+	// failed; Err tells the two apart.
+	if err := rows.Err(); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// normalizeCategory cleans up a WikiProject name for use as a category.
+//
+// It trims surrounding whitespace, strips a trailing "task force" or
+// "taskforce" (in any letter case), and removes the "WikiProject " label from
+// names that contain "/wikiproject". It returns "" for names that should be
+// dropped: anything mentioning "articles", and anything mentioning
+// "wikipedia" other than the bare category "Wikipedia" itself.
+func normalizeCategory(s string) string {
+	cat := strings.TrimSpace(s)
+
+	// Every step works on cat, not s, so the trims accumulate instead of each
+	// one discarding the result of the previous one.
+	for _, suffix := range []string{"task force", "taskforce"} {
+		n := len(cat) - len(suffix)
+		if n >= 0 && strings.EqualFold(cat[n:], suffix) {
+			cat = strings.TrimSpace(cat[:n])
+		}
+	}
+
+	lower := strings.ToLower(cat)
+	if strings.Contains(lower, "articles") {
+		return ""
+	}
+	// The category "Wikipedia" is very useful, but other occurrences are not.
+	// Compare case-insensitively so the capitalised form is kept too.
+	if strings.Contains(lower, "wikipedia") && lower != "wikipedia" {
+		return ""
+	}
+	if strings.Contains(lower, "/wikiproject") {
+		cat = strings.ReplaceAll(cat, "WikiProject ", "")
+		cat = strings.ReplaceAll(cat, "wikiproject ", "")
+		cat = strings.ReplaceAll(cat, "Wikiproject ", "")
+	}
+
+	return strings.TrimSpace(cat)
+}