summaryrefslogtreecommitdiff
path: root/wikipedia.go
diff options
context:
space:
mode:
Diffstat (limited to 'wikipedia.go')
-rw-r--r--wikipedia.go272
1 files changed, 272 insertions, 0 deletions
diff --git a/wikipedia.go b/wikipedia.go
new file mode 100644
index 0000000..338881a
--- /dev/null
+++ b/wikipedia.go
@@ -0,0 +1,272 @@
+package main
+
+import (
+ "encoding/json"
+ "regexp"
+ "strings"
+ //"strconv"
+ "io/ioutil"
+ "net/url"
+
+ log "github.com/Sirupsen/logrus"
+ "github.com/gocolly/colly"
+)
+
+func (app *App) crawlWikipedia(url string) {
+ c := colly.NewCollector()
+
+ c.OnHTML("#mw-normal-catlinks", func(e *colly.HTMLElement) {
+ e.ForEach("ul > li > a", func(i int, e *colly.HTMLElement) {
+ log.Debug("Text: " + e.Text + " Title: " + e.Attr("title") + " Url: " + e.Attr("href"))
+ })
+ })
+ c.OnHTML("#firstHeading", func(e *colly.HTMLElement) {
+ log.Debug("Title: " + e.Text)
+ })
+
+ err := c.Visit(url)
+ if err != nil {
+ log.Fatal(err)
+ }
+}
+
+func (app *App) getAllArticles() {
+ rows, err := app.DB.Query("SELECT DISTINCT article_id FROM discussion;")
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for rows.Next() {
+ var article_id int
+
+ err = rows.Scan(&article_id)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ log.Println(article_id)
+ }
+}
+
+func (app *App) wikipediaFixAllUrls() {
+ rows, err := app.DB.Query("SELECT id, url FROM article WHERE (url LIKE '%m.wiki%' OR url like 'http:%');")
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for rows.Next() {
+ var id int
+ var url string
+
+ err = rows.Scan(&id, &url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ //log.Debug(id, url)
+
+ url = wikipediaNormalizeUrl(url)
+
+ row := app.DB.QueryRow("SELECT count(*) FROM article WHERE url = ?", url)
+ var count int
+ err = row.Scan(&count)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if 0 < count {
+ err = app.wikipediaMergeArticles(id, url)
+ if err != nil {
+ log.Fatal(err)
+ }
+ continue
+ }
+ err = app.updateArticleUrl(id, url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ //log.Debug("UPDATE article SET url = " + url + " WHERE id = " + strconv.Itoa(id))
+ }
+}
+
// wikipediaMobileHostRe matches the mobile host label "m" directly in front
// of "wikipedia.org". The label must start the string or follow a "." or "/",
// so hosts that merely end in "m" (e.g. "forum.wikipedia.org") are not
// touched, and the dots are escaped so they only match literal dots.
var wikipediaMobileHostRe = regexp.MustCompile(`(^|[./])m\.wikipedia\.org`)

// wikipediaNormalizeUrl returns url in canonical form:
//
//   - a leading "http://" is replaced by "https://";
//   - the mobile host label is dropped, e.g. "en.m.wikipedia.org" becomes
//     "en.wikipedia.org".
//
// URLs that need neither change are returned unchanged.
func wikipediaNormalizeUrl(url string) string {
	if strings.HasPrefix(url, "http://") {
		url = "https://" + strings.TrimPrefix(url, "http://")
	}
	// ${1} puts back the separator ("." or "/") that preceded the "m" label.
	return wikipediaMobileHostRe.ReplaceAllString(url, "${1}wikipedia.org")
}
+
+func (app *App) _changeTitle(id_to_delete int, correct_url string) {
+ var new_title string
+
+ query := `
+ SELECT * FROM
+ (SELECT title as old_title FROM article WHERE id = ?) as t1
+ JOIN
+ (SELECT title as cur_title FROM article WHERE url = ?) as t2
+ ;
+ `
+
+ row := app.DB.QueryRow(query, id_to_delete, correct_url)
+ var old_title string
+ var cur_title string
+ err := row.Scan(&old_title, &cur_title)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ old_title = stripHNPrefix(old_title)
+ cur_title = stripHNPrefix(cur_title)
+
+ if len(old_title) > len(cur_title) {
+ new_title = old_title
+ } else {
+ new_title = cur_title
+ }
+
+ log.Printf("new_title: %s, old_title: %s, cur_title: %s \n", new_title, old_title, cur_title)
+}
+
+func getWikipediaExcerpt(title string) string {
+ var err error
+
+ response := getWikipediaResponse(title)
+ resp_data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+
+ var data map[string]interface{}
+ if err = json.Unmarshal(resp_data, &data); err != nil {
+ log.Warn("getWikipediaExcerpt: Unmarshaling json failed")
+ log.Fatal(err)
+ }
+ defer func() { recover() }()
+ query := (data["query"]).(map[string]interface{})
+ pages := query["pages"].(map[string]interface{})
+ for _, site_id := range pages {
+ extract := site_id.(map[string]interface{})
+ excerpt := (extract["extract"]).(string)
+ excerpt = strings.TrimSpace(excerpt)
+ if "" != excerpt {
+ return excerpt
+ }
+ }
+ return ""
+}
+
// getWikipediaTitle extracts the article title and the host name from
// full_url.
//
// The title is the "title" query parameter when present and non-empty;
// otherwise it is the escaped URL path with a leading "/wiki/" removed. The
// title is whitespace-trimmed before it is returned. The second result is the
// URL's host name without port.
//
// An unparsable URL terminates the process via log.Fatal.
func getWikipediaTitle(full_url string) (string, string) {
	parsed, err := url.Parse(full_url)
	if err != nil {
		log.Fatal(err)
	}

	title := parsed.Query().Get("title")
	if title == "" {
		// No explicit ?title=...: fall back to the /wiki/<title> path form.
		title = strings.TrimPrefix(parsed.EscapedPath(), "/wiki/")
	}

	return strings.TrimSpace(title), parsed.Hostname()
}
+
+func (app *App) saveExcerpts() error {
+ query := "SELECT id, url FROM article WHERE excerpt_html IS NULL;"
+
+ rows, err := app.DB.Query(query)
+ if err != nil {
+ return err
+ }
+
+ for rows.Next() {
+ var id int
+ var url string
+
+ err = rows.Scan(&id, &url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ title, _ := getWikipediaTitle(url)
+ excerpt := getWikipediaExcerpt(title)
+
+ query = "UPDATE article SET excerpt_html = ? WHERE id = ?"
+ stmt, err := app.DB.Prepare(query)
+ if err != nil {
+ log.Warn("saveExcerpts: Preparing query failed")
+ return err
+ }
+ defer stmt.Close()
+
+ _, err = stmt.Exec(excerpt, id)
+ if err != nil {
+ log.Warn("saveExcerpts: Executing stmt failed")
+ return err
+ }
+
+ }
+ return nil
+}
+
+func wikipediaRealUrl(wiki_url string) string {
+ /**
+ * We don't change urls with parameters, because we would loose the context.
+ */
+ if strings.Contains(wiki_url, "&") {
+ return wiki_url
+ }
+
+ var err error
+ var fragment string
+
+ u, err := url.Parse(wiki_url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ if u.Fragment != "" {
+ fragment = "#" + u.Fragment
+ }
+
+ title, hostname := getWikipediaTitle(wiki_url)
+
+ if title == "/" || title == "" {
+ return wiki_url
+ }
+
+ response := getWikipediaRedirectResponse(hostname, title)
+ resp_data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+
+ var data map[string]interface{}
+ if err = json.Unmarshal(resp_data, &data); err != nil {
+ log.Warn("wikipediaRealUrl: Unmarshaling json failed ", string(resp_data))
+ log.Fatal(err)
+ }
+ defer func() { recover() }()
+ query := (data["query"]).(map[string]interface{})
+ pages := query["pages"].(map[string]interface{})
+ for _, site_id := range pages {
+ key := site_id.(map[string]interface{})
+ canonical_url := key["canonicalurl"].(string)
+ if "" != canonical_url {
+ return canonical_url + fragment
+ }
+ }
+ return wiki_url
+}