From 0b90b7a3b0f38f0babf4d788f4d7dd5e43253341 Mon Sep 17 00:00:00 2001 From: horus Date: Thu, 2 Apr 2020 21:53:30 +0200 Subject: Initial commit. --- wikipedia.go | 272 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 wikipedia.go (limited to 'wikipedia.go') diff --git a/wikipedia.go b/wikipedia.go new file mode 100644 index 0000000..338881a --- /dev/null +++ b/wikipedia.go @@ -0,0 +1,272 @@ +package main + +import ( + "encoding/json" + "regexp" + "strings" + //"strconv" + "io/ioutil" + "net/url" + + log "github.com/Sirupsen/logrus" + "github.com/gocolly/colly" +) + +func (app *App) crawlWikipedia(url string) { + c := colly.NewCollector() + + c.OnHTML("#mw-normal-catlinks", func(e *colly.HTMLElement) { + e.ForEach("ul > li > a", func(i int, e *colly.HTMLElement) { + log.Debug("Text: " + e.Text + " Title: " + e.Attr("title") + " Url: " + e.Attr("href")) + }) + }) + c.OnHTML("#firstHeading", func(e *colly.HTMLElement) { + log.Debug("Title: " + e.Text) + }) + + err := c.Visit(url) + if err != nil { + log.Fatal(err) + } +} + +func (app *App) getAllArticles() { + rows, err := app.DB.Query("SELECT DISTINCT article_id FROM discussion;") + if err != nil { + log.Fatal(err) + } + + for rows.Next() { + var article_id int + + err = rows.Scan(&article_id) + if err != nil { + log.Fatal(err) + } + + log.Println(article_id) + } +} + +func (app *App) wikipediaFixAllUrls() { + rows, err := app.DB.Query("SELECT id, url FROM article WHERE (url LIKE '%m.wiki%' OR url like 'http:%');") + if err != nil { + log.Fatal(err) + } + + for rows.Next() { + var id int + var url string + + err = rows.Scan(&id, &url) + if err != nil { + log.Fatal(err) + } + + //log.Debug(id, url) + + url = wikipediaNormalizeUrl(url) + + row := app.DB.QueryRow("SELECT count(*) FROM article WHERE url = ?", url) + var count int + err = row.Scan(&count) + if err != nil { + log.Fatal(err) + } + if 0 < count { + err = app.wikipediaMergeArticles(id, url) + if err != nil { + log.Fatal(err) + } + continue + } + err = app.updateArticleUrl(id, url) + if err != nil { + log.Fatal(err) + } + + //log.Debug("UPDATE article SET url = " + url + " WHERE id = " + strconv.Itoa(id)) + } +} + +func wikipediaNormalizeUrl(url string) string { + match, err := regexp.MatchString("^http://", url) + if err != nil { + log.Fatal(err) + } + if match { + r := regexp.MustCompile("^http://") + url = r.ReplaceAllString(url, "https://") + } + + match, err = regexp.MatchString("m.wikipedia.org", url) + if err != nil { + log.Fatal(err) + } + if match { + r := regexp.MustCompile("m.wikipedia.org") + url = r.ReplaceAllString(url, "wikipedia.org") + } + return url +} + +func (app *App) _changeTitle(id_to_delete int, correct_url string) { + var new_title string + + query := ` + SELECT * FROM + (SELECT title as old_title FROM article WHERE id = ?) as t1 + JOIN + (SELECT title as cur_title FROM article WHERE url = ?) as t2 + ; + ` + + row := app.DB.QueryRow(query, id_to_delete, correct_url) + var old_title string + var cur_title string + err := row.Scan(&old_title, &cur_title) + if err != nil { + log.Fatal(err) + } + + old_title = stripHNPrefix(old_title) + cur_title = stripHNPrefix(cur_title) + + if len(old_title) > len(cur_title) { + new_title = old_title + } else { + new_title = cur_title + } + + log.Printf("new_title: %s, old_title: %s, cur_title: %s \n", new_title, old_title, cur_title) +} + +func getWikipediaExcerpt(title string) string { + var err error + + response := getWikipediaResponse(title) + resp_data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + var data map[string]interface{} + if err = json.Unmarshal(resp_data, &data); err != nil { + log.Warn("getWikipediaExcerpt: Unmarshaling json failed") + log.Fatal(err) + } + defer func() { recover() }() + query := (data["query"]).(map[string]interface{}) + pages := query["pages"].(map[string]interface{}) + for _, site_id := range pages { + extract := site_id.(map[string]interface{}) + excerpt := (extract["extract"]).(string) + excerpt = strings.TrimSpace(excerpt) + if "" != excerpt { + return excerpt + } + } + return "" +} + +func getWikipediaTitle(full_url string) (string, string) { + u, err := url.Parse(full_url) + if err != nil { + log.Fatal(err) + } + var title string + title = u.Query().Get("title") + if "" == title { + title = strings.TrimPrefix(u.EscapedPath(), "/wiki/") + } + title = strings.TrimSpace(title) + return title, u.Hostname() +} + +func (app *App) saveExcerpts() error { + query := "SELECT id, url FROM article WHERE excerpt_html IS NULL;" + + rows, err := app.DB.Query(query) + if err != nil { + return err + } + + for rows.Next() { + var id int + var url string + + err = rows.Scan(&id, &url) + if err != nil { + log.Fatal(err) + } + + title, _ := getWikipediaTitle(url) + excerpt := getWikipediaExcerpt(title) + + query = "UPDATE article SET excerpt_html = ? WHERE id = ?" + stmt, err := app.DB.Prepare(query) + if err != nil { + log.Warn("saveExcerpts: Preparing query failed") + return err + } + defer stmt.Close() + + _, err = stmt.Exec(excerpt, id) + if err != nil { + log.Warn("saveExcerpts: Executing stmt failed") + return err + } + + } + return nil +} + +func wikipediaRealUrl(wiki_url string) string { + /** + * We don't change urls with parameters, because we would loose the context. + */ + if strings.Contains(wiki_url, "&") { + return wiki_url + } + + var err error + var fragment string + + u, err := url.Parse(wiki_url) + if err != nil { + log.Fatal(err) + } + + if u.Fragment != "" { + fragment = "#" + u.Fragment + } + + title, hostname := getWikipediaTitle(wiki_url) + + if title == "/" || title == "" { + return wiki_url + } + + response := getWikipediaRedirectResponse(hostname, title) + resp_data, err := ioutil.ReadAll(response.Body) + if err != nil { + panic(err) + } + + var data map[string]interface{} + if err = json.Unmarshal(resp_data, &data); err != nil { + log.Warn("wikipediaRealUrl: Unmarshaling json failed ", string(resp_data)) + log.Fatal(err) + } + defer func() { recover() }() + query := (data["query"]).(map[string]interface{}) + pages := query["pages"].(map[string]interface{}) + for _, site_id := range pages { + key := site_id.(map[string]interface{}) + canonical_url := key["canonicalurl"].(string) + if "" != canonical_url { + return canonical_url + fragment + } + } + return wiki_url +} -- cgit v1.2.3