package main

import (
	"encoding/json"
	"io/ioutil"
	"net/url"
	"regexp"
	"strings"
	//"strconv"

	log "github.com/Sirupsen/logrus"
	"github.com/gocolly/colly"
)

// crawlWikipedia fetches an article page and logs its first heading and
// the category links from the #mw-normal-catlinks box.
func (app *App) crawlWikipedia(url string) {
	c := colly.NewCollector()
	c.OnHTML("#mw-normal-catlinks", func(e *colly.HTMLElement) {
		e.ForEach("ul > li > a", func(i int, e *colly.HTMLElement) {
			log.Debug("Text: " + e.Text + " Title: " + e.Attr("title") + " Url: " + e.Attr("href"))
		})
	})
	c.OnHTML("#firstHeading", func(e *colly.HTMLElement) {
		log.Debug("Title: " + e.Text)
	})
	err := c.Visit(url)
	if err != nil {
		log.Fatal(err)
	}
}

// getAllArticles logs the id of every article that has a discussion.
func (app *App) getAllArticles() {
	rows, err := app.DB.Query("SELECT DISTINCT article_id FROM discussion;")
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()
	for rows.Next() {
		var article_id int
		err = rows.Scan(&article_id)
		if err != nil {
			log.Fatal(err)
		}
		log.Println(article_id)
	}
}

// wikipediaFixAllUrls normalizes every plain-http or mobile Wikipedia URL
// in the article table. If another row already has the normalized URL the
// two articles are merged; otherwise the row is updated in place.
func (app *App) wikipediaFixAllUrls() {
	rows, err := app.DB.Query("SELECT id, url FROM article WHERE (url LIKE '%m.wiki%' OR url LIKE 'http:%');")
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()
	for rows.Next() {
		var id int
		var url string
		err = rows.Scan(&id, &url)
		if err != nil {
			log.Fatal(err)
		}
		//log.Debug(id, url)
		url = wikipediaNormalizeUrl(url)
		row := app.DB.QueryRow("SELECT count(*) FROM article WHERE url = ?", url)
		var count int
		err = row.Scan(&count)
		if err != nil {
			log.Fatal(err)
		}
		if 0 < count {
			err = app.wikipediaMergeArticles(id, url)
			if err != nil {
				log.Fatal(err)
			}
			continue
		}
		err = app.updateArticleUrl(id, url)
		if err != nil {
			log.Fatal(err)
		}
		//log.Debug("UPDATE article SET url = " + url + " WHERE id = " + strconv.Itoa(id))
	}
}

// mobileHost matches the mobile Wikipedia host with its dots escaped; \b
// guards the left edge so hosts like "team.wikipedia.org" are untouched
// while both "m.wikipedia.org" and "en.m.wikipedia.org" still match.
var mobileHost = regexp.MustCompile(`\bm\.wikipedia\.org`)

// wikipediaNormalizeUrl upgrades http:// to https:// and rewrites mobile
// hosts (e.g. en.m.wikipedia.org) to the desktop wikipedia.org host.
func wikipediaNormalizeUrl(url string) string {
	if strings.HasPrefix(url, "http://") {
		url = "https://" + strings.TrimPrefix(url, "http://")
	}
	return mobileHost.ReplaceAllString(url, "wikipedia.org")
}

// _changeTitle compares the title of the row about to be deleted with the
// title of the row at correct_url and picks the longer one once both have
// gone through stripHNPrefix. It currently only logs the choice.
func (app *App) _changeTitle(id_to_delete int, correct_url string) {
	var new_title string
	query := `
		SELECT * FROM
		(SELECT title as old_title FROM article WHERE id = ?) as t1
		JOIN
		(SELECT title as cur_title FROM article WHERE url = ?) as t2;
	`
	row := app.DB.QueryRow(query, id_to_delete, correct_url)
	var old_title string
	var cur_title string
	err := row.Scan(&old_title, &cur_title)
	if err != nil {
		log.Fatal(err)
	}
	old_title = stripHNPrefix(old_title)
	cur_title = stripHNPrefix(cur_title)
	if len(old_title) > len(cur_title) {
		new_title = old_title
	} else {
		new_title = cur_title
	}
	log.Printf("new_title: %s, old_title: %s, cur_title: %s\n", new_title, old_title, cur_title)
}

// getWikipediaExcerpt returns the first non-empty extract found in the
// API response for the given title, or "" if none is present.
func getWikipediaExcerpt(title string) string {
	response := getWikipediaResponse(title)
	defer response.Body.Close()
	resp_data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	var data map[string]interface{}
	if err = json.Unmarshal(resp_data, &data); err != nil {
		log.Warn("getWikipediaExcerpt: unmarshaling JSON failed")
		log.Fatal(err)
	}
	// Checked type assertions replace the old bare recover(), so a
	// response without the expected shape simply yields "".
	query, ok := data["query"].(map[string]interface{})
	if !ok {
		return ""
	}
	pages, ok := query["pages"].(map[string]interface{})
	if !ok {
		return ""
	}
	for _, page := range pages {
		extract, ok := page.(map[string]interface{})
		if !ok {
			continue
		}
		excerpt, _ := extract["extract"].(string)
		excerpt = strings.TrimSpace(excerpt)
		if "" != excerpt {
			return excerpt
		}
	}
	return ""
}

// getWikipediaTitle extracts the article title from a Wikipedia URL,
// preferring the ?title= parameter over the /wiki/ path, and returns it
// together with the hostname.
func getWikipediaTitle(full_url string) (string, string) {
	u, err := url.Parse(full_url)
	if err != nil {
		log.Fatal(err)
	}
	title := u.Query().Get("title")
	if "" == title {
		title = strings.TrimPrefix(u.EscapedPath(), "/wiki/")
	}
	title = strings.TrimSpace(title)
	return title, u.Hostname()
}

// saveExcerpts fetches and stores an excerpt for every article that does
// not have one yet.
func (app *App) saveExcerpts() error {
	rows, err := app.DB.Query("SELECT id, url FROM article WHERE excerpt_html IS NULL;")
	if err != nil {
		return err
	}
	defer rows.Close()
	// Prepare the update once, outside the loop; the old code re-prepared
	// the statement (and leaked a deferred Close) on every iteration.
	stmt, err := app.DB.Prepare("UPDATE article SET excerpt_html = ? WHERE id = ?")
	if err != nil {
		log.Warn("saveExcerpts: Preparing query failed")
		return err
	}
	defer stmt.Close()
	for rows.Next() {
		var id int
		var url string
		err = rows.Scan(&id, &url)
		if err != nil {
			log.Fatal(err)
		}
		title, _ := getWikipediaTitle(url)
		excerpt := getWikipediaExcerpt(title)
		_, err = stmt.Exec(excerpt, id)
		if err != nil {
			log.Warn("saveExcerpts: Executing stmt failed")
			return err
		}
	}
	return rows.Err()
}

// wikipediaRealUrl resolves redirects and returns the canonical URL for a
// Wikipedia page, re-attaching any #fragment from the original URL.
func wikipediaRealUrl(wiki_url string) string {
	// We don't change URLs with parameters, because we would lose the context.
	if strings.Contains(wiki_url, "&") {
		return wiki_url
	}
	u, err := url.Parse(wiki_url)
	if err != nil {
		log.Fatal(err)
	}
	var fragment string
	if u.Fragment != "" {
		fragment = "#" + u.Fragment
	}
	title, hostname := getWikipediaTitle(wiki_url)
	if title == "/" || title == "" {
		return wiki_url
	}
	response := getWikipediaRedirectResponse(hostname, title)
	defer response.Body.Close()
	resp_data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	var data map[string]interface{}
	if err = json.Unmarshal(resp_data, &data); err != nil {
		log.Warn("wikipediaRealUrl: unmarshaling JSON failed ", string(resp_data))
		log.Fatal(err)
	}
	// As above, checked assertions replace the old bare recover(); a
	// malformed response now falls back to the original URL instead of "".
	query, ok := data["query"].(map[string]interface{})
	if !ok {
		return wiki_url
	}
	pages, ok := query["pages"].(map[string]interface{})
	if !ok {
		return wiki_url
	}
	for _, page := range pages {
		info, ok := page.(map[string]interface{})
		if !ok {
			continue
		}
		canonical_url, _ := info["canonicalurl"].(string)
		if "" != canonical_url {
			return canonical_url + fragment
		}
	}
	return wiki_url
}
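// The helpers used above but not defined in this file (getWikipediaResponse,
// getWikipediaRedirectResponse, stripHNPrefix, wikipediaMergeArticles,
// updateArticleUrl, and the App type) live elsewhere in this package. As a
// minimal, hypothetical sketch only, the two HTTP helpers could be built on
// MediaWiki's action API, assuming prop=extracts serves the intro text and
// prop=info&inprop=url (with the redirects parameter) serves the canonical
// URL. The sketch is left commented out so it cannot collide with the real
// definitions; uncommenting it would also require importing "net/http".
//
//	func getWikipediaResponse(title string) *http.Response {
//		// TextExtracts: exintro limits the extract to the lead section.
//		endpoint := "https://en.wikipedia.org/w/api.php" +
//			"?action=query&prop=extracts&exintro&format=json&titles=" +
//			url.QueryEscape(title)
//		response, err := http.Get(endpoint)
//		if err != nil {
//			log.Fatal(err)
//		}
//		return response
//	}
//
//	func getWikipediaRedirectResponse(hostname, title string) *http.Response {
//		// prop=info&inprop=url exposes "canonicalurl"; the bare "redirects"
//		// parameter makes the API resolve redirects before reporting it.
//		endpoint := "https://" + hostname + "/w/api.php" +
//			"?action=query&prop=info&inprop=url&redirects&format=json&titles=" +
//			url.QueryEscape(title)
//		response, err := http.Get(endpoint)
//		if err != nil {
//			log.Fatal(err)
//		}
//		return response
//	}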