package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/abadojack/whatlanggo"
	"github.com/gocolly/colly"
	strip "github.com/grokify/html-strip-tags-go"
	log "github.com/sirupsen/logrus"
	stripmd "github.com/writeas/go-strip-markdown"
)

// githubHTTPTimeout bounds every README-related HTTP request so that a single
// stalled connection cannot hang the whole crawl.
const githubHTTPTimeout = 30 * time.Second

// githubHTTPClient is shared by all README requests so connections are reused.
var githubHTTPClient = &http.Client{Timeout: githubHTTPTimeout}

var (
	// readmeCodeFenceRe matches one fenced code block. The match is non-greedy
	// so that prose between two separate code blocks is kept.
	readmeCodeFenceRe = regexp.MustCompile("(?s)```.+?```")

	// readmeHTMLCommentRe matches one HTML comment.
	// NOTE(review): the pattern previously used at this step was `(?s).+`,
	// which matches the entire document and therefore always reduced the
	// README to the empty string. It is assumed the intent was to drop HTML
	// comments - confirm against the project history.
	readmeHTMLCommentRe = regexp.MustCompile(`(?s)<!--.*?-->`)
)

// ScrapeGithub visits the trending page of platform for every combination of
// language and update period known to app and returns one Entry per listed
// repository.
//
// Each entry carries copies of the language, platform and update period that
// were being crawled when it was found, plus the natural language detected
// from the repository's README. Pages that fail to load are reported through
// Warn and skipped.
func (app *App) ScrapeGithub(platform Platform) []Entry {
	baseURL := platform.URL
	languages := app.GetLanguages()
	updatePeriods := app.GetUpdatePeriods()

	// The collector callback reads these to learn which page it is parsing;
	// they are updated by the crawl loop below before every Visit.
	var currentLanguage Language
	var currentUpdatePeriod UpdatePeriod

	Entries := []Entry{}

	c := app.customCollector([]string{"www.github.com", "github.com"})

	c.OnHTML(".Box-row", func(e *colly.HTMLElement) {
		entry := Entry{}
		owner := Owner{}

		e.ForEach("h2", func(i int, e *colly.HTMLElement) {
			entry.URL = e.ChildAttr("a", "href")
			owner.Name = strings.TrimSuffix(e.ChildText("a > span"), " /")
			owner.Name = strings.TrimSpace(owner.Name)
			entry.Title = strings.TrimPrefix(e.ChildText("a"), owner.Name+" /")
			entry.Title = strings.TrimSpace(entry.Title)
		})

		e.ForEach("p.col-9", func(i int, e *colly.HTMLElement) {
			entry.Synopsis = e.Text
		})

		e.ForEach("a.Link--muted.d-inline-block.mr-3", func(i int, e *colly.HTMLElement) {
			if !strings.Contains(e.Attr("href"), "stargazers") {
				return
			}
			stars := strings.TrimSpace(strings.Replace(e.Text, ",", "", -1))
			count, err := strconv.Atoi(stars)
			if err != nil {
				Warn(err, "Github: Extracting stars from "+entry.Title+" failed")
			}
			entry.Stars = count
		})

		// Only the ID/Name (and URL for the platform) are copied, so every
		// entry owns its own values instead of pointing at the loop state.
		l := Language{}
		l.ID = currentLanguage.ID
		l.Name = currentLanguage.Name

		p := Platform{}
		p.ID = platform.ID
		p.Name = platform.Name
		p.URL = platform.URL

		u := UpdatePeriod{}
		u.ID = currentUpdatePeriod.ID
		u.Name = currentUpdatePeriod.Name

		owner.Platform = &platform
		owner.URL = baseURL + "/" + owner.Name

		entry.Owner = &owner
		entry.Platform = &p
		entry.Language = &l
		entry.UpdatePeriod = &u
		entry.Created_At = app.Now

		entry.NaturalLanguage = app.GithubGetLang(entry)
		switch entry.NaturalLanguage {
		case "Mandarin":
			entry.NaturalLanguage = "Chinese"
		case "undefined":
			entry.NaturalLanguage = ""
		}

		log.Println(entry.Title + " / " + entry.Synopsis + " --- " + entry.NaturalLanguage)
		log.Debugf("%+v\n", owner)
		log.Debugf("%+v\n", entry)

		Entries = append(Entries, entry)
	})

	for _, l := range languages {
		currentLanguage = l
		for _, t := range updatePeriods {
			currentUpdatePeriod = t

			// NOTE(review): l.Name is inserted into the path unescaped; this
			// presumably relies on names already being URL-safe slugs
			// (a name such as "c#" would turn the query into a fragment) -
			// verify against the stored language names.
			currentURL := platform.URL + "/trending/" + l.Name + "?since=" + t.Name
			log.Println("Crawling " + currentURL)

			if err := c.Visit(currentURL); err != nil {
				Warn(err, "Scraping Platform "+platform.Name+" failed with URL: "+currentURL)
			}
		}
	}

	return Entries
}

// GithubGetLang downloads the README of entry and returns the name of the
// natural language detected in it.
func (app *App) GithubGetLang(entry Entry) string {
	readme := app.GithubGetReadme(entry)
	return GetLangFromReadme(readme)
}

// GithubGetReadme returns the raw README text of the repository described by
// entry, or "" when it cannot be obtained.
//
// The README location is looked up through the GitHub API (authenticated with
// the basic-auth credentials from app.Config) and then downloaded from the
// reported download_url. Any failure is logged and results in "" so that one
// unreachable repository does not abort the surrounding crawl.
func (app *App) GithubGetReadme(entry Entry) string {
	apiURL := "https://api.github.com/repos" + entry.URL + "/readme"

	req, err := http.NewRequest(http.MethodGet, apiURL, nil)
	if err != nil {
		Warn(err, "Github: Building README request for "+entry.Title+" failed")
		return ""
	}
	req.Header.Set("accept", "application/json")
	req.Header.Set("User-Agent", ":)")
	req.SetBasicAuth(app.Config.BasicAuthUsername, app.Config.BasicAuthPassword)

	// The status is deliberately not checked here: an error reply is JSON
	// without a download_url and is handled by the skip branch below.
	_, apiBody, err := githubFetch(req)
	if err != nil {
		Warn(err, "Github: Requesting README metadata for "+entry.Title+" failed")
		return ""
	}

	var meta struct {
		DownloadURL string `json:"download_url"`
	}
	if err := json.Unmarshal(apiBody, &meta); err != nil {
		Warn(err, "Github: Decoding README metadata for "+entry.Title+" failed")
		return ""
	}
	if meta.DownloadURL == "" {
		log.Debugf("Skipping because empty map: \n%+v\n", entry)
		return ""
	}

	rawReq, err := http.NewRequest(http.MethodGet, meta.DownloadURL, nil)
	if err != nil {
		Warn(err, "Github: Building README download for "+entry.Title+" failed")
		return ""
	}
	status, readme, err := githubFetch(rawReq)
	if err == nil && status != http.StatusOK {
		// Without this an error page would be analysed as if it were the README.
		err = fmt.Errorf("unexpected status %d", status)
	}
	if err != nil {
		Warn(err, "Github: Downloading README for "+entry.Title+" failed")
		return ""
	}

	return string(readme)
}

// githubFetch executes req with the shared client and returns the response
// status code together with the fully read body. The body is always closed.
func githubFetch(req *http.Request) (int, []byte, error) {
	resp, err := githubHTTPClient.Do(req)
	if err != nil {
		return 0, nil, err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return resp.StatusCode, nil, err
	}
	return resp.StatusCode, body, nil
}

// GetLangFromReadme returns the name of the natural language readme is
// written in.
//
// Fenced code blocks, HTML comments, HTML tags and Markdown syntax are removed
// first so that only prose reaches the detector. Text recognised by IsChinese
// is reported as "Chinese"; everything else is classified by whatlanggo.
func GetLangFromReadme(readme string) string {
	readme = readmeCodeFenceRe.ReplaceAllString(readme, "")
	readme = readmeHTMLCommentRe.ReplaceAllString(readme, "")
	readme = strings.TrimSpace(readme)

	readme = strip.StripTags(readme)
	readme = strings.TrimSpace(readme)

	readme = stripmd.Strip(readme)
	readme = strings.TrimSpace(readme)

	if IsChinese(readme) {
		return "Chinese"
	}

	info := whatlanggo.Detect(readme)
	return info.Lang.String()
}