diff options
Diffstat (limited to 'github.go')
| -rw-r--r-- | github.go | 118 |
1 files changed, 113 insertions, 5 deletions
@@ -4,8 +4,17 @@ import ( "strconv" "strings" + "encoding/json" + "io/ioutil" + "net/http" + "regexp" + log "github.com/Sirupsen/logrus" "github.com/gocolly/colly" + + "github.com/abadojack/whatlanggo" + "github.com/grokify/html-strip-tags-go" + "github.com/writeas/go-strip-markdown" ) func (app *App) ScrapeGithub(platform Platform) []Entry { @@ -22,11 +31,11 @@ func (app *App) ScrapeGithub(platform Platform) []Entry { c := app.customCollector([]string{"www.github.com", "github.com"}) - c.OnHTML("ol.repo-list > li", func(e *colly.HTMLElement) { + c.OnHTML(".Box-row", func(e *colly.HTMLElement) { entry := Entry{} owner := Owner{} - e.ForEach("div > h3", func(i int, e *colly.HTMLElement) { + e.ForEach("h1", func(i int, e *colly.HTMLElement) { entry.URL = e.ChildAttr("a", "href") owner.Name = strings.TrimSuffix(e.ChildText("a > span"), " /") owner.Name = strings.TrimSpace(owner.Name) @@ -34,8 +43,8 @@ func (app *App) ScrapeGithub(platform Platform) []Entry { entry.Title = strings.TrimSpace(entry.Title) }) - e.ForEach("div.py-1", func(i int, e *colly.HTMLElement) { - entry.Synopsis = e.ChildText("p") + e.ForEach("p.col-9", func(i int, e *colly.HTMLElement) { + entry.Synopsis = e.Text }) e.ForEach("div.text-gray > a.muted-link", func(i int, e *colly.HTMLElement) { @@ -62,7 +71,7 @@ func (app *App) ScrapeGithub(platform Platform) []Entry { u.Name = current_update_period.Name owner.Platform = &platform - owner.URL = URL + owner.Name + owner.URL = URL + "/" + owner.Name entry.Owner = &owner entry.Platform = &p @@ -70,6 +79,20 @@ func (app *App) ScrapeGithub(platform Platform) []Entry { entry.UpdatePeriod = &u entry.Created_At = app.Now + entry.NaturalLanguage = app.GithubGetLang(entry) + + if "Mandarin" == entry.NaturalLanguage { + entry.NaturalLanguage = "Chinese" + } + if "undefined" == entry.NaturalLanguage { + entry.NaturalLanguage = "" + } + + log.Println(entry.Title + " / " + entry.Synopsis + " --- " + entry.NaturalLanguage) + + //language_info := whatlanggo.Detect(entry.Synopsis) + //entry.NaturalLanguage = language_info.Lang.String() + log.Debugf("%+v\n", owner) log.Debugf("%+v\n", entry) @@ -96,3 +119,88 @@ func (app *App) ScrapeGithub(platform Platform) []Entry { return Entries } + +func (app *App) GithubGetLang(entry Entry) string { + readme := app.GithubGetReadme(entry) + return GetLangFromReadme(readme) +} + +func (app *App) GithubGetReadme(entry Entry) string { + API_URL := "https://api.github.com/repos" + entry.URL + "/readme" + //log.Debug(API_URL) + + http_client := http.Client{} + req, err := http.NewRequest(http.MethodGet, API_URL, nil) + if err != nil { + // TODO + panic(err) + } + + req.Header.Set("accept", "application/json") + req.Header.Set("User-Agent", ":)") + + req.SetBasicAuth(app.Config.BasicAuthUsername, app.Config.BasicAuthPassword) + + api_resp, err := http_client.Do(req) + if err != nil { + // TODO + panic(err) + } + + api_body, err := ioutil.ReadAll(api_resp.Body) + if err != nil { + // TODO + panic(err) + } + + var tmp_api_map map[string]interface{} + + err = json.Unmarshal(api_body, &tmp_api_map) + if err != nil { + // TODO + + log.Printf("%+v\n", tmp_api_map) + log.Println("json unmarshal failed") + panic(err) + } + + if v := tmp_api_map["download_url"]; v == nil { + log.Debugf("Skipping because empty map: \n%+v\n", entry) + return "" + } + readme_url := tmp_api_map["download_url"].(string) + + res, err := http.Get(readme_url) + if err != nil { + log.Fatal(err) + } + readme, err := ioutil.ReadAll(res.Body) + if err != nil { + panic(err) + } + res.Body.Close() + + return string(readme) +} + +func GetLangFromReadme(readme string) string { + + r := regexp.MustCompile("(?s)```.+```") + r2 := regexp.MustCompile("(?s)<code>.+</code>") // I know! + readme = r.ReplaceAllString(readme, "") + readme = r2.ReplaceAllString(readme, "") + readme = strings.TrimSpace(readme) + + readme = strip.StripTags(readme) + readme = strings.TrimSpace(readme) + + readme = stripmd.Strip(readme) + readme = strings.TrimSpace(readme) + + if IsChinese(readme) { + return "Chinese" + } + + info := whatlanggo.Detect(readme) + return info.Lang.String() +} |
