summaryrefslogtreecommitdiff
path: root/github.go
diff options
context:
space:
mode:
authorhorus2020-04-19 18:35:27 +0200
committerhorus2020-04-19 18:35:27 +0200
commit8d3be0d8b623a405990448a5ea4fe471ab735ed7 (patch)
tree227a325b7a5972996fb499614a1913693de7a49f /github.go
parent5902524fb85aaf760f3e6a6695b2390868a7bd06 (diff)
downloadghrss-8d3be0d8b623a405990448a5ea4fe471ab735ed7.tar.gz
Update to detect the spoken language and make minor upgrades to the GitHub handling.
Diffstat (limited to 'github.go')
-rw-r--r--github.go118
1 files changed, 113 insertions, 5 deletions
diff --git a/github.go b/github.go
index 50564d6..d342752 100644
--- a/github.go
+++ b/github.go
@@ -4,8 +4,17 @@ import (
"strconv"
"strings"
+ "encoding/json"
+ "io/ioutil"
+ "net/http"
+ "regexp"
+
log "github.com/Sirupsen/logrus"
"github.com/gocolly/colly"
+
+ "github.com/abadojack/whatlanggo"
+ "github.com/grokify/html-strip-tags-go"
+ "github.com/writeas/go-strip-markdown"
)
func (app *App) ScrapeGithub(platform Platform) []Entry {
@@ -22,11 +31,11 @@ func (app *App) ScrapeGithub(platform Platform) []Entry {
c := app.customCollector([]string{"www.github.com", "github.com"})
- c.OnHTML("ol.repo-list > li", func(e *colly.HTMLElement) {
+ c.OnHTML(".Box-row", func(e *colly.HTMLElement) {
entry := Entry{}
owner := Owner{}
- e.ForEach("div > h3", func(i int, e *colly.HTMLElement) {
+ e.ForEach("h1", func(i int, e *colly.HTMLElement) {
entry.URL = e.ChildAttr("a", "href")
owner.Name = strings.TrimSuffix(e.ChildText("a > span"), " /")
owner.Name = strings.TrimSpace(owner.Name)
@@ -34,8 +43,8 @@ func (app *App) ScrapeGithub(platform Platform) []Entry {
entry.Title = strings.TrimSpace(entry.Title)
})
- e.ForEach("div.py-1", func(i int, e *colly.HTMLElement) {
- entry.Synopsis = e.ChildText("p")
+ e.ForEach("p.col-9", func(i int, e *colly.HTMLElement) {
+ entry.Synopsis = e.Text
})
e.ForEach("div.text-gray > a.muted-link", func(i int, e *colly.HTMLElement) {
@@ -62,7 +71,7 @@ func (app *App) ScrapeGithub(platform Platform) []Entry {
u.Name = current_update_period.Name
owner.Platform = &platform
- owner.URL = URL + owner.Name
+ owner.URL = URL + "/" + owner.Name
entry.Owner = &owner
entry.Platform = &p
@@ -70,6 +79,20 @@ func (app *App) ScrapeGithub(platform Platform) []Entry {
entry.UpdatePeriod = &u
entry.Created_At = app.Now
+ entry.NaturalLanguage = app.GithubGetLang(entry)
+
+ if "Mandarin" == entry.NaturalLanguage {
+ entry.NaturalLanguage = "Chinese"
+ }
+ if "undefined" == entry.NaturalLanguage {
+ entry.NaturalLanguage = ""
+ }
+
+ log.Println(entry.Title + " / " + entry.Synopsis + " --- " + entry.NaturalLanguage)
+
+ //language_info := whatlanggo.Detect(entry.Synopsis)
+ //entry.NaturalLanguage = language_info.Lang.String()
+
log.Debugf("%+v\n", owner)
log.Debugf("%+v\n", entry)
@@ -96,3 +119,88 @@ func (app *App) ScrapeGithub(platform Platform) []Entry {
return Entries
}
+
+// GithubGetLang reports the natural language used by the repository behind
+// entry. It fetches the repository README with GithubGetReadme and returns
+// whatever GetLangFromReadme detects in that text.
+func (app *App) GithubGetLang(entry Entry) string {
+	return GetLangFromReadme(app.GithubGetReadme(entry))
+}
+
+// GithubGetReadme downloads the raw README of the repository described by
+// entry and returns its contents as a string.
+//
+// It first asks the GitHub API for the README metadata, authenticating with
+// the basic-auth credentials from app.Config, and then fetches the file
+// behind the "download_url" field of that response.
+//
+// The README is only used for best-effort language detection, so every
+// failure (request error, unreadable body, malformed JSON, missing
+// download URL, non-200 download) is logged and reported as an empty string
+// instead of aborting the whole scrape.
+func (app *App) GithubGetReadme(entry Entry) string {
+	// NOTE(review): assumes entry.URL is the repository path with a leading
+	// slash ("/owner/repo"), as taken from the scraped href - confirm.
+	apiURL := "https://api.github.com/repos" + entry.URL + "/readme"
+
+	req, err := http.NewRequest(http.MethodGet, apiURL, nil)
+	if err != nil {
+		log.Warnf("building readme request for %q: %v", apiURL, err)
+		return ""
+	}
+	req.Header.Set("accept", "application/json")
+	req.Header.Set("User-Agent", ":)")
+	req.SetBasicAuth(app.Config.BasicAuthUsername, app.Config.BasicAuthPassword)
+
+	// One client serves both the metadata request and the download.
+	client := http.Client{}
+
+	apiResp, err := client.Do(req)
+	if err != nil {
+		log.Warnf("requesting readme metadata from %q: %v", apiURL, err)
+		return ""
+	}
+	// Close the body on every return path so the connection is released.
+	defer apiResp.Body.Close()
+
+	apiBody, err := ioutil.ReadAll(apiResp.Body)
+	if err != nil {
+		log.Warnf("reading readme metadata from %q: %v", apiURL, err)
+		return ""
+	}
+
+	// Only the download URL is needed; decoding into a typed field also
+	// rejects a non-string value instead of panicking on a type assertion.
+	var meta struct {
+		DownloadURL string `json:"download_url"`
+	}
+	if err := json.Unmarshal(apiBody, &meta); err != nil {
+		log.Warnf("decoding readme metadata from %q: %v", apiURL, err)
+		return ""
+	}
+	if meta.DownloadURL == "" {
+		log.Debugf("Skipping because empty map: \n%+v\n", entry)
+		return ""
+	}
+
+	res, err := client.Get(meta.DownloadURL)
+	if err != nil {
+		log.Warnf("downloading readme from %q: %v", meta.DownloadURL, err)
+		return ""
+	}
+	defer res.Body.Close()
+
+	// Do not feed an error page into the language detection.
+	if res.StatusCode != http.StatusOK {
+		log.Warnf("downloading readme from %q: unexpected status %s", meta.DownloadURL, res.Status)
+		return ""
+	}
+
+	readme, err := ioutil.ReadAll(res.Body)
+	if err != nil {
+		log.Warnf("reading readme from %q: %v", meta.DownloadURL, err)
+		return ""
+	}
+
+	return string(readme)
+}
+
+// Patterns for the code sections that GetLangFromReadme removes before
+// language detection. They are compiled once instead of on every call and
+// match lazily, so each code section is removed on its own and the prose
+// between two sections is kept.
+var (
+	readmeFencedCodeRe = regexp.MustCompile("(?s)```.*?```")
+	readmeCodeTagRe    = regexp.MustCompile("(?s)<code>.*?</code>") // I know!
+)
+
+// GetLangFromReadme detects the natural language a README is written in.
+//
+// Fenced code blocks and <code> elements are removed first, then HTML tags
+// and Markdown markup are stripped, so that only prose is analysed. If
+// IsChinese reports true for the remaining text the result is "Chinese";
+// otherwise it is the string form of the language whatlanggo detects.
+func GetLangFromReadme(readme string) string {
+	readme = readmeFencedCodeRe.ReplaceAllString(readme, "")
+	readme = readmeCodeTagRe.ReplaceAllString(readme, "")
+	readme = strings.TrimSpace(readme)
+
+	readme = strip.StripTags(readme)
+	readme = strings.TrimSpace(readme)
+
+	readme = stripmd.Strip(readme)
+	readme = strings.TrimSpace(readme)
+
+	if IsChinese(readme) {
+		return "Chinese"
+	}
+
+	return whatlanggo.Detect(readme).Lang.String()
+}