summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: horus, 2020-04-19 18:35:27 +0200
committer: horus, 2020-04-19 18:35:27 +0200
commit: 8d3be0d8b623a405990448a5ea4fe471ab735ed7 (patch)
tree: 227a325b7a5972996fb499614a1913693de7a49f
parent: 5902524fb85aaf760f3e6a6695b2390868a7bd06 (diff)
download: ghrss-8d3be0d8b623a405990448a5ea4fe471ab735ed7.tar.gz
Update to get spoken language and minor upgrade the github handling.
-rw-r--r-- chinese.go 25
-rw-r--r-- config.go 6
-rw-r--r-- database.go 7
-rw-r--r-- github.go 118
-rw-r--r-- struct.go 21
5 files changed, 161 insertions, 16 deletions
diff --git a/chinese.go b/chinese.go
new file mode 100644
index 0000000..48ccd5e
--- /dev/null
+++ b/chinese.go
@@ -0,0 +1,25 @@
+package main
+
+import (
+ "unicode"
+)
+
+// IsChinese reports whether str should be treated as Chinese text.
+//
+// Every rune is classified as Han, whitespace (ignored) or "other". The
+// string counts as Chinese when it contains at least one Han rune and the
+// number of Han runes is at least one tenth (integer division) of the number
+// of other runes.
+func IsChinese(str string) bool {
+	hanCount := 0
+	otherCount := 0
+	for _, r := range str {
+		switch {
+		case unicode.Is(unicode.Han, r):
+			hanCount++
+		case !unicode.IsSpace(r):
+			otherCount++
+		}
+	}
+
+	// Without any Han rune the text cannot be Chinese. This guard matters
+	// because otherCount/10 truncates to 0 for fewer than ten other runes, so
+	// a bare ratio check would accept "" and short Latin strings like "hello".
+	if hanCount == 0 {
+		return false
+	}
+
+	// Roughly: Han runes amount to at least 10% of the other runes.
+	return hanCount >= otherCount/10
+}
diff --git a/config.go b/config.go
index cc4a366..43cdf8f 100644
--- a/config.go
+++ b/config.go
@@ -20,6 +20,9 @@ type Config struct {
Delay int
IgnoreRobotsTXT bool
+ BasicAuthUsername string
+ BasicAuthPassword string
+
Debug bool // sets log level to debug
}
@@ -98,5 +101,8 @@ func (c *Config) setsConfig() {
c.Delay = viper.GetInt("Delay")
c.IgnoreRobotsTXT = viper.GetBool("IgnoreRobotsTXT")
+ c.BasicAuthUsername = viper.GetString("BasicAuthUsername")
+ c.BasicAuthPassword = viper.GetString("BasicAuthPassword")
+
c.Debug = viper.GetBool("Debug")
}
diff --git a/database.go b/database.go
index c455d87..ad09e7d 100644
--- a/database.go
+++ b/database.go
@@ -47,6 +47,7 @@ CREATE TABLE IF NOT EXISTS entry (
stars VARCHAR(255) NOT NULL,
update_period INT NOT NULL,
created_at TIMESTAMP NOT NULL,
+ natural_language VARCHAR(255),
CONSTRAINT fk_entry_owner FOREIGN KEY (owner) REFERENCES owner(id),
CONSTRAINT fk_entry_platform FOREIGN KEY (platform) REFERENCES platform(id),
CONSTRAINT fk_entry_language FOREIGN KEY (language) REFERENCES language(id),
@@ -76,6 +77,8 @@ JOIN language
ON entry.language = language.id
JOIN update_period
ON entry.update_period = update_period.id
+ WHERE
+ natural_language != 'Chinese'
ORDER BY created_at DESC;
`,
}
@@ -155,6 +158,7 @@ func (app *App) SaveEntries(entries []Entry) error {
url,
language,
stars,
+ natural_language,
update_period,
created_at
) VALUES (
@@ -167,6 +171,7 @@ func (app *App) SaveEntries(entries []Entry) error {
?,
?,
?,
+ ?,
?
);
`
@@ -189,7 +194,7 @@ func (app *App) SaveEntries(entries []Entry) error {
if err != nil {
continue
}
- _, err = stmt.Exec(e.Title, e.Synopsis, e.Owner.ID, e.Platform.ID, e.URL, e.Language.ID, e.Stars, e.UpdatePeriod.ID, app.Now)
+ _, err = stmt.Exec(e.Title, e.Synopsis, e.Owner.ID, e.Platform.ID, e.URL, e.Language.ID, e.Stars, e.NaturalLanguage, e.UpdatePeriod.ID, app.Now)
if err != nil {
Warn(err, "SaveEntries: Statement execution failed")
diff --git a/github.go b/github.go
index 50564d6..d342752 100644
--- a/github.go
+++ b/github.go
@@ -4,8 +4,17 @@ import (
"strconv"
"strings"
+ "encoding/json"
+ "io/ioutil"
+ "net/http"
+ "regexp"
+
log "github.com/Sirupsen/logrus"
"github.com/gocolly/colly"
+
+ "github.com/abadojack/whatlanggo"
+ "github.com/grokify/html-strip-tags-go"
+ "github.com/writeas/go-strip-markdown"
)
func (app *App) ScrapeGithub(platform Platform) []Entry {
@@ -22,11 +31,11 @@ func (app *App) ScrapeGithub(platform Platform) []Entry {
c := app.customCollector([]string{"www.github.com", "github.com"})
- c.OnHTML("ol.repo-list > li", func(e *colly.HTMLElement) {
+ c.OnHTML(".Box-row", func(e *colly.HTMLElement) {
entry := Entry{}
owner := Owner{}
- e.ForEach("div > h3", func(i int, e *colly.HTMLElement) {
+ e.ForEach("h1", func(i int, e *colly.HTMLElement) {
entry.URL = e.ChildAttr("a", "href")
owner.Name = strings.TrimSuffix(e.ChildText("a > span"), " /")
owner.Name = strings.TrimSpace(owner.Name)
@@ -34,8 +43,8 @@ func (app *App) ScrapeGithub(platform Platform) []Entry {
entry.Title = strings.TrimSpace(entry.Title)
})
- e.ForEach("div.py-1", func(i int, e *colly.HTMLElement) {
- entry.Synopsis = e.ChildText("p")
+ e.ForEach("p.col-9", func(i int, e *colly.HTMLElement) {
+ entry.Synopsis = e.Text
})
e.ForEach("div.text-gray > a.muted-link", func(i int, e *colly.HTMLElement) {
@@ -62,7 +71,7 @@ func (app *App) ScrapeGithub(platform Platform) []Entry {
u.Name = current_update_period.Name
owner.Platform = &platform
- owner.URL = URL + owner.Name
+ owner.URL = URL + "/" + owner.Name
entry.Owner = &owner
entry.Platform = &p
@@ -70,6 +79,20 @@ func (app *App) ScrapeGithub(platform Platform) []Entry {
entry.UpdatePeriod = &u
entry.Created_At = app.Now
+ entry.NaturalLanguage = app.GithubGetLang(entry)
+
+ if "Mandarin" == entry.NaturalLanguage {
+ entry.NaturalLanguage = "Chinese"
+ }
+ if "undefined" == entry.NaturalLanguage {
+ entry.NaturalLanguage = ""
+ }
+
+ log.Println(entry.Title + " / " + entry.Synopsis + " --- " + entry.NaturalLanguage)
+
+ //language_info := whatlanggo.Detect(entry.Synopsis)
+ //entry.NaturalLanguage = language_info.Lang.String()
+
log.Debugf("%+v\n", owner)
log.Debugf("%+v\n", entry)
@@ -96,3 +119,88 @@ func (app *App) ScrapeGithub(platform Platform) []Entry {
return Entries
}
+
+// GithubGetLang returns the natural language detected for the repository
+// described by entry: the README is fetched with GithubGetReadme and its
+// text is classified by GetLangFromReadme.
+func (app *App) GithubGetLang(entry Entry) string {
+	return GetLangFromReadme(app.GithubGetReadme(entry))
+}
+
+// GithubGetReadme downloads the raw README text of the repository behind entry.
+//
+// It first queries https://api.github.com/repos<entry.URL>/readme, sending the
+// configured basic-auth credentials, and then fetches the file referenced by
+// the "download_url" field of the JSON answer.
+//
+// Any failure (request error, undecodable answer, missing download URL,
+// non-200 download) is logged and reported as an empty string, so that one
+// unreachable README does not abort the whole scrape run. Both response
+// bodies are always closed.
+func (app *App) GithubGetReadme(entry Entry) string {
+	apiURL := "https://api.github.com/repos" + entry.URL + "/readme"
+
+	req, err := http.NewRequest(http.MethodGet, apiURL, nil)
+	if err != nil {
+		log.Warnf("GithubGetReadme: building request for %s failed: %v", apiURL, err)
+		return ""
+	}
+	req.Header.Set("accept", "application/json")
+	req.Header.Set("User-Agent", ":)")
+	req.SetBasicAuth(app.Config.BasicAuthUsername, app.Config.BasicAuthPassword)
+
+	client := http.Client{}
+	apiResp, err := client.Do(req)
+	if err != nil {
+		log.Warnf("GithubGetReadme: request to %s failed: %v", apiURL, err)
+		return ""
+	}
+	defer apiResp.Body.Close()
+
+	apiBody, err := ioutil.ReadAll(apiResp.Body)
+	if err != nil {
+		log.Warnf("GithubGetReadme: reading answer of %s failed: %v", apiURL, err)
+		return ""
+	}
+
+	// Only the download URL is needed. Decoding into a typed field avoids an
+	// unchecked type assertion; a JSON null or an absent key leaves it empty.
+	var meta struct {
+		DownloadURL string `json:"download_url"`
+	}
+	if err := json.Unmarshal(apiBody, &meta); err != nil {
+		log.Warnf("GithubGetReadme: json unmarshal failed for %s: %v", apiURL, err)
+		return ""
+	}
+	if meta.DownloadURL == "" {
+		log.Debugf("Skipping because empty map: \n%+v\n", entry)
+		return ""
+	}
+
+	res, err := http.Get(meta.DownloadURL)
+	if err != nil {
+		log.Warnf("GithubGetReadme: downloading %s failed: %v", meta.DownloadURL, err)
+		return ""
+	}
+	defer res.Body.Close()
+
+	// An error page must not be mistaken for README content.
+	if res.StatusCode != http.StatusOK {
+		log.Warnf("GithubGetReadme: downloading %s returned status %s", meta.DownloadURL, res.Status)
+		return ""
+	}
+
+	readme, err := ioutil.ReadAll(res.Body)
+	if err != nil {
+		log.Warnf("GithubGetReadme: reading %s failed: %v", meta.DownloadURL, err)
+		return ""
+	}
+
+	return string(readme)
+}
+
+// Patterns for code sections that must not influence natural-language
+// detection. The quantifiers are lazy so that every code section is removed
+// on its own and the prose between two sections is kept.
+var (
+	fencedCodeRe = regexp.MustCompile("(?s)```.*?```")
+	codeTagRe    = regexp.MustCompile("(?s)<code>.*?</code>")
+)
+
+// GetLangFromReadme guesses the natural language a README is written in.
+//
+// Fenced code blocks and <code> elements are removed first, then HTML tags
+// and Markdown markup are stripped. If the remaining text passes IsChinese
+// the result is "Chinese"; otherwise it is the language name reported by
+// whatlanggo for that text.
+func GetLangFromReadme(readme string) string {
+	readme = fencedCodeRe.ReplaceAllString(readme, "")
+	readme = codeTagRe.ReplaceAllString(readme, "")
+	readme = strings.TrimSpace(readme)
+
+	readme = strip.StripTags(readme)
+	readme = strings.TrimSpace(readme)
+
+	readme = stripmd.Strip(readme)
+	readme = strings.TrimSpace(readme)
+
+	if IsChinese(readme) {
+		return "Chinese"
+	}
+
+	info := whatlanggo.Detect(readme)
+	return info.Lang.String()
+}
diff --git a/struct.go b/struct.go
index ccd2dc3..129d331 100644
--- a/struct.go
+++ b/struct.go
@@ -28,14 +28,15 @@ type Owner struct {
}
+// Entry is one scraped repository record. ScrapeGithub fills it in and
+// SaveEntries writes it to the entry table.
type Entry struct {
- ID int
- Title string
- Synopsis string
- Owner *Owner
- Platform *Platform
- URL string
- Language *Language
- Stars int
- UpdatePeriod *UpdatePeriod
- Created_At time.Time
+ ID int
+ Title string
+ Synopsis string
+ Owner *Owner
+ Platform *Platform
+ URL string
+ Language *Language
+ Stars int
+ // NaturalLanguage is the language name returned by GithubGetLang for the
+ // repository README; ScrapeGithub stores "Chinese" in place of "Mandarin"
+ // and "" in place of "undefined".
+ NaturalLanguage string
+ UpdatePeriod *UpdatePeriod
+ Created_At time.Time
}