diff options
| -rw-r--r-- | database.go | 2 | ||||
| -rw-r--r-- | github.go | 73 | ||||
| -rw-r--r-- | main.go | 7 | ||||
| -rw-r--r-- | scrape.go | 6 |
4 files changed, 81 insertions, 7 deletions
diff --git a/database.go b/database.go index 044fa08..7d56018 100644 --- a/database.go +++ b/database.go @@ -69,7 +69,7 @@ CREATE TABLE IF NOT EXISTS entry ( INSERT IGNORE INTO platform (id, name, url) VALUES ( NULL, "Github", - "https://github.com/trending" + "https://github.com/" ); ` _, err = app.DB.Exec(init_platform_query) @@ -1,5 +1,76 @@ package main +import ( + "strconv" + "strings" + + "github.com/gocolly/colly" +) + func (app *App) ScrapeGithub(platform Platform) []Entry { - return []Entry{} + var err error + + URL := platform.URL + Languages := app.GetLanguages() + UpdatePeriods := app.GetUpdatePeriods() + current_language := Language{} + current_update_period := UpdatePeriod{} + + Entries := []Entry{} + + c := app.customCollector([]string{"www.github.com", "github.com"}) + + c.OnHTML("ol.repo-list > li", func(e *colly.HTMLElement) { + entry := Entry{} + owner := Owner{} + + e.ForEach("div > h3", func(i int, e *colly.HTMLElement) { + entry.URL = URL + e.ChildAttr("a", "href") + entry.Title = e.ChildText("a") + owner.Name = strings.TrimSuffix(e.ChildText("a > span"), " /") + }) + + e.ForEach("div.py-1", func(i int, e *colly.HTMLElement) { + entry.Synopsis = e.ChildText("p") + }) + + e.ForEach("div.text-gray", func(i int, e *colly.HTMLElement) { + if i == 0 { + entry.Stars, err = strconv.Atoi(e.ChildText("a.muted-text")) + if err != nil { + Warn(err, "Github: Extracting stars from "+entry.Title+" failed") + } + } + }) + + owner.Platform = &platform + owner.URL = URL + owner.Name + + entry.Owner = &owner + entry.Platform = &platform + entry.Language = &current_language + entry.UpdatePeriod = &current_update_period + entry.Created_At = app.Now + + Entries = append(Entries, entry) + }) + + for _, l := range Languages { + + current_language = l + + for _, t := range UpdatePeriods { + + current_update_period = t + + CURRENT_URL := URL + "/trending/" + l.Name + "?since=" + t.Name + + err := c.Visit(CURRENT_URL) + if err != nil { + Warn(err, "Scraping Platform 
"+platform.Name+" failed with URL: "+CURRENT_URL) + } + } + } + + return Entries } @@ -12,7 +12,7 @@ import ( type App struct { Config *Config DB *sqlx.DB - Now int64 + Now time.Time Debug bool } @@ -25,7 +25,7 @@ func main() { // overwrite the global _conf = Config{} - app.Now = time.Now().Unix() + app.Now = time.Now() log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)) @@ -44,4 +44,7 @@ func main() { Fatal(err, "Creating table failed") } + platforms := app.GetPlatforms() + app.Scrape(platforms) + } @@ -7,14 +7,14 @@ import ( "github.com/gocolly/colly" ) -func (app *App) ScrapeHTML(platforms []Platform) { +func (app *App) Scrape(platforms []Platform) { wait := make(chan bool) count := 0 for _, platform := range platforms { - go app.Scrape(platform, wait) + go app.ScrapeHTML(platform, wait) count++ } @@ -25,7 +25,7 @@ func (app *App) ScrapeHTML(platforms []Platform) { } } -func (app *App) Scrape(platform Platform, wait chan bool) { +func (app *App) ScrapeHTML(platform Platform, wait chan bool) { var Entries []Entry var err error |
