From 6a771c754a9ede42fa9bff743087510a2168cf59 Mon Sep 17 00:00:00 2001 From: Maximilian Date: Wed, 17 Apr 2019 10:33:39 +0200 Subject: Adds crawler for Github. --- github.go | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) (limited to 'github.go') diff --git a/github.go b/github.go index f673d1d..410d847 100644 --- a/github.go +++ b/github.go @@ -1,5 +1,76 @@ package main +import ( + "strconv" + "strings" + + "github.com/gocolly/colly" +) + func (app *App) ScrapeGithub(platform Platform) []Entry { - return []Entry{} + var err error + + URL := platform.URL + Languages := app.GetLanguages() + UpdatePeriods := app.GetUpdatePeriods() + current_language := Language{} + current_update_period := UpdatePeriod{} + + Entries := []Entry{} + + c := app.customCollector([]string{"www.github.com", "github.com"}) + + c.OnHTML("ol.repo-list > li", func(e *colly.HTMLElement) { + entry := Entry{} + owner := Owner{} + + e.ForEach("div > h3", func(i int, e *colly.HTMLElement) { + entry.URL = URL + e.ChildAttr("a", "href") + entry.Title = e.ChildText("a") + owner.Name = strings.TrimSuffix(e.ChildText("a > span"), " /") + }) + + e.ForEach("div.py-1", func(i int, e *colly.HTMLElement) { + entry.Synopsis = e.ChildText("p") + }) + + e.ForEach("div.text-gray", func(i int, e *colly.HTMLElement) { + if i == 0 { + entry.Stars, err = strconv.Atoi(e.ChildText("a.muted-text")) + if err != nil { + Warn(err, "Github: Extracting stars from "+entry.Title+" failed") + } + } + }) + + owner.Platform = &platform + owner.URL = URL + owner.Name + + entry.Owner = &owner + entry.Platform = &platform + entry.Language = ¤t_language + entry.UpdatePeriod = ¤t_update_period + entry.Created_At = app.Now + + Entries = append(Entries, entry) + }) + + for _, l := range Languages { + + current_language = l + + for _, t := range UpdatePeriods { + + current_update_period = t + + CURRENT_URL := URL + "/trending/" + l.Name + "?since=" + t.Name + + err := c.Visit(CURRENT_URL) + if err != nil { + Warn(err, "Scraping Platform "+platform.Name+" failed with URL: "+CURRENT_URL) + } + } + } + + return Entries } -- cgit v1.2.3