1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
package main
import (
"strconv"
"strings"
log "github.com/Sirupsen/logrus"
"github.com/gocolly/colly"
)
// ScrapeGithub crawls the GitHub trending pages of the given platform for
// every configured language/update-period combination and returns the
// scraped repository entries.
//
// NOTE(review): colly's Visit is synchronous by default, so the
// currentLanguage / currentUpdatePeriod variables mutated in the loop
// below are stable while the OnHTML callback runs — confirm the collector
// returned by app.customCollector is not configured with Async(true).
func (app *App) ScrapeGithub(platform Platform) []Entry {
	baseURL := platform.URL
	languages := app.GetLanguages()
	updatePeriods := app.GetUpdatePeriods()

	// Updated before each Visit and read by the OnHTML callback so each
	// scraped entry can be tagged with the language/period being crawled.
	currentLanguage := Language{}
	currentUpdatePeriod := UpdatePeriod{}

	entries := []Entry{}
	c := app.customCollector([]string{"www.github.com", "github.com"})

	// One callback invocation per trending-list item (repository).
	c.OnHTML("ol.repo-list > li", func(e *colly.HTMLElement) {
		entry := Entry{}
		owner := Owner{}

		// Heading is rendered as "<owner> / <title>"; split it apart.
		e.ForEach("div > h3", func(i int, el *colly.HTMLElement) {
			entry.URL = el.ChildAttr("a", "href")
			owner.Name = strings.TrimSuffix(el.ChildText("a > span"), " /")
			owner.Name = strings.TrimSpace(owner.Name)
			entry.Title = strings.TrimPrefix(el.ChildText("a"), owner.Name+" /")
			entry.Title = strings.TrimSpace(entry.Title)
		})

		// Short repository description.
		e.ForEach("div.py-1", func(i int, el *colly.HTMLElement) {
			entry.Synopsis = el.ChildText("p")
		})

		// Star count: the stargazers link text, e.g. "1,234" -> 1234.
		// On parse failure entry.Stars stays 0 (Atoi's zero result).
		e.ForEach("div.text-gray > a.muted-link", func(i int, el *colly.HTMLElement) {
			if strings.Contains(el.Attr("href"), "stargazers") {
				stars := strings.TrimSpace(strings.Replace(el.Text, ",", "", -1))
				// err is scoped here: the original shared one err across
				// every callback invocation (and shadowed it elsewhere).
				var err error
				entry.Stars, err = strconv.Atoi(stars)
				if err != nil {
					Warn(err, "Github: Extracting stars from "+entry.Title+" failed")
				}
			}
		})

		// Snapshot the current language/platform/period so each entry
		// owns independent copies rather than aliasing the loop state.
		lang := Language{}
		lang.ID = currentLanguage.ID
		lang.Name = currentLanguage.Name
		plat := Platform{}
		plat.ID = platform.ID
		plat.Name = platform.Name
		plat.URL = platform.URL
		period := UpdatePeriod{}
		period.ID = currentUpdatePeriod.ID
		period.Name = currentUpdatePeriod.Name

		owner.Platform = &platform
		// NOTE(review): assumes platform.URL ends with "/" (or the owner
		// path is appended verbatim) — confirm against stored Platform data.
		owner.URL = baseURL + owner.Name
		entry.Owner = &owner
		entry.Platform = &plat
		entry.Language = &lang
		entry.UpdatePeriod = &period
		entry.Created_At = app.Now

		log.Debugf("%+v\n", owner)
		log.Debugf("%+v\n", entry)
		entries = append(entries, entry)
	})

	// Visit one trending page per (language, period) pair; the callback
	// above appends into entries as each page is parsed.
	for _, lang := range languages {
		currentLanguage = lang
		for _, period := range updatePeriods {
			currentUpdatePeriod = period
			target := platform.URL + "/trending/" + lang.Name + "?since=" + period.Name
			log.Println("Crawling " + target)
			if err := c.Visit(target); err != nil {
				Warn(err, "Scraping Platform "+platform.Name+" failed with URL: "+target)
			}
		}
	}
	return entries
}
|