summaryrefslogtreecommitdiff
path: root/scrape.go
diff options
context:
space:
mode:
Diffstat (limited to 'scrape.go')
-rw-r--r--scrape.go84
1 file changed, 84 insertions, 0 deletions
diff --git a/scrape.go b/scrape.go
new file mode 100644
index 0000000..ebb02fa
--- /dev/null
+++ b/scrape.go
@@ -0,0 +1,84 @@
+package main
+
+import (
+ "time"
+
+ log "github.com/Sirupsen/logrus"
+ "github.com/gocolly/colly"
+)
+
+func (app *App) ScrapeHTML(platforms []Platform) {
+
+ wait := make(chan bool)
+ count := 0
+
+ for _, platform := range platforms {
+
+ go app.Scrape(platform, wait)
+ count++
+
+ }
+
+ // Wait until all go routines finished
+ for i := 0; i < count; i++ {
+ <-wait
+ }
+}
+
+func (app *App) Scrape(platform Platform, wait chan bool) {
+ var Entries []Entry
+ var err error
+
+ // retry on error
+ for i := 1; i < 4; i++ {
+ Entries = app.ScrapePlatform(platform)
+
+ if len(Entries) >= 1 {
+ break
+ }
+ }
+
+ // if no results, return early
+ if len(Entries) == 0 {
+ wait <- true
+ return
+
+ }
+
+ err = app.SaveEntries(Entries)
+ if err != nil {
+ Warn(err, "Saving entries failed. Platform: "+platform.Name)
+ }
+
+ wait <- true
+}
+
+func (app *App) ScrapePlatform(platform Platform) []Entry {
+
+ switch platform.Name {
+ case "Github":
+ return app.ScrapeGithub(platform)
+ default:
+ log.Println(platform.Name + ": No Crawler")
+ }
+
+ return []Entry{}
+}
+
+/*
+ * Sets the crawler config.
+ */
+func (app *App) customCollector(allowed_urls []string) *colly.Collector {
+ c := colly.NewCollector(
+ colly.UserAgent(app.Config.UserAgent),
+ colly.AllowedDomains(allowed_urls...),
+ )
+ c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT
+
+ c.Limit(&colly.LimitRule{
+ DomainGlob: "*",
+ RandomDelay: time.Duration(app.Config.Delay) * time.Second,
+ })
+
+ return c
+}