package main

import (
	"time"

	"github.com/gocolly/colly"
	log "github.com/sirupsen/logrus"
)

// Scrape starts one scraper goroutine per platform and blocks until all of
// them have signalled completion.
func (app *App) Scrape(platforms []Platform) {
	wait := make(chan bool)
	count := 0
	for _, platform := range platforms {
		go app.ScrapeHTML(platform, wait)
		count++
	}
	// Wait until all goroutines have finished.
	for i := 0; i < count; i++ {
		<-wait
	}
}

// ScrapeHTML scrapes a single platform, saves the results and signals
// completion on the wait channel.
func (app *App) ScrapeHTML(platform Platform, wait chan bool) {
	var entries []Entry

	// Retry up to three times if the scrape returns no entries.
	for i := 1; i < 4; i++ {
		entries = app.ScrapePlatform(platform)
		if len(entries) >= 1 {
			break
		}
	}
	// If there are still no results, return early.
	if len(entries) == 0 {
		wait <- true
		return
	}
	if err := app.SaveEntries(entries); err != nil {
		Warn(err, "Saving entries failed. Platform: "+platform.Name)
	}
	wait <- true
}

// ScrapePlatform dispatches to the crawler that matches the platform name.
func (app *App) ScrapePlatform(platform Platform) []Entry {
	switch platform.Name {
	case "Github":
		return app.ScrapeGithub(platform)
	default:
		log.Println(platform.Name + ": No Crawler")
	}
	return []Entry{}
}

// customCollector builds a colly collector from the app config, restricted to
// the given domains.
func (app *App) customCollector(allowedDomains []string) *colly.Collector {
	c := colly.NewCollector(
		colly.UserAgent(app.Config.UserAgent),
		colly.AllowedDomains(allowedDomains...),
	)
	c.IgnoreRobotsTxt = app.Config.IgnoreRobotsTXT
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: time.Duration(app.Config.Delay) * time.Second,
	}); err != nil {
		log.Warn(err)
	}
	return c
}
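
// The sketch below is illustrative only and not part of the original file: it
// shows one way a platform-specific scraper such as ScrapeGithub could use
// customCollector together with colly's OnHTML callbacks. The CSS selectors,
// the trending URL and the Entry fields (Title, URL) are assumptions, since
// neither Entry's layout nor ScrapeGithub is defined in this file.
func (app *App) exampleScrapeGithub(platform Platform) []Entry {
	var entries []Entry

	// Limit the collector to the GitHub domain so AllowedDomains applies.
	c := app.customCollector([]string{"github.com"})

	// Collect one Entry per matched row; the selector is a placeholder.
	c.OnHTML("article.Box-row", func(e *colly.HTMLElement) {
		entries = append(entries, Entry{
			Title: e.ChildText("h2 a"),
			URL:   e.Request.AbsoluteURL(e.ChildAttr("h2 a", "href")),
		})
	})

	// Log failed requests instead of aborting the whole scrape.
	c.OnError(func(r *colly.Response, err error) {
		log.Warn(platform.Name, ": ", err)
	})

	// Visit blocks for a non-async collector, so entries is complete here.
	if err := c.Visit("https://github.com/trending"); err != nil {
		log.Warn(err)
	}

	return entries
}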