author    horus    2020-04-02 21:53:30 +0200
committer horus    2020-04-02 21:53:30 +0200
commit    0b90b7a3b0f38f0babf4d788f4d7dd5e43253341 (patch)
tree      a5492cf5246522a5dd0e201be3ae988ae7e6245c /main.go
download  curious-crawler-0b90b7a3b0f38f0babf4d788f4d7dd5e43253341.tar.gz
Initial commit.
Diffstat (limited to 'main.go')
-rw-r--r--    main.go    322
1 file changed, 322 insertions, 0 deletions
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..1548776
--- /dev/null
+++ b/main.go
@@ -0,0 +1,322 @@
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "net/http"
+ "net/url"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/AnikHasibul/queue"
+ log "github.com/Sirupsen/logrus"
+ "github.com/jmoiron/sqlx"
+)
+
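+// App bundles what every crawl step needs: the parsed configuration, the
+// shared database handle and the start time of this run.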
+type App struct {
+ Config *Config
+ DB *sqlx.DB
+ Now time.Time
+}
+
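+// main wires the parsed configuration into an App, connects to the database
+// and runs the crawl pipeline: prune orphaned articles, fetch top/best
+// stories, normalize Wikipedia URLs, prune again, store excerpts and refresh
+// recent discussions.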
+func main() {
+ var err error
+ // Copy the package-level config into the App; the global copy is cleared afterwards.
+ _own_conf := _conf
+ app := App{Config: &_own_conf}
+ _conf = Config{}
+
+ app.Now = time.Now()
+
+ log.Debugf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)
+
+ dsn := fmt.Sprintf("%s:%s@tcp(%s:%s)/%s?%s", app.Config.DBUser, app.Config.DBPassword, app.Config.DBHost, app.Config.DBPort, app.Config.DBDBName, app.Config.DBOptions)
+ app.DB, err = sqlx.Connect(app.Config.DBDriver, dsn)
+ if err != nil {
+ log.Fatal("Cannot connect to database: ", err)
+ }
+
+ if err = app.DB.Ping(); err != nil {
+ log.Fatal("No connection to database: ", err)
+ }
+ defer app.DB.Close()
+
+ //app.fixAllCategories()
+
+ app.deleteOrphanedArticles()
+ app.topStories()
+ app.wikipediaFixAllUrls()
+ app.deleteOrphanedArticles()
+ app.saveExcerpts()
+ //app.saveAllCategories()
+ app.updateAllDiscussions()
+ //app.walkDown()
+
+ /**
+ * Resolve redirects on stored urls.
+ */
+ //app.updateWikipediaUrls()
+ //app.saveAllCategories()
+ //return
+}
+
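+// walkDown walks Hacker News item IDs downwards from the current maximum to a
+// hard-coded floor and saves every story that passes getStory's filters, using
+// up to maxRoutines concurrent fetches. It is currently not called from main.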
+func (app *App) walkDown() {
+
+ max_item := getMaxItem()
+ //max_item := 22554000
+ //max_item := 22494596
+ //max_item := 22354383
+ //max_item := 18984000
+ //max_item := 18732000
+ //max_item := 16017000
+ //max_item := 15494000
+ //max_item := 15038031
+ //max_item := 14450000
+
+ const maxRoutines = 20
+
+ q := queue.New(maxRoutines)
+ defer q.Close()
+ for i := max_item; i > 22600000; i-- {
+ q.Add()
+ go func(i int) {
+ defer q.Done()
+
+ Story, ok := getStory(i)
+ if ok {
+ log.Infof("%+v\n", Story)
+ // Use a local err; writing to a variable shared across goroutines is a data race.
+ if err := app.saveStory(Story); err != nil {
+ log.Fatal(err)
+ }
+ }
+
+ /*
+ * Prints status update every 1000th entry
+ */
+ if i%1000 == 0 {
+ log.Debugf("%s: Getting item %d\n", time.Now(), i)
+ }
+ }(i)
+ }
+ q.Wait()
+}
+
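+// getMaxItem returns the highest item ID currently known to the Hacker News
+// Firebase API.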
+func getMaxItem() int {
+ response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json")
+ if err != nil {
+ panic(err)
+ }
+ defer response.Body.Close()
+ data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+ max_item, err := strconv.Atoi(string(data))
+ if err != nil {
+ panic(err)
+ }
+
+ return max_item
+}
+
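+// topStories merges the IDs from the top-stories and best-stories endpoints,
+// saves each qualifying story and stores any categories crawlForCategories
+// finds for its URL.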
+func (app *App) topStories() {
+ var err error
+
+ // Merge the top-stories and best-stories JSON arrays into a single array by
+ // stripping the closing bracket of the first and the opening bracket of the
+ // second, then joining them with a comma. IDs present in both lists are kept
+ // twice and simply saved again.
+ data1 := strings.TrimSuffix(string(getTopStories()), "]")
+ data2 := strings.TrimPrefix(string(getBestStories()), "[")
+
+ data := data1 + "," + data2
+
+ var story_ids []int
+ err = json.Unmarshal([]byte(data), &story_ids)
+ if err != nil {
+ log.Warn("topStories: Unmarshaling json failed")
+ panic(err)
+ }
+
+ const maxRoutines = 20
+
+ q := queue.New(maxRoutines)
+ defer q.Close()
+ for _, id := range story_ids {
+ q.Add()
+ go func(id int) {
+ // Register Done before any work so q.Wait() cannot hang if getStory panics.
+ defer q.Done()
+ Story, ok := getStory(id)
+ if ok {
+ log.Infof("%+v\n", Story)
+ // Local err avoids a data race on the err variable shared by the goroutines.
+ if err := app.saveStory(Story); err != nil {
+ log.Fatal(err)
+ }
+
+ categories, ok := app.crawlForCategories(Story.Url)
+ if ok {
+ article_id := app.getArticleIdFromUrl(Story.Url)
+ app.saveCategory(article_id, categories)
+ }
+
+ }
+ }(id)
+ }
+ q.Wait()
+}
+
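+// getStory fetches one item and reports whether it is worth keeping: dead,
+// deleted and low-engagement items are rejected, and only GitHub and
+// Wikipedia links are accepted. Wikipedia URLs are normalized and resolved to
+// their canonical form before being returned.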
+func getStory(id int) (Story, bool) {
+ Story := getDetail(id)
+ if Story.Dead || Story.Deleted {
+ return Story, false
+ }
+ if Story.Score < 10 && Story.Descendants < 10 {
+ return Story, false
+ }
+ /*
+ if (time.Now().Unix() - 3456000) > int64(Story.Time) {
+ }
+ */
+
+ Story.Title = stripHNPrefix(Story.Title)
+
+ u, err := url.Parse(Story.Url)
+ if err != nil {
+ log.Warnf("getStory: Parsing URL failed: %s\n", err.Error())
+ return Story, false
+ }
+ is_gh, err := regexp.MatchString(`(^|\.)github\.com$`, u.Host)
+ if err != nil {
+ log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ return Story, false
+ }
+ is_wiki, err := regexp.MatchString(`(^|\.)wikipedia\.org$`, u.Host)
+ if err != nil {
+ log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ return Story, false
+ }
+ if is_gh {
+ return Story, true
+ }
+ if is_wiki {
+ Story.Url = wikipediaNormalizeUrl(Story.Url)
+ Story.Url = wikipediaRealUrl(Story.Url)
+ return Story, true
+ }
+ return Story, false
+}
+
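+// getResponse GETs the given URL, retrying a few times on network errors
+// before giving up with a panic.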
+func getResponse(url string) *http.Response {
+ var err error
+ var response *http.Response
+
+ response, err = http.Get(url)
+ if err != nil {
+ for i := 0; i < 4; i++ {
+ log.Warn("getDetail: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
+ resp2, err2 := http.Get(url)
+ if err2 == nil {
+ return resp2
+ }
+ }
+ panic(err)
+ }
+ return response
+}
+
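+// getBestResponse fetches the beststories ID list from the Hacker News API.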
+func getBestResponse() *http.Response {
+ _url := "https://hacker-news.firebaseio.com/v0/beststories.json"
+ return getResponse(_url)
+}
+
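+// getTopResponse fetches the topstories ID list from the Hacker News API.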
+func getTopResponse() *http.Response {
+ _url := "https://hacker-news.firebaseio.com/v0/topstories.json"
+ return getResponse(_url)
+}
+
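+// getWikipediaResponse asks the Wikipedia API for the intro extract of the
+// given article title.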
+func getWikipediaResponse(title string) *http.Response {
+ _url := "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=" + title
+ return getResponse(_url)
+}
+
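+// getWikipediaRedirectResponse asks the given Wikipedia host to resolve
+// redirects for a title and return its canonical URL info.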
+func getWikipediaRedirectResponse(hostname, title string) *http.Response {
+ _url := "https://" + hostname + "/w/api.php?action=query&prop=info&format=json&redirects=1&inprop=url&titles=" + title
+ return getResponse(_url)
+}
+
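+// getStoryResponse fetches the raw JSON for a single Hacker News item.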
+func getStoryResponse(item_id string) *http.Response {
+ _url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json"
+ return getResponse(_url)
+}
+
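+// getDetail fetches a single item from the Hacker News API and unmarshals it
+// into a Story.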
+func getDetail(id int) Story {
+ response := getStoryResponse(strconv.Itoa(id))
+ defer response.Body.Close()
+ data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+ var story Story
+ err = json.Unmarshal(data, &story)
+ if err != nil {
+ log.Warn("getDetail: Unmarshaling json failed")
+ panic(err)
+ }
+ return story
+}
+
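+// getTopStories returns the raw JSON body of the topstories endpoint.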
+func getTopStories() []byte {
+ response := getTopResponse()
+ defer response.Body.Close()
+ data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+
+ return data
+}
+
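+// getBestStories returns the raw JSON body of the beststories endpoint.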
+func getBestStories() []byte {
+ response := getBestResponse()
+ defer response.Body.Close()
+
+ data, err := ioutil.ReadAll(response.Body)
+ if err != nil {
+ panic(err)
+ }
+
+ return data
+}
+
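+// updateAllDiscussions refreshes the stored discussion data for every item
+// posted within the last 3456000 seconds (40 days).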
+func (app *App) updateAllDiscussions() {
+ const maxRoutines = 20
+ var item_ids []int
+
+ app.DB.Select(&item_ids, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) order by posted_on")
+
+ q := queue.New(maxRoutines)
+ defer q.Close()
+
+ for _, item_id := range item_ids {
+ q.Add()
+ go func(item_id int) {
+ defer q.Done()
+ Story, ok := getStory(item_id)
+ if !ok {
+ /**
+ * Check if we got a network error or a dead story.
+ */
+ if Story.Id == 0 {
+ log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", item_id)
+ } else if Story.Descendants > 10 || Story.Score > 10 {
+ log.Warnf("%+v\n", Story)
+ }
+ return
+ }
+ err := app.updateDiscussion(Story)
+ if err != nil {
+ log.Warn(err)
+ return
+ }
+ }(item_id)
+ }
+ q.Wait()
+}