summary | refs | log | tree | commit | diff
path: root/main.go
diff options
context:
space:
mode:
author: admin, 2024-09-13 20:04:17 +0200
committer: admin, 2024-09-13 20:04:17 +0200
commit: b190512e951efdd1ed4642eed8726bf7bdf2c022 (patch)
tree: 635c5ca837aaf48ddcfbc6c65c1055c7402f84f4 /main.go
parent: 6b091dc7ab2c4fdaed0675ab57ea05e4ddb81e5b (diff)
download: hncrawler-b190512e951efdd1ed4642eed8726bf7bdf2c022.tar.gz
changed xurl to goquery
Diffstat (limited to 'main.go')
-rw-r--r-- main.go 82
1 files changed, 62 insertions, 20 deletions
diff --git a/main.go b/main.go
index 8940afc..83a35ff 100644
--- a/main.go
+++ b/main.go
@@ -15,7 +15,7 @@ import (
"github.com/anikhasibul/queue"
"github.com/jmoiron/sqlx"
log "github.com/sirupsen/logrus"
- "mvdan.cc/xurls/v2"
+ "github.com/PuerkitoBio/goquery"
)
type App struct {
@@ -63,9 +63,11 @@ func (app *App) walkDown() {
//var err error
//max_item := getMaxItem()
+ //max_item := 27351341
+ max_item := 27262623
//max_item := 41495306
//max_item := 36128477
- max_item := 32670334
+ //max_item := 32670334
//max_item := 41231601
//max_item := 41165987
//max_item := 41136898
@@ -79,7 +81,8 @@ func (app *App) walkDown() {
//max_item := 15038031
//max_item := 14450000
- const maxRoutines = 200
+ const maxRoutines = 400
+ //const maxRoutines = 1
q := queue.New(maxRoutines)
defer q.Close()
@@ -281,32 +284,67 @@ func getStory(id int) (Story, bool) {
/**
* Parse all URLs in Story.Text
+
+ log.Debugf("StoryID: %d\n", Story.Id)
+ log.Debugf("StoryID: %d\n", Story.Text)
+ */
+
+ /**
+ * This comment broke my code:
+ * https://news.ycombinator.com/item?id=27351340
*/
- rxRelaxed := xurls.Relaxed()
- rxLinks := rxRelaxed.FindAllString(html.UnescapeString(Story.Text), -1)
+ tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>"+Story.Text+"</html>"))
+ if err != nil {
+ log.Errorf("Failed to parse html: %s\n", err.Error())
+ return Story, false
+ }
+ sel := tmpdoc.Find("html")
+
+ // remove all found elements from selection
+ sel.Find("code").Each(func(i int, s *goquery.Selection) {
+ //log.Warnf("%+v\n", s.Get(0))
+ RemoveNode(sel.Get(0), s.Get(0))
+ })
+
+ tmphtml, err := sel.Html()
+ if err != nil {
+ log.Warn("Failed to generate html from selection: ", err.Error())
+ }
- for _, rxLink := range rxLinks {
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(tmphtml))
+
+ if err != nil {
+ log.Errorf("Failed to parse html: %s\n", err.Error())
+ return Story, false
+ }
+
+ doc.Find("a").Each(func(i int, s *goquery.Selection) {
+
+ l, ok := s.Attr("href")
+
+ if ok {
/**
* Check for Youtube in text field
*/
- is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", rxLink)
+ is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l)
if err != nil {
- log.Errorf("Failed to parse and match regex: %s\n", err.Error())
- return Story, false
+ log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+ //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ //return Story, false
}
if is_video {
- if ! duplicates[rxLink] {
+ if ! duplicates[l] {
var link Link
- link.Url = normalizeUrl(rxLink)
+ link.Url = normalizeUrl(l)
link.Field = 2
Story.Links = append(Story.Links, link)
log.Info("match youtube text")
log.Infof("%+v\n", Story)
- duplicates[rxLink] = true
+ duplicates[l] = true
}
}
@@ -314,27 +352,29 @@ func getStory(id int) (Story, bool) {
/**
* Check for movie platforms in text field
*/
- is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", rxLink)
+ is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l)
if err != nil {
- log.Errorf("Failed to parse and match regex: %s\n", err.Error())
- return Story, false
+ log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+ //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ //return Story, false
}
if is_movie {
- if ! duplicates[rxLink] {
+ if ! duplicates[l] {
var link Link
- link.Url = normalizeUrl(rxLink)
+ link.Url = normalizeUrl(l)
link.Field = 1
Story.Links = append(Story.Links, link)
log.Info("match moview platform text")
log.Infof("%+v\n", Story)
- duplicates[rxLink] = true
+ duplicates[l] = true
}
-
}
- }
+ }
+ })
+
//Story.Url = normalizeUrl(Story.Url)
@@ -398,6 +438,8 @@ func getDetail(id int) Story {
story.Text = html.UnescapeString(story.Text)
+ log.Debugf("StoryID: %d\n", story.Id)
+
return story
}