diff options
| author | admin | 2024-09-13 20:04:17 +0200 |
|---|---|---|
| committer | admin | 2024-09-13 20:04:17 +0200 |
| commit | b190512e951efdd1ed4642eed8726bf7bdf2c022 (patch) | |
| tree | 635c5ca837aaf48ddcfbc6c65c1055c7402f84f4 /main.go | |
| parent | 6b091dc7ab2c4fdaed0675ab57ea05e4ddb81e5b (diff) | |
| download | hncrawler-b190512e951efdd1ed4642eed8726bf7bdf2c022.tar.gz | |
changed xurls to goquery
Diffstat (limited to 'main.go')
| -rw-r--r-- | main.go | 82 |
1 file changed, 62 insertions, 20 deletions
@@ -15,7 +15,7 @@ import ( "github.com/anikhasibul/queue" "github.com/jmoiron/sqlx" log "github.com/sirupsen/logrus" - "mvdan.cc/xurls/v2" + "github.com/PuerkitoBio/goquery" ) type App struct { @@ -63,9 +63,11 @@ func (app *App) walkDown() { //var err error //max_item := getMaxItem() + //max_item := 27351341 + max_item := 27262623 //max_item := 41495306 //max_item := 36128477 - max_item := 32670334 + //max_item := 32670334 //max_item := 41231601 //max_item := 41165987 //max_item := 41136898 @@ -79,7 +81,8 @@ func (app *App) walkDown() { //max_item := 15038031 //max_item := 14450000 - const maxRoutines = 200 + const maxRoutines = 400 + //const maxRoutines = 1 q := queue.New(maxRoutines) defer q.Close() @@ -281,32 +284,67 @@ func getStory(id int) (Story, bool) { /** * Parse all URLs in Story.Text + + log.Debugf("StoryID: %d\n", Story.Id) + log.Debugf("StoryID: %d\n", Story.Text) + */ + + /** + * This comment broke my code: + * https://news.ycombinator.com/item?id=27351340 */ - rxRelaxed := xurls.Relaxed() - rxLinks := rxRelaxed.FindAllString(html.UnescapeString(Story.Text), -1) + tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>"+Story.Text+"</html>")) + if err != nil { + log.Errorf("Failed to parse html: %s\n", err.Error()) + return Story, false + } + sel := tmpdoc.Find("html") + + // remove all found elements from selection + sel.Find("code").Each(func(i int, s *goquery.Selection) { + //log.Warnf("%+v\n", s.Get(0)) + RemoveNode(sel.Get(0), s.Get(0)) + }) + + tmphtml, err := sel.Html() + if err != nil { + log.Warn("Failed to generate html from selection: ", err.Error()) + } - for _, rxLink := range rxLinks { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(tmphtml)) + + if err != nil { + log.Errorf("Failed to parse html: %s\n", err.Error()) + return Story, false + } + + doc.Find("a").Each(func(i int, s *goquery.Selection) { + + l, ok := s.Attr("href") + + if ok { /** * Check for Youtube in text field */ - is_video, err = 
regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", rxLink) + is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l) if err != nil { - log.Errorf("Failed to parse and match regex: %s\n", err.Error()) - return Story, false + log.Fatal("Failed to parse and match regex: %s\n", err.Error()) + //log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + //return Story, false } if is_video { - if ! duplicates[rxLink] { + if ! duplicates[l] { var link Link - link.Url = normalizeUrl(rxLink) + link.Url = normalizeUrl(l) link.Field = 2 Story.Links = append(Story.Links, link) log.Info("match youtube text") log.Infof("%+v\n", Story) - duplicates[rxLink] = true + duplicates[l] = true } } @@ -314,27 +352,29 @@ func getStory(id int) (Story, bool) { /** * Check for movie platforms in text field */ - is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", rxLink) + is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l) if err != nil { - log.Errorf("Failed to parse and match regex: %s\n", err.Error()) - return Story, false + log.Fatal("Failed to parse and match regex: %s\n", err.Error()) + //log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + //return Story, false } if is_movie { - if ! duplicates[rxLink] { + if ! duplicates[l] { var link Link - link.Url = normalizeUrl(rxLink) + link.Url = normalizeUrl(l) link.Field = 1 Story.Links = append(Story.Links, link) log.Info("match moview platform text") log.Infof("%+v\n", Story) - duplicates[rxLink] = true + duplicates[l] = true } - } - } + } + }) + //Story.Url = normalizeUrl(Story.Url) @@ -398,6 +438,8 @@ func getDetail(id int) Story { story.Text = html.UnescapeString(story.Text) + log.Debugf("StoryID: %d\n", story.Id) + return story } |
