diff options
| author | admin | 2025-02-09 15:07:02 +0100 |
|---|---|---|
| committer | admin | 2025-02-09 15:07:02 +0100 |
| commit | 32369fcbd4285664a527846ed392a33bbcbc3d75 (patch) | |
| tree | aabb365003b7a0b4c3a278bae117e28ff2c9c3a7 | |
| parent | a82dd3752342b0096a5070d297189b0bdf9ffe34 (diff) | |
| download | hncrawler-32369fcbd4285664a527846ed392a33bbcbc3d75.tar.gz | |
| -rw-r--r-- | database.go | 31 | ||||
| -rw-r--r-- | helper.go | 57 | ||||
| -rw-r--r-- | main.go | 230 | ||||
| -rw-r--r-- | struct.go | 31 |
4 files changed, 288 insertions, 61 deletions
diff --git a/database.go b/database.go index 31594b4..bf35f52 100644 --- a/database.go +++ b/database.go @@ -65,13 +65,19 @@ func (app *App) saveStory(s Story) error { updated_at, story_id, url, - field + field, + host, + param, + type ) VALUES ( NULL, ?, ?, ?, ?, + ?, + ?, + ?, ? ); ` @@ -82,7 +88,7 @@ func (app *App) saveStory(s Story) error { } defer stmt2.Close() - _, err = stmt2.Exec(app.Now, app.Now, lid, l.Url, l.Field) + _, err = stmt2.Exec(app.Now, app.Now, lid, l.Url, l.Field, l.Host, l.Param, l.Type) if err != nil { log.Warn("saveStory: InsertLinks: Statement execution failed") return err @@ -480,3 +486,24 @@ func (app *App) createMaxStoredItem(new_max_item int) error { return nil } + +func (app *App) FixURLs() { + rows, err := app.DB.Query("SELECT url,field FROM links;") + if err != nil { + log.Fatal(err) + } + + for rows.Next() { + var url string + var field int + + err = rows.Scan(&url, &field) + if err != nil { + log.Fatal(err) + } + + link := getURL(url, field) + log.Printf("%+v\n", link) + //log.Warnf("%+v\n", link) + } +} @@ -2,9 +2,10 @@ package main import ( _url "net/url" + "regexp" "strings" + log "github.com/sirupsen/logrus" - "regexp" xhtml "golang.org/x/net/html" ) @@ -24,16 +25,17 @@ func stripHNPrefix(title string) string { func _removeParam(url, key string) string { u, err := _url.Parse(url) if err != nil { - log.Fatal(err) + log.Fatal("_removeParam: parse: ", err) } q := u.Query() q.Del(key) - u.RawQuery = q.Encode() + u.RawQuery = q.Encode() return u.String() } func normalizeUrl(url string) string { + url = strings.ToLower(url) /** * Redirect http:// to https:// */ @@ -53,7 +55,8 @@ func normalizeUrl(url string) string { */ u, err := _url.Parse(url) if err != nil { - log.Fatal(err) + log.Warn("normalizeUrl: parse: ", err) + return url } if "" == u.Scheme { @@ -98,7 +101,7 @@ func normalizeUrl(url string) string { u.RawQuery = q.Encode() url = u.String() - + //r := regexp.MustCompile("youtu.be/") //url = r.ReplaceAllString(url, "youtube.com/watch?v=") } @@ -142,10 +145,10 @@ func normalizeUrl(url string) string { */ u, err = _url.Parse(url) if err != nil { - log.Fatal(err) + log.Fatal("normalizeUrl: parse: append www: ", err) } - if ! strings.HasPrefix(u.Host, "www.") { + if !strings.HasPrefix(u.Host, "www.") && u.Host != "music.youtube.com" { u.Host = "www." + u.Host } @@ -173,7 +176,7 @@ func normalizeUrl(url string) string { /** * remove tracking param "si", "feature" and "pp" from every youtube video */ - match, err = regexp.MatchString("/www.youtube.com/", url) + match, err = regexp.MatchString("/(www|music).youtube.com/", url) if err != nil { log.Fatal(err) } @@ -196,27 +199,27 @@ func normalizeUrl(url string) string { } func RemoveNode(root_node *xhtml.Node, remove_me *xhtml.Node) { - found_node := false - check_nodes := make(map[int]*xhtml.Node) - i := 0 + found_node := false + check_nodes := make(map[int]*xhtml.Node) + i := 0 + + // loop through siblings + for n := root_node.FirstChild; n != nil; n = n.NextSibling { + if n == remove_me { + found_node = true + n.Parent.RemoveChild(n) + } - // loop through siblings - for n := root_node.FirstChild; n != nil; n = n.NextSibling { - if n == remove_me { - found_node = true - n.Parent.RemoveChild(n) + check_nodes[i] = n + i++ } - check_nodes[i] = n - i++ - } - - // check if removing node is found - // if yes no need to check childs returning - // if no continue loop through childs and so on - if found_node == false { - for _, item := range check_nodes { - RemoveNode(item, remove_me) + // check if removing node is found + // if yes no need to check childs returning + // if no continue loop through childs and so on + if found_node == false { + for _, item := range check_nodes { + RemoveNode(item, remove_me) + } } - } } @@ -44,6 +44,8 @@ func main() { } defer app.DB.Close() + app.FixURLs() + return /* app.deleteOrphanedArticles() app.topStories() @@ -229,9 +231,7 @@ func getStory(id int) (Story, bool) { return Story, false } if is_video { - var link Link - link.Url = normalizeUrl(Story.Url) - link.Field = 2 + link := getURL(Story.Url, 2) Story.Links = append(Story.Links, link) log.Info("match youtube host") @@ -249,9 +249,7 @@ func getStory(id int) (Story, bool) { return Story, false } if is_movie { - var link Link - link.Url = normalizeUrl(Story.Url) - link.Field = 1 + link := getURL(Story.Url, 1) Story.Links = append(Story.Links, link) log.Info("match moview platform url") @@ -271,9 +269,7 @@ func getStory(id int) (Story, bool) { if is_video { if !duplicates[Story.Url] { - var link Link - link.Url = normalizeUrl(Story.Url) - link.Field = 2 + link := getURL(Story.Url, 2) Story.Links = append(Story.Links, link) log.Info("match video title") @@ -295,9 +291,7 @@ func getStory(id int) (Story, bool) { if is_movie { if !duplicates[Story.Url] { - var link Link - link.Url = normalizeUrl(Story.Url) - link.Field = 1 + link := getURL(Story.Url, 1) Story.Links = append(Story.Links, link) log.Info("match moview platform url") @@ -362,9 +356,7 @@ func getStory(id int) (Story, bool) { if is_video { if !duplicates[l] { - var link Link - link.Url = normalizeUrl(l) - link.Field = 2 + link := getURL(l, 2) Story.Links = append(Story.Links, link) log.Info("match youtube text") @@ -387,9 +379,7 @@ func getStory(id int) (Story, bool) { if is_movie { if !duplicates[l] { - var link Link - link.Url = normalizeUrl(l) - link.Field = 1 + link := getURL(l, 1) Story.Links = append(Story.Links, link) log.Info("match moview platform text") @@ -410,6 +400,210 @@ func getStory(id int) (Story, bool) { } } +func getURL(_url string, field int) Link { + + _url = normalizeUrl(_url) + + var link Link + + u, err := url.Parse(_url) + if err != nil { + log.Warnf("getURL: Parsing URL failed: %s \n", err.Error()) + return link + } + + link.Field = field + link.Url = _url + link.Host = u.Host + + switch u.Host { + case "www.youtube.com", + "music.youtube.com", + "www.music.youtube.com": + //log.Warn(_url) + if strings.HasPrefix(u.Path, "/@") || strings.HasPrefix(u.Path, "/c/") || strings.HasPrefix(u.Path, "/user/") || strings.HasPrefix(u.Path, "/channel/") { + link.Type = "channel" + + path_parts := strings.Split(u.Path, "/") + + if strings.HasPrefix(u.Path, "/@") { + link.Param = strings.TrimPrefix(path_parts[1], "@") + } else { + link.Param = path_parts[2] + } + + } else if strings.HasPrefix(u.Path, "/shorts") || strings.HasPrefix(u.Path, "/live") || strings.HasPrefix(u.Path, "/embed") { + link.Type = "video" + + path_parts := strings.Split(u.Path, "/") + + if len(path_parts) > 2 { + link.Param = path_parts[2] + } else { + link.Param = "" + link.Type = "null" + } + + } else if strings.HasPrefix(u.Path, "/playlist") { + link.Type = "playlist" + m, err := url.ParseQuery(u.RawQuery) + if err != nil { + log.Warnf("getURL: Parsing RawQuery for Youtube failed: %s \n", err.Error()) + return link + } + + /** + * ?list= includes the playlist id + */ + + p, ok := m["list"] + if !ok { + log.Warnf("getURL: Playlist: Youtube has no param: %s \n", link.Url) + } else { + link.Param = p[0] + } + + } else if strings.HasPrefix(u.Path, "/watch/") { + + link.Type = "video" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/watch") { + + link.Type = "video" + + m, err := url.ParseQuery(u.RawQuery) + if err != nil { + log.Warnf("getURL: Parsing RawQuery for Youtube failed: %s \n", err.Error()) + return link + } + + /** + * ?v= includes the video id + */ + p, ok := m["v"] + if !ok { + //log.Infof("getURL: v=VideoID: Youtube has no param: %s \n", link.Url) + log.Warnf("getURL: v=VideoID: Youtube has no param: %s \n", link.Url) + } else { + link.Param = p[0] + } + } else if u.Path == "/" || u.Path == "" { + link.Type = "null" + link.Param = "" + } else { + link.Type = "channel" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[1] + } + + /** + * nice debug + if link.Param == "" && !strings.HasPrefix(u.Path, "/clip") && !strings.HasPrefix(u.Path, "/results") && u.Path != "/watch" { + log.Fatal(link) + } + */ + break + case "www.imdb.com": + //log.Warn(u.Path) + if strings.HasPrefix(u.Path, "/title/") { + + link.Type = "film" + + path_parts := strings.Split(u.Path, "/") + if strings.HasPrefix(path_parts[1], "tt") { + link.Param = path_parts[1] + } else { + log.Infof("getURL: IMDB: path_parts[1] doesn't have a film id: %s %s %+v", u.Path, path_parts[2], path_parts) + log.Info(path_parts) + } + + } else if strings.HasPrefix(u.Path, "/tt") { + link.Type = "film" + + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[1] + + } else if strings.HasPrefix(u.Path, "/name/") { + + link.Type = "actor" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/character/") { + + link.Type = "character" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/company/") { + + link.Type = "company" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/video/") || strings.HasPrefix(u.Path, "/videoplayer/") { + + link.Type = "video" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/user/") { + + link.Type = "user" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/news/") || strings.HasPrefix(u.Path, "/board/announcement") { + + link.Type = "news" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/review/") { + + link.Type = "review" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/404") || u.Path == "" || u.Path == "/" || strings.HasPrefix(u.Path, "/interfaces") || strings.HasPrefix(u.Path, "/titanic") || strings.HasPrefix(u.Path, "/freedive/") || strings.HasPrefix(u.Path, "/conditions") || strings.HasPrefix(u.Path, "/help/") || strings.HasPrefix(u.Path, "/showtimes/") || strings.HasPrefix(u.Path, "/1") || strings.Contains(u.Path, "/mediaviewer/") || strings.HasPrefix(u.Path, "/media") || strings.HasPrefix(u.Path, "/licensing") || strings.HasPrefix(u.Path, "/lists") || strings.HasPrefix(u.Path, "/stats") || strings.HasPrefix(u.Path, "/rg/") || strings.HasPrefix(u.Path, "/hackernews") || strings.HasPrefix(u.Path, "/robots.txt") || strings.HasPrefix(u.Path, "/: the prestige") || strings.HasPrefix(u.Path, "/features/") || strings.HasPrefix(u.Path, "/keyword/") { + + link.Type = "null" + link.Param = "" + + } else if strings.HasPrefix(u.Path, "/list/") || strings.HasPrefix(u.Path, "/chart/") || strings.Contains(u.Path, "/top-rated-") || strings.HasPrefix(u.Path, "/whats-on-tv/") { + + link.Type = "film_list" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/find") || strings.HasPrefix(u.Path, "/search") || strings.HasPrefix(u.Path, "/filmosearch") { + + link.Type = "imdb_search" + + } else { + log.Infof("getURL: IMDB: path does not include title: %s \n", u.Path) + //log.Warnf("getURL: IMDB: path does not include title: %s \n", u.Path) + log.Fatalf("getURL: IMDB: path does not include title: %s \n", u.Path) + } + break + case "www.vimeo.com": + // todo + default: + //log.Warnf("getURL: Host not recognized. Missing param: %s", u.Host) + return link + } + + if link.Host == "www.music.youtube.com" { + u.Host = "music.youtube.com" + link.Url = u.String() + link.Host = "music.youtube.com" + } + + return link +} + func getResponse(url string) *http.Response { var err error var response *http.Response @@ -5,25 +5,28 @@ import ( ) type Story struct { - Id int + Id int //Deleted bool - Type string /* story, comment (or job, poll, pollopt) */ - Title string /* title (only story) */ - Text string /* comment text or possible text on story (HTML) */ - Dead bool - Url string /* verbatim parsed URL */ + Type string /* story, comment (or job, poll, pollopt) */ + Title string /* title (only story) */ + Text string /* comment text or possible text on story (HTML) */ + Dead bool + Url string /* verbatim parsed URL */ //NormalizedUrl string /* normalized */ - Score int /* only story */ - Descendants int /* comments on score or kids on comments */ + Score int /* only story */ + Descendants int /* comments on score or kids on comments */ //Kids []int /* id of the item's comments */ - Time int /* posted at */ - By string /* hn commenter */ - Links []Link /* matched urls */ + Time int /* posted at */ + By string /* hn commenter */ + Links []Link /* matched urls */ } type Link struct { - Url string - Field int /* 2 = video, 1 = movies, 0 = bug */ + Url string + Field int /* 2 = video, 1 = movies, 0 = bug */ + Host string /* e.g. "youtube.com", "imdb.com" */ + Param string /* e.g. "v" param for youtube, title/"ttxxx" for imdb */ + Type string /* e.g. "video", "channel", "playlist" */ } type URL struct { @@ -31,5 +34,5 @@ type URL struct { type syncMaxItem struct { max_item int - mu sync.Mutex + mu sync.Mutex } |
