From 32369fcbd4285664a527846ed392a33bbcbc3d75 Mon Sep 17 00:00:00 2001 From: admin Date: Sun, 9 Feb 2025 15:07:02 +0100 Subject: FixURLs() --- main.go | 230 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 212 insertions(+), 18 deletions(-) (limited to 'main.go') diff --git a/main.go b/main.go index 98f4ab2..7cc7449 100644 --- a/main.go +++ b/main.go @@ -44,6 +44,8 @@ func main() { } defer app.DB.Close() + app.FixURLs() + return /* app.deleteOrphanedArticles() app.topStories() @@ -229,9 +231,7 @@ func getStory(id int) (Story, bool) { return Story, false } if is_video { - var link Link - link.Url = normalizeUrl(Story.Url) - link.Field = 2 + link := getURL(Story.Url, 2) Story.Links = append(Story.Links, link) log.Info("match youtube host") @@ -249,9 +249,7 @@ func getStory(id int) (Story, bool) { return Story, false } if is_movie { - var link Link - link.Url = normalizeUrl(Story.Url) - link.Field = 1 + link := getURL(Story.Url, 1) Story.Links = append(Story.Links, link) log.Info("match moview platform url") @@ -271,9 +269,7 @@ func getStory(id int) (Story, bool) { if is_video { if !duplicates[Story.Url] { - var link Link - link.Url = normalizeUrl(Story.Url) - link.Field = 2 + link := getURL(Story.Url, 2) Story.Links = append(Story.Links, link) log.Info("match video title") @@ -295,9 +291,7 @@ func getStory(id int) (Story, bool) { if is_movie { if !duplicates[Story.Url] { - var link Link - link.Url = normalizeUrl(Story.Url) - link.Field = 1 + link := getURL(Story.Url, 1) Story.Links = append(Story.Links, link) log.Info("match moview platform url") @@ -362,9 +356,7 @@ func getStory(id int) (Story, bool) { if is_video { if !duplicates[l] { - var link Link - link.Url = normalizeUrl(l) - link.Field = 2 + link := getURL(l, 2) Story.Links = append(Story.Links, link) log.Info("match youtube text") @@ -387,9 +379,7 @@ func getStory(id int) (Story, bool) { if is_movie { if !duplicates[l] { - var link Link - link.Url = normalizeUrl(l) - link.Field = 1 + link := getURL(l, 1) Story.Links = append(Story.Links, link) log.Info("match moview platform text") @@ -410,6 +400,210 @@ func getStory(id int) (Story, bool) { } } +func getURL(_url string, field int) Link { + + _url = normalizeUrl(_url) + + var link Link + + u, err := url.Parse(_url) + if err != nil { + log.Warnf("getURL: Parsing URL failed: %s \n", err.Error()) + return link + } + + link.Field = field + link.Url = _url + link.Host = u.Host + + switch u.Host { + case "www.youtube.com", + "music.youtube.com", + "www.music.youtube.com": + //log.Warn(_url) + if strings.HasPrefix(u.Path, "/@") || strings.HasPrefix(u.Path, "/c/") || strings.HasPrefix(u.Path, "/user/") || strings.HasPrefix(u.Path, "/channel/") { + link.Type = "channel" + + path_parts := strings.Split(u.Path, "/") + + if strings.HasPrefix(u.Path, "/@") { + link.Param = strings.TrimPrefix(path_parts[1], "@") + } else { + link.Param = path_parts[2] + } + + } else if strings.HasPrefix(u.Path, "/shorts") || strings.HasPrefix(u.Path, "/live") || strings.HasPrefix(u.Path, "/embed") { + link.Type = "video" + + path_parts := strings.Split(u.Path, "/") + + if len(path_parts) > 2 { + link.Param = path_parts[2] + } else { + link.Param = "" + link.Type = "null" + } + + } else if strings.HasPrefix(u.Path, "/playlist") { + link.Type = "playlist" + m, err := url.ParseQuery(u.RawQuery) + if err != nil { + log.Warnf("getURL: Parsing RawQuery for Youtube failed: %s \n", err.Error()) + return link + } + + /** + * ?list= includes the playlist id + */ + + p, ok := m["list"] + if !ok { + log.Warnf("getURL: Playlist: Youtube has no param: %s \n", link.Url) + } else { + link.Param = p[0] + } + + } else if strings.HasPrefix(u.Path, "/watch/") { + + link.Type = "video" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/watch") { + + link.Type = "video" + + m, err := url.ParseQuery(u.RawQuery) + if err != nil { + log.Warnf("getURL: Parsing RawQuery for Youtube failed: %s \n", err.Error()) + return link + } + + /** + * ?v= includes the video id + */ + p, ok := m["v"] + if !ok { + //log.Infof("getURL: v=VideoID: Youtube has no param: %s \n", link.Url) + log.Warnf("getURL: v=VideoID: Youtube has no param: %s \n", link.Url) + } else { + link.Param = p[0] + } + } else if u.Path == "/" || u.Path == "" { + link.Type = "null" + link.Param = "" + } else { + link.Type = "channel" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[1] + } + + /** + * nice debug + if link.Param == "" && !strings.HasPrefix(u.Path, "/clip") && !strings.HasPrefix(u.Path, "/results") && u.Path != "/watch" { + log.Fatal(link) + } + */ + break + case "www.imdb.com": + //log.Warn(u.Path) + if strings.HasPrefix(u.Path, "/title/") { + + link.Type = "film" + + path_parts := strings.Split(u.Path, "/") + if strings.HasPrefix(path_parts[1], "tt") { + link.Param = path_parts[1] + } else { + log.Infof("getURL: IMDB: path_parts[1] doesn't have a film id: %s %s %+v", u.Path, path_parts[2], path_parts) + log.Info(path_parts) + } + + } else if strings.HasPrefix(u.Path, "/tt") { + link.Type = "film" + + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[1] + + } else if strings.HasPrefix(u.Path, "/name/") { + + link.Type = "actor" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/character/") { + + link.Type = "character" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/company/") { + + link.Type = "company" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/video/") || strings.HasPrefix(u.Path, "/videoplayer/") { + + link.Type = "video" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/user/") { + + link.Type = "user" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/news/") || strings.HasPrefix(u.Path, "/board/announcement") { + + link.Type = "news" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/review/") { + + link.Type = "review" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/404") || u.Path == "" || u.Path == "/" || strings.HasPrefix(u.Path, "/interfaces") || strings.HasPrefix(u.Path, "/titanic") || strings.HasPrefix(u.Path, "/freedive/") || strings.HasPrefix(u.Path, "/conditions") || strings.HasPrefix(u.Path, "/help/") || strings.HasPrefix(u.Path, "/showtimes/") || strings.HasPrefix(u.Path, "/1") || strings.Contains(u.Path, "/mediaviewer/") || strings.HasPrefix(u.Path, "/media") || strings.HasPrefix(u.Path, "/licensing") || strings.HasPrefix(u.Path, "/lists") || strings.HasPrefix(u.Path, "/stats") || strings.HasPrefix(u.Path, "/rg/") || strings.HasPrefix(u.Path, "/hackernews") || strings.HasPrefix(u.Path, "/robots.txt") || strings.HasPrefix(u.Path, "/: the prestige") || strings.HasPrefix(u.Path, "/features/") || strings.HasPrefix(u.Path, "/keyword/") { + + link.Type = "null" + link.Param = "" + + } else if strings.HasPrefix(u.Path, "/list/") || strings.HasPrefix(u.Path, "/chart/") || strings.Contains(u.Path, "/top-rated-") || strings.HasPrefix(u.Path, "/whats-on-tv/") { + + link.Type = "film_list" + path_parts := strings.Split(u.Path, "/") + link.Param = path_parts[2] + + } else if strings.HasPrefix(u.Path, "/find") || strings.HasPrefix(u.Path, "/search") || strings.HasPrefix(u.Path, "/filmosearch") { + + link.Type = "imdb_search" + + } else { + log.Infof("getURL: IMDB: path does not include title: %s \n", u.Path) + //log.Warnf("getURL: IMDB: path does not include title: %s \n", u.Path) + log.Fatalf("getURL: IMDB: path does not include title: %s \n", u.Path) + } + break + case "www.vimeo.com": + // todo + default: + //log.Warnf("getURL: Host not recognized. Missing param: %s", u.Host) + return link + } + + if link.Host == "www.music.youtube.com" { + u.Host = "music.youtube.com" + link.Url = u.String() + link.Host = "music.youtube.com" + } + + return link +} + func getResponse(url string) *http.Response { var err error var response *http.Response -- cgit v1.2.3