FixURLs()HEAD master

author: admin 2025-02-09 15:07:02 +0100
committer: admin 2025-02-09 15:07:02 +0100
commit: 32369fcbd4285664a527846ed392a33bbcbc3d75 (patch)
tree: aabb365003b7a0b4c3a278bae117e28ff2c9c3a7
parent: a82dd3752342b0096a5070d297189b0bdf9ffe34 (diff)
download: hncrawler-32369fcbd4285664a527846ed392a33bbcbc3d75.tar.gz
4 files changed, 288 insertions, 61 deletions
diff --git a/database.go b/database.go
index 31594b4..bf35f52 100644
--- a/database.go
+++ b/database.go
@@ -65,13 +65,19 @@ func (app *App) saveStory(s Story) error {
 				updated_at,
 				story_id,
 				url,
-				field
+				field,
+				host,
+				param,
+				type
 			) VALUES (
 				NULL,
 				?,
 				?,
 				?,
 				?,
+				?,
+				?,
+				?,
 				?
 			);
 			`
@@ -82,7 +88,7 @@ func (app *App) saveStory(s Story) error {
 		}
 		defer stmt2.Close()
 
-		_, err = stmt2.Exec(app.Now, app.Now, lid, l.Url, l.Field)
+		_, err = stmt2.Exec(app.Now, app.Now, lid, l.Url, l.Field, l.Host, l.Param, l.Type)
 		if err != nil {
 			log.Warn("saveStory: InsertLinks: Statement execution failed")
 			return err
@@ -480,3 +486,24 @@ func (app *App) createMaxStoredItem(new_max_item int) error {
 
 	return nil
 }
+
+// FixURLs runs getURL over every row of the links table and logs each Link.
+func (app *App) FixURLs() {
+	rows, err := app.DB.Query("SELECT url,field FROM links;")
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer rows.Close()
+	for rows.Next() {
+		var url string
+		var field int
+		if err = rows.Scan(&url, &field); err != nil {
+			log.Fatal(err)
+		}
+		log.Printf("%+v\n", getURL(url, field))
+	}
+	// Next returns false on error as well as at the end of the result set.
+	if err = rows.Err(); err != nil {
+		log.Fatal(err)
+	}
+}
diff --git a/helper.go b/helper.go
index 866ec5a..52fb003 100644
--- a/helper.go
+++ b/helper.go
@@ -2,9 +2,10 @@ package main
 
 import (
 	_url "net/url"
+	"regexp"
 	"strings"
+
 	log "github.com/sirupsen/logrus"
-	"regexp"
 	xhtml "golang.org/x/net/html"
 )
 
@@ -24,16 +25,17 @@ func stripHNPrefix(title string) string {
 func _removeParam(url, key string) string {
 	u, err := _url.Parse(url)
 	if err != nil {
-		log.Fatal(err)
+		log.Fatal("_removeParam: parse: ", err)
 	}
 	q := u.Query()
 	q.Del(key)
-	u.RawQuery  = q.Encode()
+	u.RawQuery = q.Encode()
 	return u.String()
 }
 
 func normalizeUrl(url string) string {
 
+	url = strings.ToLower(url)
 	/**
 	 * Redirect http:// to https://
 	 */
@@ -53,7 +55,8 @@ func normalizeUrl(url string) string {
 	 */
 	u, err := _url.Parse(url)
 	if err != nil {
-		log.Fatal(err)
+		log.Warn("normalizeUrl: parse: ", err)
+		return url
 	}
 
 	if "" == u.Scheme {
@@ -98,7 +101,7 @@ func normalizeUrl(url string) string {
 
 		u.RawQuery = q.Encode()
 		url = u.String()
-		
+
 		//r := regexp.MustCompile("youtu.be/")
 		//url = r.ReplaceAllString(url, "youtube.com/watch?v=")
 	}
@@ -142,10 +145,10 @@ func normalizeUrl(url string) string {
 	 */
 	u, err = _url.Parse(url)
 	if err != nil {
-		log.Fatal(err)
+		log.Fatal("normalizeUrl: parse: append www: ", err)
 	}
 
-	if ! strings.HasPrefix(u.Host, "www.") {
+	if !strings.HasPrefix(u.Host, "www.") && u.Host != "music.youtube.com" {
 		u.Host = "www." + u.Host
 	}
 
@@ -173,7 +176,7 @@ func normalizeUrl(url string) string {
 	/**
 	 * remove tracking param "si", "feature" and "pp" from every youtube video
 	 */
-	match, err = regexp.MatchString("/www.youtube.com/", url)
+	match, err = regexp.MatchString("/(www|music).youtube.com/", url)
 	if err != nil {
 		log.Fatal(err)
 	}
@@ -196,27 +199,27 @@ func normalizeUrl(url string) string {
 }
 
 func RemoveNode(root_node *xhtml.Node, remove_me *xhtml.Node) {
-    found_node := false
-    check_nodes := make(map[int]*xhtml.Node)
-    i := 0
+	found_node := false
+	check_nodes := make(map[int]*xhtml.Node)
+	i := 0
+
+	// loop through siblings
+	for n := root_node.FirstChild; n != nil; n = n.NextSibling {
+		if n == remove_me {
+			found_node = true
+			n.Parent.RemoveChild(n)
+		}
 
-    // loop through siblings
-    for n := root_node.FirstChild; n != nil; n = n.NextSibling {
-	if n == remove_me {
-	    found_node = true
-	    n.Parent.RemoveChild(n)
+		check_nodes[i] = n
+		i++
 	}
 
-	check_nodes[i] = n
-	i++
-    }
-
-    // check if removing node is found
-    // if yes no need to check childs returning
-    // if no continue loop through childs and so on
-    if found_node == false {
-	for _, item := range check_nodes {
-	    RemoveNode(item, remove_me)
+	// check if removing node is found
+	// if yes no need to check childs returning
+	// if no continue loop through childs and so on
+	if found_node == false {
+		for _, item := range check_nodes {
+			RemoveNode(item, remove_me)
+		}
 	}
-    }
 }
diff --git a/main.go b/main.go
index 98f4ab2..7cc7449 100644
--- a/main.go
+++ b/main.go
@@ -44,6 +44,8 @@ func main() {
 	}
 	defer app.DB.Close()
 
+	app.FixURLs()
+	return
 	/*
 		app.deleteOrphanedArticles()
 		app.topStories()
@@ -229,9 +231,7 @@ func getStory(id int) (Story, bool) {
 		return Story, false
 	}
 	if is_video {
-		var link Link
-		link.Url = normalizeUrl(Story.Url)
-		link.Field = 2
+		link := getURL(Story.Url, 2)
 		Story.Links = append(Story.Links, link)
 
 		log.Info("match youtube host")
@@ -249,9 +249,7 @@ func getStory(id int) (Story, bool) {
 		return Story, false
 	}
 	if is_movie {
-		var link Link
-		link.Url = normalizeUrl(Story.Url)
-		link.Field = 1
+		link := getURL(Story.Url, 1)
 		Story.Links = append(Story.Links, link)
 
 		log.Info("match moview platform url")
@@ -271,9 +269,7 @@ func getStory(id int) (Story, bool) {
 	if is_video {
 		if !duplicates[Story.Url] {
 
-			var link Link
-			link.Url = normalizeUrl(Story.Url)
-			link.Field = 2
+			link := getURL(Story.Url, 2)
 			Story.Links = append(Story.Links, link)
 
 			log.Info("match video title")
@@ -295,9 +291,7 @@ func getStory(id int) (Story, bool) {
 	if is_movie {
 		if !duplicates[Story.Url] {
 
-			var link Link
-			link.Url = normalizeUrl(Story.Url)
-			link.Field = 1
+			link := getURL(Story.Url, 1)
 			Story.Links = append(Story.Links, link)
 
 			log.Info("match moview platform url")
@@ -362,9 +356,7 @@ func getStory(id int) (Story, bool) {
 			if is_video {
 				if !duplicates[l] {
 
-					var link Link
-					link.Url = normalizeUrl(l)
-					link.Field = 2
+					link := getURL(l, 2)
 					Story.Links = append(Story.Links, link)
 
 					log.Info("match youtube text")
@@ -387,9 +379,7 @@ func getStory(id int) (Story, bool) {
 			if is_movie {
 				if !duplicates[l] {
 
-					var link Link
-					link.Url = normalizeUrl(l)
-					link.Field = 1
+					link := getURL(l, 1)
 					Story.Links = append(Story.Links, link)
 
 					log.Info("match moview platform text")
@@ -410,6 +400,210 @@ func getStory(id int) (Story, bool) {
 	}
 }
 
+// getURL normalizes _url and classifies the result into a Link.
+//
+// On success Url (normalized), Field (passed through from the caller) and
+// Host are set. For YouTube and IMDB hosts Type (e.g. "video", "channel",
+// "playlist", "film", or "null" when nothing is identifiable) and Param (the
+// extracted id) are filled in as well. If the normalized URL cannot be
+// parsed, a warning is logged and a zero-value Link is returned.
+func getURL(_url string, field int) Link {
+	_url = normalizeUrl(_url)
+
+	var link Link
+
+	u, err := url.Parse(_url)
+	if err != nil {
+		log.Warnf("getURL: Parsing URL failed: %s \n", err.Error())
+		return link
+	}
+
+	link.Field = field
+	link.Url = _url
+	link.Host = u.Host
+
+	switch u.Host {
+	case "www.youtube.com",
+		"music.youtube.com",
+		"www.music.youtube.com":
+		if strings.HasPrefix(u.Path, "/@") || strings.HasPrefix(u.Path, "/c/") || strings.HasPrefix(u.Path, "/user/") || strings.HasPrefix(u.Path, "/channel/") {
+			link.Type = "channel"
+
+			path_parts := strings.Split(u.Path, "/")
+
+			if strings.HasPrefix(u.Path, "/@") {
+				link.Param = strings.TrimPrefix(path_parts[1], "@")
+			} else {
+				link.Param = path_parts[2]
+			}
+
+		} else if strings.HasPrefix(u.Path, "/shorts") || strings.HasPrefix(u.Path, "/live") || strings.HasPrefix(u.Path, "/embed") {
+			link.Type = "video"
+
+			path_parts := strings.Split(u.Path, "/")
+
+			if len(path_parts) > 2 {
+				link.Param = path_parts[2]
+			} else {
+				link.Param = ""
+				link.Type = "null"
+			}
+
+		} else if strings.HasPrefix(u.Path, "/playlist") {
+			link.Type = "playlist"
+			m, err := url.ParseQuery(u.RawQuery)
+			if err != nil {
+				log.Warnf("getURL: Parsing RawQuery for Youtube failed: %s \n", err.Error())
+				return link
+			}
+
+			/**
+			 * ?list= includes the playlist id
+			 */
+
+			p, ok := m["list"]
+			if !ok {
+				log.Warnf("getURL: Playlist: Youtube has no param: %s \n", link.Url)
+			} else {
+				link.Param = p[0]
+			}
+
+		} else if strings.HasPrefix(u.Path, "/watch/") {
+
+			link.Type = "video"
+			path_parts := strings.Split(u.Path, "/")
+			link.Param = path_parts[2]
+
+		} else if strings.HasPrefix(u.Path, "/watch") {
+
+			link.Type = "video"
+
+			m, err := url.ParseQuery(u.RawQuery)
+			if err != nil {
+				log.Warnf("getURL: Parsing RawQuery for Youtube failed: %s \n", err.Error())
+				return link
+			}
+
+			/**
+			 * ?v= includes the video id
+			 */
+			p, ok := m["v"]
+			if !ok {
+				log.Warnf("getURL: v=VideoID: Youtube has no param: %s \n", link.Url)
+			} else {
+				link.Param = p[0]
+			}
+		} else if u.Path == "/" || u.Path == "" {
+			link.Type = "null"
+			link.Param = ""
+		} else {
+			// Fallback: treat the first path segment as a channel name.
+			link.Type = "channel"
+			path_parts := strings.Split(u.Path, "/")
+			link.Param = path_parts[1]
+		}
+	case "www.imdb.com":
+		if strings.HasPrefix(u.Path, "/title/") {
+
+			link.Type = "film"
+
+			// "/title/tt0111161/" splits into ["", "title", "tt0111161", ""],
+			// so the film id is the third element, not the second.
+			path_parts := strings.Split(u.Path, "/")
+			if strings.HasPrefix(path_parts[2], "tt") {
+				link.Param = path_parts[2]
+			} else {
+				log.Infof("getURL: IMDB: path_parts[2] doesn't have a film id: %s %s %+v", u.Path, path_parts[2], path_parts)
+				log.Info(path_parts)
+			}
+
+		} else if strings.HasPrefix(u.Path, "/tt") {
+			link.Type = "film"
+
+			path_parts := strings.Split(u.Path, "/")
+			link.Param = path_parts[1]
+
+		} else if strings.HasPrefix(u.Path, "/name/") {
+
+			link.Type = "actor"
+			path_parts := strings.Split(u.Path, "/")
+			link.Param = path_parts[2]
+
+		} else if strings.HasPrefix(u.Path, "/character/") {
+
+			link.Type = "character"
+			path_parts := strings.Split(u.Path, "/")
+			link.Param = path_parts[2]
+
+		} else if strings.HasPrefix(u.Path, "/company/") {
+
+			link.Type = "company"
+			path_parts := strings.Split(u.Path, "/")
+			link.Param = path_parts[2]
+
+		} else if strings.HasPrefix(u.Path, "/video/") || strings.HasPrefix(u.Path, "/videoplayer/") {
+
+			link.Type = "video"
+			path_parts := strings.Split(u.Path, "/")
+			link.Param = path_parts[2]
+
+		} else if strings.HasPrefix(u.Path, "/user/") {
+
+			link.Type = "user"
+			path_parts := strings.Split(u.Path, "/")
+			link.Param = path_parts[2]
+
+		} else if strings.HasPrefix(u.Path, "/news/") || strings.HasPrefix(u.Path, "/board/announcement") {
+
+			link.Type = "news"
+			path_parts := strings.Split(u.Path, "/")
+			link.Param = path_parts[2]
+
+		} else if strings.HasPrefix(u.Path, "/review/") {
+
+			link.Type = "review"
+			path_parts := strings.Split(u.Path, "/")
+			link.Param = path_parts[2]
+
+		} else if strings.HasPrefix(u.Path, "/404") || u.Path == "" || u.Path == "/" || strings.HasPrefix(u.Path, "/interfaces") || strings.HasPrefix(u.Path, "/titanic") || strings.HasPrefix(u.Path, "/freedive/") || strings.HasPrefix(u.Path, "/conditions") || strings.HasPrefix(u.Path, "/help/") || strings.HasPrefix(u.Path, "/showtimes/") || strings.HasPrefix(u.Path, "/1") || strings.Contains(u.Path, "/mediaviewer/") || strings.HasPrefix(u.Path, "/media") || strings.HasPrefix(u.Path, "/licensing") || strings.HasPrefix(u.Path, "/lists") || strings.HasPrefix(u.Path, "/stats") || strings.HasPrefix(u.Path, "/rg/") || strings.HasPrefix(u.Path, "/hackernews") || strings.HasPrefix(u.Path, "/robots.txt") || strings.HasPrefix(u.Path, "/: the prestige") || strings.HasPrefix(u.Path, "/features/") || strings.HasPrefix(u.Path, "/keyword/") {
+
+			link.Type = "null"
+			link.Param = ""
+
+		} else if strings.HasPrefix(u.Path, "/list/") || strings.HasPrefix(u.Path, "/chart/") || strings.Contains(u.Path, "/top-rated-") || strings.HasPrefix(u.Path, "/whats-on-tv/") {
+
+			link.Type = "film_list"
+			path_parts := strings.Split(u.Path, "/")
+			// Contains("/top-rated-") can match a single-segment path.
+			if len(path_parts) > 2 {
+				link.Param = path_parts[2]
+			}
+
+		} else if strings.HasPrefix(u.Path, "/find") || strings.HasPrefix(u.Path, "/search") || strings.HasPrefix(u.Path, "/filmosearch") {
+
+			link.Type = "imdb_search"
+
+		} else {
+			log.Infof("getURL: IMDB: path does not include title: %s \n", u.Path)
+			// NOTE(review): Fatalf exits the process on any unclassified IMDB path.
+			log.Fatalf("getURL: IMDB: path does not include title: %s \n", u.Path)
+		}
+	case "www.vimeo.com":
+		// todo
+	default:
+		//log.Warnf("getURL: Host not recognized. Missing param: %s", u.Host)
+		return link
+	}
+
+	if link.Host == "www.music.youtube.com" {
+		u.Host = "music.youtube.com"
+		link.Url = u.String()
+		link.Host = "music.youtube.com"
+	}
+
+	return link
+}
+
 func getResponse(url string) *http.Response {
 	var err error
 	var response *http.Response
diff --git a/struct.go b/struct.go
index 2b7538a..568205e 100644
--- a/struct.go
+++ b/struct.go
@@ -5,25 +5,28 @@ import (
 )
 
 type Story struct {
-	Id          	int
+	Id int
 	//Deleted     	bool
-	Type        	string	/* story, comment (or job, poll, pollopt) */
-	Title		string 	/* title (only story) */
-	Text        	string 	/* comment text or possible text on story (HTML) */
-	Dead        	bool
-	Url     	string	/* verbatim parsed URL */
+	Type  string /* story, comment (or job, poll, pollopt) */
+	Title string /* title (only story) */
+	Text  string /* comment text or possible text on story (HTML) */
+	Dead  bool
+	Url   string /* verbatim parsed URL */
 	//NormalizedUrl   string	/* normalized */
-	Score       	int	/* only story */
-	Descendants 	int	/* comments on score or kids on comments */
+	Score       int /* only story */
+	Descendants int /* comments on score or kids on comments */
 	//Kids		[]int	/* id of the item's comments  */
-	Time        	int 	/* posted at */
-	By		string	/* hn commenter */
-	Links		[]Link	/* matched urls */
+	Time  int    /* posted at */
+	By    string /* hn commenter */
+	Links []Link /* matched urls */
 }
 
 type Link struct {
-	Url string
-	Field 		int	/* 2 = video, 1 = movies, 0 = bug */
+	Url   string
+	Field int    /* 2 = video, 1 = movies, 0 = bug */
+	Host  string /* parsed host, e.g. "www.youtube.com", "www.imdb.com" */
+	Param string /* extracted id, e.g. youtube "v" value or imdb "ttxxx" */
+	Type  string /* e.g. "video", "channel", "playlist", "film", "null" */
 }
 
 type URL struct {
@@ -31,5 +34,5 @@ type URL struct {
 
 type syncMaxItem struct {
 	max_item int
-	mu sync.Mutex
+	mu       sync.Mutex
 }
author	admin	2025-02-09 15:07:02 +0100
committer	admin	2025-02-09 15:07:02 +0100
commit	32369fcbd4285664a527846ed392a33bbcbc3d75 (patch)
tree	aabb365003b7a0b4c3a278bae117e28ff2c9c3a7
parent	a82dd3752342b0096a5070d297189b0bdf9ffe34 (diff)
download	hncrawler-32369fcbd4285664a527846ed392a33bbcbc3d75.tar.gz