summaryrefslogtreecommitdiff
path: root/helper.go
diff options
context:
space:
mode:
authoradmin2024-09-13 20:04:17 +0200
committeradmin2024-09-13 20:04:17 +0200
commitb190512e951efdd1ed4642eed8726bf7bdf2c022 (patch)
tree635c5ca837aaf48ddcfbc6c65c1055c7402f84f4 /helper.go
parent6b091dc7ab2c4fdaed0675ab57ea05e4ddb81e5b (diff)
downloadhncrawler-b190512e951efdd1ed4642eed8726bf7bdf2c022.tar.gz
changed xurl to goquery
Diffstat (limited to 'helper.go')
-rw-r--r--helper.go112
1 files changed, 83 insertions, 29 deletions
diff --git a/helper.go b/helper.go
index af5f4c1..866ec5a 100644
--- a/helper.go
+++ b/helper.go
@@ -5,6 +5,7 @@ import (
"strings"
log "github.com/sirupsen/logrus"
"regexp"
+ xhtml "golang.org/x/net/html"
)
func stripHNPrefix(title string) string {
@@ -32,6 +33,10 @@ func _removeParam(url, key string) string {
}
func normalizeUrl(url string) string {
+
+ /**
+ * Redirect http:// to https://
+ */
match, err := regexp.MatchString("^http://", url)
if err != nil {
log.Fatal(err)
@@ -42,7 +47,10 @@ func normalizeUrl(url string) string {
url = r.ReplaceAllString(url, "https://")
}
- // add missing https:// if no scheme
+ /**
+ * add missing https:// if no scheme
+ * Fun fact: https://news.ycombinator.com/item?id=27351340 broke this part
+ */
u, err := _url.Parse(url)
if err != nil {
log.Fatal(err)
@@ -56,7 +64,9 @@ func normalizeUrl(url string) string {
}
}
-
+ /**
+ * Apple TV accepts youtube:// scheme
+ */
match, err = regexp.MatchString("youtube://", url)
if err != nil {
log.Fatal(err)
@@ -66,6 +76,9 @@ func normalizeUrl(url string) string {
url = r.ReplaceAllString(url, "https://")
}
+ /**
+ * Redirect youtu.be to desktop version
+ */
match, err = regexp.MatchString("youtu.be/", url)
if err != nil {
log.Fatal(err)
@@ -73,12 +86,6 @@ func normalizeUrl(url string) string {
if match {
log.Debug("normalize: ", "youtu.be ", url)
- /**
- * remove tracking param "si"
- */
- url = _removeParam(url, "si")
- url = _removeParam(url, "feature")
-
u, err := _url.Parse(url)
if err != nil {
log.Fatal(err)
@@ -96,6 +103,9 @@ func normalizeUrl(url string) string {
//url = r.ReplaceAllString(url, "youtube.com/watch?v=")
}
+ /**
+ * Redirect m.youtube.com to desktop version
+ */
match, err = regexp.MatchString("/m.youtube.com/", url)
if err != nil {
log.Fatal(err)
@@ -107,12 +117,14 @@ func normalizeUrl(url string) string {
* remove tracking param "si"
*/
url = _removeParam(url, "si")
- url = _removeParam(url, "feature")
r := regexp.MustCompile("/m.youtube.com/")
url = r.ReplaceAllString(url, "/www.youtube.com/")
}
+ /**
+ * Redirect m.imdb.com to desktop version
+ */
match, err = regexp.MatchString("/m.imdb.com/", url)
if err != nil {
log.Fatal(err)
@@ -120,26 +132,56 @@ func normalizeUrl(url string) string {
if match {
log.Debug("normalize: ", "m.imdb.com ", url)
+ r := regexp.MustCompile("/m.imdb.com/")
+ url = r.ReplaceAllString(url, "/www.imdb.com")
+ }
+
+ /**
+ * Prepend www. to the host to normalize the URL. Exclude relative URLs starting with // since this is not recognized by Go
+ * Screw that, weird edge case. Someone pasted a
+ */
+ u, err = _url.Parse(url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ if ! strings.HasPrefix(u.Host, "www.") {
+ u.Host = "www." + u.Host
+ }
+
+ url = u.String()
+
+ /**
+ * Redirects youtube.com/c/<name> to youtube.com/@<name>
+ */
+ match, err = regexp.MatchString("/.youtube.com/c/", url)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if match {
+ log.Debug("normalize: ", "youtube.com/c/ -> @ ", url)
+
/**
* remove tracking param "si"
*/
url = _removeParam(url, "si")
- url = _removeParam(url, "feature")
- r := regexp.MustCompile("/m.imdb.com/")
- url = r.ReplaceAllString(url, "/www.imdb.com")
+ r := regexp.MustCompile("youtube.com/c/")
+ url = r.ReplaceAllString(url, "youtube.com/@")
}
- /*
- match, err = regexp.MatchString("m.wikipedia.org", url)
+ /**
+ * remove tracking param "si", "feature" and "pp" from every youtube video
+ */
+ match, err = regexp.MatchString("/www.youtube.com/", url)
if err != nil {
log.Fatal(err)
}
if match {
- r := regexp.MustCompile("m.wikipedia.org")
- url = r.ReplaceAllString(url, "wikipedia.org")
+ url = _removeParam(url, "si")
+ url = _removeParam(url, "pp")
+ url = _removeParam(url, "feature")
}
- */
/**
* remove tracking utm_ params
@@ -150,19 +192,31 @@ func normalizeUrl(url string) string {
url = _removeParam(url, "utm_term")
url = _removeParam(url, "utm_content")
- u, err = _url.Parse(url)
- if err != nil {
- log.Fatal(err)
- }
+ return url
+}
- /**
- * Append www. to normalize URL. exclude relative URLs starting with // since this is not recognized by Go
- * Screw that, wierd edge case. Someone pasted a
- */
- if ! strings.HasPrefix(u.Host, "www.") {
- u.Host = "www." + u.Host
+func RemoveNode(root_node *xhtml.Node, remove_me *xhtml.Node) {
+ found_node := false
+ check_nodes := make(map[int]*xhtml.Node)
+ i := 0
+
+ // loop through the direct children of root_node
+ for n := root_node.FirstChild; n != nil; n = n.NextSibling {
+ if n == remove_me {
+ found_node = true
+ n.Parent.RemoveChild(n)
}
- url = u.String()
- return url
+ check_nodes[i] = n
+ i++
+ }
+
+ // check whether the node to remove was found
+ // if yes, there is no need to check the children; just return
+ // if no, recurse into each child, and so on
+ if found_node == false {
+ for _, item := range check_nodes {
+ RemoveNode(item, remove_me)
+ }
+ }
}