 database.go |  31 +-
 helper.go   |  57 +-
 main.go     | 230 +-
 struct.go   |  31 +-
 4 files changed, 288 insertions(+), 61 deletions(-)
diff --git a/database.go b/database.go
index 31594b4..bf35f52 100644
--- a/database.go
+++ b/database.go
@@ -65,13 +65,19 @@ func (app *App) saveStory(s Story) error {
updated_at,
story_id,
url,
- field
+ field,
+ host,
+ param,
+ type
) VALUES (
NULL,
?,
?,
?,
?,
+ ?,
+ ?,
+ ?,
?
);
`
@@ -82,7 +88,7 @@ func (app *App) saveStory(s Story) error {
}
defer stmt2.Close()
- _, err = stmt2.Exec(app.Now, app.Now, lid, l.Url, l.Field)
+ _, err = stmt2.Exec(app.Now, app.Now, lid, l.Url, l.Field, l.Host, l.Param, l.Type)
if err != nil {
log.Warn("saveStory: InsertLinks: Statement execution failed")
return err
@@ -480,3 +486,24 @@ func (app *App) createMaxStoredItem(new_max_item int) error {
return nil
}
+
+func (app *App) FixURLs() {
+ rows, err := app.DB.Query("SELECT url,field FROM links;")
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for rows.Next() {
+ var url string
+ var field int
+
+ err = rows.Scan(&url, &field)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ link := getURL(url, field)
+ log.Printf("%+v\n", link)
+ //log.Warnf("%+v\n", link)
+ }
+}
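FixURLs as added above only recomputes and logs each Link. A minimal sketch of the follow-up step, writing the recomputed fields back into the new columns, could look like the function below; the (url, field) key and the replacement of the stored url with the normalized one are assumptions, since the links schema is not part of this diff.

// fixURLsAndPersist is a hypothetical variant of FixURLs that also writes the
// recomputed values back. It closes the rows cursor and checks rows.Err().
func (app *App) fixURLsAndPersist() error {
	rows, err := app.DB.Query("SELECT url, field FROM links;")
	if err != nil {
		return err
	}
	defer rows.Close()

	// Collect first so we are not writing while the read cursor is still open.
	type rowFix struct {
		url   string
		field int
		link  Link
	}
	var fixes []rowFix
	for rows.Next() {
		var url string
		var field int
		if err := rows.Scan(&url, &field); err != nil {
			return err
		}
		fixes = append(fixes, rowFix{url, field, getURL(url, field)})
	}
	if err := rows.Err(); err != nil {
		return err
	}

	for _, f := range fixes {
		// Assumed key: the original (url, field) pair read from the table.
		_, err := app.DB.Exec(
			"UPDATE links SET url = ?, host = ?, param = ?, type = ? WHERE url = ? AND field = ?;",
			f.link.Url, f.link.Host, f.link.Param, f.link.Type, f.url, f.field,
		)
		if err != nil {
			log.Warn("fixURLsAndPersist: update failed: ", err)
		}
	}
	return nil
}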
diff --git a/helper.go b/helper.go
index 866ec5a..52fb003 100644
--- a/helper.go
+++ b/helper.go
@@ -2,9 +2,10 @@ package main
import (
_url "net/url"
+ "regexp"
"strings"
+
log "github.com/sirupsen/logrus"
- "regexp"
xhtml "golang.org/x/net/html"
)
@@ -24,16 +25,17 @@ func stripHNPrefix(title string) string {
func _removeParam(url, key string) string {
u, err := _url.Parse(url)
if err != nil {
- log.Fatal(err)
+ log.Fatal("_removeParam: parse: ", err)
}
q := u.Query()
q.Del(key)
- u.RawQuery = q.Encode()
+ u.RawQuery = q.Encode()
return u.String()
}
func normalizeUrl(url string) string {
+ url = strings.ToLower(url)
/**
* Redirect http:// to https://
*/
@@ -53,7 +55,8 @@ func normalizeUrl(url string) string {
*/
u, err := _url.Parse(url)
if err != nil {
- log.Fatal(err)
+ log.Warn("normalizeUrl: parse: ", err)
+ return url
}
if "" == u.Scheme {
@@ -98,7 +101,7 @@ func normalizeUrl(url string) string {
u.RawQuery = q.Encode()
url = u.String()
-
+
//r := regexp.MustCompile("youtu.be/")
//url = r.ReplaceAllString(url, "youtube.com/watch?v=")
}
@@ -142,10 +145,10 @@ func normalizeUrl(url string) string {
*/
u, err = _url.Parse(url)
if err != nil {
- log.Fatal(err)
+ log.Fatal("normalizeUrl: parse: append www: ", err)
}
- if ! strings.HasPrefix(u.Host, "www.") {
+ if !strings.HasPrefix(u.Host, "www.") && u.Host != "music.youtube.com" {
u.Host = "www." + u.Host
}
@@ -173,7 +176,7 @@ func normalizeUrl(url string) string {
/**
* remove tracking param "si", "feature" and "pp" from every youtube video
*/
- match, err = regexp.MatchString("/www.youtube.com/", url)
+ match, err = regexp.MatchString("/(www|music).youtube.com/", url)
if err != nil {
log.Fatal(err)
}
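The youtube host check above still calls regexp.MatchString on every invocation and leaves the dots unescaped, so they match any character. If stricter, cheaper matching is wanted, a precompiled variant could look like this sketch (not what the code currently does):

// Package-level, compiled once; dots escaped so they only match literal dots.
var youtubeHostRe = regexp.MustCompile(`/(www|music)\.youtube\.com/`)

// inside normalizeUrl, replacing the MatchString call
// (the err check after it then becomes unnecessary):
match = youtubeHostRe.MatchString(url)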
@@ -196,27 +199,27 @@ func normalizeUrl(url string) string {
}
func RemoveNode(root_node *xhtml.Node, remove_me *xhtml.Node) {
- found_node := false
- check_nodes := make(map[int]*xhtml.Node)
- i := 0
+ found_node := false
+ check_nodes := make(map[int]*xhtml.Node)
+ i := 0
+
+ // loop through siblings
+ for n := root_node.FirstChild; n != nil; n = n.NextSibling {
+ if n == remove_me {
+ found_node = true
+ n.Parent.RemoveChild(n)
+ }
- // loop through siblings
- for n := root_node.FirstChild; n != nil; n = n.NextSibling {
- if n == remove_me {
- found_node = true
- n.Parent.RemoveChild(n)
+ check_nodes[i] = n
+ i++
}
- check_nodes[i] = n
- i++
- }
-
- // check if removing node is found
- // if yes no need to check childs returning
- // if no continue loop through childs and so on
- if found_node == false {
- for _, item := range check_nodes {
- RemoveNode(item, remove_me)
+ // check if removing node is found
+ // if yes no need to check childs returning
+ // if no continue loop through childs and so on
+ if found_node == false {
+ for _, item := range check_nodes {
+ RemoveNode(item, remove_me)
+ }
}
- }
}
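One caveat with the new url = strings.ToLower(url) at the top of normalizeUrl: it also lowercases the path and query, and YouTube video and playlist IDs (the values getURL later extracts) are case sensitive. If only the case-insensitive parts should be normalized, a sketch limited to scheme and host would be:

// lowercaseSchemeAndHost is a hypothetical helper: scheme and host are
// case-insensitive per RFC 3986, path and query are left untouched.
func lowercaseSchemeAndHost(raw string) string {
	u, err := _url.Parse(raw)
	if err != nil {
		log.Warn("lowercaseSchemeAndHost: parse: ", err)
		return raw
	}
	u.Scheme = strings.ToLower(u.Scheme)
	u.Host = strings.ToLower(u.Host)
	return u.String()
}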
diff --git a/main.go b/main.go
index 98f4ab2..7cc7449 100644
--- a/main.go
+++ b/main.go
@@ -44,6 +44,8 @@ func main() {
}
defer app.DB.Close()
+ app.FixURLs()
+ return
/*
app.deleteOrphanedArticles()
app.topStories()
@@ -229,9 +231,7 @@ func getStory(id int) (Story, bool) {
return Story, false
}
if is_video {
- var link Link
- link.Url = normalizeUrl(Story.Url)
- link.Field = 2
+ link := getURL(Story.Url, 2)
Story.Links = append(Story.Links, link)
log.Info("match youtube host")
@@ -249,9 +249,7 @@ func getStory(id int) (Story, bool) {
return Story, false
}
if is_movie {
- var link Link
- link.Url = normalizeUrl(Story.Url)
- link.Field = 1
+ link := getURL(Story.Url, 1)
Story.Links = append(Story.Links, link)
log.Info("match moview platform url")
@@ -271,9 +269,7 @@ func getStory(id int) (Story, bool) {
if is_video {
if !duplicates[Story.Url] {
- var link Link
- link.Url = normalizeUrl(Story.Url)
- link.Field = 2
+ link := getURL(Story.Url, 2)
Story.Links = append(Story.Links, link)
log.Info("match video title")
@@ -295,9 +291,7 @@ func getStory(id int) (Story, bool) {
if is_movie {
if !duplicates[Story.Url] {
- var link Link
- link.Url = normalizeUrl(Story.Url)
- link.Field = 1
+ link := getURL(Story.Url, 1)
Story.Links = append(Story.Links, link)
log.Info("match moview platform url")
@@ -362,9 +356,7 @@ func getStory(id int) (Story, bool) {
if is_video {
if !duplicates[l] {
- var link Link
- link.Url = normalizeUrl(l)
- link.Field = 2
+ link := getURL(l, 2)
Story.Links = append(Story.Links, link)
log.Info("match youtube text")
@@ -387,9 +379,7 @@ func getStory(id int) (Story, bool) {
if is_movie {
if !duplicates[l] {
- var link Link
- link.Url = normalizeUrl(l)
- link.Field = 1
+ link := getURL(l, 1)
Story.Links = append(Story.Links, link)
log.Info("match moview platform text")
@@ -410,6 +400,210 @@ func getStory(id int) (Story, bool) {
}
}
+func getURL(_url string, field int) Link {
+
+ _url = normalizeUrl(_url)
+
+ var link Link
+
+ u, err := url.Parse(_url)
+ if err != nil {
+ log.Warnf("getURL: Parsing URL failed: %s \n", err.Error())
+ return link
+ }
+
+ link.Field = field
+ link.Url = _url
+ link.Host = u.Host
+
+ switch u.Host {
+ case "www.youtube.com",
+ "music.youtube.com",
+ "www.music.youtube.com":
+ //log.Warn(_url)
+ if strings.HasPrefix(u.Path, "/@") || strings.HasPrefix(u.Path, "/c/") || strings.HasPrefix(u.Path, "/user/") || strings.HasPrefix(u.Path, "/channel/") {
+ link.Type = "channel"
+
+ path_parts := strings.Split(u.Path, "/")
+
+ if strings.HasPrefix(u.Path, "/@") {
+ link.Param = strings.TrimPrefix(path_parts[1], "@")
+ } else {
+ link.Param = path_parts[2]
+ }
+
+ } else if strings.HasPrefix(u.Path, "/shorts") || strings.HasPrefix(u.Path, "/live") || strings.HasPrefix(u.Path, "/embed") {
+ link.Type = "video"
+
+ path_parts := strings.Split(u.Path, "/")
+
+ if len(path_parts) > 2 {
+ link.Param = path_parts[2]
+ } else {
+ link.Param = ""
+ link.Type = "null"
+ }
+
+ } else if strings.HasPrefix(u.Path, "/playlist") {
+ link.Type = "playlist"
+ m, err := url.ParseQuery(u.RawQuery)
+ if err != nil {
+ log.Warnf("getURL: Parsing RawQuery for Youtube failed: %s \n", err.Error())
+ return link
+ }
+
+ /**
+ * ?list= includes the playlist id
+ */
+
+ p, ok := m["list"]
+ if !ok {
+ log.Warnf("getURL: Playlist: Youtube has no param: %s \n", link.Url)
+ } else {
+ link.Param = p[0]
+ }
+
+ } else if strings.HasPrefix(u.Path, "/watch/") {
+
+ link.Type = "video"
+ path_parts := strings.Split(u.Path, "/")
+ link.Param = path_parts[2]
+
+ } else if strings.HasPrefix(u.Path, "/watch") {
+
+ link.Type = "video"
+
+ m, err := url.ParseQuery(u.RawQuery)
+ if err != nil {
+ log.Warnf("getURL: Parsing RawQuery for Youtube failed: %s \n", err.Error())
+ return link
+ }
+
+ /**
+ * ?v= includes the video id
+ */
+ p, ok := m["v"]
+ if !ok {
+ //log.Infof("getURL: v=VideoID: Youtube has no param: %s \n", link.Url)
+ log.Warnf("getURL: v=VideoID: Youtube has no param: %s \n", link.Url)
+ } else {
+ link.Param = p[0]
+ }
+ } else if u.Path == "/" || u.Path == "" {
+ link.Type = "null"
+ link.Param = ""
+ } else {
+ link.Type = "channel"
+ path_parts := strings.Split(u.Path, "/")
+ link.Param = path_parts[1]
+ }
+
+ /**
+ * nice debug
+ if link.Param == "" && !strings.HasPrefix(u.Path, "/clip") && !strings.HasPrefix(u.Path, "/results") && u.Path != "/watch" {
+ log.Fatal(link)
+ }
+ */
+ break
+ case "www.imdb.com":
+ //log.Warn(u.Path)
+ if strings.HasPrefix(u.Path, "/title/") {
+
+ link.Type = "film"
+
+ path_parts := strings.Split(u.Path, "/")
+ if strings.HasPrefix(path_parts[2], "tt") {
+ link.Param = path_parts[2]
+ } else {
+ log.Infof("getURL: IMDB: path_parts[2] doesn't have a film id: %s %s %+v", u.Path, path_parts[2], path_parts)
+ log.Info(path_parts)
+ }
+
+ } else if strings.HasPrefix(u.Path, "/tt") {
+ link.Type = "film"
+
+ path_parts := strings.Split(u.Path, "/")
+ link.Param = path_parts[1]
+
+ } else if strings.HasPrefix(u.Path, "/name/") {
+
+ link.Type = "actor"
+ path_parts := strings.Split(u.Path, "/")
+ link.Param = path_parts[2]
+
+ } else if strings.HasPrefix(u.Path, "/character/") {
+
+ link.Type = "character"
+ path_parts := strings.Split(u.Path, "/")
+ link.Param = path_parts[2]
+
+ } else if strings.HasPrefix(u.Path, "/company/") {
+
+ link.Type = "company"
+ path_parts := strings.Split(u.Path, "/")
+ link.Param = path_parts[2]
+
+ } else if strings.HasPrefix(u.Path, "/video/") || strings.HasPrefix(u.Path, "/videoplayer/") {
+
+ link.Type = "video"
+ path_parts := strings.Split(u.Path, "/")
+ link.Param = path_parts[2]
+
+ } else if strings.HasPrefix(u.Path, "/user/") {
+
+ link.Type = "user"
+ path_parts := strings.Split(u.Path, "/")
+ link.Param = path_parts[2]
+
+ } else if strings.HasPrefix(u.Path, "/news/") || strings.HasPrefix(u.Path, "/board/announcement") {
+
+ link.Type = "news"
+ path_parts := strings.Split(u.Path, "/")
+ link.Param = path_parts[2]
+
+ } else if strings.HasPrefix(u.Path, "/review/") {
+
+ link.Type = "review"
+ path_parts := strings.Split(u.Path, "/")
+ link.Param = path_parts[2]
+
+ } else if strings.HasPrefix(u.Path, "/404") || u.Path == "" || u.Path == "/" || strings.HasPrefix(u.Path, "/interfaces") || strings.HasPrefix(u.Path, "/titanic") || strings.HasPrefix(u.Path, "/freedive/") || strings.HasPrefix(u.Path, "/conditions") || strings.HasPrefix(u.Path, "/help/") || strings.HasPrefix(u.Path, "/showtimes/") || strings.HasPrefix(u.Path, "/1") || strings.Contains(u.Path, "/mediaviewer/") || strings.HasPrefix(u.Path, "/media") || strings.HasPrefix(u.Path, "/licensing") || strings.HasPrefix(u.Path, "/lists") || strings.HasPrefix(u.Path, "/stats") || strings.HasPrefix(u.Path, "/rg/") || strings.HasPrefix(u.Path, "/hackernews") || strings.HasPrefix(u.Path, "/robots.txt") || strings.HasPrefix(u.Path, "/: the prestige") || strings.HasPrefix(u.Path, "/features/") || strings.HasPrefix(u.Path, "/keyword/") {
+
+ link.Type = "null"
+ link.Param = ""
+
+ } else if strings.HasPrefix(u.Path, "/list/") || strings.HasPrefix(u.Path, "/chart/") || strings.Contains(u.Path, "/top-rated-") || strings.HasPrefix(u.Path, "/whats-on-tv/") {
+
+ link.Type = "film_list"
+ path_parts := strings.Split(u.Path, "/")
+ link.Param = path_parts[2]
+
+ } else if strings.HasPrefix(u.Path, "/find") || strings.HasPrefix(u.Path, "/search") || strings.HasPrefix(u.Path, "/filmosearch") {
+
+ link.Type = "imdb_search"
+
+ } else {
+ log.Infof("getURL: IMDB: path does not include title: %s \n", u.Path)
+ //log.Warnf("getURL: IMDB: path does not include title: %s \n", u.Path)
+ log.Fatalf("getURL: IMDB: path does not include title: %s \n", u.Path)
+ }
+ break
+ case "www.vimeo.com":
+ // todo
+ default:
+ //log.Warnf("getURL: Host not recognized. Missing param: %s", u.Host)
+ return link
+ }
+
+ if link.Host == "www.music.youtube.com" {
+ u.Host = "music.youtube.com"
+ link.Url = u.String()
+ link.Host = "music.youtube.com"
+ }
+
+ return link
+}
+
func getResponse(url string) *http.Response {
var err error
var response *http.Response
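getURL is a pure mapping from (url, field) to a Link, so a small table-driven test is a cheap way to pin down the host/param/type behaviour. A sketch, with expected values that assume normalizeUrl does not rewrite these particular URLs beyond what the hunks above show; the IDs are lowercase on purpose, since normalizeUrl currently lowercases the whole URL:

package main

import "testing"

// Sketch of a table-driven test for getURL.
func TestGetURL(t *testing.T) {
	cases := []struct {
		in    string
		field int
		typ   string
		param string
	}{
		{"https://www.youtube.com/watch?v=abc123", 2, "video", "abc123"},
		{"https://www.youtube.com/playlist?list=pl123abc", 2, "playlist", "pl123abc"},
		{"https://www.youtube.com/@somechannel", 2, "channel", "somechannel"},
		{"https://www.imdb.com/title/tt0133093/", 1, "film", "tt0133093"},
	}
	for _, c := range cases {
		got := getURL(c.in, c.field)
		if got.Type != c.typ || got.Param != c.param {
			t.Errorf("getURL(%q): got type=%q param=%q, want type=%q param=%q",
				c.in, got.Type, got.Param, c.typ, c.param)
		}
	}
}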
diff --git a/struct.go b/struct.go
index 2b7538a..568205e 100644
--- a/struct.go
+++ b/struct.go
@@ -5,25 +5,28 @@ import (
)
type Story struct {
- Id int
+ Id int
//Deleted bool
- Type string /* story, comment (or job, poll, pollopt) */
- Title string /* title (only story) */
- Text string /* comment text or possible text on story (HTML) */
- Dead bool
- Url string /* verbatim parsed URL */
+ Type string /* story, comment (or job, poll, pollopt) */
+ Title string /* title (only story) */
+ Text string /* comment text or possible text on story (HTML) */
+ Dead bool
+ Url string /* verbatim parsed URL */
//NormalizedUrl string /* normalized */
- Score int /* only story */
- Descendants int /* comments on score or kids on comments */
+ Score int /* only story */
+ Descendants int /* comments on score or kids on comments */
//Kids []int /* id of the item's comments */
- Time int /* posted at */
- By string /* hn commenter */
- Links []Link /* matched urls */
+ Time int /* posted at */
+ By string /* hn commenter */
+ Links []Link /* matched urls */
}
type Link struct {
- Url string
- Field int /* 2 = video, 1 = movies, 0 = bug */
+ Url string
+ Field int /* 2 = video, 1 = movies, 0 = bug */
+ Host string /* e.g. "www.youtube.com", "www.imdb.com" */
+ Param string /* e.g. the "v" value for a youtube video, the "ttNNNNNNN" id for an imdb title */
+ Type string /* e.g. "video", "channel", "playlist" */
}
type URL struct {
@@ -31,5 +34,5 @@ type URL struct {
type syncMaxItem struct {
max_item int
- mu sync.Mutex
+ mu sync.Mutex
}
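saveStory now inserts into host, param and type, but no schema change for the links table appears in this commit. A sketch of the corresponding migration, assuming SQLite and that the columns do not exist yet (the column types are guesses):

// migrateLinkColumns is a hypothetical one-off migration for the new Link columns.
func (app *App) migrateLinkColumns() error {
	stmts := []string{
		`ALTER TABLE links ADD COLUMN host TEXT NOT NULL DEFAULT '';`,
		`ALTER TABLE links ADD COLUMN param TEXT NOT NULL DEFAULT '';`,
		`ALTER TABLE links ADD COLUMN type TEXT NOT NULL DEFAULT '';`,
	}
	for _, s := range stmts {
		if _, err := app.DB.Exec(s); err != nil {
			return err
		}
	}
	return nil
}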