package main
import (
"html"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"regexp"
"strconv"
"strings"
"time"
"github.com/anikhasibul/queue"
"github.com/jmoiron/sqlx"
log "github.com/sirupsen/logrus"
"mvdan.cc/xurls/v2"
)
// App bundles the runtime dependencies shared by all crawler routines.
type App struct {
// Config holds the (privately copied) application configuration,
// including database credentials. Config is declared elsewhere in the package.
Config *Config
// DB is the shared sqlx database handle opened in main.
DB *sqlx.DB
// Now records the process start time, set once in main.
Now time.Time
}
// main wires up the application: it takes ownership of the package-level
// configuration, connects to the MySQL-style database via sqlx, and starts
// the downward crawl over Hacker News item ids (walkDown). The other
// pipelines (topStories, updateAllDiscussions, orphan cleanup) are
// currently disabled.
func main() {
var err error
// Copy the global config into a local and blank the global, so the
// credentials are only reachable through app.Config afterwards.
// NOTE(review): _conf and Config are declared elsewhere in the package.
_own_conf := _conf
app := App{Config: &_own_conf}
_conf = Config{}
app.Now = time.Now()
// Log the connection target; the password is deliberately omitted here.
log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions))
// DSN format: user:password@tcp(host:port)/dbname?options
app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+"?"+app.Config.DBOptions)
if err != nil {
log.Fatal(err, "Cannot connect to database")
}
// sqlx.Connect already pings, but verify the connection explicitly.
if err = app.DB.Ping(); err != nil {
log.Fatal(err, "No connection to database")
}
defer app.DB.Close()
/*
app.deleteOrphanedArticles()
app.topStories()
app.deleteOrphanedArticles()
app.updateAllDiscussions()
*/
app.walkDown()
/**
 * Resolve redirects on stored urls.
 */
//return
}
// walkDown crawls Hacker News item ids from maxItem down to 1, fetching
// each item concurrently (bounded by maxRoutines) and persisting every
// story for which getStory extracted at least one link. A failed save is
// fatal for the whole process.
func (app *App) walkDown() {
	// Fixed starting point of this crawl run; use getMaxItem() to start
	// from the live maximum instead.
	maxItem := 32670334
	const maxRoutines = 200
	q := queue.New(maxRoutines)
	defer q.Close()
	for i := maxItem; i > 0; i-- {
		q.Add()
		go func(i int) {
			defer q.Done()
			if story, ok := getStory(i); ok {
				if err := app.saveStory(story); err != nil {
					// NOTE: log.Fatal exits immediately; in-flight
					// goroutines are abandoned (original behavior).
					log.Fatal(err)
				}
			}
			// Progress heartbeat on every 1000th item.
			if i%1000 == 0 {
				log.Infof("%s: Getting item %d\n", time.Now(), i)
			}
		}(i)
	}
	q.Wait()
}
// getMaxItem returns the current largest item id from the Hacker News API.
// It panics on any network, read, or parse failure.
func getMaxItem() int {
	response, err := http.Get("https://hacker-news.firebaseio.com/v0/maxitem.json")
	if err != nil {
		panic(err)
	}
	// Close the body so the underlying connection can be reused
	// (the original leaked it).
	defer response.Body.Close()
	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	// The endpoint returns a bare integer, e.g. "41495306".
	maxItem, err := strconv.Atoi(string(data))
	if err != nil {
		panic(err)
	}
	return maxItem
}
// topStories fetches the "top" and "best" story id lists from the HN API,
// merges them into one id list, and concurrently stores every qualifying
// story in the database. A failed save is fatal for the whole process.
func (app *App) topStories() {
	// Splice the two JSON arrays together: "[a,b]" + "[c,d]" -> "[a,b,c,d]".
	top := strings.TrimSuffix(string(getTopStories()), "]")
	best := strings.TrimPrefix(string(getBestStories()), "[")
	data := top + "," + best

	var storyIDs []int
	if err := json.Unmarshal([]byte(data), &storyIDs); err != nil {
		log.Warn("topStories: Unmarshaling json failed")
		panic(err)
	}

	const maxRoutines = 20
	q := queue.New(maxRoutines)
	defer q.Close()
	// The top and best lists overlap; process each id only once.
	seen := make(map[int]bool, len(storyIDs))
	for _, id := range storyIDs {
		if seen[id] {
			continue
		}
		seen[id] = true
		q.Add()
		go func(id int) {
			// Defer Done before any work so q.Wait cannot hang.
			defer q.Done()
			story, ok := getStory(id)
			if !ok {
				return
			}
			log.Infof("%+v\n", story)
			// Goroutine-local err: the original wrote a shared outer err
			// variable from every goroutine, which is a data race.
			if err := app.saveStory(story); err != nil {
				log.Fatal(err)
			}
		}(id)
	}
	q.Wait()
}
// Host/title patterns, compiled once at package init instead of on every
// call (the original recompiled them per call and per text link). Dots are
// escaped so "youtube.com" no longer matches hosts like "youtubeXcom".
var (
	videoHostRe  = regexp.MustCompile(`(?i)(youtube\.com)|(youtu\.be)|(vimeo\.com)`)
	movieHostRe  = regexp.MustCompile(`(?i)(imdb\.com)|(rottentomatoes\.com)|(metacritic\.com)`)
	videoTitleRe = regexp.MustCompile(`(?i)(\(video\))|(\[video\])`)
)

// getStory fetches item id from the HN API and extracts video links
// (Field 2) and movie-platform links (Field 1) from the story URL, title
// and text. It returns (story, true) only when the story is alive,
// sufficiently popular, and at least one link was found.
func getStory(id int) (Story, bool) {
	story := getDetail(id)
	if story.Dead {
		return story, false
	}
	// Skip low-engagement stories.
	if story.Type == "Story" && story.Score < 10 && story.Descendants < 10 {
		return story, false
	}
	// Tracks URLs already appended so the same link is not stored twice.
	duplicates := make(map[string]bool)
	story.Title = stripHNPrefix(story.Title)
	u, err := url.Parse(story.Url)
	if err != nil {
		log.Warnf("getStory: Parsing URL failed: %s\n", err.Error())
		return story, false
	}
	// Story URL points at a video host.
	if videoHostRe.MatchString(u.Host) {
		link := Link{Url: normalizeUrl(story.Url), Field: 2}
		story.Links = append(story.Links, link)
		log.Info("match youtube host")
		log.Infof("%+v\n", story)
		duplicates[link.Url] = true
	}
	// Story URL points at a movie platform. (The original performed this
	// exact check a second time further down, keyed on the raw URL while
	// this one stores the normalized URL — which could append the same
	// link twice. The redundant copy has been removed.)
	if movieHostRe.MatchString(u.Host) {
		link := Link{Url: normalizeUrl(story.Url), Field: 1}
		story.Links = append(story.Links, link)
		log.Info("match moview platform url")
		log.Infof("%+v\n", story)
		duplicates[link.Url] = true
	}
	// "(Video)" / "[video]" marker in the title.
	if videoTitleRe.MatchString(story.Title) && !duplicates[story.Url] {
		link := Link{Url: normalizeUrl(story.Url), Field: 2}
		story.Links = append(story.Links, link)
		log.Info("match video title")
		log.Infof("%+v\n", story)
		duplicates[story.Url] = true
	}
	// Scan every URL embedded in the story text.
	rxLinks := xurls.Relaxed().FindAllString(html.UnescapeString(story.Text), -1)
	for _, rxLink := range rxLinks {
		// Video host mentioned in the text.
		if videoHostRe.MatchString(rxLink) && !duplicates[rxLink] {
			link := Link{Url: normalizeUrl(rxLink), Field: 2}
			story.Links = append(story.Links, link)
			log.Info("match youtube text")
			log.Infof("%+v\n", story)
			duplicates[rxLink] = true
		}
		// Movie platform mentioned in the text.
		if movieHostRe.MatchString(rxLink) && !duplicates[rxLink] {
			link := Link{Url: normalizeUrl(rxLink), Field: 1}
			story.Links = append(story.Links, link)
			log.Info("match moview platform text")
			log.Infof("%+v\n", story)
			duplicates[rxLink] = true
		}
	}
	return story, len(story.Links) > 0
}
// getResponse GETs url, retrying up to four more times on a transport
// error before panicking with the original error. The caller is
// responsible for closing the returned body.
func getResponse(url string) *http.Response {
	response, err := http.Get(url)
	if err == nil {
		return response
	}
	for i := 0; i < 4; i++ {
		// First retry is logged at debug level only; repeated failures
		// escalate to warnings.
		if i == 0 {
			log.Debug("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
		} else {
			log.Warn("getResponse: Got error connecting to firebase/wikipedia. Retry: " + strconv.Itoa(i))
		}
		// Linear backoff between attempts; the original retried
		// immediately, hammering the endpoint.
		time.Sleep(time.Duration(i+1) * time.Second)
		if resp, retryErr := http.Get(url); retryErr == nil {
			return resp
		}
	}
	panic(err)
}
// getBestResponse fetches the "best stories" id list from the HN API.
func getBestResponse() *http.Response {
	return getResponse("https://hacker-news.firebaseio.com/v0/beststories.json")
}
// getTopResponse fetches the "top stories" id list from the HN API.
func getTopResponse() *http.Response {
	return getResponse("https://hacker-news.firebaseio.com/v0/topstories.json")
}
// getStoryResponse fetches a single HN item by its (stringified) id.
func getStoryResponse(item_id string) *http.Response {
	endpoint := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json"
	return getResponse(endpoint)
}
// getDetail fetches and decodes a single HN item by id, unescaping any
// HTML entities in its text field. It panics on read or decode failure.
func getDetail(id int) Story {
	response := getStoryResponse(strconv.Itoa(id))
	// Close the body so the underlying connection can be reused
	// (the original leaked it).
	defer response.Body.Close()
	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	var story Story
	if err = json.Unmarshal(data, &story); err != nil {
		// Log the payload as text; the original passed the raw []byte,
		// which prints as a list of numbers.
		log.Warn("getDetail: Unmarshaling json failed ", string(data))
		panic(err)
	}
	story.Text = html.UnescapeString(story.Text)
	return story
}
// getTopStories returns the raw JSON id array of current top stories.
// It panics if the body cannot be read.
func getTopStories() []byte {
	response := getTopResponse()
	// Close the body so the underlying connection can be reused
	// (the original leaked it).
	defer response.Body.Close()
	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	return data
}
// getBestStories returns the raw JSON id array of current best stories.
// It panics if the body cannot be read.
func getBestStories() []byte {
	response := getBestResponse()
	// Close the body so the underlying connection can be reused
	// (the original leaked it).
	defer response.Body.Close()
	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	return data
}
// updateAllDiscussions re-fetches every discussion posted within the last
// 3456000 seconds (40 days) and updates its stored state, bounded by
// maxRoutines concurrent fetches.
func (app *App) updateAllDiscussions() {
	const maxRoutines = 20
	var itemIDs []int
	// The original discarded this error; a failed query must not be silent.
	err := app.DB.Select(&itemIDs, "SELECT item_id FROM discussion WHERE posted_on > (UNIX_TIMESTAMP()-3456000) order by posted_on")
	if err != nil {
		log.Error(err)
		return
	}
	q := queue.New(maxRoutines)
	defer q.Close()
	for _, itemID := range itemIDs {
		q.Add()
		go func(itemID int) {
			defer q.Done()
			story, ok := getStory(itemID)
			if !ok {
				// Distinguish a fetch failure (zero-value story) from a
				// dead or unqualified story.
				if 0 == story.Id {
					log.Warnf("updateAllDiscussions: Failure getting Story for item_id: %d\n", itemID)
				} else if story.Descendants > 10 || story.Score > 10 {
					log.Infof(`
updateAllDiscussions: There is a bug. Can't update discussion with id %d.
NOTE: If this is happening again, probably the url was changed from Wikipedia to a different source.
%+v\n
`, itemID, story)
				}
				return
			}
			if err := app.updateDiscussion(story); err != nil {
				log.Warn(err)
				return
			}
		}(itemID)
	}
	q.Wait()
}