summary refs log tree commit diff
path: root/main.go
diff options
context:
space:
mode:
author	admin	2026-03-29 16:50:38 +0200
committer	admin	2026-03-29 16:50:38 +0200
commit	f3300bec030793d40115a08f46a7cbf49f06c2fd (patch)
tree	16e19878b474aeed873a56f1ac37a1819dc360d1 /main.go
parent	1b28f44a9f1c90e49ddf0149becaa004addc50d3 (diff)
download	curious-crawler-f3300bec030793d40115a08f46a7cbf49f06c2fd.tar.gz
fix missing user agent
Diffstat (limited to 'main.go')
-rw-r--r--	main.go	89
1 file changed, 53 insertions(+), 36 deletions(-)
diff --git a/main.go b/main.go
index f3d0a6b..feecf27 100644
--- a/main.go
+++ b/main.go
@@ -53,6 +53,7 @@ func main() {
//app.saveAllCategories()
app.updateAllDiscussions()
//app.walkDown()
+ //app.saveExcerpts()
/**
* Resolve redirects on stored urls.
@@ -76,17 +77,20 @@ func (app *App) walkDown() {
//max_item := 15494000
//max_item := 15038031
//max_item := 14450000
+ //max_item := 47528683
+ //max_item := 46750000
- const maxRoutines = 20
+ const maxRoutines = 10
q := queue.New(maxRoutines)
defer q.Close()
- for i := max_item; i > 22600000; i-- {
+ //for i := max_item; i > 22600000; i-- {
+ for i := max_item; i > 44921609; i-- {
q.Add()
go func(i int) {
defer q.Done()
- Story, ok := getStory(i)
+ Story, ok := app.getStory(i)
if ok {
log.Infof("%+v\n", Story)
err = app.saveStory(Story)
@@ -126,8 +130,8 @@ func getMaxItem() int {
func (app *App) topStories() {
var err error
- data1 := strings.TrimSuffix(string(getTopStories()), "]")
- data2 := strings.TrimPrefix(string(getBestStories()), "[")
+ data1 := strings.TrimSuffix(string(app.getTopStories()), "]")
+ data2 := strings.TrimPrefix(string(app.getBestStories()), "[")
data1 = data1 + ","
data := data1 + data2
@@ -146,7 +150,7 @@ func (app *App) topStories() {
for _, id := range story_ids {
q.Add()
go func(id int) {
- Story, ok := getStory(id)
+ Story, ok := app.getStory(id)
defer q.Done()
if ok {
log.Infof("%+v\n", Story)
@@ -155,14 +159,17 @@ func (app *App) topStories() {
log.Fatal(err)
}
- log.Debug("topStories: crawling for Categories")
- categories, ok := app.crawlForCategories(Story.Url)
- if ok {
- article_id := app.getArticleIdFromUrl(Story.Url)
- app.saveCategory(article_id, categories)
- } else {
- log.Warn("topStories: crawling for Categories: not ok")
- }
+ /*
+ log.Debug("topStories: crawling for Categories")
+ categories, ok := app.crawlForCategories(Story.Url)
+ if ok {
+ article_id := app.getArticleIdFromUrl(Story.Url)
+ app.saveCategory(article_id, categories)
+ } else {
+ log.Warn("topStories: crawling for Categories: not ok")
+ time.Sleep(time.Duration(app.Config.Delay) * time.Second)
+ }
+ */
}
}(id)
@@ -170,8 +177,8 @@ func (app *App) topStories() {
q.Wait()
}
-func getStory(id int) (Story, bool) {
- Story := getDetail(id)
+func (app *App) getStory(id int) (Story, bool) {
+ Story := app.getDetail(id)
if Story.Dead || Story.Deleted {
return Story, false
}
@@ -205,17 +212,27 @@ func getStory(id int) (Story, bool) {
}
if is_wiki {
Story.Url = wikipediaNormalizeUrl(Story.Url)
- Story.Url = wikipediaRealUrl(Story.Url)
+ Story.Url = app.wikipediaRealUrl(Story.Url)
return Story, true
}
return Story, false
}
-func getResponse(url string) *http.Response {
+func (app *App) getResponse(url string) *http.Response {
var err error
var response *http.Response
- response, err = http.Get(url)
+ req, err := http.NewRequest("GET", url, nil)
+ if err != nil {
+ // Fehlerbehandlung
+ }
+
+ req.Header.Set("User-Agent", app.Config.UserAgent) // Hier den User-Agent setzen
+
+ client := &http.Client{}
+
+ response, err = client.Do(req)
+ //response, err = http.Get(url)
if err != nil {
for i := 0; i < 4; i++ {
if i == 0 {
@@ -233,33 +250,33 @@ func getResponse(url string) *http.Response {
return response
}
-func getBestResponse() *http.Response {
+func (app *App) getBestResponse() *http.Response {
_url := "https://hacker-news.firebaseio.com/v0/beststories.json"
- return getResponse(_url)
+ return app.getResponse(_url)
}
-func getTopResponse() *http.Response {
+func (app *App) getTopResponse() *http.Response {
_url := "https://hacker-news.firebaseio.com/v0/topstories.json"
- return getResponse(_url)
+ return app.getResponse(_url)
}
-func getWikipediaResponse(title string) *http.Response {
+func (app *App) getWikipediaResponse(title string) *http.Response {
_url := "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=" + title
- return getResponse(_url)
+ return app.getResponse(_url)
}
-func getWikipediaRedirectResponse(hostname, title string) *http.Response {
+func (app *App) getWikipediaRedirectResponse(hostname, title string) *http.Response {
_url := "https://" + hostname + "/w/api.php?action=query&prop=info&format=json&redirects=1&inprop=url&titles=" + title
- return getResponse(_url)
+ return app.getResponse(_url)
}
-func getStoryResponse(item_id string) *http.Response {
+func (app *App) getStoryResponse(item_id string) *http.Response {
_url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json"
- return getResponse(_url)
+ return app.getResponse(_url)
}
-func getDetail(id int) Story {
- response := getStoryResponse(strconv.Itoa(id))
+func (app *App) getDetail(id int) Story {
+ response := app.getStoryResponse(strconv.Itoa(id))
data, err := ioutil.ReadAll(response.Body)
if err != nil {
panic(err)
@@ -273,8 +290,8 @@ func getDetail(id int) Story {
return story
}
-func getTopStories() []byte {
- response := getTopResponse()
+func (app *App) getTopStories() []byte {
+ response := app.getTopResponse()
data, err := ioutil.ReadAll(response.Body)
if err != nil {
panic(err)
@@ -283,8 +300,8 @@ func getTopStories() []byte {
return data
}
-func getBestStories() []byte {
- response := getBestResponse()
+func (app *App) getBestStories() []byte {
+ response := app.getBestResponse()
data, err := ioutil.ReadAll(response.Body)
if err != nil {
@@ -307,7 +324,7 @@ func (app *App) updateAllDiscussions() {
q.Add()
go func(item_id int) {
defer q.Done()
- Story, ok := getStory(item_id)
+ Story, ok := app.getStory(item_id)
if !ok {
/**
* Check if we got a network error or a dead story.