summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoradmin2026-03-29 16:50:38 +0200
committeradmin2026-03-29 16:50:38 +0200
commitf3300bec030793d40115a08f46a7cbf49f06c2fd (patch)
tree16e19878b474aeed873a56f1ac37a1819dc360d1
parent1b28f44a9f1c90e49ddf0149becaa004addc50d3 (diff)
downloadcurious-crawler-f3300bec030793d40115a08f46a7cbf49f06c2fd.tar.gz
fix missing user agent
-rw-r--r--categories.go9
-rw-r--r--database.go2
-rw-r--r--main.go89
-rw-r--r--wikipedia.go15
4 files changed, 70 insertions, 45 deletions
diff --git a/categories.go b/categories.go
index a92b0df..a1bd394 100644
--- a/categories.go
+++ b/categories.go
@@ -31,7 +31,7 @@ func (app *App) queryWMLabs(wiki_url string) ([]string, bool) {
}
log.Debugf("queryWMLabs: wm_url: %s", wm_url)
- response := getResponse(wm_url)
+ response := app.getResponse(wm_url)
resp_data, err := ioutil.ReadAll(response.Body)
if err != nil {
log.Warnf("queryWMLabs: Reading response data failed for %s", wm_url)
@@ -95,7 +95,9 @@ func (app *App) crawlWMLabs(wiki_url string) (Category, bool) {
}
var category Category
- c := colly.NewCollector()
+ c := colly.NewCollector(
+ colly.UserAgent(app.Config.UserAgent),
+ )
c.OnHTML(".sort-entry--wikiproject", func(e *colly.HTMLElement) {
category.Name = strings.TrimSpace(e.Text)
@@ -154,6 +156,9 @@ func normalizeCategory(s string) string {
cat = strings.TrimSuffix(cat, "sub-project")
cat = strings.TrimSuffix(cat, "Project")
cat = strings.TrimSuffix(cat, "project")
+ if strings.ToLower(cat) == "project-independent assessment" {
+ return ""
+ }
if strings.Contains(strings.ToLower(cat), "articles") {
return ""
diff --git a/database.go b/database.go
index b029ca7..182318e 100644
--- a/database.go
+++ b/database.go
@@ -375,7 +375,7 @@ func (app *App) updateWikipediaUrls() {
log.Fatal(err)
}
- real_url := wikipediaRealUrl(wiki_url)
+ real_url := app.wikipediaRealUrl(wiki_url)
if real_url != wiki_url && "" != real_url {
/**
diff --git a/main.go b/main.go
index f3d0a6b..feecf27 100644
--- a/main.go
+++ b/main.go
@@ -53,6 +53,7 @@ func main() {
//app.saveAllCategories()
app.updateAllDiscussions()
//app.walkDown()
+ //app.saveExcerpts()
/**
* Resolve redirects on stored urls.
@@ -76,17 +77,20 @@ func (app *App) walkDown() {
//max_item := 15494000
//max_item := 15038031
//max_item := 14450000
+ //max_item := 47528683
+ //max_item := 46750000
- const maxRoutines = 20
+ const maxRoutines = 10
q := queue.New(maxRoutines)
defer q.Close()
- for i := max_item; i > 22600000; i-- {
+ //for i := max_item; i > 22600000; i-- {
+ for i := max_item; i > 44921609; i-- {
q.Add()
go func(i int) {
defer q.Done()
- Story, ok := getStory(i)
+ Story, ok := app.getStory(i)
if ok {
log.Infof("%+v\n", Story)
err = app.saveStory(Story)
@@ -126,8 +130,8 @@ func getMaxItem() int {
func (app *App) topStories() {
var err error
- data1 := strings.TrimSuffix(string(getTopStories()), "]")
- data2 := strings.TrimPrefix(string(getBestStories()), "[")
+ data1 := strings.TrimSuffix(string(app.getTopStories()), "]")
+ data2 := strings.TrimPrefix(string(app.getBestStories()), "[")
data1 = data1 + ","
data := data1 + data2
@@ -146,7 +150,7 @@ func (app *App) topStories() {
for _, id := range story_ids {
q.Add()
go func(id int) {
- Story, ok := getStory(id)
+ Story, ok := app.getStory(id)
defer q.Done()
if ok {
log.Infof("%+v\n", Story)
@@ -155,14 +159,17 @@ func (app *App) topStories() {
log.Fatal(err)
}
- log.Debug("topStories: crawling for Categories")
- categories, ok := app.crawlForCategories(Story.Url)
- if ok {
- article_id := app.getArticleIdFromUrl(Story.Url)
- app.saveCategory(article_id, categories)
- } else {
- log.Warn("topStories: crawling for Categories: not ok")
- }
+ /*
+ log.Debug("topStories: crawling for Categories")
+ categories, ok := app.crawlForCategories(Story.Url)
+ if ok {
+ article_id := app.getArticleIdFromUrl(Story.Url)
+ app.saveCategory(article_id, categories)
+ } else {
+ log.Warn("topStories: crawling for Categories: not ok")
+ time.Sleep(time.Duration(app.Config.Delay) * time.Second)
+ }
+ */
}
}(id)
@@ -170,8 +177,8 @@ func (app *App) topStories() {
q.Wait()
}
-func getStory(id int) (Story, bool) {
- Story := getDetail(id)
+func (app *App) getStory(id int) (Story, bool) {
+ Story := app.getDetail(id)
if Story.Dead || Story.Deleted {
return Story, false
}
@@ -205,17 +212,27 @@ func getStory(id int) (Story, bool) {
}
if is_wiki {
Story.Url = wikipediaNormalizeUrl(Story.Url)
- Story.Url = wikipediaRealUrl(Story.Url)
+ Story.Url = app.wikipediaRealUrl(Story.Url)
return Story, true
}
return Story, false
}
-func getResponse(url string) *http.Response {
+func (app *App) getResponse(url string) *http.Response {
var err error
var response *http.Response
- response, err = http.Get(url)
+ req, err := http.NewRequest("GET", url, nil)
+ if err != nil {
+ // TODO: handle error — request creation failed; falling through leaves req nil
+ }
+
+ req.Header.Set("User-Agent", app.Config.UserAgent) // set the configured User-Agent on every request
+
+ client := &http.Client{}
+
+ response, err = client.Do(req)
+ //response, err = http.Get(url)
if err != nil {
for i := 0; i < 4; i++ {
if i == 0 {
@@ -233,33 +250,33 @@ func getResponse(url string) *http.Response {
return response
}
-func getBestResponse() *http.Response {
+func (app *App) getBestResponse() *http.Response {
_url := "https://hacker-news.firebaseio.com/v0/beststories.json"
- return getResponse(_url)
+ return app.getResponse(_url)
}
-func getTopResponse() *http.Response {
+func (app *App) getTopResponse() *http.Response {
_url := "https://hacker-news.firebaseio.com/v0/topstories.json"
- return getResponse(_url)
+ return app.getResponse(_url)
}
-func getWikipediaResponse(title string) *http.Response {
+func (app *App) getWikipediaResponse(title string) *http.Response {
_url := "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=" + title
- return getResponse(_url)
+ return app.getResponse(_url)
}
-func getWikipediaRedirectResponse(hostname, title string) *http.Response {
+func (app *App) getWikipediaRedirectResponse(hostname, title string) *http.Response {
_url := "https://" + hostname + "/w/api.php?action=query&prop=info&format=json&redirects=1&inprop=url&titles=" + title
- return getResponse(_url)
+ return app.getResponse(_url)
}
-func getStoryResponse(item_id string) *http.Response {
+func (app *App) getStoryResponse(item_id string) *http.Response {
_url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json"
- return getResponse(_url)
+ return app.getResponse(_url)
}
-func getDetail(id int) Story {
- response := getStoryResponse(strconv.Itoa(id))
+func (app *App) getDetail(id int) Story {
+ response := app.getStoryResponse(strconv.Itoa(id))
data, err := ioutil.ReadAll(response.Body)
if err != nil {
panic(err)
@@ -273,8 +290,8 @@ func getDetail(id int) Story {
return story
}
-func getTopStories() []byte {
- response := getTopResponse()
+func (app *App) getTopStories() []byte {
+ response := app.getTopResponse()
data, err := ioutil.ReadAll(response.Body)
if err != nil {
panic(err)
@@ -283,8 +300,8 @@ func getTopStories() []byte {
return data
}
-func getBestStories() []byte {
- response := getBestResponse()
+func (app *App) getBestStories() []byte {
+ response := app.getBestResponse()
data, err := ioutil.ReadAll(response.Body)
if err != nil {
@@ -307,7 +324,7 @@ func (app *App) updateAllDiscussions() {
q.Add()
go func(item_id int) {
defer q.Done()
- Story, ok := getStory(item_id)
+ Story, ok := app.getStory(item_id)
if !ok {
/**
* Check if we got a network error or a dead story.
diff --git a/wikipedia.go b/wikipedia.go
index 3df392d..fbc2b81 100644
--- a/wikipedia.go
+++ b/wikipedia.go
@@ -4,6 +4,7 @@ import (
"encoding/json"
"regexp"
"strings"
+
//"strconv"
"io/ioutil"
"net/url"
@@ -13,7 +14,9 @@ import (
)
func (app *App) crawlWikipedia(url string) {
- c := colly.NewCollector()
+ c := colly.NewCollector(
+ colly.UserAgent(app.Config.UserAgent),
+ )
c.OnHTML("#mw-normal-catlinks", func(e *colly.HTMLElement) {
e.ForEach("ul > li > a", func(i int, e *colly.HTMLElement) {
@@ -141,10 +144,10 @@ func (app *App) _changeTitle(id_to_delete int, correct_url string) {
log.Printf("new_title: %s, old_title: %s, cur_title: %s \n", new_title, old_title, cur_title)
}
-func getWikipediaExcerpt(title string) string {
+func (app *App) getWikipediaExcerpt(title string) string {
var err error
- response := getWikipediaResponse(title)
+ response := app.getWikipediaResponse(title)
resp_data, err := ioutil.ReadAll(response.Body)
if err != nil {
panic(err)
@@ -201,7 +204,7 @@ func (app *App) saveExcerpts() error {
}
title, _ := getWikipediaTitle(url)
- excerpt := getWikipediaExcerpt(title)
+ excerpt := app.getWikipediaExcerpt(title)
query = "UPDATE article SET excerpt_html = ? WHERE id = ?"
stmt, err := app.DB.Prepare(query)
@@ -221,7 +224,7 @@ func (app *App) saveExcerpts() error {
return nil
}
-func wikipediaRealUrl(wiki_url string) string {
+func (app *App) wikipediaRealUrl(wiki_url string) string {
/**
* We don't change urls with parameters, because we would loose the context.
*/
@@ -247,7 +250,7 @@ func wikipediaRealUrl(wiki_url string) string {
return wiki_url
}
- response := getWikipediaRedirectResponse(hostname, title)
+ response := app.getWikipediaRedirectResponse(hostname, title)
resp_data, err := ioutil.ReadAll(response.Body)
if err != nil {
panic(err)