diff options
| author | admin | 2026-03-29 16:50:38 +0200 |
|---|---|---|
| committer | admin | 2026-03-29 16:50:38 +0200 |
| commit | f3300bec030793d40115a08f46a7cbf49f06c2fd (patch) | |
| tree | 16e19878b474aeed873a56f1ac37a1819dc360d1 | |
| parent | 1b28f44a9f1c90e49ddf0149becaa004addc50d3 (diff) | |
| download | curious-crawler-f3300bec030793d40115a08f46a7cbf49f06c2fd.tar.gz | |
fix missing user agent
| -rw-r--r-- | categories.go | 9 | ||||
| -rw-r--r-- | database.go | 2 | ||||
| -rw-r--r-- | main.go | 89 | ||||
| -rw-r--r-- | wikipedia.go | 15 |
4 files changed, 70 insertions, 45 deletions
diff --git a/categories.go b/categories.go index a92b0df..a1bd394 100644 --- a/categories.go +++ b/categories.go @@ -31,7 +31,7 @@ func (app *App) queryWMLabs(wiki_url string) ([]string, bool) { } log.Debugf("queryWMLabs: wm_url: %s", wm_url) - response := getResponse(wm_url) + response := app.getResponse(wm_url) resp_data, err := ioutil.ReadAll(response.Body) if err != nil { log.Warnf("queryWMLabs: Reading response data failed for %s", wm_url) @@ -95,7 +95,9 @@ func (app *App) crawlWMLabs(wiki_url string) (Category, bool) { } var category Category - c := colly.NewCollector() + c := colly.NewCollector( + colly.UserAgent(app.Config.UserAgent), + ) c.OnHTML(".sort-entry--wikiproject", func(e *colly.HTMLElement) { category.Name = strings.TrimSpace(e.Text) @@ -154,6 +156,9 @@ func normalizeCategory(s string) string { cat = strings.TrimSuffix(cat, "sub-project") cat = strings.TrimSuffix(cat, "Project") cat = strings.TrimSuffix(cat, "project") + if strings.ToLower(cat) == "project-independent assessment" { + return "" + } if strings.Contains(strings.ToLower(cat), "articles") { return "" diff --git a/database.go b/database.go index b029ca7..182318e 100644 --- a/database.go +++ b/database.go @@ -375,7 +375,7 @@ func (app *App) updateWikipediaUrls() { log.Fatal(err) } - real_url := wikipediaRealUrl(wiki_url) + real_url := app.wikipediaRealUrl(wiki_url) if real_url != wiki_url && "" != real_url { /** @@ -53,6 +53,7 @@ func main() { //app.saveAllCategories() app.updateAllDiscussions() //app.walkDown() + //app.saveExcerpts() /** * Resolve redirects on stored urls. @@ -76,17 +77,20 @@ func (app *App) walkDown() { //max_item := 15494000 //max_item := 15038031 //max_item := 14450000 + //max_item := 47528683 + //max_item := 46750000 - const maxRoutines = 20 + const maxRoutines = 10 q := queue.New(maxRoutines) defer q.Close() - for i := max_item; i > 22600000; i-- { + //for i := max_item; i > 22600000; i-- { + for i := max_item; i > 44921609; i-- { q.Add() go func(i int) { defer q.Done() - Story, ok := getStory(i) + Story, ok := app.getStory(i) if ok { log.Infof("%+v\n", Story) err = app.saveStory(Story) @@ -126,8 +130,8 @@ func getMaxItem() int { func (app *App) topStories() { var err error - data1 := strings.TrimSuffix(string(getTopStories()), "]") - data2 := strings.TrimPrefix(string(getBestStories()), "[") + data1 := strings.TrimSuffix(string(app.getTopStories()), "]") + data2 := strings.TrimPrefix(string(app.getBestStories()), "[") data1 = data1 + "," data := data1 + data2 @@ -146,7 +150,7 @@ func (app *App) topStories() { for _, id := range story_ids { q.Add() go func(id int) { - Story, ok := getStory(id) + Story, ok := app.getStory(id) defer q.Done() if ok { log.Infof("%+v\n", Story) @@ -155,14 +159,17 @@ func (app *App) topStories() { log.Fatal(err) } - log.Debug("topStories: crawling for Categories") - categories, ok := app.crawlForCategories(Story.Url) - if ok { - article_id := app.getArticleIdFromUrl(Story.Url) - app.saveCategory(article_id, categories) - } else { - log.Warn("topStories: crawling for Categories: not ok") - } + /* + log.Debug("topStories: crawling for Categories") + categories, ok := app.crawlForCategories(Story.Url) + if ok { + article_id := app.getArticleIdFromUrl(Story.Url) + app.saveCategory(article_id, categories) + } else { + log.Warn("topStories: crawling for Categories: not ok") + time.Sleep(time.Duration(app.Config.Delay) * time.Second) + } + */ } }(id) @@ -170,8 +177,8 @@ func (app *App) topStories() { q.Wait() } -func getStory(id int) (Story, bool) { - Story := getDetail(id) +func (app *App) getStory(id int) (Story, bool) { + Story := app.getDetail(id) if Story.Dead || Story.Deleted { return Story, false } @@ -205,17 +212,27 @@ func getStory(id int) (Story, bool) { } if is_wiki { Story.Url = wikipediaNormalizeUrl(Story.Url) - Story.Url = wikipediaRealUrl(Story.Url) + Story.Url = app.wikipediaRealUrl(Story.Url) return Story, true } return Story, false } -func getResponse(url string) *http.Response { +func (app *App) getResponse(url string) *http.Response { var err error var response *http.Response - response, err = http.Get(url) + req, err := http.NewRequest("GET", url, nil) + if err != nil { + // Fehlerbehandlung + } + + req.Header.Set("User-Agent", app.Config.UserAgent) // Hier den User-Agent setzen + + client := &http.Client{} + + response, err = client.Do(req) + //response, err = http.Get(url) if err != nil { for i := 0; i < 4; i++ { if i == 0 { @@ -233,33 +250,33 @@ func getResponse(url string) *http.Response { return response } -func getBestResponse() *http.Response { +func (app *App) getBestResponse() *http.Response { _url := "https://hacker-news.firebaseio.com/v0/beststories.json" - return getResponse(_url) + return app.getResponse(_url) } -func getTopResponse() *http.Response { +func (app *App) getTopResponse() *http.Response { _url := "https://hacker-news.firebaseio.com/v0/topstories.json" - return getResponse(_url) + return app.getResponse(_url) } -func getWikipediaResponse(title string) *http.Response { +func (app *App) getWikipediaResponse(title string) *http.Response { _url := "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=" + title - return getResponse(_url) + return app.getResponse(_url) } -func getWikipediaRedirectResponse(hostname, title string) *http.Response { +func (app *App) getWikipediaRedirectResponse(hostname, title string) *http.Response { _url := "https://" + hostname + "/w/api.php?action=query&prop=info&format=json&redirects=1&inprop=url&titles=" + title - return getResponse(_url) + return app.getResponse(_url) } -func getStoryResponse(item_id string) *http.Response { +func (app *App) getStoryResponse(item_id string) *http.Response { _url := "https://hacker-news.firebaseio.com/v0/item/" + item_id + ".json" - return getResponse(_url) + return app.getResponse(_url) } -func getDetail(id int) Story { - response := getStoryResponse(strconv.Itoa(id)) +func (app *App) getDetail(id int) Story { + response := app.getStoryResponse(strconv.Itoa(id)) data, err := ioutil.ReadAll(response.Body) if err != nil { panic(err) @@ -273,8 +290,8 @@ func getDetail(id int) Story { return story } -func getTopStories() []byte { - response := getTopResponse() +func (app *App) getTopStories() []byte { + response := app.getTopResponse() data, err := ioutil.ReadAll(response.Body) if err != nil { panic(err) @@ -283,8 +300,8 @@ func getTopStories() []byte { return data } -func getBestStories() []byte { - response := getBestResponse() +func (app *App) getBestStories() []byte { + response := app.getBestResponse() data, err := ioutil.ReadAll(response.Body) if err != nil { @@ -307,7 +324,7 @@ func (app *App) updateAllDiscussions() { q.Add() go func(item_id int) { defer q.Done() - Story, ok := getStory(item_id) + Story, ok := app.getStory(item_id) if !ok { /** * Check if we got a network error or a dead story. diff --git a/wikipedia.go b/wikipedia.go index 3df392d..fbc2b81 100644 --- a/wikipedia.go +++ b/wikipedia.go @@ -4,6 +4,7 @@ import ( "encoding/json" "regexp" "strings" + //"strconv" "io/ioutil" "net/url" @@ -13,7 +14,9 @@ import ( ) func (app *App) crawlWikipedia(url string) { - c := colly.NewCollector() + c := colly.NewCollector( + colly.UserAgent(app.Config.UserAgent), + ) c.OnHTML("#mw-normal-catlinks", func(e *colly.HTMLElement) { e.ForEach("ul > li > a", func(i int, e *colly.HTMLElement) { @@ -141,10 +144,10 @@ func (app *App) _changeTitle(id_to_delete int, correct_url string) { log.Printf("new_title: %s, old_title: %s, cur_title: %s \n", new_title, old_title, cur_title) } -func getWikipediaExcerpt(title string) string { +func (app *App) getWikipediaExcerpt(title string) string { var err error - response := getWikipediaResponse(title) + response := app.getWikipediaResponse(title) resp_data, err := ioutil.ReadAll(response.Body) if err != nil { panic(err) @@ -201,7 +204,7 @@ func (app *App) saveExcerpts() error { } title, _ := getWikipediaTitle(url) - excerpt := getWikipediaExcerpt(title) + excerpt := app.getWikipediaExcerpt(title) query = "UPDATE article SET excerpt_html = ? WHERE id = ?" stmt, err := app.DB.Prepare(query) @@ -221,7 +224,7 @@ func (app *App) saveExcerpts() error { return nil } -func wikipediaRealUrl(wiki_url string) string { +func (app *App) wikipediaRealUrl(wiki_url string) string { /** * We don't change urls with parameters, because we would loose the context. */ @@ -247,7 +250,7 @@ func wikipediaRealUrl(wiki_url string) string { return wiki_url } - response := getWikipediaRedirectResponse(hostname, title) + response := app.getWikipediaRedirectResponse(hostname, title) resp_data, err := ioutil.ReadAll(response.Body) if err != nil { panic(err) |
