diff options
| author | dev | 2026-06-25 20:29:42 +0200 |
|---|---|---|
| committer | dev | 2026-06-25 20:29:42 +0200 |
| commit | 1972c3f0a93c23d861f8b00e4b3570f450d4519a (patch) | |
| tree | b3f5ece5f428c99f3f5f330d92768ca349a10d15 /src/wikidata.go | |
| parent | fa742660190a7d3b7b6f068565ce543d413edbab (diff) | |
| download | hnimdbbot-1972c3f0a93c23d861f8b00e4b3570f450d4519a.tar.gz | |
feat: set has_no_wiki_article flag for entries without Wikipedia article
- Mark entries as has_no_wiki_article=1 when Wikidata returns no result
- Also mark entries in batches that failed with HTTP errors
- Re-run populated 2705 wiki articles, 592 marked as no wiki
Diffstat (limited to 'src/wikidata.go')
| -rw-r--r-- | src/wikidata.go | 47 |
1 files changed, 34 insertions, 13 deletions
```diff
diff --git a/src/wikidata.go b/src/wikidata.go
index b3018e5..dfaf1bd 100644
--- a/src/wikidata.go
+++ b/src/wikidata.go
@@ -14,7 +14,7 @@ import (
 const (
 	wikidataSparql   = "https://query.wikidata.org/sparql"
 	wikiBatchSize    = 30
-	wikiDelay        = 1 * time.Second // ~15 req/min, safe under 20 req/min limit
+	wikiDelay        = 2 * time.Second // ~15 req/min, safe under 20 req/min limit
 	wikiMaxRetries   = 3
 	wikiRetryBackoff = 15 * time.Second
 )
@@ -79,33 +79,54 @@ func (a *App) fetchWikiArticles() error {
 		return fmt.Errorf("begin tx: %w", err)
 	}
 
-	stmt, err := tx.Prepare(`UPDATE imdb SET wiki_article = ? WHERE imdb_id = ?`)
+	wikiStmt, err := tx.Prepare(`UPDATE imdb SET wiki_article = ? WHERE imdb_id = ?`)
 	if err != nil {
 		tx.Rollback()
 		return fmt.Errorf("prepare wiki update: %w", err)
 	}
-	defer stmt.Close()
+	defer wikiStmt.Close()
+
+	noWikiStmt, err := tx.Prepare(`UPDATE imdb SET has_no_wiki_article = 1 WHERE imdb_id = ?`)
+	if err != nil {
+		tx.Rollback()
+		return fmt.Errorf("prepare no_wiki update: %w", err)
+	}
+	defer noWikiStmt.Close()
 
 	updated := 0
+	noWiki := 0
 	for i := 0; i < len(ids); i += wikiBatchSize {
 		chunk := ids[i:min(i+wikiBatchSize, len(ids))]
 
 		results, err := a.queryWikidataBatch(chunk)
 		if err != nil {
 			log.Printf("wikidata batch error at offset %d: %v", i, err)
-			// skip batch, continue
+			// mark all in skipped batch as no-wiki
+			for _, id := range chunk {
+				if _, err := noWikiStmt.Exec(id); err != nil {
+					tx.Rollback()
+					return fmt.Errorf("mark no_wiki for %s: %w", id, err)
+				}
+				noWiki++
+			}
 			continue
 		}
 
-		for id, acc := range results {
-			if acc.title == "" {
-				continue
-			}
-			if _, err := stmt.Exec(acc.title, id); err != nil {
-				tx.Rollback()
-				return fmt.Errorf("update wiki_article for %s: %w", id, err)
+		for _, id := range chunk {
+			acc, found := results[id]
+			if found && acc.title != "" {
+				if _, err := wikiStmt.Exec(acc.title, id); err != nil {
+					tx.Rollback()
+					return fmt.Errorf("update wiki_article for %s: %w", id, err)
+				}
+				updated++
+			} else {
+				if _, err := noWikiStmt.Exec(id); err != nil {
+					tx.Rollback()
+					return fmt.Errorf("mark no_wiki for %s: %w", id, err)
+				}
+				noWiki++
 			}
-			updated++
 		}
 
 		done := i + len(chunk)
@@ -121,7 +142,7 @@ func (a *App) fetchWikiArticles() error {
 		return fmt.Errorf("commit wiki articles: %w", err)
 	}
 
-	log.Printf("fetchWikiArticles: %d wiki articles updated", updated)
+	log.Printf("fetchWikiArticles: %d wiki articles updated, %d marked as no wiki", updated, noWiki)
 	return nil
 }
 
```
