diff options
| author | dev | 2026-06-25 20:07:08 +0200 |
|---|---|---|
| committer | dev | 2026-06-25 20:07:08 +0200 |
| commit | fa742660190a7d3b7b6f068565ce543d413edbab (patch) | |
| tree | 3515262a4bd47aac3e998ecf134451257b0181a0 /src/wikidata.go | |
| parent | d41b60d08fdd5a6589cdb4e33ac1931fa16aef4c (diff) | |
| download | hnimdbbot-fa742660190a7d3b7b6f068565ce543d413edbab.tar.gz | |
feat: fetch Wikipedia article titles via Wikidata SPARQL
- Query Wikidata SPARQL in batches of 30 for entries missing wiki_article
- Store wiki_article title in imdb table
- Respect rate limits with configurable delay and retry on 5xx/429
- Skip entries that have no Wikipedia article
- Removed unique constraint on wiki_article (multiple entries can share one)
Diffstat (limited to 'src/wikidata.go')
| -rw-r--r-- | src/wikidata.go | 239 |
1 files changed, 239 insertions, 0 deletions
diff --git a/src/wikidata.go b/src/wikidata.go new file mode 100644 index 0000000..b3018e5 --- /dev/null +++ b/src/wikidata.go @@ -0,0 +1,239 @@ +package main + +import ( + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "net/url" + "strings" + "time" +) + +const ( + wikidataSparql = "https://query.wikidata.org/sparql" + wikiBatchSize = 30 + wikiDelay = 1 * time.Second // ~15 req/min, safe under 20 req/min limit + wikiMaxRetries = 3 + wikiRetryBackoff = 15 * time.Second +) + +var wikiClient = &http.Client{Timeout: 120 * time.Second} + +// wikiAcc accumulates per-ID results from a SPARQL batch. +type wikiAcc struct { + wikiArticle string + title string +} + +// type alias for SPARQL JSON response +type sparqlResponse struct { + Results struct { + BindingList []map[string]jsonNode `json:"bindings"` + } `json:"results"` +} + +type jsonNode struct { + Type string `json:"type"` + Value string `json:"value"` + Lang string `json:"xml:lang,omitempty"` +} + +// getMissingWikiArticles returns imdb_ids where wiki_article IS NULL. +func (a *App) getMissingWikiArticles() ([]string, error) { + rows, err := a.DB.Query(`SELECT imdb_id FROM imdb WHERE wiki_article IS NULL AND imdb_id LIKE 'tt%'`) + if err != nil { + return nil, fmt.Errorf("query missing wiki articles: %w", err) + } + defer rows.Close() + + var ids []string + for rows.Next() { + var id string + if err := rows.Scan(&id); err != nil { + return nil, fmt.Errorf("scan imdb_id: %w", err) + } + ids = append(ids, id) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("rows iteration: %w", err) + } + return ids, nil +} + +// fetchWikiArticles queries Wikidata SPARQL in batches and updates wiki_article in the DB. +func (a *App) fetchWikiArticles() error { + ids, err := a.getMissingWikiArticles() + if err != nil { + return err + } + if len(ids) == 0 { + log.Println("fetchWikiArticles: all entries have wiki_article, skipping") + return nil + } + log.Printf("fetchWikiArticles: %d entries missing wiki_article", len(ids)) + + tx, err := a.DB.Begin() + if err != nil { + return fmt.Errorf("begin tx: %w", err) + } + + stmt, err := tx.Prepare(`UPDATE imdb SET wiki_article = ? WHERE imdb_id = ?`) + if err != nil { + tx.Rollback() + return fmt.Errorf("prepare wiki update: %w", err) + } + defer stmt.Close() + + updated := 0 + for i := 0; i < len(ids); i += wikiBatchSize { + chunk := ids[i:min(i+wikiBatchSize, len(ids))] + + results, err := a.queryWikidataBatch(chunk) + if err != nil { + log.Printf("wikidata batch error at offset %d: %v", i, err) + // skip batch, continue + continue + } + + for id, acc := range results { + if acc.title == "" { + continue + } + if _, err := stmt.Exec(acc.title, id); err != nil { + tx.Rollback() + return fmt.Errorf("update wiki_article for %s: %w", id, err) + } + updated++ + } + + done := i + len(chunk) + log.Printf("fetchWikiArticles: [%d/%d]", done, len(ids)) + + // rate limit between batches + if i+wikiBatchSize < len(ids) { + time.Sleep(wikiDelay) + } + } + + if err := tx.Commit(); err != nil { + return fmt.Errorf("commit wiki articles: %w", err) + } + + log.Printf("fetchWikiArticles: %d wiki articles updated", updated) + return nil +} + +// queryWikidataBatch sends a SPARQL query for the given IDs and returns a map of id -> wikiAcc. +func (a *App) queryWikidataBatch(ids []string) (map[string]wikiAcc, error) { + sparql := buildSparql(ids) + time.Sleep(wikiDelay) + + endpoint := wikidataSparql + "?" + url.Values{ + "query": {sparql}, + "format": {"json"}, + }.Encode() + + raw, err := doGETWithRetry(endpoint, a.Config.UserAgent) + if err != nil { + return nil, fmt.Errorf("SPARQL request: %w", err) + } + + var data sparqlResponse + if err := json.Unmarshal(raw, &data); err != nil { + return nil, fmt.Errorf("SPARQL JSON parse: %w", err) + } + + results := make(map[string]wikiAcc) + for _, b := range data.Results.BindingList { + imdb := b["imdbVal"].Value + if !strings.HasPrefix(imdb, "tt") { + continue + } + + if n, ok := b["article"]; ok && n.Value != "" { + // Extract title from URL: strip "https://en.wikipedia.org/wiki/" + title := strings.TrimPrefix(n.Value, "https://en.wikipedia.org/wiki/") + results[imdb] = wikiAcc{ + wikiArticle: n.Value, + title: title, + } + } + } + + return results, nil +} + +// buildSparql creates a SPARQL query that resolves IMDb IDs to English Wikipedia article URLs. +func buildSparql(ids []string) string { + var vals strings.Builder + for _, id := range ids { + vals.WriteString(fmt.Sprintf(" \"%s\"\n", id)) + } + + return fmt.Sprintf(`PREFIX schema: <http://schema.org/> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?imdbVal ?article +WHERE { + VALUES ?imdbVal { +%s + } + ?item wdt:P345 ?imdbVal . + OPTIONAL { + ?article schema:about ?item ; + schema:isPartOf <https://en.wikipedia.org/> . + } +} +ORDER BY ?imdbVal`, vals.String()) +} + +// doGET performs a GET request with the configured User-Agent. +func doGET(uri, userAgent string) ([]byte, error) { + req, err := http.NewRequest("GET", uri, nil) + if err != nil { + return nil, err + } + req.Header.Set("User-Agent", userAgent) + resp, err := wikiClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2048)) + return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body) + } + return io.ReadAll(resp.Body) +} + +// doGETWithRetry retries on 5xx errors. +func doGETWithRetry(uri, userAgent string) ([]byte, error) { + var lastErr error + for attempt := 0; attempt < wikiMaxRetries; attempt++ { + if attempt > 0 { + backoff := wikiRetryBackoff * time.Duration(1<<(attempt-1)) + log.Printf(" retry %d/%d after %v", attempt+1, wikiMaxRetries, backoff) + time.Sleep(backoff) + } + raw, err := doGET(uri, userAgent) + if err == nil { + return raw, nil + } + lastErr = err + if !strings.Contains(err.Error(), "HTTP 5") { + break + } + } + return nil, lastErr +} + +// min returns the smaller of a and b. +func min(a, b int) int { + if a < b { + return a + } + return b +} + |
