package main
import (
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"net/url"
"strings"
"time"
)
// Tunables for the Wikidata SPARQL lookups performed in this file.
const (
// wikidataSparql is the SPARQL endpoint that batch queries are sent to.
wikidataSparql = "https://query.wikidata.org/sparql"
// wikiBatchSize is the maximum number of IMDb IDs placed in one SPARQL query.
wikiBatchSize = 30
// wikiDelay is slept before every SPARQL request and again between batches.
// NOTE(review): the original note here read "~15 req/min, safe under
// 20 req/min limit"; a 1s delay alone does not guarantee that rate —
// confirm against the service's current limits.
wikiDelay = 1 * time.Second
// wikiMaxRetries is the total number of attempts per request (the first try
// included), not the number of additional retries.
wikiMaxRetries = 3
// wikiRetryBackoff is the wait before the second attempt; it doubles for
// each further attempt.
wikiRetryBackoff = 15 * time.Second
)
// wikiClient is the shared HTTP client used by doGET; every request is
// bounded by a 120-second overall timeout.
var wikiClient = &http.Client{Timeout: 120 * time.Second}
// wikiAcc accumulates per-ID results from a SPARQL batch.
type wikiAcc struct {
// wikiArticle is the article value exactly as returned in the "article"
// binding (a full URL).
wikiArticle string
// title is wikiArticle with the "https://en.wikipedia.org/wiki/" prefix
// removed; this is the value fetchWikiArticles writes to the database.
title string
}
// sparqlResponse models the part of a SPARQL JSON response that this file
// reads: the "results.bindings" array. Each element is one result row, keyed
// by query variable name (for example "imdbVal" or "article").
type sparqlResponse struct {
Results struct {
BindingList []map[string]jsonNode `json:"bindings"`
} `json:"results"`
}
// jsonNode is a single bound value inside a SPARQL JSON result row, decoded
// from the "type", "value" and "xml:lang" keys. The code visible in this file
// only reads Value.
type jsonNode struct {
Type string `json:"type"`
Value string `json:"value"`
Lang string `json:"xml:lang,omitempty"`
}
// getMissingWikiArticles lists the imdb_id of every row in the imdb table
// whose wiki_article column is NULL and whose imdb_id starts with "tt".
//
// It returns a nil slice when no rows match. Query, scan and row-iteration
// failures are returned wrapped with a short description of the failing step.
func (a *App) getMissingWikiArticles() ([]string, error) {
	const query = `SELECT imdb_id FROM imdb WHERE wiki_article IS NULL AND imdb_id LIKE 'tt%'`

	rows, err := a.DB.Query(query)
	if err != nil {
		return nil, fmt.Errorf("query missing wiki articles: %w", err)
	}
	defer rows.Close()

	var missing []string
	for rows.Next() {
		var imdbID string
		if scanErr := rows.Scan(&imdbID); scanErr != nil {
			return nil, fmt.Errorf("scan imdb_id: %w", scanErr)
		}
		missing = append(missing, imdbID)
	}

	// rows.Next returning false can also mean the iteration failed.
	if iterErr := rows.Err(); iterErr != nil {
		return nil, fmt.Errorf("rows iteration: %w", iterErr)
	}
	return missing, nil
}
// fetchWikiArticles fills in wiki_article for every imdb row that lacks one.
//
// The missing IDs are looked up on Wikidata in batches of wikiBatchSize. All
// resulting UPDATEs run inside one transaction that is committed after the
// last batch. A batch whose lookup fails is logged and skipped; a failing
// UPDATE rolls the transaction back and aborts with an error. IDs whose
// lookup yields an empty title are left untouched.
func (a *App) fetchWikiArticles() error {
	ids, err := a.getMissingWikiArticles()
	if err != nil {
		return err
	}

	total := len(ids)
	if total == 0 {
		log.Println("fetchWikiArticles: all entries have wiki_article, skipping")
		return nil
	}
	log.Printf("fetchWikiArticles: %d entries missing wiki_article", total)

	tx, err := a.DB.Begin()
	if err != nil {
		return fmt.Errorf("begin tx: %w", err)
	}

	stmt, err := tx.Prepare(`UPDATE imdb SET wiki_article = ? WHERE imdb_id = ?`)
	if err != nil {
		tx.Rollback()
		return fmt.Errorf("prepare wiki update: %w", err)
	}
	defer stmt.Close()

	var updated int
	for start := 0; start < total; start += wikiBatchSize {
		// Clamp the final batch to the end of the slice.
		end := start + wikiBatchSize
		if end > total {
			end = total
		}

		results, err := a.queryWikidataBatch(ids[start:end])
		if err != nil {
			// Best effort: a failed batch is reported and skipped.
			log.Printf("wikidata batch error at offset %d: %v", start, err)
			continue
		}

		for id, acc := range results {
			if acc.title != "" {
				if _, err := stmt.Exec(acc.title, id); err != nil {
					tx.Rollback()
					return fmt.Errorf("update wiki_article for %s: %w", id, err)
				}
				updated++
			}
		}

		log.Printf("fetchWikiArticles: [%d/%d]", end, total)

		// Pause between batches, but not after the last one.
		if end < total {
			time.Sleep(wikiDelay)
		}
	}

	if err := tx.Commit(); err != nil {
		return fmt.Errorf("commit wiki articles: %w", err)
	}
	log.Printf("fetchWikiArticles: %d wiki articles updated", updated)
	return nil
}
// queryWikidataBatch runs one SPARQL query for ids and maps each IMDb ID that
// came back with a non-empty "article" binding to a wikiAcc.
//
// The call sleeps for wikiDelay before sending the request. Result rows whose
// "imdbVal" does not start with "tt", or that carry no article, are ignored.
// Request and JSON decoding failures are returned wrapped.
func (a *App) queryWikidataBatch(ids []string) (map[string]wikiAcc, error) {
	const articlePrefix = "https://en.wikipedia.org/wiki/"

	query := buildSparql(ids)
	time.Sleep(wikiDelay)

	params := url.Values{}
	params.Set("query", query)
	params.Set("format", "json")
	endpoint := wikidataSparql + "?" + params.Encode()

	raw, err := doGETWithRetry(endpoint, a.Config.UserAgent)
	if err != nil {
		return nil, fmt.Errorf("SPARQL request: %w", err)
	}

	var parsed sparqlResponse
	if err := json.Unmarshal(raw, &parsed); err != nil {
		return nil, fmt.Errorf("SPARQL JSON parse: %w", err)
	}

	out := make(map[string]wikiAcc)
	for _, binding := range parsed.Results.BindingList {
		imdbID := binding["imdbVal"].Value
		if !strings.HasPrefix(imdbID, "tt") {
			continue
		}

		article, found := binding["article"]
		if !found || article.Value == "" {
			continue
		}

		// The stored title is the article URL without its fixed prefix.
		out[imdbID] = wikiAcc{
			wikiArticle: article.Value,
			title:       strings.TrimPrefix(article.Value, articlePrefix),
		}
	}
	return out, nil
}
// buildSparql creates a SPARQL query that resolves IMDb IDs to English
// Wikipedia article URLs.
//
// Each ID is emitted as a string literal in a VALUES clause and matched
// against the wdt:P345 property. The article is looked up in an OPTIONAL
// block restricted to pages that are part of https://en.wikipedia.org/, so an
// item without such an article still produces a row, just without an
// ?article binding. The result has two variables, ?imdbVal and ?article, and
// is ordered by ?imdbVal.
//
// Backslashes, double quotes and line breaks inside an ID are escaped so that
// an unexpected value cannot terminate the string literal and alter the query.
func buildSparql(ids []string) string {
	// Escapes permitted inside a double-quoted SPARQL string literal.
	escape := strings.NewReplacer(
		`\`, `\\`,
		`"`, `\"`,
		"\n", `\n`,
		"\r", `\r`,
	)

	var vals strings.Builder
	for _, id := range ids {
		fmt.Fprintf(&vals, "    \"%s\"\n", escape.Replace(id))
	}

	// The PREFIX declarations and the isPartOf object need explicit IRIs;
	// without them the query is not valid SPARQL.
	return fmt.Sprintf(`PREFIX schema: <http://schema.org/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?imdbVal ?article
WHERE {
  VALUES ?imdbVal {
%s  }
  ?item wdt:P345 ?imdbVal .
  OPTIONAL {
    ?article schema:about ?item ;
             schema:isPartOf <https://en.wikipedia.org/> .
  }
}
ORDER BY ?imdbVal`, vals.String())
}
// doGET issues a single GET request for uri through wikiClient, sending
// userAgent as the User-Agent header, and returns the full response body.
//
// Any status other than 200 OK is reported as an error of the form
// "HTTP <code>: <body>", where at most the first 2048 bytes of the body are
// included.
func doGET(uri, userAgent string) ([]byte, error) {
	req, err := http.NewRequest(http.MethodGet, uri, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", userAgent)

	resp, err := wikiClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		return io.ReadAll(resp.Body)
	}

	// Keep only a bounded prefix of the error body for the message; a read
	// failure here is ignored because the status code is the real signal.
	snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
	return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, snippet)
}
// doGETWithRetry calls doGET up to wikiMaxRetries times and returns the first
// successful body.
//
// Only HTTP 5xx responses are retried; any other failure (including other
// status codes and transport errors) is returned immediately. Before each
// retry it sleeps wikiRetryBackoff, doubled for every further attempt. When
// all attempts fail, the error of the last attempt is returned.
func doGETWithRetry(uri, userAgent string) ([]byte, error) {
	var lastErr error
	for attempt := 0; attempt < wikiMaxRetries; attempt++ {
		if attempt > 0 {
			backoff := wikiRetryBackoff * time.Duration(1<<(attempt-1))
			log.Printf(" retry %d/%d after %v", attempt+1, wikiMaxRetries, backoff)
			time.Sleep(backoff)
		}

		raw, err := doGET(uri, userAgent)
		if err == nil {
			return raw, nil
		}
		lastErr = err

		// doGET formats status failures as "HTTP <code>: <body>". Match the
		// status at the start of the message only: the message also embeds
		// part of the response body, so a substring search could mistake a
		// 4xx body that mentions "HTTP 5..." for a server error.
		if !strings.HasPrefix(err.Error(), "HTTP 5") {
			break
		}
	}
	return nil, lastErr
}
// min reports the lesser of the two integers a and b; when they are equal
// that common value is returned.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}