package main import ( "encoding/json" "fmt" "io" "log" "net/http" "net/url" "strings" "time" ) const ( wikidataSparql = "https://query.wikidata.org/sparql" wikiBatchSize = 30 wikiDelay = 2 * time.Second // ~15 req/min, safe under 20 req/min limit wikiMaxRetries = 3 wikiRetryBackoff = 15 * time.Second ) var wikiClient = &http.Client{Timeout: 120 * time.Second} // wikiAcc accumulates per-ID results from a SPARQL batch. type wikiAcc struct { wikiArticle string title string } // type alias for SPARQL JSON response type sparqlResponse struct { Results struct { BindingList []map[string]jsonNode `json:"bindings"` } `json:"results"` } type jsonNode struct { Type string `json:"type"` Value string `json:"value"` Lang string `json:"xml:lang,omitempty"` } // getMissingWikiArticles returns imdb_ids where wiki_article IS NULL. func (a *App) getMissingWikiArticles() ([]string, error) { rows, err := a.DB.Query(`SELECT imdb_id FROM imdb WHERE wiki_article IS NULL AND has_no_wiki_article = 0 AND imdb_id LIKE 'tt%'`) if err != nil { return nil, fmt.Errorf("query missing wiki articles: %w", err) } defer rows.Close() var ids []string for rows.Next() { var id string if err := rows.Scan(&id); err != nil { return nil, fmt.Errorf("scan imdb_id: %w", err) } ids = append(ids, id) } if err := rows.Err(); err != nil { return nil, fmt.Errorf("rows iteration: %w", err) } return ids, nil } // fetchWikiArticles queries Wikidata SPARQL in batches and updates wiki_article in the DB. func (a *App) fetchWikiArticles() error { ids, err := a.getMissingWikiArticles() if err != nil { return err } if len(ids) == 0 { log.Println("fetchWikiArticles: all entries have wiki_article, skipping") return nil } log.Printf("fetchWikiArticles: %d entries missing wiki_article", len(ids)) tx, err := a.DB.Begin() if err != nil { return fmt.Errorf("begin tx: %w", err) } wikiStmt, err := tx.Prepare(`UPDATE imdb SET wiki_article = ? WHERE imdb_id = ?`) if err != nil { tx.Rollback() return fmt.Errorf("prepare wiki update: %w", err) } defer wikiStmt.Close() noWikiStmt, err := tx.Prepare(`UPDATE imdb SET has_no_wiki_article = 1 WHERE imdb_id = ?`) if err != nil { tx.Rollback() return fmt.Errorf("prepare no_wiki update: %w", err) } defer noWikiStmt.Close() updated := 0 noWiki := 0 for i := 0; i < len(ids); i += wikiBatchSize { chunk := ids[i:min(i+wikiBatchSize, len(ids))] results, err := a.queryWikidataBatch(chunk) if err != nil { log.Printf("wikidata batch error at offset %d: %v", i, err) // mark all in skipped batch as no-wiki for _, id := range chunk { if _, err := noWikiStmt.Exec(id); err != nil { tx.Rollback() return fmt.Errorf("mark no_wiki for %s: %w", id, err) } noWiki++ } continue } for _, id := range chunk { acc, found := results[id] if found && acc.title != "" { if _, err := wikiStmt.Exec(acc.title, id); err != nil { tx.Rollback() return fmt.Errorf("update wiki_article for %s: %w", id, err) } updated++ } else { if _, err := noWikiStmt.Exec(id); err != nil { tx.Rollback() return fmt.Errorf("mark no_wiki for %s: %w", id, err) } noWiki++ } } done := i + len(chunk) log.Printf("fetchWikiArticles: [%d/%d]", done, len(ids)) // rate limit between batches if i+wikiBatchSize < len(ids) { time.Sleep(wikiDelay) } } if err := tx.Commit(); err != nil { return fmt.Errorf("commit wiki articles: %w", err) } log.Printf("fetchWikiArticles: %d wiki articles updated, %d marked as no wiki", updated, noWiki) return nil } // queryWikidataBatch sends a SPARQL query for the given IDs and returns a map of id -> wikiAcc. func (a *App) queryWikidataBatch(ids []string) (map[string]wikiAcc, error) { sparql := buildSparql(ids) time.Sleep(wikiDelay) endpoint := wikidataSparql + "?" + url.Values{ "query": {sparql}, "format": {"json"}, }.Encode() raw, err := doGETWithRetry(endpoint, a.Config.UserAgent) if err != nil { return nil, fmt.Errorf("SPARQL request: %w", err) } var data sparqlResponse if err := json.Unmarshal(raw, &data); err != nil { return nil, fmt.Errorf("SPARQL JSON parse: %w", err) } results := make(map[string]wikiAcc) for _, b := range data.Results.BindingList { imdb := b["imdbVal"].Value if !strings.HasPrefix(imdb, "tt") { continue } if n, ok := b["article"]; ok && n.Value != "" { // Extract title from URL: strip "https://en.wikipedia.org/wiki/" title, _ := url.PathUnescape(strings.TrimPrefix(n.Value, "https://en.wikipedia.org/wiki/")) results[imdb] = wikiAcc{ wikiArticle: n.Value, title: title, } } } return results, nil } // buildSparql creates a SPARQL query that resolves IMDb IDs to English Wikipedia article URLs. func buildSparql(ids []string) string { var vals strings.Builder for _, id := range ids { vals.WriteString(fmt.Sprintf(" \"%s\"\n", id)) } return fmt.Sprintf(`PREFIX schema: PREFIX wdt: SELECT ?imdbVal ?article WHERE { VALUES ?imdbVal { %s } ?item wdt:P345 ?imdbVal . OPTIONAL { ?article schema:about ?item ; schema:isPartOf . } } ORDER BY ?imdbVal`, vals.String()) } // doGET performs a GET request with the configured User-Agent. func doGET(uri, userAgent string) ([]byte, error) { req, err := http.NewRequest("GET", uri, nil) if err != nil { return nil, err } req.Header.Set("User-Agent", userAgent) resp, err := wikiClient.Do(req) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(io.LimitReader(resp.Body, 2048)) return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body) } return io.ReadAll(resp.Body) } // doGETWithRetry retries on 5xx errors. func doGETWithRetry(uri, userAgent string) ([]byte, error) { var lastErr error for attempt := 0; attempt < wikiMaxRetries; attempt++ { if attempt > 0 { backoff := wikiRetryBackoff * time.Duration(1<<(attempt-1)) log.Printf(" retry %d/%d after %v", attempt+1, wikiMaxRetries, backoff) time.Sleep(backoff) } raw, err := doGET(uri, userAgent) if err == nil { return raw, nil } lastErr = err if !strings.Contains(err.Error(), "HTTP 5") { break } } return nil, lastErr } // min returns the smaller of a and b. func min(a, b int) int { if a < b { return a } return b }