summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordev2026-06-25 20:07:08 +0200
committerdev2026-06-25 20:07:08 +0200
commitfa742660190a7d3b7b6f068565ce543d413edbab (patch)
tree3515262a4bd47aac3e998ecf134451257b0181a0
parentd41b60d08fdd5a6589cdb4e33ac1931fa16aef4c (diff)
downloadhnimdbbot-fa742660190a7d3b7b6f068565ce543d413edbab.tar.gz
feat: fetch Wikipedia article titles via Wikidata SPARQL
- Query Wikidata SPARQL in batches of 30 for entries missing wiki_article
- Store wiki_article title in imdb table
- Respect rate limits with configurable delay and retry on 5xx/429
- Skip entries that have no Wikipedia article
- Remove unique constraint on wiki_article (multiple entries can share one)
-rw-r--r--schema/schema.sql1
-rw-r--r--src/main.go4
-rw-r--r--src/wikidata.go239
3 files changed, 243 insertions, 1 deletions
diff --git a/schema/schema.sql b/schema/schema.sql
index 9b1e5ab..f957079 100644
--- a/schema/schema.sql
+++ b/schema/schema.sql
@@ -34,7 +34,6 @@ CREATE TABLE `imdb` (
`num_votes` int(11) DEFAULT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `imdb_id` (`imdb_id`),
- UNIQUE KEY `wiki_article` (`wiki_article`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
diff --git a/src/main.go b/src/main.go
index 123cc64..272e4b6 100644
--- a/src/main.go
+++ b/src/main.go
@@ -194,4 +194,8 @@ func main() {
if err = app.fetchAndUpdateImdbData(); err != nil {
log.Fatalf("fetchAndUpdateImdbData: %v", err)
}
+
+ if err = app.fetchWikiArticles(); err != nil {
+ log.Fatalf("fetchWikiArticles: %v", err)
+ }
}
diff --git a/src/wikidata.go b/src/wikidata.go
new file mode 100644
index 0000000..b3018e5
--- /dev/null
+++ b/src/wikidata.go
@@ -0,0 +1,239 @@
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "net/url"
+ "strings"
+ "time"
+)
+
+// Settings for the Wikidata SPARQL lookups performed in this file.
+const (
+	// wikidataSparql is the endpoint every batch query is sent to.
+	wikidataSparql = "https://query.wikidata.org/sparql"
+	// wikiBatchSize is the number of IMDb IDs placed into one SPARQL query.
+	wikiBatchSize = 30
+	// wikiDelay is slept before every SPARQL request (queryWikidataBatch) and
+	// again after each processed batch except the last (fetchWikiArticles).
+	// NOTE(review): an earlier note here claimed "~15 req/min, safe under
+	// 20 req/min limit"; that rate does not follow from a 1s delay. Confirm
+	// the intended request rate against the endpoint's published limits.
+	wikiDelay = 1 * time.Second
+	// wikiMaxRetries is the total number of attempts doGETWithRetry makes for
+	// one request, the first attempt included.
+	wikiMaxRetries = 3
+	// wikiRetryBackoff is the wait before the first retry; it doubles for each
+	// further retry.
+	wikiRetryBackoff = 15 * time.Second
+)
+
+// wikiClient is the shared HTTP client used by doGET; each request is bounded
+// by a 120-second overall timeout.
+var wikiClient = &http.Client{Timeout: 120 * time.Second}
+
+// wikiAcc holds what a SPARQL batch returned for a single IMDb ID.
+type wikiAcc struct {
+	// wikiArticle is the article URL exactly as returned in the "article" binding.
+	wikiArticle string
+	// title is wikiArticle with the "https://en.wikipedia.org/wiki/" prefix
+	// stripped; this is the value written to the wiki_article column.
+	title string
+}
+
+// sparqlResponse mirrors the part of the SPARQL JSON result document that this
+// file reads: the list of result rows found under results.bindings.
+type sparqlResponse struct {
+	Results struct {
+		// BindingList has one entry per result row, keyed by SPARQL variable
+		// name (queryWikidataBatch reads "imdbVal" and "article").
+		BindingList []map[string]jsonNode `json:"bindings"`
+	} `json:"results"`
+}
+
+// jsonNode is a single bound value inside a SPARQL JSON result row.
+type jsonNode struct {
+	// Type is decoded from the "type" member; the code shown here never reads it.
+	Type string `json:"type"`
+	// Value is the bound value in string form.
+	Value string `json:"value"`
+	// Lang is decoded from the "xml:lang" member; the code shown here never reads it.
+	Lang string `json:"xml:lang,omitempty"`
+}
+
+// getMissingWikiArticles lists the imdb_id of every imdb row whose wiki_article
+// is still NULL and whose imdb_id starts with "tt".
+//
+// It returns a nil slice when no such row exists, and a wrapped error when the
+// query, a row scan, or the row iteration fails.
+func (a *App) getMissingWikiArticles() ([]string, error) {
+	const query = `SELECT imdb_id FROM imdb WHERE wiki_article IS NULL AND imdb_id LIKE 'tt%'`
+
+	rows, queryErr := a.DB.Query(query)
+	if queryErr != nil {
+		return nil, fmt.Errorf("query missing wiki articles: %w", queryErr)
+	}
+	defer rows.Close()
+
+	var missing []string
+	for rows.Next() {
+		var imdbID string
+		scanErr := rows.Scan(&imdbID)
+		if scanErr != nil {
+			return nil, fmt.Errorf("scan imdb_id: %w", scanErr)
+		}
+		missing = append(missing, imdbID)
+	}
+
+	// rows.Next() returning false can also mean the iteration broke off early.
+	iterErr := rows.Err()
+	if iterErr != nil {
+		return nil, fmt.Errorf("rows iteration: %w", iterErr)
+	}
+	return missing, nil
+}
+
+// fetchWikiArticles fills in wiki_article for every imdb row that lacks one.
+//
+// The missing IDs are resolved against Wikidata in chunks of wikiBatchSize.
+// A chunk whose SPARQL request fails is logged and skipped; IDs for which
+// Wikidata returned no article title are left untouched. Each chunk is written
+// in its own short transaction, so no transaction (and none of its row locks)
+// stays open while the next rate-limited HTTP request is in flight, and a
+// failure in a later chunk cannot discard titles that were already stored.
+//
+// It returns an error if the list of missing IDs cannot be read or if writing
+// a chunk to the database fails.
+func (a *App) fetchWikiArticles() error {
+	ids, err := a.getMissingWikiArticles()
+	if err != nil {
+		return err
+	}
+	if len(ids) == 0 {
+		log.Println("fetchWikiArticles: all entries have wiki_article, skipping")
+		return nil
+	}
+	log.Printf("fetchWikiArticles: %d entries missing wiki_article", len(ids))
+
+	updated := 0
+	for i := 0; i < len(ids); i += wikiBatchSize {
+		chunk := ids[i:min(i+wikiBatchSize, len(ids))]
+
+		results, err := a.queryWikidataBatch(chunk)
+		if err != nil {
+			// Best effort: one failed batch must not abort the whole run.
+			// Its rows keep wiki_article NULL and are picked up again next time.
+			log.Printf("wikidata batch error at offset %d: %v", i, err)
+			continue
+		}
+
+		stored, err := a.storeWikiTitles(results)
+		if err != nil {
+			return err
+		}
+		updated += stored
+
+		log.Printf("fetchWikiArticles: [%d/%d]", i+len(chunk), len(ids))
+
+		// rate limit between batches
+		if i+wikiBatchSize < len(ids) {
+			time.Sleep(wikiDelay)
+		}
+	}
+
+	log.Printf("fetchWikiArticles: %d wiki articles updated", updated)
+	return nil
+}
+
+// storeWikiTitles writes the non-empty titles of one SPARQL batch to the imdb
+// table inside a single transaction and returns how many UPDATEs were executed.
+// On any error the transaction is rolled back and nothing from this batch is kept.
+func (a *App) storeWikiTitles(results map[string]wikiAcc) (int, error) {
+	tx, err := a.DB.Begin()
+	if err != nil {
+		return 0, fmt.Errorf("begin tx: %w", err)
+	}
+	// Only the error paths below need this; once Commit has succeeded the
+	// deferred Rollback has nothing left to undo and its result is ignored.
+	defer tx.Rollback()
+
+	stmt, err := tx.Prepare(`UPDATE imdb SET wiki_article = ? WHERE imdb_id = ?`)
+	if err != nil {
+		return 0, fmt.Errorf("prepare wiki update: %w", err)
+	}
+	defer stmt.Close()
+
+	stored := 0
+	for id, acc := range results {
+		if acc.title == "" {
+			continue
+		}
+		if _, err := stmt.Exec(acc.title, id); err != nil {
+			return 0, fmt.Errorf("update wiki_article for %s: %w", id, err)
+		}
+		stored++
+	}
+
+	if err := tx.Commit(); err != nil {
+		return 0, fmt.Errorf("commit wiki articles: %w", err)
+	}
+	return stored, nil
+}
+
+// queryWikidataBatch resolves one chunk of IMDb IDs through a single SPARQL
+// request and returns the hits keyed by IMDb ID.
+//
+// It sleeps for wikiDelay before sending the request. Result rows whose imdbVal
+// does not start with "tt", or whose article binding is absent or empty, are
+// dropped, so IDs without an English Wikipedia article are simply missing from
+// the returned map. If several rows carry the same IMDb ID, the last one wins.
+func (a *App) queryWikidataBatch(ids []string) (map[string]wikiAcc, error) {
+	const articlePrefix = "https://en.wikipedia.org/wiki/"
+
+	query := buildSparql(ids)
+	time.Sleep(wikiDelay)
+
+	params := url.Values{}
+	params.Set("query", query)
+	params.Set("format", "json")
+	endpoint := wikidataSparql + "?" + params.Encode()
+
+	body, err := doGETWithRetry(endpoint, a.Config.UserAgent)
+	if err != nil {
+		return nil, fmt.Errorf("SPARQL request: %w", err)
+	}
+
+	var parsed sparqlResponse
+	if err = json.Unmarshal(body, &parsed); err != nil {
+		return nil, fmt.Errorf("SPARQL JSON parse: %w", err)
+	}
+
+	byID := make(map[string]wikiAcc)
+	for _, row := range parsed.Results.BindingList {
+		// A missing "imdbVal" key yields the zero jsonNode, whose empty Value
+		// fails the prefix check below.
+		imdbID := row["imdbVal"].Value
+		if !strings.HasPrefix(imdbID, "tt") {
+			continue
+		}
+
+		article, found := row["article"]
+		if !found || article.Value == "" {
+			continue
+		}
+
+		// The title is the article URL with the site prefix removed; it is
+		// kept exactly as the endpoint returned it (no unescaping).
+		byID[imdbID] = wikiAcc{
+			wikiArticle: article.Value,
+			title:       strings.TrimPrefix(article.Value, articlePrefix),
+		}
+	}
+
+	return byID, nil
+}
+
+// buildSparql creates a SPARQL query that resolves IMDb IDs to English Wikipedia article URLs.
+//
+// Every ID is emitted as a double-quoted string literal inside the VALUES
+// clause. Backslashes, double quotes and line breaks inside an ID are escaped
+// first, so a malformed value cannot end the literal early and alter the
+// structure of the query. IDs are matched through the wdt:P345 property; the
+// article pattern is OPTIONAL, so a matched item without an English Wikipedia
+// page still produces a row, with ?article left unbound.
+func buildSparql(ids []string) string {
+	escaper := strings.NewReplacer(
+		`\`, `\\`,
+		`"`, `\"`,
+		"\n", `\n`,
+		"\r", `\r`,
+	)
+
+	var vals strings.Builder
+	for _, id := range ids {
+		fmt.Fprintf(&vals, " \"%s\"\n", escaper.Replace(id))
+	}
+
+	return fmt.Sprintf(`PREFIX schema: <http://schema.org/>
+PREFIX wdt: <http://www.wikidata.org/prop/direct/>
+
+SELECT ?imdbVal ?article
+WHERE {
+ VALUES ?imdbVal {
+%s
+ }
+ ?item wdt:P345 ?imdbVal .
+ OPTIONAL {
+ ?article schema:about ?item ;
+ schema:isPartOf <https://en.wikipedia.org/> .
+ }
+}
+ORDER BY ?imdbVal`, vals.String())
+}
+
+// doGET issues one GET request for uri through wikiClient, sending userAgent
+// as the User-Agent header, and returns the full response body.
+//
+// Any status other than 200 is turned into an error of the form
+// "HTTP <code>: <body>", where <body> is at most the first 2048 bytes of the
+// response. doGETWithRetry inspects that message to decide whether to retry.
+func doGET(uri, userAgent string) ([]byte, error) {
+	request, err := http.NewRequest("GET", uri, nil)
+	if err != nil {
+		return nil, err
+	}
+	request.Header.Set("User-Agent", userAgent)
+
+	response, err := wikiClient.Do(request)
+	if err != nil {
+		return nil, err
+	}
+	defer response.Body.Close()
+
+	if response.StatusCode == http.StatusOK {
+		return io.ReadAll(response.Body)
+	}
+
+	// The snippet only decorates the error message, so a failed read of it is
+	// deliberately ignored.
+	snippet, _ := io.ReadAll(io.LimitReader(response.Body, 2048))
+	return nil, fmt.Errorf("HTTP %d: %s", response.StatusCode, snippet)
+}
+
+// doGETWithRetry calls doGET up to wikiMaxRetries times and returns the first
+// successful body.
+//
+// A further attempt is made only when the server answered with a 5xx status or
+// with 429 (Too Many Requests). Before each retry it waits wikiRetryBackoff,
+// doubled for every retry already made. Any other failure, such as a transport
+// error or a different HTTP status, is returned immediately. When all attempts
+// fail, the error of the last attempt is returned.
+func doGETWithRetry(uri, userAgent string) ([]byte, error) {
+	var lastErr error
+	for attempt := 0; attempt < wikiMaxRetries; attempt++ {
+		if attempt > 0 {
+			backoff := wikiRetryBackoff * time.Duration(1<<(attempt-1))
+			log.Printf(" retry %d/%d after %v", attempt+1, wikiMaxRetries, backoff)
+			time.Sleep(backoff)
+		}
+		raw, err := doGET(uri, userAgent)
+		if err == nil {
+			return raw, nil
+		}
+		lastErr = err
+		if !isRetryableHTTPError(err) {
+			break
+		}
+	}
+	return nil, lastErr
+}
+
+// isRetryableHTTPError reports whether err is a doGET status error for a 5xx or
+// 429 response. doGET formats those as "HTTP <code>: <body>", so the check is
+// anchored at the start of the message; matching anywhere would also fire on
+// text that merely appears inside the echoed response body.
+func isRetryableHTTPError(err error) bool {
+	msg := err.Error()
+	return strings.HasPrefix(msg, "HTTP 5") || strings.HasPrefix(msg, "HTTP 429:")
+}
+
+// min reports the lesser of two ints; when they are equal, that common value
+// is returned.
+func min(a, b int) int {
+	if b < a {
+		return b
+	}
+	return a
+}
+