From e13cbe6a4fd1ebe3f2c3bfc86c54e8dd17c59624 Mon Sep 17 00:00:00 2001 From: dev Date: Thu, 25 Jun 2026 21:14:39 +0200 Subject: feat: fetch missing wiki data from custom server and populate imdb table - Add wiki_server and wiki_username config fields - Query custom server for each wiki_article entry - Extract description, synopsis (Plot), year, poster_url, license, license_url, num_accolades from structured JSON response - Serial processing with 1 req/s rate limit - Update only entries missing at least one target column --- src/config.go | 4 + src/main.go | 4 + src/wikiarticle.go | 283 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 291 insertions(+) create mode 100644 src/wikiarticle.go (limited to 'src') diff --git a/src/config.go b/src/config.go index a798f97..8f143dd 100644 --- a/src/config.go +++ b/src/config.go @@ -20,6 +20,10 @@ type Config struct { UserAgent string `json:"user_agent"` Delay int `json:"delay"` + // wiki + WikiServer string `json:"wiki_server"` + WikiUsername string `json:"wiki_username"` + // auth AccessToken string `json:"-"` RefreshToken string `json:"refresh_token"` diff --git a/src/main.go b/src/main.go index 272e4b6..9fd53db 100644 --- a/src/main.go +++ b/src/main.go @@ -198,4 +198,8 @@ func main() { if err = app.fetchWikiArticles(); err != nil { log.Fatalf("fetchWikiArticles: %v", err) } + + if err = app.fetchWikiArticlesData(); err != nil { + log.Fatalf("fetchWikiArticlesData: %v", err) + } } diff --git a/src/wikiarticle.go b/src/wikiarticle.go new file mode 100644 index 0000000..61a6084 --- /dev/null +++ b/src/wikiarticle.go @@ -0,0 +1,283 @@ +package main + +import ( + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "net/url" + "strconv" + "strings" + "time" +) + +var wikiArticleClient = &http.Client{Timeout: 60 * time.Second} + +// wikiArticleEntry holds extracted fields from a wiki article API response. +type wikiArticleEntry struct { + Description string + Year int + PosterURL string + Synopsis string + License string + LicenseURL string + NumAccolades int +} + +// fetchWikiArticlesData queries the custom wiki server for all entries +// that have a wiki_article and updates the imdb table with extracted fields. +func (a *App) fetchWikiArticlesData() error { + rows, err := a.DB.Query(` + SELECT id, imdb_id, wiki_article FROM imdb + WHERE wiki_article IS NOT NULL + AND (synopsis IS NULL OR description IS NULL OR year IS NULL + OR poster_url IS NULL OR license IS NULL OR license_url IS NULL OR num_accolades IS NULL) + `) + if err != nil { + return fmt.Errorf("query wiki articles: %w", err) + } + defer rows.Close() + + type dbRow struct { + id int + imdbID string + wikiArticle string + } + var entries []dbRow + for rows.Next() { + var r dbRow + if err := rows.Scan(&r.id, &r.imdbID, &r.wikiArticle); err != nil { + return fmt.Errorf("scan row: %w", err) + } + entries = append(entries, r) + } + if err := rows.Err(); err != nil { + return fmt.Errorf("rows iteration: %w", err) + } + + if len(entries) == 0 { + log.Println("fetchWikiArticlesData: all entries complete, skipping") + return nil + } + log.Printf("fetchWikiArticlesData: %d entries need wiki data", len(entries)) + + tx, err := a.DB.Begin() + if err != nil { + return fmt.Errorf("begin tx: %w", err) + } + + stmt, err := tx.Prepare(` + UPDATE imdb SET + synopsis = ?, description = ?, year = ?, poster_url = ?, + license = ?, license_url = ?, num_accolades = ? + WHERE id = ? + `) + if err != nil { + tx.Rollback() + return fmt.Errorf("prepare wiki update: %w", err) + } + defer stmt.Close() + + type result struct { + id int + entry wikiArticleEntry + } + ch := make(chan result, 1) + + // Serial processing with 1 req/s rate limit + go func() { + for _, item := range entries { + entry, err := a.queryWikiArticle(item.wikiArticle) + if err != nil { + log.Printf("wiki error for %s (%s): %v", item.imdbID, item.wikiArticle, err) + continue + } + ch <- result{id: item.id, entry: entry} + time.Sleep(1 * time.Second) + } + close(ch) + }() + + updated := 0 + for r := range ch { + e := r.entry + _, err := stmt.Exec( + e.Synopsis, e.Description, e.Year, e.PosterURL, + e.License, e.LicenseURL, e.NumAccolades, r.id, + ) + if err != nil { + tx.Rollback() + return fmt.Errorf("update wiki data for id %d: %w", r.id, err) + } + updated++ + } + + if err := tx.Commit(); err != nil { + return fmt.Errorf("commit wiki data: %w", err) + } + + log.Printf("fetchWikiArticlesData: %d entries updated", updated) + return nil +} + +// queryWikiArticle fetches and parses a single wiki article from the custom server. +func (a *App) queryWikiArticle(name string) (wikiArticleEntry, error) { + reqURL := a.Config.WikiServer + "?" + url.Values{ + "username": {a.Config.WikiUsername}, + "name": {name}, + }.Encode() + + resp, err := wikiArticleClient.Get(reqURL) + if err != nil { + return wikiArticleEntry{}, fmt.Errorf("http get: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2048)) + return wikiArticleEntry{}, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body) + } + + var articles []map[string]interface{} + if err := json.NewDecoder(resp.Body).Decode(&articles); err != nil { + return wikiArticleEntry{}, fmt.Errorf("json decode: %w", err) + } + if len(articles) == 0 { + return wikiArticleEntry{}, fmt.Errorf("no articles returned") + } + article := articles[0] + + var entry wikiArticleEntry + + // description + if desc, ok := article["description"]; ok { + entry.Description = fmt.Sprintf("%v", desc) + } + + // synopsis from Plot section + entry.Synopsis = extractSynopsis(article) + + // year, poster_url from infobox + entry.Year, entry.PosterURL = extractInfoboxData(article) + + // license + if licList, ok := article["license"]; ok { + if arr, ok := licList.([]interface{}); ok && len(arr) > 0 { + if lic, ok := arr[0].(map[string]interface{}); ok { + entry.License = fmt.Sprintf("%v", lic["name"]) + entry.LicenseURL = fmt.Sprintf("%v", lic["url"]) + } + } + } + + // num_accolades from tables + entry.NumAccolades = extractAccolades(article) + + return entry, nil +} + +func extractSynopsis(article map[string]interface{}) string { + sections, ok := article["sections"].([]interface{}) + if !ok { + return "" + } + for _, sec := range sections { + s, ok := sec.(map[string]interface{}) + if !ok || s["name"] != "Plot" { + continue + } + var parts []string + if pp, ok := s["has_parts"].([]interface{}); ok { + for _, p := range pp { + if pp2, ok := p.(map[string]interface{}); ok && pp2["type"] == "paragraph" { + parts = append(parts, fmt.Sprintf("%v", pp2["value"])) + } + } + } + if len(parts) > 0 { + return strings.Join(parts, " ") + } + return "" + } + return "" +} + +func extractInfoboxData(article map[string]interface{}) (year int, posterURL string) { + infoboxes, ok := article["infoboxes"].([]interface{}) + if !ok || len(infoboxes) == 0 { + return + } + ib, ok := infoboxes[0].(map[string]interface{}) + if !ok { + return + } + parts, ok := ib["has_parts"].([]interface{}) + if !ok || len(parts) == 0 { + return + } + section, ok := parts[0].(map[string]interface{}) + if !ok { + return + } + subParts, _ := section["has_parts"].([]interface{}) + + for _, p := range subParts { + fp, ok := p.(map[string]interface{}) + if !ok { + continue + } + // poster from first image + if fp["type"] == "image" && posterURL == "" { + if imgs, ok := fp["images"].([]interface{}); ok && len(imgs) > 0 { + if img, ok := imgs[0].(map[string]interface{}); ok { + if cu, ok := img["content_url"]; ok { + posterURL = fmt.Sprintf("%v", cu) + } + } + } + } + // year from Release dates + if fp["name"] == "Release dates" && year == 0 { + if items, ok := fp["has_parts"].([]interface{}); ok && len(items) > 0 { + if item, ok := items[0].(map[string]interface{}); ok { + val := fmt.Sprintf("%v", item["value"]) + year = extractYear(val) + } + } + } + } + return +} + +func extractYear(s string) int { + // Look for 4-digit year pattern like "1972" or "(1972-03-14)" + for i := 0; i+3 < len(s); i++ { + if s[i] == '(' { + i++ // skip paren + } + if i+4 <= len(s) && s[i] >= '1' && s[i] <= '2' { + if y, err := strconv.Atoi(s[i : i+4]); err == nil && y >= 1800 && y <= 2100 { + return y + } + } + } + return 0 +} + +func extractAccolades(article map[string]interface{}) int { + tables, ok := article["tables"].([]interface{}) + if !ok { + return 0 + } + total := 0 + for _, t := range tables { + tab, ok := t.(map[string]interface{}) + if !ok { + continue + } + rows, _ := tab["rows"].([]interface{}) + total += len(rows) + } + return total +} -- cgit v1.2.3