package main import ( "encoding/json" "fmt" "io" "log" "net/http" "net/url" "strconv" "strings" "time" ) var wikiArticleClient = &http.Client{Timeout: 60 * time.Second} // wikiArticleEntry holds extracted fields from a wiki article API response. type wikiArticleEntry struct { Description string Year int PosterURL string Synopsis string License string LicenseURL string NumAccolades int } // fetchWikiArticlesData queries the custom wiki server for all entries // that have a wiki_article and updates the imdb table with extracted fields. func (a *App) fetchWikiArticlesData() error { /* rows, err := a.DB.Query(` SELECT id, imdb_id, wiki_article FROM imdb WHERE wiki_article IS NOT NULL AND wiki_status_code != 404 AND (synopsis IS NULL OR description IS NULL OR year IS NULL OR poster_url IS NULL OR license IS NULL OR license_url IS NULL OR num_accolades IS NULL) `) */ rows, err := a.DB.Query(` SELECT id, imdb_id, wiki_article FROM imdb WHERE wiki_article IS NOT NULL AND (synopsis IS NULL OR description IS NULL OR year IS NULL OR poster_url IS NULL OR license IS NULL OR license_url IS NULL OR num_accolades IS NULL) `) if err != nil { return fmt.Errorf("query wiki articles: %w", err) } defer rows.Close() type dbRow struct { id int imdbID string wikiArticle string } var entries []dbRow for rows.Next() { var r dbRow if err := rows.Scan(&r.id, &r.imdbID, &r.wikiArticle); err != nil { return fmt.Errorf("scan row: %w", err) } entries = append(entries, r) } if err := rows.Err(); err != nil { return fmt.Errorf("rows iteration: %w", err) } if len(entries) == 0 { log.Println("fetchWikiArticlesData: all entries complete, skipping") return nil } log.Printf("fetchWikiArticlesData: %d entries need wiki data", len(entries)) tx, err := a.DB.Begin() if err != nil { return fmt.Errorf("begin tx: %w", err) } stmt, err := tx.Prepare(` UPDATE imdb SET synopsis = ?, description = ?, year = ?, poster_url = ?, license = ?, license_url = ?, num_accolades = ? WHERE id = ? `) if err != nil { tx.Rollback() return fmt.Errorf("prepare wiki update: %w", err) } defer stmt.Close() statusStmt, err := tx.Prepare(` UPDATE imdb SET wiki_status_code = ? WHERE id = ? `) if err != nil { tx.Rollback() return fmt.Errorf("prepare wiki status update: %w", err) } defer statusStmt.Close() type result struct { id int entry wikiArticleEntry statusCode int } ch := make(chan result, 1) // Serial processing with 2s between requests go func() { for i, item := range entries { if i > 0 { time.Sleep(2 * time.Second) } entry, statusCode, err := a.queryWikiArticle(item.wikiArticle) ch <- result{id: item.id, entry: entry, statusCode: statusCode} if err != nil { log.Printf("wiki error %d/%d %s (%s): HTTP %d - %v", i+1, len(entries), item.imdbID, item.wikiArticle, statusCode, err) } } close(ch) }() updated := 0 skipped := 0 for r := range ch { // Always record status code if r.statusCode > 0 { if _, err := statusStmt.Exec(r.statusCode, r.id); err != nil { tx.Rollback() return fmt.Errorf("update wiki_status_code for id %d: %w", r.id, err) } } // Only update data fields on success if r.statusCode == 200 { e := r.entry _, err := stmt.Exec( e.Synopsis, e.Description, e.Year, e.PosterURL, e.License, e.LicenseURL, e.NumAccolades, r.id, ) if err != nil { tx.Rollback() return fmt.Errorf("update wiki data for id %d: %w", r.id, err) } updated++ } else { skipped++ } } if err := tx.Commit(); err != nil { return fmt.Errorf("commit wiki data: %w", err) } log.Printf("fetchWikiArticlesData: %d updated, %d skipped (non-200)", updated, skipped) return nil } func (a *App) queryWikiArticle(name string) (wikiArticleEntry, int, error) { // Build URL — name is decoded from DB, encode it for the request reqURL := fmt.Sprintf("%s?username=%s&name=%s", a.Config.WikiServer, url.QueryEscape(a.Config.WikiUsername), url.PathEscape(name)) var resp *http.Response var err error for attempt := 0; attempt < 5; attempt++ { if attempt > 0 { backoff := 1 << attempt log.Printf("retry %d/%d for %s after %ds backoff", attempt, 4, name, backoff) time.Sleep(time.Duration(backoff) * time.Second) } resp, err = wikiArticleClient.Get(reqURL) if err != nil { continue } if resp.StatusCode == http.StatusTooManyRequests { resp.Body.Close() continue } if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(io.LimitReader(resp.Body, 2048)) resp.Body.Close() return wikiArticleEntry{}, resp.StatusCode, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body) } break } if err != nil { return wikiArticleEntry{}, 0, fmt.Errorf("http get: %w", err) } defer resp.Body.Close() var articles []map[string]interface{} if err := json.NewDecoder(resp.Body).Decode(&articles); err != nil { return wikiArticleEntry{}, 200, fmt.Errorf("json decode: %w", err) } if len(articles) == 0 { return wikiArticleEntry{}, 200, fmt.Errorf("no articles returned") } article := articles[0] var entry wikiArticleEntry // description if desc, ok := article["description"]; ok { entry.Description = fmt.Sprintf("%v", desc) } // synopsis from Plot section entry.Synopsis = extractSynopsis(article) // year, poster_url from infobox entry.Year, entry.PosterURL = extractInfoboxData(article) // license if licList, ok := article["license"]; ok { if arr, ok := licList.([]interface{}); ok && len(arr) > 0 { if lic, ok := arr[0].(map[string]interface{}); ok { entry.License = fmt.Sprintf("%v", lic["name"]) entry.LicenseURL = fmt.Sprintf("%v", lic["url"]) } } } // num_accolades from tables entry.NumAccolades = extractAccolades(article) return entry, 200, nil } func extractSynopsis(article map[string]interface{}) string { sections, ok := article["sections"].([]interface{}) if !ok { return "" } for _, sec := range sections { s, ok := sec.(map[string]interface{}) if !ok || s["name"] != "Plot" { continue } var parts []string if pp, ok := s["has_parts"].([]interface{}); ok { for _, p := range pp { if pp2, ok := p.(map[string]interface{}); ok && pp2["type"] == "paragraph" { parts = append(parts, fmt.Sprintf("%v", pp2["value"])) } } } if len(parts) > 0 { return strings.Join(parts, " ") } return "" } return "" } func extractInfoboxData(article map[string]interface{}) (year int, posterURL string) { infoboxes, ok := article["infoboxes"].([]interface{}) if !ok || len(infoboxes) == 0 { return } ib, ok := infoboxes[0].(map[string]interface{}) if !ok { return } parts, ok := ib["has_parts"].([]interface{}) if !ok || len(parts) == 0 { return } section, ok := parts[0].(map[string]interface{}) if !ok { return } subParts, _ := section["has_parts"].([]interface{}) for _, p := range subParts { fp, ok := p.(map[string]interface{}) if !ok { continue } // poster from first image if fp["type"] == "image" && posterURL == "" { if imgs, ok := fp["images"].([]interface{}); ok && len(imgs) > 0 { if img, ok := imgs[0].(map[string]interface{}); ok { if cu, ok := img["content_url"]; ok { posterURL = fmt.Sprintf("%v", cu) } } } } // year from Release dates if fp["name"] == "Release dates" && year == 0 { if items, ok := fp["has_parts"].([]interface{}); ok && len(items) > 0 { if item, ok := items[0].(map[string]interface{}); ok { val := fmt.Sprintf("%v", item["value"]) year = extractYear(val) } } } } return } func extractYear(s string) int { // Look for 4-digit year pattern like "1972" or "(1972-03-14)" for i := 0; i+3 < len(s); i++ { if s[i] == '(' { i++ // skip paren } if i+4 <= len(s) && s[i] >= '1' && s[i] <= '2' { if y, err := strconv.Atoi(s[i : i+4]); err == nil && y >= 1800 && y <= 2100 { return y } } } return 0 } func extractAccolades(article map[string]interface{}) int { tables, ok := article["tables"].([]interface{}) if !ok { return 0 } total := 0 for _, t := range tables { tab, ok := t.(map[string]interface{}) if !ok { continue } rows, _ := tab["rows"].([]interface{}) total += len(rows) } return total }