summaryrefslogtreecommitdiff
path: root/src/wikiarticle.go
diff options
context:
space:
mode:
authordev2026-06-25 21:14:39 +0200
committerdev2026-06-25 21:14:39 +0200
commite13cbe6a4fd1ebe3f2c3bfc86c54e8dd17c59624 (patch)
treed36f8973ed80cdc2a868ebf3a6caaff2d5fa75c0 /src/wikiarticle.go
parenta5e3f8447022a50080a62285e359d38e0875de21 (diff)
downloadhnimdbbot-e13cbe6a4fd1ebe3f2c3bfc86c54e8dd17c59624.tar.gz
feat: fetch missing wiki data from custom server and populate imdb table
- Add wiki_server and wiki_username config fields - Query custom server for each wiki_article entry - Extract description, synopsis (Plot), year, poster_url, license, license_url, num_accolades from structured JSON response - Serial processing with 1 req/s rate limit - Update only entries missing at least one target column
Diffstat (limited to 'src/wikiarticle.go')
-rw-r--r--src/wikiarticle.go283
1 files changed, 283 insertions, 0 deletions
diff --git a/src/wikiarticle.go b/src/wikiarticle.go
new file mode 100644
index 0000000..61a6084
--- /dev/null
+++ b/src/wikiarticle.go
@@ -0,0 +1,283 @@
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "net/url"
+ "strconv"
+ "strings"
+ "time"
+)
+
// wikiArticleClient is the shared HTTP client for wiki server requests.
// Every request made through it is capped at one minute in total.
var wikiArticleClient = &http.Client{
	Timeout: time.Minute,
}
+
// wikiArticleEntry is the subset of a wiki article API response that gets
// written to the imdb table.
type wikiArticleEntry struct {
	// Description is the article's top-level "description" value.
	Description string
	// Year is the release year parsed from the infobox "Release dates"
	// field; 0 when none was found.
	Year int
	// PosterURL is the content URL of the first infobox image.
	PosterURL string
	// Synopsis is the joined paragraph text of the "Plot" section.
	Synopsis string
	// License is the name of the first license entry.
	License string
	// LicenseURL is the URL of the first license entry.
	LicenseURL string
	// NumAccolades is the total number of rows across the article's tables.
	NumAccolades int
}
+
+// fetchWikiArticlesData queries the custom wiki server for all entries
+// that have a wiki_article and updates the imdb table with extracted fields.
+func (a *App) fetchWikiArticlesData() error {
+ rows, err := a.DB.Query(`
+ SELECT id, imdb_id, wiki_article FROM imdb
+ WHERE wiki_article IS NOT NULL
+ AND (synopsis IS NULL OR description IS NULL OR year IS NULL
+ OR poster_url IS NULL OR license IS NULL OR license_url IS NULL OR num_accolades IS NULL)
+ `)
+ if err != nil {
+ return fmt.Errorf("query wiki articles: %w", err)
+ }
+ defer rows.Close()
+
+ type dbRow struct {
+ id int
+ imdbID string
+ wikiArticle string
+ }
+ var entries []dbRow
+ for rows.Next() {
+ var r dbRow
+ if err := rows.Scan(&r.id, &r.imdbID, &r.wikiArticle); err != nil {
+ return fmt.Errorf("scan row: %w", err)
+ }
+ entries = append(entries, r)
+ }
+ if err := rows.Err(); err != nil {
+ return fmt.Errorf("rows iteration: %w", err)
+ }
+
+ if len(entries) == 0 {
+ log.Println("fetchWikiArticlesData: all entries complete, skipping")
+ return nil
+ }
+ log.Printf("fetchWikiArticlesData: %d entries need wiki data", len(entries))
+
+ tx, err := a.DB.Begin()
+ if err != nil {
+ return fmt.Errorf("begin tx: %w", err)
+ }
+
+ stmt, err := tx.Prepare(`
+ UPDATE imdb SET
+ synopsis = ?, description = ?, year = ?, poster_url = ?,
+ license = ?, license_url = ?, num_accolades = ?
+ WHERE id = ?
+ `)
+ if err != nil {
+ tx.Rollback()
+ return fmt.Errorf("prepare wiki update: %w", err)
+ }
+ defer stmt.Close()
+
+ type result struct {
+ id int
+ entry wikiArticleEntry
+ }
+ ch := make(chan result, 1)
+
+ // Serial processing with 1 req/s rate limit
+ go func() {
+ for _, item := range entries {
+ entry, err := a.queryWikiArticle(item.wikiArticle)
+ if err != nil {
+ log.Printf("wiki error for %s (%s): %v", item.imdbID, item.wikiArticle, err)
+ continue
+ }
+ ch <- result{id: item.id, entry: entry}
+ time.Sleep(1 * time.Second)
+ }
+ close(ch)
+ }()
+
+ updated := 0
+ for r := range ch {
+ e := r.entry
+ _, err := stmt.Exec(
+ e.Synopsis, e.Description, e.Year, e.PosterURL,
+ e.License, e.LicenseURL, e.NumAccolades, r.id,
+ )
+ if err != nil {
+ tx.Rollback()
+ return fmt.Errorf("update wiki data for id %d: %w", r.id, err)
+ }
+ updated++
+ }
+
+ if err := tx.Commit(); err != nil {
+ return fmt.Errorf("commit wiki data: %w", err)
+ }
+
+ log.Printf("fetchWikiArticlesData: %d entries updated", updated)
+ return nil
+}
+
+// queryWikiArticle fetches and parses a single wiki article from the custom server.
+func (a *App) queryWikiArticle(name string) (wikiArticleEntry, error) {
+ reqURL := a.Config.WikiServer + "?" + url.Values{
+ "username": {a.Config.WikiUsername},
+ "name": {name},
+ }.Encode()
+
+ resp, err := wikiArticleClient.Get(reqURL)
+ if err != nil {
+ return wikiArticleEntry{}, fmt.Errorf("http get: %w", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ body, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
+ return wikiArticleEntry{}, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
+ }
+
+ var articles []map[string]interface{}
+ if err := json.NewDecoder(resp.Body).Decode(&articles); err != nil {
+ return wikiArticleEntry{}, fmt.Errorf("json decode: %w", err)
+ }
+ if len(articles) == 0 {
+ return wikiArticleEntry{}, fmt.Errorf("no articles returned")
+ }
+ article := articles[0]
+
+ var entry wikiArticleEntry
+
+ // description
+ if desc, ok := article["description"]; ok {
+ entry.Description = fmt.Sprintf("%v", desc)
+ }
+
+ // synopsis from Plot section
+ entry.Synopsis = extractSynopsis(article)
+
+ // year, poster_url from infobox
+ entry.Year, entry.PosterURL = extractInfoboxData(article)
+
+ // license
+ if licList, ok := article["license"]; ok {
+ if arr, ok := licList.([]interface{}); ok && len(arr) > 0 {
+ if lic, ok := arr[0].(map[string]interface{}); ok {
+ entry.License = fmt.Sprintf("%v", lic["name"])
+ entry.LicenseURL = fmt.Sprintf("%v", lic["url"])
+ }
+ }
+ }
+
+ // num_accolades from tables
+ entry.NumAccolades = extractAccolades(article)
+
+ return entry, nil
+}
+
// extractSynopsis returns the text of the article's "Plot" section.
//
// It looks at the first entry in article["sections"] whose "name" is "Plot"
// and joins the "value" of each of its "paragraph" parts with a single
// space. Paragraphs whose value is missing, null or empty are skipped, so
// they contribute neither a literal "<nil>" nor a stray double space. It
// returns "" when there is no Plot section or it holds no paragraph text.
func extractSynopsis(article map[string]interface{}) string {
	sections, ok := article["sections"].([]interface{})
	if !ok {
		return ""
	}
	for _, sec := range sections {
		section, ok := sec.(map[string]interface{})
		if !ok || section["name"] != "Plot" {
			continue
		}

		// Only the first Plot section is considered, even if it is empty.
		children, _ := section["has_parts"].([]interface{})
		var paragraphs []string
		for _, child := range children {
			part, ok := child.(map[string]interface{})
			if !ok || part["type"] != "paragraph" {
				continue
			}
			switch v := part["value"].(type) {
			case nil:
				// Missing or null value: nothing to add.
			case string:
				if v != "" {
					paragraphs = append(paragraphs, v)
				}
			default:
				paragraphs = append(paragraphs, fmt.Sprintf("%v", v))
			}
		}
		return strings.Join(paragraphs, " ")
	}
	return ""
}
+
+func extractInfoboxData(article map[string]interface{}) (year int, posterURL string) {
+ infoboxes, ok := article["infoboxes"].([]interface{})
+ if !ok || len(infoboxes) == 0 {
+ return
+ }
+ ib, ok := infoboxes[0].(map[string]interface{})
+ if !ok {
+ return
+ }
+ parts, ok := ib["has_parts"].([]interface{})
+ if !ok || len(parts) == 0 {
+ return
+ }
+ section, ok := parts[0].(map[string]interface{})
+ if !ok {
+ return
+ }
+ subParts, _ := section["has_parts"].([]interface{})
+
+ for _, p := range subParts {
+ fp, ok := p.(map[string]interface{})
+ if !ok {
+ continue
+ }
+ // poster from first image
+ if fp["type"] == "image" && posterURL == "" {
+ if imgs, ok := fp["images"].([]interface{}); ok && len(imgs) > 0 {
+ if img, ok := imgs[0].(map[string]interface{}); ok {
+ if cu, ok := img["content_url"]; ok {
+ posterURL = fmt.Sprintf("%v", cu)
+ }
+ }
+ }
+ }
+ // year from Release dates
+ if fp["name"] == "Release dates" && year == 0 {
+ if items, ok := fp["has_parts"].([]interface{}); ok && len(items) > 0 {
+ if item, ok := items[0].(map[string]interface{}); ok {
+ val := fmt.Sprintf("%v", item["value"])
+ year = extractYear(val)
+ }
+ }
+ }
+ }
+ return
+}
+
// extractYear returns the first plausible year found in s, or 0 if there is
// none.
//
// A year is a run of exactly four ASCII digits, not adjacent to any other
// digit, whose value lies in [1800, 2100]. This matches forms such as
// "1972", "(1972-03-14)" and "March 14, 1972" while rejecting four-digit
// windows inside longer numbers such as "12019".
func extractYear(s string) int {
	isDigit := func(b byte) bool { return b >= '0' && b <= '9' }

	for i := 0; i < len(s); {
		if !isDigit(s[i]) {
			i++
			continue
		}
		// Consume the whole digit run so its length can be checked.
		start := i
		for i < len(s) && isDigit(s[i]) {
			i++
		}
		if i-start != 4 {
			continue
		}
		if y, err := strconv.Atoi(s[start:i]); err == nil && y >= 1800 && y <= 2100 {
			return y
		}
	}
	return 0
}
+
// extractAccolades returns the combined number of rows of every table in
// article["tables"]. Entries that are not objects, or whose "rows" value is
// not an array, contribute nothing; the result is 0 when "tables" is absent
// or not an array.
func extractAccolades(article map[string]interface{}) int {
	count := 0
	tableList, isList := article["tables"].([]interface{})
	if isList {
		for i := range tableList {
			if table, isObject := tableList[i].(map[string]interface{}); isObject {
				if rowList, hasRows := table["rows"].([]interface{}); hasRows {
					count += len(rowList)
				}
			}
		}
	}
	return count
}