summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordev2026-06-26 13:10:41 +0200
committerdev2026-06-26 13:10:41 +0200
commitef75353f3710d7566aa8b41922f776ecb3968830 (patch)
treed9eed852a38c7abba36a8c2804673c92ea12b2a5
parent6abbf29de5b08448005df974e95bf773de304550 (diff)
downloadhnimdbbot-ef75353f3710d7566aa8b41922f776ecb3968830.tar.gz
feat: extract actors, directors, screenwriters from Wikipedia API
- Extract directors from infobox 'Directed by' field/list
- Extract screenwriters from infobox 'Screenplay by' list
- Extract actors from Cast section list (first link = person name)
- Upsert into people table, link via who table (profession: actor=1, director=2, screenwriter=3)
- Track processed entries with has_people flag column
- Consumer inserts people and marks has_people=1 on success
-rw-r--r-- src/wikiarticle.go 150
-rw-r--r-- src/wikidata.go 41
2 files changed, 184 insertions, 7 deletions
diff --git a/src/wikiarticle.go b/src/wikiarticle.go
index 5b891b6..cef9f02 100644
--- a/src/wikiarticle.go
+++ b/src/wikiarticle.go
@@ -14,15 +14,22 @@ import (
var wikiArticleClient = &http.Client{Timeout: 60 * time.Second}
// wikiPerson is one credited person taken from a wiki article, together with
// the role they were credited for.
type wikiPerson struct {
	// Name is the person's name as extracted from the article.
	Name string
	// Profession is the numeric role code: 1=actor, 2=director, 3=screenwriter.
	Profession int
}
+
// wikiArticleEntry holds extracted fields from a wiki article API response.
type wikiArticleEntry struct {
- Description string
- Year int
- PosterURL string
- Synopsis string
- License string
- LicenseURL string
+ Description string
+ Year int
+ PosterURL string
+ Synopsis string
+ License string
+ LicenseURL string
NumAccolades int
+ People []wikiPerson
}
func (a *App) queryWikiArticle(name string) (wikiArticleEntry, int, error) {
@@ -84,6 +91,9 @@ func (a *App) queryWikiArticle(name string) (wikiArticleEntry, int, error) {
// year, poster_url from infobox
entry.Year, entry.PosterURL = extractInfoboxData(article)
+ // people from infobox and sections
+ entry.People = extractPeople(article)
+
// license
if licList, ok := article["license"]; ok {
if arr, ok := licList.([]interface{}); ok && len(arr) > 0 {
@@ -204,3 +214,131 @@ func extractAccolades(article map[string]interface{}) int {
}
return total
}
+
+// extractPeople extracts actors, directors, and screenwriters from the article.
+func extractPeople(article map[string]interface{}) []wikiPerson {
+ var people []wikiPerson
+
+ // Directors and screenwriters from infobox
+ ibParts := getInfoboxParts(article)
+ for _, p := range ibParts {
+ fp, ok := p.(map[string]interface{})
+ if !ok {
+ continue
+ }
+ name := fp["name"]
+ if name == "Directed by" {
+ people = append(people, extractPersonFromField(fp, 2)...)
+ }
+ if name == "Screenplay by" {
+ people = append(people, extractPersonFromField(fp, 3)...)
+ }
+ }
+
+ // Actors from Cast section
+ for _, sec := range getSections(article) {
+ s, ok := sec.(map[string]interface{})
+ if !ok || s["name"] != "Cast" {
+ continue
+ }
+ for _, part := range s["has_parts"].([]interface{}) {
+ p, ok := part.(map[string]interface{})
+ if !ok || p["type"] != "list" {
+ continue
+ }
+ for _, item := range p["has_parts"].([]interface{}) {
+ item, ok := item.(map[string]interface{})
+ if !ok {
+ continue
+ }
+ if link, ok := getFirstPersonLink(item); ok {
+ people = append(people, wikiPerson{Name: link, Profession: 1})
+ }
+ }
+ break // only first list in Cast section
+ }
+ break // only Cast section
+ }
+
+ return people
+}
+
// getInfoboxParts returns the entries of the article's first infobox.
//
// It follows article["infoboxes"][0]["has_parts"][0]["has_parts"] and returns
// nil as soon as any step is missing, empty, or not of the expected type.
// That includes the innermost "has_parts": it is read with a checked type
// assertion so an infobox section without a parts list yields nil rather than
// a panic.
func getInfoboxParts(article map[string]interface{}) []interface{} {
	infoboxes, ok := article["infoboxes"].([]interface{})
	if !ok || len(infoboxes) == 0 {
		return nil
	}
	infobox, ok := infoboxes[0].(map[string]interface{})
	if !ok {
		return nil
	}
	parts, ok := infobox["has_parts"].([]interface{})
	if !ok || len(parts) == 0 {
		return nil
	}
	section, ok := parts[0].(map[string]interface{})
	if !ok {
		return nil
	}
	fields, ok := section["has_parts"].([]interface{})
	if !ok {
		return nil
	}
	return fields
}
+
// getSections returns the value stored under the article's "sections" key
// when it is a list, and nil otherwise.
func getSections(article map[string]interface{}) []interface{} {
	if list, isList := article["sections"].([]interface{}); isList {
		return list
	}
	return nil
}
+
+// extractPersonFromField extracts a person name from a field or list.
+func extractPersonFromField(fp map[string]interface{}, profession int) []wikiPerson {
+ var people []wikiPerson
+ switch fp["type"] {
+ case "field":
+ if val, ok := fp["value"]; ok && val != nil {
+ name := fmt.Sprintf("%v", val)
+ if name != "" {
+ people = append(people, wikiPerson{Name: name, Profession: profession})
+ }
+ }
+ case "list":
+ for _, item := range fp["has_parts"].([]interface{}) {
+ item, ok := item.(map[string]interface{})
+ if !ok {
+ continue
+ }
+ // Prefer link text over value
+ if link, ok := getFirstPersonLink(item); ok {
+ people = append(people, wikiPerson{Name: link, Profession: profession})
+ } else if val := item["value"]; val != nil {
+ name := fmt.Sprintf("%v", val)
+ if name != "" {
+ people = append(people, wikiPerson{Name: name, Profession: profession})
+ }
+ }
+ }
+ }
+ return people
+}
+
// getFirstPersonLink returns the text of the first entry in item["links"].
//
// The boolean is false when "links" is absent, not a list, or empty, when the
// first link is not a map, or when its "text" is missing, nil, or formats to
// an empty string.
func getFirstPersonLink(item map[string]interface{}) (string, bool) {
	links, isList := item["links"].([]interface{})
	if !isList || len(links) == 0 {
		return "", false
	}
	first, isMap := links[0].(map[string]interface{})
	if !isMap {
		return "", false
	}
	raw := first["text"]
	if raw == nil {
		return "", false
	}
	label := fmt.Sprint(raw)
	if label == "" {
		return "", false
	}
	return label, true
}
diff --git a/src/wikidata.go b/src/wikidata.go
index e6c2e0e..59c8188 100644
--- a/src/wikidata.go
+++ b/src/wikidata.go
@@ -126,7 +126,7 @@ func (a *App) getExistingWikiArticles() ([]existingWikiArticle, error) {
WHERE wiki_article IS NOT NULL
AND wiki_status_code != 404
AND (synopsis IS NULL OR description IS NULL OR year IS NULL
- OR poster_url IS NULL OR license IS NULL OR license_url IS NULL OR num_accolades IS NULL)
+ OR poster_url IS NULL OR license IS NULL OR license_url IS NULL OR num_accolades IS NULL OR has_people = 0)
`)
if err != nil {
return nil, fmt.Errorf("query existing wiki articles: %w", err)
@@ -261,10 +261,49 @@ func (a *App) wikiDataConsumer(artCh <-chan wikiArticleFetch, done chan<- struct
entry.License, entry.LicenseURL, entry.NumAccolades, art.imdbID)
updated++
}
+
+ // Insert people (actors, directors, screenwriters)
+ if statusCode == 200 && len(entry.People) > 0 {
+ if err := a.insertWikiPeople(art.imdbID, entry.People); err != nil {
+ log.Printf("insert people error %s: %v", art.imdbID, err)
+ }
+ a.DB.Exec(`UPDATE imdb SET has_people = 1 WHERE imdb_id = ?`, art.imdbID)
+ }
}
log.Printf("fetchWikiArticlesData: %d updated, %d skipped (non-200)", updated, skipped)
}
+
+// insertWikiPeople upserts people into people/who tables.
+func (a *App) insertWikiPeople(imdbID string, people []wikiPerson) error {
+ // Get the DB row id for this imdb_id
+ var dbID int
+ if err := a.DB.Get(&dbID, `SELECT id FROM imdb WHERE imdb_id = ?`, imdbID); err != nil {
+ return fmt.Errorf("lookup imdb id for %s: %w", imdbID, err)
+ }
+
+ for _, p := range people {
+ // Upsert person
+ var personID int
+ err := a.DB.Get(&personID, `SELECT id FROM people WHERE name = ?`, p.Name)
+ if err != nil {
+ // Insert new person
+ result, err := a.DB.Exec(`INSERT INTO people (name) VALUES (?)`, p.Name)
+ if err != nil {
+ continue // skip on conflict
+ }
+ if id, err := result.LastInsertId(); err == nil {
+ personID = int(id)
+ }
+ }
+
+ // Insert who relationship (unique on imdb_id, people_id, profession_id)
+ a.DB.Exec(`INSERT IGNORE INTO who (imdb_id, people_id, profession_id) VALUES (?, ?, ?)`,
+ dbID, personID, p.Profession)
+ }
+
+ return nil
+}
// fetchWikiArticlesData fetches wiki article data from the custom server for all
// entries that have a wiki_article but need data extraction. Callable independently.
func (a *App) fetchWikiArticlesData() error {