diff options
| author | dev | 2026-06-26 13:10:41 +0200 |
|---|---|---|
| committer | dev | 2026-06-26 13:10:41 +0200 |
| commit | ef75353f3710d7566aa8b41922f776ecb3968830 (patch) | |
| tree | d9eed852a38c7abba36a8c2804673c92ea12b2a5 | |
| parent | 6abbf29de5b08448005df974e95bf773de304550 (diff) | |
| download | hnimdbbot-ef75353f3710d7566aa8b41922f776ecb3968830.tar.gz | |
feat: extract actors, directors, screenwriters from Wikipedia API
- Extract directors from infobox 'Directed by' field/list
- Extract screenwriters from infobox 'Screenplay by' field/list
- Extract actors from Cast section list (first link = person name)
- Upsert into people table, link via who table (profession: actor=1, director=2, screenwriter=3)
- Track processed entries with has_people flag column
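- Consumer inserts people and marks has_people=1 for every 200 response that yielded at least one person (insert errors are only logged, so the flag is set even when the insert fails; entries with no extracted people keep has_people=0)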
| -rw-r--r-- | src/wikiarticle.go | 150 | ||||
| -rw-r--r-- | src/wikidata.go | 41 |
2 files changed, 184 insertions, 7 deletions
diff --git a/src/wikiarticle.go b/src/wikiarticle.go index 5b891b6..cef9f02 100644 --- a/src/wikiarticle.go +++ b/src/wikiarticle.go @@ -14,15 +14,22 @@ import ( var wikiArticleClient = &http.Client{Timeout: 60 * time.Second} +// wikiPerson carries a person extracted from a wiki article. +type wikiPerson struct { + Name string + Profession int // 1=actor, 2=director, 3=screenwriter +} + // wikiArticleEntry holds extracted fields from a wiki article API response. type wikiArticleEntry struct { - Description string - Year int - PosterURL string - Synopsis string - License string - LicenseURL string + Description string + Year int + PosterURL string + Synopsis string + License string + LicenseURL string NumAccolades int + People []wikiPerson } func (a *App) queryWikiArticle(name string) (wikiArticleEntry, int, error) { @@ -84,6 +91,9 @@ func (a *App) queryWikiArticle(name string) (wikiArticleEntry, int, error) { // year, poster_url from infobox entry.Year, entry.PosterURL = extractInfoboxData(article) + // people from infobox and sections + entry.People = extractPeople(article) + // license if licList, ok := article["license"]; ok { if arr, ok := licList.([]interface{}); ok && len(arr) > 0 { @@ -204,3 +214,131 @@ func extractAccolades(article map[string]interface{}) int { } return total } + +// extractPeople extracts actors, directors, and screenwriters from the article. +func extractPeople(article map[string]interface{}) []wikiPerson { + var people []wikiPerson + + // Directors and screenwriters from infobox + ibParts := getInfoboxParts(article) + for _, p := range ibParts { + fp, ok := p.(map[string]interface{}) + if !ok { + continue + } + name := fp["name"] + if name == "Directed by" { + people = append(people, extractPersonFromField(fp, 2)...) + } + if name == "Screenplay by" { + people = append(people, extractPersonFromField(fp, 3)...) 
+ } + } + + // Actors from Cast section + for _, sec := range getSections(article) { + s, ok := sec.(map[string]interface{}) + if !ok || s["name"] != "Cast" { + continue + } + for _, part := range s["has_parts"].([]interface{}) { + p, ok := part.(map[string]interface{}) + if !ok || p["type"] != "list" { + continue + } + for _, item := range p["has_parts"].([]interface{}) { + item, ok := item.(map[string]interface{}) + if !ok { + continue + } + if link, ok := getFirstPersonLink(item); ok { + people = append(people, wikiPerson{Name: link, Profession: 1}) + } + } + break // only first list in Cast section + } + break // only Cast section + } + + return people +} + +// getInfoboxParts returns the inner parts of the infobox section. +func getInfoboxParts(article map[string]interface{}) []interface{} { + infoboxes, ok := article["infoboxes"].([]interface{}) + if !ok || len(infoboxes) == 0 { + return nil + } + ib, ok := infoboxes[0].(map[string]interface{}) + if !ok { + return nil + } + parts, ok := ib["has_parts"].([]interface{}) + if !ok || len(parts) == 0 { + return nil + } + section, ok := parts[0].(map[string]interface{}) + if !ok { + return nil + } + return section["has_parts"].([]interface{}) +} + +// getSections returns sections from the article. +func getSections(article map[string]interface{}) []interface{} { + sections, ok := article["sections"].([]interface{}) + if !ok { + return nil + } + return sections +} + +// extractPersonFromField extracts a person name from a field or list. 
+func extractPersonFromField(fp map[string]interface{}, profession int) []wikiPerson { + var people []wikiPerson + switch fp["type"] { + case "field": + if val, ok := fp["value"]; ok && val != nil { + name := fmt.Sprintf("%v", val) + if name != "" { + people = append(people, wikiPerson{Name: name, Profession: profession}) + } + } + case "list": + for _, item := range fp["has_parts"].([]interface{}) { + item, ok := item.(map[string]interface{}) + if !ok { + continue + } + // Prefer link text over value + if link, ok := getFirstPersonLink(item); ok { + people = append(people, wikiPerson{Name: link, Profession: profession}) + } else if val := item["value"]; val != nil { + name := fmt.Sprintf("%v", val) + if name != "" { + people = append(people, wikiPerson{Name: name, Profession: profession}) + } + } + } + } + return people +} + +// getFirstPersonLink extracts the first link text from a list item. +func getFirstPersonLink(item map[string]interface{}) (string, bool) { + links, ok := item["links"].([]interface{}) + if !ok || len(links) == 0 { + return "", false + } + link, ok := links[0].(map[string]interface{}) + if !ok { + return "", false + } + if text, ok := link["text"]; ok && text != nil { + s := fmt.Sprintf("%v", text) + if s != "" { + return s, true + } + } + return "", false +} diff --git a/src/wikidata.go b/src/wikidata.go index e6c2e0e..59c8188 100644 --- a/src/wikidata.go +++ b/src/wikidata.go @@ -126,7 +126,7 @@ func (a *App) getExistingWikiArticles() ([]existingWikiArticle, error) { WHERE wiki_article IS NOT NULL AND wiki_status_code != 404 AND (synopsis IS NULL OR description IS NULL OR year IS NULL - OR poster_url IS NULL OR license IS NULL OR license_url IS NULL OR num_accolades IS NULL) + OR poster_url IS NULL OR license IS NULL OR license_url IS NULL OR num_accolades IS NULL OR has_people = 0) `) if err != nil { return nil, fmt.Errorf("query existing wiki articles: %w", err) @@ -261,10 +261,49 @@ func (a *App) wikiDataConsumer(artCh <-chan 
wikiArticleFetch, done chan<- struct entry.License, entry.LicenseURL, entry.NumAccolades, art.imdbID) updated++ } + + // Insert people (actors, directors, screenwriters) + if statusCode == 200 && len(entry.People) > 0 { + if err := a.insertWikiPeople(art.imdbID, entry.People); err != nil { + log.Printf("insert people error %s: %v", art.imdbID, err) + } + a.DB.Exec(`UPDATE imdb SET has_people = 1 WHERE imdb_id = ?`, art.imdbID) + } } log.Printf("fetchWikiArticlesData: %d updated, %d skipped (non-200)", updated, skipped) } + +// insertWikiPeople upserts people into people/who tables. +func (a *App) insertWikiPeople(imdbID string, people []wikiPerson) error { + // Get the DB row id for this imdb_id + var dbID int + if err := a.DB.Get(&dbID, `SELECT id FROM imdb WHERE imdb_id = ?`, imdbID); err != nil { + return fmt.Errorf("lookup imdb id for %s: %w", imdbID, err) + } + + for _, p := range people { + // Upsert person + var personID int + err := a.DB.Get(&personID, `SELECT id FROM people WHERE name = ?`, p.Name) + if err != nil { + // Insert new person + result, err := a.DB.Exec(`INSERT INTO people (name) VALUES (?)`, p.Name) + if err != nil { + continue // skip on conflict + } + if id, err := result.LastInsertId(); err == nil { + personID = int(id) + } + } + + // Insert who relationship (unique on imdb_id, people_id, profession_id) + a.DB.Exec(`INSERT IGNORE INTO who (imdb_id, people_id, profession_id) VALUES (?, ?, ?)`, + dbID, personID, p.Profession) + } + + return nil +} // fetchWikiArticlesData fetches wiki article data from the custom server for all // entries that have a wiki_article but need data extraction. Callable independently. func (a *App) fetchWikiArticlesData() error { |
