summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author dev 2026-06-24 04:26:46 +0200
committer dev 2026-06-24 04:26:46 +0200
commit 6d5231a204790dae325a0557d908c2c6d15bb516 (patch)
tree 3c8e018c86badb60bd082ae08fb80b0f91a2642e
parent a55f6e227ff397a5d9167cd4ee15442e9cad06ab (diff)
download hnimdbbot-6d5231a204790dae325a0557d908c2c6d15bb516.tar.gz
feat: populate genre table from title.basics.tsv
- Parse genres field (rec[8]) from title.basics.tsv, split by comma
- Insert into genre table via SELECT to resolve imdb.id from imdb_id
- Update fetchAndUpdateImdbData to check for missing genres too
- Skip download if TSV already exists (supports stubbed downloadFile)
-rw-r--r-- src/imdbdata.go 58
1 files changed, 47 insertions, 11 deletions
diff --git a/src/imdbdata.go b/src/imdbdata.go
index 30095ea..c4216a7 100644
--- a/src/imdbdata.go
+++ b/src/imdbdata.go
@@ -95,6 +95,12 @@ func (a *App) downloadImdbDatasets() error {
}
for _, p := range pairs {
+ // skip if TSV already exists (download may be stubbed out)
+ if _, err := os.Stat(p.tsv); err == nil {
+ log.Printf("reusing existing %s", p.tsv)
+ continue
+ }
+
if err := downloadFile(p.gz, p.url); err != nil {
return fmt.Errorf("download %s: %w", p.url, err)
}
@@ -122,6 +128,7 @@ type basicEntry struct {
OriginalTitle string
StartYear *int
RuntimeMinutes *int
+ Genres []string
}
// parseTSV reads a TSV file line by line and calls fn for each row (after header).
@@ -197,7 +204,7 @@ func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, erro
func(s string) bool { return ids[s] },
func(rec []string) error {
// tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, genres
- if len(rec) < 8 {
+ if len(rec) < 9 {
return nil
}
@@ -223,6 +230,11 @@ func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, erro
}
}
+ // genres is field 8 (0-indexed), comma-separated
+ if rec[8] != "\\N" && rec[8] != "" {
+ entry.Genres = strings.Split(rec[8], ",")
+ }
+
result[rec[0]] = entry
return nil
},
@@ -251,7 +263,17 @@ func (a *App) applyImdbUpdates(ratings map[string]ratingEntry, basics map[string
}
defer bStmt.Close()
- rCount, bCount := 0, 0
+ genreStmt, err := tx.Prepare(`
+ INSERT INTO genre (imdb_id, name)
+ SELECT i.id, ? FROM imdb i WHERE i.imdb_id = ?
+ `)
+ if err != nil {
+ tx.Rollback()
+ return fmt.Errorf("prepare genre insert: %w", err)
+ }
+ defer genreStmt.Close()
+
+ rCount, bCount, gCount := 0, 0, 0
for id, r := range ratings {
dec := float64(int(r.AverageRating*10+0.5)) / 10.0
@@ -271,13 +293,21 @@ func (a *App) applyImdbUpdates(ratings map[string]ratingEntry, basics map[string
return fmt.Errorf("update basic %s: %w", id, err)
}
bCount++
+
+ for _, g := range b.Genres {
+ if _, err := genreStmt.Exec(g, id); err != nil {
+ tx.Rollback()
+ return fmt.Errorf("insert genre %s for %s: %w", g, id, err)
+ }
+ gCount++
+ }
}
if err := tx.Commit(); err != nil {
return fmt.Errorf("commit: %w", err)
}
- log.Printf("applyImdbUpdates: %d ratings, %d basics updated", rCount, bCount)
+ log.Printf("applyImdbUpdates: %d ratings, %d basics, %d genres updated", rCount, bCount, gCount)
return nil
}
@@ -304,16 +334,22 @@ func (a *App) getImdbIDsWithoutRating() (map[string]bool, error) {
// Checks if any imdb entry lacks average_rating. If so, downloads datasets,
// parses them, and updates matching rows with ratings and basic metadata.
func (a *App) fetchAndUpdateImdbData() error {
- var count int
- if err := a.DB.QueryRow(`SELECT COUNT(*) FROM imdb WHERE average_rating IS NULL`).Scan(&count); err != nil {
- return fmt.Errorf("count: %w", err)
- }
- if count == 0 {
- log.Println("fetchAndUpdateImdbData: all entries have ratings, skipping")
+ var missingRatings, missingGenres int
+ if err := a.DB.QueryRow(`SELECT COUNT(*) FROM imdb WHERE average_rating IS NULL`).Scan(&missingRatings); err != nil {
+ return fmt.Errorf("count ratings: %w", err)
+ }
+ if err := a.DB.QueryRow(`
+ SELECT COUNT(*) FROM imdb i
+ LEFT JOIN genre g ON g.imdb_id = i.id
+ WHERE g.id IS NULL
+ `).Scan(&missingGenres); err != nil {
+ return fmt.Errorf("count genres: %w", err)
+ }
+ if missingRatings == 0 && missingGenres == 0 {
+ log.Println("fetchAndUpdateImdbData: all entries complete, skipping")
return nil
}
- log.Printf("fetchAndUpdateImdbData: %d entries without rating", count)
-
+ log.Printf("fetchAndUpdateImdbData: %d missing ratings, %d missing genres", missingRatings, missingGenres)
// Download / refresh datasets
if err := a.downloadImdbDatasets(); err != nil {
return err