diff options
| author | dev | 2026-06-24 04:26:46 +0200 |
|---|---|---|
| committer | dev | 2026-06-24 04:26:46 +0200 |
| commit | 6d5231a204790dae325a0557d908c2c6d15bb516 (patch) | |
| tree | 3c8e018c86badb60bd082ae08fb80b0f91a2642e /src/imdbdata.go | |
| parent | a55f6e227ff397a5d9167cd4ee15442e9cad06ab (diff) | |
| download | hnimdbbot-6d5231a204790dae325a0557d908c2c6d15bb516.tar.gz | |
feat: populate genre table from title.basics.tsv
- Parse genres field (rec[8]) from title.basics.tsv, split by comma
- Insert into genre table via SELECT to resolve imdb.id from imdb_id
- Update fetchAndUpdateImdbData to check for missing genres too
- Skip download if TSV already exists (supports stubbed downloadFile)
Diffstat (limited to 'src/imdbdata.go')
| -rw-r--r-- | src/imdbdata.go | 58 |
1 file changed, 47 insertions, 11 deletions
```diff
diff --git a/src/imdbdata.go b/src/imdbdata.go
index 30095ea..c4216a7 100644
--- a/src/imdbdata.go
+++ b/src/imdbdata.go
@@ -95,6 +95,12 @@ func (a *App) downloadImdbDatasets() error {
 	}
 
 	for _, p := range pairs {
+		// skip if TSV already exists (download may be stubbed out)
+		if _, err := os.Stat(p.tsv); err == nil {
+			log.Printf("reusing existing %s", p.tsv)
+			continue
+		}
+
 		if err := downloadFile(p.gz, p.url); err != nil {
 			return fmt.Errorf("download %s: %w", p.url, err)
 		}
@@ -122,6 +128,7 @@ type basicEntry struct {
 	OriginalTitle  string
 	StartYear      *int
 	RuntimeMinutes *int
+	Genres         []string
 }
 
 // parseTSV reads a TSV file line by line and calls fn for each row (after header).
@@ -197,7 +204,7 @@ func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, erro
 		func(s string) bool { return ids[s] },
 		func(rec []string) error {
 			// tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, genres
-			if len(rec) < 8 {
+			if len(rec) < 9 {
 				return nil
 			}
 
@@ -223,6 +230,11 @@ func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, erro
 				}
 			}
 
+			// genres is field 8 (0-indexed), comma-separated
+			if rec[8] != "\\N" && rec[8] != "" {
+				entry.Genres = strings.Split(rec[8], ",")
+			}
+
 			result[rec[0]] = entry
 			return nil
 		},
@@ -251,7 +263,17 @@ func (a *App) applyImdbUpdates(ratings map[string]ratingEntry, basics map[string
 	}
 	defer bStmt.Close()
 
-	rCount, bCount := 0, 0
+	genreStmt, err := tx.Prepare(`
+		INSERT INTO genre (imdb_id, name)
+		SELECT i.id, ? FROM imdb i WHERE i.imdb_id = ?
+	`)
+	if err != nil {
+		tx.Rollback()
+		return fmt.Errorf("prepare genre insert: %w", err)
+	}
+	defer genreStmt.Close()
+
+	rCount, bCount, gCount := 0, 0, 0
 
 	for id, r := range ratings {
 		dec := float64(int(r.AverageRating*10+0.5)) / 10.0
@@ -271,13 +293,21 @@ func (a *App) applyImdbUpdates(ratings map[string]ratingEntry, basics map[string
 			return fmt.Errorf("update basic %s: %w", id, err)
 		}
 		bCount++
+
+		for _, g := range b.Genres {
+			if _, err := genreStmt.Exec(g, id); err != nil {
+				tx.Rollback()
+				return fmt.Errorf("insert genre %s for %s: %w", g, id, err)
+			}
+			gCount++
+		}
 	}
 
 	if err := tx.Commit(); err != nil {
 		return fmt.Errorf("commit: %w", err)
 	}
 
-	log.Printf("applyImdbUpdates: %d ratings, %d basics updated", rCount, bCount)
+	log.Printf("applyImdbUpdates: %d ratings, %d basics, %d genres updated", rCount, bCount, gCount)
 	return nil
 }
 
@@ -304,16 +334,22 @@ func (a *App) getImdbIDsWithoutRating() (map[string]bool, error) {
 // Checks if any imdb entry lacks average_rating. If so, downloads datasets,
 // parses them, and updates matching rows with ratings and basic metadata.
 func (a *App) fetchAndUpdateImdbData() error {
-	var count int
-	if err := a.DB.QueryRow(`SELECT COUNT(*) FROM imdb WHERE average_rating IS NULL`).Scan(&count); err != nil {
-		return fmt.Errorf("count: %w", err)
-	}
-	if count == 0 {
-		log.Println("fetchAndUpdateImdbData: all entries have ratings, skipping")
+	var missingRatings, missingGenres int
+	if err := a.DB.QueryRow(`SELECT COUNT(*) FROM imdb WHERE average_rating IS NULL`).Scan(&missingRatings); err != nil {
+		return fmt.Errorf("count ratings: %w", err)
+	}
+	if err := a.DB.QueryRow(`
+		SELECT COUNT(*) FROM imdb i
+		LEFT JOIN genre g ON g.imdb_id = i.id
+		WHERE g.id IS NULL
+	`).Scan(&missingGenres); err != nil {
+		return fmt.Errorf("count genres: %w", err)
+	}
+	if missingRatings == 0 && missingGenres == 0 {
+		log.Println("fetchAndUpdateImdbData: all entries complete, skipping")
 		return nil
 	}
-	log.Printf("fetchAndUpdateImdbData: %d entries without rating", count)
-
+	log.Printf("fetchAndUpdateImdbData: %d missing ratings, %d missing genres", missingRatings, missingGenres)
 	// Download / refresh datasets
 	if err := a.downloadImdbDatasets(); err != nil {
 		return err
```
