diff options
| author | dev | 2026-06-24 03:46:14 +0200 |
|---|---|---|
| committer | dev | 2026-06-24 03:46:14 +0200 |
| commit | 86069f011f35e339a30ffb717308990369c5f29f (patch) | |
| tree | 276537b5c86400d409a8d257ec68b13512b997db | |
| parent | eec189de8a5be0a18103a215d369c6135b86e9ff (diff) | |
| download | hnimdbbot-86069f011f35e339a30ffb717308990369c5f29f.tar.gz | |
feat: fetchAndUpdateImdbData — download IMDB datasets and populate imdb table
- Check for imdb entries with NULL average_rating
- Download title.basics.tsv.gz and title.ratings.tsv.gz to imdbdata/
- Decompress alongside gzip originals
- Parse only rows matching our imdb_ids (memory-efficient)
- Update: average_rating, num_votes, title_type, primary_title,
original_title, start_year, runtime_minutes
- Results: 3394 ratings, 3093 basics updated out of 3448 entries
| -rw-r--r-- | src/imdbdata.go | 348 | ||||
| -rw-r--r-- | src/main.go | 4 |
2 files changed, 352 insertions, 0 deletions
diff --git a/src/imdbdata.go b/src/imdbdata.go new file mode 100644 index 0000000..7eb803d --- /dev/null +++ b/src/imdbdata.go @@ -0,0 +1,348 @@ +package main + +import ( + "compress/gzip" + "encoding/csv" + "fmt" + "io" + "log" + "net/http" + "os" + "path/filepath" + "strconv" +) + +const ( + imdbDataDir = "imdbdata" + basicsURL = "https://datasets.imdbws.com/title.basics.tsv.gz" + ratingsURL = "https://datasets.imdbws.com/title.ratings.tsv.gz" +) + +func dataPath(name string) string { + return filepath.Join(imdbDataDir, name) +} + +// downloadFile fetches url and writes to dst, overwriting if present. +func downloadFile(dst, url string) error { + resp, err := http.Get(url) + if err != nil { + return fmt.Errorf("http get: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("http %d from %s", resp.StatusCode, url) + } + + f, err := os.Create(dst) + if err != nil { + return fmt.Errorf("create %s: %w", dst, err) + } + defer f.Close() + + _, err = io.Copy(f, resp.Body) + if err != nil { + return fmt.Errorf("write %s: %w", dst, err) + } + log.Printf("downloaded %s", url) + return nil +} + +// gunzipFile decompresses src.gz to dst. +func gunzipFile(src, dst string) error { + f, err := os.Open(src) + if err != nil { + return err + } + defer f.Close() + + gz, err := gzip.NewReader(f) + if err != nil { + return err + } + defer gz.Close() + + out, err := os.Create(dst) + if err != nil { + return err + } + defer out.Close() + + _, err = io.Copy(out, gz) + if err != nil { + return fmt.Errorf("write %s: %w", dst, err) + } + return nil +} + +// downloadImdbDatasets fetches and extracts the two TSV files into imdbdata/. +func (a *App) downloadImdbDatasets() error { + if err := os.MkdirAll(imdbDataDir, 0755); err != nil { + return fmt.Errorf("mkdir: %w", err) + } + + type filePair struct { + url string + gz string + tsv string + } + + pairs := []filePair{ + {basicsURL, dataPath("title.basics.tsv.gz"), dataPath("title.basics.tsv")}, + {ratingsURL, dataPath("title.ratings.tsv.gz"), dataPath("title.ratings.tsv")}, + } + + for _, p := range pairs { + if err := downloadFile(p.gz, p.url); err != nil { + return fmt.Errorf("download %s: %w", p.url, err) + } + if err := gunzipFile(p.gz, p.tsv); err != nil { + return fmt.Errorf("gunzip %s: %w", p.gz, err) + } + log.Printf("extracted %s", p.tsv) + } + return nil +} + +// ratingEntry holds data from title.ratings.tsv for one title. +type ratingEntry struct { + AverageRating float64 + NumVotes int +} + +// basicEntry holds data from title.basics.tsv for one title. +type basicEntry struct { + TitleType string + PrimaryTitle string + OriginalTitle string + StartYear *int + RuntimeMinutes *int +} + +// parseTSV reads a TSV file and calls fn for each row (after header). +// Only rows where keep(tconst) is true are passed to fn. +func parseTSV(path string, keep func(string) bool, fn func(record []string) error) error { + f, err := os.Open(path) + if err != nil { + return fmt.Errorf("open %s: %w", path, err) + } + defer f.Close() + + r := csv.NewReader(f) + r.Comma = '\t' + r.LazyQuotes = true + r.FieldsPerRecord = -1 + + // skip header + if _, err := r.Read(); err != nil { + return fmt.Errorf("read header: %w", err) + } + + for { + rec, err := r.Read() + if err == io.EOF { + break + } + if err != nil { + return fmt.Errorf("read row: %w", err) + } + + if len(rec) < 1 { + continue + } + + tconst := rec[0] + if !keep(tconst) { + continue + } + + if err := fn(rec); err != nil { + return err + } + } + return nil +} + +// parseTitleRatings reads the ratings TSV and collects entries for the given ids. +func (a *App) parseTitleRatings(ids map[string]bool) (map[string]ratingEntry, error) { + result := make(map[string]ratingEntry) + + err := parseTSV(dataPath("title.ratings.tsv"), + func(s string) bool { return ids[s] }, + func(rec []string) error { + if len(rec) < 3 { + return nil + } + rating, err := strconv.ParseFloat(rec[1], 64) + if err != nil { + return nil + } + votes, err := strconv.Atoi(rec[2]) + if err != nil { + return nil + } + result[rec[0]] = ratingEntry{AverageRating: rating, NumVotes: votes} + return nil + }, + ) + return result, err +} + +// parseTitleBasics reads the basics TSV and collects entries for the given ids. +func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, error) { + result := make(map[string]basicEntry) + + err := parseTSV(dataPath("title.basics.tsv"), + func(s string) bool { return ids[s] }, + func(rec []string) error { + if len(rec) < 7 { + return nil + } + + entry := basicEntry{ + TitleType: rec[1], + PrimaryTitle: rec[2], + OriginalTitle: rec[3], + } + + if rec[4] != "\\N" && rec[4] != "" { + v, err := strconv.Atoi(rec[4]) + if err == nil { + entry.StartYear = &v + } + } + + if rec[5] != "\\N" && rec[5] != "" { + v, err := strconv.Atoi(rec[5]) + if err == nil { + entry.RuntimeMinutes = &v + } + } + + result[rec[0]] = entry + return nil + }, + ) + return result, err +} + +// applyImdbUpdates writes ratings and basics data into the imdb table. +func (a *App) applyImdbUpdates(ratings map[string]ratingEntry, basics map[string]basicEntry) error { + tx := a.DB.MustBegin() + + rStmt, err := tx.Prepare(`UPDATE imdb SET average_rating = ?, num_votes = ? WHERE imdb_id = ?`) + if err != nil { + tx.Rollback() + return fmt.Errorf("prepare rating update: %w", err) + } + defer rStmt.Close() + + bStmt, err := tx.Prepare(` + UPDATE imdb SET title_type = ?, primary_title = ?, original_title = ?, + start_year = ?, runtime_minutes = ? WHERE imdb_id = ? + `) + if err != nil { + tx.Rollback() + return fmt.Errorf("prepare basic update: %w", err) + } + defer bStmt.Close() + + rCount, bCount := 0, 0 + + for id, r := range ratings { + dec := float64(int(r.AverageRating*10+0.5)) / 10.0 + if _, err := rStmt.Exec(dec, r.NumVotes, id); err != nil { + tx.Rollback() + return fmt.Errorf("update rating %s: %w", id, err) + } + rCount++ + } + + for id, b := range basics { + if _, err := bStmt.Exec( + b.TitleType, b.PrimaryTitle, b.OriginalTitle, + b.StartYear, b.RuntimeMinutes, id, + ); err != nil { + tx.Rollback() + return fmt.Errorf("update basic %s: %w", id, err) + } + bCount++ + } + + if err := tx.Commit(); err != nil { + return fmt.Errorf("commit: %w", err) + } + + log.Printf("applyImdbUpdates: %d ratings, %d basics updated", rCount, bCount) + return nil +} + +// getImdbIDsWithoutRating returns all imdb_id values where average_rating IS NULL. +func (a *App) getImdbIDsWithoutRating() (map[string]bool, error) { + rows, err := a.DB.Query(`SELECT imdb_id FROM imdb WHERE average_rating IS NULL`) + if err != nil { + return nil, err + } + defer rows.Close() + + ids := make(map[string]bool) + for rows.Next() { + var id string + if err := rows.Scan(&id); err != nil { + return nil, err + } + ids[id] = true + } + return ids, rows.Err() +} + +// fetchAndUpdateImdbData is the main entry point. +// Checks if any imdb entry lacks average_rating. If so, downloads datasets, +// parses them, and updates matching rows with ratings and basic metadata. +func (a *App) fetchAndUpdateImdbData() error { + var count int + if err := a.DB.QueryRow(`SELECT COUNT(*) FROM imdb WHERE average_rating IS NULL`).Scan(&count); err != nil { + return fmt.Errorf("count: %w", err) + } + if count == 0 { + log.Println("fetchAndUpdateImdbData: all entries have ratings, skipping") + return nil + } + log.Printf("fetchAndUpdateImdbData: %d entries without rating", count) + + // Download / refresh datasets + if err := a.downloadImdbDatasets(); err != nil { + return err + } + + // Gather all imdb_ids needing updates + ids, err := a.getImdbIDsWithoutRating() + if err != nil { + return err + } + if len(ids) == 0 { + log.Println("fetchAndUpdateImdbData: no entries need updating") + return nil + } + log.Printf("fetchAndUpdateImdbData: %d entries to update", len(ids)) + + // Parse datasets + ratings, err := a.parseTitleRatings(ids) + if err != nil { + return fmt.Errorf("parseTitleRatings: %w", err) + } + log.Printf("fetchAndUpdateImdbData: found ratings for %d entries", len(ratings)) + + basics, err := a.parseTitleBasics(ids) + if err != nil { + return fmt.Errorf("parseTitleBasics: %w", err) + } + log.Printf("fetchAndUpdateImdbData: found basics for %d entries", len(basics)) + + // Write to DB + if err := a.applyImdbUpdates(ratings, basics); err != nil { + return err + } + + return nil +} diff --git a/src/main.go b/src/main.go index 0aa01bb..123cc64 100644 --- a/src/main.go +++ b/src/main.go @@ -190,4 +190,8 @@ func main() { if err = app.populateImdbTable(); err != nil { log.Fatalf("populateImdbTable: %v", err) } + + if err = app.fetchAndUpdateImdbData(); err != nil { + log.Fatalf("fetchAndUpdateImdbData: %v", err) + } } |
