summaryrefslogtreecommitdiff
path: root/src/imdbdata.go
diff options
context:
space:
mode:
authordev2026-06-24 03:46:14 +0200
committerdev2026-06-24 03:46:14 +0200
commit86069f011f35e339a30ffb717308990369c5f29f (patch)
tree276537b5c86400d409a8d257ec68b13512b997db /src/imdbdata.go
parenteec189de8a5be0a18103a215d369c6135b86e9ff (diff)
downloadhnimdbbot-86069f011f35e339a30ffb717308990369c5f29f.tar.gz
feat: fetchAndUpdateImdbData — download IMDB datasets and populate imdb table
- Check for imdb entries with NULL average_rating
- Download title.basics.tsv.gz and title.ratings.tsv.gz to imdbdata/
- Decompress alongside gzip originals
- Parse only rows matching our imdb_ids (memory-efficient)
- Update: average_rating, num_votes, title_type, primary_title, original_title, start_year, runtime_minutes
- Results: 3394 ratings, 3093 basics updated out of 3448 entries
Diffstat (limited to 'src/imdbdata.go')
-rw-r--r--src/imdbdata.go348
1 files changed, 348 insertions, 0 deletions
diff --git a/src/imdbdata.go b/src/imdbdata.go
new file mode 100644
index 0000000..7eb803d
--- /dev/null
+++ b/src/imdbdata.go
@@ -0,0 +1,348 @@
+package main
+
+import (
+	"bufio"
+	"compress/gzip"
+	"encoding/csv"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+// Locations used when refreshing the IMDb datasets.
+const (
+ // imdbDataDir is the directory the downloaded archives and the extracted
+ // TSV files are stored in. It is a relative path, so it resolves against
+ // the working directory of the process.
+ imdbDataDir = "imdbdata"
+ // basicsURL is the download URL of the gzip-compressed title.basics dump.
+ basicsURL = "https://datasets.imdbws.com/title.basics.tsv.gz"
+ // ratingsURL is the download URL of the gzip-compressed title.ratings dump.
+ ratingsURL = "https://datasets.imdbws.com/title.ratings.tsv.gz"
+)
+
+// dataPath builds the path of the file called name inside imdbDataDir.
+func dataPath(name string) string {
+	joined := filepath.Join(imdbDataDir, name)
+	return joined
+}
+
+// downloadFile fetches url and writes the response body to dst, truncating
+// dst if it already exists.
+//
+// Any status other than 200 OK is reported as an error before dst is touched.
+// If the body cannot be stored completely (transfer cut short, write failure,
+// failing close) the partial file is removed, so a later step never reads a
+// truncated archive, and the error is returned as "write <dst>: ...".
+//
+// NOTE(review): http.Get goes through http.DefaultClient, which sets no
+// timeout; a stalled server blocks this call indefinitely.
+func downloadFile(dst, url string) error {
+	resp, err := http.Get(url)
+	if err != nil {
+		return fmt.Errorf("http get: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("http %d from %s", resp.StatusCode, url)
+	}
+
+	f, err := os.Create(dst)
+	if err != nil {
+		return fmt.Errorf("create %s: %w", dst, err)
+	}
+
+	_, writeErr := io.Copy(f, resp.Body)
+	// Close explicitly instead of deferring: the close error is the last
+	// chance to learn that written data did not reach the file.
+	closeErr := f.Close()
+	if writeErr == nil {
+		writeErr = closeErr
+	}
+	if writeErr != nil {
+		// Best-effort cleanup; the write error is the one worth reporting.
+		_ = os.Remove(dst)
+		return fmt.Errorf("write %s: %w", dst, writeErr)
+	}
+
+	log.Printf("downloaded %s", url)
+	return nil
+}
+
+// gunzipFile decompresses the gzip archive at src into dst, truncating dst
+// if it already exists.
+//
+// Each failure is wrapped with the step and the path involved. When the
+// archive is corrupt or truncated, or the output cannot be written or closed
+// cleanly, the partial dst is removed so it cannot be mistaken for a complete
+// extraction.
+func gunzipFile(src, dst string) error {
+	in, err := os.Open(src)
+	if err != nil {
+		return fmt.Errorf("open %s: %w", src, err)
+	}
+	defer in.Close()
+
+	gz, err := gzip.NewReader(in)
+	if err != nil {
+		return fmt.Errorf("read gzip header %s: %w", src, err)
+	}
+	defer gz.Close()
+
+	out, err := os.Create(dst)
+	if err != nil {
+		return fmt.Errorf("create %s: %w", dst, err)
+	}
+
+	_, writeErr := io.Copy(out, gz)
+	// Close explicitly instead of deferring so a failing close is reported
+	// rather than silently dropped.
+	closeErr := out.Close()
+	if writeErr == nil {
+		writeErr = closeErr
+	}
+	if writeErr != nil {
+		// Best-effort cleanup; the write error is the one worth reporting.
+		_ = os.Remove(dst)
+		return fmt.Errorf("write %s: %w", dst, writeErr)
+	}
+	return nil
+}
+
+// downloadImdbDatasets makes sure imdbDataDir exists, then downloads the
+// basics and ratings archives into it and extracts each one next to its
+// archive. Datasets are handled in order and the first failure stops the run.
+func (a *App) downloadImdbDatasets() error {
+	if err := os.MkdirAll(imdbDataDir, 0755); err != nil {
+		return fmt.Errorf("mkdir: %w", err)
+	}
+
+	datasets := []struct {
+		source    string // remote URL
+		archive   string // local .tsv.gz path
+		extracted string // local .tsv path
+	}{
+		{
+			source:    basicsURL,
+			archive:   dataPath("title.basics.tsv.gz"),
+			extracted: dataPath("title.basics.tsv"),
+		},
+		{
+			source:    ratingsURL,
+			archive:   dataPath("title.ratings.tsv.gz"),
+			extracted: dataPath("title.ratings.tsv"),
+		},
+	}
+
+	for i := range datasets {
+		ds := datasets[i]
+
+		err := downloadFile(ds.archive, ds.source)
+		if err != nil {
+			return fmt.Errorf("download %s: %w", ds.source, err)
+		}
+
+		err = gunzipFile(ds.archive, ds.extracted)
+		if err != nil {
+			return fmt.Errorf("gunzip %s: %w", ds.archive, err)
+		}
+
+		log.Printf("extracted %s", ds.extracted)
+	}
+	return nil
+}
+
+// ratingEntry holds data from title.ratings.tsv for one title.
+type ratingEntry struct {
+ // AverageRating is the rating parsed as a float from the second column
+ // of the ratings row.
+ AverageRating float64
+ // NumVotes is the vote count parsed as an integer from the third column
+ // of the ratings row.
+ NumVotes int
+}
+
+// basicEntry holds data from title.basics.tsv for one title.
+type basicEntry struct {
+ // TitleType, PrimaryTitle and OriginalTitle are copied unchanged from the
+ // second, third and fourth columns of the basics row.
+ TitleType string
+ PrimaryTitle string
+ OriginalTitle string
+ // StartYear is nil when the source column is `\N`, empty, or not an
+ // integer.
+ StartYear *int
+ // RuntimeMinutes is nil when the source column is `\N`, empty, or not an
+ // integer.
+ RuntimeMinutes *int
+}
+
+// parseTSV streams the tab-separated file at path and calls fn for every data
+// row whose first column (tconst) is accepted by keep. The first line is
+// treated as the header and skipped; blank lines are ignored.
+//
+// Rows are split on tab characters only, so a double quote is ordinary data.
+// A CSV-style reader must not be used for these files: it treats a field that
+// starts with `"` as a quoted field and keeps consuming input, across line
+// breaks, until it meets a closing quote. That silently merges rows and loses
+// every title in between.
+//
+// fn receives a freshly allocated slice holding all columns of the row. A
+// non-nil error from fn stops the scan and is returned unchanged. Failing to
+// read the header is reported as "read header: ..." (io.EOF for an empty
+// file). A failure while reading rows is reported as "read row: ..." wrapping
+// a *csv.ParseError that carries the line number.
+func parseTSV(path string, keep func(string) bool, fn func(record []string) error) error {
+	f, err := os.Open(path)
+	if err != nil {
+		return fmt.Errorf("open %s: %w", path, err)
+	}
+	defer f.Close()
+
+	// Raise the scanner's default 64 KiB line limit so a single unusually
+	// long row cannot abort the whole import.
+	const maxLineBytes = 4 << 20
+	sc := bufio.NewScanner(f)
+	sc.Buffer(make([]byte, 0, 64<<10), maxLineBytes)
+
+	// skip header
+	if !sc.Scan() {
+		headErr := sc.Err()
+		if headErr == nil {
+			headErr = io.EOF // empty file
+		}
+		return fmt.Errorf("read header: %w", headErr)
+	}
+
+	lineNo := 1
+	for sc.Scan() {
+		lineNo++
+		line := sc.Text()
+		if line == "" {
+			continue
+		}
+
+		// Test the id before splitting so rows that are filtered out never
+		// pay for a field slice.
+		tconst := line
+		if i := strings.IndexByte(line, '\t'); i >= 0 {
+			tconst = line[:i]
+		}
+		if !keep(tconst) {
+			continue
+		}
+
+		if err := fn(strings.Split(line, "\t")); err != nil {
+			return err
+		}
+	}
+	if err := sc.Err(); err != nil {
+		// The scanner stopped on the line after the last one it delivered.
+		bad := lineNo + 1
+		return fmt.Errorf("read row: %w", &csv.ParseError{StartLine: bad, Line: bad, Err: err})
+	}
+	return nil
+}
+
+// parseTitleRatings scans title.ratings.tsv and returns the rating and vote
+// count of every title whose id maps to true in ids. Rows with fewer than
+// three columns, or whose rating or vote count does not parse, are skipped
+// without an error. The map built so far is returned together with whatever
+// error parseTSV reports.
+func (a *App) parseTitleRatings(ids map[string]bool) (map[string]ratingEntry, error) {
+	found := make(map[string]ratingEntry)
+
+	wanted := func(tconst string) bool { return ids[tconst] }
+
+	collect := func(row []string) error {
+		if len(row) >= 3 {
+			avg, avgErr := strconv.ParseFloat(row[1], 64)
+			votes, votesErr := strconv.Atoi(row[2])
+			if avgErr == nil && votesErr == nil {
+				found[row[0]] = ratingEntry{AverageRating: avg, NumVotes: votes}
+			}
+		}
+		return nil
+	}
+
+	err := parseTSV(dataPath("title.ratings.tsv"), wanted, collect)
+	return found, err
+}
+
+// parseTitleBasics reads title.basics.tsv and returns the title metadata of
+// every title whose id maps to true in ids. Rows too short to contain the
+// runtime column are skipped without an error. The map built so far is
+// returned together with whatever error parseTSV reports.
+//
+// StartYear and RuntimeMinutes are left nil when their column holds `\N`, is
+// empty, or is not an integer.
+func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, error) {
+	// Column layout of title.basics.tsv as published by IMDb:
+	//   0 tconst, 1 titleType, 2 primaryTitle, 3 originalTitle, 4 isAdult,
+	//   5 startYear, 6 endYear, 7 runtimeMinutes, 8 genres
+	// isAdult (4) and endYear (6) are not stored, so startYear and
+	// runtimeMinutes are NOT adjacent to originalTitle.
+	const (
+		colTconst         = 0
+		colTitleType      = 1
+		colPrimaryTitle   = 2
+		colOriginalTitle  = 3
+		colStartYear      = 5
+		colRuntimeMinutes = 7
+	)
+
+	// optionalInt maps the dataset's "no value" markers and unparsable
+	// numbers to nil.
+	optionalInt := func(s string) *int {
+		if s == "" || s == `\N` {
+			return nil
+		}
+		v, err := strconv.Atoi(s)
+		if err != nil {
+			return nil
+		}
+		return &v
+	}
+
+	result := make(map[string]basicEntry)
+
+	err := parseTSV(dataPath("title.basics.tsv"),
+		func(s string) bool { return ids[s] },
+		func(rec []string) error {
+			if len(rec) <= colRuntimeMinutes {
+				return nil
+			}
+			result[rec[colTconst]] = basicEntry{
+				TitleType:      rec[colTitleType],
+				PrimaryTitle:   rec[colPrimaryTitle],
+				OriginalTitle:  rec[colOriginalTitle],
+				StartYear:      optionalInt(rec[colStartYear]),
+				RuntimeMinutes: optionalInt(rec[colRuntimeMinutes]),
+			}
+			return nil
+		},
+	)
+	return result, err
+}
+
+// applyImdbUpdates writes the parsed ratings and basics into the imdb table
+// inside one transaction: either every update is committed or none is.
+//
+// Ratings are rounded to one decimal place before being stored. Rows are
+// matched on imdb_id; an id with no matching row simply updates nothing. On
+// success the number of executed rating and basics updates is logged.
+//
+// Returns an error, never panics, when the transaction cannot be started, a
+// statement cannot be prepared or executed, or the commit fails.
+func (a *App) applyImdbUpdates(ratings map[string]ratingEntry, basics map[string]basicEntry) error {
+	// NOTE(review): assumes a.DB is a *sqlx.DB, where MustBegin is the
+	// panicking wrapper around Beginx. Confirm against the App definition.
+	tx, err := a.DB.Beginx()
+	if err != nil {
+		return fmt.Errorf("begin: %w", err)
+	}
+	// One deferred rollback covers every early return below. After a
+	// successful Commit it only reports that the transaction is already
+	// finished, so its result is deliberately ignored.
+	defer tx.Rollback()
+
+	rStmt, err := tx.Prepare(`UPDATE imdb SET average_rating = ?, num_votes = ? WHERE imdb_id = ?`)
+	if err != nil {
+		return fmt.Errorf("prepare rating update: %w", err)
+	}
+	defer rStmt.Close()
+
+	bStmt, err := tx.Prepare(`
+		UPDATE imdb SET title_type = ?, primary_title = ?, original_title = ?,
+		start_year = ?, runtime_minutes = ? WHERE imdb_id = ?
+	`)
+	if err != nil {
+		return fmt.Errorf("prepare basic update: %w", err)
+	}
+	defer bStmt.Close()
+
+	rCount, bCount := 0, 0
+
+	for id, r := range ratings {
+		// Round to one decimal: add half a step, then truncate (round half
+		// up for the non-negative values a rating takes).
+		dec := float64(int(r.AverageRating*10+0.5)) / 10.0
+		if _, err := rStmt.Exec(dec, r.NumVotes, id); err != nil {
+			return fmt.Errorf("update rating %s: %w", id, err)
+		}
+		rCount++
+	}
+
+	for id, b := range basics {
+		// Nil StartYear / RuntimeMinutes pointers are passed through as-is
+		// for the driver to store.
+		if _, err := bStmt.Exec(
+			b.TitleType, b.PrimaryTitle, b.OriginalTitle,
+			b.StartYear, b.RuntimeMinutes, id,
+		); err != nil {
+			return fmt.Errorf("update basic %s: %w", id, err)
+		}
+		bCount++
+	}
+
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("commit: %w", err)
+	}
+
+	log.Printf("applyImdbUpdates: %d ratings, %d basics updated", rCount, bCount)
+	return nil
+}
+
+// getImdbIDsWithoutRating collects the imdb_id of every imdb row whose
+// average_rating IS NULL. The ids are returned as a set: each one is a key
+// mapped to true. Query and scan errors are returned as-is with a nil map.
+func (a *App) getImdbIDsWithoutRating() (map[string]bool, error) {
+	const query = `SELECT imdb_id FROM imdb WHERE average_rating IS NULL`
+
+	rows, err := a.DB.Query(query)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	pending := map[string]bool{}
+	var imdbID string
+	for rows.Next() {
+		if scanErr := rows.Scan(&imdbID); scanErr != nil {
+			return nil, scanErr
+		}
+		pending[imdbID] = true
+	}
+	// rows.Err reports a failure that ended the iteration early.
+	return pending, rows.Err()
+}
+
+// fetchAndUpdateImdbData is the main entry point of the dataset refresh.
+//
+// It counts the imdb rows whose average_rating IS NULL and returns right away
+// when there are none. Otherwise it downloads and extracts both datasets,
+// loads the ids still lacking a rating, parses the ratings and basics files
+// for exactly those ids, and writes the results to the database.
+//
+// NOTE(review): only ids present in the ratings file get a rating, so an id
+// missing from it keeps a NULL average_rating and makes the next call
+// download the datasets again.
+func (a *App) fetchAndUpdateImdbData() error {
+	var missing int
+	row := a.DB.QueryRow(`SELECT COUNT(*) FROM imdb WHERE average_rating IS NULL`)
+	if err := row.Scan(&missing); err != nil {
+		return fmt.Errorf("count: %w", err)
+	}
+	if missing == 0 {
+		log.Println("fetchAndUpdateImdbData: all entries have ratings, skipping")
+		return nil
+	}
+	log.Printf("fetchAndUpdateImdbData: %d entries without rating", missing)
+
+	// Refresh the local copies of both datasets.
+	if err := a.downloadImdbDatasets(); err != nil {
+		return err
+	}
+
+	// Load the ids that still need data.
+	ids, err := a.getImdbIDsWithoutRating()
+	switch {
+	case err != nil:
+		return err
+	case len(ids) == 0:
+		log.Println("fetchAndUpdateImdbData: no entries need updating")
+		return nil
+	}
+	log.Printf("fetchAndUpdateImdbData: %d entries to update", len(ids))
+
+	// Pull the matching rows out of each dataset.
+	ratings, err := a.parseTitleRatings(ids)
+	if err != nil {
+		return fmt.Errorf("parseTitleRatings: %w", err)
+	}
+	log.Printf("fetchAndUpdateImdbData: found ratings for %d entries", len(ratings))
+
+	basics, err := a.parseTitleBasics(ids)
+	if err != nil {
+		return fmt.Errorf("parseTitleBasics: %w", err)
+	}
+	log.Printf("fetchAndUpdateImdbData: found basics for %d entries", len(basics))
+
+	// Persist everything in one go.
+	return a.applyImdbUpdates(ratings, basics)
+}