package main import ( "compress/gzip" "encoding/csv" "fmt" "io" "log" "net/http" "os" "path/filepath" "strconv" ) const ( imdbDataDir = "imdbdata" basicsURL = "https://datasets.imdbws.com/title.basics.tsv.gz" ratingsURL = "https://datasets.imdbws.com/title.ratings.tsv.gz" ) func dataPath(name string) string { return filepath.Join(imdbDataDir, name) } // downloadFile fetches url and writes to dst, overwriting if present. func downloadFile(dst, url string) error { resp, err := http.Get(url) if err != nil { return fmt.Errorf("http get: %w", err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return fmt.Errorf("http %d from %s", resp.StatusCode, url) } f, err := os.Create(dst) if err != nil { return fmt.Errorf("create %s: %w", dst, err) } defer f.Close() _, err = io.Copy(f, resp.Body) if err != nil { return fmt.Errorf("write %s: %w", dst, err) } log.Printf("downloaded %s", url) return nil } // gunzipFile decompresses src.gz to dst. func gunzipFile(src, dst string) error { f, err := os.Open(src) if err != nil { return err } defer f.Close() gz, err := gzip.NewReader(f) if err != nil { return err } defer gz.Close() out, err := os.Create(dst) if err != nil { return err } defer out.Close() _, err = io.Copy(out, gz) if err != nil { return fmt.Errorf("write %s: %w", dst, err) } return nil } // downloadImdbDatasets fetches and extracts the two TSV files into imdbdata/. func (a *App) downloadImdbDatasets() error { if err := os.MkdirAll(imdbDataDir, 0755); err != nil { return fmt.Errorf("mkdir: %w", err) } type filePair struct { url string gz string tsv string } pairs := []filePair{ {basicsURL, dataPath("title.basics.tsv.gz"), dataPath("title.basics.tsv")}, {ratingsURL, dataPath("title.ratings.tsv.gz"), dataPath("title.ratings.tsv")}, } for _, p := range pairs { if err := downloadFile(p.gz, p.url); err != nil { return fmt.Errorf("download %s: %w", p.url, err) } if err := gunzipFile(p.gz, p.tsv); err != nil { return fmt.Errorf("gunzip %s: %w", p.gz, err) } log.Printf("extracted %s", p.tsv) } return nil } // ratingEntry holds data from title.ratings.tsv for one title. type ratingEntry struct { AverageRating float64 NumVotes int } // basicEntry holds data from title.basics.tsv for one title. type basicEntry struct { TitleType string PrimaryTitle string OriginalTitle string StartYear *int RuntimeMinutes *int } // parseTSV reads a TSV file and calls fn for each row (after header). // Only rows where keep(tconst) is true are passed to fn. func parseTSV(path string, keep func(string) bool, fn func(record []string) error) error { f, err := os.Open(path) if err != nil { return fmt.Errorf("open %s: %w", path, err) } defer f.Close() r := csv.NewReader(f) r.Comma = '\t' r.LazyQuotes = true r.FieldsPerRecord = -1 // skip header if _, err := r.Read(); err != nil { return fmt.Errorf("read header: %w", err) } for { rec, err := r.Read() if err == io.EOF { break } if err != nil { return fmt.Errorf("read row: %w", err) } if len(rec) < 1 { continue } tconst := rec[0] if !keep(tconst) { continue } if err := fn(rec); err != nil { return err } } return nil } // parseTitleRatings reads the ratings TSV and collects entries for the given ids. func (a *App) parseTitleRatings(ids map[string]bool) (map[string]ratingEntry, error) { result := make(map[string]ratingEntry) err := parseTSV(dataPath("title.ratings.tsv"), func(s string) bool { return ids[s] }, func(rec []string) error { if len(rec) < 3 { return nil } rating, err := strconv.ParseFloat(rec[1], 64) if err != nil { return nil } votes, err := strconv.Atoi(rec[2]) if err != nil { return nil } result[rec[0]] = ratingEntry{AverageRating: rating, NumVotes: votes} return nil }, ) return result, err } // parseTitleBasics reads the basics TSV and collects entries for the given ids. func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, error) { result := make(map[string]basicEntry) err := parseTSV(dataPath("title.basics.tsv"), func(s string) bool { return ids[s] }, func(rec []string) error { if len(rec) < 7 { return nil } entry := basicEntry{ TitleType: rec[1], PrimaryTitle: rec[2], OriginalTitle: rec[3], } if rec[4] != "\\N" && rec[4] != "" { v, err := strconv.Atoi(rec[4]) if err == nil { entry.StartYear = &v } } if rec[5] != "\\N" && rec[5] != "" { v, err := strconv.Atoi(rec[5]) if err == nil { entry.RuntimeMinutes = &v } } result[rec[0]] = entry return nil }, ) return result, err } // applyImdbUpdates writes ratings and basics data into the imdb table. func (a *App) applyImdbUpdates(ratings map[string]ratingEntry, basics map[string]basicEntry) error { tx := a.DB.MustBegin() rStmt, err := tx.Prepare(`UPDATE imdb SET average_rating = ?, num_votes = ? WHERE imdb_id = ?`) if err != nil { tx.Rollback() return fmt.Errorf("prepare rating update: %w", err) } defer rStmt.Close() bStmt, err := tx.Prepare(` UPDATE imdb SET title_type = ?, primary_title = ?, original_title = ?, start_year = ?, runtime_minutes = ? WHERE imdb_id = ? `) if err != nil { tx.Rollback() return fmt.Errorf("prepare basic update: %w", err) } defer bStmt.Close() rCount, bCount := 0, 0 for id, r := range ratings { dec := float64(int(r.AverageRating*10+0.5)) / 10.0 if _, err := rStmt.Exec(dec, r.NumVotes, id); err != nil { tx.Rollback() return fmt.Errorf("update rating %s: %w", id, err) } rCount++ } for id, b := range basics { if _, err := bStmt.Exec( b.TitleType, b.PrimaryTitle, b.OriginalTitle, b.StartYear, b.RuntimeMinutes, id, ); err != nil { tx.Rollback() return fmt.Errorf("update basic %s: %w", id, err) } bCount++ } if err := tx.Commit(); err != nil { return fmt.Errorf("commit: %w", err) } log.Printf("applyImdbUpdates: %d ratings, %d basics updated", rCount, bCount) return nil } // getImdbIDsWithoutRating returns all imdb_id values where average_rating IS NULL. func (a *App) getImdbIDsWithoutRating() (map[string]bool, error) { rows, err := a.DB.Query(`SELECT imdb_id FROM imdb WHERE average_rating IS NULL`) if err != nil { return nil, err } defer rows.Close() ids := make(map[string]bool) for rows.Next() { var id string if err := rows.Scan(&id); err != nil { return nil, err } ids[id] = true } return ids, rows.Err() } // fetchAndUpdateImdbData is the main entry point. // Checks if any imdb entry lacks average_rating. If so, downloads datasets, // parses them, and updates matching rows with ratings and basic metadata. func (a *App) fetchAndUpdateImdbData() error { var count int if err := a.DB.QueryRow(`SELECT COUNT(*) FROM imdb WHERE average_rating IS NULL`).Scan(&count); err != nil { return fmt.Errorf("count: %w", err) } if count == 0 { log.Println("fetchAndUpdateImdbData: all entries have ratings, skipping") return nil } log.Printf("fetchAndUpdateImdbData: %d entries without rating", count) // Download / refresh datasets if err := a.downloadImdbDatasets(); err != nil { return err } // Gather all imdb_ids needing updates ids, err := a.getImdbIDsWithoutRating() if err != nil { return err } if len(ids) == 0 { log.Println("fetchAndUpdateImdbData: no entries need updating") return nil } log.Printf("fetchAndUpdateImdbData: %d entries to update", len(ids)) // Parse datasets ratings, err := a.parseTitleRatings(ids) if err != nil { return fmt.Errorf("parseTitleRatings: %w", err) } log.Printf("fetchAndUpdateImdbData: found ratings for %d entries", len(ratings)) basics, err := a.parseTitleBasics(ids) if err != nil { return fmt.Errorf("parseTitleBasics: %w", err) } log.Printf("fetchAndUpdateImdbData: found basics for %d entries", len(basics)) // Write to DB if err := a.applyImdbUpdates(ratings, basics); err != nil { return err } return nil }