package main import ( "bufio" "compress/gzip" "fmt" "io" "log" "net/http" "os" "path/filepath" "strconv" "strings" ) const ( imdbDataDir = "../imdbdata" basicsURL = "https://datasets.imdbws.com/title.basics.tsv.gz" ratingsURL = "https://datasets.imdbws.com/title.ratings.tsv.gz" ) func dataPath(name string) string { return filepath.Join(imdbDataDir, name) } // downloadFile fetches url and writes to dst, overwriting if present. func downloadFile(dst, url string) error { return nil resp, err := http.Get(url) if err != nil { return fmt.Errorf("http get: %w", err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return fmt.Errorf("http %d from %s", resp.StatusCode, url) } f, err := os.Create(dst) if err != nil { return fmt.Errorf("create %s: %w", dst, err) } defer f.Close() _, err = io.Copy(f, resp.Body) if err != nil { return fmt.Errorf("write %s: %w", dst, err) } log.Printf("downloaded %s", url) return nil } // gunzipFile decompresses src.gz to dst. func gunzipFile(src, dst string) error { f, err := os.Open(src) if err != nil { return err } defer f.Close() gz, err := gzip.NewReader(f) if err != nil { return err } defer gz.Close() out, err := os.Create(dst) if err != nil { return err } defer out.Close() _, err = io.Copy(out, gz) if err != nil { return fmt.Errorf("write %s: %w", dst, err) } return nil } // downloadImdbDatasets fetches and extracts the two TSV files into imdbdata/. func (a *App) downloadImdbDatasets() error { if err := os.MkdirAll(imdbDataDir, 0755); err != nil { return fmt.Errorf("mkdir: %w", err) } type filePair struct { url string gz string tsv string } pairs := []filePair{ {basicsURL, dataPath("title.basics.tsv.gz"), dataPath("title.basics.tsv")}, {ratingsURL, dataPath("title.ratings.tsv.gz"), dataPath("title.ratings.tsv")}, } for _, p := range pairs { // skip if TSV already exists (download may be stubbed out) if _, err := os.Stat(p.tsv); err == nil { log.Printf("reusing existing %s", p.tsv) continue } if err := downloadFile(p.gz, p.url); err != nil { return fmt.Errorf("download %s: %w", p.url, err) } if err := gunzipFile(p.gz, p.tsv); err != nil { return fmt.Errorf("gunzip %s: %w", p.gz, err) } if err := os.Remove(p.gz); err != nil { return fmt.Errorf("remove %s: %w", p.gz, err) } log.Printf("extracted %s", p.tsv) } return nil } // ratingEntry holds data from title.ratings.tsv for one title. type ratingEntry struct { AverageRating float64 NumVotes int } // basicEntry holds data from title.basics.tsv for one title. type basicEntry struct { TitleType string PrimaryTitle string OriginalTitle string StartYear *int RuntimeMinutes *int Genres []string } // parseTSV reads a TSV file line by line and calls fn for each row (after header). // Only rows where keep(tconst) is true are passed to fn. // Uses simple tab-splitting to avoid csv.Reader quote issues with large files. func parseTSV(path string, keep func(string) bool, fn func(record []string) error) error { f, err := os.Open(path) if err != nil { return fmt.Errorf("open %s: %w", path, err) } defer f.Close() scanner := bufio.NewScanner(f) // skip header if !scanner.Scan() { return fmt.Errorf("empty file") } for scanner.Scan() { line := scanner.Text() if line == "" { continue } parts := strings.Split(line, "\t") if len(parts) < 1 { continue } tconst := parts[0] if !keep(tconst) { continue } if err := fn(parts); err != nil { return err } } return scanner.Err() } // parseTitleRatings reads the ratings TSV and collects entries for the given ids. func (a *App) parseTitleRatings(ids map[string]bool) (map[string]ratingEntry, error) { result := make(map[string]ratingEntry) err := parseTSV(dataPath("title.ratings.tsv"), func(s string) bool { return ids[s] }, func(rec []string) error { if len(rec) < 3 { return nil } rating, err := strconv.ParseFloat(rec[1], 64) if err != nil { return nil } votes, err := strconv.Atoi(rec[2]) if err != nil { return nil } result[rec[0]] = ratingEntry{AverageRating: rating, NumVotes: votes} return nil }, ) return result, err } // parseTitleBasics reads the basics TSV and collects entries for the given ids. func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, error) { result := make(map[string]basicEntry) err := parseTSV(dataPath("title.basics.tsv"), func(s string) bool { return ids[s] }, func(rec []string) error { // tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, genres if len(rec) < 9 { return nil } entry := basicEntry{ TitleType: rec[1], PrimaryTitle: rec[2], OriginalTitle: rec[3], } // startYear is field 5 (0-indexed) if rec[5] != "\\N" && rec[5] != "" { v, err := strconv.Atoi(rec[5]) if err == nil { entry.StartYear = &v } } // runtimeMinutes is field 7 (0-indexed) if rec[7] != "\\N" && rec[7] != "" { v, err := strconv.Atoi(rec[7]) if err == nil { entry.RuntimeMinutes = &v } } // genres is field 8 (0-indexed), comma-separated if rec[8] != "\\N" && rec[8] != "" { entry.Genres = strings.Split(rec[8], ",") } result[rec[0]] = entry return nil }, ) return result, err } // applyImdbUpdates writes ratings and basics data into the imdb table. func (a *App) applyImdbUpdates(ratings map[string]ratingEntry, basics map[string]basicEntry) error { tx := a.DB.MustBegin() rStmt, err := tx.Prepare(`UPDATE imdb SET average_rating = ?, num_votes = ? WHERE imdb_id = ?`) if err != nil { tx.Rollback() return fmt.Errorf("prepare rating update: %w", err) } defer rStmt.Close() bStmt, err := tx.Prepare(` UPDATE imdb SET title_type = ?, primary_title = ?, original_title = ?, start_year = ?, runtime_minutes = ? WHERE imdb_id = ? `) if err != nil { tx.Rollback() return fmt.Errorf("prepare basic update: %w", err) } defer bStmt.Close() genreUpsert, err := tx.Prepare(` INSERT INTO genre (name) VALUES (?) ON DUPLICATE KEY UPDATE id = LAST_INSERT_ID(id) `) if err != nil { tx.Rollback() return fmt.Errorf("prepare genre upsert: %w", err) } defer genreUpsert.Close() genreLink, err := tx.Prepare(` INSERT IGNORE INTO imdb_genre (imdb_id, genre_id) VALUES ((SELECT i.id FROM imdb i WHERE i.imdb_id = ?), ?) `) if err != nil { tx.Rollback() return fmt.Errorf("prepare genre link: %w", err) } defer genreLink.Close() rCount, bCount, gCount := 0, 0, 0 for id, r := range ratings { dec := float64(int(r.AverageRating*10+0.5)) / 10.0 if _, err := rStmt.Exec(dec, r.NumVotes, id); err != nil { tx.Rollback() return fmt.Errorf("update rating %s: %w", id, err) } rCount++ } for id, b := range basics { if _, err := bStmt.Exec( b.TitleType, b.PrimaryTitle, b.OriginalTitle, b.StartYear, b.RuntimeMinutes, id, ); err != nil { tx.Rollback() return fmt.Errorf("update basic %s: %w", id, err) } bCount++ for _, g := range b.Genres { res, err := genreUpsert.Exec(g) if err != nil { tx.Rollback() return fmt.Errorf("upsert genre %s: %w", g, err) } genreID, err := res.LastInsertId() if err != nil { tx.Rollback() return fmt.Errorf("get genre id: %w", err) } if _, err := genreLink.Exec(id, genreID); err != nil { tx.Rollback() return fmt.Errorf("link genre %s for %s: %w", g, id, err) } gCount++ } } if err := tx.Commit(); err != nil { return fmt.Errorf("commit: %w", err) } log.Printf("applyImdbUpdates: %d ratings, %d basics, %d genres updated", rCount, bCount, gCount) return nil } // getImdbIDsWithoutRating returns all imdb_id values where average_rating IS NULL. func (a *App) getImdbIDsWithoutRating() (map[string]bool, error) { rows, err := a.DB.Query(`SELECT imdb_id FROM imdb WHERE average_rating IS NULL`) if err != nil { return nil, err } defer rows.Close() ids := make(map[string]bool) for rows.Next() { var id string if err := rows.Scan(&id); err != nil { return nil, err } ids[id] = true } return ids, rows.Err() } // fetchAndUpdateImdbData is the main entry point. // Checks if any imdb entry lacks average_rating. If so, downloads datasets, // parses them, and updates matching rows with ratings and basic metadata. func (a *App) fetchAndUpdateImdbData() error { var missingRatings, missingGenres int if err := a.DB.QueryRow(`SELECT COUNT(*) FROM imdb WHERE average_rating IS NULL`).Scan(&missingRatings); err != nil { return fmt.Errorf("count ratings: %w", err) } if err := a.DB.QueryRow(` SELECT COUNT(*) FROM imdb i LEFT JOIN imdb_genre ig ON ig.imdb_id = i.id WHERE ig.id IS NULL `).Scan(&missingGenres); err != nil { return fmt.Errorf("count genres: %w", err) } if missingRatings == 0 && missingGenres == 0 { log.Println("fetchAndUpdateImdbData: all entries complete, skipping") return nil } log.Printf("fetchAndUpdateImdbData: %d missing ratings, %d missing genres", missingRatings, missingGenres) // Download / refresh datasets if err := a.downloadImdbDatasets(); err != nil { return err } // Gather all imdb_ids needing rating updates ratingIDs, err := a.getImdbIDsWithoutRating() if err != nil { return err } if len(ratingIDs) == 0 { log.Println("fetchAndUpdateImdbData: no entries need updating") return nil } log.Printf("fetchAndUpdateImdbData: %d entries need rating update", len(ratingIDs)) // Gather all imdb_ids for basics update allIDs, err := a.getAllImdbIDs() if err != nil { return err } log.Printf("fetchAndUpdateImdbData: %d entries total for basics update", len(allIDs)) // Parse datasets ratings, err := a.parseTitleRatings(ratingIDs) if err != nil { return fmt.Errorf("parseTitleRatings: %w", err) } log.Printf("fetchAndUpdateImdbData: found ratings for %d entries", len(ratings)) basics, err := a.parseTitleBasics(allIDs) if err != nil { return fmt.Errorf("parseTitleBasics: %w", err) } log.Printf("fetchAndUpdateImdbData: found basics for %d entries", len(basics)) // Write to DB if err := a.applyImdbUpdates(ratings, basics); err != nil { return err } return nil } // getAllImdbIDs returns all imdb_id values in the imdb table. func (a *App) getAllImdbIDs() (map[string]bool, error) { rows, err := a.DB.Query(`SELECT imdb_id FROM imdb`) if err != nil { return nil, err } defer rows.Close() ids := make(map[string]bool) for rows.Next() { var id string if err := rows.Scan(&id); err != nil { return nil, err } ids[id] = true } return ids, rows.Err() }