diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/imdbdata.go | 87 |
1 files changed, 57 insertions, 30 deletions
diff --git a/src/imdbdata.go b/src/imdbdata.go index c970c9b..30095ea 100644 --- a/src/imdbdata.go +++ b/src/imdbdata.go @@ -1,8 +1,8 @@ package main import ( + "bufio" "compress/gzip" - "encoding/csv" "fmt" "io" "log" @@ -10,6 +10,7 @@ import ( "os" "path/filepath" "strconv" + "strings" ) const ( @@ -24,6 +25,7 @@ func dataPath(name string) string { // downloadFile fetches url and writes to dst, overwriting if present. func downloadFile(dst, url string) error { + return nil resp, err := http.Get(url) if err != nil { return fmt.Errorf("http get: %w", err) @@ -122,8 +124,9 @@ type basicEntry struct { RuntimeMinutes *int } -// parseTSV reads a TSV file and calls fn for each row (after header). +// parseTSV reads a TSV file line by line and calls fn for each row (after header). // Only rows where keep(tconst) is true are passed to fn. +// Uses simple tab-splitting to avoid csv.Reader quote issues with large files. func parseTSV(path string, keep func(string) bool, fn func(record []string) error) error { f, err := os.Open(path) if err != nil { @@ -131,39 +134,34 @@ func parseTSV(path string, keep func(string) bool, fn func(record []string) erro } defer f.Close() - r := csv.NewReader(f) - r.Comma = '\t' - r.LazyQuotes = true - r.FieldsPerRecord = -1 + scanner := bufio.NewScanner(f) // skip header - if _, err := r.Read(); err != nil { - return fmt.Errorf("read header: %w", err) + if !scanner.Scan() { + return fmt.Errorf("empty file") } - for { - rec, err := r.Read() - if err == io.EOF { - break - } - if err != nil { - return fmt.Errorf("read row: %w", err) + for scanner.Scan() { + line := scanner.Text() + if line == "" { + continue } - if len(rec) < 1 { + parts := strings.Split(line, "\t") + if len(parts) < 1 { continue } - tconst := rec[0] + tconst := parts[0] if !keep(tconst) { continue } - if err := fn(rec); err != nil { + if err := fn(parts); err != nil { return err } } - return nil + return scanner.Err() } // parseTitleRatings reads the ratings TSV and collects entries for the given ids. @@ -198,7 +196,8 @@ func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, erro err := parseTSV(dataPath("title.basics.tsv"), func(s string) bool { return ids[s] }, func(rec []string) error { - if len(rec) < 7 { + // tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, genres + if len(rec) < 8 { return nil } @@ -208,15 +207,17 @@ func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, erro OriginalTitle: rec[3], } - if rec[4] != "\\N" && rec[4] != "" { - v, err := strconv.Atoi(rec[4]) + // startYear is field 5 (0-indexed) + if rec[5] != "\\N" && rec[5] != "" { + v, err := strconv.Atoi(rec[5]) if err == nil { entry.StartYear = &v } } - if rec[5] != "\\N" && rec[5] != "" { - v, err := strconv.Atoi(rec[5]) + // runtimeMinutes is field 7 (0-indexed) + if rec[7] != "\\N" && rec[7] != "" { + v, err := strconv.Atoi(rec[7]) if err == nil { entry.RuntimeMinutes = &v } @@ -318,25 +319,32 @@ func (a *App) fetchAndUpdateImdbData() error { return err } - // Gather all imdb_ids needing updates - ids, err := a.getImdbIDsWithoutRating() + // Gather all imdb_ids needing rating updates + ratingIDs, err := a.getImdbIDsWithoutRating() if err != nil { return err } - if len(ids) == 0 { + if len(ratingIDs) == 0 { log.Println("fetchAndUpdateImdbData: no entries need updating") return nil } - log.Printf("fetchAndUpdateImdbData: %d entries to update", len(ids)) + log.Printf("fetchAndUpdateImdbData: %d entries need rating update", len(ratingIDs)) + + // Gather all imdb_ids for basics update + allIDs, err := a.getAllImdbIDs() + if err != nil { + return err + } + log.Printf("fetchAndUpdateImdbData: %d entries total for basics update", len(allIDs)) // Parse datasets - ratings, err := a.parseTitleRatings(ids) + ratings, err := a.parseTitleRatings(ratingIDs) if err != nil { return fmt.Errorf("parseTitleRatings: %w", err) } log.Printf("fetchAndUpdateImdbData: found ratings for %d entries", len(ratings)) - basics, err := a.parseTitleBasics(ids) + basics, err := a.parseTitleBasics(allIDs) if err != nil { return fmt.Errorf("parseTitleBasics: %w", err) } @@ -349,3 +357,22 @@ func (a *App) fetchAndUpdateImdbData() error { return nil } + +// getAllImdbIDs returns all imdb_id values in the imdb table. +func (a *App) getAllImdbIDs() (map[string]bool, error) { + rows, err := a.DB.Query(`SELECT imdb_id FROM imdb`) + if err != nil { + return nil, err + } + defer rows.Close() + + ids := make(map[string]bool) + for rows.Next() { + var id string + if err := rows.Scan(&id); err != nil { + return nil, err + } + ids[id] = true + } + return ids, rows.Err() +} |
