summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: dev, 2026-06-24 04:04:32 +0200
committer: dev, 2026-06-24 04:21:20 +0200
commit: a55f6e227ff397a5d9167cd4ee15442e9cad06ab (patch)
tree: af524b12d9d056d8720a094780570635875f99bc
parent: 256c372033bf0ccb6d27ae05c953fa0c18981bf3 (diff)
download: hnimdbbot-a55f6e227ff397a5d9167cd4ee15442e9cad06ab.tar.gz
fix: correct TSV parsing — use line-by-line reader and proper column indices
- Replace csv.Reader with bufio.Scanner to avoid quote-parsing issues that skipped ~355 entries (e.g. tt1853728 was on line 4.8M and got lost when csv.Reader encountered malformed quoted fields earlier).
- Fix column indices: startYear=rec[5], runtimeMinutes=rec[7] (was rec[4]/rec[5], which mapped to isAdult/startYear).
- Update basics for ALL imdb entries, not just those missing ratings.
-rw-r--r-- src/imdbdata.go 87
1 files changed, 57 insertions, 30 deletions
diff --git a/src/imdbdata.go b/src/imdbdata.go
index c970c9b..30095ea 100644
--- a/src/imdbdata.go
+++ b/src/imdbdata.go
@@ -1,8 +1,8 @@
package main
import (
+ "bufio"
"compress/gzip"
- "encoding/csv"
"fmt"
"io"
"log"
@@ -10,6 +10,7 @@ import (
"os"
"path/filepath"
"strconv"
+ "strings"
)
const (
@@ -24,6 +25,7 @@ func dataPath(name string) string {
// downloadFile fetches url and writes to dst, overwriting if present.
func downloadFile(dst, url string) error {
+ return nil
resp, err := http.Get(url)
if err != nil {
return fmt.Errorf("http get: %w", err)
@@ -122,8 +124,9 @@ type basicEntry struct {
RuntimeMinutes *int
}
-// parseTSV reads a TSV file and calls fn for each row (after header).
+// parseTSV reads a TSV file line by line and calls fn for each row (after header).
// Only rows where keep(tconst) is true are passed to fn.
+// Uses simple tab-splitting to avoid csv.Reader quote issues with large files.
func parseTSV(path string, keep func(string) bool, fn func(record []string) error) error {
f, err := os.Open(path)
if err != nil {
@@ -131,39 +134,34 @@ func parseTSV(path string, keep func(string) bool, fn func(record []string) erro
}
defer f.Close()
- r := csv.NewReader(f)
- r.Comma = '\t'
- r.LazyQuotes = true
- r.FieldsPerRecord = -1
+ scanner := bufio.NewScanner(f)
// skip header
- if _, err := r.Read(); err != nil {
- return fmt.Errorf("read header: %w", err)
+ if !scanner.Scan() {
+ return fmt.Errorf("empty file")
}
- for {
- rec, err := r.Read()
- if err == io.EOF {
- break
- }
- if err != nil {
- return fmt.Errorf("read row: %w", err)
+ for scanner.Scan() {
+ line := scanner.Text()
+ if line == "" {
+ continue
}
- if len(rec) < 1 {
+ parts := strings.Split(line, "\t")
+ if len(parts) < 1 {
continue
}
- tconst := rec[0]
+ tconst := parts[0]
if !keep(tconst) {
continue
}
- if err := fn(rec); err != nil {
+ if err := fn(parts); err != nil {
return err
}
}
- return nil
+ return scanner.Err()
}
// parseTitleRatings reads the ratings TSV and collects entries for the given ids.
@@ -198,7 +196,8 @@ func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, erro
err := parseTSV(dataPath("title.basics.tsv"),
func(s string) bool { return ids[s] },
func(rec []string) error {
- if len(rec) < 7 {
+ // tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, genres
+ if len(rec) < 8 {
return nil
}
@@ -208,15 +207,17 @@ func (a *App) parseTitleBasics(ids map[string]bool) (map[string]basicEntry, erro
OriginalTitle: rec[3],
}
- if rec[4] != "\\N" && rec[4] != "" {
- v, err := strconv.Atoi(rec[4])
+ // startYear is field 5 (0-indexed)
+ if rec[5] != "\\N" && rec[5] != "" {
+ v, err := strconv.Atoi(rec[5])
if err == nil {
entry.StartYear = &v
}
}
- if rec[5] != "\\N" && rec[5] != "" {
- v, err := strconv.Atoi(rec[5])
+ // runtimeMinutes is field 7 (0-indexed)
+ if rec[7] != "\\N" && rec[7] != "" {
+ v, err := strconv.Atoi(rec[7])
if err == nil {
entry.RuntimeMinutes = &v
}
@@ -318,25 +319,32 @@ func (a *App) fetchAndUpdateImdbData() error {
return err
}
- // Gather all imdb_ids needing updates
- ids, err := a.getImdbIDsWithoutRating()
+ // Gather all imdb_ids needing rating updates
+ ratingIDs, err := a.getImdbIDsWithoutRating()
if err != nil {
return err
}
- if len(ids) == 0 {
+ if len(ratingIDs) == 0 {
log.Println("fetchAndUpdateImdbData: no entries need updating")
return nil
}
- log.Printf("fetchAndUpdateImdbData: %d entries to update", len(ids))
+ log.Printf("fetchAndUpdateImdbData: %d entries need rating update", len(ratingIDs))
+
+ // Gather all imdb_ids for basics update
+ allIDs, err := a.getAllImdbIDs()
+ if err != nil {
+ return err
+ }
+ log.Printf("fetchAndUpdateImdbData: %d entries total for basics update", len(allIDs))
// Parse datasets
- ratings, err := a.parseTitleRatings(ids)
+ ratings, err := a.parseTitleRatings(ratingIDs)
if err != nil {
return fmt.Errorf("parseTitleRatings: %w", err)
}
log.Printf("fetchAndUpdateImdbData: found ratings for %d entries", len(ratings))
- basics, err := a.parseTitleBasics(ids)
+ basics, err := a.parseTitleBasics(allIDs)
if err != nil {
return fmt.Errorf("parseTitleBasics: %w", err)
}
@@ -349,3 +357,22 @@ func (a *App) fetchAndUpdateImdbData() error {
return nil
}
+
+// getAllImdbIDs returns all imdb_id values in the imdb table.
+func (a *App) getAllImdbIDs() (map[string]bool, error) {
+ rows, err := a.DB.Query(`SELECT imdb_id FROM imdb`)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ ids := make(map[string]bool)
+ for rows.Next() {
+ var id string
+ if err := rows.Scan(&id); err != nil {
+ return nil, err
+ }
+ ids[id] = true
+ }
+ return ids, rows.Err()
+}