diff options
| author | dev | 2026-06-24 03:33:12 +0200 |
|---|---|---|
| committer | dev | 2026-06-24 03:33:12 +0200 |
| commit | eec189de8a5be0a18103a215d369c6135b86e9ff (patch) | |
| tree | 2bf5072c4aca6f0f79da7793a94ce661c5c63870 /src | |
| parent | 163b9bddd68f7ffc8fc4164acee333fe5bff3c7a (diff) | |
| download | hnimdbbot-eec189de8a5be0a18103a215d369c6135b86e9ff.tar.gz | |
feat: populate imdb table with unique title IDs from links
- Extract distinct IMDb title IDs from links.param (host=www.imdb.com)
- Skip IDs already in imdb table and non-title params (nm, ls, etc.)
- Insert 3448 unique title IDs into imdb.imdb_id
Diffstat (limited to 'src')
| -rw-r--r-- | src/main.go | 91 |
1 file changed, 91 insertions, 0 deletions
```diff
diff --git a/src/main.go b/src/main.go
index e079760..0aa01bb 100644
--- a/src/main.go
+++ b/src/main.go
@@ -20,6 +20,93 @@ type LinkRow struct {
 }
 
 var imdbTitleRe = regexp.MustCompile(`/title/(tt\d+)($|/)`)
+var imdbIDRe = regexp.MustCompile(`^(tt\d+)$`)
+
+func (a *App) populateImdbTable() error {
+	// Gather all IMDb IDs already in the table to avoid duplicates
+	existing, err := a.getExistingImdbIDs()
+	if err != nil {
+		return fmt.Errorf("getExistingImdbIDs: %w", err)
+	}
+
+	rows, err := a.DB.Query(`
+		SELECT DISTINCT param FROM links
+		WHERE host = 'www.imdb.com' AND param IS NOT NULL AND param != ''
+	`)
+	if err != nil {
+		return fmt.Errorf("query links: %w", err)
+	}
+	defer rows.Close()
+
+	tx := a.DB.MustBegin()
+	stmt, err := tx.Prepare(`INSERT INTO imdb (imdb_id) VALUES (?)`)
+	if err != nil {
+		tx.Rollback()
+		return fmt.Errorf("prepare insert: %w", err)
+	}
+	defer stmt.Close()
+
+	var inserted, skipped int
+	for rows.Next() {
+		var param string
+		if err := rows.Scan(&param); err != nil {
+			tx.Rollback()
+			return fmt.Errorf("scan param: %w", err)
+		}
+
+		if !imdbIDRe.MatchString(param) {
+			log.Printf("populateImdbTable: invalid param %q, skipping", param)
+			continue
+		}
+
+		if existing[param] {
+			skipped++
+			continue
+		}
+
+		_, err := stmt.Exec(param)
+		if err != nil {
+			tx.Rollback()
+			return fmt.Errorf("insert %s: %w", param, err)
+		}
+		inserted++
+		existing[param] = true
+	}
+	if err := rows.Err(); err != nil {
+		tx.Rollback()
+		return fmt.Errorf("rows iteration: %w", err)
+	}
+
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("commit: %w", err)
+	}
+
+	log.Printf("populateImdbTable: inserted %d, skipped %d (already existed)", inserted, skipped)
+	return nil
+}
+
+func (a *App) getExistingImdbIDs() (map[string]bool, error) {
+	rows, err := a.DB.Query(`SELECT imdb_id FROM imdb`)
+	if err != nil {
+		return nil, fmt.Errorf("query imdb: %w", err)
+	}
+	defer rows.Close()
+
+	existing := make(map[string]bool)
+	for rows.Next() {
+		var imdbID string
+		if err := rows.Scan(&imdbID); err != nil {
+			return nil, fmt.Errorf("scan imdb_id: %w", err)
+		}
+		existing[imdbID] = true
+	}
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("rows iteration: %w", err)
+	}
+
+	log.Printf("getExistingImdbIDs: %d existing records", len(existing))
+	return existing, nil
+}
 
 func (a *App) extractImdbIDs() error {
 	rows, err := a.DB.Query(`
@@ -99,4 +186,8 @@ func main() {
 	if err = app.extractImdbIDs(); err != nil {
 		log.Fatalf("extractImdbIDs: %v", err)
 	}
+
+	if err = app.populateImdbTable(); err != nil {
+		log.Fatalf("populateImdbTable: %v", err)
+	}
 }
```
