package main

import (
	"flag"
	"fmt"
	"regexp"

	_ "github.com/go-sql-driver/mysql"
	"github.com/jmoiron/sqlx"
)

// App bundles the loaded configuration and the database handle shared by the
// processing steps in this file.
type App struct {
	Config *Config
	DB     *sqlx.DB
}

// LinkRow holds the id and url columns of a row from the links table, as
// selected by extractImdbIDs.
type LinkRow struct {
	ID  int    `db:"id"`
	URL string `db:"url"`
}

// imdbTitleRe captures the "tt<digits>" identifier from a URL containing
// "/title/tt<digits>" followed by either "/" or the end of the string.
// NOTE(review): an ID followed directly by "?" or "#" does not match; confirm
// whether such URLs occur in the links table.
var imdbTitleRe = regexp.MustCompile(`/title/(tt\d+)($|/)`)

// imdbIDRe matches a string that consists solely of "tt" followed by digits.
var imdbIDRe = regexp.MustCompile(`^(tt\d+)$`)

// populateImdbTable inserts into the imdb table every distinct, well-formed
// IMDb ID found in links.param (for host 'www.imdb.com') that is not already
// present there.
//
// All inserts run in a single transaction: either every new ID is committed
// or, on the first error, none are. Params that do not match imdbIDRe are
// logged with logWarn and skipped. Returns a wrapped error describing the
// step that failed, or nil on success.
func (a *App) populateImdbTable() error {
	// Gather all IMDb IDs already in the table to avoid duplicates.
	existing, err := a.getExistingImdbIDs()
	if err != nil {
		return fmt.Errorf("getExistingImdbIDs: %w", err)
	}

	rows, err := a.DB.Query(`
		SELECT DISTINCT param
		FROM links
		WHERE host = 'www.imdb.com'
		  AND param IS NOT NULL
		  AND param != ''
	`)
	if err != nil {
		return fmt.Errorf("query links: %w", err)
	}
	defer rows.Close()

	// Beginx instead of MustBegin: a failure to open the transaction is
	// reported to the caller like every other error here, not as a panic.
	tx, err := a.DB.Beginx()
	if err != nil {
		return fmt.Errorf("begin transaction: %w", err)
	}
	// Roll back on every early return. After a successful Commit the
	// transaction is already finished and Rollback only returns an error,
	// which is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.Prepare(`INSERT INTO imdb (imdb_id) VALUES (?)`)
	if err != nil {
		return fmt.Errorf("prepare insert: %w", err)
	}
	defer stmt.Close()

	var inserted, skipped int
	for rows.Next() {
		var param string
		if err := rows.Scan(&param); err != nil {
			return fmt.Errorf("scan param: %w", err)
		}

		if !imdbIDRe.MatchString(param) {
			logWarn("populateImdbTable: invalid param %q, skipping", param)
			continue
		}
		if existing[param] {
			skipped++
			continue
		}

		if _, err := stmt.Exec(param); err != nil {
			return fmt.Errorf("insert %s: %w", param, err)
		}
		inserted++
		// Remember the ID so it is counted as existing if seen again.
		existing[param] = true
	}
	if err := rows.Err(); err != nil {
		return fmt.Errorf("rows iteration: %w", err)
	}

	if err := tx.Commit(); err != nil {
		return fmt.Errorf("commit: %w", err)
	}

	logInfo("populateImdbTable: inserted %d, skipped %d (already existed)", inserted, skipped)
	return nil
}

// getExistingImdbIDs returns the set of imdb_id values currently stored in the
// imdb table, keyed by ID with the value true. On any query, scan or
// iteration error it returns a nil map and a wrapped error.
func (a *App) getExistingImdbIDs() (map[string]bool, error) {
	rows, err := a.DB.Query(`SELECT imdb_id FROM imdb`)
	if err != nil {
		return nil, fmt.Errorf("query imdb: %w", err)
	}
	defer rows.Close()

	existing := make(map[string]bool)
	for rows.Next() {
		var imdbID string
		if err := rows.Scan(&imdbID); err != nil {
			return nil, fmt.Errorf("scan imdb_id: %w", err)
		}
		existing[imdbID] = true
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("rows iteration: %w", err)
	}

	logInfo("getExistingImdbIDs: %d existing records", len(existing))
	return existing, nil
}

// extractImdbIDs fills links.param with the IMDb ID parsed from links.url for
// rows that have field = 1, host 'www.imdb.com', a URL containing
// ".com/title" and an empty or NULL param.
//
// All updates run in a single transaction. URLs in which imdbTitleRe finds no
// ID are logged with logWarn and left unchanged. Returns a wrapped error
// describing the step that failed, or nil on success.
func (a *App) extractImdbIDs() error {
	rows, err := a.DB.Query(`
		SELECT id, url
		FROM links
		WHERE field = 1
		  AND url LIKE '%.com/title%'
		  AND host = 'www.imdb.com'
		  AND (param IS NULL OR param = '')
	`)
	if err != nil {
		return fmt.Errorf("query links: %w", err)
	}
	defer rows.Close()

	// Beginx instead of MustBegin: return the error rather than panicking.
	tx, err := a.DB.Beginx()
	if err != nil {
		return fmt.Errorf("begin transaction: %w", err)
	}
	// Roll back on every early return; the error returned after a successful
	// Commit is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.Prepare(`UPDATE links SET param = ? WHERE id = ?`)
	if err != nil {
		return fmt.Errorf("prepare update: %w", err)
	}
	defer stmt.Close()

	var count, updated int
	for rows.Next() {
		count++

		var link LinkRow
		if err := rows.Scan(&link.ID, &link.URL); err != nil {
			return fmt.Errorf("scan row: %w", err)
		}

		match := imdbTitleRe.FindStringSubmatch(link.URL)
		if len(match) < 2 {
			logWarn("no IMDb ID found in URL: %s", link.URL)
			continue
		}

		// match[1] is the first capture group: the "tt<digits>" ID.
		imdbID := match[1]
		if _, err := stmt.Exec(imdbID, link.ID); err != nil {
			return fmt.Errorf("update link %d: %w", link.ID, err)
		}
		updated++
	}
	if err := rows.Err(); err != nil {
		return fmt.Errorf("rows iteration: %w", err)
	}

	if err := tx.Commit(); err != nil {
		return fmt.Errorf("commit: %w", err)
	}

	logInfo("extractImdbIDs: scanned %d rows, updated %d", count, updated)
	return nil
}

// main parses the command-line flags, loads config.json, connects to the
// database and runs the processing steps.
//
// Without -wiki-only it runs extractImdbIDs, populateImdbTable,
// fetchAndUpdateImdbData and fetchWikiArticles in that order; with -wiki-only
// it runs only fetchWikiArticlesData. Every failure is reported through the
// logFatal* helpers.
func main() {
	wikiOnly := flag.Bool("wiki-only", false, "only fetch wiki article data, skip SPARQL")
	logLevelFlag := flag.String("log-level", "info", "logging level: debug, info, silent")
	flag.Parse()

	setLogLevel(*logLevelFlag)

	cfg, err := LoadConfig("config.json")
	if err != nil {
		logFatalErr("failed to load config", err)
	}

	app := App{Config: cfg}

	// The password is intentionally left out of this log line.
	logInfo(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`,
		cfg.DBDriver, cfg.DBDBName, cfg.DBUser, cfg.DBHost, cfg.DBPort, cfg.DBOptions)

	// DSN layout: user:password@tcp(host:port)/dbname?options
	dsn := cfg.DBUser + ":" + cfg.DBPassword + "@tcp(" + cfg.DBHost + ":" + cfg.DBPort + ")/" + cfg.DBDBName + "?" + cfg.DBOptions
	app.DB, err = sqlx.Connect(cfg.DBDriver, dsn)
	if err != nil {
		logFatalTwoArgs(err, "Cannot connect to database")
	}
	if err = app.DB.Ping(); err != nil {
		logFatalTwoArgs(err, "No connection to database")
	}
	// NOTE(review): if the logFatal* helpers terminate the process, this
	// deferred Close will not run on those paths - confirm that is acceptable.
	defer app.DB.Close()

	if *wikiOnly {
		if err = app.fetchWikiArticlesData(); err != nil {
			logFatalErr("fetchWikiArticlesData", err)
		}
	} else {
		if err = app.extractImdbIDs(); err != nil {
			logFatalErr("extractImdbIDs", err)
		}
		if err = app.populateImdbTable(); err != nil {
			logFatalErr("populateImdbTable", err)
		}
		if err = app.fetchAndUpdateImdbData(); err != nil {
			logFatalErr("fetchAndUpdateImdbData", err)
		}
		if err = app.fetchWikiArticles(); err != nil {
			logFatalErr("fetchWikiArticles", err)
		}
	}
}