diff options
| author | dev | 2026-06-26 03:37:51 +0200 |
|---|---|---|
| committer | dev | 2026-06-26 03:37:51 +0200 |
| commit | 15d06c9802d08037283aa218ccc2f92a9236fcc9 (patch) | |
| tree | 1cd3628b5680212c723fd00c694b15fb0bad1f08 | |
| parent | 8e2d742e59b3923852e1ef6e7a5e2ee1de14ce45 (diff) | |
| download | hnimdbbot-15d06c9802d08037283aa218ccc2f92a9236fcc9.tar.gz | |
feat: add -wiki-only flag to rerun only wiki data extraction
- fetchWikiArticlesData is standalone again (re-extracted from consumer)
- -wiki-only flag skips SPARQL pipeline, runs only wiki data fetch
- Default behavior: full pipeline (SPARQL + wiki data in parallel)
| -rw-r--r-- | src/main.go | 32 |
| -rw-r--r-- | src/wikidata.go | 25 |
2 files changed, 46 insertions, 11 deletions
diff --git a/src/main.go b/src/main.go index 272e4b6..cf1c74b 100644 --- a/src/main.go +++ b/src/main.go @@ -1,6 +1,7 @@ package main import ( + "flag" "fmt" "log" "regexp" @@ -164,6 +165,9 @@ func (a *App) extractImdbIDs() error { } func main() { + wikiOnly := flag.Bool("wiki-only", false, "only fetch wiki article data, skip SPARQL") + flag.Parse() + cfg, err := LoadConfig("config.json") if err != nil { log.Fatalf("failed to load config: %v", err) @@ -183,19 +187,25 @@ func main() { } defer app.DB.Close() - if err = app.extractImdbIDs(); err != nil { - log.Fatalf("extractImdbIDs: %v", err) - } + if !*wikiOnly { + if err = app.extractImdbIDs(); err != nil { + log.Fatalf("extractImdbIDs: %v", err) + } - if err = app.populateImdbTable(); err != nil { - log.Fatalf("populateImdbTable: %v", err) - } + if err = app.populateImdbTable(); err != nil { + log.Fatalf("populateImdbTable: %v", err) + } - if err = app.fetchAndUpdateImdbData(); err != nil { - log.Fatalf("fetchAndUpdateImdbData: %v", err) - } + if err = app.fetchAndUpdateImdbData(); err != nil { + log.Fatalf("fetchAndUpdateImdbData: %v", err) + } - if err = app.fetchWikiArticles(); err != nil { - log.Fatalf("fetchWikiArticles: %v", err) + if err = app.fetchWikiArticles(); err != nil { + log.Fatalf("fetchWikiArticles: %v", err) + } + } else { + if err = app.fetchWikiArticlesData(); err != nil { + log.Fatalf("fetchWikiArticlesData: %v", err) + } } } diff --git a/src/wikidata.go b/src/wikidata.go index 9dacb7b..5d1b594 100644 --- a/src/wikidata.go +++ b/src/wikidata.go @@ -274,6 +274,31 @@ func (a *App) wikiDataConsumer(artCh <-chan wikiArticleFetch, done chan<- struct log.Printf("fetchWikiArticlesData: %d updated, %d skipped (non-200)", updated, skipped) } +// fetchWikiArticlesData fetches wiki article data from the custom server for all +// entries that have a wiki_article but need data extraction. Callable independently. 
+func (a *App) fetchWikiArticlesData() error { + existing, err := a.getExistingWikiArticles() + if err != nil { + return err + } + if len(existing) == 0 { + log.Println("fetchWikiArticlesData: all entries complete, skipping") + return nil + } + log.Printf("fetchWikiArticlesData: %d entries need wiki data", len(existing)) + + artCh := make(chan wikiArticleFetch, len(existing)) + consumerDone := make(chan struct{}) + go a.wikiDataConsumer(artCh, consumerDone) + + for _, e := range existing { + artCh <- wikiArticleFetch{imdbID: e.imdbID, name: e.wikiArticle} + } + close(artCh) + <-consumerDone + + return nil +} // queryWikidataBatch sends a SPARQL query for the given IDs and returns a map of id -> wikiAcc. func (a *App) queryWikidataBatch(ids []string) (map[string]wikiAcc, error) { |
