summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: dev, 2026-06-26 03:37:51 +0200
committer: dev, 2026-06-26 03:37:51 +0200
commit: 15d06c9802d08037283aa218ccc2f92a9236fcc9 (patch)
tree: 1cd3628b5680212c723fd00c694b15fb0bad1f08
parent: 8e2d742e59b3923852e1ef6e7a5e2ee1de14ce45 (diff)
download: hnimdbbot-15d06c9802d08037283aa218ccc2f92a9236fcc9.tar.gz
feat: add -wiki-only flag to rerun only wiki data extraction
- fetchWikiArticlesData is standalone again (re-extracted from consumer)
- -wiki-only flag skips SPARQL pipeline, runs only wiki data fetch
- Default behavior: full pipeline (SPARQL + wiki data in parallel)
-rw-r--r-- src/main.go 32
-rw-r--r-- src/wikidata.go 25
2 files changed, 46 insertions, 11 deletions
diff --git a/src/main.go b/src/main.go
index 272e4b6..cf1c74b 100644
--- a/src/main.go
+++ b/src/main.go
@@ -1,6 +1,7 @@
package main
import (
+ "flag"
"fmt"
"log"
"regexp"
@@ -164,6 +165,9 @@ func (a *App) extractImdbIDs() error {
}
func main() {
+ wikiOnly := flag.Bool("wiki-only", false, "only fetch wiki article data, skip SPARQL")
+ flag.Parse()
+
cfg, err := LoadConfig("config.json")
if err != nil {
log.Fatalf("failed to load config: %v", err)
@@ -183,19 +187,25 @@ func main() {
}
defer app.DB.Close()
- if err = app.extractImdbIDs(); err != nil {
- log.Fatalf("extractImdbIDs: %v", err)
- }
+ if !*wikiOnly {
+ if err = app.extractImdbIDs(); err != nil {
+ log.Fatalf("extractImdbIDs: %v", err)
+ }
- if err = app.populateImdbTable(); err != nil {
- log.Fatalf("populateImdbTable: %v", err)
- }
+ if err = app.populateImdbTable(); err != nil {
+ log.Fatalf("populateImdbTable: %v", err)
+ }
- if err = app.fetchAndUpdateImdbData(); err != nil {
- log.Fatalf("fetchAndUpdateImdbData: %v", err)
- }
+ if err = app.fetchAndUpdateImdbData(); err != nil {
+ log.Fatalf("fetchAndUpdateImdbData: %v", err)
+ }
- if err = app.fetchWikiArticles(); err != nil {
- log.Fatalf("fetchWikiArticles: %v", err)
+ if err = app.fetchWikiArticles(); err != nil {
+ log.Fatalf("fetchWikiArticles: %v", err)
+ }
+ } else {
+ if err = app.fetchWikiArticlesData(); err != nil {
+ log.Fatalf("fetchWikiArticlesData: %v", err)
+ }
}
}
diff --git a/src/wikidata.go b/src/wikidata.go
index 9dacb7b..5d1b594 100644
--- a/src/wikidata.go
+++ b/src/wikidata.go
@@ -274,6 +274,31 @@ func (a *App) wikiDataConsumer(artCh <-chan wikiArticleFetch, done chan<- struct
log.Printf("fetchWikiArticlesData: %d updated, %d skipped (non-200)", updated, skipped)
}
+// fetchWikiArticlesData fetches wiki article data from the custom server for all entries that have a wiki_article but need
+// data extraction (the list comes from getExistingWikiArticles). Callable independently; only the lookup error is returned.
+func (a *App) fetchWikiArticlesData() error {
+ existing, err := a.getExistingWikiArticles()
+ if err != nil {
+ return err
+ }
+ if len(existing) == 0 { // nothing to process: log it and report success
+ log.Println("fetchWikiArticlesData: all entries complete, skipping")
+ return nil
+ }
+ log.Printf("fetchWikiArticlesData: %d entries need wiki data", len(existing))
+
+ artCh := make(chan wikiArticleFetch, len(existing)) // capacity covers every entry, so the sends below cannot block
+ consumerDone := make(chan struct{})
+ go a.wikiDataConsumer(artCh, consumerDone) // NOTE(review): presumably signals consumerDone once artCh is drained — consumer body not shown here
+
+ for _, e := range existing {
+ artCh <- wikiArticleFetch{imdbID: e.imdbID, name: e.wikiArticle}
+ }
+ close(artCh) // no more entries will be queued
+ <-consumerDone // block until the consumer signals or closes consumerDone
+
+ return nil
+}
// queryWikidataBatch sends a SPARQL query for the given IDs and returns a map of id -> wikiAcc.
func (a *App) queryWikidataBatch(ids []string) (map[string]wikiAcc, error) {