From 2e3e5b3efc6a8d9471a73c5553f88fa94e28bd3a Mon Sep 17 00:00:00 2001
From: dev
Date: Wed, 24 Jun 2026 01:41:31 +0200
Subject: Initial commit

---
 .gitignore          |   2 +
 imdbdata/source.txt |   1 +
 schema/schema.sql   | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/config.go       |  97 ++++++++++++++++++++++++++++++++++++++++++++
 src/main.go         |  46 +++++++++++++++++++++
 5 files changed, 259 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 imdbdata/source.txt
 create mode 100644 schema/schema.sql
 create mode 100644 src/config.go
 create mode 100644 src/main.go

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0fada4c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.tsv
+src/hnimdbbot
diff --git a/imdbdata/source.txt b/imdbdata/source.txt
new file mode 100644
index 0000000..92659a1
--- /dev/null
+++ b/imdbdata/source.txt
@@ -0,0 +1 @@
+https://datasets.imdbws.com/
diff --git a/schema/schema.sql b/schema/schema.sql
new file mode 100644
index 0000000..b643cc7
--- /dev/null
+++ b/schema/schema.sql
@@ -0,0 +1,113 @@
+SET NAMES utf8;
+SET time_zone = '+00:00';
+SET foreign_key_checks = 0;
+SET sql_mode = 'NO_AUTO_VALUE_ON_ZERO';
+
+SET NAMES utf8mb4;
+-- Tables are created before the tables their foreign keys reference; foreign_key_checks = 0 above permits that.
+CREATE TABLE `genre` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `imdb_id` int(11) NOT NULL,
+  `name` varchar(255) NOT NULL,
+  PRIMARY KEY (`id`),
+  KEY `imdb_id` (`imdb_id`),
+  CONSTRAINT `genre_ibfk_1` FOREIGN KEY (`imdb_id`) REFERENCES `imdb` (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+
+CREATE TABLE `imdb` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `links_id` int(11) NOT NULL,
+  `imdb_id` varchar(255) NOT NULL,
+  `wiki_article` varchar(255) DEFAULT NULL,
+  `synopsis` text DEFAULT NULL,
+  `year` int(11) DEFAULT NULL,
+  `poster_url` text DEFAULT NULL,
+  `title_type` varchar(255) DEFAULT NULL,
+  `primary_title` varchar(255) DEFAULT NULL,
+  `original_title` varchar(255) DEFAULT NULL,
+  `start_year` int(11) DEFAULT NULL,
+  `runtime_minutes` int(11) DEFAULT NULL,
+  `average_rating` decimal(3,1) unsigned DEFAULT NULL,
+  `num_votes` int(11) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  KEY `links_id` (`links_id`),
+  CONSTRAINT `imdb_ibfk_1` FOREIGN KEY (`links_id`) REFERENCES `links` (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+-- created_at is set on insert only; updated_at is refreshed on every row update (also in max_item and story).
+CREATE TABLE `links` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `created_at` timestamp NOT NULL DEFAULT current_timestamp(),
+  `updated_at` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
+  `story_id` int(11) NOT NULL,
+  `url` varchar(1000) DEFAULT NULL,
+  `field` int(11) NOT NULL,
+  `host` varchar(255) DEFAULT NULL,
+  `param` varchar(255) DEFAULT NULL,
+  `type` varchar(255) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  KEY `story_id` (`story_id`),
+  CONSTRAINT `links_ibfk_1` FOREIGN KEY (`story_id`) REFERENCES `story` (`id`)
+) ENGINE=InnoDB AUTO_INCREMENT=407448 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+
+CREATE TABLE `max_item` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `max_story_id` int(11) NOT NULL,
+  `created_at` timestamp NOT NULL DEFAULT current_timestamp(),
+  `updated_at` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `max_story_id` (`max_story_id`),
+  CONSTRAINT `max_item_ibfk_1` FOREIGN KEY (`max_story_id`) REFERENCES `story` (`story_id`)
+) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+
+CREATE TABLE `people` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `name` varchar(255) NOT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+
+CREATE TABLE `profession` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `name` varchar(255) NOT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+INSERT INTO `profession` (`id`, `name`) VALUES
+(1, 'actor'),
+(2, 'director'),
+(3, 'screenwriter');
+
+CREATE TABLE `story` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `story_id` int(11) NOT NULL,
+  `created_at` timestamp NOT NULL DEFAULT current_timestamp(),
+  `updated_at` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
+  `type` varchar(255) NOT NULL,
+  `title` varchar(255) NOT NULL,
+  `text` text DEFAULT NULL,
+  `score` int(11) NOT NULL,
+  `descendants` int(11) NOT NULL,
+  `time` int(11) NOT NULL,
+  `poster` varchar(255) NOT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `story_id` (`story_id`)
+) ENGINE=InnoDB AUTO_INCREMENT=366974 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+
+CREATE TABLE `who` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `imdb_id` int(11) NOT NULL,
+  `people_id` int(11) NOT NULL,
+  `profession_id` int(11) NOT NULL,
+  PRIMARY KEY (`id`),
+  KEY `imdb_id` (`imdb_id`),
+  KEY `people_id` (`people_id`),
+  KEY `profession_id` (`profession_id`),
+  CONSTRAINT `who_ibfk_1` FOREIGN KEY (`imdb_id`) REFERENCES `imdb` (`id`),
+  CONSTRAINT `who_ibfk_2` FOREIGN KEY (`people_id`) REFERENCES `people` (`id`),
+  CONSTRAINT `who_ibfk_3` FOREIGN KEY (`profession_id`) REFERENCES `profession` (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
diff --git a/src/config.go b/src/config.go
new file mode 100644
index 0000000..41ea55a
--- /dev/null
+++ b/src/config.go
@@ -0,0 +1,97 @@
+package main
+
+import (
+	"os"
+
+	log "github.com/sirupsen/logrus"
+	"github.com/spf13/viper"
+)
+
+// Config holds the settings that parseConfig reads through viper.
+type Config struct {
+	DBDriver   string
+	DBDBName   string
+	DBHost     string
+	DBPort     string
+	DBUser     string
+	DBPassword string
+	DBOptions  string
+
+	UserAgent string
+	Delay     int
+
+	AccessToken string
+	Debug       bool // sets log level to debug
+}
+
+// parseConfig loads the configuration through viper (defaults, config file, environment) and stores it in c.
+func (c *Config) parseConfig(configFile string) {
+	// Defaults that apply when a key is set neither in the config file nor in the environment.
+	viper.SetDefault("DB_Driver", "mysql")
+	viper.SetDefault("DB_DBName", "hncrawler")
+	viper.SetDefault("DB_Host", "localhost")
+	viper.SetDefault("DB_Port", "3306")
+
+	viper.SetDefault("Debug", false)
+	viper.SetDefault("Delay", 0)
+
+	viper.SetDefault("UserAgent", "pure cinema - mostdiscussed.com")
+
+	// Name of the configuration file
+	viper.SetConfigName("config")
+
+	// Where to find the config file
+	if configFile == "" {
+		viper.AddConfigPath(".")
+	} else {
+		stat, err := os.Stat(configFile)
+		if os.IsNotExist(err) {
+			// provided config file does not exist, so we add the path instead
+			viper.AddConfigPath(configFile)
+		} else if err == nil && stat.IsDir() {
+			// adds the path to look for the config file
+			viper.AddConfigPath(configFile)
+		} else if err == nil {
+			// directly sets the config file
+			viper.SetConfigFile(configFile)
+		} else {
+			// if we are here something went wrong
+			log.WithError(err).Warn("config.go: os.Stat(" + configFile + ") error")
+			// adding the path nonetheless because it's not hurting
+			viper.AddConfigPath(configFile)
+		}
+	}
+
+	// Env variables need to be prefixed with "DISCUSS_"; viper appends the "_" itself
+	viper.SetEnvPrefix("DISCUSS")
+
+	// Automatically picks up the matching env variables
+	viper.AutomaticEnv()
+
+	// Reads the config
+	err := viper.ReadInConfig()
+	if err != nil {
+		log.WithError(err).Fatal("Config: Error parsing config file.")
+	}
+	log.Debug("Config: Config file used: " + viper.ConfigFileUsed())
+
+	c.setsConfig()
+}
+
+// setsConfig copies the values resolved by viper into the config struct.
+func (c *Config) setsConfig() {
+	c.DBDriver = viper.GetString("DB_Driver")
+	c.DBHost = viper.GetString("DB_Host")
+	c.DBPort = viper.GetString("DB_Port")
+	c.DBUser = viper.GetString("DB_User")
+	c.DBPassword = viper.GetString("DB_Password")
+	c.DBDBName = viper.GetString("DB_DBName")
+	c.DBOptions = viper.GetString("DB_Options")
+
+	c.UserAgent = viper.GetString("UserAgent")
+	c.Delay = viper.GetInt("Delay")
+
+	c.AccessToken = viper.GetString("AccessToken")
+
+	c.Debug = viper.GetBool("Debug")
+}
diff --git a/src/main.go b/src/main.go
new file mode 100644
index 0000000..8596cd1
--- /dev/null
+++ b/src/main.go
@@ -0,0 +1,46 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"io/ioutil"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/PuerkitoBio/goquery"
+	"github.com/anikhasibul/queue"
+	"github.com/jmoiron/sqlx"
+	log "github.com/sirupsen/logrus"
+)
+
+type App struct {
+	Config *Config
+	DB     *sqlx.DB
+	Now    time.Time
+}
+
+func main() {
+	var err error
+	// Work on a private copy of _conf (declared outside this function), then reset the original.
+	ownConf := _conf
+	app := App{Config: &ownConf}
+	_conf = Config{}
+
+	app.Now = time.Now()
+
+	log.Debugf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)
+
+	app.DB, err = sqlx.Connect(app.Config.DBDriver, fmt.Sprintf("%s:%s@tcp(%s:%s)/%s?%s", app.Config.DBUser, app.Config.DBPassword, app.Config.DBHost, app.Config.DBPort, app.Config.DBDBName, app.Config.DBOptions))
+	if err != nil {
+		log.WithError(err).Fatal("Cannot connect to database")
+	}
+	if err = app.DB.Ping(); err != nil {
+		log.WithError(err).Fatal("No connection to database")
+	}
+	defer app.DB.Close()
+}
-- 
cgit v1.2.3