summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordev2026-06-24 01:41:31 +0200
committerdev2026-06-24 01:41:31 +0200
commit2e3e5b3efc6a8d9471a73c5553f88fa94e28bd3a (patch)
tree6b2cebd45eb1b7068f35d737c6f326e9852bdbf5
downloadhnimdbbot-2e3e5b3efc6a8d9471a73c5553f88fa94e28bd3a.tar.gz
Initial commit
-rw-r--r--.gitignore2
-rw-r--r--imdbdata/source.txt1
-rw-r--r--schema/schema.sql113
-rw-r--r--src/config.go97
-rw-r--r--src/main.go46
5 files changed, 259 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0fada4c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.tsv
+src/hnimdbbot
diff --git a/imdbdata/source.txt b/imdbdata/source.txt
new file mode 100644
index 0000000..92659a1
--- /dev/null
+++ b/imdbdata/source.txt
@@ -0,0 +1 @@
+https://datasets.imdbws.com/
diff --git a/schema/schema.sql b/schema/schema.sql
new file mode 100644
index 0000000..b643cc7
--- /dev/null
+++ b/schema/schema.sql
@@ -0,0 +1,113 @@
+-- Session setup executed before the table definitions below.
+SET NAMES utf8;
+-- Use a zero UTC offset for the session.
+SET time_zone = '+00:00';
+-- Foreign key checks are switched off because tables below reference tables
+-- that are only created later in this file (e.g. `genre` references `imdb`).
+SET foreign_key_checks = 0;
+SET sql_mode = 'NO_AUTO_VALUE_ON_ZERO';
+
+-- Supersedes the earlier `SET NAMES utf8`; all tables below use utf8mb4.
+SET NAMES utf8mb4;
+
+-- A named genre attached to one row of `imdb`.
+-- Note: `imdb_id` here is the integer primary key `imdb`.`id`, not the
+-- varchar column `imdb`.`imdb_id`.
+CREATE TABLE `genre` (
+ `id` int(11) NOT NULL AUTO_INCREMENT,
+ `imdb_id` int(11) NOT NULL,
+ `name` varchar(255) NOT NULL,
+ PRIMARY KEY (`id`),
+ KEY `imdb_id` (`imdb_id`),
+ CONSTRAINT `genre_ibfk_1` FOREIGN KEY (`imdb_id`) REFERENCES `imdb` (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+
+-- Title metadata attached to one row of `links` (via `links_id`).
+-- NOTE(review): column names such as `title_type`, `primary_title`,
+-- `runtime_minutes` and `num_votes` presumably mirror the IMDb dataset
+-- fields; confirm against the importer.
+CREATE TABLE `imdb` (
+ `id` int(11) NOT NULL AUTO_INCREMENT,
+ `links_id` int(11) NOT NULL,
+ -- String identifier, distinct from the integer primary key `id`.
+ `imdb_id` varchar(255) NOT NULL,
+ `wiki_article` varchar(255) DEFAULT NULL,
+ `synopsis` text DEFAULT NULL,
+ `year` int(11) DEFAULT NULL,
+ `poster_url` text DEFAULT NULL,
+ `title_type` varchar(255) DEFAULT NULL,
+ `primary_title` varchar(255) DEFAULT NULL,
+ `original_title` varchar(255) DEFAULT NULL,
+ `start_year` int(11) DEFAULT NULL,
+ `runtime_minutes` int(11) DEFAULT NULL,
+ -- Non-negative, one decimal place (0.0 - 99.9).
+ `average_rating` decimal(3,1) unsigned DEFAULT NULL,
+ `num_votes` int(11) DEFAULT NULL,
+ PRIMARY KEY (`id`),
+ KEY `links_id` (`links_id`),
+ CONSTRAINT `imdb_ibfk_1` FOREIGN KEY (`links_id`) REFERENCES `links` (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+
+-- A URL record attached to one row of `story` (via `story_id`).
+CREATE TABLE `links` (
+ `id` int(11) NOT NULL AUTO_INCREMENT,
+ -- Set once on insert. It must not carry ON UPDATE, otherwise every row
+ -- update would overwrite the creation time.
+ `created_at` timestamp NOT NULL DEFAULT current_timestamp(),
+ -- Refreshed by the server whenever the row is updated.
+ `updated_at` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
+ `story_id` int(11) NOT NULL,
+ `url` varchar(1000) DEFAULT NULL,
+ `field` int(11) NOT NULL,
+ `host` varchar(255) DEFAULT NULL,
+ `param` varchar(255) DEFAULT NULL,
+ `type` varchar(255) DEFAULT NULL,
+ PRIMARY KEY (`id`),
+ KEY `story_id` (`story_id`),
+ CONSTRAINT `links_ibfk_1` FOREIGN KEY (`story_id`) REFERENCES `story` (`id`)
+) ENGINE=InnoDB AUTO_INCREMENT=407448 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+
+-- Stores a `max_story_id` that must exist in `story`.`story_id`.
+-- NOTE(review): presumably the crawler's high-water mark; confirm against
+-- the code that writes this table.
+CREATE TABLE `max_item` (
+ `id` int(11) NOT NULL AUTO_INCREMENT,
+ `max_story_id` int(11) NOT NULL,
+ -- Set once on insert. It must not carry ON UPDATE, otherwise every row
+ -- update would overwrite the creation time.
+ `created_at` timestamp NOT NULL DEFAULT current_timestamp(),
+ -- Refreshed by the server whenever the row is updated.
+ `updated_at` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
+ PRIMARY KEY (`id`),
+ UNIQUE KEY `max_story_id` (`max_story_id`),
+ CONSTRAINT `max_item_ibfk_1` FOREIGN KEY (`max_story_id`) REFERENCES `story` (`story_id`)
+) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+
+-- Person names; referenced from `who`.`people_id`.
+CREATE TABLE `people` (
+ `id` int(11) NOT NULL AUTO_INCREMENT,
+ `name` varchar(255) NOT NULL,
+ PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+
+-- Lookup table of professions; referenced from `who`.`profession_id`.
+-- AUTO_INCREMENT starts at 4 because ids 1-3 are seeded right below.
+CREATE TABLE `profession` (
+ `id` int(11) NOT NULL AUTO_INCREMENT,
+ `name` varchar(255) NOT NULL,
+ PRIMARY KEY (`id`)
+) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+-- Seed rows with fixed ids.
+INSERT INTO `profession` (`id`, `name`) VALUES
+(1, 'actor'),
+(2, 'director'),
+(3, 'screenwriter');
+
+-- One stored story. `id` is the local primary key; `story_id` is a separate
+-- unique identifier that `max_item` references.
+-- NOTE(review): `score`, `descendants`, `time` and `poster` presumably mirror
+-- the upstream item fields (with `time` as a Unix timestamp); confirm against
+-- the crawler.
+CREATE TABLE `story` (
+ `id` int(11) NOT NULL AUTO_INCREMENT,
+ `story_id` int(11) NOT NULL,
+ -- Set once on insert. It must not carry ON UPDATE, otherwise every row
+ -- update would overwrite the creation time.
+ `created_at` timestamp NOT NULL DEFAULT current_timestamp(),
+ -- Refreshed by the server whenever the row is updated.
+ `updated_at` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
+ `type` varchar(255) NOT NULL,
+ `title` varchar(255) NOT NULL,
+ `text` text DEFAULT NULL,
+ `score` int(11) NOT NULL,
+ `descendants` int(11) NOT NULL,
+ `time` int(11) NOT NULL,
+ `poster` varchar(255) NOT NULL,
+ PRIMARY KEY (`id`),
+ UNIQUE KEY `story_id` (`story_id`)
+) ENGINE=InnoDB AUTO_INCREMENT=366974 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
+
+
+-- Association table: each row ties one `imdb` row to one `people` row in one
+-- `profession`.
+CREATE TABLE `who` (
+ `id` int(11) NOT NULL AUTO_INCREMENT,
+ -- Integer primary key `imdb`.`id`, not the varchar `imdb`.`imdb_id`.
+ `imdb_id` int(11) NOT NULL,
+ `people_id` int(11) NOT NULL,
+ `profession_id` int(11) NOT NULL,
+ PRIMARY KEY (`id`),
+ KEY `imdb_id` (`imdb_id`),
+ KEY `people_id` (`people_id`),
+ KEY `profession_id` (`profession_id`),
+ CONSTRAINT `who_ibfk_1` FOREIGN KEY (`imdb_id`) REFERENCES `imdb` (`id`),
+ CONSTRAINT `who_ibfk_2` FOREIGN KEY (`people_id`) REFERENCES `people` (`id`),
+ CONSTRAINT `who_ibfk_3` FOREIGN KEY (`profession_id`) REFERENCES `profession` (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
diff --git a/src/config.go b/src/config.go
new file mode 100644
index 0000000..41ea55a
--- /dev/null
+++ b/src/config.go
@@ -0,0 +1,97 @@
+package main
+
+import (
+ "os"
+
+ log "github.com/sirupsen/logrus"
+ "github.com/spf13/viper"
+)
+
+// Config holds the settings that setsConfig copies out of viper.
+type Config struct {
+	// Database connection settings; main joins them into the connection string.
+	DBDriver, DBDBName, DBHost, DBPort string
+	DBUser, DBPassword, DBOptions      string
+
+	// UserAgent and Delay are read from the "UserAgent" and "Delay" keys.
+	// NOTE(review): presumably used for outgoing HTTP requests; the unit of
+	// Delay is not visible here - confirm against the caller.
+	UserAgent string
+	Delay     int
+
+	// AccessToken is read from the "AccessToken" key.
+	AccessToken string
+
+	// Debug sets the log level to debug.
+	Debug bool
+}
+
+// parseConfig registers defaults, loads the configuration through viper and
+// copies the result into c.
+//
+// configFile may be empty (the current directory is searched for a file
+// named "config"), a directory to search, or the path of a config file. A
+// path that does not exist or cannot be stat'ed is still added as a search
+// path. Matching environment variables prefixed with "DISCUSS_" are picked
+// up as well. The process exits via log.Fatalf when no config can be read.
+func (c *Config) parseConfig(configFile string) {
+	viper.SetDefault("DB_Driver", "mysql")
+	viper.SetDefault("DB_DBName", "hncrawler")
+	viper.SetDefault("DB_Host", "localhost")
+	viper.SetDefault("DB_Port", "3306")
+
+	viper.SetDefault("Debug", false)
+	viper.SetDefault("Delay", 0)
+
+	viper.SetDefault("UserAgent", "pure cinema - mostdiscussed.com")
+
+	// Name of the configuration file (without extension).
+	viper.SetConfigName("config")
+
+	// Where to find the config file.
+	if configFile == "" {
+		viper.AddConfigPath(".")
+	} else {
+		stat, err := os.Stat(configFile)
+		switch {
+		case err == nil && !stat.IsDir():
+			// An existing regular file: use it directly.
+			viper.SetConfigFile(configFile)
+		case err == nil, os.IsNotExist(err):
+			// A directory, or a path that does not exist: treat it as a
+			// search path.
+			viper.AddConfigPath(configFile)
+		default:
+			// Stat failed for another reason. Report it, but still add the
+			// path because doing so is harmless.
+			log.Warnf("config.go: os.Stat(%s) error: %v", configFile, err)
+			viper.AddConfigPath(configFile)
+		}
+	}
+
+	// viper upper-cases the prefix and appends "_" itself, so it is given
+	// without a trailing underscore; variables then look like DISCUSS_DB_HOST
+	// rather than DISCUSS__DB_HOST.
+	viper.SetEnvPrefix("DISCUSS")
+
+	// Automatically look up matching environment variables.
+	viper.AutomaticEnv()
+
+	// Read the config.
+	if err := viper.ReadInConfig(); err != nil {
+		log.Fatalf("Config: Error parsing config file: %v", err)
+	}
+	log.Debug("Config: Config file used: " + viper.ConfigFileUsed())
+
+	c.setsConfig()
+}
+
+// setsConfig copies the values currently held by viper into the fields of c.
+func (c *Config) setsConfig() {
+	str := viper.GetString
+
+	// Database connection settings.
+	c.DBDriver = str("DB_Driver")
+	c.DBHost = str("DB_Host")
+	c.DBPort = str("DB_Port")
+	c.DBUser = str("DB_User")
+	c.DBPassword = str("DB_Password")
+	c.DBDBName = str("DB_DBName")
+	c.DBOptions = str("DB_Options")
+
+	// Remaining settings.
+	c.UserAgent = str("UserAgent")
+	c.Delay = viper.GetInt("Delay")
+	c.AccessToken = str("AccessToken")
+	c.Debug = viper.GetBool("Debug")
+}
diff --git a/src/main.go b/src/main.go
new file mode 100644
index 0000000..8596cd1
--- /dev/null
+++ b/src/main.go
@@ -0,0 +1,46 @@
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "html"
+ "io/ioutil"
+ "net/http"
+ "net/url"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/PuerkitoBio/goquery"
+ "github.com/anikhasibul/queue"
+ "github.com/jmoiron/sqlx"
+ log "github.com/sirupsen/logrus"
+)
+
+// App bundles the state that main sets up: the configuration, the database
+// handle and the start time.
+type App struct {
+ // Config points at main's private copy of the package-level configuration.
+ Config *Config
+ // DB is the handle returned by sqlx.Connect in main.
+ DB *sqlx.DB
+ // Now is set to time.Now() once in main.
+ Now time.Time
+}
+
+// main copies the package-level _conf into an App, connects to the
+// configured database and verifies the connection. Any connection failure
+// terminates the process through log.Fatalf.
+func main() {
+	// Take a private copy of the package-level configuration, then clear the
+	// global.
+	ownConf := _conf
+	app := App{Config: &ownConf}
+	_conf = Config{}
+
+	app.Now = time.Now()
+
+	cfg := app.Config
+	// The password is deliberately left out of the debug message.
+	log.Debugf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, cfg.DBDriver, cfg.DBDBName, cfg.DBUser, cfg.DBHost, cfg.DBPort, cfg.DBOptions)
+
+	// Connection string of the form user:password@tcp(host:port)/dbname?options.
+	dsn := fmt.Sprintf("%s:%s@tcp(%s:%s)/%s?%s", cfg.DBUser, cfg.DBPassword, cfg.DBHost, cfg.DBPort, cfg.DBDBName, cfg.DBOptions)
+
+	var err error
+	app.DB, err = sqlx.Connect(cfg.DBDriver, dsn)
+	if err != nil {
+		log.Fatalf("Cannot connect to database: %v", err)
+	}
+
+	if err = app.DB.Ping(); err != nil {
+		log.Fatalf("No connection to database: %v", err)
+	}
+	// Registered only after the fatal checks, so no pending defer is skipped
+	// by the process exit in log.Fatalf.
+	defer app.DB.Close()
+}