summary | refs | log | tree | commit | diff
path: root/crawler
diff options
context:
space:
mode:
author: horus, 2018-02-19 21:21:40 +0100
committer: horus, 2018-02-19 21:21:40 +0100
commit: f95995af8364ab9fd7106c050455e2d1f71c7ecd (patch)
tree: 01f657ad4bc98aa52b702d2ea547f0c9df159a66 /crawler
parent: 11fc8bd809d3362fd9be5e5e9bc17c74f538fc07 (diff)
download: alkobote-f95995af8364ab9fd7106c050455e2d1f71c7ecd.tar.gz
Improves sanitizing. (crawler)
Diffstat (limited to 'crawler')
-rw-r--r-- crawler/database.go | 2
-rw-r--r-- crawler/main.go | 6
-rw-r--r-- crawler/sanitize.go | 36
3 files changed, 39 insertions, 5 deletions
diff --git a/crawler/database.go b/crawler/database.go
index 0d495c8..01813e0 100644
--- a/crawler/database.go
+++ b/crawler/database.go
@@ -62,7 +62,7 @@ func (app *App) createTables() error {
view_query := `CREATE OR REPLACE VIEW ` + v + `_view AS
SELECT
- angebot.id, angebot.name, angebot.abv, angebot.volume, angebot.url as long_url, angebot.short_url as url, spirit_type,
+ angebot.id, angebot.name, age, angebot.abv, angebot.volume, angebot.url as long_url, angebot.short_url as url, spirit_type,
original_price/100 as original_price, discounted_price/100 as discounted_price, angebot.base_price/100 as base_price, image_url,
shop.name as shop, shop.id as shop_id, shop.short_url as shop_url, shop.shipping_costs/100 as shipping_costs, shop.free_shipping,
ROUND(100-((discounted_price/original_price)*100)) AS procent, created_at
diff --git a/crawler/main.go b/crawler/main.go
index 573a718..cbbbf1b 100644
--- a/crawler/main.go
+++ b/crawler/main.go
@@ -66,13 +66,11 @@ func main() {
app.Now = time.Now().Unix()
if "sqlite3" == app.Config.DBDriver {
- //app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBPath)
app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBPath)
} else {
- if app.Config.Debug {
- log.Debug(app.Config.DBUser + ":" + app.Config.DBPassword + "@tcp(" + app.Config.DBHost + ":" + app.Config.DBPort + ")/" + app.Config.DBDBName + app.Config.DBOptions)
- }
+ log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions))
+
app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+app.Config.DBOptions)
}
if err != nil {
diff --git a/crawler/sanitize.go b/crawler/sanitize.go
index 26d254d..346100a 100644
--- a/crawler/sanitize.go
+++ b/crawler/sanitize.go
@@ -71,6 +71,28 @@ func sanitize_name(name string) string {
name = strings.Replace(name, "Years Old", "Jahre", 1)
}
+ if strings.Contains(name, " Anos ") {
+ name = strings.Replace(name, " Anos ", " Jahre ", 1)
+ }
+
+ if strings.Contains(name, " anos ") {
+ name = strings.Replace(name, " anos ", " Jahre ", 1)
+ }
+
+ r_J, err := regexp.Compile(`[0-9]+(\s)*J(\s|-)`)
+ if err != nil {
+ Fatal(err, "sanitize_name: J-Regexp (J für Jahr) failed")
+ }
+ age_noisy := r_J.FindString(name)
+ if age_noisy != "" {
+ r_number, err := regexp.Compile(`[0-9]+`)
+ if err != nil {
+ Fatal(err, "sanitize_name: Number-Regexp failed")
+ }
+ age := r_number.FindString(age_noisy)
+ name = strings.Replace(name, age_noisy, age+" Jahre ", 1)
+ }
+
r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`)
if err != nil {
Fatal(err, "sanitize_name: Liter-Regexp failed")
@@ -128,6 +150,20 @@ func sanitize_name(name string) string {
name = strings.TrimSpace(name)
}
+ // removes redundant white spaces
+ r_ws, err := regexp.Compile(`\s(\s)+`)
+ if err != nil {
+ Fatal(err, "sanitize_name: White Space-Regexp failed")
+ }
+ for {
+ ws := r_ws.FindString(name)
+ if ws == "" {
+ break
+ }
+ name = strings.Replace(name, ws, " ", -1)
+ name = strings.TrimSpace(name)
+ }
+
return name
}