diff options
| author | horus | 2018-02-19 21:21:40 +0100 |
|---|---|---|
| committer | horus | 2018-02-19 21:21:40 +0100 |
| commit | f95995af8364ab9fd7106c050455e2d1f71c7ecd (patch) | |
| tree | 01f657ad4bc98aa52b702d2ea547f0c9df159a66 /crawler | |
| parent | 11fc8bd809d3362fd9be5e5e9bc17c74f538fc07 (diff) | |
| download | alkobote-f95995af8364ab9fd7106c050455e2d1f71c7ecd.tar.gz | |
Improves sanitizing. (crawler)
Diffstat (limited to 'crawler')
| -rw-r--r-- | crawler/database.go | 2 | ||||
| -rw-r--r-- | crawler/main.go | 6 | ||||
| -rw-r--r-- | crawler/sanitize.go | 36 |
3 files changed, 39 insertions, 5 deletions
diff --git a/crawler/database.go b/crawler/database.go index 0d495c8..01813e0 100644 --- a/crawler/database.go +++ b/crawler/database.go @@ -62,7 +62,7 @@ func (app *App) createTables() error { view_query := `CREATE OR REPLACE VIEW ` + v + `_view AS SELECT - angebot.id, angebot.name, angebot.abv, angebot.volume, angebot.url as long_url, angebot.short_url as url, spirit_type, + angebot.id, angebot.name, age, angebot.abv, angebot.volume, angebot.url as long_url, angebot.short_url as url, spirit_type, original_price/100 as original_price, discounted_price/100 as discounted_price, angebot.base_price/100 as base_price, image_url, shop.name as shop, shop.id as shop_id, shop.short_url as shop_url, shop.shipping_costs/100 as shipping_costs, shop.free_shipping, ROUND(100-((discounted_price/original_price)*100)) AS procent, created_at diff --git a/crawler/main.go b/crawler/main.go index 573a718..cbbbf1b 100644 --- a/crawler/main.go +++ b/crawler/main.go @@ -66,13 +66,11 @@ func main() { app.Now = time.Now().Unix() if "sqlite3" == app.Config.DBDriver { - //app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBPath) app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBPath) } else { - if app.Config.Debug { - log.Debug(app.Config.DBUser + ":" + app.Config.DBPassword + "@tcp(" + app.Config.DBHost + ":" + app.Config.DBPort + ")/" + app.Config.DBDBName + app.Config.DBOptions) - } + log.Debug(fmt.Sprintf(`Connecting to "%s" database "%s" as user "%s" on host "%s:%s" with extra options "%s".`, app.Config.DBDriver, app.Config.DBDBName, app.Config.DBUser, app.Config.DBHost, app.Config.DBPort, app.Config.DBOptions)) + app.DB, err = sqlx.Connect(app.Config.DBDriver, app.Config.DBUser+":"+app.Config.DBPassword+"@tcp("+app.Config.DBHost+":"+app.Config.DBPort+")/"+app.Config.DBDBName+app.Config.DBOptions) } if err != nil { diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 26d254d..346100a 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -71,6 +71,28 @@ func sanitize_name(name string) string { name = strings.Replace(name, "Years Old", "Jahre", 1) } + if strings.Contains(name, " Anos ") { + name = strings.Replace(name, " Anos ", " Jahre ", 1) + } + + if strings.Contains(name, " anos ") { + name = strings.Replace(name, " anos ", " Jahre ", 1) + } + + r_J, err := regexp.Compile(`[0-9]+(\s)*J(\s|-)`) + if err != nil { + Fatal(err, "sanitize_name: J-Regexp (J für Jahr) failed") + } + age_noisy := r_J.FindString(name) + if age_noisy != "" { + r_number, err := regexp.Compile(`[0-9]+`) + if err != nil { + Fatal(err, "sanitize_name: Number-Regexp failed") + } + age := r_number.FindString(age_noisy) + name = strings.Replace(name, age_noisy, age+" Jahre ", 1) + } + r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`) if err != nil { Fatal(err, "sanitize_name: Liter-Regexp failed") @@ -128,6 +150,20 @@ func sanitize_name(name string) string { name = strings.TrimSpace(name) } + // removes redundant white spaces + r_ws, err := regexp.Compile(`\s(\s)+`) + if err != nil { + Fatal(err, "sanitize_name: White Space-Regexp failed") + } + for { + ws := r_ws.FindString(name) + if ws == "" { + break + } + name = strings.Replace(name, ws, " ", -1) + name = strings.TrimSpace(name) + } + return name } |
