From f95995af8364ab9fd7106c050455e2d1f71c7ecd Mon Sep 17 00:00:00 2001 From: horus Date: Mon, 19 Feb 2018 21:21:40 +0100 Subject: Improves sanitizing. (crawler) --- crawler/sanitize.go | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'crawler/sanitize.go') diff --git a/crawler/sanitize.go b/crawler/sanitize.go index 26d254d..346100a 100644 --- a/crawler/sanitize.go +++ b/crawler/sanitize.go @@ -71,6 +71,28 @@ func sanitize_name(name string) string { name = strings.Replace(name, "Years Old", "Jahre", 1) } + if strings.Contains(name, " Anos ") { + name = strings.Replace(name, " Anos ", " Jahre ", 1) + } + + if strings.Contains(name, " anos ") { + name = strings.Replace(name, " anos ", " Jahre ", 1) + } + + r_J, err := regexp.Compile(`[0-9]+(\s)*J(\s|-)`) + if err != nil { + Fatal(err, "sanitize_name: J-Regexp (J für Jahr) failed") + } + age_noisy := r_J.FindString(name) + if age_noisy != "" { + r_number, err := regexp.Compile(`[0-9]+`) + if err != nil { + Fatal(err, "sanitize_name: Number-Regexp failed") + } + age := r_number.FindString(age_noisy) + name = strings.Replace(name, age_noisy, age+" Jahre ", 1) + } + r_liter, err := regexp.Compile(`[0-9]+([,.][0-9]+)?( )?[lL](iter)?`) if err != nil { Fatal(err, "sanitize_name: Liter-Regexp failed") @@ -128,6 +150,20 @@ func sanitize_name(name string) string { name = strings.TrimSpace(name) } + // removes redundant white spaces + r_ws, err := regexp.Compile(`\s(\s)+`) + if err != nil { + Fatal(err, "sanitize_name: White Space-Regexp failed") + } + for { + ws := r_ws.FindString(name) + if ws == "" { + break + } + name = strings.Replace(name, ws, " ", -1) + name = strings.TrimSpace(name) + } + return name } -- cgit v1.2.3