diff options
| author | admin | 2024-09-13 20:04:17 +0200 |
|---|---|---|
| committer | admin | 2024-09-13 20:04:17 +0200 |
| commit | b190512e951efdd1ed4642eed8726bf7bdf2c022 (patch) | |
| tree | 635c5ca837aaf48ddcfbc6c65c1055c7402f84f4 | |
| parent | 6b091dc7ab2c4fdaed0675ab57ea05e4ddb81e5b (diff) | |
| download | hncrawler-b190512e951efdd1ed4642eed8726bf7bdf2c022.tar.gz | |
changed xurl to goquery
| -rw-r--r-- | go.mod | 23 | ||||
| -rw-r--r-- | go.sum | 55 | ||||
| -rw-r--r-- | helper.go | 112 | ||||
| -rw-r--r-- | main.go | 82 |
4 files changed, 157 insertions, 115 deletions
@@ -3,9 +3,9 @@ module hn-crawler go 1.19 require ( + github.com/PuerkitoBio/goquery v1.8.1 github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde github.com/go-sql-driver/mysql v1.7.1 - github.com/gocolly/colly v1.2.0 github.com/jmoiron/sqlx v1.3.5 github.com/sirupsen/logrus v1.9.3 github.com/spf13/pflag v1.0.5 @@ -13,38 +13,25 @@ require ( ) require ( - github.com/PuerkitoBio/goquery v1.8.1 // indirect github.com/andybalholm/cascadia v1.3.1 // indirect - github.com/antchfx/htmlquery v1.3.0 // indirect - github.com/antchfx/xmlquery v1.3.18 // indirect - github.com/antchfx/xpath v1.2.4 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect - github.com/gobwas/glob v0.2.3 // indirect - github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect - github.com/golang/protobuf v1.5.3 // indirect github.com/hashicorp/hcl v1.0.0 // indirect - github.com/kennygrant/sanitize v1.2.4 // indirect github.com/magiconair/properties v1.8.7 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect - github.com/mvdan/xurls v1.1.0 // indirect github.com/pelletier/go-toml/v2 v2.1.0 // indirect + github.com/rogpeppe/go-internal v1.10.0 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect - github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spf13/afero v1.11.0 // indirect github.com/spf13/cast v1.6.0 // indirect github.com/subosito/gotenv v1.6.0 // indirect - github.com/temoto/robotstxt v1.1.2 // indirect go.uber.org/atomic v1.9.0 // indirect go.uber.org/multierr v1.9.0 // indirect golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect - golang.org/x/net v0.19.0 // indirect - golang.org/x/sys v0.15.0 // indirect - golang.org/x/text v0.14.0 // indirect - google.golang.org/appengine v1.6.7 // indirect - google.golang.org/protobuf v1.31.0 // indirect + golang.org/x/net v0.29.0 // indirect + golang.org/x/sys v0.25.0 // indirect + golang.org/x/text v0.18.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - mvdan.cc/xurls/v2 v2.5.0 // indirect ) @@ -4,13 +4,6 @@ github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x0 github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde h1:xcvoK8AzKQi2TR/lgV5lcG3PcDU4T3F8hN75Ou3KZ6w= github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde/go.mod h1:CZ177vKofY/zZG0s1KUJQflzzEWlceyyqW8RRpyMqfs= -github.com/antchfx/htmlquery v1.3.0 h1:5I5yNFOVI+egyia5F2s/5Do2nFWxJz41Tr3DyfKD25E= -github.com/antchfx/htmlquery v1.3.0/go.mod h1:zKPDVTMhfOmcwxheXUsx4rKJy8KEY/PU6eXr/2SebQ8= -github.com/antchfx/xmlquery v1.3.18 h1:FSQ3wMuphnPPGJOFhvc+cRQ2CT/rUj4cyQXkJcjOwz0= -github.com/antchfx/xmlquery v1.3.18/go.mod h1:Afkq4JIeXut75taLSuI31ISJ/zeq+3jG7TunF7noreA= -github.com/antchfx/xpath v1.2.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= -github.com/antchfx/xpath v1.2.4 h1:dW1HB/JxKvGtJ9WyVGJ0sIoEcqftV3SqIstujI+B9XY= -github.com/antchfx/xpath v1.2.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= @@ -20,24 +13,11 @@ github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyT github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= -github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= -github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= -github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= -github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= -github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/jmoiron/sqlx v1.3.5 h1:vFFPA71p1o5gAeqtEAwLU4dnX2napprKtHr7PYIcN3g= github.com/jmoiron/sqlx v1.3.5/go.mod h1:nRVWtLre0KfCLJvgxzCsLVMogSvQ1zNJtpYr2Ccp0mQ= -github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= -github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/lib/pq v1.2.0 h1:LXpIM/LZ5xGFhOpXAQUIMM1HdyqzVYM13zNdjCEEcA0= @@ -48,20 +28,16 @@ github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRU github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= -github.com/mvdan/xurls v1.1.0 h1:OpuDelGQ1R1ueQ6sSryzi6P+1RtBpfQHM8fJwlE45ww= -github.com/mvdan/xurls v1.1.0/go.mod h1:tQlNn3BED8bE/15hnSL2HLkDeLWpNPAwtw7wkEq44oU= github.com/pelletier/go-toml/v2 v2.1.0 h1:FnwAJ4oYMvbT/34k9zzHuZNrhlz48GB3/s6at6/MHO4= github.com/pelletier/go-toml/v2 v2.1.0/go.mod h1:tJU2Z3ZkXwnxa4DPO899bsyIoywizdUvyaeZurnPPDc= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= -github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ= github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4= github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= -github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= -github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= @@ -85,8 +61,6 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= -github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= -github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= @@ -97,15 +71,13 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= -golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= +golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= +golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -115,34 +87,23 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= -golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= +golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= +golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= -google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= -google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= @@ -150,5 +111,3 @@ gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8= -mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE= @@ -5,6 +5,7 @@ import ( "strings" log "github.com/sirupsen/logrus" "regexp" + xhtml "golang.org/x/net/html" ) func stripHNPrefix(title string) string { @@ -32,6 +33,10 @@ func _removeParam(url, key string) string { } func normalizeUrl(url string) string { + + /** + * Redirect http:// to https:// + */ match, err := regexp.MatchString("^http://", url) if err != nil { log.Fatal(err) @@ -42,7 +47,10 @@ func normalizeUrl(url string) string { url = r.ReplaceAllString(url, "https://") } - // add missing https:// if no scheme + /** + * add missing https:// if no scheme + * Fun fact: https://news.ycombinator.com/item?id=27351340 broke this part + */ u, err := _url.Parse(url) if err != nil { log.Fatal(err) @@ -56,7 +64,9 @@ func normalizeUrl(url string) string { } } - + /** + * Apple TV accepts youtube:// scheme + */ match, err = regexp.MatchString("youtube://", url) if err != nil { log.Fatal(err) @@ -66,6 +76,9 @@ func normalizeUrl(url string) string { url = r.ReplaceAllString(url, "https://") } + /** + * Redirect youtu.be to desktop version + */ match, err = regexp.MatchString("youtu.be/", url) if err != nil { log.Fatal(err) @@ -73,12 +86,6 @@ func normalizeUrl(url string) string { if match { log.Debug("normalize: ", "youtu.be ", url) - /** - * remove tracking param "si" - */ - url = _removeParam(url, "si") - url = _removeParam(url, "feature") - u, err := _url.Parse(url) if err != nil { log.Fatal(err) @@ -96,6 +103,9 @@ func normalizeUrl(url string) string { //url = r.ReplaceAllString(url, "youtube.com/watch?v=") } + /** + * Redirect m.youtube.com to desktop version + */ match, err = regexp.MatchString("/m.youtube.com/", url) if err != nil { log.Fatal(err) @@ -107,12 +117,14 @@ func normalizeUrl(url string) string { * remove tracking param "si" */ url = _removeParam(url, "si") - url = _removeParam(url, "feature") r := regexp.MustCompile("/m.youtube.com/") url = r.ReplaceAllString(url, "/www.youtube.com/") } + /** + * Redirect m.imdb.com to desktop version + */ match, err = regexp.MatchString("/m.imdb.com/", url) if err != nil { log.Fatal(err) @@ -120,26 +132,56 @@ func normalizeUrl(url string) string { if match { log.Debug("normalize: ", "m.imdb.com ", url) + r := regexp.MustCompile("/m.imdb.com/") + url = r.ReplaceAllString(url, "/www.imdb.com") + } + + /** + * Append www. to normalize URL. exclude relative URLs starting with // since this is not recognized by Go + * Screw that, wierd edge case. Someone pasted a + */ + u, err = _url.Parse(url) + if err != nil { + log.Fatal(err) + } + + if ! strings.HasPrefix(u.Host, "www.") { + u.Host = "www." + u.Host + } + + url = u.String() + + /** + * Redirects youtube.com/c/<name> to youtube.com/@<name> + */ + match, err = regexp.MatchString("/.youtube.com/c/", url) + if err != nil { + log.Fatal(err) + } + if match { + log.Debug("normalize: ", "youtube.com/c/ -> @ ", url) + /** * remove tracking param "si" */ url = _removeParam(url, "si") - url = _removeParam(url, "feature") - r := regexp.MustCompile("/m.imdb.com/") - url = r.ReplaceAllString(url, "/www.imdb.com") + r := regexp.MustCompile("youtube.com/c/") + url = r.ReplaceAllString(url, "youtube.com/@") } - /* - match, err = regexp.MatchString("m.wikipedia.org", url) + /** + * remove tracking param "si", "feature" and "pp" from every youtube video + */ + match, err = regexp.MatchString("/www.youtube.com/", url) if err != nil { log.Fatal(err) } if match { - r := regexp.MustCompile("m.wikipedia.org") - url = r.ReplaceAllString(url, "wikipedia.org") + url = _removeParam(url, "si") + url = _removeParam(url, "pp") + url = _removeParam(url, "feature") } - */ /** * remove tracking utm_ params @@ -150,19 +192,31 @@ func normalizeUrl(url string) string { url = _removeParam(url, "utm_term") url = _removeParam(url, "utm_content") - u, err = _url.Parse(url) - if err != nil { - log.Fatal(err) - } + return url +} - /** - * Append www. to normalize URL. exclude relative URLs starting with // since this is not recognized by Go - * Screw that, wierd edge case. Someone pasted a - */ - if ! strings.HasPrefix(u.Host, "www.") { - u.Host = "www." + u.Host +func RemoveNode(root_node *xhtml.Node, remove_me *xhtml.Node) { + found_node := false + check_nodes := make(map[int]*xhtml.Node) + i := 0 + + // loop through siblings + for n := root_node.FirstChild; n != nil; n = n.NextSibling { + if n == remove_me { + found_node = true + n.Parent.RemoveChild(n) } - url = u.String() - return url + check_nodes[i] = n + i++ + } + + // check if removing node is found + // if yes no need to check childs returning + // if no continue loop through childs and so on + if found_node == false { + for _, item := range check_nodes { + RemoveNode(item, remove_me) + } + } } @@ -15,7 +15,7 @@ import ( "github.com/anikhasibul/queue" "github.com/jmoiron/sqlx" log "github.com/sirupsen/logrus" - "mvdan.cc/xurls/v2" + "github.com/PuerkitoBio/goquery" ) type App struct { @@ -63,9 +63,11 @@ func (app *App) walkDown() { //var err error //max_item := getMaxItem() + //max_item := 27351341 + max_item := 27262623 //max_item := 41495306 //max_item := 36128477 - max_item := 32670334 + //max_item := 32670334 //max_item := 41231601 //max_item := 41165987 //max_item := 41136898 @@ -79,7 +81,8 @@ func (app *App) walkDown() { //max_item := 15038031 //max_item := 14450000 - const maxRoutines = 200 + const maxRoutines = 400 + //const maxRoutines = 1 q := queue.New(maxRoutines) defer q.Close() @@ -281,32 +284,67 @@ func getStory(id int) (Story, bool) { /** * Parse all URLs in Story.Text + + log.Debugf("StoryID: %d\n", Story.Id) + log.Debugf("StoryID: %d\n", Story.Text) + */ + + /** + * This comment broke my code: + * https://news.ycombinator.com/item?id=27351340 */ - rxRelaxed := xurls.Relaxed() - rxLinks := rxRelaxed.FindAllString(html.UnescapeString(Story.Text), -1) + tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>"+Story.Text+"</html>")) + if err != nil { + log.Errorf("Failed to parse html: %s\n", err.Error()) + return Story, false + } + sel := tmpdoc.Find("html") + + // remove all found elements from selection + sel.Find("code").Each(func(i int, s *goquery.Selection) { + //log.Warnf("%+v\n", s.Get(0)) + RemoveNode(sel.Get(0), s.Get(0)) + }) + + tmphtml, err := sel.Html() + if err != nil { + log.Warn("Failed to generate html from selection: ", err.Error()) + } - for _, rxLink := range rxLinks { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(tmphtml)) + + if err != nil { + log.Errorf("Failed to parse html: %s\n", err.Error()) + return Story, false + } + + doc.Find("a").Each(func(i int, s *goquery.Selection) { + + l, ok := s.Attr("href") + + if ok { /** * Check for Youtube in text field */ - is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", rxLink) + is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l) if err != nil { - log.Errorf("Failed to parse and match regex: %s\n", err.Error()) - return Story, false + log.Fatal("Failed to parse and match regex: %s\n", err.Error()) + //log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + //return Story, false } if is_video { - if ! duplicates[rxLink] { + if ! duplicates[l] { var link Link - link.Url = normalizeUrl(rxLink) + link.Url = normalizeUrl(l) link.Field = 2 Story.Links = append(Story.Links, link) log.Info("match youtube text") log.Infof("%+v\n", Story) - duplicates[rxLink] = true + duplicates[l] = true } } @@ -314,27 +352,29 @@ func getStory(id int) (Story, bool) { /** * Check for movie platforms in text field */ - is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", rxLink) + is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l) if err != nil { - log.Errorf("Failed to parse and match regex: %s\n", err.Error()) - return Story, false + log.Fatal("Failed to parse and match regex: %s\n", err.Error()) + //log.Errorf("Failed to parse and match regex: %s\n", err.Error()) + //return Story, false } if is_movie { - if ! duplicates[rxLink] { + if ! duplicates[l] { var link Link - link.Url = normalizeUrl(rxLink) + link.Url = normalizeUrl(l) link.Field = 1 Story.Links = append(Story.Links, link) log.Info("match moview platform text") log.Infof("%+v\n", Story) - duplicates[rxLink] = true + duplicates[l] = true } - } - } + } + }) + //Story.Url = normalizeUrl(Story.Url) @@ -398,6 +438,8 @@ func getDetail(id int) Story { story.Text = html.UnescapeString(story.Text) + log.Debugf("StoryID: %d\n", story.Id) + return story } |
