summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoradmin2024-09-13 20:04:17 +0200
committeradmin2024-09-13 20:04:17 +0200
commitb190512e951efdd1ed4642eed8726bf7bdf2c022 (patch)
tree635c5ca837aaf48ddcfbc6c65c1055c7402f84f4
parent6b091dc7ab2c4fdaed0675ab57ea05e4ddb81e5b (diff)
downloadhncrawler-b190512e951efdd1ed4642eed8726bf7bdf2c022.tar.gz
changed xurl to goquery
-rw-r--r--go.mod23
-rw-r--r--go.sum55
-rw-r--r--helper.go112
-rw-r--r--main.go82
4 files changed, 157 insertions, 115 deletions
diff --git a/go.mod b/go.mod
index 85bdde6..ee6b4b9 100644
--- a/go.mod
+++ b/go.mod
@@ -3,9 +3,9 @@ module hn-crawler
go 1.19
require (
+ github.com/PuerkitoBio/goquery v1.8.1
github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde
github.com/go-sql-driver/mysql v1.7.1
- github.com/gocolly/colly v1.2.0
github.com/jmoiron/sqlx v1.3.5
github.com/sirupsen/logrus v1.9.3
github.com/spf13/pflag v1.0.5
@@ -13,38 +13,25 @@ require (
)
require (
- github.com/PuerkitoBio/goquery v1.8.1 // indirect
github.com/andybalholm/cascadia v1.3.1 // indirect
- github.com/antchfx/htmlquery v1.3.0 // indirect
- github.com/antchfx/xmlquery v1.3.18 // indirect
- github.com/antchfx/xpath v1.2.4 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
- github.com/gobwas/glob v0.2.3 // indirect
- github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
- github.com/golang/protobuf v1.5.3 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
- github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/magiconair/properties v1.8.7 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
- github.com/mvdan/xurls v1.1.0 // indirect
github.com/pelletier/go-toml/v2 v2.1.0 // indirect
+ github.com/rogpeppe/go-internal v1.10.0 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
- github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
github.com/spf13/afero v1.11.0 // indirect
github.com/spf13/cast v1.6.0 // indirect
github.com/subosito/gotenv v1.6.0 // indirect
- github.com/temoto/robotstxt v1.1.2 // indirect
go.uber.org/atomic v1.9.0 // indirect
go.uber.org/multierr v1.9.0 // indirect
golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
- golang.org/x/net v0.19.0 // indirect
- golang.org/x/sys v0.15.0 // indirect
- golang.org/x/text v0.14.0 // indirect
- google.golang.org/appengine v1.6.7 // indirect
- google.golang.org/protobuf v1.31.0 // indirect
+ golang.org/x/net v0.29.0 // indirect
+ golang.org/x/sys v0.25.0 // indirect
+ golang.org/x/text v0.18.0 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
- mvdan.cc/xurls/v2 v2.5.0 // indirect
)
diff --git a/go.sum b/go.sum
index 5fb070d..59f0a54 100644
--- a/go.sum
+++ b/go.sum
@@ -4,13 +4,6 @@ github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x0
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde h1:xcvoK8AzKQi2TR/lgV5lcG3PcDU4T3F8hN75Ou3KZ6w=
github.com/anikhasibul/queue v0.0.0-20190518110522-5d242d08bdde/go.mod h1:CZ177vKofY/zZG0s1KUJQflzzEWlceyyqW8RRpyMqfs=
-github.com/antchfx/htmlquery v1.3.0 h1:5I5yNFOVI+egyia5F2s/5Do2nFWxJz41Tr3DyfKD25E=
-github.com/antchfx/htmlquery v1.3.0/go.mod h1:zKPDVTMhfOmcwxheXUsx4rKJy8KEY/PU6eXr/2SebQ8=
-github.com/antchfx/xmlquery v1.3.18 h1:FSQ3wMuphnPPGJOFhvc+cRQ2CT/rUj4cyQXkJcjOwz0=
-github.com/antchfx/xmlquery v1.3.18/go.mod h1:Afkq4JIeXut75taLSuI31ISJ/zeq+3jG7TunF7noreA=
-github.com/antchfx/xpath v1.2.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
-github.com/antchfx/xpath v1.2.4 h1:dW1HB/JxKvGtJ9WyVGJ0sIoEcqftV3SqIstujI+B9XY=
-github.com/antchfx/xpath v1.2.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
@@ -20,24 +13,11 @@ github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyT
github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI=
github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI=
-github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
-github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
-github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
-github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
-github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
-github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
-github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
-github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
-github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
-github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
-github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
github.com/jmoiron/sqlx v1.3.5 h1:vFFPA71p1o5gAeqtEAwLU4dnX2napprKtHr7PYIcN3g=
github.com/jmoiron/sqlx v1.3.5/go.mod h1:nRVWtLre0KfCLJvgxzCsLVMogSvQ1zNJtpYr2Ccp0mQ=
-github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
-github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/lib/pq v1.2.0 h1:LXpIM/LZ5xGFhOpXAQUIMM1HdyqzVYM13zNdjCEEcA0=
@@ -48,20 +28,16 @@ github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRU
github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
-github.com/mvdan/xurls v1.1.0 h1:OpuDelGQ1R1ueQ6sSryzi6P+1RtBpfQHM8fJwlE45ww=
-github.com/mvdan/xurls v1.1.0/go.mod h1:tQlNn3BED8bE/15hnSL2HLkDeLWpNPAwtw7wkEq44oU=
github.com/pelletier/go-toml/v2 v2.1.0 h1:FnwAJ4oYMvbT/34k9zzHuZNrhlz48GB3/s6at6/MHO4=
github.com/pelletier/go-toml/v2 v2.1.0/go.mod h1:tJU2Z3ZkXwnxa4DPO899bsyIoywizdUvyaeZurnPPDc=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
-github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
+github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ=
github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4=
github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE=
github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ=
-github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
-github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo=
@@ -85,8 +61,6 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8=
github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU=
-github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
-github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE=
go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
@@ -97,15 +71,13 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y
golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g=
golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
-golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
-golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
-golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c=
-golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U=
+golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo=
+golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -115,34 +87,23 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc=
-golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
+golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
-golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
-golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
-golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
-golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224=
+golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c=
-google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
-google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
-google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
-google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
-google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA=
@@ -150,5 +111,3 @@ gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8=
-mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE=
diff --git a/helper.go b/helper.go
index af5f4c1..866ec5a 100644
--- a/helper.go
+++ b/helper.go
@@ -5,6 +5,7 @@ import (
"strings"
log "github.com/sirupsen/logrus"
"regexp"
+ xhtml "golang.org/x/net/html"
)
func stripHNPrefix(title string) string {
@@ -32,6 +33,10 @@ func _removeParam(url, key string) string {
}
func normalizeUrl(url string) string {
+
+ /**
+ * Redirect http:// to https://
+ */
match, err := regexp.MatchString("^http://", url)
if err != nil {
log.Fatal(err)
@@ -42,7 +47,10 @@ func normalizeUrl(url string) string {
url = r.ReplaceAllString(url, "https://")
}
- // add missing https:// if no scheme
+ /**
+ * add missing https:// if no scheme
+ * Fun fact: https://news.ycombinator.com/item?id=27351340 broke this part
+ */
u, err := _url.Parse(url)
if err != nil {
log.Fatal(err)
@@ -56,7 +64,9 @@ func normalizeUrl(url string) string {
}
}
-
+ /**
+ * Apple TV accepts youtube:// scheme
+ */
match, err = regexp.MatchString("youtube://", url)
if err != nil {
log.Fatal(err)
@@ -66,6 +76,9 @@ func normalizeUrl(url string) string {
url = r.ReplaceAllString(url, "https://")
}
+ /**
+ * Redirect youtu.be to desktop version
+ */
match, err = regexp.MatchString("youtu.be/", url)
if err != nil {
log.Fatal(err)
@@ -73,12 +86,6 @@ func normalizeUrl(url string) string {
if match {
log.Debug("normalize: ", "youtu.be ", url)
- /**
- * remove tracking param "si"
- */
- url = _removeParam(url, "si")
- url = _removeParam(url, "feature")
-
u, err := _url.Parse(url)
if err != nil {
log.Fatal(err)
@@ -96,6 +103,9 @@ func normalizeUrl(url string) string {
//url = r.ReplaceAllString(url, "youtube.com/watch?v=")
}
+ /**
+ * Redirect m.youtube.com to desktop version
+ */
match, err = regexp.MatchString("/m.youtube.com/", url)
if err != nil {
log.Fatal(err)
@@ -107,12 +117,14 @@ func normalizeUrl(url string) string {
* remove tracking param "si"
*/
url = _removeParam(url, "si")
- url = _removeParam(url, "feature")
r := regexp.MustCompile("/m.youtube.com/")
url = r.ReplaceAllString(url, "/www.youtube.com/")
}
+ /**
+ * Redirect m.imdb.com to desktop version
+ */
match, err = regexp.MatchString("/m.imdb.com/", url)
if err != nil {
log.Fatal(err)
@@ -120,26 +132,56 @@ func normalizeUrl(url string) string {
if match {
log.Debug("normalize: ", "m.imdb.com ", url)
+ r := regexp.MustCompile("/m.imdb.com/")
+ url = r.ReplaceAllString(url, "/www.imdb.com")
+ }
+
+ /**
+ * Prepend www. to normalize URL. exclude relative URLs starting with // since this is not recognized by Go
+ * Screw that, weird edge case. Someone pasted a
+ */
+ u, err = _url.Parse(url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ if ! strings.HasPrefix(u.Host, "www.") {
+ u.Host = "www." + u.Host
+ }
+
+ url = u.String()
+
+ /**
+ * Redirects youtube.com/c/<name> to youtube.com/@<name>
+ */
+ match, err = regexp.MatchString("/.youtube.com/c/", url)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if match {
+ log.Debug("normalize: ", "youtube.com/c/ -> @ ", url)
+
/**
* remove tracking param "si"
*/
url = _removeParam(url, "si")
- url = _removeParam(url, "feature")
- r := regexp.MustCompile("/m.imdb.com/")
- url = r.ReplaceAllString(url, "/www.imdb.com")
+ r := regexp.MustCompile("youtube.com/c/")
+ url = r.ReplaceAllString(url, "youtube.com/@")
}
- /*
- match, err = regexp.MatchString("m.wikipedia.org", url)
+ /**
+ * remove tracking param "si", "feature" and "pp" from every youtube video
+ */
+ match, err = regexp.MatchString("/www.youtube.com/", url)
if err != nil {
log.Fatal(err)
}
if match {
- r := regexp.MustCompile("m.wikipedia.org")
- url = r.ReplaceAllString(url, "wikipedia.org")
+ url = _removeParam(url, "si")
+ url = _removeParam(url, "pp")
+ url = _removeParam(url, "feature")
}
- */
/**
* remove tracking utm_ params
@@ -150,19 +192,31 @@ func normalizeUrl(url string) string {
url = _removeParam(url, "utm_term")
url = _removeParam(url, "utm_content")
- u, err = _url.Parse(url)
- if err != nil {
- log.Fatal(err)
- }
+ return url
+}
- /**
- * Append www. to normalize URL. exclude relative URLs starting with // since this is not recognized by Go
- * Screw that, wierd edge case. Someone pasted a
- */
- if ! strings.HasPrefix(u.Host, "www.") {
- u.Host = "www." + u.Host
+func RemoveNode(root_node *xhtml.Node, remove_me *xhtml.Node) {
+ found_node := false
+ check_nodes := make(map[int]*xhtml.Node)
+ i := 0
+
+ // Scan root_node's direct children: detach remove_me from its parent if it is one of them, and record every child visited.
+ for n := root_node.FirstChild; n != nil; n = n.NextSibling {
+ if n == remove_me {
+ found_node = true
+ n.Parent.RemoveChild(n)
}
- url = u.String()
- return url
+ check_nodes[i] = n
+ i++
+ }
+
+ // If remove_me was one of the direct children, it has been detached and there is nothing left to do.
+ // Otherwise recurse into every child recorded above to search deeper levels of the tree.
+ // check_nodes is a map, so those children are visited in no particular order.
+ if found_node == false {
+ for _, item := range check_nodes {
+ RemoveNode(item, remove_me)
+ }
+ }
}
diff --git a/main.go b/main.go
index 8940afc..83a35ff 100644
--- a/main.go
+++ b/main.go
@@ -15,7 +15,7 @@ import (
"github.com/anikhasibul/queue"
"github.com/jmoiron/sqlx"
log "github.com/sirupsen/logrus"
- "mvdan.cc/xurls/v2"
+ "github.com/PuerkitoBio/goquery"
)
type App struct {
@@ -63,9 +63,11 @@ func (app *App) walkDown() {
//var err error
//max_item := getMaxItem()
+ //max_item := 27351341
+ max_item := 27262623
//max_item := 41495306
//max_item := 36128477
- max_item := 32670334
+ //max_item := 32670334
//max_item := 41231601
//max_item := 41165987
//max_item := 41136898
@@ -79,7 +81,8 @@ func (app *App) walkDown() {
//max_item := 15038031
//max_item := 14450000
- const maxRoutines = 200
+ const maxRoutines = 400
+ //const maxRoutines = 1
q := queue.New(maxRoutines)
defer q.Close()
@@ -281,32 +284,67 @@ func getStory(id int) (Story, bool) {
/**
* Parse all URLs in Story.Text
+
+ log.Debugf("StoryID: %d\n", Story.Id)
+ log.Debugf("StoryID: %d\n", Story.Text)
+ */
+
+ /**
+ * This comment broke my code:
+ * https://news.ycombinator.com/item?id=27351340
*/
- rxRelaxed := xurls.Relaxed()
- rxLinks := rxRelaxed.FindAllString(html.UnescapeString(Story.Text), -1)
+ tmpdoc, err := goquery.NewDocumentFromReader(strings.NewReader("<html>"+Story.Text+"</html>"))
+ if err != nil {
+ log.Errorf("Failed to parse html: %s\n", err.Error())
+ return Story, false
+ }
+ sel := tmpdoc.Find("html")
+
+ // remove all found elements from selection
+ sel.Find("code").Each(func(i int, s *goquery.Selection) {
+ //log.Warnf("%+v\n", s.Get(0))
+ RemoveNode(sel.Get(0), s.Get(0))
+ })
+
+ tmphtml, err := sel.Html()
+ if err != nil {
+ log.Warn("Failed to generate html from selection: ", err.Error())
+ }
- for _, rxLink := range rxLinks {
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(tmphtml))
+
+ if err != nil {
+ log.Errorf("Failed to parse html: %s\n", err.Error())
+ return Story, false
+ }
+
+ doc.Find("a").Each(func(i int, s *goquery.Selection) {
+
+ l, ok := s.Attr("href")
+
+ if ok {
/**
* Check for Youtube in text field
*/
- is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", rxLink)
+ is_video, err = regexp.MatchString("(?i)(youtube.com)|(youtu.be)|(vimeo.com)", l)
if err != nil {
- log.Errorf("Failed to parse and match regex: %s\n", err.Error())
- return Story, false
+ log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+ //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ //return Story, false
}
if is_video {
- if ! duplicates[rxLink] {
+ if ! duplicates[l] {
var link Link
- link.Url = normalizeUrl(rxLink)
+ link.Url = normalizeUrl(l)
link.Field = 2
Story.Links = append(Story.Links, link)
log.Info("match youtube text")
log.Infof("%+v\n", Story)
- duplicates[rxLink] = true
+ duplicates[l] = true
}
}
@@ -314,27 +352,29 @@ func getStory(id int) (Story, bool) {
/**
* Check for movie platforms in text field
*/
- is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", rxLink)
+ is_movie, err = regexp.MatchString("(?i)(imdb.com)|(rottentomatoes.com)|(metacritic.com)", l)
if err != nil {
- log.Errorf("Failed to parse and match regex: %s\n", err.Error())
- return Story, false
+ log.Fatal("Failed to parse and match regex: %s\n", err.Error())
+ //log.Errorf("Failed to parse and match regex: %s\n", err.Error())
+ //return Story, false
}
if is_movie {
- if ! duplicates[rxLink] {
+ if ! duplicates[l] {
var link Link
- link.Url = normalizeUrl(rxLink)
+ link.Url = normalizeUrl(l)
link.Field = 1
Story.Links = append(Story.Links, link)
log.Info("match moview platform text")
log.Infof("%+v\n", Story)
- duplicates[rxLink] = true
+ duplicates[l] = true
}
-
}
- }
+ }
+ })
+
//Story.Url = normalizeUrl(Story.Url)
@@ -398,6 +438,8 @@ func getDetail(id int) Story {
story.Text = html.UnescapeString(story.Text)
+ log.Debugf("StoryID: %d\n", story.Id)
+
return story
}