最初的爬虫是用正则表达式提取链接的,结果抓取到了许多不必要的链接,因此现在改用 goquery 实现:
// judgeUrl project judgeUrl.go
package judgeUrl
import (
"strings"
)
// IsUrl reports whether str looks like a link worth following, as opposed to
// an href value that should be skipped.
//
// It returns false for:
//   - the empty string;
//   - values starting with "#" (in-page fragments);
//   - values starting with "//";
//   - values ending in ".exe";
//   - values ending in ":void(0);";
//   - any value starting with "javascript:", matched case-insensitively;
//   - values wrapped in braces, "{...}" (presumably unexpanded template
//     placeholders).
//
// Every other value is accepted; no further syntax validation is performed.
func IsUrl(str string) bool {
	const jsScheme = "javascript:"

	switch {
	case str == "":
		return false
	case strings.HasPrefix(str, "#"), strings.HasPrefix(str, "//"):
		return false
	case strings.HasSuffix(str, ".exe"), strings.HasSuffix(str, ":void(0);"):
		return false
	case strings.HasPrefix(str, "{") && strings.HasSuffix(str, "}"):
		return false
	// Compare only the scheme so that every javascript: pseudo-link is
	// rejected, not just the exact string "javascript:;".
	case len(str) >= len(jsScheme) && strings.EqualFold(str[:len(jsScheme)], jsScheme):
		return false
	}
	return true
}
func SamePathUrl(preUrl string, url string, mark int) (newUrl string) {
last := strings.LastIndex(preUrl, "/")
if last == 6 {
newUrl = preUrl + url
} else {
if mark == 1 {
newUrl = preUrl[:last] + url
} else {
newPreUrl := preUrl[:last]
newLast := strings.LastIndex(newPreUrl, "/"