爬取网页之后，拿到的内容里会有大量无用的信息，
所以我们需要用正则表达式把想要的部分筛选出来。
我们先来试试不做任何筛选的普通爬取：
// channel is an unbuffered completion channel: spider sends on it to report a
// page as handled, and startSpider receives from it once per page it launched.
var channel = make(chan bool)
// main crawls pages 1 through 5 (inclusive) and exits once startSpider returns.
func main() {
	const (
		firstPage = 1
		lastPage  = 5
	)
	startSpider(firstPage, lastPage)
}
// startSpider crawls every page numbered from start to end (inclusive), one
// goroutine per page, and returns after it has received one completion signal
// from channel for each goroutine it launched.
func startSpider(start int, end int) {
	// Fan out: each page gets its own URL, its own output file and its own goroutine.
	for page := start; page <= end; page++ {
		pageNo := strconv.Itoa(page)
		go spider("https://www.qiushibaike.com/text/page/"+pageNo, "f:/test/第"+pageNo+"页.txt")
	}

	// Fan in: block until as many signals have arrived as goroutines were started.
	for pending := end - start + 1; pending > 0; pending-- {
		<-channel
	}
}
// spider downloads url and saves the raw response body to filename.
//
// It sends exactly one value on channel before returning, whether the crawl
// succeeded or failed: startSpider performs one receive per goroutine, so a
// failure path that skipped the send would leave the program blocked forever.
// Failures are reported on standard output and the page is skipped.
func spider(url string, filename string) {
	// Registered first so it runs last, after the response body has been closed.
	defer func() { channel <- true }()

	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("爬取失败...", err)
		return
	}
	defer resp.Body.Close()

	// A non-200 response carries an error page, not content worth saving.
	if resp.StatusCode != http.StatusOK {
		fmt.Println("爬取失败...", resp.Status)
		return
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取失败...", err)
		return
	}

	if err := ioutil.WriteFile(filename, body, 0666); err != nil {
		fmt.Println("写入失败...", err)
		return
	}
}
然后我们加上一个方法 handleBody(bytes)，
在读取完响应内容之后调用它，
用正则来匹配我们想要的东西：
// Excerpt from the body of spider: read the whole response body first.
bytes, err := ioutil.ReadAll(resp.Body)
if err != nil {
fmt.Println("读取失败...")
return
}
// New step: pass the downloaded bytes to handleBody for regex matching.
handleBody(bytes)
// handleBody prints up to five host names of the form www.<name>.com found in
// the page. Each printed entry holds three strings: the whole match, capture
// group 1 (identical to the whole match) and capture group 2 (the part between
// "www." and ".com").
func handleBody(bytes []byte) {
	old := string(bytes)
	// The dots are escaped so that they match a literal '.'; unescaped, they
	// match any character and accept strings such as "wwwXfooYcom".
	compile := regexp.MustCompile(`(www\.(.*?)\.com)`)
	submatch := compile.FindAllStringSubmatch(old, 5)
	fmt.Println(submatch)
}
我们先用它来匹配页面里出现的一些网址，
运行结果如下：
[www.qiushibaike.com www.qiushibaike.com qiushibaike] [www.qiushibaike.com www.qiushibaike.com qiushibaike] [www.app-remix.com www.app-remix.com app-remix]
所以成功找到了一些网址
然后我们再试试找一下span标签
// handleBody prints the first five <span>…</span> pairs found in the page.
// Each printed entry holds two strings: the whole match including the tags,
// and the text captured between them. Content that contains a line break is
// not matched, because '.' does not match "\n" in Go regular expressions by
// default.
func handleBody(bytes []byte) {
	spanRe := regexp.MustCompile("<span>(.+?)</span>")
	matches := spanRe.FindAllStringSubmatch(string(bytes), 5)
	fmt.Println(matches)
}
但是我们发现，
匹配到的数据还是比较多，而且比较杂乱。
如果我们只是想要文字怎么办？
可以先去掉空格、换行和 <br/> 这些字符，再把匹配到的文字拼接起来写入文件：
// channel is an unbuffered completion channel: spider sends on it to report a
// page as handled, and startSpider receives from it once per page it launched.
var channel = make(chan bool)
// main crawls pages 1 through 5 (inclusive) and exits once startSpider returns.
func main() {
	const (
		firstPage = 1
		lastPage  = 5
	)
	startSpider(firstPage, lastPage)
}
// startSpider launches one spider goroutine for each page number in
// [start, end] and then waits on channel until every one of them has reported
// back.
func startSpider(start int, end int) {
	const (
		pageURLPrefix  = "https://www.qiushibaike.com/text/page/"
		filenamePrefix = "f:/test/第"
		filenameSuffix = "页.txt"
	)

	launched := 0
	for page := start; page <= end; page++ {
		pageNo := strconv.Itoa(page)
		go spider(pageURLPrefix+pageNo, filenamePrefix+pageNo+filenameSuffix)
		launched++
	}

	// One receive per launched goroutine.
	for ; launched > 0; launched-- {
		<-channel
	}
}
func spider(url string, filename string) {
resp, err := http.Get(url)
if err != nil {
fmt.Println("爬取失败...")
return
}
defer resp.Body.Close()
bytes, err := ioutil.ReadAll(resp.Body)
if err != nil {
fmt.Println("读取失败...")
return
}
str := handleBody(bytes)
writeErr := ioutil.WriteFile(filename, []byte(str), 0666)
if writeErr != nil {
fmt.Println("写入失败...")
return
}
channel <- true
}
// spanTextRe matches a bare <span>…</span> pair and captures the text between
// the tags. It is compiled once at package scope instead of on every call.
var spanTextRe = regexp.MustCompile("<span>(.+?)</span>")

// handleBody extracts the text of every bare <span>…</span> pair in the page
// and returns it as one string, one "第N条是: text" line per entry, each line
// terminated by "\r\n". N is the zero-based index of the match, so skipped
// matches leave gaps in the numbering. It returns "" when nothing matches.
//
// The raw match list is also printed to standard output.
func handleBody(bytes []byte) string {
	// Strip characters that would get in the way of matching. The order
	// matters: removing spaces first turns "<br />" into "<br/>", which the
	// last replacement then removes.
	page := string(bytes)
	page = strings.Replace(page, " ", "", -1)
	page = strings.Replace(page, "\n", "", -1)
	page = strings.Replace(page, "\r", "", -1)
	page = strings.Replace(page, "<br/>", "", -1)

	// Run the regular expression over the cleaned page.
	submatch := spanTextRe.FindAllStringSubmatch(page, -1)
	fmt.Println(submatch)

	// Assemble the result.
	var out strings.Builder
	for i, m := range submatch {
		// m[0] is the whole match including the tags; m[1] is the captured text.
		text := m[1]
		// Skip captures that swallowed another opening span tag. Only the
		// markup "<span" is rejected, so ordinary text that happens to contain
		// the letters "span" (e.g. "lifespan") is kept.
		if strings.Contains(text, "<span") {
			continue
		}
		out.WriteString("第")
		out.WriteString(strconv.Itoa(i))
		out.WriteString("条是: ")
		out.WriteString(text)
		out.WriteString("\r\n")
	}
	return out.String()
}