Page to crawl: https://xiaohua.zol.com.cn/detail60/59124.html
Full code
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"strconv"
	"strings"

	"github.com/axgle/mahonia"
	"github.com/xuri/excelize/v2"
)

// Crawl jokes and write them into an Excel workbook.

// path is the page to crawl next, relative to the site root.
var path = "detail60/59124.html"

func main() {
	NewExcel()
	runThis()
	if err := file.SaveAs("joker.xlsx"); err != nil {
		fmt.Println(err)
	}
}

// CheckErr prints the error together with a short note on where it happened.
func CheckErr(err error, msg string) {
	if err != nil {
		fmt.Printf("%s err: %v\n", msg, err)
	}
}

// dataProcessing fixes the encoding and trims the page down to the joke text.
func dataProcessing(pageBytes []byte) string {
	// The page is GBK-encoded; convert it to UTF-8 to avoid garbled text.
	bodyStr := mahonia.NewDecoder("gbk").ConvertString(string(pageBytes))
	// Grab the next page's address from the pager so the crawler knows where to go next.
	path = bodyStr[strings.Index(bodyStr, "上一页</a><a href=\""):strings.Index(bodyStr, "\" class=\"next\">下一页")]
	path = path[strings.Index(path, "detail"):]
	// The jokes are the only content in <p> tags, and their block ends at </div>.
	bodyStr = bodyStr[strings.Index(bodyStr, "<p>"):]
	bodyStr = bodyStr[:strings.Index(bodyStr, "</div>")]
	// Drop empty paragraphs and turn each closing </p> into a delimiter for later splitting.
	bodyStr = strings.ReplaceAll(bodyStr, "<p></p>", "")
	bodyStr = strings.ReplaceAll(bodyStr, "<p>", "")
	bodyStr = strings.ReplaceAll(bodyStr, "</p>", "%!@")
	bodyStr = strings.TrimSpace(bodyStr)
	return bodyStr
}

// i is the Excel row the next joke will be written to.
var i = 1

var file *excelize.File

// AnalyticalData splits the processed text into individual jokes and writes them into the sheet.
func AnalyticalData(str string) {
	tmp := strings.Split(str, "%!@")
	for _, v := range tmp {
		if v == "" {
			continue
		}
		// Strip the leading "1、"-style numbering ("、" is 3 bytes in UTF-8).
		v = v[strings.Index(v, "、")+3:]
		file.SetCellValue("Sheet1", "A"+strconv.Itoa(i), i)
		file.SetCellValue("Sheet1", "B"+strconv.Itoa(i), v)
		i++
		if i == 40 {
			// Enough rows collected: clear path so runThis stops recursing.
			path = ""
		}
	}
}

// NewExcel creates the workbook and selects the sheet the jokes go into.
func NewExcel() {
	file = excelize.NewFile()
	index := file.NewSheet("Sheet1")
	// Make Sheet1 the default (active) sheet.
	file.SetActiveSheet(index)
}

func runThis() {
	// 1. Fetch the page.
	resp, err := http.Get("https://xiaohua.zol.com.cn/" + path)
	CheckErr(err, "http.Get")
	// 2. Read the body and clean up the data.
	pageBytes, err := ioutil.ReadAll(resp.Body)
	CheckErr(err, "ioutil.ReadAll")
	bodyStr := dataProcessing(pageBytes)
	AnalyticalData(bodyStr)
	resp.Body.Close()
	// Keep crawling until AnalyticalData clears path.
	if path != "" {
		runThis()
	}
}
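To try this out, the two third-party packages have to be fetched first (assuming a Go modules setup):

go get github.com/axgle/mahonia
go get github.com/xuri/excelize/v2

Running the program then writes joker.xlsx to the current directory. Note that newer excelize v2 releases changed NewSheet to also return an error; the single-return call above matches the older v2 API used here.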
Approach
1. Pull down the page content.
2. Fix the garbled encoding (a small sketch of steps 1 and 2 follows this list).
3. Extract the usable data (this was written quickly; not much effort went into cleaning it, so the result may be a bit messy).
4. Process the data.
5. Write it into Excel.
6. Move on and crawl the next page.
7. Set an exit condition.
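Here is a minimal, self-contained sketch of steps 1 and 2: fetch one page and convert it from GBK to UTF-8 with mahonia. The URL is the one the full program starts from, and the final length print is just a placeholder for the later processing steps.

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"

	"github.com/axgle/mahonia"
)

func main() {
	// Step 1: fetch the raw page bytes.
	resp, err := http.Get("https://xiaohua.zol.com.cn/detail60/59124.html")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()
	raw, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	// Step 2: the site serves GBK, so convert to UTF-8 before doing any string work.
	utf8Str := mahonia.NewDecoder("gbk").ConvertString(string(raw))
	fmt.Println(len(utf8Str)) // the decoded HTML is now ready for slicing
}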
Explanation
1. The <p> tags only appear in this part of the page, so that is where the extraction starts.
2. The <p> block is wrapped in a div, so the slice stops at the closing </div>.
bodyStr = bodyStr[strings.Index(bodyStr, "<p>"):]
bodyStr = bodyStr[:strings.Index(bodyStr, "</div>")]
- Process the data:
bodyStr = strings.ReplaceAll(bodyStr, "<p></p>", "") // drop the empty <p></p> pairs
bodyStr = strings.ReplaceAll(bodyStr, "<p>", "") // strip the opening tags so only the text and the paragraph ends remain
bodyStr = strings.ReplaceAll(bodyStr, "</p>", "%!@") // mark each paragraph's end so the text can later be split into a slice
- Grab the next page's address for the following crawl:
path = bodyStr[strings.Index(bodyStr, "上一页</a><a href=\""):strings.Index(bodyStr, "\" class=\"next\">下一页")]
path = path[strings.Index(path, "detail"):] // keep only the "detail..." part, which is the next page's path
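To see how these slices and replacements behave end to end, here is a small standalone example; the HTML fragment is invented for illustration, but it has the same shape as the real page (jokes in <p> tags inside a div, followed by the pager link).

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Invented fragment shaped like the real page.
	body := `<div><p>1、First joke</p><p></p><p>2、Second joke</p></div>` +
		`上一页</a><a href="/detail60/59125.html" class="next">下一页`

	// Next-page path: slice between the two pager markers, then keep the "detail..." part.
	next := body[strings.Index(body, `上一页</a><a href="`):strings.Index(body, `" class="next">下一页`)]
	next = next[strings.Index(next, "detail"):]
	fmt.Println(next) // detail60/59125.html

	// Joke text: slice from the first <p> to the closing </div>, then strip the tags.
	text := body[strings.Index(body, "<p>"):]
	text = text[:strings.Index(text, "</div>")]
	text = strings.ReplaceAll(text, "<p></p>", "")
	text = strings.ReplaceAll(text, "<p>", "")
	text = strings.ReplaceAll(text, "</p>", "%!@")

	// "%!@" marks the end of each joke, so Split yields one joke per element.
	for _, v := range strings.Split(text, "%!@") {
		if v != "" {
			fmt.Println(v) // "1、First joke", then "2、Second joke"
		}
	}
}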