打开京东商城,搜索手机会得到如下结果:
点击下一页:
请求的url地址会有变化:
第一页:https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&page=1&s=56&click=0
第二页:https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&page=3&s=56&click=0
第三页:https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&page=5&s=56&click=0
…
…
…
找到页码计算规律后,发现第 n 页对应的参数为 page = 2n-1
废话不多说,直接上代码:
package main
import (
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"time"
)
// main crawls the first 20 pages of JD search results for "手机" (mobile
// phone). Observing the site in a browser shows the "page" URL parameter
// advances in steps of two:
//
//	logical page 1 -> page=1
//	logical page 2 -> page=3
//	logical page 3 -> page=5
//
// i.e. the parameter value for logical page n is 2n-1.
func main() {
	const lastPage = 20
	for page := 1; page <= lastPage; page++ {
		// Translate the logical page number into the site's "page" value.
		Crawler(2*page-1, page)
	}
}
// Crawler downloads one page of JD search results for the URL-encoded
// keyword "手机" and saves the raw HTML to jd_<pageindex>.html in the
// current directory.
//
// pageIndexValue is the value sent as the "page" URL parameter (2n-1 for
// the n-th logical page); pageindex is the logical page number used only
// for the output file name. Errors are reported to stdout; the function
// returns early on failure instead of panicking mid-crawl.
func Crawler(pageIndexValue, pageindex int) {
	url := "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&suggest=1.def.0.base&wq=%E6%89%8B%E6%9C%BA&page=" +
		fmt.Sprintf("%d", pageIndexValue) + "&s=56&click=0"

	request, err := http.NewRequest("GET", url, nil)
	// BUG FIX: err must be checked before the request is used; the original
	// code called request.Header.Add first, which panics on a nil request.
	if err != nil {
		fmt.Println("err:", err.Error())
		return
	}
	// 一定要设置请求头,否则京东认为你是非法请求,不会返回任意数据
	// (A browser-like User-Agent is required or JD refuses to return data.)
	// Note: the bogus literal Host header and the empty Accept /
	// Accept-Encoding headers from the original were dropped — net/http
	// ignores a Host header set via Header (Request.Host controls it), and
	// an explicit empty Accept-Encoding disables Go's automatic gzip.
	request.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36")
	request.Header.Set("Connection", "keep-alive")

	client := &http.Client{
		// 设置超时时间 — a duration, not a bare integer (500000 would be
		// 500µs); 30s avoids hanging forever on a stalled connection.
		Timeout: 30 * time.Second,
	}

	// 处理返回结果 — BUG FIX: the original discarded this error and then
	// dereferenced a possibly-nil response in the deferred Close.
	response, err := client.Do(request)
	if err != nil {
		fmt.Println("err:", err.Error())
		return
	}
	defer response.Body.Close()

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		fmt.Println("err:", err.Error())
		return
	}

	// 获取到爬取到的数据后,存储到本地
	file, err := os.Create(fmt.Sprintf("jd_%d.html", pageindex))
	if err != nil {
		fmt.Println("创建文件失败,err:", err.Error())
		return // BUG FIX: do not keep writing to a nil file handle.
	}
	defer file.Close()

	if _, err = file.Write(body); err != nil {
		fmt.Println("写入文本失败")
	}
}
URL 地址可能会有细微差别,但是不影响结果。
上图就是爬取完写入到文件的示例展示。
如果有建议或者意见可以交流。