爬虫案例如下:在豆瓣上爬取喜剧的前20条数据
#引入urllib
import urllib
import urllib.request
from urllib.request import Request
from urllib import parse
# Douban search endpoint queried by this crawler.
url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10'
# Form fields sent as the body of the POST request.
submit_data = {
    'start': 20,
    'tags': '喜剧',
}
# urlopen() only accepts a bytes body, so the url-encoded string must be
# encoded explicitly (UTF-8 here); passing a plain str raises a TypeError.
data = urllib.parse.urlencode(submit_data).encode(encoding='UTF8')
headers = {
    "Accept": "application/json, text/plain, */*",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.7 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.8",
}
# Build the request; supplying `data` makes urllib issue a POST.
request = urllib.request.Request(url, data, headers)
# Read the whole response body. The context manager closes the underlying
# connection even if read() raises.
with urllib.request.urlopen(request) as response:
    content = response.read()
# Save the raw response bytes to disk (binary mode: no re-encoding).
with open('movies.json', 'wb') as f:
    f.write(content)
爬取结果:
错误: