注1:只是用来简单练习,无频繁且恶意请求。
注2:此爬虫使用urllib完成,因页面返回数据为json,需要找到对应的js。基础爬取,无需登录及验证码输入。
1、百度搜索豆瓣电影,进入后点击F12查找页面数据返回内容
2、分析请求网址,可以看出start为0,limit为20。在页面继续下拉后新的请求网址start变为20,limit还是20。这样就分析出了start会变,每次增加20,limit不变每次显示20条电影信息。
3、代码开发,可以将start和limit作为参数拼接到url链接后;start由键盘输入的页码计算得出,每页增加20。
import urllib.parse
import urllib.request
def create_request(page, limit=20):
    """Build a GET Request for one page of the Douban top-list JSON API.

    Args:
        page:  1-based page number; the query's ``start`` offset is
               ``(page - 1) * limit``.
        limit: number of movie entries per page (default 20, matching the
               site's own paging behaviour).

    Returns:
        urllib.request.Request with browser-like headers attached.
    """
    # Base endpoint; the paging query string is appended below.
    base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'
    # GET query parameters for the requested page window.
    data = {
        "start": (page - 1) * limit,
        "limit": limit
    }
    # Headers copied from a real browser session so the server treats this
    # like an ordinary XHR request (Cookie/User-Agent are session-specific).
    headers = {
        'Accept': '*/*',
        'Connection': 'keep-alive',
        'Cookie': 'bid=ujO4Cqsktb0; douban-fav-remind=1; ll="118172"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1692338825%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DU1ziHlvFex7zncyOzhGxEDCpg6M_H42PW5uxVTj5bS0icDos5KvJB-C5dh6EpdYY%26wd%3D%26eqid%3Df6e4bf50000c28b70000000564df0a83%22%5D; _pk_id.100001.4cf6=be7ba6396a2968e6.1692338825.; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __yadk_uid=rkw5nvuELbxEw1E13qo9QZgf1ZN65Oow; __utma=30149280.1077130142.1689832947.1689832947.1692338827.2; __utmc=30149280; __utmz=30149280.1692338827.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt_douban=1; __utma=223695111.170840167.1692338827.1692338827.1692338827.1; __utmc=223695111; __utmz=223695111.1692338827.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; _vwo_uuid_v2=DF9086CA3F1D922D9E0174B547480B64F|427b04b29a362b6abdece6fbdf790d18; Hm_lvt_16a14f3002af32bf3a75dfe352478639=1692338840; Hm_lpvt_16a14f3002af32bf3a75dfe352478639=1692338840; __utmb=30149280.4.10.1692338827; __utmb=223695111.4.10.1692338827',
        'Host': 'movie.douban.com',
        'Referer': 'https://movie.douban.com/typerank?type_name=%E5%8A%A8%E4%BD%9C&type=5&interval_id=100:90&action=',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    # URL-encode the two paging parameters (start=..&limit=..).
    new_data = urllib.parse.urlencode(data)
    # Append them to the base URL to form the complete request address.
    url = base_url + new_data
    # Bundle the URL and headers into a Request object for urlopen().
    request = urllib.request.Request(url=url, headers=headers)
    return request
# Send the request and return the decoded response body.
def get_context(request):
    """Execute *request* and return the response body as a UTF-8 string.

    Fix: the original never closed the HTTP response; a ``with`` block now
    guarantees the connection is released even if read/decode raises.
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
# Save one page of crawled data to the local disk.
def down_lod(page, content):
    """Write *content* to ``douban<page>.json`` in the working directory.

    Returns the file object (already closed by the ``with`` block when the
    caller receives it — kept for compatibility with existing callers).
    """
    target = f'douban{page}.json'
    with open(target, 'w', encoding='utf-8') as out_file:
        out_file.write(content)
    return out_file
if __name__ == '__main__':
    # Read the inclusive page range to crawl from the keyboard.
    first_page = int(input("请输入开始页:"))
    last_page = int(input("请输入结束页:"))
    print("---------爬取开始----------")
    for current_page in range(first_page, last_page + 1):
        # Build the request for this page.
        req = create_request(current_page)
        # Fetch the JSON response body.
        body = get_context(req)
        # Persist it to a local douban<N>.json file.
        fp = down_lod(current_page, body)
        # jx(fp)  # parsing step intentionally disabled, as in the original
    print("-------结束-------")