import requests
from pyquery import PyQuery
# Default HTTP request headers: a desktop Chrome User-Agent so the
# target site does not reject the request as an obvious bot/script.
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
def get_html(url, header=None):
    """Fetch *url* and return the decoded response body.

    :param url: address to request
    :param header: optional dict of HTTP request headers
                   (``None`` lets requests use its defaults; the old
                   ``''`` default was not a valid headers value)
    :return: response text on HTTP 200, otherwise None
    """
    try:
        # 3-second timeout so one hung server cannot stall the whole crawl;
        # requests raises (Timeout/ConnectionError) instead of returning.
        response = requests.get(url, headers=header, timeout=3)
    except requests.RequestException as e:
        print('访问 {} 失败了。。 {}'.format(url, e))
        return None
    # Status 200 means success
    if response.status_code == 200:
        # Use the encoding sniffed from the body, which is more reliable
        # than the (often missing) charset in the response headers.
        response.encoding = response.apparent_encoding
        return response.text
    print('访问 {} 失败了。。 {}'.format(url, response.status_code))
    return None
def parser_html(html):
    """Parse one Maoyan board page and return its movie records.

    :param html: page HTML to parse
    :return: list of formatted info strings, one per movie
    """
    doc = PyQuery(html)
    title = doc('title')
    name = doc('p.name a')
    star = doc('p.star')
    releasetime = doc('p.releasetime')
    img = doc('.image-link img.board-img')
    integer = doc('i.integer')
    fraction = doc('i.fraction')
    print(title.text())
    data = []
    # Walk the parallel element lists in lockstep instead of indexing by
    # position; zip stops at the shortest list, so a page with a missing
    # field cannot raise IndexError.
    for n, s, r, m, whole, frac in zip(name, star, releasetime, img, integer, fraction):
        # data-src holds the lazily-loaded poster URL; fall back to ''
        # so a missing attribute cannot break the concatenation below.
        poster = m.get('data-src') or ''
        # r.text[5:] drops the leading "上映时间:" label from the date.
        info = ('电影名' + n.text + s.text.strip() + r.text[5:]
                + '图片' + poster + '分数:' + (whole.text + frac.text))
        print(info)
        data.append(info)
    return data
def save_data(data, path=''):
    """Append every record in *data* to the text file at *path*.

    :param data: iterable of strings to persist, one per line
    :param path: destination file path
    :return: None
    """
    # Append mode so successive pages accumulate into a single file.
    with open(path, 'a', encoding='utf-8') as out:
        out.writelines(record + '\n' for record in data)
def main(url):
    """Crawl one board page: fetch it, parse it, and persist its movies.

    :param url: page URL to process
    :return: None
    """
    # Fetch the current page (removed leftover debug print('q'))
    html = get_html(url, header=header)
    # get_html returns None on failure; skip parsing/saving in that case
    # instead of handing None to PyQuery.
    if html is None:
        return
    # Parse the current page
    data = parser_html(html)
    # Save this page's records
    save_data(data, '猫眼电影top100.txt')
if __name__ == '__main__':
    # The Top-100 board serves 10 movies per page via the ``offset``
    # query parameter: 0, 10, ..., 90.
    for offset in range(0, 100, 10):
        main(f'https://maoyan.com/board/4?offset={offset}')
# 爬虫实战2-猫眼电影top100  (blog-post title accidentally pasted into the file — kept as a comment so the script still parses)
# 最新推荐文章于 2021-12-24 10:11:12 发布  (blog metadata, not code)