爬取猫眼电影排行榜上的电影信息
import requests
import json
from bs4 import BeautifulSoup
# Request headers handed to requests.get(); carries a desktop Firefox
# User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) '
        'Gecko/20100101 Firefox/75.0'
    ),
}
def getPage(url, timeout=10):
    """Fetch a page and return its body text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200; None for
        any other status code or when the request itself fails.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Request-level failure (connection error, timeout, invalid URL):
        # report it to the caller as "no page".
        return None
    if response.status_code != 200:
        return None
    return response.text
def getInfo(html):
    """Parse a board page and yield one dict per movie entry.

    Args:
        html: Page markup as a string. None or an empty string yields
            nothing, so a failed download can be passed in safely.

    Yields:
        A dict with the keys '排名', '电影名称', '主演', '上映时间' and '评分', filled
        from the text of the matching elements inside each <dd> tag.
        <dd> tags that lack any of those elements are skipped.
    """
    if not html:
        return

    # First argument is the markup to parse, second selects the lxml parser.
    soup = BeautifulSoup(html, 'lxml')
    for item in soup.select('dd'):
        index = item.find('i', class_='board-index')
        name = item.find('p', class_='name')
        star = item.find('p', class_='star')
        release = item.find('p', class_='releasetime')
        score = item.find('p', class_='score')

        # find() returns None when the element is absent; compare with
        # "is None" rather than relying on the truthiness of a Tag.
        fields = (index, name, star, release, score)
        if any(field is None for field in fields):
            continue

        yield {
            '排名': index.get_text(),
            '电影名称': name.get_text(),
            # Only the cast text is stripped of surrounding whitespace.
            '主演': star.get_text().strip(),
            '上映时间': release.get_text(),
            '评分': score.get_text()
        }
# Write one record to the output file
def writeData(file, path='maoyan.txt'):
    """Append one record to a text file as a single JSON line.

    Args:
        file: JSON-serializable object (the callers here pass a dict).
        path: Destination file; opened in append mode with UTF-8 encoding.
    """
    # A dict cannot be written to a text file directly, so serialize it to
    # JSON first. ensure_ascii=False keeps non-ASCII text readable.
    # Serializing before opening means a failure leaves the file untouched.
    line = json.dumps(file, ensure_ascii=False) + '\n'
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
# Entry point: crawl each page of the board and store every movie found
if __name__ == "__main__":
    # Offsets 0, 10, ..., 90. The original range(11) also requested
    # offset=100. NOTE(review): assumes the board is a 100-entry list
    # paginated 10 per page -- confirm against the site.
    for num in range(0, 100, 10):
        url = 'https://maoyan.com/board/4?offset=' + str(num)
        html = getPage(url)
        if html is None:
            # getPage returns None on a failed request or non-200 status;
            # skip the page instead of handing None to the parser.
            print('skipped {}: page could not be fetched'.format(url))
            continue
        for item in getInfo(html):
            print(item)
            writeData(item)
注意:class 是 Python 的关键字,所以在 find() 中 class='star' 必须写成 class_='star'