这里使用requests模块和BeautifulSoup模块
爬取内容为:豆瓣top250电影的名字、简介、评分及评分人数。
1、查找信息
打开豆瓣电影 Top250 页面,按 F12 审查元素,可以发现所要爬取的信息都位于 <div class="info"> 标签内,
如下图所示。
URL:
https://movie.douban.com/top250?start=0
后面各页的 start 参数依次为:start=25,50,75,…,225(每页 25 部,共 10 页)
User-Agent:(在开发者工具 Network 面板的请求头 Request Headers 中查看并复制,供下面代码中的 headers 使用)
2、代码实现
# 导入模块
import requests
from bs4 import BeautifulSoup
def get_movies():
    """Scrape the Douban Movie Top 250 list pages.

    Requests the ten list pages (``start=0, 25, ..., 225``) and collects the
    fields of every movie entry found into five parallel lists.

    Returns:
        A 5-tuple of lists of strings, in this order:

        * ``movie_title`` -- text of the first ``span.title`` in each
          ``div.hd``.
        * ``movie_other`` -- text of the first ``span.other`` in each
          ``div.hd``.
        * ``movie_info`` -- text of the first ``<p>`` in each ``div.info``,
          with ASCII spaces removed.
        * ``movie_fen`` -- text of ``span.rating_num`` in each ``div.star``.
        * ``movie_num`` -- text of the fourth ``<span>`` in each ``div.star``
          (presumably the number of ratings; verify against the page markup).

    Raises:
        requests.RequestException: if a page cannot be fetched, times out, or
            answers with an HTTP error status.
    """
    # Browser-like User-Agent sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}
    # Without a timeout, requests waits forever on a stalled connection.
    timeout_seconds = 10

    # Parallel lists: index k in each list belongs to the same movie.
    movie_title = []
    movie_other = []
    movie_info = []
    movie_fen = []
    movie_num = []

    # Visit each list page in turn (25 movies per page).
    for i in range(0, 10):
        link = 'https://movie.douban.com/top250?start=' + str(i * 25)
        r = requests.get(link, headers=headers, timeout=timeout_seconds)
        # Fail loudly on a blocked/error response instead of silently parsing
        # an error page and returning short lists.
        r.raise_for_status()
        print('已获取第',str(i*25+1),'到',str(i*25+25),'部电影信息。')

        soup = BeautifulSoup(r.text, 'lxml')

        # Title and alternative title live in the same div.hd element.
        for header in soup.find_all('div', 'hd'):
            movie_title.append(header.find('span', 'title').get_text())
            movie_other.append(header.find('span', 'other').get_text())

        for entry in soup.find_all('div', 'info'):
            movie_info.append(entry.find('p').get_text().replace(' ', ''))

        # Score and the fourth span are both read from div.star.
        for star in soup.find_all('div', 'star'):
            movie_fen.append(star.find('span', 'rating_num').get_text())
            movie_num.append(star.find_all('span')[3].get_text())

    return movie_title, movie_other, movie_info, movie_fen, movie_num
# Module-level call: the scrape runs as soon as this file is executed or
# imported. The five returned lists are bound to globals that main() reads:
# a = titles, b = alternative titles, c = info text, d = scores, e = the
# fourth-span text from div.star (same order as get_movies() returns them).
a,b,c,d,e=get_movies()
def main(output_path='c:\\Users\\32662\\Desktop\\douban.txt'):
    """Write the scraped movie records to a UTF-8 text file.

    Reads the module-level lists ``a``, ``b``, ``c``, ``d`` and ``e`` and
    writes one numbered record per movie, at most 250 records.

    Args:
        output_path: Destination file. Defaults to the path the script has
            always written to.

    Raises:
        OSError: if the output file cannot be opened or written.
    """
    # zip() stops at the shortest input, so a scrape that returned fewer than
    # 250 entries no longer raises IndexError; range(1, 251) both numbers the
    # records and keeps the original cap of 250.
    records = zip(range(1, 251), a, b, c, d, e)

    # Built-in open() replaces codecs.open(); newline='' writes '\n' untranslated,
    # matching the bytes the original binary-mode stream produced.
    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        f.write('豆瓣电影 Top 250\n\n')
        for rank, title, other, info, score, votes in records:
            f.write('Top' + str(rank) + '\n')
            f.write('电影名:' + title + other + info + '豆瓣评分' + score + ' ' + votes)
            f.write('\n\n')
# Script entry point: write the output file only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()