"""Scraper for the Douban Top 250 movie list.

Collects every movie's title and downloads its poster image to a local
``img/`` folder, then writes all titles to ``moviename.txt``.

Built with requests and BeautifulSoup.
"""
import requests
from bs4 import BeautifulSoup
def get_(url):
    """Scrape Douban Top 250 movie titles and download each poster image.

    Crawls 10 pages (25 movies per page) starting from *url*, collects the
    primary title of every movie, and saves its poster image as
    ``img/<title>.jpg``.

    Args:
        url: Base URL of the Douban Top 250 list (no query string).

    Returns:
        list[str]: All collected movie titles, in page order.
    """
    import os
    # The original required manually creating an img/ folder first;
    # create it automatically instead (no-op if it already exists).
    os.makedirs('img', exist_ok=True)
    name_list = []
    turn = 0
    page = url
    while turn < 10:
        r = requests.get(page, timeout=10)
        soup = BeautifulSoup(r.text, 'lxml')
        div_list = soup.find_all('div', {'class': 'item'})
        for img in div_list:
            # The first <span class="title"> in each item is the movie title.
            movie_text = img.find('span', {'class': 'title'}).text
            name_list.append(movie_text)
            # Poster image URL from the item's <img> tag.
            movie_img = img.find('img')['src']
            # Timeout added for consistency with the page fetch above,
            # so one slow image cannot hang the whole crawl.
            ir = requests.get(movie_img, timeout=10)
            # On success, save the poster under the movie's title.
            if ir.status_code == 200:
                with open('img/' + movie_text + '.jpg', 'wb') as f:
                    f.write(ir.content)
        # Build the next page's URL: pages advance in steps of 25.
        turn += 1
        yema = 25 * turn
        page = url + '?start=' + str(yema) + '&filter='
        print('完成第{}页的保存,共10页'.format(turn))
    return name_list
def main():
    """Scrape the Top 250 list and write all titles to moviename.txt."""
    url = 'http://movie.douban.com/top250'
    name_list = get_(url)
    # Write one movie title per line (UTF-8 for the Chinese titles).
    with open('moviename.txt', 'w', encoding='utf-8') as f:
        for name in name_list:
            f.write(name)
            f.write('\n')


if __name__ == '__main__':
    main()