import requests
from bs4 import BeautifulSoup
def get_movies():
    """Collect the primary title of every entry on the Douban Top 250 pages.

    Requests the ten list pages (``start=0, 25, ..., 225``), prints the HTTP
    status code of each response, and gathers the text of the first
    ``<span>`` inside the link of every ``<div class="hd">`` element.

    Returns:
        list[str]: The stripped title strings, in the order they appear.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example on a connection error or when the 10 s timeout expires).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1',
        'Host': 'movie.douban.com',
    }
    movie_list = []
    for i in range(10):
        link = 'https://movie.douban.com/top250?start=' + str(i * 25)
        r = requests.get(link, headers=headers, timeout=10)
        print(str(i + 1), "页响应状态码:", r.status_code)
        soup = BeautifulSoup(r.text, "lxml")
        # The original passed class_='hd' or 'bd', which Python evaluates to
        # plain 'hd' before BeautifulSoup ever sees it. Only the 'hd' blocks
        # hold the title link, so request exactly that.
        for each in soup.find_all('div', class_='hd'):
            link_tag = each.a
            title_span = link_tag.span if link_tag is not None else None
            if title_span is None:
                # Skip a block without the expected <a><span> structure
                # instead of failing with AttributeError on None.
                continue
            movie_list.append(title_span.text.strip())
    return movie_list
# Fetch every title, store them one per line, then read the file back and
# print it to confirm what was written.
movies = get_movies()

# Mode 'w' truncates any existing file before writing. The encoding is given
# explicitly so the Chinese titles do not depend on the platform's default
# codec; the context manager closes the file even if a write fails.
with open('top250.txt', 'w', encoding='utf-8') as f:
    f.writelines(title + '\n' for title in movies)

# Read back with the same encoding that was used for writing.
with open('top250.txt', 'r', encoding='utf-8') as f:
    for eachline in f:
        print(eachline.strip())
其中r.text内容大致是这样的:
<ol class="grid_view">
<li>
<div class="item">
<div class="pic">
<em class="">1</em>
<a href="https://movie.douban.com/subject/1292052/">
<img width="100" alt="肖申克的救赎" src="https://img3.doubanio.com/view/photo/s_ratio_poster/public/p480747492.jpg" class="">
</a>
</div>
<div class="info">
<div class="hd">
<a href="https://movie.douban.com/subject/1292052/" class="">
<span class="title">肖申克的救赎</span>
<span class="title"> / The Shawshank Redemption</span>
<span class="other"> / 月黑高飞(港) / 刺激1995(台)</span>
</a>
<span class="playable">[可播放]</span>
</div>
<div class="bd">
<p class="">
导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /...<br>
1994 / 美国 / 犯罪 剧情
</p>
<div class="star">
<span class="rating5-t"></span>
<span class="rating_num" property="v:average">9.7</span>
<span property="v:best" content="10.0"></span>
<span>1612528人评价</span>
</div>
<p class="quote">
<span class="inq">希望让人自由。</span>
</p>
</div>
</div>
</div>
</li>
进阶一点
from selenium import webdriver
import time
import urllib.request
import re
from bs4 import BeautifulSoup
import codecs
# Fetch the first Top 250 page and print title, rating and quote per entry.
# A browser-like User-Agent is sent for consistency with the other examples
# in this file, which all identify themselves as a browser.
# NOTE(review): the site reportedly refuses urllib's default User-Agent - confirm.
request = urllib.request.Request(
    "https://movie.douban.com/top250",
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'},
)
# The context manager closes the HTTP response once the body has been read;
# the timeout keeps the script from hanging on an unresponsive server.
with urllib.request.urlopen(request, timeout=10) as page:
    contents = page.read()
soup = BeautifulSoup(contents, "html.parser")
# Alternative (not used here): load the page through Selenium, e.g.
# webdriver.Chrome("chromedriver.exe") followed by driver.get(url).
mov_list = soup.find_all(attrs={"class": "item"})
for each in mov_list:
    movname = each.find(attrs={"class": "title"}).get_text()
    print('电影名:', movname)
    rate = each.find(attrs={"class": "rating_num"}).get_text()
    print('评分:', rate)
    # find() returns None when an entry has no quote element; fall back to
    # an empty string rather than raising AttributeError.
    quote_tag = each.find(attrs={"class": "quote"})
    comment = quote_tag.get_text() if quote_tag is not None else ''
    print('评论:', comment)
结果如下:
参考链接:https://blog.csdn.net/u010885059/article/details/53939659
或者这样使用BeautifulSoup写
import requests
from bs4 import BeautifulSoup
# Scrape rank, title, director/cast line, rating and quote for every entry on
# the ten Top 250 pages, printing each field and writing it to top250.txt
# (five lines per entry).
# The request headers never change, so build them once outside the loop.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}

# Mode 'w' truncates the file (the original 'w+' did too - it never appended,
# and the read half of 'w+' was unused). The context manager guarantees the
# file is closed even if a request or a parse step raises.
with open('top250.txt', 'w', encoding='utf-8') as f:
    for i in range(10):
        link = "https://movie.douban.com/top250?start=" + str(i * 25)
        # Timeout added so an unresponsive server cannot hang the script.
        r = requests.get(link, headers=headers, timeout=10)
        soup = BeautifulSoup(r.text, "lxml")
        mov_list = soup.find_all(class_="item")
        for each in mov_list:
            number = each.find(attrs={"class": "pic"}).em.text.strip()
            print('排名:', number)
            movname = each.find(attrs={"class": "title"}).get_text().strip()
            print('电影名:', movname)
            # .p.text extracts the text of the first <p> inside the "bd"
            # block; the chain then removes spaces, newlines, "..." and "/",
            # stripping surrounding whitespace between the steps. The order
            # of the steps is kept exactly as in the original.
            director = (
                each.find(attrs={"class": "bd"}).p.text
                .strip().replace(" ", "")
                .strip().replace("\n", "")
                .strip().replace("...", "")
                .strip().replace("/", "")
            )
            print(director)
            rate = each.find(attrs={"class": "rating_num"}).get_text()
            print('评分:', rate)
            # find() returns None when an entry has no quote element; use an
            # empty string so the record keeps its five-line layout.
            quote_tag = each.find(attrs={"class": "quote"})
            comment = quote_tag.get_text().strip() if quote_tag is not None else ''
            print('评论:', comment)
            f.writelines([number, '\n', movname, '\n', director, '\n', rate, '\n', comment, '\n'])
程序运行结果: