利用 Python 的 BeautifulSoup4 库爬取豆瓣 Top250 的基本电影信息,
并保存到 .txt 文件中
```python
import requests
import bs4
import re
def open_url(url):
    """Fetch *url* with a browser User-Agent and return the Response.

    The User-Agent header tells the server what client is making the
    request; sites commonly reject obvious script clients, so we present
    a normal desktop-browser identity instead.
    """
    headers = {
        'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/80.0.3987.122 Safari/537.36'),
    }
    # timeout so a stalled connection cannot hang the whole crawl forever
    res = requests.get(url, headers=headers, timeout=10)
    return res
def find_movies(res):
    """Parse one Top-250 listing page and return formatted result lines.

    Each returned string concatenates: rank number, title, detail-page
    URL, poster URL, rating, and genre text, terminated by a newline.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Rank number: the <em> inside each div.pic
    number = []
    targets = soup.find_all("div", class_='pic')
    for each in targets:
        number.append(each.em.text + ' ')

    # Movie title: first <span> inside the <a> of each div.hd
    movies = []
    targets = soup.find_all("div", class_='hd')
    for each in targets:
        movies.append(each.a.span.text + ' ')

    # Detail-page address: the href attribute of the title link
    address = []
    targets = soup.find_all("div", class_='hd')
    for each in targets:
        address.append(each.a.get('href') + ' ')

    # Poster image: the src attribute of the img inside div.pic
    picture = []
    targets = soup.find_all("div", class_='pic')
    for each in targets:
        picture.append(each.a.img.get('src') + ' ')

    # Rating value
    ranks = []
    targets = soup.find_all("span", class_='rating_num')
    for each in targets:
        ranks.append(each.text)

    # Genre: the last '/'-separated field of the first <p> in each div.bd
    kinds = []
    targets = soup.find_all("div", class_='bd')
    for each in targets:
        kinds.append(each.p.text.split('/')[-1])

    results = []
    # NOTE(review): the original code read kinds with a +1 offset and
    # stopped 5 entries early (len(movies) - 5). The offset presumably
    # compensates for an extra div.bd elsewhere on the page shifting the
    # genre list by one; the -5 looks like a page-layout workaround.
    # Preserved as-is — verify against the live page before changing.
    length = len(movies) - 5
    for i in range(length):
        results.append(number[i] + movies[i] + address[i] + picture[i]
                       + ranks[i] + kinds[i + 1] + '\n')
    return results
#找出有多少页面
def find_depth(res):
soup=bs4.BeautifulSoup(res.text,'html.parser')
depth=soup.find('span',class_='next').previous_sibling.previous_sibling.text
return int(depth)
def main():
    """Crawl the Top-250 listing pages and dump the results to a text file."""
    host = 'https://movie.douban.com/top250'
    # Open the listing's first page; the response tells us the page count.
    first_page = open_url(host)
    depth = find_depth(first_page) - 2

    info = []
    for page in range(depth):
        # Each page shows 25 movies; 'start' is the offset of the first one.
        page_url = '{}?start={}&filter='.format(host, 25 * page)
        info.extend(find_movies(open_url(page_url)))

    with open('E:\\pythonimg\\电影.txt', 'w', encoding='utf-8') as f:
        f.writelines(info)


if __name__ == "__main__":
    main()