# 爬取数据的地址: 豆瓣电影TOP250 (scrape target: Douban Movie Top 250)
import csv

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with every request — presumably so Douban does
# not reject the default `requests` UA as a bot; verify if scraping fails.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'}
def get_detail_urls(url):
    """Return the detail-page URLs of every movie on one Top250 list page.

    Args:
        url: A Top250 list-page URL (e.g. ``.../top250?start=0&filter=``).

    Returns:
        list[str]: one detail-page URL per movie on the page.

    Raises:
        requests.HTTPError: if the list page returns an HTTP error status.
    """
    resp = requests.get(url, headers=headers)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')
    # Each <li> in the ranked <ol> wraps one movie; its first <a> links to
    # the movie's detail page.
    lis = soup.find('ol', class_='grid_view').find_all('li')
    return [li.find('a')['href'] for li in lis]
def parse_detail_url(detail_url, f, page):
    """Parse one movie detail page and append a CSV row to *f*.

    Args:
        detail_url: URL of the movie's detail page.
        f: Open text-mode file handle for the output CSV.
        page: List-page number, used only in the failure message.
    """
    resp = requests.get(detail_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    # Movie title: every stripped string under the page <h1>, joined.
    name = ''.join(soup.find('div', id='content').find('h1').stripped_strings)
    try:
        info = soup.find('div', id='info')
        # Director: first <span> of the info block holds the attrs span.
        director = ''.join(info.find('span').find('span', class_='attrs').stripped_strings)
        # Screenwriter: fourth <span> of the info block.
        screenwriter = ''.join(info.find_all('span')[3].find('span', class_='attrs').stripped_strings)
        # Cast list.
        actor = ''.join(soup.find('span', class_='actor').find('span', class_='attrs').stripped_strings)
        # Rating, e.g. "9.7".
        score = soup.find('strong', class_='ll rating_num').string
        print(score)
        # csv.writer quotes any field that contains a comma, so director /
        # actor strings with embedded commas no longer corrupt the columns
        # (the old '{},{},{},{},{}' format string did).
        csv.writer(f).writerow([name, director, screenwriter, actor, score])
    except (AttributeError, IndexError):
        # Some detail pages lack one of these fields (find() returns None
        # or the span index is out of range); log and skip the movie.
        print('第{}页,{},{}获取失败'.format(page, name, detail_url))
def main():
    """Scrape all 10 Top250 list pages and append one CSV row per movie to Top250.csv."""
    base_url = 'https://movie.douban.com/top250?start={}&filter='
    # Append mode: reruns add to the existing file rather than overwrite it.
    # newline='' lets the csv layer control line endings itself.
    with open('Top250.csv', 'a', encoding='utf-8', newline='') as f:
        # start is 0, 25, 50, ... 225 — one list page of 25 movies each.
        for start in range(0, 226, 25):
            # Integer page number; true division (1 + start/25) produced a
            # float, which rendered as "1.0" in failure messages downstream.
            page = start // 25 + 1
            print('正在获取第%d页的数据' % page)
            for detail_url in get_detail_urls(base_url.format(start)):
                parse_detail_url(detail_url, f, page)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()