豆瓣爬取最新版的电影top250,教你如何将p标签里面的导演、主演、年份、地点、类型分开
- 1.设置表头
def request_douban(url):
    """Fetch the HTML of one Douban page.

    Args:
        url: The page URL to request.

    Returns:
        The response body as text on HTTP 200, otherwise None
        (including on any network error).
    """
    # A browser-like User-Agent is required: Douban rejects requests
    # carrying the default python-requests User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'}
    try:
        # Bug fix: a timeout prevents the crawler from hanging forever
        # on a stalled connection (requests has no default timeout).
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        # Non-200: return None explicitly instead of falling through.
        return None
    except requests.RequestException:
        return None
由于现在豆瓣的反爬机制做得比较好,因此需要在请求中加上headers请求头,模拟浏览器访问
- 2.制作Excel表头信息
# Workbook that holds the scraped data: one sheet, one movie per row.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('豆瓣电影top250')

# Header row (row 0): one column title per scraped field.
_COLUMN_TITLES = ('排名', '电影名', '导演', '主演', '时间', '地点', '分类', '影评', '评分', '简介', '图片')
for _col, _title in enumerate(_COLUMN_TITLES):
    worksheet.write(0, _col, _title)

# Next row index to write; row 0 is the header, so data starts at row 1.
n = 1
- 4.使用BeautifulSoup库爬取主要信息,并且将爬取到的信息写入到Excel表中
def save_to_excel(soup):
    """Extract every movie on one list page and append rows to the sheet.

    Args:
        soup: BeautifulSoup document for one top-250 list page.

    Side effects:
        Writes one row per movie into the module-level ``worksheet`` and
        advances the module-level row counter ``n``.
    """
    global n
    items = soup.find(class_='grid_view').find_all('li')
    for item in items:
        item_index = item.find(class_='item').find('em').string
        item_pic = item.find(class_='pic').find('img').get('src')
        item_title = item.find(class_='title').string
        # parse_info splits the combined <p> text into its five fields.
        item_info = parse_info(item)
        item_director = item_info[0]
        item_actor = item_info[1]
        item_year = item_info[2]
        item_spot = item_info[3]
        item_sort = item_info[4]
        item_score = item.find(class_='star').find(class_='rating_num').string
        # Bug fix: reset the quote for every movie. Some entries have no
        # 'inq' tag; without a reset the previous movie's quote leaked into
        # this row (and the very first movie could raise NameError).
        inq_tag = item.find(class_='inq')
        item_inq = inq_tag.string if inq_tag is not None else ''
        print(item_index + '|' + item_title + '|' + item_director + '|' + item_actor + '|' + item_year + '|' + item_spot + '|' + item_sort + '|' + item_score + '|' + item_inq + '|' + item_pic)
        worksheet.write(n, 0, item_index)
        worksheet.write(n, 1, item_title)
        worksheet.write(n, 2, item_director)
        worksheet.write(n, 3, item_actor)
        worksheet.write(n, 4, item_year)
        worksheet.write(n, 5, item_spot)
        worksheet.write(n, 6, item_sort)
        # Bug fix: align data with the header row — column 7 is 影评 (the
        # one-line quote), column 8 is 评分 (score), column 10 is 图片
        # (poster URL). Column 9 (简介) has no scraped field and stays empty.
        worksheet.write(n, 7, item_inq)
        worksheet.write(n, 8, item_score)
        worksheet.write(n, 10, item_pic)
        n += 1
注意:由于有些信息是在同一个标签里面,所以要将它们分开,这里用了一个parse_info函数,这也是整个程序的核心。豆瓣的反爬取机制做得越来越好,因此将它们分开就变得比较麻烦
def parse_info(item):
    """Split the combined info <p> of one movie into five separate fields.

    Returns a list: [director, actors, year, country, genres].

    NOTE(review): the splitting below depends on Douban's exact markup —
    fields delimited by non-breaking spaces (\xa0), with the year line
    separated from the director/actor line by a newline. Verify against
    the live page if the layout changes.
    """
    # Raw paragraph text, e.g. "导演: X\xa0\xa0主演: Y\n 1994\xa0/\xa0美国\xa0/\xa0剧情"
    list_info = item.find(class_='bd').find('p').text
    # Fields are delimited by \xa0 (non-breaking space) on the page.
    str1 = list_info.strip(' \n').split('\xa0')
    str2 = []
    for temp in str1:
        # Keep only real fragments; single characters are separator
        # residue such as "/".
        if len(temp) > 1:
            str2.append(temp)
    item_director = ''
    item_actor = ''
    item_year = ''
    # Only three fragments: either the director or the actor field is
    # missing, so the year is glued (after a newline) onto whichever
    # name fragment remains. The actor column is left empty here.
    if len(str2) == 3:
        if len(str2[0]) > len(str2[1]):
            # Year is attached to the first fragment.
            str3 = str2[0].split('\n ')
            item_director += str3[0]
            item_actor += ''
            item_year += str3[1].strip()
        else:
            # Year is attached to the second fragment.
            str3 = str2[1].split('\n ')
            item_director += str3[0]
            item_actor += ''
            item_year += str3[1].strip()
    # Common case (more than three fragments): director, actor+year,
    # then country and genre fragments. The year is glued onto the
    # actor fragment after a newline.
    if len(str2) > 3:
        str4 = str2[1].split('\n ')
        item_director += str2[0]
        item_actor += str4[0]
        item_year += str4[1].strip()
    # The last two fragments are always the country and the genre list.
    item_spot = str2[-2]
    item_sort = str2[-1]
    str_total = [item_director, item_actor, item_year, item_spot, item_sort]
    return str_total
导演主演年份比较复杂,有的电影没有主演,所以是导演和年份在一起,而大部分情况下是主演和年份在一起,所以为了将这些与普通的类型区分开,这里是用数组的长度来区分,长度为3的,也就是上面两种情况,将它们单独列开,之后里面又分两种情况。最后用一个数组将这个标签里面所有的信息保存起来并作为结果返回就可以啦
- 5.有了上面的函数封装,于是我们就可以爬取每一页的信息
def main(page):
    """Download and store one page of the top-250 list (25 movies per page)."""
    page_url = 'https://movie.douban.com/top250?start={}&filter='.format(page * 25)
    page_html = request_douban(page_url)
    page_soup = BeautifulSoup(page_html, 'lxml')
    save_to_excel(page_soup)
- 6.最后设置我们爬取的页数并将其保存起来就OK啦!过程千辛万苦,好在结果还不错,嘻嘻
if __name__ == "__main__":
for i in range(0, 10):
main(i)
workbook.save('豆瓣最受欢迎的250部电影.xls')
虽然过程千辛万苦,好在结果还不错,嘻嘻
下面就是完整代码块
import requests
from bs4 import BeautifulSoup
import xlwt
def request_douban(url):
    """Fetch the HTML of one Douban page.

    Args:
        url: The page URL to request.

    Returns:
        The response body as text on HTTP 200, otherwise None
        (including on any network error).
    """
    # A browser-like User-Agent is required: Douban rejects requests
    # carrying the default python-requests User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'}
    try:
        # Bug fix: a timeout prevents the crawler from hanging forever
        # on a stalled connection (requests has no default timeout).
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        # Non-200: return None explicitly instead of falling through.
        return None
    except requests.RequestException:
        return None
# Workbook that holds the scraped data: one sheet, one movie per row.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('豆瓣电影top250')

# Header row (row 0): one column title per scraped field.
_COLUMN_TITLES = ('排名', '电影名', '导演', '主演', '时间', '地点', '分类', '影评', '评分', '简介', '图片')
for _col, _title in enumerate(_COLUMN_TITLES):
    worksheet.write(0, _col, _title)

# Next row index to write; row 0 is the header, so data starts at row 1.
n = 1
def save_to_excel(soup):
    """Extract every movie on one list page and append rows to the sheet.

    Args:
        soup: BeautifulSoup document for one top-250 list page.

    Side effects:
        Writes one row per movie into the module-level ``worksheet`` and
        advances the module-level row counter ``n``.
    """
    global n
    items = soup.find(class_='grid_view').find_all('li')
    for item in items:
        item_index = item.find(class_='item').find('em').string
        item_pic = item.find(class_='pic').find('img').get('src')
        item_title = item.find(class_='title').string
        # parse_info splits the combined <p> text into its five fields.
        item_info = parse_info(item)
        item_director = item_info[0]
        item_actor = item_info[1]
        item_year = item_info[2]
        item_spot = item_info[3]
        item_sort = item_info[4]
        item_score = item.find(class_='star').find(class_='rating_num').string
        # Bug fix: reset the quote for every movie. Some entries have no
        # 'inq' tag; without a reset the previous movie's quote leaked into
        # this row (and the very first movie could raise NameError).
        inq_tag = item.find(class_='inq')
        item_inq = inq_tag.string if inq_tag is not None else ''
        print(item_index + '|' + item_title + '|' + item_director + '|' + item_actor + '|' + item_year + '|' + item_spot + '|' + item_sort + '|' + item_score + '|' + item_inq + '|' + item_pic)
        worksheet.write(n, 0, item_index)
        worksheet.write(n, 1, item_title)
        worksheet.write(n, 2, item_director)
        worksheet.write(n, 3, item_actor)
        worksheet.write(n, 4, item_year)
        worksheet.write(n, 5, item_spot)
        worksheet.write(n, 6, item_sort)
        # Bug fix: align data with the header row — column 7 is 影评 (the
        # one-line quote), column 8 is 评分 (score), column 10 is 图片
        # (poster URL). Column 9 (简介) has no scraped field and stays empty.
        worksheet.write(n, 7, item_inq)
        worksheet.write(n, 8, item_score)
        worksheet.write(n, 10, item_pic)
        n += 1
def parse_info(item):
    """Split the combined info <p> of one movie into five separate fields.

    Returns a list: [director, actors, year, country, genres].

    NOTE(review): the splitting below depends on Douban's exact markup —
    fields delimited by non-breaking spaces (\xa0), with the year line
    separated from the director/actor line by a newline. Verify against
    the live page if the layout changes.
    """
    # Raw paragraph text, e.g. "导演: X\xa0\xa0主演: Y\n 1994\xa0/\xa0美国\xa0/\xa0剧情"
    list_info = item.find(class_='bd').find('p').text
    # Fields are delimited by \xa0 (non-breaking space) on the page.
    str1 = list_info.strip(' \n').split('\xa0')
    str2 = []
    for temp in str1:
        # Keep only real fragments; single characters are separator
        # residue such as "/".
        if len(temp) > 1:
            str2.append(temp)
    item_director = ''
    item_actor = ''
    item_year = ''
    # Only three fragments: either the director or the actor field is
    # missing, so the year is glued (after a newline) onto whichever
    # name fragment remains. The actor column is left empty here.
    if len(str2) == 3:
        if len(str2[0]) > len(str2[1]):
            # Year is attached to the first fragment.
            str3 = str2[0].split('\n ')
            item_director += str3[0]
            item_actor += ''
            item_year += str3[1].strip()
        else:
            # Year is attached to the second fragment.
            str3 = str2[1].split('\n ')
            item_director += str3[0]
            item_actor += ''
            item_year += str3[1].strip()
    # Common case (more than three fragments): director, actor+year,
    # then country and genre fragments. The year is glued onto the
    # actor fragment after a newline.
    if len(str2) > 3:
        str4 = str2[1].split('\n ')
        item_director += str2[0]
        item_actor += str4[0]
        item_year += str4[1].strip()
    # The last two fragments are always the country and the genre list.
    item_spot = str2[-2]
    item_sort = str2[-1]
    str_total = [item_director, item_actor, item_year, item_spot, item_sort]
    return str_total
def main(page):
    """Download and store one page of the top-250 list (25 movies per page)."""
    page_url = 'https://movie.douban.com/top250?start={}&filter='.format(page * 25)
    page_html = request_douban(page_url)
    page_soup = BeautifulSoup(page_html, 'lxml')
    save_to_excel(page_soup)
if __name__ == "__main__":
for i in range(0, 10):
main(i)
workbook.save('豆瓣最受欢迎的250部电影.xls')