import requests
from bs4 import BeautifulSoup
import re
import xlwt
def getContent(row):
    """Fetch one page of the Douban Top-250 list and return it parsed.

    Parameters
    ----------
    row : int
        Offset of the first entry on the page (0, 25, 50, ... 225).

    Returns
    -------
    BeautifulSoup
        Parsed HTML of the requested page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a non-2xx status code.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    headers1 = {'User-Agent': 'Mozilla/4.0',
                'content-type': 'text/html; charset=utf-8'}
    # Fix: add a timeout so a stalled connection cannot hang the scraper,
    # and fail loudly on HTTP errors instead of parsing an error page.
    r = requests.get(
        "https://movie.douban.com/top250?start=" + str(row) + "&filter=",
        headers=headers1,
        timeout=10,
    )
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")
films=[] # module-level accumulator; getItem() appends one record per film
#getContent(0)
def getItem(row):
    """Parse one Top-250 page and append one record per film to ``films``.

    Each record is ``[title, rating (str), vote count (int), quote (str)]``.

    Parameters
    ----------
    row : int
        Page offset, passed straight through to :func:`getContent`.
    """
    soup = getContent(row)
    grid_view = soup.find("ol", attrs={"class": "grid_view"})
    if grid_view is None:
        # Fix: guard against a changed layout / error page instead of
        # crashing with AttributeError on .find_all below.
        return
    # Fix: raw string (r'\d+') avoids the invalid-escape SyntaxWarning on
    # modern Python; compile once outside the loop instead of per film.
    num_pattern = re.compile(r'\d+')
    for li in grid_view.find_all("li"):
        film = []
        # Film title (the first <span class="title"> holds the main title).
        film.append(li.find("span", attrs={"class": "title"}).getText())
        # Rating string, e.g. "9.7".
        film.append(li.find("span", attrs={"class": "rating_num"}).getText())
        # Vote count: the last number inside the <div class="star"> block
        # (the earlier matches come from the rating itself).
        star = li.find("div", attrs={"class": "star"})
        film.append(int(re.findall(num_pattern, str(star))[-1]))
        # Short quote; not every film has one.
        quote = li.find("span", attrs={"class": "inq"})
        # Fix: `is None` instead of `== None`.
        film.append('没有评论' if quote is None else quote.getText())
        films.append(film)
    return
#导入excel
def saveData(savePath):
    """Write the collected ``films`` records to an .xls workbook.

    Parameters
    ----------
    savePath : str
        Path of the Excel file to create, e.g. ``'top250.xls'``.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('豆瓣最受欢迎的电影信息', cell_overwrite_ok=True)
    # Header row (row 0).
    col = (u'电影名称', u'评分', u'评论人数', u'短评')
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # One row per film, offset by 1 to leave room for the header.
    for i, data in enumerate(films, start=1):
        for j, value in enumerate(data):
            sheet.write(i, j, value)
    book.save(savePath)  # persist the workbook to the local .xls file
    return
# Drive the scrape: ten pages of 25 films each, then export everything.
for start in range(0, 250, 25):
    getItem(start)
saveData('top250.xls')
# python 爬虫爬取豆瓣top250保存到excel
# (article footer, pasted from the blog) 最新推荐文章于 2024-04-17 16:07:57 发布