我简要说一下我遇到的问题:代码里有几处错误,导致我始终获取不到数据。
BeautifulSoup 的中文文档:
http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
1. 标签的属性要通过 attrs 参数以键值对(字典)的形式指定,例如:
# Locate the <ol class="grid_view"> element, then collect every <li> inside it.
moviesol = soup.find('ol', attrs={'class': "grid_view"})
movieLi = moviesol.find_all('li')
2. attrs 是 find()/find_all() 的关键字参数名,不能拼写错;一旦写错,它会被当成按同名标签属性过滤的条件,通常什么也匹配不到。
好了,不多说,直接上代码进行说明
import re

import requests
import xlwt
from bs4 import BeautifulSoup

# The listing is requested as PAGE_COUNT pages of PAGE_SIZE movies each.
PAGE_SIZE = 25
PAGE_COUNT = 10
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Rows collected by getdata(); each row is [title, star, critical, quote].
dataList = []


def gethtml(url, pagerow):
    """Fetch one listing page and return its HTML text.

    Args:
        url: Address of the listing page.
        pagerow: Zero-based index of the first movie on the page; it is sent
            as the ``start`` query parameter.

    Returns:
        The response body decoded as text.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Replace the default python-requests User-Agent with a browser-like one.
    header = {"User-Agent": 'Mozilla/4.0'}
    param = {'start': pagerow, 'filter': ''}
    r = requests.get(url, params=param, headers=header,
                     timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    r.raise_for_status()
    return r.text


def getdata(html):
    """Parse one listing page and append one row per movie to ``dataList``.

    Each appended row is ``[title, star, critical, quote]``:

    * title: text of the first ``span.title`` inside ``div.hd``
    * star: text of ``span.rating_num`` inside ``div.star``
    * critical: the last run of digits found in the markup of ``div.star``
      (presumably the number of ratings -- confirm against the live page)
    * quote: text of ``span.inq``, or ``'无评论'`` when the movie has none

    Args:
        html: HTML text of a listing page.

    Returns:
        None. When the document contains no ``ol.grid_view`` element,
        nothing is appended.
    """
    soup = BeautifulSoup(html, 'html.parser')
    moviesol = soup.find('ol', attrs={'class': "grid_view"})
    if moviesol is None:
        # No movie list in this document (e.g. an error page): nothing to add.
        return

    for eachmovie in moviesol.find_all('li'):
        moviehd = eachmovie.find('div', attrs={"class": "hd"})
        movietitle = moviehd.find('span', attrs={"class": "title"}).getText()

        moviebd = eachmovie.find('div', attrs={"class": "bd"})
        moviestar = moviebd.find('div', attrs={"class": "star"})
        star = moviestar.find('span', attrs={"class": "rating_num"}).getText()
        critical = re.findall(r'\d+', str(moviestar))[-1]

        # Not every movie has a one-line quote; find() returns None then,
        # and calling getText() on it would raise.
        quote_tag = eachmovie.find('span', attrs={"class": "inq"})
        quote = quote_tag.getText() if quote_tag is not None else '无评论'

        dataList.append([movietitle, star, critical, quote])


def saveToExcel(path):
    """Write a header row plus every row of ``dataList`` to an .xls file.

    Args:
        path: Destination file name passed to ``Workbook.save``.
    """
    book = xlwt.Workbook()
    sheet = book.add_sheet('douban')
    col = ('title', 'star', 'critical', 'quote')

    for j, heading in enumerate(col):
        sheet.write(0, j, heading)

    # Data rows start at sheet row 1, directly below the header.
    for i, data in enumerate(dataList, start=1):
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i, j, value)

    book.save(path)


def mainFunction():
    """Download every listing page, parse it, and save to ``douban.xls``."""
    url = "http://movie.douban.com/top250"
    for page in range(PAGE_COUNT):
        # Offsets are 0, 25, ..., 225. The previous (page - 1) * 25 formula
        # requested offset 0 twice and never requested the last page.
        html = gethtml(url, page * PAGE_SIZE)
        getdata(html)
    saveToExcel('douban.xls')


if __name__ == "__main__":
    mainFunction()