import re
from bs4 import BeautifulSoup
import urllib.request
import xlwt
def main():
    """Scrape the Douban Top 250 listing and save it to an .xls workbook.

    The listing pages are fetched and parsed by ``GetData``; the resulting
    rows are written to ``./豆瓣爬取demo.xls`` by ``saveData``.
    """
    baseurl = "https://movie.douban.com/top250?start="
    # Fetch and parse the data exactly once: every GetData call downloads
    # all listing pages again, so a repeated call only doubles the traffic.
    datalist = GetData(baseurl)
    # Store the data.
    savepath = './豆瓣爬取demo.xls'
    saveData(savepath, datalist)
# Patterns are compiled once at import time rather than once per movie entry.
# All of them are raw strings so that regex escapes such as ``\s`` are not
# interpreted (and warned about) as string escapes.
_LINK_RE = re.compile(r'<a href="(.*?)">')
_IMG_SRC_RE = re.compile(r'<img.*src="(.*?)"')
_TITLE_RE = re.compile(r'<span class="title">(.*?)</span>')
_RATING_RE = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
_JUDGE_RE = re.compile(r'<span>(.*?)人评价</span>')
_INQ_RE = re.compile(r'<span class="inq">(.*?)</span>')
_BD_RE = re.compile(r'<p class="">(.*?)</p>', re.S)
_BR_RE = re.compile(r'<br(\s+)?/>(\s+)?')


def _parse_item(item_html):
    """Extract the eight output fields from the HTML of one movie entry.

    Returns a list ordered as: link, image URL, first title, second title
    (``''`` unless exactly two titles are present), rating, vote count,
    one-line quote (``''`` if absent) and the details paragraph.

    Raises:
        IndexError: if the link, image, title, rating, vote count or
            details paragraph cannot be found in ``item_html``.
    """
    link = _LINK_RE.findall(item_html)[0]
    img_src = _IMG_SRC_RE.findall(item_html)[0]

    titles = _TITLE_RE.findall(item_html)
    ctitle = titles[0]
    # The second title is only kept when exactly two titles are present.
    otitle = titles[1] if len(titles) == 2 else ''

    rating = _RATING_RE.findall(item_html)[0]
    judge = _JUDGE_RE.findall(item_html)[0]

    quotes = _INQ_RE.findall(item_html)
    # Drop the Chinese full stop from the one-line quote.
    inq = quotes[0].replace('。', '') if quotes else ''

    bd = _BD_RE.findall(item_html)[0]
    bd = _BR_RE.sub('', bd)      # remove <br/> tags and the whitespace after them
    bd = bd.replace('/', '')     # remove the slash separators
    return [link, img_src, ctitle, otitle, rating, judge, inq, bd.strip()]


def GetData(baseurl):
    """Download and parse the ten listing pages that start at ``baseurl``.

    The page offset (0, 25, ..., 225) is appended to ``baseurl`` and each
    page is fetched with ``askURL``. Every ``<div class="item">`` element
    on a page becomes one row.

    Args:
        baseurl: URL prefix to which the numeric start offset is appended.

    Returns:
        A list of rows; each row is a list of eight strings in the order
        link, image URL, first title, second title, rating, vote count,
        one-line quote, details.

    Raises:
        IndexError: if a required field is missing from an entry.
    """
    datalist = []
    for page in range(10):
        html = askURL(baseurl + str(page * 25))
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_="item"):
            # The fields are pulled out of the entry's serialized HTML.
            datalist.append(_parse_item(str(item)))
    return datalist
def askURL(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: the address to open.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` on failure.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=head)
    # The context manager closes the response (and its connection) even if
    # reading or decoding fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')
def saveData(savepath, datalist):
    """Write the scraped rows to an .xls workbook saved at ``savepath``.

    Row 0 holds the column headers; each entry of ``datalist`` is written
    to the following rows, one cell per field.

    Args:
        savepath: path the workbook is saved to.
        datalist: sequence of rows, each with at least eight fields in the
            order produced by ``GetData``: link, image URL, first title,
            second title, rating, vote count, one-line quote, details.

    Raises:
        IndexError: if a row has fewer than eight fields.
    """
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('shee1')
    # The labels are ordered to match the row layout built by GetData:
    # link, image, Chinese title, other title, rating, vote count,
    # quote ('内容') and details ('相关信息').
    col = ('链接', '图片', '中文名', '英文名', '评分', '评价人数', '内容', '相关信息')
    for j, label in enumerate(col):
        sheet.write(0, j, label)
    # Iterate over the rows actually scraped instead of assuming exactly 250,
    # so a short result set no longer raises IndexError.
    for i, data in enumerate(datalist, start=1):
        for j in range(len(col)):
            sheet.write(i, j, data[j])
    book.save(savepath)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()