import urllib.request
from bs4 import BeautifulSoup
import re
import xlwt
# --- Regular expressions for scraping fields from each movie's HTML block ---
findLink = re.compile(r'<a class="" href="(.*?)">')  # detail-page URL
title = re.compile(r'<span class="title">(.*?)</span>')  # movie title(s); pages carry 1 or 2 of these
imgLink = re.compile(r'<img .* src="(.*?)".*/>')  # poster image URL
daoyan = re.compile(r'<p class="">.*导演:(.*?)主.',re.S)  # director ("导演" = director)
zhuyan = re.compile(r'<p class="">.*主.:(.*?)/.*</p>',re.S)  # lead actors ("主演" = starring)
message = re.compile(r'<p class="">.*主..*<br/>(.*?)</p>',re.S)  # the year/country/genre line after <br/>
pingfen = re.compile(r'<span class="rating_num".*property="v:average">(.*?)</span>',re.S)  # average rating
pingjiaNum = re.compile(r'<span>(.*?)人评价</span>')  # rating count ("人评价" = people rated)
quote = re.compile(r'<span class="inq">(.*?)</span>')  # one-line summary quote
# Shared accumulator: bs4() appends one 10-field record per movie; saves() writes it to Excel.
data = []
def getData(baseurl):
    """Fetch *baseurl* over HTTP and return the body decoded as UTF-8.

    A browser User-Agent header is sent because douban.com rejects the
    default urllib agent.

    Fix: the original never closed the response object (connection leak);
    ``urlopen`` is now used as a context manager.
    """
    headers = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64;rv:82.0) Gecko / 20100101 Firefox / 82.0"
    }
    request = urllib.request.Request(url=baseurl, headers=headers)
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")
def readPage(pageNum):
    """Return the cached HTML for list page *pageNum* as a string.

    Reads ``doubantop250/<pageNum>pages.html`` (written by ``savePage``).

    Fix: the original opened the file and never closed it (handle leak);
    a ``with`` block now guarantees closure.
    """
    with open("doubantop250/%dpages.html" % pageNum, "r", encoding="utf-8") as f:
        return f.read()
def bs4(htmlFile):
    """Parse one Douban Top250 list page and append one record per movie
    to the module-level ``data`` list.

    Each record is a 10-element list:
    [detail link, image link, Chinese title, English title, director,
     lead actors, release year, rating, rating count, summary quote].

    NOTE(review): several ``re.findall(...)[0]`` calls below raise
    IndexError if Douban changes its markup — no match means no element 0.
    """
    # Each movie entry lives in <div class="item">.
    bs = BeautifulSoup(htmlFile,"html.parser")
    item = bs.find_all("div", class_="item") #class_ (trailing underscore) avoids clashing with the Python keyword
    for dy in item:
        movie = []
        # Run the module-level regexes over the raw HTML text of this tag.
        dy = str(dy)
        link = re.findall(findLink,dy)[0]
        movie.append(link)
        imglink = re.findall(imgLink,dy)[0]
        movie.append(imglink)
        tit = re.findall(title,dy)
        if tit.__len__()==2:
            # Both a Chinese and a foreign title are present.
            ctitle = tit[0]
            # Strip the leading "/" separator from the foreign title...
            wtitle = str(tit[1]).replace("/","")
            # ...then collapse its whitespace runs to single spaces.
            elist = wtitle.split()
            wtitles = ""
            for el in elist:
                wtitles = wtitles+el+" "
            movie.append(ctitle)
            movie.append(wtitles)
        else:
            # Chinese title only; keep the record width fixed with a blank.
            ctitle = tit[0]
            movie.append(ctitle)
            movie.append(" ")
        # Director ("导演").
        daoy = re.findall(daoyan,dy)
        # NOTE(review): str() on the findall list leaves "['...']" bracket/quote
        # artifacts in the saved cell, and r"\xa0" removes the literal four
        # characters backslash-x-a-0 (the repr of NBSP), not a real NBSP.
        daoylist = str(daoy).replace(r"\xa0","")
        movie.append(daoylist)
        # Lead actors ("主演"); same str(list) artifact as above.
        zhuy = re.findall(zhuyan,dy)
        zhuy = str(zhuy).replace("<br","")
        movie.append(zhuy)
        # Release year: second whitespace-separated token of the info line,
        # truncated to 4 chars. Fragile — depends on Douban's exact layout.
        startYear = re.findall(message,dy)
        if startYear.__len__()==0:
            movie.append("")
        else:
            movie.append(str(startYear).split().pop(1)[0:4])
        # Average rating.
        rating_num = re.findall(pingfen,dy)[0]
        movie.append(rating_num)
        # Number of ratings.
        pjnum = re.findall(pingjiaNum,dy)[0]
        movie.append(pjnum)
        # One-line summary quote; not every movie has one.
        inq = re.findall(quote,dy)
        if inq.__len__()==0:
            movie.append(" ")
        else:
            movie.append(inq[0])
        data.append(movie)
def name_is_exists(tag):
    """BeautifulSoup filter: True when *tag* carries an ``href`` attribute."""
    has_href = tag.has_attr("href")
    return has_href
def saves(data, path):
    """Write the scraped movie records to an .xls workbook at *path*.

    Parameters:
        data: list of 10-element rows as built by ``bs4``.
        path: destination file path for the Excel workbook.

    Fixes: the original hard-coded exactly 250 rows, so a partial scrape
    crashed with IndexError; it now writes however many rows exist. The
    per-row debug prints are removed.
    """
    book = xlwt.Workbook(encoding="UTF-8")
    sheet = book.add_sheet("豆瓣TOP250")
    col = ["详情链接", "图片链接", "电影名称", "英文名称", "导演",
           "主演", "上映时间", "豆瓣评分", "评论人数", "概括"]
    # Header row.
    for i, heading in enumerate(col):
        sheet.write(0, i, heading)
    # Data rows start at row 1, directly below the header.
    for row, record in enumerate(data, start=1):
        for c, value in enumerate(record[:len(col)]):
            sheet.write(row, c, value)
    book.save(path)
def savePage(htmlfile, pageNum):
    """Cache one page of HTML to ``doubantop250/<pageNum>pages.html``.

    Fixes: the original opened the file twice (an "a+" open/close with no
    effect, then a "wb" reopen) and wrapped the encode in a redundant
    ``bytes(...)`` call. A single binary write is sufficient.
    """
    with open("doubantop250/%dpages.html" % pageNum, "wb") as f:
        # Binary mode + explicit UTF-8 encode avoids platform newline translation.
        f.write(htmlfile.encode("utf-8"))
if __name__ == "__main__":
    baseurl = "https://movie.douban.com/top250?start="
    # Walk the 10 list pages (25 movies per page, offsets 0, 25, ..., 225).
    for page in range(10):
        url = baseurl + str(page * 25)
        # htmlFile = getData(url)   # live fetch — disabled in favour of the local cache
        htmlFile = readPage(page)
        # savePage(htmlFile, page)  # re-enable together with getData to refresh the cache
        print("*", page)
        # Parse this page; records accumulate in the module-level `data`.
        bs4(htmlFile)
    print(data)
    # Persist every scraped record to an Excel workbook.
    saves(data, "D:/自编程序/studyPython/doubanTOP250.xls")