# 话不多说，上代码：
from urllib import request,parse
import re
import xlwt
# --- Spreadsheet setup -------------------------------------------------------

def _bold_font(face, colour_index):
    """Return a bold xlwt font with the given face name and palette index."""
    font = xlwt.Font()
    font.name = face
    font.colour_index = colour_index
    font.bold = True
    return font


def _cell_style(font, alignment):
    """Return an xlwt cell style combining *font* and *alignment*."""
    style = xlwt.XFStyle()
    style.font = font
    style.alignment = alignment
    return style


# Header row: bold font, centred both horizontally and vertically.
font_title = _bold_font(u'幼圆', 0x31)
alignment0 = xlwt.Alignment()
alignment0.horz = xlwt.Alignment.HORZ_CENTER
alignment0.vert = xlwt.Alignment.VERT_CENTER
style0 = _cell_style(font_title, alignment0)

# Body rows: bold font, vertically centred only.
font_body = _bold_font(u'华文楷体', 0x08)
alignment1 = xlwt.Alignment()
alignment1.vert = xlwt.Alignment.VERT_CENTER
style1 = _cell_style(font_body, alignment1)

# Workbook with a single sheet that receives the scraped rows.
wb = xlwt.Workbook()
ws = wb.add_sheet("人生必看电影TOP100")

# Column widths for the first three columns (the original comment said "row
# width", but these are column widths); the fourth column keeps its default.
col0, col1, col2 = ws.col(0), ws.col(1), ws.col(2)
for column, width_factor in ((col0, 20), (col1, 45), (col2, 30)):
    column.width = 256 * width_factor

# Header row: movie title, starring, release date, score.
for column_index, heading in enumerate(('电影名称', '主演', '上映时间', '评分')):
    ws.write(0, column_index, heading, style0)
# --- Scrape the board pages --------------------------------------------------

# Accumulators filled page by page; entries at the same index are treated as
# belonging to the same movie when the rows are written out later.
Name = []
Star = []
Releasetime = []
Score = []

# The request headers are the same for every page, so they are built once.
# NOTE(review): the Cookie value looks like it was captured from a browser
# session and has presumably expired -- confirm whether the site needs it.
# NOTE(review): 'Accept-Encoding: deflate' invites a compressed response, and
# urllib does not decompress bodies automatically; if decoding ever fails,
# check the response's Content-Encoding header.
headers = {
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding':'deflate',
    'Accept-Language':'zh-CN,zh;q=0.9',
    'Cache-Control':'max-age=0',
    'Connection':'keep-alive',
    'Cookie':'__mta=45578011.1541345815725.1541380877343.1541380888601.20; uuid_n_v=v1; uuid=75667970E04711E89CCFC511A3A3AA062F57C39460854EBFB5ABF721B74CE428; _lxsdk_cuid=166df5eaac1c8-08bef925d4afc-65547628-1cb7b9-166df5eaac1c8; _lxsdk=75667970E04711E89CCFC511A3A3AA062F57C39460854EBFB5ABF721B74CE428; _csrf=91e89bf12159f48b0b638b46ec5e99b58429b96aaea3d29b13c22e593b679b33; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta=45578011.1541345815725.1541380877343.1541380886583.20; _lxsdk_s=166e16edfcc-d54-ef8-582%7C%7C39',
    'Host':'maoyan.com',
    'Referer':'http://maoyan.com/board',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3510.2 Safari/537.36',
}

# Patterns are compiled once, as raw strings so that `\s` reaches the regex
# engine instead of being read as a (invalid) string escape. The wildcards are
# non-greedy / tag-bounded so that two entries sharing one line of HTML are
# still matched separately.
_NAME_RE = re.compile(r'<p class="name"><a [^>]*>(.*?)</a></p>')
_STAR_RE = re.compile(r'<p class="star">\s*(.*?)\s*</p>')
_RELEASETIME_RE = re.compile(r'<p class="releasetime">(.*?)</p>')
_SCORE_RE = re.compile(
    r'<p class="score"><i class="integer">(.*?)</i>'
    r'<i class="fraction">(.*?)</i></p>'
)

# Ten requests, with the offset query parameter stepping 0, 10, ..., 90.
for i in range(10):
    url = "http://maoyan.com/board/4?offset=" + str(i * 10)
    req = request.Request(url, headers=headers)
    # The context manager closes the HTTP response even if read/decode raises.
    with request.urlopen(req) as res:
        html = res.read().decode("utf-8")

    name = _NAME_RE.findall(html)
    star = _STAR_RE.findall(html)
    releasetime = _RELEASETIME_RE.findall(html)
    # Each score match is an (integer_part, fraction_part) tuple of strings.
    score = _SCORE_RE.findall(html)

    Name.extend(name)
    Star.extend(star)
    Releasetime.extend(releasetime)
    Score.extend(score)
# --- Write the scraped rows and save -----------------------------------------

# Row 0 holds the header, so data rows start at 1. Iterating over the collected
# lists (instead of a fixed range(99)) writes every scraped movie, including
# the last one, and cannot raise IndexError when fewer entries were collected.
# zip() stops at the shortest list, so a movie missing a field truncates the
# output rather than crashing.
for row_index, (title, stars, released, score_parts) in enumerate(
        zip(Name, Star, Releasetime, Score), start=1):
    ws.write(row_index, 0, title, style1)
    ws.write(row_index, 1, stars, style1)
    ws.write(row_index, 2, released, style1)
    # The score was captured as two strings (integer part, fraction part);
    # concatenating them gives the score text written to the cell.
    ws.write(row_index, 3, score_parts[0] + score_parts[1], style1)

wb.save("TOP100.xls")
print("搞定了。。。")