class Grasp:
    """Scrape the Douban "Top 250" movie list and export it to an ``.xls`` file.

    Ten list pages are requested (``start=0, 25, ..., 225``).  XPath queries
    pull the movie title, the director/cast text, the rating and the one-line
    blurb out of every page.

    Each ``get*`` method crawls all pages for a single field, stores the
    combined list on the instance and returns it.  ``run`` crawls the pages
    once and writes every field to ``./豆瓣电影/movie.xls``.

    The class relies on ``urlopen``, ``et`` (an object providing ``HTML``) and
    ``xlwt`` being available at module level.

    Attributes set by the methods:
        url, html, htmlobj: address, decoded text and parsed tree of the most
            recently fetched page.
        res: movie titles.
        dicr: raw text nodes of the director/cast paragraph.
        score: rating strings.
        cri: blurb text nodes.
    """

    _PAGE_COUNT = 10  # number of list pages requested
    _PAGE_SIZE = 25  # step of the ``start`` query parameter between pages
    _TITLE_XPATH = "//div[@class ='hd']/a/span[@class='title'][1]/text()"
    _CREDITS_XPATH = "//div[@class ='bd']/p[1]/text()"
    _SCORE_XPATH = "//div[@class ='star']/span[@class='rating_num']/text()"
    _QUOTE_XPATH = "//p[@class ='quote']/span//text()"
    _OUTPUT_PATH = './豆瓣电影/movie.xls'
    _NO_QUOTE = '没有介绍'  # cell text used when a movie has no blurb

    def __init__(self):
        """Create a scraper with empty result lists; nothing is fetched yet."""
        self.url = None
        self.html = None
        self.htmlobj = None
        self.res = []
        self.dicr = []
        self.score = []
        self.cri = []

    @staticmethod
    def _clean(text):
        """Return ``text`` stripped, with every space and ``/`` removed."""
        return text.strip().replace(' ', '').replace('/', '')

    @staticmethod
    def _director_of(credit_line):
        """Return the cleaned part of ``credit_line`` that precedes ``主``.

        When the marker is absent the whole cleaned line is returned.  The
        previous ``find``-then-slice code got -1 in that case and silently
        dropped the last character.
        """
        return Grasp._clean(credit_line).partition('主')[0]

    def _fetch_page(self, page):
        """Download and parse list page ``page`` (0-based); return the tree."""
        self.url = (
            f"https://movie.douban.com/top250?start={self._PAGE_SIZE * page}&filter="
        )
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(self.url) as response:
            self.html = response.read().decode()
        self.htmlobj = et.HTML(self.html)
        return self.htmlobj

    def _collect(self, query):
        """Run XPath ``query`` on every page and return all matches in order."""
        matches = []
        for page in range(self._PAGE_COUNT):
            matches.extend(self._fetch_page(page).xpath(query))
        return matches

    def getName(self):
        """Fetch the titles from all pages; store them in ``res`` and return them."""
        self.res = self._collect(self._TITLE_XPATH)
        return self.res

    def getDir(self):
        """Fetch the director/cast text nodes from all pages into ``dicr``."""
        self.dicr = self._collect(self._CREDITS_XPATH)
        return self.dicr

    def getScore(self):
        """Fetch the rating strings from all pages into ``score``."""
        self.score = self._collect(self._SCORE_XPATH)
        return self.score

    def getCri(self):
        """Fetch the blurb text nodes from all pages into ``cri``."""
        self.cri = self._collect(self._QUOTE_XPATH)
        return self.cri

    def run(self):
        """Crawl every page and save all fields to ``./豆瓣电影/movie.xls``.

        Sheet layout: row ``i`` holds page ``i``; movie ``j`` of that page
        occupies columns ``4*j`` .. ``4*j + 3`` as title, director, blurb and
        rating.  The output directory is created when it does not exist.

        Raises:
            IndexError: if a page yields fewer director or rating entries
                than titles.
        """
        import os  # local import: only needed to prepare the output folder

        wb = xlwt.Workbook(encoding='utf-8')
        ws = wb.add_sheet('豆瓣电影')
        all_titles, all_credits, all_quotes, all_scores = [], [], [], []

        for page in range(self._PAGE_COUNT):
            tree = self._fetch_page(page)
            titles = tree.xpath(self._TITLE_XPATH)
            credit_lines = tree.xpath(self._CREDITS_XPATH)
            quotes = tree.xpath(self._QUOTE_XPATH)
            scores = tree.xpath(self._SCORE_XPATH)

            for j, title in enumerate(titles):
                col = 4 * j
                ws.write(page, col, self._clean(title))
                # Only every second text node is read as the director/cast
                # line; this assumes the paragraph yields exactly two text
                # nodes per movie -- TODO confirm against the live page.
                ws.write(page, col + 1, self._director_of(credit_lines[2 * j]))
                # NOTE(review): blurbs are matched to movies by position in a
                # page-wide list, so a movie without a blurb shifts every
                # later blurb on that page up by one.  Only the tail of the
                # page falls back to the placeholder.
                if j < len(quotes):
                    ws.write(page, col + 2, quotes[j].strip())
                else:
                    ws.write(page, col + 2, self._NO_QUOTE)
                ws.write(page, col + 3, self._clean(scores[j]))

            all_titles.extend(titles)
            all_credits.extend(credit_lines)
            all_quotes.extend(quotes)
            all_scores.extend(scores)

        # Keep the attributes consistent with the get* methods: all pages.
        self.res = all_titles
        self.dicr = all_credits
        self.cri = all_quotes
        self.score = all_scores

        os.makedirs(os.path.dirname(self._OUTPUT_PATH), exist_ok=True)
        wb.save(self._OUTPUT_PATH)
# Build the class-based scraper and run the full crawl-and-export right away.
# These are top-level statements, so they execute whenever this file is run
# or imported.
r = Grasp()
r.run()
def reader():
    """Scrape the Douban "Top 250" list and save it to ``./豆瓣电影/movie.xls``.

    Function-based variant of the scraper.  Ten list pages are requested
    (``start=0, 25, ..., 225``).  Row ``i`` of the sheet holds page ``i``;
    movie ``j`` of that page occupies columns ``4*j`` .. ``4*j + 3`` as title,
    director, blurb and rating.  The output directory is created when it
    does not exist.

    Relies on ``urlopen``, ``et`` (an object providing ``HTML``) and ``xlwt``
    being available at module level.

    Raises:
        IndexError: if a page yields fewer director or rating entries than
            titles.
    """
    import os  # local import: only needed to prepare the output folder

    def clean(text):
        # Strip the ends, then drop every space and '/' inside the text.
        return text.strip().replace(' ', '').replace('/', '')

    output_path = './豆瓣电影/movie.xls'
    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('豆瓣电影')

    for page in range(10):
        url = f"https://movie.douban.com/top250?start={25 * page}&filter="
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(url) as response:
            html = response.read().decode()
        tree = et.HTML(html)
        titles = tree.xpath("//div[@class ='hd']/a/span[@class='title'][1]/text()")
        credit_lines = tree.xpath("//div[@class ='bd']/p[1]/text()")
        quotes = tree.xpath("//p[@class ='quote']/span//text()")
        scores = tree.xpath("//div[@class ='star']/span[@class='rating_num']/text()")

        for j, title in enumerate(titles):
            col = 4 * j
            ws.write(page, col, clean(title))
            # Keep only the text before '主'.  partition() returns the whole
            # string when the marker is missing, whereas the previous
            # find()-then-slice code got -1 and cut off the last character.
            # Reading index 2*j assumes two text nodes per movie -- TODO confirm.
            ws.write(page, col + 1, clean(credit_lines[2 * j]).partition('主')[0])
            # NOTE(review): blurbs are matched by position in a page-wide
            # list, so a movie without a blurb shifts the later ones up; only
            # the tail of the page gets the placeholder.
            if j < len(quotes):
                ws.write(page, col + 2, quotes[j].strip())
            else:
                ws.write(page, col + 2, '没有介绍')
            ws.write(page, col + 3, clean(scores[j]))

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    wb.save(output_path)
# Run the function-based scraper.  It saves to the same ./豆瓣电影/movie.xls
# path that Grasp.run uses, so this call overwrites that file.
reader()
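# Notes:
# - The scraped data is stored as a spreadsheet (.xls).
# - The ''.join(...) calls, together with strip() and replace(), turn each
#   XPath result into a plain string with special characters and spaces
#   removed, which makes the data easier to read.
# - Also note that xpath() returns a list, so the results can be handled with
#   ordinary list operations; no extra conversion is needed.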