爬虫小练习：分别用面向对象和函数两种写法爬取豆瓣电影

最新推荐文章于 2022-08-04 11:11:08 发布

Mdq0512

最新推荐文章于 2022-08-04 11:11:08 发布

阅读量419

点赞数

文章标签：爬虫

原文链接：http://www.cnblogs.com/superSmall/p/11502872.html

版权

class Grasp:
    """Scrape the Douban Top 250 movie pages and export them to an .xls file.

    The four ``get*`` methods each walk every list page and collect one kind
    of field; ``run`` walks the pages once and writes all four fields into a
    workbook.  ``urlopen``, ``et`` and ``xlwt`` are expected to be imported at
    module level by the surrounding file.
    """

    _PAGES = 10  # number of list pages that are fetched
    _NAME_XPATH = "//div[@class ='hd']/a/span[@class='title'][1]/text()"  # movie name
    _DIRECTOR_XPATH = "//div[@class ='bd']/p[1]/text()"  # director line
    _SCORE_XPATH = "//div[@class ='star']/span[@class='rating_num']/text()"  # rating
    _QUOTE_XPATH = "//p[@class ='quote']/span//text()"  # one-line introduction

    def __init__(self):
        pass

    def _load(self, i):
        """Fetch list page *i*, parse it and return the parsed tree.

        Also records the request URL, the raw HTML and the parsed tree on
        ``self.url``, ``self.html`` and ``self.htmlobj``.
        """
        self.url = f"https://movie.douban.com/top250?start={25*i}&filter="
        self.html = urlopen(self.url).read().decode()
        self.htmlobj = et.HTML(self.html)
        return self.htmlobj

    def _collect(self, xpath):
        """Return the matches of *xpath* from every page, in page order."""
        found = []
        for i in range(self._PAGES):
            # extend() rather than assignment: earlier pages must not be
            # thrown away when the next page is fetched.
            found.extend(self._load(i).xpath(xpath))
        return found

    @staticmethod
    def _clean(text):
        """Return *text* stripped, with every space and '/' removed.

        *text* may be a string or a list of strings; ``''.join`` flattens
        either into one plain string first.
        """
        return ''.join(text).strip().replace(' ', '').replace('/', '')

    @staticmethod
    def _director(text):
        """Return the cleaned *text* up to (not including) the first '主'.

        When '主' does not occur the whole cleaned text is returned;
        ``str.find`` yields -1 in that case and slicing with it would
        silently drop the last character.
        """
        cleaned = Grasp._clean(text)
        cut = cleaned.find('主')
        return cleaned if cut == -1 else cleaned[:cut]

    @staticmethod
    def _quote_at(quotes, j):
        """Return the stripped quote at index *j*, or '没有介绍' if absent."""
        if j < len(quotes):
            return ''.join(quotes[j]).strip()
        return '没有介绍'

    def getName(self):
        """Collect the movie names of all pages into ``self.res`` and return them."""
        self.res = self._collect(self._NAME_XPATH)
        return self.res

    def getDir(self):
        """Collect the raw director text nodes of all pages into ``self.dicr``.

        The nodes are returned unprocessed; ``run`` reads every second node
        (index ``j * 2``) as the director line of entry ``j``.
        """
        self.dicr = self._collect(self._DIRECTOR_XPATH)
        return self.dicr

    def getScore(self):
        """Collect the ratings of all pages into ``self.score`` and return them."""
        self.score = self._collect(self._SCORE_XPATH)
        return self.score

    def getCri(self):
        """Collect the introductions of all pages into ``self.cri`` and return them."""
        self.cri = self._collect(self._QUOTE_XPATH)
        return self.cri

    def run(self):
        """Scrape every page and save the result to './豆瓣电影/movie.xls'.

        Each page becomes one sheet row; entry ``j`` of a page occupies the
        four columns ``4*j .. 4*j+3`` (name, director, introduction, rating).
        """
        import os  # local import: only needed to prepare the output folder

        wb = xlwt.Workbook(encoding='utf-8')
        ws = wb.add_sheet('豆瓣电影')
        for i in range(self._PAGES):
            htmlobj = self._load(i)
            self.res = htmlobj.xpath(self._NAME_XPATH)
            self.dicr = htmlobj.xpath(self._DIRECTOR_XPATH)
            self.cri = htmlobj.xpath(self._QUOTE_XPATH)
            self.score = htmlobj.xpath(self._SCORE_XPATH)
            # Progress/debug output, unchanged from the original script.
            print(self._clean(self.dicr).split(',')[0:1])
            # NOTE(review): introductions are matched to names purely by
            # position in the page-wide list; if an entry on the page has no
            # introduction, later ones would presumably shift by one -
            # confirm against the page markup.
            for j, name in enumerate(self.res):
                col = 4 * j
                ws.write(i, col, self._clean(name))
                ws.write(i, col + 1, self._director(self.dicr[j * 2]))
                ws.write(i, col + 2, self._quote_at(self.cri, j))
                ws.write(i, col + 3, self._clean(self.score[j]))
        # Make sure the target folder exists before saving into it.
        os.makedirs('./豆瓣电影', exist_ok=True)
        wb.save('./豆瓣电影/movie.xls')

# Object-oriented variant: build the scraper and run it right away.
# run() fetches the list pages and saves './豆瓣电影/movie.xls'.
r = Grasp()
r.run()


def _clean_text(text):
    """Return *text* stripped, with every space and '/' removed."""
    return ''.join(text).strip().replace(' ', '').replace('/', '')


def _director_text(text):
    """Return the cleaned *text* up to (not including) the first '主'.

    If '主' is missing the whole cleaned text is kept; ``str.find`` returns
    -1 then, and slicing with it would cut off the last character.
    """
    cleaned = _clean_text(text)
    cut = cleaned.find('主')
    return cleaned if cut == -1 else cleaned[:cut]


def _quote_text(quotes, index):
    """Return the stripped quote at *index*, or '没有介绍' when there is none."""
    if index < len(quotes):
        return ''.join(quotes[index]).strip()
    return '没有介绍'


def reader():
    """Scrape the Douban Top 250 pages and save them to './豆瓣电影/movie.xls'.

    Plain-function counterpart of the class-based scraper.  Ten list pages
    are fetched; each page becomes one sheet row, and entry ``j`` of a page
    fills the four columns ``4*j .. 4*j+3`` (name, director, introduction,
    rating).  ``urlopen``, ``et`` and ``xlwt`` are expected to be imported at
    module level by the surrounding file.
    """
    import os  # local import: only needed to prepare the output folder

    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('豆瓣电影')
    for i in range(0, 10):
        url = f"https://movie.douban.com/top250?start={25*i}&filter="
        html = urlopen(url).read().decode()
        htmlobj = et.HTML(html)
        res = htmlobj.xpath("//div[@class ='hd']/a/span[@class='title'][1]/text()")  # movie name
        dicr = htmlobj.xpath("//div[@class ='bd']/p[1]/text()")  # director line
        cri = htmlobj.xpath("//p[@class ='quote']/span//text()")  # introduction
        score = htmlobj.xpath("//div[@class ='star']/span[@class='rating_num']/text()")  # rating
        # NOTE(review): introductions are paired with names by list position
        # only; an entry without one would presumably shift the later ones -
        # confirm against the page markup.
        for j, name in enumerate(res):
            col = 4 * j
            ws.write(i, col, _clean_text(name))
            # The director line of entry j is read from text node j * 2.
            ws.write(i, col + 1, _director_text(dicr[j * 2]))
            ws.write(i, col + 2, _quote_text(cri, j))
            ws.write(i, col + 3, _clean_text(score[j]))
    # Make sure the target folder exists before saving into it.
    os.makedirs('./豆瓣电影', exist_ok=True)
    wb.save('./豆瓣电影/movie.xls')

# Function-based variant: runs the same scrape and saves to the same
# './豆瓣电影/movie.xls' path that Grasp.run() writes.
reader()


将爬取的数据存储为 Excel 表格（.xls）。
''.join()、strip() 和 replace() 的作用是把 xpath 取到的文本整理成干净的字符串：去掉首尾空白、空格和“/”，便于查看数据。
此外需要注意，xpath 返回的是一个列表，可以直接用列表的方法（下标、len() 等）进行操作，不需要多余的转换。

转载于:https://www.cnblogs.com/superSmall/p/11502872.html

Mdq0512

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
爬虫小练习，面向对象，和函数式编程，爬取豆瓣电影

class Grasp: def __init__(self): pass def getName(self): for i in range(0,10): self.url = f"https://movie.douban.com/top250?start={25*i}&filter=" s...
复制链接

扫一扫