2021-03-24

最新推荐文章于 2022-01-26 22:26:06 发布

不同林

最新推荐文章于 2022-01-26 22:26:06 发布

阅读量82

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/aclplr/article/details/115189875

版权

爬虫专栏收录该内容

5 篇文章 0 订阅

订阅专栏

爬取豆瓣

from selenium import webdriver
import time
from pyecharts.charts import Bar
from pyecharts import options as opts

class DouBanSpider():
    """Selenium-driven crawler that searches Douban Movie and charts ratings.

    Instance attributes (all set in ``__init__``):
        driver: Chrome WebDriver used for every page interaction.
        url: landing page opened by ``start``.
        dicts: histogram mapping an integer rating bucket (9 down to 0) to
            the number of parsed results that fell into that bucket.
    """

    def __init__(self):
        """Launch Chrome and initialise an empty rating histogram."""
        self.driver = webdriver.Chrome()
        self.url = 'https://movie.douban.com/'
        # Keys are inserted 9 -> 0; paint_table relies on this order for
        # the x axis of the chart.
        self.dicts = dict.fromkeys(range(9, -1, -1), 0)

    def open(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    def get_tag_by_id(self, id):
        """Return the first element whose ``id`` attribute equals ``id``."""
        # Function-scope import: ``By`` is not part of the module-level
        # import block. ``find_element(By..., ...)`` replaces the
        # ``find_element_by_*`` helpers that Selenium 4.3 removed.
        from selenium.webdriver.common.by import By

        return self.driver.find_element(By.ID, id)

    def get_tag_by_cn(self, cn):
        """Return the first element that carries the CSS class ``cn``."""
        from selenium.webdriver.common.by import By

        return self.driver.find_element(By.CLASS_NAME, cn)

    def set_keys(self, tag, value):
        """Type ``value`` into the element ``tag``."""
        tag.send_keys(value)

    @staticmethod
    def _parse_rating(raw):
        """Convert scraped rating text to a float, using 0.0 when unusable.

        ``raw`` is either the text of the rating element or the ``0``
        placeholder used when that element is missing. Empty or
        non-numeric text yields 0.0 instead of raising ``ValueError``.
        """
        try:
            return float(raw)
        except (TypeError, ValueError):
            return 0.0

    def parse_content(self):
        """Print and tally every result on the current page, then paginate.

        For each result container the title, rating, the text of the
        ``pl`` element and the two ``meta`` blocks are printed, and the
        rating is added to the histogram via ``cal_rating``. Afterwards the
        last link inside the paginator is clicked and this method calls
        itself for the next page.

        Any exception raised while advancing -- including one raised while
        parsing a later page, because the recursive call sits inside the
        same ``try`` -- is treated as the end of the crawl: the chart is
        rendered and the browser session is shut down.
        """
        from selenium.common.exceptions import NoSuchElementException
        from selenium.webdriver.common.by import By

        div_list = self.driver.find_elements(
            By.XPATH, '//div[contains(@class,"sc-bZQynM")]')
        for div in div_list:
            title = div.find_element(
                By.XPATH, './/div[@class="title"]/a').text
            try:
                rating_num = div.find_element(
                    By.CLASS_NAME, 'rating_nums').text
            except NoSuchElementException:
                # No rating element on this result: show and count it as 0.
                rating_num = 0
            pl = div.find_element(By.CLASS_NAME, 'pl').text
            info = div.find_element(
                By.XPATH,
                './/div[@class="item-root"]//div[@class="meta abstract"]').text
            action = div.find_element(
                By.XPATH,
                './/div[@class="item-root"]//div[@class="meta abstract_2"]').text
            print(title, rating_num, pl, info, action)
            self.cal_rating(self._parse_rating(rating_num))

        try:
            next_link = self.driver.find_element(
                By.XPATH, '//div[contains(@class,"paginator")]//a[last()]')
            next_link.click()
            self.parse_content()
        except Exception:
            # Crawl finished (or could not continue): draw the chart.
            self.paint_table()
            # quit() ends the whole WebDriver session; close() would only
            # close the current window and leave the driver running.
            self.driver.quit()

    def cal_rating(self, grade):
        """Increment the histogram bucket that ``grade`` belongs to.

        Grades of 9 and above share bucket 9, grades in [1, 9) use their
        integer part, and everything else (below 1, or NaN) goes to
        bucket 0.
        """
        if grade >= 9:
            bucket = 9
        elif grade >= 1:
            bucket = int(grade)
        else:
            bucket = 0
        self.dicts[bucket] += 1

    def paint_table(self):
        """Render the rating histogram as a pyecharts bar chart."""
        # NOTE(review): the series name and chart title below look like
        # leftovers from a pyecharts example rather than labels for rating
        # data -- confirm before changing these user-visible strings.
        bar = (
            Bar()
            .add_xaxis([str(key) for key in self.dicts])
            .add_yaxis("xxx", list(self.dicts.values()))
            .set_global_opts(title_opts=opts.TitleOpts(title="某商场销售情况"))
        )
        bar.render()

    def start(self, keyword='成龙'):
        """Open Douban Movie, search for ``keyword`` and crawl the results.

        Args:
            keyword: text typed into the search box; defaults to the value
                that was previously hard-coded.
        """
        self.open(self.url)
        time.sleep(1)  # fixed pause before looking up the search box
        inp = self.get_tag_by_id('inp-query')
        self.set_keys(inp, keyword)
        btn = self.get_tag_by_cn('inp-btn')
        btn.click()
        time.sleep(2)  # fixed pause before parsing the result page
        self.parse_content()


# The entry point lives at module level: inside the class body the name
# ``DouBanSpider`` is not bound yet, so instantiating it there raises
# NameError when the file is run as a script.
if __name__ == '__main__':
    dbs = DouBanSpider()
    dbs.start()

不同林

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
2021-03-24

爬取豆瓣：from selenium import webdriver; import time; from pyecharts.charts import Bar; from pyecharts import options as opts; class DouBanSpider(): def __init__(self): self.driver = webdriver.Chrome(); self.url = 'https://movie.douban.com/' ……
复制链接

扫一扫

专栏目录