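# Douban movie crawler: searches Douban Movie for a keyword and charts the rating distribution of the results.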
import time

from pyecharts import options as opts
from pyecharts.charts import Bar
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
class DouBanSpider():
    """Crawl Douban Movie search results and chart their rating distribution.

    The spider drives a Chrome browser through Selenium: it opens the Douban
    Movie home page, submits a search keyword, walks the result pages,
    tallies every result's rating into whole-number buckets and finally
    renders the tally as a pyecharts bar chart.
    """

    def __init__(self):
        """Start a Chrome session and initialise an empty rating histogram."""
        self.driver = webdriver.Chrome()
        self.url = 'https://movie.douban.com/'
        # Histogram of ratings: key is the integer bucket (9 means ">= 9"),
        # value is the number of results that fell into it.  Insertion order
        # (9 down to 0) is also the x-axis order of the chart.
        self.dicts = {9:0,8:0,7:0,6:0,5:0,4:0,3:0,2:0,1:0,0:0}

    def open(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    def get_tag_by_id(self, id):
        """Return the first element whose ``id`` attribute equals ``id``.

        The parameter name shadows the builtin but is kept so existing
        keyword callers continue to work.
        """
        return self.driver.find_element(By.ID, id)

    def get_tag_by_cn(self, cn):
        """Return the first element carrying the CSS class name ``cn``."""
        return self.driver.find_element(By.CLASS_NAME, cn)

    def set_keys(self, tag, value):
        """Type ``value`` into the element ``tag``."""
        tag.send_keys(value)

    @staticmethod
    def _to_grade(text):
        """Convert a scraped rating to ``float``; unparsable input yields 0.0."""
        try:
            return float(text)
        except (TypeError, ValueError):
            # The rating node can exist but be empty; count it as "no rating".
            return 0.0

    def _parse_current_page(self):
        """Print and tally every result card on the page currently loaded."""
        cards = self.driver.find_elements(
            By.XPATH, '//div[contains(@class,"sc-bZQynM")]')
        for card in cards:
            title = card.find_element(
                By.XPATH, './/div[@class="title"]/a').text
            try:
                rating_text = card.find_element(
                    By.CLASS_NAME, 'rating_nums').text
            except NoSuchElementException:
                # Some cards have no rating node at all.
                rating_text = ''
            pl = card.find_element(By.CLASS_NAME, 'pl').text
            info = card.find_element(
                By.XPATH,
                './/div[@class="item-root"]//div[@class="meta abstract"]').text
            action = card.find_element(
                By.XPATH,
                './/div[@class="item-root"]//div[@class="meta abstract_2"]').text
            print(title, rating_text or 0, pl, info, action)
            self.cal_rating(self._to_grade(rating_text))

    def _go_to_next_page(self):
        """Click the last link of the paginator; return False when that fails.

        NOTE(review): the XPath takes the last ``<a>`` inside the paginator,
        presumably the "next page" link.  Confirm against the live markup
        that this lookup fails on the final page; otherwise the crawl would
        not terminate.
        """
        try:
            next_link = self.driver.find_element(
                By.XPATH, '//div[contains(@class,"paginator")]//a[last()]')
            next_link.click()
        except WebDriverException:
            return False
        return True

    def parse_content(self):
        """Scrape every result page, then render the chart and quit Chrome.

        Pages are visited in a loop (not by recursion) until the next-page
        link can no longer be found or clicked.  The chart is rendered and
        the browser shut down even when scraping stops early.
        """
        try:
            while True:
                self._parse_current_page()
                if not self._go_to_next_page():
                    break
                # Give the next page time to render, as start() does after
                # submitting the search.
                time.sleep(2)
        except WebDriverException as exc:
            # Best effort: a card with unexpected markup ends the crawl but
            # the data collected so far is still charted below.
            print('crawl stopped early:', exc)
        finally:
            try:
                self.paint_table()  # draw the chart once crawling is done
            finally:
                # quit() also terminates the driver process, not just the
                # current window.
                self.driver.quit()

    def cal_rating(self, grade):
        """Increment the histogram bucket that ``grade`` belongs to.

        Grades of 9 and above share bucket 9; grades below 1 (including the
        0 used for "no rating") fall into bucket 0; anything in between goes
        to the bucket of its integer part.
        """
        if grade >= 9:
            bucket = 9
        elif grade >= 1:
            bucket = int(grade)
        else:
            bucket = 0
        self.dicts[bucket] += 1

    def paint_table(self):
        """Render the rating histogram as a pyecharts bar chart."""
        # NOTE(review): the series name and chart title look like leftovers
        # from a pyecharts sample and do not describe ratings; confirm the
        # intended wording before changing them.
        bar = (
            Bar()
            .add_xaxis([str(bucket) for bucket in self.dicts])
            .add_yaxis("xxx", list(self.dicts.values()))
            .set_global_opts(title_opts=opts.TitleOpts(title="某商场销售情况"))
        )
        bar.render()

    def start(self, keyword='成龙'):
        """Search Douban Movie for ``keyword`` and crawl the results.

        Args:
            keyword: Text typed into the search box.  Defaults to the value
                that used to be hard-coded.
        """
        self.open(self.url)
        time.sleep(1)
        search_box = self.get_tag_by_id('inp-query')
        self.set_keys(search_box, keyword)
        self.get_tag_by_cn('inp-btn').click()
        time.sleep(2)
        self.parse_content()
# Script entry point: build the spider and run the default crawl.
if __name__ == '__main__':
    DouBanSpider().start()