爬虫数据定时采集,异步持久化存储,可视化
原来scrapy框架爬取百度热搜还可以这么玩
本篇文章有点长,纯属记录并分享自己的心得。某一天打开浏览器,百度搜索引擎每天都会推送当天热点信息,它的这个热度统计是怎么来的呢?然后引起我的注意,能不能每天一醒来就看到可视化结果,于是开始了慢慢的摸索。
后来发现和这个百度风云榜的数据是一样的。这里不仅有实时热搜还有各类的百度搜索排行榜,兴趣更加浓厚,想想直接试试爬虫怎么样,能不能可视化呢,邮件提醒等等
准备阶段
由于一时兴趣学习爬虫,了解到scrapy的一些基本原理和操作就开始爬虫之路。创建完项目之后直接用pycharm工具打开并设置虚拟环境。
然后开始对hot.py爬虫文件进行编写。
为了应对网站的反爬虫技术,就自定义一个headers,重写start_requests方法对起始url发起请求,并在settings.py文件中设置ROBOTSTXT_OBEY = False,下载延迟DOWNLOAD_DELAY = 0.5
import scrapy
from ..items import HotsearchItem, population_gender_Item, population_old_Item, Myitem
from .hhhhhhhhhhhh import Bigcdata
class HotSpider(scrapy.Spider):
    """Spider for the Baidu top-search board index (百度风云榜).

    Starts from the "all boards" page, then `parse` schedules one
    detail request per board.
    """

    name = 'hot'
    # allowed_domains = ['http://top.baidu.com/']
    start_urls = ['http://top.baidu.com/boards?fr=topbuzz_b4_c2']

    # BUG FIX: 'LOG_LEVEL' is a Scrapy *setting*, not an HTTP header.
    # It used to sit inside `headers` and was sent to the server as a
    # bogus request header; it belongs in custom_settings instead.
    custom_settings = {'LOG_LEVEL': 'ERROR'}

    # Anti-bot request headers (mobile Chrome UA + matching Host/Referer).
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Host": "top.baidu.com",
        "Referer": "http://top.baidu.com/boards?fr=topboards",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Mobile Safari/537.36",
    }

    def start_requests(self):
        """Issue the initial request(s) with the custom headers attached.

        Iterates `start_urls` instead of hard-indexing ``[0]`` so that
        adding more seed URLs later requires no code change.
        """
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse, headers=self.headers)
def parse(self, response):
print("我进入了全部榜单开始获得子链接")
# print(response.text)
# 去掉部分url 化妆品月度榜单 手机月度榜单 汽车月度榜单
dic = {
}
t_list = response.css("div.links a")
for t in t_list:
href = t.xpath('./@href').extract()[0]
title = t.xpath('./text()').extract()[0]
dic[href] = title
# print(title,href)
# print(len(t_list),t_list)
remove_list = ['./buzz?b=1564','./buzz?b=1566','./buzz?b=1565']
for item in remove_list:
dic.pop(item)
# print(dic.items())
dic['./buzz?b=1677&c=536'] = '少儿影视'
dic['./buzz?b=1678&c=537'] = '纪录片'
print(len(dic.keys()))
#开始拼接子url
for item in dic:
url = "http://top.baidu.com/" + item.split('.')[-1]
yield scrapy.Request(url=url,callback=self.dtail)
#
def dtail(self,response):
print("你好:我已经到详情页面了")
data_dic = {
}
hotitem = HotsearchItem()
Title = response.css("div.top-title h2::text").extract()
hotitem["Title"] = Title[0]
# print(Title[0])
#tr_list = response.xpath("//tr[position() !=1 and position()!=3 and position()!=5 and position()!=7]")
tr_list = response.xpath("//tr[position() !=1 ]")
# print(type(tr_list),tr_list)
data_list = []
for tr in tr_list:
#经发现存在None
nickname = tr.xpath('./td[2]/a[1]/text()').extract_first()
url = tr.xpath('./td[@class="keyword"]/a[1]/@href').extract_first()
#拿到热度了
hot_number = tr.xpath('./td[@class="last"]/span/text()').extract_first()
#上升下降指标
icon = tr.xpath('./td[@class="last"]/span/@class').extract_first()
num_top = tr.xpath('./td[@class="first"]/span/text()').extract_first()
if nickname and url