Approach
On NetEase News the section list on the home page is loaded statically, but the news feed inside each section is loaded dynamically. The crawler therefore combines Scrapy with Selenium: Scrapy handles the static pages, while Selenium renders the dynamic section pages so their HTML can be handed back to Scrapy.
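For the middleware and pipeline shown below to take effect, they have to be enabled in settings.py. A minimal sketch, assuming the project keeps Scrapy's default module layout (wangyiPro.middlewares / wangyiPro.pipelines); the priority numbers are arbitrary:

# settings.py (excerpt)
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'        # keep the console output readable

DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}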
Spider code
import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem

class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['xx.com']
    start_urls = ['https://news.163.com/']
    url_list = []

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Create one Chrome browser on the spider instance; the downloader
        # middleware can reach it later through its spider argument.
        self.chrome = webdriver.Chrome()

    def parse(self, response):
        all_li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        menu_list = [3, 4, 6, 7, 8]
        # Only five sections are crawled: domestic, international, military,
        # aviation and news.
        for i in menu_list:
            menu_url = all_li_list[i].xpath('./a/@href').extract_first()
            self.url_list.append(menu_url)
        # Collect the section URLs and send a second round of requests for them.
        for url in self.url_list:
            # dont_filter keeps the scheduler's duplicate filter from dropping
            # these requests.
            yield scrapy.Request(url=url, callback=self.parse_info, dont_filter=True)

    def parse_info(self, response):
        # Get the URL of every article in each section and crawl one level
        # deeper for its content.
        divs = response.xpath('//div[@class="ndi_main"]/div')
        for div in divs:
            detail_title = div.xpath('.//h3/a/text()').extract_first()
            detail_url = div.xpath('.//h3/a/@href').extract_first()
            # Wrap the title in an item and pass it along with the request via meta.
            item = WangyiproItem()
            item['content_name'] = detail_title
            yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                 meta={'item': item}, dont_filter=True)

    def parse_detail(self, response):
        # Pick up the item passed through meta.
        item = response.meta['item']
        content = response.xpath('string(//div[@class="post_body"])').extract_first()
        item['content'] = content
        yield item

    def closed(self, spider):
        # Shut down the browser when the spider finishes.
        self.chrome.quit()
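The crawler is started with scrapy crawl wangyi. If the Chrome window popping up is a nuisance, the browser can be run headless; a minimal sketch using Selenium's standard ChromeOptions (an optional tweak, not part of the original code):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')       # no visible browser window
options.add_argument('--disable-gpu')
chrome = webdriver.Chrome(options=options)   # use this in the spider's __init__ instead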
Downloader middleware code
from scrapy import signals
from scrapy.http import HtmlResponse
from itemadapter import is_item, ItemAdapter
from time import sleep
from fake_useragent import UserAgent

class WangyiproDownloaderMiddleware:
    def process_request(self, request, spider):
        # UA spoofing (uncomment to enable):
        # request.headers.setdefault('User-Agent', UserAgent().chrome)
        return None

    def process_response(self, request, response, spider):
        # Reuse the Selenium browser attached to the spider.
        chrome = spider.chrome
        # Selenium only renders the five section pages; the individual article
        # pages are left to Scrapy's default downloader.
        if request.url in spider.url_list:
            chrome.get(request.url)
            sleep(3)
            page_source = chrome.page_source
            return HtmlResponse(url=request.url, body=page_source,
                                encoding='utf-8', request=request)
        else:
            return response

    def process_exception(self, request, exception, spider):
        # Proxy switching: this hook only runs when a request raises an exception.
        request.meta['proxy'] = 'https://ip:port'
        # Return the request so it is retried with the proxy applied.
        return request
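If the section pages lazy-load more article cards as they scroll (common for this kind of feed), a scroll loop can be added before grabbing page_source. A minimal sketch of the Selenium part inside process_response; the scroll count and delays are assumptions to tune against the real pages:

chrome.get(request.url)
for _ in range(3):                # arbitrary number of scrolls
    chrome.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    sleep(1)                      # give the lazy-loaded cards time to render
page_source = chrome.page_source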
Pipeline code
import pymongo

class WangyiproPipeline:
    def open_spider(self, spider):
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # Save each item into MongoDB (database: wangyi, collection: data).
        self.client.wangyi.data.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
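To check that items actually land in MongoDB, a quick look from a separate script works; a minimal sketch, assuming a default local MongoDB instance and the wangyi.data collection used by the pipeline:

import pymongo

client = pymongo.MongoClient()            # default localhost:27017
collection = client.wangyi.data
print(collection.count_documents({}))     # number of stored news items
print(collection.find_one())              # sample document: content_name + content
client.close()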
Items code
import scrapy

class WangyiproItem(scrapy.Item):
    # define the fields for your item here like:
    content_name = scrapy.Field()
    content = scrapy.Field()