# (Part 1) Spider that crawls the first-level categories and hands scraped items to the pipeline
import scrapy,re
from bokeyuan.items import BokeyuanItem
from w3lib.html import remove_tags  # helper for stripping HTML tags
class Bokeyuan(scrapy.Spider):
    """Crawl cnblogs.com: visit each first-level category, walk every list
    page of that category, scrape the post cards, then follow each post to
    fetch its full body.  Completed items are yielded to the item pipeline.
    """
    name = 'bokeyuan'
    allowed_domains = ['cnblogs.com']
    start_urls = ['https://www.cnblogs.com/']

    # Publish date sits between the author link and the comment <span> in the
    # post-footer HTML.  Compiled once here instead of once per post card.
    _DATE_PAT = re.compile(r'<a .+?class="lightblue">.+?</a>(.+?)<span', re.S)

    def parse(self, response):
        """Collect first-level category links and schedule each category page."""
        base_url = 'https://www.cnblogs.com'
        for href in response.xpath('//ul[@id="cate_item"]//a/@href').extract():
            yield scrapy.Request(base_url + href, callback=self.parse_page)

    def parse_page(self, response):
        """Read the maximum page number from the pager, then schedule every
        list page of this category, highest page first."""
        fullurl = response.url
        # The next-to-last pager link holds the max page number.  The pager is
        # absent on single-page categories, so fall back to one page instead of
        # crashing with an IndexError (original used extract()[0]).
        last = response.xpath('//div[@class="pager"]/a[last()-1]/text()').extract_first()
        max_page = int(last) if last and last.strip().isdigit() else 1
        for i in range(max_page, 0, -1):
            yield scrapy.Request(fullurl + str(i), callback=self.parse_list)

    def parse_list(self, response):
        """Scrape every post card on a list page and follow each post link,
        carrying the partially-filled item along in ``meta``."""
        for div in response.css('div#post_list > div'):
            item = BokeyuanItem()
            title = div.xpath('.//h3/a[@class="titlelnk"]/text()').extract_first(default='')
            diggnum = div.xpath('.//span[@class="diggnum"]/text()').extract_first(default='')
            summary = div.xpath('.//p[@class="post_item_summary"]/text()').extract()
            # With an author avatar the first text node is whitespace, so the
            # real summary is the second node; guard the no-text case too.
            if len(summary) >= 2:
                summary = summary[1].strip()
            elif summary:
                summary = summary[0].strip()
            else:
                summary = ''
            article = div.xpath('.//a[@class="lightblue"]/text()').extract_first(default='')
            content = div.xpath(
                './/div[@class="post_item_foot"]//span[@class="article_comment"]/a/text()'
            ).extract_first(default='').strip()
            views = div.xpath(
                './/div[@class="post_item_foot"]//span[@class="article_view"]/a/text()'
            ).extract_first(default='')
            foot = div.xpath('.//div[@class="post_item_foot"]').extract_first(default='')
            m = self._DATE_PAT.search(foot)
            # strip('发布于') removes the "published on" prefix characters
            # surrounding the date text (character-set strip, as in original).
            dates = m.group(1).strip().strip('发布于').strip() if m else ''
            info_url = div.xpath('.//h3/a[@class="titlelnk"]/@href').extract_first()
            item["title"] = title
            item["diggnum"] = diggnum
            item["summary"] = summary
            item["article"] = article
            item["content"] = content
            item["views"] = views
            item["dates"] = dates
            item["info_url"] = info_url
            # Only follow posts that actually have a link; a card without one
            # would previously have crashed the whole page parse.
            if info_url:
                yield scrapy.Request(info_url, callback=self.parse_info, meta={'item': item})

    def parse_info(self, response):
        """Fetch the full post body (keeping only div/p tags) and emit the item."""
        item = response.meta['item']
        body = response.css('div#cnblogs_post_body').extract_first()
        item['body_content'] = remove_tags(body, keep=('div', 'p')) if body else ''
        yield item
# (Part 2) Spider that fetches the second-level categories
import scrapy
class PostCate(scrapy.Spider):
    """POST the first-level category ids to the SubCategories endpoint and
    print the returned second-level category payload."""
    name = 'postcate'
    allowed_domains = ['cnblogs.com']
    start_urls = ['https://www.cnblogs.com/']

    def parse(self, response):
        """Issue the POST request that returns the sub-category data."""
        endpoint = 'https://www.cnblogs.com/aggsite/SubCategories'
        payload = {"cateIds": "108698,2,108701,108703,108704,108705,108709,108712,108724,4"}
        yield scrapy.FormRequest(
            endpoint,
            callback=self.parse_data,
            method='POST',
            formdata=payload,
        )

    def parse_data(self, response):
        """Dump the raw response body for inspection."""
        print(response.text)
# The POST request above was discovered with the Fiddler capture tool; replaying
# it returns the sub-category response content.