Approach
URL: http://wz.sun0769.com/political/index/supervise?page=0
Goal: crawl the site-wide list pages for each post's case number and title, plus the number and content on each detail page.
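The spider below uses the CrawlSpider template. As a rough sketch, the project layout it assumes (package sunPro, spider name sun, taken from the imports and the name attribute in the code) can be generated with the standard Scrapy commands:

scrapy startproject sunPro
cd sunPro
scrapy genspider -t crawl sun wz.sun0769.com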
Spider code
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem
from sunPro.items import MyItem
class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['xxx.com']
    start_urls = ['http://wz.sun0769.com/political/index/supervise?page=1']
    # Extract links with XPath: pagination links and detail-page links
    link = LinkExtractor(restrict_xpaths=('//div[@class="mr-three paging-box"]/a | //div[@class="page_div"]/a'))
    link_2 = LinkExtractor(restrict_xpaths=('//ul[@class="title-state-ul"]/li/span[3]/a'))
    # The same links can be extracted with regexes instead:
    # link = LinkExtractor(allow=r'supervise\?page=\d+')
    # # <a href="/political/index/supervise?page=2" class="page-num">2</a>
    # link_2 = LinkExtractor(allow=r'index\?id=\d+')
    # # <a target="_blank" href="/political/politics/index?id=499261" class="color-hover">对大朗镇道路交通管理的一些建议</a>
    # The extracted hrefs are relative, but CrawlSpider joins them into absolute URLs automatically
    # (see the scrapy shell sketch after this spider for a quick way to check the extractors).
    # Execution flow: crawling starts from start_urls; requests go through the scheduler to the
    # downloader, and the downloaded responses are matched against the rules (instead of being
    # handled by a parse callback you write yourself); everything else works like a regular Spider.
    rules = (
        Rule(link, callback='parse_item', follow=True),
        Rule(link_2, callback='parse_info', follow=False),
    )
    def parse_item(self, response):
        li_list = response.xpath('//ul[@class="title-state-ul"]/li')
        for li in li_list:
            number = li.xpath('./span[1]/text()').extract_first()
            title = li.xpath('./span[3]/a/text()').extract_first()
            # Instantiate a fresh item for every row so each yielded item keeps its own values
            item = SunproItem()
            item['number'] = number
            item['title'] = title
            # print(number, title)
            yield item
    def parse_info(self, response):
        info_number = response.xpath(
            '//div[@class="focus-date clear focus-date-list"]/span[last()]/text()').extract_first()
        # normalize-space() collapses the whitespace inside the <pre> content
        content = response.xpath('normalize-space(//div[@class="details-box"]/pre/text())').extract_first()
        item = MyItem()
        # Strip the "编号:" prefix so only the numeric id is kept
        item['info_number'] = info_number.replace('编号:', '')
        item['content'] = content
        # print(info_number, content)
        yield item
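Before running the full crawl, the link extractors can be sanity-checked interactively. A quick sketch using scrapy shell (it assumes the page structure quoted in the comments above; the regex variants are shown here, the restrict_xpaths ones work the same way):

# scrapy shell "http://wz.sun0769.com/political/index/supervise?page=1"
from scrapy.linkextractors import LinkExtractor
# Pagination links: extract_links() returns Link objects with absolute URLs,
# which is why the relative hrefs never need to be completed by hand
LinkExtractor(allow=r'supervise\?page=\d+').extract_links(response)
# Detail-page links
LinkExtractor(allow=r'index\?id=\d+').extract_links(response)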
Item code
import scrapy
class SunproItem(scrapy.Item):
    # Fields collected from the list pages
    number = scrapy.Field()
    title = scrapy.Field()

class MyItem(scrapy.Item):
    # Fields collected from the detail pages
    info_number = scrapy.Field()
    content = scrapy.Field()
# Two item classes are defined; note the capital I when inheriting from scrapy.Item
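A scrapy.Item behaves like a dict, which is what both the spider and the pipeline rely on. A quick sketch (the field values are made-up placeholders):

item = SunproItem(number='1', title='placeholder title')
item['number']   # dict-style field access, as used in the spider callbacks
dict(item)       # plain dict, handy if the MongoDB lines in the pipeline are enabled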
Pipeline code
# import pymongo  # needed only if the MongoDB lines below are enabled

class SunproPipeline:
    # def open_spider(self, spider):
    #     self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # The two item types are told apart by their class name
        if item.__class__.__name__ == 'SunproItem':
            print("SunproItem", item['number'], item['title'])
            # self.client.sun.data.insert_one(dict(item))
        else:
            print(item['info_number'], item['content'])
            # self.client.sun.data.update_one({'number': item['info_number']}, {'$set': {'content': item['content']}})
        return item
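The pipeline only runs if it is enabled in settings.py. A minimal sketch, assuming the project package is sunPro as in the spider's imports (the priority value 300 is an arbitrary choice):

ITEM_PIPELINES = {
    'sunPro.pipelines.SunproPipeline': 300,
}

With that in place the crawl is started with scrapy crawl sun. An equivalent way to tell the two item types apart in process_item is isinstance(item, SunproItem), which requires importing the item classes into the pipeline module.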