Crawling CSRC Penalty Announcements

Target listing page: http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm

The spider below is a Scrapy CrawlSpider: one Rule extracts links to the individual announcement pages, while pagination is generated manually in start_requests, because the pagination Rule did not match (see the comment in the code).

# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CfSpider(CrawlSpider):
    name = 'cf'
    allowed_domains = ['csrc.gov.cn']
    start_urls = ['http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm']

    rules = (
        # Detail pages of individual penalty announcements.
        Rule(LinkExtractor(allow=r'/G\d+/\d+/t\d+_\d+\.htm'), callback='parse_item'),
        # Rule(LinkExtractor(allow=r'/3300/3313/index_7401_.*?\.htm'), follow=True),
        # The pagination rule above did not match, so start_requests is overridden instead.
    )

    def start_requests(self):
        # 67 listing pages in total: index_7401.htm, then index_7401_1.htm .. index_7401_66.htm
        current_page = 0
        while current_page < 67:
            if current_page == 0:
                # The first page carries no page number (same URL as start_urls).
                next_url = 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm'
            else:
                url = 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401_{}.htm'
                next_url = url.format(current_page)
            # CrawlSpider's built-in parse() applies the rules above to each listing page.
            yield scrapy.Request(url=next_url, callback=self.parse)
            current_page += 1

    def parse_item(self, response):
        item = dict()
        item['title'] = response.xpath("//span[@id='lTitle']/text()").extract_first()
        # Publication date (e.g. 2019年01月02日), rendered inside a <span> on the page.
        dates = re.findall(r'<span>(20\d+年\d{2}月\d{2}日)</span>', response.body.decode(), re.S)
        item['pub_title'] = dates[0] if dates else None
        item['index_number'] = response.xpath(
            "//table[@id='headContainer']//tr[1]//td[@colspan='2']//td[1]/text()"
        ).extract_first()
        item['href'] = response.url
        yield item
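Besides the usual scrapy crawl cf command, the spider can be launched from a plain script with scrapy.crawler.CrawlerProcess. The sketch below is an illustrative assumption, not part of the original post: the module name cf_spider, the FEEDS export setting (Scrapy 2.1+; older versions use FEED_URI/FEED_FORMAT), and the output file cf_items.jl are all hypothetical choices.

# Minimal runner sketch (assumptions noted above): runs CfSpider outside
# a Scrapy project and exports the yielded items as JSON lines.
from scrapy.crawler import CrawlerProcess

from cf_spider import CfSpider  # hypothetical module holding the spider above

process = CrawlerProcess(settings={
    # FEEDS requires Scrapy >= 2.1; the filename is an illustrative choice.
    'FEEDS': {'cf_items.jl': {'format': 'jsonlines', 'encoding': 'utf8'}},
    'DOWNLOAD_DELAY': 1,  # be polite to csrc.gov.cn
})
process.crawl(CfSpider)
process.start()  # blocks until the crawl finishes

Each line of cf_items.jl then holds one announcement with the title, pub_title, index_number, and href fields yielded by parse_item.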