# Target listing page: http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CfSpider(CrawlSpider):
    name = 'cf'
    allowed_domains = ['csrc.gov.cn']
    start_urls = ['http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm']
    rules = (
        Rule(LinkExtractor(allow=r'/G\d+/\d+/t\d+_\d+\.htm'), callback='parse_item'),
        # Rule(LinkExtractor(allow=r'/3300/3313/index_7401_.*?\.htm'), follow=True),  # did not work, so start_requests is overridden instead
    )
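    # For reference, the shape of a detail-page URL the allow pattern above is
    # meant to match (hypothetical example, not taken from the site):
    #   /pub/zjhpublic/G00306212/201912/t20191206_123456.htm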

    def start_requests(self):
        # The listing spans 67 pages: the first page is index_7401.htm,
        # later pages are index_7401_1.htm, index_7401_2.htm, and so on.
        current_page = 0
        while current_page < 67:
            if current_page == 0:
                # First page has no page-number suffix and ends in ".htm",
                # matching the URL in start_urls.
                url = 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401'
                next_url = url + ".htm"
            else:
                url = 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401_{}.htm'
                next_url = url.format(current_page)
            # callback=self.parse is CrawlSpider's own parse(), which applies
            # the rules above to each listing page.
            yield scrapy.Request(url=next_url, callback=self.parse)
            current_page += 1

    def parse_item(self, response):
        item = dict()
        item["title"] = response.xpath("//span[@id='lTitle']/text()").extract_first()
        # Publication date, e.g. "2019年12月06日", scraped from the raw HTML;
        # None when the page carries no matching <span>.
        item["pub_title"] = re.findall(r"<span>(20\d+年\d{2}月\d{2}日)</span>", response.body.decode(), re.S)
        item["pub_title"] = item["pub_title"][0] if item["pub_title"] else None
        item["index_number"] = response.xpath("//table[@id='headContainer']//tr[1]//td[@colspan='2']//td[1]/text()").extract_first()
        item["href"] = response.url
        yield item
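

# A minimal sketch for running the spider standalone instead of via
# "scrapy crawl cf". Assumes Scrapy >= 2.1 for the FEEDS setting; the
# output filename cf_items.jsonl is arbitrary.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        "FEEDS": {"cf_items.jsonl": {"format": "jsonlines"}},
    })
    process.crawl(CfSpider)
    process.start()  # blocks until the crawl finishes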