#-*- coding: utf-8 -*-
import scrapy
import sys
import hashlib
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from datetime import *
from common_lib import *

# Python 2 only: reload(sys) re-exposes sys.setdefaultencoding, which is then
# used to make UTF-8 the default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')


class NhfpcItem(scrapy.Item):
    """Scrapy item declaring the fields of a crawled nhfpc.gov.cn document."""

    url = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    size = scrapy.Field()
    dateTime = scrapy.Field()


class NhfpcSpider(scrapy.contrib.spiders.CrawlSpider):
    """Crawl the nhfpc.gov.cn listing pages and store every matched article.

    Links whose URL contains six digits followed by a slash are handed to
    ``parse_item``; links containing ``201307`` are followed for more links.
    """

    name = "nhfpc"
    allowed_domains = ["nhfpc.gov.cn"]
    start_urls = (
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_2.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_3.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_4.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_5.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_6.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_7.shtml',
    )
    rules = (
        Rule(
            LinkExtractor(allow=r'.*\d{6}/.*'),
            callback='parse_item'),
        Rule(
            LinkExtractor(allow=r'.*201307.*'),
            follow=True,
        ),
    )

    # Value stored when no publication date can be read from the page; it also
    # defines the "YYYY-MM-DD HH:MM:SS" layout every stored value follows.
    _DEFAULT_PUB_TIME = "1970-01-01 00:00:00"
    _MIDNIGHT = "00:00:00"
    _WEB_SITE = "nhfpc.gov.cn"

    # Tried in order; the first expression that yields text wins.
    # NOTE(review): 'zoomtitl' is kept verbatim from the original code --
    # confirm it is a real element id on the site and not a typo.
    _TITLE_XPATHS = (
        "//div[@id='zoomtitle']/*/text()",
        "//div[@id='zoomtitl']/*/text()",
    )

    def parse_item(self, response):
        """Extract one article from ``response`` and pass it to ``insertDB``.

        The list handed to ``insertDB`` holds, in order: title, MD5 hex digest
        of the URL, publication time string, insertion time (``datetime``),
        web site name, text content and the URL itself.

        ``insertDB`` is not defined in this file; presumably it is provided by
        the ``common_lib`` star import.
        """
        url_digest = hashlib.md5(response.url.encode('utf-8')).hexdigest()
        record = [
            self._extract_title(response),
            url_digest,
            self._extract_pub_time(response),
            datetime.now(),
            self._WEB_SITE,
            self._extract_content(response),
            response.url,
        ]
        insertDB(record)

    def _extract_title(self, response):
        """Return the stripped article title, or "" when none is found."""
        for expression in self._TITLE_XPATHS:
            found = response.xpath(expression).extract()
            if found:
                return found[0].strip()
        # Neither container exists: keep the empty default instead of raising
        # IndexError on found[0].
        return ""

    def _extract_content(self, response):
        """Return the text of the article body container ("" if absent)."""
        body = response.xpath('//div[@id="zoomcon"]')
        if not body:
            body = response.xpath('//div[@id="contentzoom"]')
        return ''.join(body.xpath('string(.)').extract())

    def _extract_pub_time(self, response):
        """Return the publication time as "<date> 00:00:00".

        The date is taken from the text between the first and second ':' of
        the ``zoomtime`` title attribute, or, when that attribute is absent,
        from the ``ucmspubtime`` element. Falls back to the epoch default when
        no date text is available.
        """
        stamp = response.xpath("//div[@id='zoomtime']/@title").extract()
        if stamp:
            pieces = ''.join(stamp).split(":")
            # A title without ':' has no value part; treat it as missing
            # rather than raising IndexError.
            raw_date = pieces[1] if len(pieces) > 1 else ''
        else:
            raw_date = ''.join(
                response.xpath("//ucmspubtime/text()").extract())

        date_part = raw_date.strip()
        if not date_part:
            return self._DEFAULT_PUB_TIME
        # Exactly one space between date and time, matching the default's
        # layout regardless of whitespace around the scraped date.
        return date_part + " " + self._MIDNIGHT