mysql xpath_xpath的常见操作

#-*- coding: utf-8 -*-

import hashlib
import sys
from datetime import *  # noqa: F401,F403 -- kept as a wildcard; only ``datetime`` is used below

import scrapy

try:
    # Module locations used by Scrapy 1.0 and later.
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
except ImportError:
    # Older Scrapy releases only provide the ``scrapy.contrib`` locations.
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors import LinkExtractor

# NOTE(review): ``insertDB`` is not defined in this file; it is presumably
# exported by ``common_lib`` -- confirm. The wildcard import is kept so every
# name it provided before is still available.
from common_lib import *  # noqa: F401,F403

if sys.version_info[0] == 2:
    # Python 2 only: ``reload`` is a builtin there and ``setdefaultencoding``
    # is only reachable after reloading ``sys``. Neither exists on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')


class NhfpcItem(scrapy.Item):
    """Scrapy item declaring the fields url, name, description, size, dateTime.

    The spider below does not populate this item; it hands a plain list to
    ``insertDB`` instead.
    """

    url = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    size = scrapy.Field()
    dateTime = scrapy.Field()


class NhfpcSpider(CrawlSpider):
    """Crawl the nhfpc.gov.cn listing pages and store each matched page.

    Links matching ``.*\\d{6}/.*`` are passed to :meth:`parse_item`; links
    matching ``.*201307.*`` are followed without a callback.
    """

    name = "nhfpc"
    allowed_domains = ["nhfpc.gov.cn"]

    start_urls = (
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_2.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_3.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_4.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_5.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_6.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_7.shtml',
    )

    rules = (
        # Raw string so ``\d`` reaches the regex engine without relying on
        # Python leaving an unknown escape sequence untouched.
        Rule(
            LinkExtractor(allow=r'.*\d{6}/.*'),
            callback='parse_item'),
        Rule(
            LinkExtractor(allow='.*201307.*'),
            follow=True,
        ),
    )

    # Publication time stored when the page exposes no date at all.
    DEFAULT_PUB_TIME = "1970-01-01 00:00:00"
    # Value stored in the "web site" slot of every record.
    WEB_SITE = "nhfpc.gov.cn"

    def parse_item(self, response):
        """Extract title, body text and publication time, then call ``insertDB``.

        The list passed to ``insertDB`` holds, in order: title, MD5 hex digest
        of the URL, publication time string, insertion ``datetime``, web site
        name, body text, and the page URL.

        :param response: the Scrapy response for a page matched by ``rules``.
        :returns: ``None``; the record is handed to ``insertDB``.
        """
        page_url = response.url
        values = [
            self._extract_title(response),
            hashlib.md5(page_url.encode('utf-8')).hexdigest(),
            self._extract_pub_time(response),
            datetime.now(),
            self.WEB_SITE,
            self._extract_content(response),
            page_url,
        ]
        insertDB(values)

    def _extract_title(self, response):
        """Return the stripped first title text node, or ``""`` if none is found."""
        texts = response.xpath("//div[@id='zoomtitle']/*/text()").extract()
        if not texts:
            # NOTE(review): 'zoomtitl' is kept exactly as in the original; it
            # may be a typo for 'zoomtitle' or a real alternate id -- confirm.
            texts = response.xpath("//div[@id='zoomtitl']/*/text()").extract()
        # Guard the index: previously an empty fallback result raised IndexError.
        return texts[0].strip() if texts else ""

    def _extract_content(self, response):
        """Return the concatenated string value of the content container(s)."""
        nodes = response.xpath('//div[@id="zoomcon"]')
        if not nodes:
            nodes = response.xpath('//div[@id="contentzoom"]')
        return ''.join(nodes.xpath('string(.)').extract())

    def _extract_pub_time(self, response):
        """Return ``"<date> 00:00:00"`` or :attr:`DEFAULT_PUB_TIME` if no date is found.

        The date is read from the ``title`` attribute of ``div#zoomtime``
        (the segment after the first ``:``), falling back to the text of
        ``<ucmspubtime>``.
        """
        title_attr = response.xpath("//div[@id='zoomtime']/@title").extract()
        if title_attr:
            segments = ''.join(title_attr).split(":")
            # Same segment the original picked with ``[1]``; fall back to the
            # whole value instead of raising when there is no ':' at all.
            date_text = segments[1] if len(segments) > 1 else segments[0]
        else:
            date_text = ''.join(
                response.xpath("//ucmspubtime/text()").extract())

        date_text = date_text.strip()
        if not date_text:
            # Previously the default was always overwritten, yielding the
            # bare string "00:00:00" when no date was present.
            return self.DEFAULT_PUB_TIME
        # Separate date and time with a space to match DEFAULT_PUB_TIME's
        # "YYYY-MM-DD HH:MM:SS" layout.
        return date_text + " 00:00:00"

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值