Using Scrapy

A minimal Scrapy spider that crawls two DMOZ directory pages and extracts the title, link, and description of each listed entry.



import scrapy

from mySpider.items import DmozItem


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        # Each <li> under a <ul> on these pages holds one directory entry:
        # an <a> element carrying the title and link, plus loose text nodes
        # that make up the description.
        for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            # Yield each item as it is scraped; Scrapy collects everything
            # produced by this generator.
            yield item
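The spider imports DmozItem from mySpider.items, but the post does not show that file. A minimal items.py consistent with the three fields the spider assigns would look like the sketch below; the field set is taken from the code above, and everything else is standard Scrapy item boilerplate.

import scrapy


class DmozItem(scrapy.Item):
    # Assumed definition: the original items.py is not shown in the post.
    # The fields mirror the keys assigned in DmozSpider.parse.
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()

From the project root, running "scrapy crawl dmoz" starts the spider, and "scrapy crawl dmoz -o items.json" additionally exports the yielded items to a JSON file.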
        