工具:Firefox(推荐,好用)
相较于 BeautifulSoup,Scrapy 显得优雅又快速,试手如下:
from scrapy.spider import Spider
from scrapy.selector import Selector
from health.items import HealthItem
# from health.pipelines import HealthPipeline
import simplejson
class DmozSpider(Spider):
    """Spider that extracts link titles and URLs from the zzk.xywy.com start page.

    For every ``<a class="fsize14">`` found under a ``<div class="shentih">``
    container, one ``HealthItem`` is yielded with two fields:

    * ``title`` -- the anchor's text node, UTF-8 encoded.
    * ``link``  -- the anchor's ``href`` attribute, UTF-8 encoded.
    """

    name = "all"
    # Must cover the host used in ``start_urls``. The previous value,
    # "xywy.org", did not match "zzk.xywy.com".
    allowed_domains = ["xywy.com"]
    start_urls = [
        "http://zzk.xywy.com/",
    ]

    def parse(self, response):
        """Yield one ``HealthItem`` per matching anchor in ``response``.

        Args:
            response: The downloaded page handed over by Scrapy.

        Yields:
            HealthItem: a new item for each (text, href) pair of an anchor.
        """
        sel = Selector(response)
        sites = sel.xpath('//div[@class="shentih"]')
        # NOTE: a stricter variant of this path pins the third div with
        # [@id="AList"]; the looser form below is the one in use.
        anchors = sites.xpath('./div/div/div/*/*/a[@class="fsize14"]')

        for anchor in anchors:
            titles = anchor.xpath('text()').extract()
            links = anchor.xpath('@href').extract()
            for title, link in zip(titles, links):
                # Build a fresh item for every result: re-using one shared
                # instance would let later assignments overwrite items that
                # were already yielded and are still held downstream.
                item = HealthItem()
                item['title'] = title.encode('utf-8')
                item['link'] = link.encode('utf-8')
                yield item