Selector选择器
Selector 基于 lxml 库构建。
1.使用
直接导入模块,然后实例化即可使用。Selector 支持 css、re、xpath 等解析方式。
from scrapy import Selector

# Minimal, well-formed HTML document used to demonstrate Selector queries
# (<head> is closed before <body> opens).
content = "<html><head><title>My html</title></head><body><h3>Hello world!</h3></body></html>"
selector = Selector(text=content)

# Evaluate the XPath once and reuse the resulting SelectorList.
title_text = selector.xpath("//title/text()")
print(title_text.get())     # first match as a string
print(title_text.getall())  # every match, as a list of strings
2.用Xpath爬取淘宝特色市场商品分类
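提示:若爬虫运行时出现 "Forbidden by robots.txt",是因为 Scrapy 默认遵守 robots.txt(ROBOTSTXT_OBEY = True),可在 settings.py 中调整该设置。参考:《爬虫出现Forbidden by robots.txt》(菜鸡瞎讲,CSDN博客)。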
class TaobaoSpider(scrapy.Spider):
    """Print the category listing of Taobao's special-markets page using XPath."""

    name = 'taobao'
    # allowed_domains expects bare domain names; an entry that includes a URL
    # path never matches a request's host, so only the domain is listed here.
    allowed_domains = ['huodong.taobao.com']
    start_urls = ['http://huodong.taobao.com/wow/tbhome/act/special-markets/']

    def parse(self, response):
        """Print each market group's heading followed by its links.

        Args:
            response: The response downloaded for one of ``start_urls``.
        """
        # Every <dl class="market-list"> is treated as one group of links.
        groups = response.xpath("//dl[@class='market-list']")
        for group in groups:
            # Group heading, then a visual separator line.
            print(group.xpath("./dt/text()").get())
            print("*" * 50)
            # One output line per link, in the form "<href>:<title>".
            for link in group.xpath(".//a"):
                print(link.xpath("./@href").get(), end=":")
                print(link.xpath("./span[@class='market-list-title']/text()").get())
3.用css爬取
class TaobaoSpider(scrapy.Spider):
    """Print the category listing of Taobao's special-markets page using CSS selectors."""

    name = 'taobao'
    # allowed_domains expects bare domain names; an entry that includes a URL
    # path never matches a request's host, so only the domain is listed here.
    allowed_domains = ['huodong.taobao.com']
    start_urls = ['http://huodong.taobao.com/wow/tbhome/act/special-markets/']

    def parse(self, response):
        """Print each market group's heading followed by its links.

        Args:
            response: The response downloaded for one of ``start_urls``.
        """
        # The class is "market-list"; the truncated "market-lis" matched no
        # element, because CSS class selectors compare whole class tokens.
        groups = response.css("dl.market-list")
        for group in groups:
            # Group heading, then a visual separator line.
            print(group.css("dt::text").get())
            print("*" * 50)
            # One output line per link, in the form "<href>:<title>".
            for link in group.css("a"):
                print(link.css("::attr(href)").get(), end=":")
                print(link.css("span.market-list-title::text").get())