创建对象
# Creating a Selector object: once from a raw HTML string and once from an
# HtmlResponse wrapping the same markup.
from scrapy.http import HtmlResponse
from scrapy.selector import Selector

# Sample HTML document; the later sections of this file query it as well.
text = '''
<html>
<body>
<h1>Hello World</h1>
<h1>Hello Scrapy</h1>
<h1>Hello World</h1>
<ul>
<li>C++</li>
<li>Java</li>
<li>Python</li>
</ul>
</body>
</html>
'''

# Way 1: hand the markup to Selector directly through the `text` argument.
selector = Selector(text=text)
# Output recorded in the original notes:
# <Selector xpath=None data='<html>\n <body>\n <h1>Hello W...'>
print(selector)

# Way 2: build an HtmlResponse first, then construct the Selector from it.
# The response body is the very same markup as `text`.
body = text
response = HtmlResponse(
    url='http://example.com',
    body=body,
    encoding='utf-8',
)
selector = Selector(response=response)
# Output recorded in the original notes:
# <Selector xpath=None data='<html>\n <body>\n <h1>Hello W...'>
print(selector)
选中数据
# Selecting data
# xpath() returns a list-like collection of Selector objects, one per match.
selector_list = selector.xpath('//h1')
print(selector_list)
# [<Selector xpath='//h1' data='<h1>Hello World</h1>'>,
#  <Selector xpath='//h1' data='<h1>Hello Scrapy</h1>'>,
#  <Selector xpath='//h1' data='<h1>Hello World</h1>'>]

# Every element is itself a Selector, so it can be queried again.
for sel in selector_list:
    print(sel.xpath('./text()'))
# [<Selector xpath='./text()' data='Hello World'>]
# [<Selector xpath='./text()' data='Hello Scrapy'>]
# [<Selector xpath='./text()' data='Hello World'>]

# The collection also exposes xpath(); it queries each element and returns
# the combined matches.
print(selector_list.xpath('./text()'))
# [<Selector xpath='./text()' data='Hello World'>,
#  <Selector xpath='./text()' data='Hello Scrapy'>,
#  <Selector xpath='./text()' data='Hello World'>]

# When chaining, the nested query must be relative ('.//li'). An absolute
# '//li' ignores the <ul> it is chained onto and searches the whole document
# again; it only appeared to work here because the page has a single list.
print(selector.xpath('//ul').xpath('.//li').xpath('./text()'))
# css() queries can be chained the same way and mixed with xpath().
print(selector.css('ul').css('li').xpath('./text()'))
# [<Selector xpath='./text()' data='C++'>, <Selector xpath='./text()' data='Java'>, <Selector xpath='./text()' data='Python'>]
# [<Selector xpath='./text()' data='C++'>, <Selector xpath='./text()' data='Java'>, <Selector xpath='./text()' data='Python'>]
提取数据
# Extracting data
# extract(): pull the matched content out of the selectors as plain strings.
s = selector.xpath('//li').xpath('./text()')
print(s)
# [<Selector xpath='./text()' data='C++'>, <Selector xpath='./text()' data='Java'>, <Selector xpath='./text()' data='Python'>]
print(s.extract())     # ['C++', 'Java', 'Python'] -- one string per selector in the list
print(s[1].extract())  # Java (index 1; the book's example prints 'C++')
print(s.extract()[0])  # C++

# extract_first(): only the first match, here the first <h1> element.
s1 = selector.xpath('.//h1')
print(s1.extract_first())  # <h1>Hello World</h1>

# re(): apply a regular expression to the selected content when only part of
# it is wanted.
text1 ='''
<ul>
<li>Python学习手册 <b>价格: 99元’</b></li>
<li>Python核心编程 <b>价格: 88元’</b></li>
<li>Python基础教程 <b>价格: 77元’</b></li>
</ul>
'''
selector1 = Selector(text=text1)
s2 = selector1.xpath('//ul//li//b/text()')
print(s2)
# [<Selector xpath='//ul//li//b/text()' data='价格: 99元’'>, <Selector xpath='//ul//li//b/text()' data='价格: 88元’'>, <Selector xpath='//ul//li//b/text()' data='价格: 77元’'>]
print(s2.extract())
# ['价格: 99元’', '价格: 88元’', '价格: 77元’']

# The previous pattern '\d+\.\d+' required a decimal point, so it matched none
# of these integer prices and returned []. The fractional part is now optional
# and kept in a NON-capturing group so the whole number is returned; the raw
# string avoids the invalid '\d' escape in a normal string literal.
print(s2.re(r'\d+(?:\.\d+)?'))
# ['99', '88', '77']
Response内置Selector
# Response's built-in Selector
# Usually there is no need to construct a Selector by hand: the response
# object already provides one through its `selector` attribute.
response = HtmlResponse(url='http://example.com', body=body, encoding='utf-8')
selector = response.selector

# response.xpath() / response.css() are shortcuts for querying that built-in
# selector. Both require a query string -- calling them with no argument, as
# this example did before, raises TypeError.
print(response.xpath('//h1/text()').extract())  # ['Hello World', 'Hello Scrapy', 'Hello World']
print(response.css('li::text').extract())       # ['C++', 'Java', 'Python']