# 基本使用 (basic XPath usage with lxml)
from lxml import etree

# Parse the local sample document. etree.parse() uses lxml's XML parser by
# default, so the file must be well-formed for this call to succeed.
tree = etree.parse('../html/xpath_test.html')
print(tree)

# All <li> elements that are direct children of a <ul>.
li_list = tree.xpath('//ul/li')
print(li_list)
print(len(li_list))

# Text of every <li> that carries an id attribute (any value).
li_list = tree.xpath('//ul/li[@id]/text()')
print(li_list)
print(len(li_list))

# Text of the <li> whose id is exactly "l1".
li_list = tree.xpath('//ul/li[@id="l1"]/text()')
print(li_list)
print(len(li_list))

# Value of the class attribute of the <li> whose id is "l1".
li = tree.xpath('//ul/li[@id="l1"]/@class')
print(li)
print(len(li))

# Fuzzy match: id contains the substring "l".
li_list = tree.xpath('//ul/li[contains(@id,"l")]/text()')
print(li_list)
print(len(li_list))

# Fuzzy match: id starts with "l".
li_list = tree.xpath('//ul/li[starts-with(@id,"l")]/text()')
print(li_list)
print(len(li_list))

# Logical AND: both predicates must hold on the same element.
li_list = tree.xpath('//ul/li[@id="a3" and @class="a3"]/text()')
print(li_list)
print(len(li_list))

# Union of two node sets. The "|" operator belongs INSIDE the XPath string;
# applying Python's "|" to two str objects raises TypeError.
li_list = tree.xpath('//ul/li[@id="a3"]/text() | //ul/li[@id="l1"]/text()')
print(li_list)
print(len(li_list))
# 使用xpath获取百度一下 (use XPath to read the "百度一下" button text from the Baidu home page)
import urllib.request

from lxml import etree

url = "https://www.baidu.com/"
# Send a browser-like User-Agent header with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)

# The context manager closes the HTTP response (and its socket) even if
# reading or decoding fails.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

# etree.HTML() parses server-returned markup with lxml's HTML parser.
tree = etree.HTML(content)

# value attribute of the <input id="su"> element.
result = tree.xpath('//input[@id="su"]/@value')
if result:
    print(result[0])
else:
    # The element is missing from the returned page; report it instead of
    # failing with IndexError on result[0].
    print('未找到 id="su" 的 input 元素')
# 站长素材图片获取和下载 (fetch and download images from sc.chinaz.com)
import urllib.request
from lxml import etree
import urllib.error
# Base URL of the listing pages, without the file extension; page URLs are
# formed by appending an optional page suffix and then `last`.
url = 'https://sc.chinaz.com/tu/fengjing'

# File extension that terminates every listing-page URL.
last = '.html'

# HTTP headers sent with each listing-page request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/108.0.0.0 Safari/537.36'
    ),
}
def Schedule(a, b, c):
    """Print download progress as a percentage.

    Passed to ``urllib.request.urlretrieve`` as its report hook, which
    calls it once when the connection is established and once after each
    block is read.

    Args:
        a: Number of blocks transferred so far.
        b: Size of one block, in bytes.
        c: Total size of the file, in bytes. May be zero or negative when
            the size is not known.
    """
    if c <= 0:
        # Unknown or empty total: a percentage is meaningless, and dividing
        # would raise ZeroDivisionError or yield a negative value.
        return

    # The last block is usually only partly filled, so a * b can overshoot
    # the real size; cap the figure at 100.
    per = min(100.0 * a * b / c, 100.0)
    if per >= 100:
        # ">=" so that completion is also announced when the file size is an
        # exact multiple of the block size.
        print('完成!')
    print('%.2f%%' % per)
def create_request(p):
    """Build the HTTP request for listing page number ``p``.

    Args:
        p: Page number. Page 1 uses the bare base URL; every other value is
            appended as a ``-<p>-0-0`` suffix.

    Returns:
        A ``urllib.request.Request`` for the page URL, carrying the
        module-level ``headers``.
    """
    # Only pages other than the first carry a page-number suffix.
    suffix = '' if p == 1 else '-' + str(p) + '-0-0'
    current_url = url + suffix + last
    print(current_url)
    return urllib.request.Request(url=current_url, headers=headers)
def get_content(request2):
    """Fetch a page and return its body decoded as UTF-8.

    Args:
        request2: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response (and its socket) even when
    # reading or decoding raises, instead of leaving cleanup to the GC.
    with urllib.request.urlopen(request2) as response:
        return response.read().decode('utf-8')
def down_load(content1):
    """Download every image referenced in a listing page into ``./images``.

    Each ``<img>`` inside an ``<a>`` under ``<div id="ulcontent">`` is saved
    as ``<alt text>.jpg``, using its ``data-src`` attribute as the source.
    Images whose download fails with an HTTP error are reported and skipped.

    Args:
        content1: HTML source of one listing page.
    """
    import os  # local import so this function does not depend on file-level edits

    image_dir = './images'
    # urlretrieve() does not create missing directories.
    os.makedirs(image_dir, exist_ok=True)

    tree = etree.HTML(content1)
    # Read both attributes from the SAME <img> node. Two independent
    # attribute queries can yield lists of different length, which pairs
    # names with the wrong URLs or raises IndexError.
    for img in tree.xpath('//div[@id="ulcontent"]//a/img'):
        alt = img.get('alt')
        src = img.get('data-src')
        if alt is None or src is None:
            continue

        # The alt text comes from the remote page; strip path separators so
        # it cannot point outside image_dir or at a missing subdirectory.
        name = alt.replace('/', '_').replace('\\', '_') + '.jpg'

        # data-src is presumably protocol-relative ("//host/path"), hence
        # the added scheme; backslashes are normalised to forward slashes.
        url_1 = ('http:' + src).replace('\\', '/')
        print(url_1)
        try:
            urllib.request.urlretrieve(
                url_1, os.path.join(image_dir, name), Schedule
            )
        except urllib.error.HTTPError:
            print('图片路径有错误哦')
if __name__ == '__main__':
    # Ask for the inclusive page range; the start page is prompted first.
    start_page, end_page = (
        int(input(prompt)) for prompt in ('请输入起始页码', '请输入结束页码')
    )

    # Process the pages one at a time: build the request, fetch the HTML,
    # then download the images that page lists.
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        down_load(content)