数据解析:
- Xpath语法和lxml库
- BeautifulSoup4库
- 正则表达式和re模块
Xpath语法和lxml库
什么是Xpath
xpath(XML Path Language)是一门在XML和HTML文档中查找信息的语言,可用来在XML和HTML文档中对元素和属性进行遍历
xpath语法:
选取多个路径:
通过在路径表达式中使用“|”运算符,可以选取若干个路径。示例如下:
//bookstore/book | //book/title
选取所有book元素以及book元素下所有的title元素
lxml库
lxml库是一个HTML/XML的解析器,主要功能是如何解析和提取HTML/XML数据,我们可以利用lxml来解析HTML代码,并且在解析HTML代码的时候,如果代码不规范,它会自动的进行补全。
from lxml import etree

# Sample HTML fragment. Note the last <li> is deliberately left unclosed
# to show that lxml repairs malformed markup while parsing.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''

# Parse the string into an HTML document tree (missing tags are auto-completed).
document = etree.HTML(text)
# Serialize the tree back to a UTF-8 string to inspect the repaired markup.
serialized = etree.tostring(document).decode('utf-8')
print(serialized)
以下代码运行时,如果读取的html文件中有不规范的标签则会报错:lxml.etree.XMLSyntaxError
from lxml import etree

# Read HTML straight from a file. etree.parse() defaults to the strict
# XML parser, so malformed markup raises lxml.etree.XMLSyntaxError.
tree = etree.parse('hello.html')
dumped = etree.tostring(tree).decode('utf-8')
print(dumped)
解决方法
from lxml import etree

# etree.parse() uses a strict XML parser by default, which rejects
# non-conforming HTML. Passing an explicit HTMLParser via the parser
# argument makes parsing tolerant of unclosed or invalid tags.
html_parser = etree.HTMLParser()  # lenient HTML parser
tree = etree.parse('hello.html', parser=html_parser)
dumped = etree.tostring(tree).decode('utf-8')
print(dumped)
在lxml中使用xpath语法练习
from lxml import etree

doc = etree.parse('hello.html')

# Exercise 1: select every <li> element.
# items = doc.xpath('//li')
# for item in items:
#     print(etree.tostring(item))

# Exercise 2: all class attribute values on <li> tags.
# print(doc.xpath('//li/@class'))

# Exercise 3: <a> tags under <li> whose href equals "www.baidu.com".
# for link in doc.xpath('//li/a[@href="www.baidu.com"]'):
#     print(etree.tostring(link))

# Exercise 4: every <span> directly under a <li>.
# print(doc.xpath('//li/span'))

# Exercise 5: all class attributes of <a> tags under <li>.
# print(doc.xpath('//li/a//@class'))

# Exercise 6: href of the <a> inside the last <li>.
# print(doc.xpath('//li[last()]/a/@href'))

# Exercise 7: text of the next-to-last <li>, via the element's .text attribute.
# print(doc.xpath('//li[last()-1]/a')[0].text)

# Exercise 7, alternative: same text fetched with the text() node test.
texts = doc.xpath('//li[last()-1]/a/text()')
print(texts)
hello.html
<html lang="en">
<body>
<div>
<ul>
<li class="item-0"><a href="www.baidu.com">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="www.baidu.com">fifth item</a></li>
</ul>
</div>
</body>
</html>
实例:爬取瓜子二手车
import requests
from lxml import etree
# Request headers sent with every HTTP call: a real browser User-Agent plus
# a session Cookie so guazi.com serves the normal page instead of its
# anti-scraping challenge. NOTE(review): the Cookie is session-bound and
# presumably expires — refresh it from a live browser session when requests
# start returning challenge pages.
header = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
    'Cookie': 'track_id=42717333969293312; uuid=2c65219b-f88b-43b0-d69e-ae85fd5cc754; antipas=8079840989247V20L5644x4; cityDomain=www; clueSourceCode=%2A%2300; user_city_id=-1; preTime=%7B%22last%22%3A1581339806%2C%22this%22%3A1581339806%2C%22pre%22%3A1581339806%7D; ganji_uuid=8897614559278256260931; sessionid=bceeae28-657b-4bca-befc-798913b24869; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22pcbiaoti%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%2242717333969293312%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%222c65219b-f88b-43b0-d69e-ae85fd5cc754%22%2C%22ca_city%22%3A%22hn%22%2C%22sessionid%22%3A%22bceeae28-657b-4bca-befc-798913b24869%22%7D; gr_user_id=830ba263-072e-434d-a077-a3f023e090b4; gr_session_id_bf5e6f1c1bf9a992=e7c677fd-683f-4152-bb70-8fec3eb9460b; gr_session_id_bf5e6f1c1bf9a992_e7c677fd-683f-4152-bb70-8fec3eb9460b=true; close_finance_popup=2020-02-10'
}
def get_detail_urls(url):
    """Fetch a guazi.com listing page and return the detail-page URLs on it.

    url: absolute URL of a listing page. Returns a list of absolute
    detail-page URLs, one per car entry in the result list.
    """
    resp = requests.get(url, headers=header)
    page = etree.HTML(resp.content.decode('utf-8'))
    # href of every car card inside the result <ul>.
    hrefs = page.xpath('//ul[@class="carlist clearfix"]/li/a/@href')
    # hrefs are site-relative, so prefix the scheme and host.
    return ['https://www.guazi.com' + href for href in hrefs]
def parse_detail_page(detail_url):
    """Fetch one car's detail page and extract its key fields.

    detail_url: absolute URL of a car detail page.
    Returns a dict with keys 名称, 上牌时间, 公里数, 排量, 变速箱,
    原价 and 金融专享价.
    Raises IndexError if the page layout changes and an expected
    element is missing.
    """
    resp = requests.get(detail_url, headers=header)
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//div[@class="product-textbox"]/h2/text()')[0]
    # BUG FIX: the original used the raw string r'\r\n', which removes the
    # literal backslash-r-backslash-n sequence rather than actual CR/LF
    # characters, so embedded newlines in the title survived. Use the real
    # escape so CRLF pairs are stripped, then trim surrounding whitespace.
    title = title.replace('\r\n', '').strip()
    info = html.xpath('//div[@class="product-textbox"]/ul/li/span/text()')
    price = html.xpath('//div[@class="pricebox js-disprice"]/span/text()')
    infos = {}
    infos['名称'] = title
    infos['上牌时间'] = info[0]    # registration date
    infos['公里数'] = info[1]      # mileage
    infos['排量'] = info[-2]       # engine displacement
    infos['变速箱'] = info[-1]     # gearbox type
    infos['原价'] = price[1]       # original list price
    infos['金融专享价'] = price[0] + '万'  # financing price, unit 万 (10k yuan)
    return infos
# Persist one record.
def save_data(infos, f):
    """Append a single car record to the open text file handle *f*.

    infos: dict produced by parse_detail_page; f: writable text stream.
    """
    fields = (infos['名称'], infos['上牌时间'], infos['公里数'],
              infos['排量'], infos['变速箱'], infos['原价'],
              infos['金融专享价'])
    f.write('名称:%s,上牌时间:%s,公里数:%s,排量:%s,变速箱:%s,%s,金融专享价:%s\n' % fields)
def main():
    """Crawl the guazi.com front listing page and append every car's
    details to guazi_cs.txt."""
    listing_url = 'https://www.guazi.com/www/'
    # Open in append mode so repeated runs accumulate records.
    with open('guazi_cs.txt', 'a', encoding='utf-8') as out:
        # Collect the detail-page URLs, then parse and save each one.
        for detail_url in get_detail_urls(listing_url):
            save_data(parse_detail_page(detail_url), out)


if __name__ == '__main__':
    main()