# Python scraping examples: extracting data with xpath, BeautifulSoup, and regular expressions

from lxml import etree

from bs4 import BeautifulSoup

import re

# Test HTML document used by every extraction demo below.
# NOTE(review): the original markup was destroyed when this file was scraped
# (all tags stripped); it is reconstructed here from the xpath / CSS / regex
# queries used in the functions below — verify against the original article.
html = """
<html>
<head>
    <title>xpath test</title>
</head>
<body>
<div price="99.8">
    <div>
        <ul>
            <li>时间</li>
            <li>地点</li>
            <li>任务</li>
        </ul>
    </div>
    <div id="testid">
        <h2>这里是个小标题</h2>
        <ol>
            <li>1</li>
            <li>2</li>
            <li>3</li>
        </ol>
        <ul>
            <li>84</li>
            <li>104</li>
            <li>223</li>
        </ul>
    </div>
    <div>
        <h3>这里是H3的内容</h3>
        <a href="https://www.baidu.com">百度一下</a>
    </div>
    <ul>
        <li>test1</li>
        <li>test2</li>
    </ul>
    <ul>
        <li>1</li>
        <li>2</li>
        <li>3</li>
        <li>4</li>
        <li>5</li>
        <li>6</li>
        <li>7</li>
        <li>8</li>
        <li>9</li>
        <li>10</li>
    </ul>
</div>
</body>
</html>
"""

def title():
    """Extract the <title> text three ways: xpath, BeautifulSoup, and regex."""
    # --- 1) xpath ---
    html_etree = etree.HTML(html)
    # etree.tostring(html_etree) would repair incomplete tags; it returns
    # bytes, so decode('utf-8') before printing.
    title_xpath1 = html_etree.xpath('/html/head/title/text()')  # text() pulls out the text node
    print('用xpath绝对路径方法提取title:', title_xpath1)  # xpath() returns a list
    title_xpath2 = html_etree.xpath('//head/title/text()')  # / = absolute path, // = relative path
    print('用xpath相对路径方法提取title:', title_xpath2)

    # --- 2) BeautifulSoup ---
    soup = BeautifulSoup(html, 'lxml')
    # CSS selectors: bare tag name, '.class', '#id'; selectors can be combined.
    title_soup = soup.select('title')  # select() returns a list; call get_text() on an element
    title_BeautifuleSoup = title_soup[0].get_text()
    # title_BeautifuleSoup = soup.title.get_text()  # equivalent shortcut
    print('用BeautifulSoup方法提取title:', title_BeautifuleSoup)

    # --- 3) regular expression ---
    # (.*?) is the captured group; re.S lets '.' also match newlines.
    re_pattern = re.compile(r'<title>(.*?)</title>', re.S)
    title_re_compile = re.findall(re_pattern, html)
    print('用正则表达式方法提取title:', title_re_compile)
    # re.compile can be skipped by passing the pattern string directly:
    title_re = re.findall(r'<title>(.*?)</title>', html)
    print('用正则表达式跳过re.compile提取title:', title_re)

def price():
    """Extract the outer div's price attribute three ways."""
    # --- 1) xpath --- (equivalent alternatives kept for reference)
    html_etree = etree.HTML(html)
    # html_etree.xpath('/html/body/div/@price')
    # html_etree.xpath('/html/body/child::*/@price')    # child::* = all child elements
    # html_etree.xpath('/html/body/child::div/@price')  # child::div = only div children
    # html_etree.xpath('//@price')                      # relative; price occurs only once
    # html_etree.xpath("//div[@id='testid']/ancestor::div")         # ancestor:: = all div ancestors
    # html_etree.xpath("//div[@id='testid']/ancestor::div/@price")  # their price attribute
    price_xpath = html_etree.xpath("//div[@id='testid']/ancestor-or-self::div/@price")  # ancestors plus self
    print('用xpath方法提取price:', price_xpath)

    # --- 2) BeautifulSoup ---
    soup = BeautifulSoup(html, 'lxml')
    price_BeautifulSoup = soup.div.attrs['price']
    # price_BeautifulSoup = soup.find('div').attrs['price']
    # price_BeautifulSoup = soup.select('div')[0].attrs['price']
    print('用BeautifulSoup方法提取price:', price_BeautifulSoup)

    # --- 3) regular expression ---
    re_pattern = re.compile(r'<div price="(.*?)">', re.S)
    price_re = re.findall(re_pattern, html)
    print('用正则表达式跳过re.compile提取price:', price_re)

# Extract the text of the li elements under the first div's ul

def ul_li():
    """Extract the li text under the first inner div's ul three ways."""
    # --- 1) xpath ---
    html_etree = etree.HTML(html)
    # html_etree.xpath('//div/div[1]/ul/child::*/text()')  # child:: axis variant
    # html_etree.xpath('//div/div[1]/ul/li/text()')
    # html_etree.xpath("//div[@id='testid']/preceding::div/ul/li/text()")  # preceding:: = all nodes before testid
    ul_li = html_etree.xpath("//div[@id='testid']/preceding::li/text()")  # preceding:: sidesteps duplicate nodes
    print('用xpath方法提取ul标签下的li的内容:', ul_li)

    # --- 2) BeautifulSoup ---
    soup = BeautifulSoup(html, 'lxml')
    # Variant: [i.get_text() for i in soup.select('ul')[0].select('li')]
    ul_li = soup.div.div.get_text()
    ul_li = ul_li.strip()      # drop leading/trailing whitespace
    ul_li = ul_li.split('\n')  # split on newlines into a list
    print('用BeautifulSoup方法提取ul_li:', ul_li)

    # --- 3) regular expression ---
    # Anchor on the outer div, then capture the first ul's three li texts.
    re_pattern = re.compile(
        r'<div price="99.8">.*?<ul>\s*'
        r'<li>(.*?)</li>\s*<li>(.*?)</li>\s*<li>(.*?)</li>', re.S)
    re_ul_li = re.findall(re_pattern, html)
    print('用正则表达式跳过re.compile提取ul_li:', re_ul_li)

def first_id():
    """Extract the id attribute of the second nested div three ways."""
    # --- 1) xpath ---
    html_etree = etree.HTML(html)
    first_id = html_etree.xpath('//div/div[2]/@id')
    print('用xpath方法提取first_id的内容:', first_id)

    # --- 2) BeautifulSoup ---
    soup = BeautifulSoup(html, 'lxml')
    # select('div') lists divs in document order: outer(0), first inner(1), testid(2).
    first_id = soup.select('div')[2].attrs['id']
    print('用BeautifulSoup方法提取first_id:', first_id)

    # --- 3) regular expression ---
    re_comppile = re.compile(r'<div id="(.*?)">', re.S)
    first_id = re.findall(re_comppile, html)
    print('用正则表达式跳过re.compile提取first_id:', first_id)

def h2():
    """Extract the <h2> text three ways."""
    # --- 1) xpath ---
    html_etree = etree.HTML(html)
    h2 = html_etree.xpath('//div/div[2]/h2/text()')
    print('用xpath方法提取h2的内容:', h2)

    # --- 2) BeautifulSoup ---
    soup = BeautifulSoup(html, 'lxml')
    # h2 = soup.select('h2')[0].get_text()
    h2 = soup.div.h2.get_text()  # first h2 descendant of the first div
    print('用BeautifulSoup方法提取h2:', h2)

    # --- 3) regular expression ---
    re_comppile = re.compile(r'<h2>(.*?)</h2>', re.S)
    h2 = re.findall(re_comppile, html)
    print('用正则表达式跳过re.compile提取h2:', h2)

def main():
    """Run every extraction demo in order."""
    title()
    price()
    ul_li()
    first_id()
    h2()

# Script entry point. The scraped page had stripped the dunder underscores,
# leaving the broken `if name == 'main':`.
if __name__ == '__main__':
    main()

# (CSDN page footer / payment-widget boilerplate removed — not part of the program.)