# Python scraping examples: extracting data with xpath, BeautifulSoup, and regular expressions

from lxml import etree

from bs4 import BeautifulSoup

import re

# Test HTML document used by every extraction demo below.
# NOTE(review): the original markup was destroyed when this file was scraped
# (all tags stripped); it is reconstructed here from the xpath / CSS / regex
# queries used in the functions below — verify against the original article.
html = """
<html>
<head>
    <title>xpath test</title>
</head>
<body>
<div price="99.8">
    <div>
        <ul>
            <li>时间</li>
            <li>地点</li>
            <li>任务</li>
        </ul>
    </div>
    <div id="testid">
        <h2>这里是个小标题</h2>
        <ol>
            <li>1</li>
            <li>2</li>
            <li>3</li>
        </ol>
        <ul>
            <li>84</li>
            <li>104</li>
            <li>223</li>
        </ul>
    </div>
    <div>
        <h3>这里是H3的内容</h3>
        <a href="https://www.baidu.com">百度一下</a>
    </div>
    <ul>
        <li>test1</li>
        <li>test2</li>
    </ul>
    <ul>
        <li>1</li>
        <li>2</li>
        <li>3</li>
        <li>4</li>
        <li>5</li>
        <li>6</li>
        <li>7</li>
        <li>8</li>
        <li>9</li>
        <li>10</li>
    </ul>
</div>
</body>
</html>
"""

def title():
    """Extract the <title> text three ways: xpath, BeautifulSoup, and regex."""
    # --- 1) xpath ---
    html_etree = etree.HTML(html)
    # etree.tostring(html_etree) would repair incomplete tags; it returns
    # bytes, so decode('utf-8') before printing.
    title_xpath1 = html_etree.xpath('/html/head/title/text()')  # text() pulls out the text node
    print('用xpath绝对路径方法提取title:', title_xpath1)  # xpath() returns a list
    title_xpath2 = html_etree.xpath('//head/title/text()')  # / = absolute path, // = relative path
    print('用xpath相对路径方法提取title:', title_xpath2)

    # --- 2) BeautifulSoup ---
    soup = BeautifulSoup(html, 'lxml')
    # CSS selectors: bare tag name, '.class', '#id'; selectors can be combined.
    title_soup = soup.select('title')  # select() returns a list; call get_text() on an element
    title_BeautifuleSoup = title_soup[0].get_text()
    # title_BeautifuleSoup = soup.title.get_text()  # equivalent shortcut
    print('用BeautifulSoup方法提取title:', title_BeautifuleSoup)

    # --- 3) regular expression ---
    # (.*?) is the captured group; re.S lets '.' also match newlines.
    re_pattern = re.compile(r'<title>(.*?)</title>', re.S)
    title_re_compile = re.findall(re_pattern, html)
    print('用正则表达式方法提取title:', title_re_compile)
    # re.compile can be skipped by passing the pattern string directly:
    title_re = re.findall(r'<title>(.*?)</title>', html)
    print('用正则表达式跳过re.compile提取title:', title_re)

def price():
    """Extract the outer div's price attribute three ways."""
    # --- 1) xpath --- (equivalent alternatives kept for reference)
    html_etree = etree.HTML(html)
    # html_etree.xpath('/html/body/div/@price')
    # html_etree.xpath('/html/body/child::*/@price')    # child::* = all child elements
    # html_etree.xpath('/html/body/child::div/@price')  # child::div = only div children
    # html_etree.xpath('//@price')                      # relative; price occurs only once
    # html_etree.xpath("//div[@id='testid']/ancestor::div")         # ancestor:: = all div ancestors
    # html_etree.xpath("//div[@id='testid']/ancestor::div/@price")  # their price attribute
    price_xpath = html_etree.xpath("//div[@id='testid']/ancestor-or-self::div/@price")  # ancestors plus self
    print('用xpath方法提取price:', price_xpath)

    # --- 2) BeautifulSoup ---
    soup = BeautifulSoup(html, 'lxml')
    price_BeautifulSoup = soup.div.attrs['price']
    # price_BeautifulSoup = soup.find('div').attrs['price']
    # price_BeautifulSoup = soup.select('div')[0].attrs['price']
    print('用BeautifulSoup方法提取price:', price_BeautifulSoup)

    # --- 3) regular expression ---
    re_pattern = re.compile(r'<div price="(.*?)">', re.S)
    price_re = re.findall(re_pattern, html)
    print('用正则表达式跳过re.compile提取price:', price_re)

# Extract the text of the li elements under the first div's ul

def ul_li():
    """Extract the li text under the first inner div's ul three ways."""
    # --- 1) xpath ---
    html_etree = etree.HTML(html)
    # html_etree.xpath('//div/div[1]/ul/child::*/text()')  # child:: axis variant
    # html_etree.xpath('//div/div[1]/ul/li/text()')
    # html_etree.xpath("//div[@id='testid']/preceding::div/ul/li/text()")  # preceding:: = all nodes before testid
    ul_li = html_etree.xpath("//div[@id='testid']/preceding::li/text()")  # preceding:: sidesteps duplicate nodes
    print('用xpath方法提取ul标签下的li的内容:', ul_li)

    # --- 2) BeautifulSoup ---
    soup = BeautifulSoup(html, 'lxml')
    # Variant: [i.get_text() for i in soup.select('ul')[0].select('li')]
    ul_li = soup.div.div.get_text()
    ul_li = ul_li.strip()      # drop leading/trailing whitespace
    ul_li = ul_li.split('\n')  # split on newlines into a list
    print('用BeautifulSoup方法提取ul_li:', ul_li)

    # --- 3) regular expression ---
    # Anchor on the outer div, then capture the first ul's three li texts.
    re_pattern = re.compile(
        r'<div price="99.8">.*?<ul>\s*'
        r'<li>(.*?)</li>\s*<li>(.*?)</li>\s*<li>(.*?)</li>', re.S)
    re_ul_li = re.findall(re_pattern, html)
    print('用正则表达式跳过re.compile提取ul_li:', re_ul_li)

def first_id():
    """Extract the id attribute of the second nested div three ways."""
    # --- 1) xpath ---
    html_etree = etree.HTML(html)
    first_id = html_etree.xpath('//div/div[2]/@id')
    print('用xpath方法提取first_id的内容:', first_id)

    # --- 2) BeautifulSoup ---
    soup = BeautifulSoup(html, 'lxml')
    # select('div') lists divs in document order: outer(0), first inner(1), testid(2).
    first_id = soup.select('div')[2].attrs['id']
    print('用BeautifulSoup方法提取first_id:', first_id)

    # --- 3) regular expression ---
    re_comppile = re.compile(r'<div id="(.*?)">', re.S)
    first_id = re.findall(re_comppile, html)
    print('用正则表达式跳过re.compile提取first_id:', first_id)

def h2():
    """Extract the <h2> text three ways."""
    # --- 1) xpath ---
    html_etree = etree.HTML(html)
    h2 = html_etree.xpath('//div/div[2]/h2/text()')
    print('用xpath方法提取h2的内容:', h2)

    # --- 2) BeautifulSoup ---
    soup = BeautifulSoup(html, 'lxml')
    # h2 = soup.select('h2')[0].get_text()
    h2 = soup.div.h2.get_text()  # first h2 descendant of the first div
    print('用BeautifulSoup方法提取h2:', h2)

    # --- 3) regular expression ---
    re_comppile = re.compile(r'<h2>(.*?)</h2>', re.S)
    h2 = re.findall(re_comppile, html)
    print('用正则表达式跳过re.compile提取h2:', h2)

def main():
    """Run every extraction demo in order."""
    title()
    price()
    ul_li()
    first_id()
    h2()

# Script entry point. The scraped page had stripped the dunder underscores,
# leaving the broken `if name == 'main':`.
if __name__ == '__main__':
    main()

# (CSDN page footer / payment-widget boilerplate removed — not part of the program.)