学习目标:
掌握xpath/bs4/re三种解析方法
学习内容:
# xpath 数据解析
# 安装 lxml 模块
# from lxml import etree 只能解析树形结构文档
# html_data='''
#
#
#
# '''
# html = etree.HTML(html_data)
# //获取所有的子节点
# print(html.xpath('//book'))
# / 从根节点出发
# print(html.xpath('/book'))
# . 选取节点下的节点
# .. 获取上一个节点 父节点
# @ 选取属性
# 节点选择
# * 未知
from lxml import etree
import requests
# f = open('data.csv', 'w+', encoding='gbk')
# f.write(f'title,name,witch\n')
# Browser-like User-Agent header so the scraped sites serve the normal
# page instead of rejecting the default python-requests client string.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.55'
}
#
# for i in range(1,100,24):
#
# url = f'https://www.douguo.com/jingxuan/{i}'
#
# html = requests.get(url, headers=headers)
# # print(html.text)
# html = etree.HTML(html.text)
# # print(html.xpath('/html/head')) # 从根目录获取
# # print(html.xpath('//body/div')) # 相对路径
# # print(html.xpath('//@id'))
#
# # 谓语 获取单个数据 从1开始
# # print(html.xpath('//body/div[1]')) # 获取第一个
# # print(html.xpath('//body/div[last()]')) # 获取最后一个
# # print(html.xpath('//body/div[last()-1]')) # 获取倒数第二个
# # print(html.xpath('//body/div[position()<3]')) # 获取前两个
#
# # //title[@lang='eng']
# # print(html.xpath('//div[@class="imublo clearfix"][1]/a/@href'))
# # print(html.xpath('//div[@class="imublo clearfix"][1]/a/text()'))
#
# # //* 任何标签都会获取
# # print(html.xpath('//*[@class="imublo clearfix"][1]/a/text()'))
#
# # print(html.xpath('//div|//a'))
#
# data1 = (html.xpath('//*[@id="jxlist"]/li/div/a[1]/text()')) # 菜的标题
# data2 = (html.xpath('//*[@id="jxlist"]/li/div/a[2]/img/@alt')) # 作者
# data3 = (html.xpath('//*[@id="jxlist"]/li/div/div/span[1]/text()')) # 观看数
#
# datas = zip(data1, data2, data3)
# for title,name,witch in datas:
# print(title, name, witch)
# try:
# f.write(f'{title},{name},{witch}\n')
# except:
# pass
# f.close()
# Fetch the douguo "featured" page and cache the raw HTML to disk so the
# bs4/regex exercises below can parse it offline.
url = 'https://www.douguo.com/jingxuan/'  # plain string: the f-string had no placeholders
html = requests.get(url, headers=headers)
print(html.text)
# 'w' is sufficient for a write-only dump; the original 'w+' added
# unnecessary read access to the handle.
with open('../day06/text.html', 'w', encoding='utf-8') as f:
    f.write(html.text)
# bs4取块操作 查看获取数据的代码
# xpath re
# from bs4 import BeautifulSoup
#
# f = open('text.html', 'r', encoding='utf-8')
# data = f.read()
# f.close()
#
# data = BeautifulSoup(data, 'lxml')
# print(data.prettify()) # 格式化
# print(data.title)
# print(data.html.attrs) # 获取属性
# print(data.meta.attrs)
# print(data.meta['content']) # 获取具体的属性
# print(data.meta.get('content'))
# print(data.title.string)
# print(data.title.text)
# 遍历文档
# 获取所有的子节点 contents、children
# print(data.head.contents)
# print(data.head.children)
# # data.find(标签,'属性') # 提取最近一个满足要求的数据
# print(data.find_all('div', class_="imublo clearfix")) # 关键字实参可以在后面加_
# 返回的是一个列表
from bs4 import BeautifulSoup
import requests

# Download a Douban movie page, cache it locally, then parse the cached
# copy to extract the starring-actor <a> tags.
url = 'https://movie.douban.com/subject/35457272/?from=showing'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.55'
}
html = requests.get(url, headers=headers)
with open('豆瓣.html', 'w', encoding='utf-8') as f:
    f.write(html.text)

# Read the cached page back; 'with' guarantees the handle is closed even
# on error (the original used manual open/read/close).
with open('豆瓣.html', 'r', encoding='utf-8') as fo:
    page = fo.read()

soup = BeautifulSoup(page, 'lxml')
# rel="v:starring" is the attribute Douban puts on cast-member links.
attrs = soup.find_all('a', rel="v:starring")
print(attrs)
import re
# 'abc' # b在中间的位置 a 的后面 c的前面 第二个
# print(re.S) # 一个匹配模式
# \d \s \w
# 匹配任何十进制数
# findall(正则表达式, 需要查找的字符串, 模式)
# str_data = 'a1hd2sj3afj45ids'
# print(re.findall('\d', str_data)) # 匹配任何十进制数
# print(re.findall('\d*', str_data))
# print(re.findall('\d+', str_data))
# print(re.findall('\d?', str_data))
# print(re.findall('\D', str_data)) # 匹配非数字
# \s 匹配空白符
# str_data = 'a1\thd2 sj3 afj4ids'
# print(re.findall('\s', str_data))
# print(re.findall('\S', str_data)) # 匹配非空白符
# \w 匹配任何非特殊字符
# str_data = 'a1\thd2 sj3 afj4ids我_!@#¥……&&'
# print(re.findall('\w', str_data)) # _也属于
# print(re.findall('\W', str_data)) # 匹配任何特殊字符!@#¥%……&*
# 量词
# . 代表通配符,除了\n不能匹配外,其他全部能匹配,一个点代表一个字符
# str_data = 'a1b a2b a3b'
# print(re.findall('a.b', str_data))
# ^代表字符串开头进行匹配,只能放在最前面
# str_data = 'a1112345678'
# print(re.findall('^a111', str_data))
# $代表字符串结尾进行匹配,只能放在最后面
# str_data = 'a1112345678'
# print(re.findall('\d$', str_data))
# *代表0到0次以上 .*代表贪婪匹配 \d*
# str_data = 'ac abc aaac abbbbc'
# print(re.findall('a.*c', str_data))
# +代表1到1次以上 \d+
# str_data = 'ac abc aaac abbbbc'
# print(re.findall('a.+c', str_data))
# ?代表0个或1个 \d?
# str_data = 'ac abc aaac abbbbc'
# print(re.findall('a.?c', str_data))
# {}可以自行控制匹配多少个 {6}代表匹配6次,{1,6}代表匹配1-6次,{6,}代表匹配6个以上
# []代表字符集,或的作用
# str_data = 'abc aac adc'
# print(re.findall('a[ba]c', str_data))
# 从开头进行查找
# str_data = 'abcdef12345'
# data = re.match('abc', str_data)
# print(data.span())
# print(data.group())
# search 查找 找到符合数据开头直接返回,只找一次
# str_data = 'abcdef12345'
# data = re.search('abc', str_data)
# print(data.span())
# print(data.group())
# compile 编译正则表达式 search match
# data = re.compile('\d*')
# print(data.match('12345abcd'))
# str_data = '<meta charset="UTF-8">'
#
# print(re.findall('<meta charset="(.*?)">', str_data))
# str_data = '''
# <div class="imublo clearfix">
#
# <a href="/caipu/家常菜" target1="_blank">家常菜</a>
#
# <a href="/caipu/热菜" target1="_blank">热菜</a>
#
# <a href="/caipu/凉菜" target1="_blank">凉菜</a>
#
# <a href="/caipu/主食" target1="_blank">主食</a>
#
# <a href="/caipu/汤" target1="_blank">汤</a>
#
# <a href="/caipu/早餐" target="_blank">早餐</a>
#
# <a href="/caipu/午餐" target="_blank">午餐</a>
#
# <a href="/caipu/海鲜" target="_blank">海鲜</a>
#
# <a href="/caipu/孕妇" target="_blank">孕妇</a>
#
# <a href="/caipu/甜品" target="_blank">甜品</a>
#
# <a href="/caipu/粥" target="_blank">粥</a>
#
# <a href="/caipu/宝宝食谱" target="_blank">宝宝食谱</a>
#
# <a href="/caipu/糕点" target="_blank">糕点</a>
#
# <a href="/caipu/微波炉" target="_blank">微波炉</a>
#
# </div>
# '''
#
# print(re.findall('>(.*?)<', str_data))
#
# print(re.findall('target.*?="(.*?)"', str_data))
#
# print(re.findall('<a href="(.*?)" .*?="_blank">(.*?)</a>', str_data))
# re.I 忽略大小写
# re.A 忽略中文
# re.S 通常用于表达式使用多行的参照数据
# .不能匹配\n 但是可以通过re.S匹配
# re.M 匹配多行数据
# Validate the FORMAT of a Chinese resident ID number: 17 digits followed
# by a final digit or the check character X/x.
# NOTE(review): this only checks the shape; a full validator would also
# verify the mod-11 checksum of the last character.
id_ = input('请输入身份证号:')
# raw string avoids the invalid '\d' escape warning; fullmatch replaces
# findall with ^...$ anchors for a clearer whole-string match.
is_id = re.fullmatch(r'\d{17}[0-9Xx]', id_)
# fullmatch returns a Match (truthy) or None, so bool() prints True/False
# exactly as the original if/else did.
print(bool(is_id))
学习产出:
- 技术笔记 1 篇