数据解析
数据解析分类: 1.正则 2.bs4 3.xpath
数据解析原理概述:
- 解析的局部的文本内容都会在标签之间或者标签对应的属性中进行存储
- 进行指定标签的定位
- 对标签或者标签对应的属性中存储的数据值进行提取(解析)
实际操作
需求:请求url保存图片至本地
import requests

# Goal: download one image from the given URL and save it locally.
url = 'https://pics5.baidu.com/feed/b3b7d0a20cf431ad67f62bea911223aa2fdd98e3.jpeg'
response = requests.get(url=url)
# Fail loudly on HTTP errors instead of silently saving an error page as a .jpeg.
response.raise_for_status()
# .content holds the raw binary image data (as opposed to .text, which decodes).
with open("./zz.jpeg", 'wb') as fp:
    fp.write(response.content)
需求:爬取指定url的图片,这里要将Cookie改为自己的,不然爬取不了
import requests
import re
import os

# Goal: scrape every article image from the target page.
# NOTE: the Cookie below must be replaced with your own or the site refuses the request.

# Create a folder to hold all downloaded images.
if not os.path.exists('./image'):
    os.mkdir('./image')

# Request headers (User-Agent + Cookie are required by the site).
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'
,'Cookie':'BAIDUID=15792F369CCB57C7B1768AB47021693D:FG=1; BIDUPSID=C5F74397E4C3533B468A6D7F70753C45; PSTM=1709637600; H_PS_PSSID=40212_40080_40365_40352_40375_40366_40401_40464_40459_40317_39661_40510; H_WISE_SIDS=40212_40080_40365_40352_40375_40366_40401_40464_40459_40317_39661_40510; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ab_sr=1.0.1_YTc4OTcyYWQ4NzdkMjkxMjAyZjc1NzM1YmY3MDdlOWM0YjM3ZTUzOGVlNWVjMjIzZjEwN2Q5NGExOWM5YjAxMDA1ZWFmYmJiNGNkYzQ4YTI4OTFiNjU0Y2E4OWE3ZDBlODFmZjg4M2JjZjFhOTA5YjAwNTFmNGVhMmM3NWViMWViZWEzZGIzNzVhNDEyZWIwOTAzMGYwYTg4NjRjOTlhMg==; BA_HECTOR=20a1858l808485242l252424f05h9o1ivqpa91s; ZFY=intePhgjae:BoGLsKPKIab2U4XLe4NjsR:B15bTm:BBmQE:C; RT="z=1&dm=baidu.com&si=d57b3fea-11cb-4cc7-9cb6-664a568860fe&ss=lu2mdpll&sl=1&tt=1ec&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=26p&ul=2pyf&hd=2pz6'
,'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'
,'Referer':'https://www.baidu.com/link?url=glpn33iXU_5YSodvA3laXaAJ9gntlMpxcDahjF8rt-4UdB3RPqqEbDSVm4Y4uEuXcKazstR2DjF_mH5ILBv39sBvPwHzv2yVrUGRNXyAkqa&wd=&eqid=e4289b1d0029ac530000000665fd39e3'
,'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
}
# Page to scrape.
url = 'https://baijiahao.baidu.com/s?id=1646606332727653670&wfr=spider&for=pc'
# Fetch the whole page, then strip spaces/newlines so the regex matches across lines.
url_text = requests.get(url=url,headers=headers).text.replace(' ','').replace('\n','')
# Regex capturing each embedded image URL (raw string: '\?' is not a valid
# string escape and would raise a SyntaxWarning on modern Python otherwise).
ex = r'{"type":"img","link":"(.*?)@f_auto\?token=.*?"}'
img_url = re.findall(ex,url_text)
print(img_url)
# De-duplicate while preserving first-seen order.
url_list = list(dict.fromkeys(img_url))
# The last 10 matches are page chrome, not article images — drop them.
del url_list[-10:]
for item in url_list:
    # Use the final path segment as the file name.
    img_name = item.split('/')[-1]
    img_path = './image/' + img_name
    img_data = requests.get(url=item,headers=headers).content
    with open(img_path,'wb') as fp:
        fp.write(img_data)
    print(img_name + ' 下载成功!')
bs4数据解析的原理:
1.实例化一个BeautifulSoup对象,并且将页面源码数据加载到该对象中
2.通过调用BeautifulSoup对象中相关的属性或者方法进行标签定位和数据提取
bs4基本使用
from bs4 import BeautifulSoup
import lxml

# bs4 basics: instantiate a BeautifulSoup object from page source, then use its
# attributes/methods to locate tags and extract data.
# For a page fetched from the web, pass response.text the same way:
#     soup = BeautifulSoup(page_text, 'lxml')
# Load a local HTML document; 'with' guarantees the handle is closed
# (the original left the file open for the rest of the script).
with open('./test.html','r',encoding='utf-8') as fp:
    soup = BeautifulSoup(fp,'lxml')
# soup.tagName returns the FIRST occurrence of that tag in the document.
print(soup.a)
# find('tagName') is equivalent to soup.tagName.
print(soup.find('a'))
# findAll returns every matching tag.
print(soup.findAll('a'))
# select(...) takes a CSS selector (id, class, tag, ...) and returns a list.
print(soup.select('.more'))
# '>' selects a direct child; ' ' (space) spans any number of levels.
print(soup.select('.s1 > dd > ul > li > a')[0])
print(soup.select('.s1 > dd a')[0])
# .text / get_text() return ALL text inside a tag;
# .string returns only the tag's direct text content.
print(soup.select('.s1 > dd a')[0].text)
# Tag attributes are read with subscript notation.
print(soup.select('.s1 > dd a')[0]['href'])
需求:爬取指定图书网站三国演义的所有章节标题与内容
from bs4 import BeautifulSoup
import lxml
import requests

# Goal: scrape every chapter title and body of "Romance of the Three Kingdoms"
# from the target book site and write them to one text file.
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'
}
# Use .content (bytes) so BeautifulSoup sniffs the page encoding itself.
page_text = requests.get(url=url,headers=headers).content
# Instantiate a BeautifulSoup object from the page source.
soup = BeautifulSoup(page_text,'lxml')
# Each <li> in the table of contents holds one chapter link.
li_list = soup.select('.book-mulu > ul > li ')
title_list = []
# 'with' guarantees the output file is flushed and closed
# (the original opened it and never called close()).
with open('./sanguo.txt','w',encoding='utf-8') as fp:
    for li in li_list:
        # Chapter links are relative; prefix the site root.
        url_s = 'https://www.shicimingju.com' + li.a['href']
        detail_page_text = requests.get(url=url_s,headers=headers).content
        detail_soup = BeautifulSoup(detail_page_text,'lxml')
        content = detail_soup.select('.chapter_content ')[0].get_text()
        title = li.a.string
        title_list.append(title)
        fp.write(title + ':' + content + '\n')
        print(title + '爬取成功!')
print(title_list)
需求:爬取58同城二手房的标题和价格
# Goal: scrape title and total price of second-hand housing listings on 58.com.
from lxml import etree
import requests
url = 'https://bj.58.com/ershoufang/'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'
    ,
}
page_text = requests.get(url=url,headers=headers).text
# Parse the page with lxml's HTML parser.
tree = etree.HTML(page_text)
title_list = tree.xpath('//div[@class="property-content-title"]//h3[@class="property-content-title-name"]/text()')
price_list = tree.xpath('//div[@class="property-price"]//span[@class="property-price-total-num"]/text()')
print(title_list)
print(price_list)
# 'with' guarantees the file is closed (the original never closed it), and
# zip() stops at the shorter list so a missing price cannot raise IndexError.
with open('./ershoufang.txt','w',encoding='utf-8') as fp:
    for title, price in zip(title_list, price_list):
        fp.write(title + '- price: '+ price + '\n')
需求:爬取指定网站动漫人物所有的图片,包括1到136页
import requests
from lxml import etree
import os

# Goal: download every anime wallpaper thumbnail from the target site, pages 1-136.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'
    ,
}

# Create the output folder once, up front (the original re-checked it per page).
if not os.path.exists('./img'):
    os.mkdir('./img')


def _download_page(page_url):
    """Fetch one listing page and save every thumbnail image it links to."""
    response = requests.get(url=page_url, headers=headers)
    # The site serves GBK-encoded HTML; set it explicitly so .text decodes correctly.
    response.encoding = 'gbk'
    tree = etree.HTML(response.text)
    src_list = tree.xpath('//div[@class="slist"]/ul[@class="clearfix"]//img/@src')
    for src in src_list:
        # Image paths are relative; prefix the site root.
        img_url = 'https://pic.netbian.com' + src
        img_name = './img/' + src.split('/')[-1]
        img_data = requests.get(url=img_url, headers=headers).content
        with open(img_name, 'wb') as fp:
            fp.write(img_data)
        print(img_name + " download success\n")


# Page 1 uses index.html; pages 2-136 use index_<n>.html. One helper handles
# both (the original duplicated the whole download loop for page 1).
_download_page('https://pic.netbian.com/4kdongman/index.html')
for page in range(2, 137):
    _download_page('https://pic.netbian.com/4kdongman/index_' + str(page) + '.html')
需求:爬取全国城市的全部名称
import requests
from lxml import etree

# Goal: scrape the names of all cities listed on the air-quality history site.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'
    ,
}
url = 'https://www.aqistudy.cn/historydata/'
response = requests.get(url=url,headers=headers)
page_text = response.text
tree = etree.HTML(page_text)
# Both the hot-city and all-city <ul> blocks share class "unstyled",
# so the list contains duplicates.
city_list = tree.xpath('//ul[@class="unstyled"]//a/text()')
# De-duplicate while preserving first-seen order.
unique_city_list = list(dict.fromkeys(city_list))
print(city_list)
print(len(unique_city_list))
# 'with' guarantees the file is flushed and closed even on error.
with open('./city.txt','w',encoding='utf-8') as fp:
    for city in unique_city_list:
        fp.write(city + '\n')