4.1 Study Notes
一、Examples
1. Eastmoney (东方财富) stock list
import requests
from re import search
import json
import csv


def an_data(data: str):
    """Parse the JSONP response and save the stock list as CSV."""
    # The response is wrapped in a callback: jQuery...({...});
    # a greedy (.+) reaches the last ')', so parentheses inside the
    # JSON itself (e.g. in stock names) can't cut the match short.
    result = search(r'.+?\((.+)\)', data).group(1)
    re_data = json.loads(result)
    all_stock = re_data['data']['diff']
    with open('files/股票2.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, list(all_stock[0].keys()))
        writer.writeheader()
        writer.writerows(all_stock)


def get_data():
    """Fetch the data."""
    # Page: http://quote.eastmoney.com/center/gridlist.html#hs_a_board
    # The table itself comes from this JSONP API (captured in the network panel):
    url = 'http://57.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112406995981144857424_1617242980372&pn=3&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1617242980373'
    response = requests.get(url)
    if response.status_code == 200:
        an_data(response.text)
    else:
        print('Scrape failed!')


get_data()
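The pn and pz query parameters in the captured URL look like page number and page size (pn=3&pz=20). A minimal paging sketch under that assumption; the parameter meanings are inferred from the query string, not from any documented API:

import requests
import json
from re import search

# Assumes pn = page number and pz = page size; inferred, not documented.
BASE = ('http://57.push2.eastmoney.com/api/qt/clist/get?cb=cb&pn={page}&pz=20'
        '&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3'
        '&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f2,f3,f12,f14')

all_stock = []
for page in range(1, 4):  # first three pages
    resp = requests.get(BASE.format(page=page))
    if resp.status_code != 200:
        continue
    body = search(r'\((.+)\)', resp.text).group(1)  # strip the cb(...) wrapper
    all_stock.extend(json.loads(body)['data']['diff'])
print(len(all_stock))  # 20 rows per page if every request succeeded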
2. Baidu Images (lazy-loaded images)
import requests
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup


def download_image(url: str):
    """Download one image."""
    response = requests.get(url)
    if response.status_code == 200:
        with open(f'images/{url.split("/")[-1]}', 'wb') as f:
            f.write(response.content)
        print('Download succeeded!')
    else:
        print('Download failed!')


def an_data(data: str):
    """Parse the page and pull out the image addresses."""
    soup = BeautifulSoup(data, 'lxml')
    result = soup.select('.clearfix>li')
    for li in result:
        img = li.select('.main_img')[0]
        # the real image address sits in the data-imgurl attribute
        img_url = img.attrs['data-imgurl']
        download_image(img_url)


def get_data():
    """Open the search page in the browser and hand the HTML to the parser."""
    b = Chrome()
    b.get('https://image.baidu.com/')
    # Selenium 4 replaced this with b.find_element(By.CSS_SELECTOR, '#kw')
    search_input = b.find_element_by_css_selector('#kw')
    search_input.send_keys('路飞')
    search_input.send_keys(Keys.ENTER)
    time.sleep(1)  # give the results a moment to render
    an_data(b.page_source)
    b.close()


get_data()
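Only the thumbnails already rendered show up in page_source; Baidu loads more as you scroll. A sketch that scrolls before parsing, assuming b is the Chrome instance from get_data above (the scroll count and delay are guesses to tune):

# Scroll a few times so the lazy loader fetches more thumbnails,
# then read page_source once the visible results have rendered.
for _ in range(5):
    b.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(1)
an_data(b.page_source)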
二、Using pyquery
from pyquery import PyQuery

# pyquery selects tags in a page with CSS selectors.

# 1. Load the data (must be an HTML page)
with open('files/data.html', encoding='utf-8') as f:
    content = f.read()

# 2. Create the PyQuery object
html = PyQuery(content)

# 3. Get tags
# Search the whole page with a CSS selector:
# PyQuery_object(css_selector) - get the matching tags
p = html('div>p')
print(p)
lis = html('li')
print(lis)
f1 = html('#f1')
print(f1)
ps = html('p')
print(ps)

# Search inside a previously selected tag:
div1 = html('#div1')
p = div1('p')
print(p, type(p))
divs = html('.c1')
print(divs)
ps = divs('p')
print(ps)

# 4. Get a tag's content and attributes
# PyQuery_object.text() - text content of a paired tag
# PyQuery_object.val() - the tag's value attribute
# PyQuery_object.attr(name) - the named attribute of the tag
result = html('#p1').text()
print(result)  # 我是段落2
print('=================')

# Text of all p tags at once (joined into one string)
result = html('p').text()
print(result, type(result))

# Text of each p tag separately; iterating a PyQuery object yields raw
# lxml elements, so each one is wrapped back into a PyQuery object
ps = html('p')
for x in ps:
    print('x:', PyQuery(x).text())

result = html('input').val()
print(result)

all_a = html('a')
for a in all_a:
    print(PyQuery(a).attr('href'))
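Re-wrapping each element works, but pyquery also provides .items(), which yields each match already wrapped as a PyQuery object. A small sketch against the same html object:

# .items() yields PyQuery objects directly, so no manual re-wrapping
for a in html('a').items():
    print(a.attr('href'), '->', a.text())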
Example: ZOL (中关村在线)
import requests
from pyquery import PyQuery
import csv


def get_detail_page(url):
    """Fetch one article page and return its body text."""
    response = requests.get(url)
    if response.status_code == 200:
        html = PyQuery(response.text)
        ps = html('.article-cont>p')
        # .text() joins the paragraphs with spaces; put each on its own line
        return ps.text().replace(' ', '\n')
    else:
        print('Failed to fetch the detail page!')


def an_data(data: str):
    """Parse the home page into (title, link, article text) rows."""
    html = PyQuery(data)
    new_a = html('.news-list>li>a')
    news = []
    for x in new_a:
        a = PyQuery(x)
        href = a.attr('href')
        # some links are protocol-relative (//...); prepend http: if needed
        new_href = href if href[:4] == 'http' else f'http:{href}'
        result = get_detail_page(new_href)
        news.append((a.text(), new_href, result))
    with open('files/新闻.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(news)


def get_data():
    url = 'https://www.zol.com.cn/'
    response = requests.get(url)
    if response.status_code == 200:
        an_data(response.text)
    else:
        print('Request failed')


get_data()
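requests identifies itself with a python-requests User-Agent by default, which some sites refuse; sending a browser-like header usually helps. A sketch; the exact header value is illustrative, not something this site is known to require:

import requests

# A browser-like User-Agent; the exact string is illustrative
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
response = requests.get('https://www.zol.com.cn/', headers=HEADERS)
print(response.status_code)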
三、XPath parsing
from lxml import etree

# XPath targets HTML and XML files. The idea: you give the parser the
# path of the tag you want within the page, and it returns the matching tags.
# XML is another general-purpose data format, e.g.:
"""
<supermarket>
    <name>永辉超市</name>
    <goodsList>
        <goods price="100">衣服</goods>
        <goods></goods>
        <goods></goods>
        <goods></goods>
    </goodsList>
</supermarket>
"""

# 0. Load the page data
with open('files/data.html', encoding='utf-8') as f:
    content = f.read()

# 1. Create the parser object
# root = etree.HTML(html_text)
html = etree.HTML(content)  # <Element html at 0x1045b5e80>

# 2. Get nodes (tags)
# node.xpath(path)

# 1) tag name - find matching child nodes of the current node (relative path)
# get the child of the html node named body
result1 = html.xpath('body')
print(result1)  # [<Element body at 0x105501340>]
result2 = html.xpath('body/div')
print(result2)  # [<Element div at 0x10c8a23c0>, <Element div at 0x10c8a2440>, <Element div at 0x10c8a2480>]
result3 = html.xpath('body/div/img')
print(result3)  # [<Element img at 0x10e3115c0>]
div = result2[0]
print(div.xpath('a'))

# 2) . - a relative path starting from the current node
result4 = html.xpath('./body/div/font')
print(result4)
result5 = div.xpath('./img')
print(result5)

# 3) / - absolute path (written from the root; the result is the same
#    no matter which node .xpath is called on)
result6 = html.xpath('/html/body/div/img')
print(result6)  # [<Element img at 0x10c78f1c0>]
result7 = div.xpath('/html/body/div/img')
print(result7)  # [<Element img at 0x10c78f1c0>]

# 4) // - start from anywhere
# //img - every img node in the page
# //div/img - every img node that is the child of a div
result8 = html.xpath('//p')
print(result8)
result9 = html.xpath('//div/div/p')
print(result9)
result10 = html.xpath('//div/p')
print(result10)

# 5) .. - the parent of the current node
result11 = div.xpath('../ol')
print(result11)

# 6) @ - get attribute values
# //img/@src - the src attribute of every img tag in the page
img = html.xpath('//img/@src')
print(img)

# 7) text() - get the tag's text content
lis = html.xpath('//li/text()')
print(lis)
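Every call above returns a plain Python list: Element objects for tag paths, strings for @attribute and text() paths. An Element also exposes .get(name) and .text directly; a sketch pairing each link's target with its text, against the same html object:

# .get() and .text are the lxml Element API, available on any node
# returned by .xpath()
for a in html.xpath('//a'):
    print(a.get('href'), '->', a.text)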
四、XPath predicates
from lxml import etree

# An XPath predicate is a filter condition; it must be written inside [].

# 0. Load the page data
with open('files/data.html', encoding='utf-8') as f:
    content = f.read()
html = etree.HTML(content)

# 1. Position
# [N] - take the Nth match (Nth among its siblings)
result = html.xpath('//div[1]')
print(result)

# 2. Attributes
# [@attr] - keep tags that have the attribute set
# //p[@id] - p tags with an id
result = html.xpath('//p[@id]/text()')
print(result)  # ['我是段落2', '我是段落3']

# [@attr=value] - keep tags whose attribute equals the value
# //p[@id="p1"] - the p tag whose id is p1
result = html.xpath('//p[@id="p1"]/text()')
print(result)  # ['我是段落2']

# Child content - filter a parent by the content of a child
# //div[p="我是段落5"] - divs containing a p whose content is 我是段落5
result = html.xpath('//div[p="我是段落5"]')
print(result)
result = html.xpath('//div[span>90]/p/text()')
print(result)

# 3. Wildcard
# * matches anything
result = html.xpath('//div[@id="div1"]/*')
print(result)
result = html.xpath('//*[@*]')
print(len(result))

# 4. Select several paths at once
# path1|path2|path3|...
result = html.xpath('//div[@id="div1"]/p/text()|//div[@id="div1"]/a/text()')
print(result)
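XPath 1.0 also ships predicate functions such as last(), position(), and contains(), all supported by lxml; a short sketch against the same html object:

# last()/position() filter by slot; contains() does substring matching
print(html.xpath('//li[last()]/text()'))          # text of the last li
print(html.xpath('//li[position()<=2]/text()'))   # the first two li tags
print(html.xpath('//*[contains(@class, "c1")]'))  # any tag whose class contains "c1"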