Tab switching and XPath parsing
JD.com product crawler: 50 pages of data
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time, csv
from bs4 import BeautifulSoup

options = ChromeOptions()
# hide the "browser is being controlled by automated software" banner
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# skip loading images to speed the pages up
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# If the browser object is a global variable, the browser stays open after the work is done;
# if it is a local variable, the browser closes automatically when the function returns.
b = Chrome(options=options)
b.implicitly_wait(10)

# create the CSV writer
writer = csv.writer(open('files/运动鞋数据.csv', 'w', encoding='utf-8', newline=''))
writer.writerow(['name', 'price', 'comment_count', 'shop', 'goods_url', 'image_url', 'shop_url'])
def analysis_data(html: str):
    soup = BeautifulSoup(html, 'lxml')
    all_goods_li = soup.select('.gl-warp>li')
    all_data = []
    for goods in all_goods_li:
        a = goods.select_one('.p-img>a')
        title = a.attrs['title']
        goods_url = 'https:' + a.attrs['href']
        img_dict = a.select_one('img').attrs
        # images below the fold are lazy-loaded: until scrolled into view,
        # the URL sits in data-lazy-img instead of src
        if 'src' in img_dict:
            img_url = 'https:' + img_dict['src']
        else:
            img_url = 'https:' + img_dict['data-lazy-img']
        price = goods.select_one('.p-price i').text
        comment_count = goods.select_one('.p-commit a').text
        shop = goods.select_one('.curr-shop')
        shop_name = shop.text
        shop_url = shop.attrs['href']
        all_data.append([title, price, comment_count, shop_name, goods_url, img_url, shop_url])
    writer.writerows(all_data)
    print('==================== data written successfully ====================')
def get_net_data():
    # 1. open jd.com and search for 运动鞋 (sneakers)
    b.get('https://www.jd.com')
    search = b.find_element(By.ID, 'key')
    search.send_keys('运动鞋')
    search.send_keys(Keys.ENTER)
    for _ in range(50):
        # 2. scroll the page so the lazy-loaded items render
        for _ in range(10):
            b.execute_script('window.scrollBy(0,800)')
            time.sleep(1)
        # 3. grab the page source and parse it
        analysis_data(b.page_source)
        # 4. click "next page"
        b.find_element(By.CLASS_NAME, 'pn-next').click()


if __name__ == '__main__':
    get_net_data()
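The 10 fixed scrolls above assume every page is fully rendered after 8000px. A more robust variant (a sketch, not JD-specific) keeps scrolling until the document height stops growing:

import time

def scroll_to_bottom(driver, step=800, pause=1.0):
    # scroll down one step at a time until the document height stops increasing
    last_height = driver.execute_script('return document.body.scrollHeight')
    while True:
        driver.execute_script(f'window.scrollBy(0, {step})')
        time.sleep(pause)
        new_height = driver.execute_script('return document.body.scrollHeight')
        at_bottom = driver.execute_script(
            'return window.pageYOffset + window.innerHeight >= document.body.scrollHeight - 1')
        if at_bottom and new_height == last_height:
            break
        last_height = new_height

Calling scroll_to_bottom(b) in step 2 would then replace the inner for loop.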
CNKI (China National Knowledge Infrastructure) tab switching
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

# 1. open CNKI
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
b = Chrome(options=options)
b.implicitly_wait(5)
b.get('https://kns.cnki.net/kns8')

# 2. search for the topic
search = b.find_element(By.ID, 'txt_search')
search.send_keys('人工智能')
search.send_keys(Keys.ENTER)

# 3. collect the search results
wait = WebDriverWait(b, 5)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'result-table-list')))
search_result = b.find_elements(By.CSS_SELECTOR, '.result-table-list .name>a')

# 4. click each result and grab the detail page
for a in search_result:
    # each click opens the detail page in a new tab
    a.click()
    time.sleep(1)
    # switching tabs:
    # driver.window_handles - list of all window handles currently open in this browser
    # driver.switch_to.window(handle) - switch the driver to the given window
    b.switch_to.window(b.window_handles[-1])
    print(b.page_source)
    b.close()
    # switch back to the original window
    b.switch_to.window(b.window_handles[0])
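BeautifulSoup is imported above but never used; instead of printing page_source, the detail page could be handed to a parser. A minimal sketch, with a placeholder selector, since CNKI's real markup has to be inspected first:

from bs4 import BeautifulSoup

def parse_detail(html: str):
    soup = BeautifulSoup(html, 'lxml')
    # '.detail-title' is a hypothetical selector, not CNKI's actual class name
    title_tag = soup.select_one('.detail-title')
    return title_tag.text.strip() if title_tag else None

Inside the loop, print(parse_detail(b.page_source)) would then replace print(b.page_source).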
XPath data parsing
XPath-based data parsing in Python relies on the third-party lxml library, which must be imported first:

from lxml import etree
1.The lxml third-party library

- Purpose: lxml can parse both XML content and HTML content (see the sketch after the table below).
- XPath terminology:

| Term | Meaning |
| --- | --- |
| tree | the structure of the whole web page or XML file |
| node | a tag / element |
| root node | the outermost tag |
| absolute path | a path written starting from the root node |
| relative path | a path starting from the current node, written with . |
| tag content | the content of a paired (double) tag |
| attribute | an attribute of a tag |
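A quick demonstration that the same etree API covers both formats; etree.HTML builds a tree even from fragmentary HTML:

from lxml import etree

html_root = etree.HTML('<div><p class="c1">hello</p><p>world</p></div>')
print(html_root.xpath('//p/text()'))               # ['hello', 'world']
print(html_root.xpath('//p[@class="c1"]/text()'))  # ['hello']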
2.XML data structure

Like JSON, XML is a general-purpose data format. The same record in both formats (the JSON version is filled in from the XML below):

# JSON example
supermarket = {
    "name": "永辉超市",
    "address": "肖家河大厦",
    "staff": [
        {"name": "张三", "position": "收银员"},
        {"name": "小明", "position": "售货员"},
        {"name": "小红", "position": "售货员"},
        {"name": "小花", "position": "促销员"}
    ]
}
# XML example
<supermarket>
    <address>肖家河大厦</address>
    <name>永辉超市</name>
    <staffs>
        <staff position="收银员" class="c2">张三</staff>
        <staff position="售货员" class="c2">小明</staff>
        <staff position="售货员">小红</staff>
        <staff position="促销员">小花</staff>
    </staffs>
    <all_goods>
        <goods class="c2">
            <name>泡面</name>
            <price>3.5</price>
            <count>120</count>
        </goods>
        <goods class="c2">
            <name>火腿肠</name>
            <price>1.5</price>
            <count>305</count>
        </goods>
        <goods class="c3" id="d2">
            <name>矿泉水</name>
            <price>1.5</price>
            <count>1200</count>
        </goods>
        <goods id="d1">
            <name>巧克力</name>
            <price>11.5</price>
            <count>50</count>
        </goods>
    </all_goods>
</supermarket>
3.XPath syntax - getting tags

1.Create the tree and get the root node

etree.XML(xml_string) - builds an XML tree and returns its root node
etree.HTML(html_string) - builds an HTML tree and returns its root node

root = etree.XML(open('超市.xml', encoding='utf-8').read())
2.Get tags by path

node.xpath(path) - returns a list whose elements are the node objects of every tag matched by the given path

| Path type | Notation |
| --- | --- |
| absolute path | /path/written/from/the/root |
| relative path | . is the current node (whichever node xpath() is called on); .. is the current node's parent |
| arbitrary path | //path |
# ==================1. absolute paths====================
# An absolute path is always written from the root node down; it does not matter which node xpath() is called on
result = root.xpath('/supermarket/all_goods/goods/name/text()')
print(result)

all_goods_node = root.xpath('/supermarket/all_goods')[0]
result = all_goods_node.xpath('/supermarket/all_goods/goods/name/text()')
print(result)

# ==================2. relative paths====================
result = root.xpath('./all_goods/goods/name/text()')
print(result)

result = all_goods_node.xpath('./goods/name/text()')
print(result)

result = all_goods_node.xpath('../all_goods/goods/name/text()')
print(result)

goods_list = all_goods_node.xpath('./goods')
for goods in goods_list:
    print(goods.xpath('./name/text()')[0], end=' ')
    print(goods.xpath('./price/text()')[0], end=' ')
    print(goods.xpath('./count/text()')[0])

# ==================3. arbitrary paths====================
result = root.xpath('//name/text()')
print(result)  # ['永辉超市', '泡面', '火腿肠', '矿泉水', '巧克力']

result = root.xpath('//goods/name/text()')
print(result)  # ['泡面', '火腿肠', '矿泉水', '巧克力']
3.Getting tag content

path to the tag + /text()
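For example, against the supermarket XML above:

result = root.xpath('/supermarket/name/text()')
print(result)  # ['永辉超市']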
4.Getting attribute values

path to the tag + /@attribute_name

result = root.xpath('//staff/@position')
print(result)  # ['收银员', '售货员', '售货员', '促销员']

staff = root.xpath('//staff')[0]
print(staff.xpath('./@position')[0], staff.xpath('./@class')[0])  # 收银员 c2
5.Predicates - conditions

Position-based conditions:

| Operation | Meaning |
| --- | --- |
| tag[N] | the Nth matching tag |
| tag[last()] | the last matching tag |
| tag[last()-N] | the (N+1)th matching tag from the end |
| tag[position()<N] | every matching tag whose position (counted from 1) is less than N |
| tag[position()>N] | every matching tag whose position is greater than N |
| tag[position()<=N] | every matching tag whose position is at most N |
| tag[position()>=N] | every matching tag whose position is at least N |
# the 2nd one
result = root.xpath('//all_goods/goods[2]/name/text()')[0]
print(result)  # 火腿肠

# the last one
result = root.xpath('//all_goods/goods[last()]/name/text()')[0]
print(result)  # 巧克力

# the 2nd from the end
result = root.xpath('//all_goods/goods[last()-1]/name/text()')[0]
print(result)  # 矿泉水

result = root.xpath('//all_goods/goods[position()<=3]/name/text()')
print(result)  # ['泡面', '火腿肠', '矿泉水']
Attribute-based conditions:

| Operation | Meaning |
| --- | --- |
| tag[@attr] | matching tags that have the given attribute |
| tag[@attr=value] | matching tags whose attribute equals the given value |
result = root.xpath('//goods[@class]/name/text()')
print(result)  # ['泡面', '火腿肠', '矿泉水']

result = root.xpath('//goods[@id]/name/text()')
print(result)  # ['矿泉水', '巧克力']

# //div[@class="c2"] is equivalent to the CSS selector div.c2
result = root.xpath('//goods[@class="c2"]/name/text()')
print(result)  # ['泡面', '火腿肠']
Content-based conditions

Filter a parent tag by the content of one of its child tags:

| Operation |
| --- |
| tag[child>value] |
| tag[child<value] |
| tag[child<=value] |
| tag[child>=value] |
| tag[child=value] |
# names of the goods whose price is greater than 3
result = root.xpath('//goods[price>3]/name/text()')
print(result)  # ['泡面', '巧克力']

# name of the goods whose name is "巧克力"
result = root.xpath('//goods[name="巧克力"]/name/text()')
print(result)  # ['巧克力']

result = root.xpath('//goods[name="巧克力"]/count/text()')
print(result)  # ['50']
6. Wildcard - *

* can stand in for any tag name or any attribute name.
result = root.xpath('//*[@class="c2"]')
print(result)  # four element objects: the two <staff> and two <goods> with class="c2"

result = root.xpath('//goods/*/text()')
print(result)  # ['泡面', '3.5', '120', '火腿肠', '1.5', '305', '矿泉水', '1.5', '1200', '巧克力', '11.5', '50']
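On the attribute axis, @* plays the same role, fetching every attribute value of the matched node (here class, then id, in source order):

result = root.xpath('//goods[@id="d2"]/@*')
print(result)  # ['c3', 'd2']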
Parsing Douban movies with XPath
import requests
from lxml import etree


def get_html(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # parse the data
    html = etree.HTML(response.text)

    # Method 1: walk each list item and read name and score relative to it
    # all_li = html.xpath('//ol[@class="grid_view"]/li/div')
    # for div in all_li:
    #     name = div.xpath('./div[2]/div[1]/a/span[1]/text()')[0]
    #     score = div.xpath('./div[2]/div[2]/div/span[2]/text()')[0]
    #     print(name, score)

    # Method 2: grab all names and all scores, then pair them up
    names = html.xpath('//div[@class="hd"]/a/span[1]/text()')
    scores = html.xpath('//span[@class="rating_num"]/text()')
    # [[name, score], [name2, score2], ...]
    all_data = list(map(lambda name, score: [name, score], names, scores))
    print(all_data)


if __name__ == '__main__':
    get_html('https://movie.douban.com/top250')
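The public Top250 page spreads its entries over 10 pages via the start query parameter (0, 25, ..., 225), so covering the whole list is a matter of looping the entry point:

if __name__ == '__main__':
    for start in range(0, 250, 25):
        get_html(f'https://movie.douban.com/top250?start={start}')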
Exercise: crawl every League of Legends hero and skin and save them to files
import requests, json, os, csv


def get_all_skins(url: str, file_path: str):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    all_skins = json.loads(response.text)['skins']
    all_hero_skin = []
    for skin in all_skins:
        skin_name = skin['name']
        all_hero_skin.append([skin_name])
    # append this hero's skin names to a CSV inside the hero's folder
    writer = csv.writer(open(f'{file_path}/皮肤.csv', 'a', encoding='utf-8', newline=''))
    writer.writerows(all_hero_skin)


def get_all_hero():
    url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    all_hero = json.loads(response.text)['hero']
    for hero in all_hero:
        number = hero['heroId']
        name = hero['title']
        alias = hero['alias']
        # hero portrait (not downloaded here)
        img_url = f'https://game.gtimg.cn/images/lol/act/img/champion/{alias}.png'
        # per-hero JSON listing that hero's skins
        game_skin_url = f'https://game.gtimg.cn/images/lol/act/img/js/hero/{number}.js'
        # one folder per hero; makedirs(exist_ok=True) also creates 'files' and tolerates reruns
        file_path = os.path.join(r'files', name)
        os.makedirs(file_path, exist_ok=True)
        get_all_skins(game_skin_url, file_path)


if __name__ == '__main__':
    get_all_hero()
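The exercise asks for the skins to be saved to files, but the code above only records skin names. A sketch of the missing download step, assuming each entry in the 'skins' list exposes its image URL under a 'mainImg' key (an assumption about this endpoint's payload; verify against a real response first):

import os, requests

def download_skin_images(skins: list, folder: str):
    for skin in skins:
        img_url = skin.get('mainImg')  # assumed key; may be empty for chroma variants
        if not img_url:
            continue
        data = requests.get(img_url).content
        # replace path separators that can appear in skin names (e.g. "K/DA" lines)
        safe_name = skin['name'].replace('/', '-')
        with open(os.path.join(folder, f'{safe_name}.jpg'), 'wb') as f:
            f.write(data)

get_all_skins could call download_skin_images(all_skins, file_path) right after collecting the names.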