day-5 Data Analysis
一、selenium设置代理
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import requests
def get_ip():
    """Fetch a list of proxy addresses from the proxy-provider API.

    Returns:
        A list of non-empty 'ip:port' strings on success, or None when the
        HTTP request fails or the provider returns an error payload.
    """
    # NOTE(review): the provider endpoint is blank here — fill in the URL.
    url = ''
    # Bug fix: the original misspelled the module as 'erquests' (NameError).
    response = requests.get(url)
    if response.status_code == 200:
        # A body starting with '{' is a JSON error message from the provider;
        # a successful response is plain newline-separated addresses.
        if response.text[0] == '{':
            print('获取失败')
        else:
            return [x for x in response.text.split('\n') if x]
    else:
        print('请求失败')
# Keep requesting proxy IPs until we get a usable one, then launch Chrome
# routed through the first proxy in the list.
while True:
    # Bug fix: the original fetched once before the loop, so a failed fetch
    # made the loop print '获取失败' forever; re-fetch on every iteration.
    ips = get_ip()
    if ips:
        options = ChromeOptions()
        # Route all browser traffic through the fetched proxy.
        options.add_argument(f'--proxy-server=http://{ips[0]}')
        # Bug fix: the original misspelled 'webdriver' and 'options'.
        b = webdriver.Chrome(options=options)
        b.get('')
        break
    else:
        print('获取失败')
二、bs4的使用
1.html页面解析的方法
"""
1.正则表达式
2.通过css选择器选中标签 bs4,pyQuery
3.通过xPath获取标签
"""
"""准备数据"""
def get_data():
with open('files/安居客.html',encoding='utf-8') as f:
return f.read()
2.bs4的使用
"""根据网页内容创建解析器对象"""
# BeautifulSoup(网页内容,解析器类型)
# 网页内容 - HTML格式的字符串 - 一般通过requests 和 selenium 去页面数据
soup = BeautifulSoup(get_data,'lxml')
"""根据css选择器获取标签"""
"""
bs对象.select(css选择器) - 获取选择器选中的标签
bs对象.select_noe(css选择器) - 获取选择器选中的第一个标签
"""
house_name = soup.select('.items-name')
house_details = soup.select('.favor-pos'
"""根据属性获取标签
find_all(attrs={属性1:属性值1,...}) - 获取指定属性是指定值的所有标签
"""
# 4)获取标签内容和属性
# a.标签内容值
# 标签对象.string 只提取标签里是文字的(自取子代)
# 标签对象.get_text() 标签里的所有文字 包括子标签
# 标签对象.contents
print('================名字==================')
for item in house_name:
print(item.string) # 只提取标签里是文字的(自取子代)
print(item.get_text()) # 标签里的所有文字 包括子标签
print(item.contents)
# b.标签属性值
# 标签对象.attrs[属性名]
print('================链接==================')
for x in house_details:
print(x.attrs['href'])
print('===============价格=====================')
house_price = soup.select('.price')
for item in house_price:
print(item.get_text())
print('===============图片=====================')
imgs = soup.select('.pic>img')
for x in imgs:
print(x.attrs['src'])
print('========================================================')
result = soup.find_all(attrs={'height':'135'})
print(result)
三、bs4的应用
from bs4 import BeautifulSoup
# Prepare the sample data
def get_data():
    """Return the contents of the saved 安居客 listing page."""
    with open('files/安居客.html', encoding='utf-8') as page:
        html = page.read()
    return html
# Parse the data
def analysis_data(data):
    """Extract one record per property from the 安居客 listing page.

    data: HTML source of the listing page.
    Returns a list of dicts with 'name', 'price' and 'pic' keys; a missing
    tag yields None for the corresponding value.
    """
    soup = BeautifulSoup(data, 'lxml')
    houses = []
    # One .item-mod div per property listing
    for box in soup.select('.list-results .item-mod'):
        name_tag = box.select_one(".items-name")
        price_tag = box.select_one('.price')
        link_tag = box.select_one('.lp-name')
        houses.append({
            'name': name_tag.string if name_tag else None,
            'price': price_tag.get_text() if price_tag else None,
            'pic': link_tag.attrs['href'] if link_tag else None,
        })
    return houses
if __name__ == '__main__':
    # Parse the saved page and show every extracted record.
    houses = analysis_data(get_data())
    print(houses)
四、pyQuery的使用
from pyquery import PyQuery
# Prepare the sample data
def get_data():
    """Load the saved 安居客 listing page from disk as an HTML string."""
    with open('files/安居客.html', encoding='utf-8') as source:
        return source.read()
"""创建pyQuery对象"""
doc = PyQuery(get_data())
# 2.获取标签
# pyQuery对象.(css选择器) - 选中css选中器选中的标签,返回PyQuery对象
names = doc('.items-name')
# 3.获取标签内容
# PuQuery对象.text()
# 直接操作pyQuery 对象会直接作用与这个对应中所有的标签
# print(names.text())
# 遍历pyQuery对象得到的是这个容器中的每个标签
for x in names:
print(x,PyQuery(x).text())
# 4.获取标签属性
# 1)value属性
# PyQuery对象.val()
# 2) 普通属性
# PyQuery对象.attr(属性名)
result = doc('.pic>img')
for x in result:
print(PyQuery(x).attr('src'))
五、pyQuery的应用
from pyquery import PyQuery
# Prepare the sample data
def get_data():
    """Read and return the cached 安居客 listing page."""
    with open('files/安居客.html', encoding='utf-8') as fp:
        contents = fp.read()
    return contents
def analysis_data(data):
    """Parse the listing page with pyquery and print one dict per property.

    data: HTML source of the 安居客 listing page.
    Side effect: prints a dict with name/area/price/title for each listing.
    """
    doc = PyQuery(data)
    # One .item-mod node per property listing
    for item in doc('.key-list>.item-mod'):
        pq_item = PyQuery(item)
        house = {
            'name': pq_item('.items-name').text(),
            'area': pq_item('.building-area').text(),
            'price': pq_item('.price').text(),
            # Bug fix: the key was misspelled 'tile' in the original.
            'title': pq_item('.group-mark').text(),
        }
        print(house)
if __name__ == '__main__':
    # Load the saved page, then parse and print the listings.
    page = get_data()
    analysis_data(page)