day-5 Data Analysis
一、selenium设置代理
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import requests
def get_ip():
    """Fetch a list of proxy addresses from the proxy-provider API.

    Returns:
        A list of non-empty 'ip:port' strings on success, or None when the
        HTTP request fails or the provider returns an error payload.
    """
    # NOTE(review): the provider endpoint is blank here — fill in the URL.
    url = ''
    # Bug fix: the original misspelled the module as 'erquests' (NameError).
    response = requests.get(url)
    if response.status_code == 200:
        # A body starting with '{' is a JSON error message from the provider;
        # a successful response is plain newline-separated addresses.
        if response.text[0] == '{':
            print('获取失败')
        else:
            return [x for x in response.text.split('\n') if x]
    else:
        print('请求失败')
# Keep requesting proxy IPs until we get a usable one, then launch Chrome
# routed through the first proxy in the list.
while True:
    # Bug fix: the original fetched once before the loop, so a failed fetch
    # made the loop print '获取失败' forever; re-fetch on every iteration.
    ips = get_ip()
    if ips:
        options = ChromeOptions()
        # Route all browser traffic through the fetched proxy.
        options.add_argument(f'--proxy-server=http://{ips[0]}')
        # Bug fix: the original misspelled 'webdriver' and 'options'.
        b = webdriver.Chrome(options=options)
        b.get('')
        break
    else:
        print('获取失败')
二、bs4的使用
1.html页面解析的方法
"""
1.正则表达式
2.通过css选择器选中标签 bs4,pyQuery
3.通过xPath获取标签
"""
"""准备数据"""
def get_data():
with open('files/安居客.html',encoding='utf-8') as f:
return f.read()
2.bs4的使用
"""根据网页内容创建解析器对象"""
# BeautifulSoup(网页内容,解析器类型)
# 网页内容 - HTML格式的字符串 - 一般通过requests 和 selenium 去页面数据
soup = BeautifulSoup(get_data,'lxml')
"""根据css选择器获取标签"""
"""
bs对象.select(css选择器) - 获取选择器选中的标签
bs对象.select_noe(css选择器) - 获取选择器选中的第一个标签
"""
house_name = soup.select('.items-name')
house_details = soup.select('.favor-pos'
"""根据属性获取标签
find_all(attrs={属性1:属性值1,...}) - 获取指定属性是指定值的所有标签
"""
# 4)获取标签内容和属性
# a.标签内容值
# 标签对象.string 只提取标签里是文字的(自取子代)
# 标签对象.get_text() 标签里的所有文字 包括子标签
# 标签对象.contents
print('================名字==================')
for item in house_name:
print(item.string) # 只提取标签里是文字的(自取子代)
print(item.get_text()) # 标签里的所有文字 包括子标签
print(item.contents)
# b.标签属性值
# 标签对象.attrs[属性名]
print('================链接==================')
for x in house_details:
print(x.attrs['href'])
print('===============价格=====================')
house_price = soup.select('.price')
for item in house_price:
print(item.get_text())
print('===============图片=====================')
imgs = soup.select('.pic>img')
for x in imgs:
print(x.attrs['src'])
print('========================================================')
result = soup.find_all(attrs={'height':'135'})
print(result)
三、bs4的应用
from bs4 import BeautifulSoup
# Prepare the sample data
def get_data():
    """Return the contents of the saved 安居客 listing page."""
    with open('files/安居客.html', encoding='utf-8') as page:
        html = page.read()
    return html
# Parse the data
def analysis_data(data):
    """Extract one record per property from the 安居客 listing page.

    data: HTML source of the listing page.
    Returns a list of dicts with 'name', 'price' and 'pic' keys; a missing
    tag yields None for the corresponding value.
    """
    soup = BeautifulSoup(data, 'lxml')
    houses = []
    # One .item-mod div per property listing
    for box in soup.select('.list-results .item-mod'):
        name_tag = box.select_one(".items-name")
        price_tag = box.select_one('.price')
        link_tag = box.select_one('.lp-name')
        houses.append({
            'name': name_tag.string if name_tag else None,
            'price': price_tag.get_text() if price_tag else None,
            'pic': link_tag.attrs['href'] if link_tag else None,
        })
    return houses
if __name__ == '__main__':
    # Parse the saved page and show every extracted record.
    houses = analysis_data(get_data())
    print(houses)
四、pyQuery的使用
from pyquery import PyQuery
# Prepare the sample data
def get_data():
    """Load the saved 安居客 listing page from disk as an HTML string."""
    with open('files/安居客.html', encoding='utf-8') as source:
        return source.read()
"""创建pyQuery对象"""
doc = PyQuery(get_data())
# 2.获取标签
# pyQuery对象.(css选择器) - 选中css选中器选中的标签,返回PyQuery对象
names = doc('.items-name')
# 3.获取标签内容
# PuQuery对象.text()
# 直接操作pyQuery 对象会直接作用与这个对应中所有的标签
# print(names.text())
# 遍历pyQuery对象得到的是这个容器中的每个标签
for x in names:
print(x,PyQuery(x).text())
# 4.获取标签属性
# 1)value属性
# PyQuery对象.val()
# 2) 普通属性
# PyQuery对象.attr(属性名)
result = doc('.pic>img')
for x in result:
print(PyQuery(x).attr('src'))
五、pyQuery的应用
from pyquery import PyQuery
# Prepare the sample data
def get_data():
    """Read and return the cached 安居客 listing page."""
    with open('files/安居客.html', encoding='utf-8') as fp:
        contents = fp.read()
    return contents
def analysis_data(data):
    """Parse the listing page with pyquery and print one dict per property.

    data: HTML source of the 安居客 listing page.
    Side effect: prints a dict with name/area/price/title for each listing.
    """
    doc = PyQuery(data)
    # One .item-mod node per property listing
    for item in doc('.key-list>.item-mod'):
        pq_item = PyQuery(item)
        house = {
            'name': pq_item('.items-name').text(),
            'area': pq_item('.building-area').text(),
            'price': pq_item('.price').text(),
            # Bug fix: the key was misspelled 'tile' in the original.
            'title': pq_item('.group-mark').text(),
        }
        print(house)
if __name__ == '__main__':
    # Load the saved page, then parse and print the listings.
    page = get_data()
    analysis_data(page)