1. 代理和css选择器解析
获取代理ip
import requests
def get_proxy_ips():
    """Fetch a batch of proxy addresses ("ip:port" strings) from the mogumiao API.

    Returns:
        list[str] | None: proxy addresses on success; None when the API
        rate-limits the request or the HTTP request fails.
    """
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3'
    response = requests.get(api)
    if response.status_code == 200:
        # When rate-limited the API answers with a JSON error object instead of
        # the newline-separated ip list. startswith() is safe on an empty body,
        # unlike the original `response.text[0]` which raised IndexError.
        if response.text.startswith('{'):
            print('获取代理失败!提取太频繁')
        else:
            # Drop the trailing empty element produced by the final newline.
            return response.text.split('\n')[:-1]
    else:
        print('请求失败!')
def get_net_data():
    """Fetch the douban Top250 page through a proxy, retrying until one works.

    Returns:
        str: the page HTML (also printed, as before), once a proxy succeeds.
    """
    url = 'https://movie.douban.com/top250'
    # 请求头 — browser-like UA so the site does not reject the request outright.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # 代理 — keep requesting fresh proxies until one request succeeds.
    while True:
        ips = get_proxy_ips()
        if ips:
            proxies = {
                'http': ips[0],  # 'http': 'ip地址:端口号'
                'https': ips[0]
            }
            try:
                response = requests.get(url, headers=headers, proxies=proxies, timeout=2)
            except requests.exceptions.RequestException:
                # Dead or slow proxy — fetch a new batch and retry instead of
                # crashing (the original had no exception handling here).
                continue
            if response.status_code == 200:
                print(response.text)
                # Stop retrying once the page is fetched; the original looped forever.
                return response.text
            else:
                print('数据请求失败!')
        else:
            print('没有成功获取到代理')
# Run the crawler only when this module is executed as a script.
if __name__ == '__main__':
    get_net_data()
2.代理的程序优化
import requests
import time
def get_proxy_ips():
    """Fetch a batch of proxy addresses ("ip:port" strings) from the mogumiao API.

    Returns:
        list[str] | None: proxy addresses on success; None when the API
        rate-limits the request or the HTTP request fails.
    """
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3'
    response = requests.get(api)
    if response.status_code == 200:
        # When rate-limited the API answers with a JSON error object instead of
        # the newline-separated ip list. startswith() is safe on an empty body,
        # unlike the original `response.text[0]` which raised IndexError.
        if response.text.startswith('{'):
            print('获取代理失败!提取太频繁')
        else:
            # Drop the trailing empty element produced by the final newline.
            return response.text.split('\n')[:-1]
    else:
        print('ip请求失败!')
def get_net_data(url):
    """Download *url* through rotating proxies, retrying until one succeeds.

    Args:
        url: the page to fetch.

    Returns:
        str: the page HTML (keeps retrying with fresh proxy batches until
        one request returns 200).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    while True:
        # 获取5个代理ip — fetch a fresh batch of proxies.
        ips = get_proxy_ips()
        # 如果没有取到 — back off briefly so we don't hammer the proxy API.
        if not ips:
            print('ip获取失败!')
            time.sleep(1)
            continue
        # Try each proxy once. One address serves both schemes: for an https
        # URL only the 'https' entry is consulted anyway, and popping a single
        # ip per attempt no longer crashes on an odd-sized batch (the original
        # paired pop() raised IndexError when len(ips) was odd).
        while ips:
            ip = ips.pop()
            print(ip)
            proxies = {
                'http': ip,
                'https': ip
            }
            try:
                response = requests.get(url, headers=headers, proxies=proxies, timeout=3)
                if response.status_code == 200:
                    # print(response.text)
                    return response.text
                else:
                    print('数据请求失败!')
            except requests.exceptions.RequestException:
                # Broader than the original (ProxyError, ConnectTimeout): also
                # covers ReadTimeout and ConnectionError from flaky proxies.
                print('超时,继续请求')
# Script entry point: fetch the page through rotating proxies and dump it.
if __name__ == '__main__':
    page = get_net_data('https://movie.douban.com/top250')
    print(page)
3. bs4的使用
import requests
from bs4 import BeautifulSoup
# BeautifulSoup - 解析器类
def get_net_data(url):
    """Fetch *url* and return its HTML text; print the response object on failure."""
    response = requests.get(url)
    # Guard clause: anything but 200 is treated as a failure.
    if response.status_code != 200:
        print(response)
        return None
    return response.text
# - Creating the parser object
def analysis_data(data: str):
    """Demonstrate BeautifulSoup css-selector parsing on an html string.

    Args:
        data: the html document to parse.
    """
    # 1. Create the parser object.
    # BeautifulSoup(html string to parse, parser name)
    bs = BeautifulSoup(data, 'lxml')
    print(type(bs))
    # print(bs)

    # - Selecting tags with css selectors
    # select(css selector)     - all tags matched by the selector, as a list
    # select_one(css selector) - the first tag matched by the selector
    result = bs.select('#p1')
    print(result, len(result), type(result[0]))
    result = bs.select('p')
    print(result, len(result), type(result[0]))
    result = bs.select('div>p')
    print(result, type(result[0]))
    result = bs.select_one('div>p')
    print(result)

    # - Getting tag content
    # tag.string     - the tag's text (None when the tag mixes text with child
    #                  tags or has several children); returns a string
    # tag.get_text() - the tag's text including all nested children's text
    # tag.contents   - list of the tag's child nodes (text and tags)
    p1 = bs.select_one('div>p')
    print('string:', p1.string)  # 我是段落1
    print('text:', p1.get_text())
    print('contents:', p1.contents)
    p2 = bs.select_one('#p1')
    print('p2:', p2)
    print('p2-string:', p2.string)
    print('p2-text:', p2.get_text())
    print('p2-contents:', p2.contents)
    p3 = bs.select_one('#p2')
    print('p3-string:', p3.string)  # None
    print('p3-text:', p3.get_text())
    print('p3-contents:', p3.contents)
    print('价格:', bs.select_one('.main-price').get_text())

    # - Getting tag attributes
    # tag.attrs[attribute name]
    img = bs.select_one('img')
    print(img.attrs)
    print(img.attrs['src'])
    a = bs.select('a')
    print(a[-1].attrs['href'])

    # - Searching inside a specific tag
    # tag.select(css selector)     - all matching descendants of this tag
    # tag.select_one(css selector) - the first matching descendant
    print('所有的p标签:', len(bs.select('p')))
    div = bs.select_one('div')
    result = div.select('p')
    print('div中p标签:', len(result))
if __name__ == '__main__':
    # data = get_net_data('https://cd.fang.ke.com/loupan/pg1/')
    # Sample html used in place of a live request:
    data = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p id="p1" class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<div>
<p>我是段落1</p>
<p>我是段落2</p>
<img src="https://www.baidu.com/img/PCtm_d9c8750bed0b3c7d089fa7d55720d6cf.png" title="图片1" alt="警告">
<a href="https://www.baidu.com">百度一下</a>
<p id="p2">我是段落2<b>你好</b></p>
<p id="p3">你好<br><i>世界</i></p>
</div>
<div class="main-price">
<span class="number">28000</span>
<span class="desc"> 元/㎡(均价)</span>
</div>
"""
    if data:
        analysis_data(data)
4. csv文件操作
import csv
将数据写入csv文件中
- 用列表提供数据
csv.writer(文件对象) - 以列表为单位写入一行数据
# 1. Open the target file; newline='' stops the csv module from doubling line
#    endings on Windows.
with open('files/test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # 2. Write the header row, then the data rows in one call.
    header = ['姓名', '性别', '年龄', '分数']
    rows = [
        ['张三', '男', 28, 98],
        ['小明', '男', 19, 72],
        ['小花', '女', 20, 99],
    ]
    writer.writerow(header)
    writer.writerows(rows)
- 用字典提供数据
with open('files/test2.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, ['name', 'age', 'gender', 'score'])
    # First row: map each field name to its display label.
    # (writer.writeheader() would emit the raw field names instead.)
    writer.writerow({'name': '姓名', 'age': '年龄', 'gender': '性别', 'score': '分数'})
    # A single data row …
    writer.writerow({'name': '张三', 'age': 23, 'gender': '男', 'score': 76})
    # … followed by several rows written in one call.
    more_rows = [
        {'name': '张三1', 'age': 23, 'gender': '男', 'score': 76},
        {'name': '张三2', 'age': 23, 'gender': '男', 'score': 76},
        {'name': '张三3', 'age': 23, 'gender': '男', 'score': 76},
    ]
    writer.writerows(more_rows)
读取csv文件内容
注意:任意一个csv文件都可以选择使用列表或者字典的方式去读
- 一行数据对应一个列表
with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
    # csv.reader is an iterator yielding one list per csv row.
    row_iter = csv.reader(f)
    # Print the first two rows one at a time.
    for _ in range(2):
        print(next(row_iter))
    # Skip the third row without printing it.
    next(row_iter)
    # Everything that is left, as a list of lists.
    print(list(row_iter))
- 一行数据对应一个字典
with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
    # csv.DictReader yields one dict per data row, keyed by the header row.
    dict_reader = csv.DictReader(f)
    print(dict_reader.fieldnames)  # ['姓名', '性别', '年龄', '分数']
    print(next(dict_reader))       # {'姓名': '张三', '性别': '男', '年龄': '28', '分数': '98'}