1. 代理和css选择器解析
获取代理ip
import requests
def get_proxy_ips():
    """Fetch a batch of proxy addresses ("ip:port" strings) from the mogumiao API.

    Returns:
        list[str] | None: proxy addresses on success; None when the API
        rate-limits the request or the HTTP request fails.
    """
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3'
    response = requests.get(api)
    if response.status_code == 200:
        # When rate-limited the API answers with a JSON error object instead of
        # the newline-separated ip list. startswith() is safe on an empty body,
        # unlike the original `response.text[0]` which raised IndexError.
        if response.text.startswith('{'):
            print('获取代理失败!提取太频繁')
        else:
            # Drop the trailing empty element produced by the final newline.
            return response.text.split('\n')[:-1]
    else:
        print('请求失败!')
def get_net_data():
    """Fetch the douban Top250 page through a proxy, retrying until one works.

    Returns:
        str: the page HTML (also printed, as before), once a proxy succeeds.
    """
    url = 'https://movie.douban.com/top250'
    # 请求头 — browser-like UA so the site does not reject the request outright.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # 代理 — keep requesting fresh proxies until one request succeeds.
    while True:
        ips = get_proxy_ips()
        if ips:
            proxies = {
                'http': ips[0],  # 'http': 'ip地址:端口号'
                'https': ips[0]
            }
            try:
                response = requests.get(url, headers=headers, proxies=proxies, timeout=2)
            except requests.exceptions.RequestException:
                # Dead or slow proxy — fetch a new batch and retry instead of
                # crashing (the original had no exception handling here).
                continue
            if response.status_code == 200:
                print(response.text)
                # Stop retrying once the page is fetched; the original looped forever.
                return response.text
            else:
                print('数据请求失败!')
        else:
            print('没有成功获取到代理')
# Run the crawler only when this module is executed as a script.
if __name__ == '__main__':
    get_net_data()
2.代理的程序优化
import requests
import time
def get_proxy_ips():
    """Fetch a batch of proxy addresses ("ip:port" strings) from the mogumiao API.

    Returns:
        list[str] | None: proxy addresses on success; None when the API
        rate-limits the request or the HTTP request fails.
    """
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3'
    response = requests.get(api)
    if response.status_code == 200:
        # When rate-limited the API answers with a JSON error object instead of
        # the newline-separated ip list. startswith() is safe on an empty body,
        # unlike the original `response.text[0]` which raised IndexError.
        if response.text.startswith('{'):
            print('获取代理失败!提取太频繁')
        else:
            # Drop the trailing empty element produced by the final newline.
            return response.text.split('\n')[:-1]
    else:
        print('ip请求失败!')
def get_net_data(url):
    """Download *url* through rotating proxies, retrying until one succeeds.

    Args:
        url: the page to fetch.

    Returns:
        str: the page HTML (keeps retrying with fresh proxy batches until
        one request returns 200).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    while True:
        # 获取5个代理ip — fetch a fresh batch of proxies.
        ips = get_proxy_ips()
        # 如果没有取到 — back off briefly so we don't hammer the proxy API.
        if not ips:
            print('ip获取失败!')
            time.sleep(1)
            continue
        # Try each proxy once. One address serves both schemes: for an https
        # URL only the 'https' entry is consulted anyway, and popping a single
        # ip per attempt no longer crashes on an odd-sized batch (the original
        # paired pop() raised IndexError when len(ips) was odd).
        while ips:
            ip = ips.pop()
            print(ip)
            proxies = {
                'http': ip,
                'https': ip
            }
            try:
                response = requests.get(url, headers=headers, proxies=proxies, timeout=3)
                if response.status_code == 200:
                    # print(response.text)
                    return response.text
                else:
                    print('数据请求失败!')
            except requests.exceptions.RequestException:
                # Broader than the original (ProxyError, ConnectTimeout): also
                # covers ReadTimeout and ConnectionError from flaky proxies.
                print('超时,继续请求')
# Script entry point: fetch the page through rotating proxies and dump it.
if __name__ == '__main__':
    page = get_net_data('https://movie.douban.com/top250')
    print(page)
3. bs4的使用
import requests
from bs4 import BeautifulSoup
# BeautifulSoup - 解析器类
def get_net_data(url):
    """Fetch *url* and return its HTML text; print the response object on failure."""
    response = requests.get(url)
    # Guard clause: anything but 200 is treated as a failure.
    if response.status_code != 200:
        print(response)
        return None
    return response.text
# - Creating the parser object
def analysis_data(data: str):
    """Demonstrate BeautifulSoup css-selector parsing on an html string.

    Args:
        data: the html document to parse.
    """
    # 1. Create the parser object.
    # BeautifulSoup(html string to parse, parser name)
    bs = BeautifulSoup(data, 'lxml')
    print(type(bs))
    # print(bs)

    # - Selecting tags with css selectors
    # select(css selector)     - all tags matched by the selector, as a list
    # select_one(css selector) - the first tag matched by the selector
    result = bs.select('#p1')
    print(result, len(result), type(result[0]))
    result = bs.select('p')
    print(result, len(result), type(result[0]))
    result = bs.select('div>p')
    print(result, type(result[0]))
    result = bs.select_one('div>p')
    print(result)

    # - Getting tag content
    # tag.string     - the tag's text (None when the tag mixes text with child
    #                  tags or has several children); returns a string
    # tag.get_text() - the tag's text including all nested children's text
    # tag.contents   - list of the tag's child nodes (text and tags)
    p1 = bs.select_one('div>p')
    print('string:', p1.string)  # 我是段落1
    print('text:', p1.get_text())
    print('contents:', p1.contents)
    p2 = bs.select_one('#p1')
    print('p2:', p2)
    print('p2-string:', p2.string)
    print('p2-text:', p2.get_text())
    print('p2-contents:', p2.contents)
    p3 = bs.select_one('#p2')
    print('p3-string:', p3.string)  # None
    print('p3-text:', p3.get_text())
    print('p3-contents:', p3.contents)
    print('价格:', bs.select_one('.main-price').get_text())

    # - Getting tag attributes
    # tag.attrs[attribute name]
    img = bs.select_one('img')
    print(img.attrs)
    print(img.attrs['src'])
    a = bs.select('a')
    print(a[-1].attrs['href'])

    # - Searching inside a specific tag
    # tag.select(css selector)     - all matching descendants of this tag
    # tag.select_one(css selector) - the first matching descendant
    print('所有的p标签:', len(bs.select('p')))
    div = bs.select_one('div')
    result = div.select('p')
    print('div中p标签:', len(result))
if __name__ == '__main__':
    # data = get_net_data('https://cd.fang.ke.com/loupan/pg1/')
    # Sample html used in place of a live request:
    data = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p id="p1" class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<div>
<p>我是段落1</p>
<p>我是段落2</p>
<img src="https://www.baidu.com/img/PCtm_d9c8750bed0b3c7d089fa7d55720d6cf.png" title="图片1" alt="警告">
<a href="https://www.baidu.com">百度一下</a>
<p id="p2">我是段落2<b>你好</b></p>
<p id="p3">你好<br><i>世界</i></p>
</div>
<div class="main-price">
<span class="number">28000</span>
<span class="desc"> 元/㎡(均价)</span>
</div>
"""
    if data:
        analysis_data(data)
4. csv文件操作
import csv
将数据写入csv文件中
- 用列表提供数据
csv.writer(文件对象) - 以列表为单位写入一行数据
# 1. Open the target file; newline='' stops the csv module from doubling line
#    endings on Windows.
with open('files/test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # 2. Write the header row, then the data rows in one call.
    header = ['姓名', '性别', '年龄', '分数']
    rows = [
        ['张三', '男', 28, 98],
        ['小明', '男', 19, 72],
        ['小花', '女', 20, 99],
    ]
    writer.writerow(header)
    writer.writerows(rows)
- 用字典提供数据
with open('files/test2.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, ['name', 'age', 'gender', 'score'])
    # First row: map each field name to its display label.
    # (writer.writeheader() would emit the raw field names instead.)
    writer.writerow({'name': '姓名', 'age': '年龄', 'gender': '性别', 'score': '分数'})
    # A single data row …
    writer.writerow({'name': '张三', 'age': 23, 'gender': '男', 'score': 76})
    # … followed by several rows written in one call.
    more_rows = [
        {'name': '张三1', 'age': 23, 'gender': '男', 'score': 76},
        {'name': '张三2', 'age': 23, 'gender': '男', 'score': 76},
        {'name': '张三3', 'age': 23, 'gender': '男', 'score': 76},
    ]
    writer.writerows(more_rows)
读取csv文件内容
注意:任意一个csv文件都可以选择使用列表或者字典的方式去读
- 一行数据对应一个列表
with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
    # csv.reader is an iterator yielding one list per csv row.
    row_iter = csv.reader(f)
    # Print the first two rows one at a time.
    for _ in range(2):
        print(next(row_iter))
    # Skip the third row without printing it.
    next(row_iter)
    # Everything that is left, as a list of lists.
    print(list(row_iter))
- 一行数据对应一个字典
with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
    # csv.DictReader yields one dict per data row, keyed by the header row.
    dict_reader = csv.DictReader(f)
    print(dict_reader.fieldnames)  # ['姓名', '性别', '年龄', '分数']
    print(next(dict_reader))       # {'姓名': '张三', '性别': '男', '年龄': '28', '分数': '98'}