一、requests基于POST请求
#1.requests的GET与POST用法的区别:
'''
GET请求: (HTTP默认的请求方法就是GET)
* 没有请求体
* 数据必须在1K之内!
* GET请求数据会暴露在浏览器的地址栏中
GET请求常用的操作:
1. 在浏览器的地址栏中直接给出URL,那么就一定是GET请求
2. 点击页面上的超链接也一定是GET请求
3. 提交表单时,表单默认使用GET请求,但可以设置为POST
POST请求
(1). 数据不会出现在地址栏中
(2). 数据的大小没有上限
(3). 有请求体
(4). 请求体中如果存在中文,会使用URL编码!
!!!requests.post()用法与requests.get()完全一致,特殊的是requests.post()有一个data参数,用来存放请求体数据!
'''
#2、发送post请求,模拟浏览器的登录行为
import requests
import re
'''
URL: https://github.com/login
GET
Set-Cookie: _gh_sess=UkJNSE92NXNRMjhOajlvdXhEQ0JNQUFqZFdWNUJGWmZMTXNLNFVqV3RYLzdCdHM1NWR3VnVHV0tTYzlpb0xjRDh6RzgzaDdTb01ZMThKZEtYSW14WktIYnhVTFBXaHN1YW1WVC84YStoMURBa1NFZlJNbXJIb3F2bDZZWDFLYVVtZHB0Y3htYk9USzhBOWdYTXZHTE5FMU9SWkhwdjhSUE8wbHFwUHBZOXdpb3AyaVpVUFpxby8vUHBJY2pzeXR1WWx2UEhJOUZUamY3QUpvZ1lvc2dEVlV0UFRDL2U5c3RPYmF5RHJtN0t4UFFFaW14UGhOQ3dKdWU3NEpHSExZKytWbmoxOXQ4bjZ4NUt2ajJ0N2xRaW44MWlQZ1o2Q0Urbjc1N2FEdUI1MDlscjRLK3dMMTRTMHhXbVlNWUs1eU9lVG92SzNIb1FmdWRtL2hzbGkwQnJ0UjBWYzlNcCszNWdoVmFaUXdZeVNYVXFucWxITlcyM0ZVNGsrd0t0RjlSakFBZGV5SjA0NVNLdVlSZXJuYVFmV0NrM1hGVytFQ2phSXcxdXZDM2J6NTZVSnQ2WkljTnhPNG9NYTlobllhRExScDhlQ04xM1l6Zi9jL0p5UkdHRkc3S2Z3RUxSMUdvcEhCazBXWkYwdkE9LS1VbmpDRUwwZGw0amJVOEgwT2ZQaGlBPT0%3D--4562a6af2d21508e2f522a5983004b4b2bb5983c; path=/; secure; HttpOnly
Cookie: _ga=GA1.2.1697930951.1554622929; _octo=GH1.1.1498701842.1560392375; _device_id=22a0ddb58979d9c97ffafeb3113e2567; user_session=TZzfoXANu224u6MsNzS5Z3GSasDsMIBvZC4cvOknaNRrWRJe; __Host-user_session_same_site=TZzfoXANu224u6MsNzS5Z3GSasDsMIBvZC4cvOknaNRrWRJe; logged_in=no; tz=Asia%2FShanghai; has_recent_activity=1; _gat=1; _gh_sess=U3B3L1FZUzFTY3JCRGZqamZvMGxoakhlYys3U29CK1BVZnlwWDVFRXhHNnZ5ZnRvbWNyNVlzQmdiZXRXVGxlQUZObWlPd2dXODdkTVUydGFaNW5YRm03YlhRVEU2bWFiRWlmeE00K25rd2FzOHlYTUUxQkF2R0JpS3ZpdlhxWE1FdFMrdW4zS2VtWmtrc2xLdmpZVnA3NHZjd3R4d3pxMEM4QWh1Ykk1U2JCYmpvYWZycDJaZHRDdjFDT3JYeHJOMDgrbDl3Q2FOZlZIQUlOUXdSaGJQajVVeHF5VjF0Q21FWkpoTlRSWkR2b2haeUlGNmpkdnZOUUc3dG9ydldTaDhlYllZQWlrZkVnNE0rQ1AxNHBrMVNkbFd0M0NSdjRHYW5PalV0LzJqN0w5OE9ncUVzRThBZzAyUjJXUFVFZ3ZjNEhucGh0dVhlYkFWWks3T2JkOEc4TFdpK1VwbStFOVVrMnQrbGo2S3Q1YmtLcS9ZdSt6Skt1aXkrcldRRTVDZzV5Zkljc1M2dVVBWTRpMjNYbmRBeHdJTW5FLzZFZ2xrd3RNWWJTK1p3NnhCWGlGblBzcENtOFJHcUY5S3hGMmFHK0dTRE5lR1FvNVlLVXBPTEdJcE5BT3BWQWhaSW92aDFwRGdyNnI3cE09LS05L01jajZmUGVEcEl6cjJSYTBpaXVnPT0%3D--ae43dcfa7904aed3f423eb6715b139c831a68a5a
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36
'''
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
response = requests.get(url='https://github.com/login',headers=headers)
login_cookies = response.cookies.get_dict()
authenticity_token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', response.text, re.S)[0]
print(authenticity_token)
headers2 = {
'Referer': 'https://github.com/login',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
}
# 拼接请求体信息
form_data = {
"commit": "Sign in",
"utf8": "✓",
"authenticity_token": authenticity_token,
"login": "tankjam",
"password": "kermit46709394",
"webauthn-support": "unsupported",
}
# 往session地址发送post请求
# 携带请求头、请求体、login页的cookies信息
response2 = requests.post(url='https://github.com/session', data=form_data, headers=headers2, cookies=login_cookies)
print(response2.status_code)
# print(response2.text)
with open('github.html', 'w', encoding='utf-8') as f:
f.write(response2.text)
二、response请求
import requests
response = requests.get('https://baidu.com')
# response响应
print(response.status_code) # 获取响应状态码
print(response.url) # 获取url地址
print(response.encoding) # 字符编码
response.encoding = 'utf-8'
print(response.text) # 获取文本
print(response.content) # 获取二进制流
print(response.headers) # 获取页面请求头信息
print(response.history) # 上一次跳转的地址
# 1、返回cookie字典 2、返回cookies对象
print(response.cookies) # 获取cookies信息,
print(response.cookies.get_dict()) # 获取cookies信息转换成字典
print(response.cookies.items()) # 获取cookies信息转换成字典
print(response.encoding)
print(response.elapsed) # 访问时间
import requests
# 往音频地址发送get请求
url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
response = requests.get(url, stream=True) # stream=True 把content设置为一个迭代器对象
print(response.content)
with open('love_for_GD.mp4', 'wb') as f:
for content in response.iter_content():
f.write(content)
三、requests高级用法
'''
'''
'''
证书验证(大部分网站都是https)
'''
import requests
# # 如果是ssl请求,首先检查证书是否合法,不合法则报错,程序终端
# response = requests.get('https://www.xiaohuar.com')
# print(response.status_code)
# 改进1:去掉报错,但是会报警告
# import requests
# response = requests.get('https://www.xiaohuar.com', verify=False)
# # 不验证证书,报警告,返回200
# print(response.status_code)
# 改进2:去掉报错,并且去掉警报信息
# import requests
# import urllib3
# urllib3.disable_warnings() # 关闭警告
# response = requests.get('https://www.xiaohuar.com', verify=False)
# print(response.status_code)
# 改进3:加上证书
# 很多网站都是https,但是不用证书也可以访问,大多数情况都是可以携带也可以不携带证书
# 知乎\百度等都是可带可不带
# 有硬性要求的,则必须带,比如对于定向的用户,拿到证书后才有权限访问某个特定网站
# import requests
# import urllib3
# # urllib3.disable_warnings() # 关闭警告
# # 伪代码
# response = requests.get(
# 'https://www.xiaohuar.com',
# # verify=False,
# # /path/server.crt证书的存放目录, /path/key
# cert=('/path/server.crt', '/path/key'))
# print(response.status_code)
'''
超时设置
'''
# 超时设置
# 两种超时:float or tuple
# timeout=0.1 # 代表接收数据的超时时间
# # timeout=(0.1,0.2) # 0.1代表链接超时 0.2代表接收数据的超时时间
#
# import requests
# response = requests.get('https://www.baidu.com',
# timeout=0.0001
# print(response.elapsed)
# print(response.status_code)
'''
代理设置:先发送请求给代理,然后由代理帮忙发送(封ip是常见的事情)
'''
# import requests
# proxies={
# # 带用户名密码的代理,@符号前是用户名与密码
# 'http':'http://tank:123@localhost:9527',
# # 'http':'http://localhost:9527',
# # 'https':'https://localhost:9527',
# }
# response=requests.get('https://www.12306.cn',
# proxies=proxies)
#
# print(response.status_code)
#
'''
爬取西刺免费代理:
1.访问西刺免费代理页面
2.通过re模块解析并提取所有代理
3.通过ip测试网站对爬取的代理进行测试
4.若test_ip函数抛出异常代表代理作废,否则代理有效
5.利用有效的代理进行代理测试
<tr class="odd">
<td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
<td>112.85.131.99</td>
<td>9999</td>
<td>
<a href="/2019-05-09/jiangsu">江苏南通</a>
</td>
<td class="country">高匿</td>
<td>HTTPS</td>
<td class="country">
<div title="0.144秒" class="bar">
<div class="bar_inner fast" style="width:88%">
</div>
</div>
</td>
<td class="country">
<div title="0.028秒" class="bar">
<div class="bar_inner fast" style="width:97%">
</div>
</div>
</td>
<td>6天</td>
<td>19-05-16 11:20</td>
</tr>
re:
<tr class="odd">(.*?)</td>.*?<td>(.*?)</td>
'''
import requests
import re
import time
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}
def get_index(url):
time.sleep(1)
response = requests.get(url, headers=HEADERS)
return response
def parse_index(text):
ip_list = re.findall('<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S)
for ip_port in ip_list:
ip = ':'.join(ip_port)
yield ip
def test_ip(ip):
print('测试ip: %s' % ip)
try:
proxies = {
'https': ip
}
# ip测试网站
ip_url = 'https://www.ipip.net/'
# 使用有效与无效的代理对ip测试站点进行访问,若返回的结果为200则代表当前测试ip正常
response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)
if response.status_code == 200:
print(f'有用的ip:{ip}')
return ip
# 若ip代理无效则抛出异常
except Exception as e:
print(e)
# # 使用代理爬取nba
def spider_nba(good_ip):
url = 'https://china.nba.com/'
proxies = {
'https': good_ip
}
response = requests.get(url, headers=HEADERS, proxies=proxies)
print(response.status_code)
print(response.text)
if __name__ == '__main__':
base_url = 'https://www.xicidaili.com/nn/{}'
for line in range(1, 3677):
ip_url = base_url.format(line)
response = get_index(ip_url)
# 解析西刺代理获取每一个ip列表
ip_list = parse_index(response.text)
# 循环每一个ip
for ip in ip_list:
# print(ip)
# 对爬取下来的ip进行测试
good_ip = test_ip(ip)
if good_ip:
# 真是代理,开始测试
spider_nba(good_ip)
'''
认证设置
'''
import requests
# 通过访问github的api来测试
# url = 'https://api.github.com/user'
# HEADERS = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
# }
# 测试1,失败返回401
# response = requests.get(url, headers=HEADERS)
# print(response.status_code) # 401
# print(response.text)
'''
打印结果:
{
"message": "Requires authentication",
"documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
}
'''
#
# # 测试2,通过requests.auth内的HTTPBasicAuth进行认证,认证成功返回用户信息
# from requests.auth import HTTPBasicAuth
# response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('tankjam', 'kermit46709394'))
# print(response.text)
#
# 测试3,通过requests.get请求内的auth参数默认就是HTTPBasicAuth,认证成功返回用户信息
# response = requests.get(url, headers=HEADERS, auth=('tankjam', 'kermit46709394'))
# print(response.text)
'''
上传文件
'''
# import requests
# 上传文本文件
# files1 = {'file': open('user.txt', 'rb')}
# # files参数是POST请求固定参数
# response = requests.post('http://httpbin.org/post', files=files1)
# print(response.status_code) # 200
# print(response.text) # 200
# 上传图片文件
# files2 = {'jpg': open('一拳.jpg', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files2)
# print(response.status_code) # 200
# print(response.text) # 200
#
# 上传视频文件
# files3 = {'movie': open('love_for_GD.mp4', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files3)
# print(response.status_code) # 200
# print(response.text) # 200
四、selenium基本使用
''''''
'''
selenium模块讲解
一 什么是selenium?
最初是一个自动化测试工具。可以使用它帮我们驱动浏览器
自动去执行某些自定义好的操作。例如在页面中执行JS代码、
跳过登录验证。可以使用selenium帮我们实现爬虫。
二 为什么要使用selenium?
1、优点:
使用requests模块登录需要分析大量的复杂通信流程,使用selenium
可以轻松跳过登录验证。
2、缺点:
浏览器会加载css、js、图片、视频...数据,爬虫效率相比requests模块要低。
三 如何使用selenium?
下载selenium模块:
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
下载浏览器驱动:
http://npm.taobao.org/mirrors/chromedriver/2.38/
'''
# selenium之第一次
from selenium import webdriver # 用来驱动浏览器的
# 调用得到一个动作链对象,破解滑动验证码的时候用的,可以拖动图片
from selenium.webdriver import ActionChains
# 按照什么方式查找属性,By.ID, By.CSS_SELECTOR, By.Class
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys # 键盘按键操作
# 和下面WebDriverWait一起用的,EC是expected_conditions的别名
from selenium.webdriver.support import expected_conditions as EC
# 等待页面加载某些元素
from selenium.webdriver.support.wait import WebDriverWait
import time
# 通过谷歌浏览器驱动打开谷歌浏览器
# webdriver.Chrome(r'chromedriver.exe的绝对路径')
# chrome = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe') # 括号内输入chromedriver.exe的绝对路径
# chromedriver.exe存放于python解释器的Scripts文件夹中
# chrome是一个驱动对象
chrome = webdriver.Chrome()
'''
实例1
'''
# 若try出现异常
# try:
# # 往tank博客主页发送get请求
# # chrome.get('https://www.cnblogs.com/kermitjam/')
#
# # 参数1: 驱动对象 参数2: 等待时间
# wait = WebDriverWait(chrome, 10)
#
# # 1、访问百度
# chrome.get('https://www.baidu.com/')
#
# # 2、查找input输入框
# input_tag = wait.until(
# # 调用EC的presence_of_element_located()
# EC.presence_of_element_located(
# # 此处可以写一个元组
# # 参数1: 查找属性的方式
# # 参数2: 属性的名字
# (By.ID, "kw")
# )
# )
# input_tag = wait.until(EC.presence_of_element_located((By.ID, "kw")))
#
# # 3、搜索一拳超人
# input_tag.send_keys('一拳超人')
#
# # 4、按键盘回车键
# input_tag.send_keys(Keys.ENTER)
#
# time.sleep(3)
#
# # 无论发生什么都会关闭浏览器
# finally:
# # 关闭浏览器
# chrome.close()
'''
实例2
'''
try:
# 往tank博客主页发送get请求
# chrome.get('https://www.cnblogs.com/kermitjam/')
# 参数1: 驱动对象 参数2: 等待时间
wait = WebDriverWait(chrome, 10)
# 1、访问京东主页
chrome.get('https://www.jd.com/')
# 2、查找input输入框
input_tag = wait.until(EC.presence_of_element_located((By.ID, "key")))
# 3、搜索唐诗三百首
input_tag.send_keys('唐诗三百首')
# 4、根据class属性名称查找标签
search_button = wait.until(
EC.presence_of_element_located((By.CLASS_NAME, 'button')))
# 5、点击搜索按钮
search_button.click()
time.sleep(3)
# 无论发生什么都会关闭浏览器
finally:
# 关闭浏览器
chrome.close()
五、selenium之基本选择器
# from selenium import webdriver # 用来驱动浏览器的
# import time
#
# '''
# 隐式等待
# '''
# # 获取驱动对象、
# driver = webdriver.Chrome()
#
# try:
# # 显式等待: 等待某个元素加载
# # 参数1: 驱动对象 参数2: 等待时间
# # wait = WebDriverWait(chrome, 10)
#
# driver.get('https://china.nba.com/')
#
# # 隐式等待: 等待页面所有元素加载
# driver.implicitly_wait(10)
# news_tag = driver.find_element_by_class_name('nav-news')
# # 获取标签对象
# print(news_tag)
# # 获取标签的名字
# print(news_tag.tag_name)
#
#
# time.sleep(10)
#
# finally:
# driver.close()
from selenium import webdriver # 用来驱动浏览器的
import time
'''
===============所有方法===================
element是查找一个标签
elements是查找所有标签
1、find_element_by_link_text 通过链接文本去找
2、find_element_by_id 通过id去找
3、find_element_by_class_name
4、find_element_by_partial_link_text
5、find_element_by_name
6、find_element_by_css_selector
7、find_element_by_tag_name
'''
# 获取驱动对象、
driver = webdriver.Chrome()
try:
# 往百度发送请求
driver.get('https://www.baidu.com/')
driver.implicitly_wait(10)
# 1、find_element_by_link_text 通过链接文本去找
# 根据登录
# send_tag = driver.find_element_by_link_text('登录')
# send_tag.click()
# 2、find_element_by_partial_link_text 通过局部文本查找a标签
login_button = driver.find_element_by_partial_link_text('登')
login_button.click()
time.sleep(1)
# 3、find_element_by_class_name 根据class属性名查找
login_tag = driver.find_element_by_class_name('tang-pass-footerBarULogin')
login_tag.click()
time.sleep(1)
# 4、find_element_by_name 根据name属性查找
username = driver.find_element_by_name('userName')
username.send_keys('15622792660')
time.sleep(1)
# 5、find_element_by_id 通过id属性名查找
password = driver.find_element_by_id('TANGRAM__PSP_10__password')
password.send_keys('*******')
time.sleep(1)
# 6、find_element_by_css_selector 根据属性选择器查找
# 根据id查找登录按钮
login_submit = driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit')
# driver.find_element_by_css_selector('.pass-button-submit')
login_submit.click()
# 7、find_element_by_tag_name 根据标签名称查找标签
div = driver.find_element_by_tag_name('div')
print(div.tag_name)
time.sleep(10)
finally:
driver.close()