文章目录
传送数据
urlopen(url, data=None, [timeout,]*, cafile=None, capath=None, cadefault=False, context=None)
from urllib.request import *
import urllib.parse
# POST bodies must be bytes: urlencode the form dict, then encode as UTF-8.
data = bytes(urllib.parse.urlencode({'word':'hello'}), encoding = 'utf8')
# REPL echo of the encoded body:
data
b'word=hello'
# Passing data= makes urlopen issue an HTTP POST instead of GET.
response = urlopen('http://httpbin.org/post', data = data)
print(response.read().decode("utf-8"))
{
"args": {},
"data": "",
"files": {},
"form": {
"word": "hello"
},
"headers": {
"Accept-Encoding": "identity",
"Content-Length": "10",
"Content-Type": "application/x-www-form-urlencoded",
"Host": "httpbin.org",
"User-Agent": "Python-urllib/3.7"
},
"json": null,
"origin": "120.236.174.136, 120.236.174.136",
"url": "https://httpbin.org/post"
}
超时情况
import socket
# A tiny timeout forces a socket.timeout, which urllib surfaces as URLError.
try:
    response = urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    # URLError.reason carries the underlying cause; check it is a timeout.
    if isinstance(e.reason, socket.timeout):
        print('Time Out')
Time Out
将urlopen中的参数改成Request类
Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
from urllib.parse import *
# Custom headers: spoof the User-Agent and pin the Host header.
headers = {
    'User-Agent':'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org'
}
# Form payload. (Renamed from `dict`, which shadowed the builtin.)
form = {
    'name': 'happy'
}
data = bytes(urlencode(form), encoding = 'utf8')
# HTTP method names are case-sensitive (RFC 7231); use the canonical 'POST'
# rather than lowercase 'post', which some servers reject.
request = Request('http://httpbin.org/post', data=data, headers = headers, method='POST')
response = urlopen(request)
print(response.read().decode("utf-8"))
{
"args": {},
"data": "",
"files": {},
"form": {
"name": "happy"
},
"headers": {
"Accept-Encoding": "identity",
"Content-Length": "10",
"Content-Type": "application/x-www-form-urlencoded",
"Host": "httpbin.org",
"User-Agent": "Mozilla/4.0(compatible;MSIE 5.5;Windows NT)"
},
"json": null,
"origin": "120.236.174.151, 120.236.174.151",
"url": "https://httpbin.org/post"
}
使用Handler添加代理
ProxyHandler的构造参数是一个字典,键名是协议类型,键值是代理链接。
常规代理
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener
# ProxyHandler takes {scheme: proxy_url}. NOTE: a dict literal cannot hold two
# "http" entries -- a duplicate key is silently overwritten, so only the LAST
# proxy per scheme ever takes effect. The dead duplicate has been removed.
proxy_handler = ProxyHandler({
    "https": "https://223.243.254.191:65309",
    "http": "http://123.163.27.237:9999"
})
opener = build_opener(proxy_handler)
遇到 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败 这种问题就把timeout调大
try:
    # Generous timeout: flaky free proxies can take a long time to answer.
    response = opener.open('https://httpbin.org/get', timeout = 500)
    print(response.read().decode("utf-8"))
except URLError as e:
    print(e.reason)
{
"args": {},
"headers": {
"Accept-Encoding": "identity",
"Host": "httpbin.org",
"User-Agent": "Python-urllib/3.7"
},
"origin": "223.245.39.218, 223.245.39.218",
"url": "https://httpbin.org/get"
}
可以看到origin和代理ip并不是一模一样的,不晓得是不是用了ip转发
try:
    # Same opener, plain-HTTP endpoint this time (uses the "http" proxy entry).
    response = opener.open('http://httpbin.org/get')
    print(response.read().decode("utf-8"))
except URLError as e:
    print(e.reason)
{
"args": {},
"headers": {
"Accept-Encoding": "identity",
"Cache-Control": "max-age=259200",
"Host": "httpbin.org",
"User-Agent": "Python-urllib/3.7"
},
"origin": "123.163.27.237, 123.163.27.237",
"url": "https://httpbin.org/get"
}
注意:上面字典里写了两个 "http" 键,但 Python 字典的重复键会被后一个覆盖,所以实际上只有最后一个 http 代理生效——并不是 urllib 自动从多个代理中挑选可用的那一个。
需要用户名密码代理
格式为 'username:password@ipaddress'
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener
# Authenticated proxy: credentials are embedded as user:password@host:port.
proxy_handler = ProxyHandler({
    # made up at random -- not a real proxy
    "http": "http://123:321@12.18.19.19:9999"
})
opener = build_opener(proxy_handler)
使用socks5协议看世界
import socks
import socket
# The server address below has been altered and will not work.
socks.set_default_proxy(socks.SOCKS5, '22.20.18.92', 1080)
# Monkey-patch the socket class so ALL new sockets tunnel through SOCKS5.
socket.socket = socks.socksocket
try:
    response = urlopen("http://httpbin.org/get")
    print(response.read().decode("utf-8"))
except URLError as e:
    print(e.reason)
{
"args": {},
"headers": {
"Accept-Encoding": "identity",
"Host": "httpbin.org",
"User-Agent": "Python-urllib/3.7"
},
"origin": "19.18.13.12, 19.16.13.12",
"url": "https://httpbin.org/get"
}
怒爬一波推特
from urllib.parse import *
socks.set_default_proxy(socks.SOCKS5, '22.20.18.92', 1080)
socket.socket = socks.socksocket
# Browser-like headers so the request is not rejected as an obvious bot.
headers = {
    'Origin': 'https://www.google.com',
    'Referer': 'https://www.google.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
try:
    request = Request("https://twitter.com/home", headers = headers)
    response = urlopen(request)
    print(response.read().decode("utf-8"))
except URLError as e:
    print(e.reason)
<!DOCTYPE html>
<html lang="en" data-scribe-reduced-action-queue="true">
<head>
<meta charset="utf-8">
<script nonce="cpkeY2499BhzH1QhKNytTw==">
!function(){window.initErrorstack||(window.initErrorstack=[]),window.onerror=function(r,i,n,o,t){r.indexOf("Script error.")>-1||window.initErrorstack.push({errorMsg:r,url:i,lineNumber:n,column:o,errorObj:t})}}();
</script>
<script id="bouncer_terminate_iframe" nonce="cpkeY2499BhzH1QhKNytTw==">
if (window.top != window) {
window.top.postMessage({'bouncer': true, 'event': 'complete'}, '');
}
</script>
<script id="ttft_boot_data" nonce="cpkeY2499BhzH1QhKNytTw==">
window.ttftData={"transaction_id":"006fff6d002c1aa6.c6a409836c08a162\u003c:0088620e009a344f","server_request_start_time":1576248983417,"user_id":null,"is_ssl":true,"rendered_on_server":true,"is_tfe":true,"client":"macaw-swift","tfe_version":"tsa_a\/1.0.1\/20191126.1647.c3ada84","ttft_browser":"chrome"};!function(){function t(t,n){window.ttftData&&!window.ttftData[t]&&(window.ttftData[t]=n)}function n(){return o?Math.round(w.now()+w.timing.navigationStart):(new Date).getTime()}var w=window.performance,o=w&&w.now;window.ttft||(window.ttft={}),window.ttft.recordMilestone||(window.ttft.recordMilestone=t),window.ttft.now||(window.ttft.now=n)}();
</script>
<script id="swift_action_queue" nonce="cpkeY2499BhzH1QhKNytTw==">
!function(){function e(e){if(e||(e=window.event),!e)return!1;if(e.timestamp=(new Date).getTime(),!e.target&&e.srcElement&&(e.target=e.srcElement),document.documentElement.getAttribute("data-scribe-reduced-action-queue"))for(var t=e.target;t&&t!=document.body;){if("A"t.tagName)return;t=t.parentNode}return i("all",o(e)),a(e)?(document.addEventListener||(e=o(e)),e.preventDefault=e.stopPropagation=e.stopImmediatePropagation=function(){},y?(v.push(e),i("captured",e)):i("ignored",e),!1):(i("direct",e),!0)}function t(e){n();for(var t,r=0;t=v[r];r++){var a=e(t.target),i=a.closest("a")[0];if("click"t.type&&i){var o=e.data(i,"events"),u=o&&o.click,c=!i.hostname.match(g)||!i.href.match(/#/);if(!u&&c){window.location=i.href;continue}}a.trigger(e.event.fix(t))}window.swiftActionQueue.wasFlushed=!0}function r(){for(var e in b)if("all"!=e)for(var t=b[e],r=0;r<t.length;r++)console.log("actionQueue",c(t[r]))}function n(){clearTimeout(w);for(var e,t=0;e=h[t];t++)document["on"+e]=null}function a(e){if(!e.target)return!1;var t=e.target,r=(t.tagName||"").toLowerCase();if(e.metaKey)return!1;if(e.shiftKey&&"a"==r)return!1;if(t.hostname&&!t.hostname.match(g))return!1;if(e.type.match(p)&&s(t))return!1;if("label"==r){var n=t.getAttribute("for");if(n){var a=document.getElementById(n);if(a&&f(a))return!1}else for(var i,o=0;i=t.childNodes[o];o++)if(f(i))return!1}return!0}function i(e,t){t.bucket=e,b[e].push(t)}function o(e){var t={};for(var r in e)t[r]=e[r];return t}function u(e){for(;e&&e!=document.body;){if("A"==e.tagName)return e;e=e.parentNode}}function c(e){var t=[];e.bucket&&t.push("["+e.bucket+"]"),t.push(e.type);var r,n,a=e.target,i=u(a),o="",c=e.timestamp&&e.timestamp-d;return"click"===e.type&&i?(r=i.className.trim().replace(/\s+/g,"."),n=i.id.trim(),o=/[^#]/.test(i.href)?" 
("+i.href+")":"",a='"'+i.innerText.replace(/\n+/g," ").trim()+'"'):(r=a.className.trim().replace(/\s+/g,"."),n=a.id.trim(),a=a.tagName.toLowerCase(),e.keyCode&&(a=String.fromCharCode(e.keyCode)+" : "+a)),t.push(a+o+(n&&"#"+n)+(!n&&r?"."+r:"")),c&&t.push(c),t.join(" ")}function f(e){var t=(e.tagName||"").toLowerCase();return"input"t&&"checkbox"e.getAttribute("type")}function s(e){var t=(e.tagName||"").toLowerCase();return"textarea"t||"input"t&&"text"e.getAttribute("type")||"true"e.getAttribute("contenteditable")}for(var m,d=(new Date).getTime(),l=1e4,g=/ (([)\.]+.)twitter.com$/,p=/^key/,h=["click","keydown","keypress","keyup"],v=[],w=null,y=!0,b={captured:[],ignored:[],direct:[],all:[]},k=0;m=h[k];k++)document["on"+m]=e;w=setTimeout(function(){y=!1},l),window.swiftActionQueue={buckets:b,flush:t,logActions:r,wasFlushed:!1}}();
</script>
<script id="composition_state" nonce="cpkeY2499BhzH1QhKNytTw==">
!function(){function t(t){t.target.setAttribute("data-in-composition","true")}function n(t){t.target.removeAttribute("data-in-composition")}document.addEventListener&&(document.addEventListener("compositionstart",t,!1),document.addEventListener("compositionend",n,!1))}();
</script>
使用Handler添加cookie
获取网站cookie
- cookiejar模块的主要作用是提供可存储cookie的对象,以便于与urllib模块配合使用来访问Internet资源。Cookiejar模块非常强大,我们可以利用本模块的CookieJar类的对象来捕获cookie并在后续连接请求时重新发送,比如可以实现模拟登录功能。
- 该模块主要的对象有CookieJar、FileCookieJar、MozillaCookieJar、LWPCookieJar。
- 它们的关系:CookieJar —-派生—->FileCookieJar —-派生—–>MozillaCookieJar和LWPCookieJar
from http.cookiejar import *
from urllib.request import *
# CookieJar captures cookies from responses and replays them on later requests.
cookie = CookieJar()
handler = HTTPCookieProcessor(cookie)
opener = build_opener(handler)
response = opener.open('http://www.baidu.com')
# After the request the jar is iterable; each item is a Cookie object.
for item in cookie:
    print(item.name + "=" + item.value)
BAIDUID=90FC91BD0AED82481D19AE5BA741B2C7:FG=1
BIDUPSID=90FC91BD0AED8248A84C636B264E077B
H_PS_PSSID=1448_21100_30211_30327_30284_26350_22160
PSTM=1576730391
delPer=0
BDSVRTM=0
BD_HOME=0
保存为文件
- ignore_discard的意思是即使cookies将被丢弃也将它保存下来
- ignore_expires的意思是即使cookies已经过期也将它保存下来(load时同理会读取已过期的cookies)
# MozillaCookieJar persists cookies in the classic Netscape cookies.txt format.
cookie = MozillaCookieJar('cookies.txt')
handler = HTTPCookieProcessor(cookie)
opener = build_opener(handler)
response = opener.open('http://www.baidu.com')
# ignore_discard: also save session cookies; ignore_expires: also save expired ones.
cookie.save(ignore_discard = True, ignore_expires = True)
# Reload the jar from disk with the same flags and print what was persisted.
cookie = MozillaCookieJar('cookies.txt')
cookie.load(ignore_discard = True, ignore_expires = True)
for item in cookie:
    print(item.name + "=" + item.value)
H_PS_PSSID=121849
PSTM=1576740535
delPer=0
BDSVRTM=0
BD_HOME=0