Infi-chu:
http://www.cnblogs.com/Infi-chu/
一、设置代理
1.urllib
# HTTP proxy with urllib
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

proxy = '127.0.0.1:9743'
# For an authenticated proxy, put the credentials in front of the host:
# proxy = 'username:password@127.0.0.1:9743'

# Map each URL scheme to the proxy URL that requests of that scheme go through.
proxy_handler = ProxyHandler({
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
})
opener = build_opener(proxy_handler)
try:
    # The context manager closes the response once the body has been read.
    with opener.open('http://httpbin.org/get') as res:
        print(res.read().decode('utf-8'))
except URLError as e:
    print(e.reason)
# SOCKS5 proxy with urllib
import socks  # pip3 install PySocks
import socket
from urllib import request
from urllib.error import URLError

# Replace the socket class process-wide so every connection opened after this
# point (including the ones urllib makes) is created as a SOCKS5 socket.
socks.set_default_proxy(socks.SOCKS5, '127.0.0.1', 9742)
socket.socket = socks.socksocket
try:
    # The context manager closes the response once the body has been read.
    with request.urlopen('http://httpbin.org/get') as res:
        print(res.read().decode('utf-8'))
except URLError as e:
    print(e.reason)
2.requests
比urllib简单
# HTTP proxy with requests
import requests

proxy = '127.0.0.1:9743'
# Keys are the scheme of the target URL; values are the proxy URL to use.
# NOTE(review): the 'https' entry addresses the proxy itself over https://;
# if the proxy only speaks plain HTTP this probably needs to be 'http://' + proxy
# -- confirm against the proxy in use.
proxies = {
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
}
try:
    res = requests.get('http://httpbin.org/get', proxies=proxies)
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)
# SOCKS5 proxy with requests (1): per-request proxies mapping
import requests  # pip3 install 'requests[socks]'

proxy = '127.0.0.1:9742'
# Both target schemes are sent through the same SOCKS5 endpoint.
proxies = {
    'http': 'socks5://' + proxy,
    'https': 'socks5://' + proxy,
}
try:
    res = requests.get('http://httpbin.org/get', proxies=proxies)
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)
# SOCKS5 proxy with requests (2): process-wide socket replacement
import requests
import socks
import socket

# Replace the socket class globally so every connection opened after this
# point is created as a SOCKS5 socket.
socks.set_default_proxy(socks.SOCKS5, '127.0.0.1', 9742)
socket.socket = socks.socksocket
try:
    # No `proxies` argument here: the proxy is applied at the socket level,
    # and this snippet defines no `proxies` mapping of its own.
    res = requests.get('http://httpbin.org/get')
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)
3.Selenium
设置浏览器代理
from selenium import webdriver

proxy = '127.0.0.1:9743'
# The options object carries the command-line switches passed to Chrome.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=http://' + proxy)
# `options=` replaces the `chrome_options=` keyword, which Selenium 4 removed.
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://httpbin.org/get')
设置认证代理
# Authenticated proxy for Chrome: the proxy settings and the credentials are
# supplied by a small extension that is generated here, zipped, and loaded
# into the browser.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import zipfile

ip = '127.0.0.1'
port = 9743
username = 'test'
password = 'test'

# Extension manifest: requests the proxy and webRequest permissions and
# registers background.js as the background script.
manifest_json = """
{
"version":"1.0.0",
"manifest_version":2,
"name":"Chrome Proxy",
"permissions":[
"proxy",
"tabs",
"unlimitedStorage",
"storage",
"<all_urls>",
"webRequest",
"webRequestBlocking"
],
"background":{"scripts":["background.js"]}
}
"""

# Background script: sets a fixed proxy server and answers auth challenges
# with the configured username/password. The port is emitted as a bare
# number (%(port)d) rather than a quoted string.
background_js = """
var config={
mode:"fixed_servers",
rules:{
singleProxy:{
scheme:"http",
host:"%(ip)s",
port:%(port)d
}
}
}
chrome.proxy.settings.set({value:config,scope:"regular"},function(){});
function callbackFn(details){
return{
authCredentials:{
username:"%(username)s",
password:"%(password)s"
}
}
}
chrome.webRequest.onAuthRequired.addListener(
callbackFn,
{urls:["<all_urls>"]},
['blocking']
)
""" % {'ip': ip, 'port': port, 'username': username, 'password': password}

plugin_file = 'proxy_auth_plugin.zip'
with zipfile.ZipFile(plugin_file, 'w') as zp:
    # The archive member must be named manifest.json, with a dot.
    zp.writestr("manifest.json", manifest_json)
    zp.writestr("background.js", background_js)

chrome_options = Options()
chrome_options.add_argument('--start-maximized')
chrome_options.add_extension(plugin_file)
# `options=` replaces the `chrome_options=` keyword, which Selenium 4 removed.
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://httpbin.org/get')
二、代理池维护
单一代理并不能完成我们的代理任务,所以需要更多数量的代理为我们服务。
我们将对代理进行筛选,使其能够高效地为我们提供服务。
1.准备
需要使用 Redis 数据库,以及 aiohttp、requests、redis-py、pyquery、flask 库
2.代理池的目标:存储模块、获取模块、检测模块、接口模块
3.各模块的实现:
https://github.com/Infi-chu/proxypool
三、利用代理爬取微信文章
https://github.com/Infi-chu/weixinspider