概念
将多个不同的用户代理(User-Agent)组成一个池子,每次请求时从中随机选取一个使用
用户代理池简单应用
import urllib.request
import re
import random
# Pool of User-Agent header values (one mobile Chrome, one Edge, one Safari).
urlpoors = [
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Mobile Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
]
def ua(urlpools):
    """Install a global urllib opener that sends a randomly chosen User-Agent.

    Args:
        urlpools: Non-empty sequence of User-Agent strings to choose from.

    Raises:
        IndexError: If ``urlpools`` is empty (raised by ``random.choice``).
    """
    # Choose from the pool that was passed in. The previous body read a
    # module-level list instead, so the argument was silently ignored.
    thisua = random.choice(urlpools)
    print(thisua)
    # Browser disguise: the installed opener is used by every later
    # urllib.request.urlopen() call, so those requests carry this header.
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", thisua)]
    urllib.request.install_opener(opener)
# Demo: install a freshly chosen User-Agent ten times in a row.
for _ in range(10):
    ua(urlpoors)
    # Add the actual crawling code here.
单个IP代理(没钱买代理,只写了个具体的形式,后面再补充)
import urllib.request
# Proxy address for plain-HTTP requests. Left empty as a placeholder; put a
# real proxy here before running (presumably "host:port" - confirm with the
# proxy provider).
ip = ""
proxy = urllib.request.ProxyHandler({"http": ip})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
# From here on urllib.request.urlopen() goes through the proxy-aware opener.
urllib.request.install_opener(opener)
url = "http://baidu.com"
data1 = urllib.request.urlopen(url).read()
# Decoded copy of the page; nothing below uses it, the raw bytes are saved.
data = data1.decode("utf-8", "ignore")
# The context manager closes the file even if the write fails.
with open(r"D:/python/mj/baidu.html", "wb") as file:
    file.write(data1)
ip代理池实战
ip代理池构建的第一种方式(适用于稳定ip)
import urllib.request
import random

# Pool of proxy addresses to rotate through. Both entries are blank
# placeholders; replace them with real proxies before running.
ippools = [
    " ",
    " ",
]
def ip(ippools):
    """Install a global urllib opener that routes HTTP through a random proxy.

    Args:
        ippools: Non-empty sequence of proxy address strings to choose from.

    Raises:
        IndexError: If ``ippools`` is empty (raised by ``random.choice``).
    """
    thisip = random.choice(ippools)
    print(thisip)
    # ProxyHandler requires a mapping of scheme -> proxy. The previous code
    # built a set literal ({"http", thisip}), which ProxyHandler rejects.
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
# Fetch the page five times, each time through a newly chosen proxy. A failed
# attempt is reported and the loop simply moves on to the next one.
for _ in range(5):
    try:
        ip(ippools)
        url = "http://baidu.com"
        data1 = urllib.request.urlopen(url).read()
        # Decoded copy of the page; nothing below uses it, the raw bytes are saved.
        data = data1.decode("utf-8", "ignore")
        # The context manager closes the file even if the write fails.
        with open(r"D:/python/mj/baidu.html", "wb") as file:
            file.write(data1)
    except Exception as err:
        # Deliberately broad: a bad proxy must not stop the remaining attempts.
        print(err)
IP代理池构建的第二种方式(通过接口调用获取代理IP)
实战:爬取京东某商品图片
import urllib.request
import re
import random
# Pool of User-Agent header values (one mobile Chrome, one Edge, one Safari).
urlpools = [
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Mobile Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
]
def ua(urlpools):
    """Install a global urllib opener that sends a randomly chosen User-Agent.

    Args:
        urlpools: Non-empty sequence of User-Agent strings to choose from.

    Raises:
        IndexError: If ``urlpools`` is empty (raised by ``random.choice``).
    """
    thisua = random.choice(urlpools)
    print(thisua)
    # Browser disguise: the installed opener is used by every later
    # urllib.request.urlopen() / urlretrieve() call, so those requests carry
    # this header.
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
# Captures the part of each image URL between "//img" and ".jpg". The dot is
# escaped so that only a literal ".jpg" suffix matches, and the pattern is
# compiled once instead of on every page.
pat = r'src="//img(.*?)\.jpg">'
img_pattern = re.compile(pat)

# Crawl list pages 1 and 2 and download every matched image.
for i in range(1, 3):
    url = "https://list.jd.com/list.html?cat=1315,1342,1349&page=" + str(i)
    # Switch to a new random User-Agent before each page request.
    ua(urlpools)
    # Undecodable bytes are dropped rather than aborting the whole crawl.
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    imglist = img_pattern.findall(data)
    for j, thisimg in enumerate(imglist):
        thisimgurl = "https://img" + thisimg + ".jpg"
        # File name is the page number followed by the image index on that page.
        localfile = "D:\\python\\mj\\jd\\" + str(i) + str(j) + ".jpg"
        try:
            urllib.request.urlretrieve(thisimgurl, filename=localfile)
            print("successful")
            print(thisimg)
        except Exception as err:
            # Deliberately broad: one failed download must not stop the rest.
            print(err)
            print(thisimg)