# 百度图片爬虫 (Baidu image crawler)
def use_proxy(proxy_addr, url):
    """Fetch *url* through the given HTTP proxy and return the raw response bytes.

    proxy_addr: proxy address string, e.g. "http://61.155.164.110:3128"
    url:        the URL to fetch through the proxy
    Returns:    the response body as bytes (call .decode() for str)
    Raises:     urllib.error.URLError on connection/proxy failure
    """
    import urllib.request

    # Install an opener that routes plain-HTTP traffic through the proxy.
    # NOTE: install_opener() changes the process-global default opener.
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    # BUG FIX: the original passed the fetched bytes through
    # urllib.parse.urlencode(), which expects a mapping (or a sequence of
    # two-element tuples) and raises TypeError when given bytes.
    # Return the raw response body directly instead.
    return urllib.request.urlopen(url).read()
import re
import urllib.request
#import time
# ---- Build and send the Baidu image-search request for the user's keyword ----
tupian = input('输入想要爬取的图片:')
search = urllib.request.quote(tupian)  # percent-encode the keyword for the URL
url = 'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1536904153130_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1536904153131%5E00_1349X654&word=' + str(search)

# Spoof a desktop browser User-Agent so Baidu serves the normal result page.
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36')

try:
    html = urllib.request.urlopen(req).read()
except urllib.error.URLError as e:
    # BUG FIX: the original only printed the error and fell through,
    # leaving `html` undefined and crashing with NameError below.
    # If the search page cannot be fetched, nothing can be downloaded.
    print(e)
    raise SystemExit(1)

# Pull every image URL out of the page source. str(html) keeps the original
# behavior of matching against the repr of the bytes ("b'...'"); the
# "objURL":"..." fields still match fine.
Regex = re.compile(r'"objURL":"(.*?)"')
mo = Regex.findall(str(html))
# ---- Download each matched image URL to a sequentially numbered .jpg ----
# NOTE(review): assumes the target directory D:/Desktop/tupian/ already
# exists — open() does not create directories.
i = 1
for each in mo:
    try:
        req = urllib.request.Request(each)
        imghtml = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        # BUG FIX: the original had no `continue` here, so a failed fetch
        # fell through and wrote the *previous* image's bytes under the new
        # number (or raised NameError on the very first iteration).
        print('所爬图片' + str(i) + '网址有错! pass!')
        i = i + 1
        continue
    # BUG FIX: use a context manager so the file handle is always closed,
    # even if the write raises.
    with open('D:/Desktop/tupian/' + str(i) + '.jpg', 'wb') as img:
        img.write(imghtml)
    i = i + 1