采用代理方式抓取网页,但不知道为什么不行,请大神指教。代码如下:
import urllib.request
import os
import random
# Module-level registry filled in by get_page: maps a page label (the link
# text) to that page's URL, and is read back by download_mm.
pagedict = dict()
def url_open(url):
    """Fetch *url* via a randomly chosen HTTP proxy and return the raw body.

    Args:
        url: Address of the resource to download.

    Returns:
        The complete response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from urllib when the request
            fails (for example when the chosen proxy is unreachable).
    """
    req = urllib.request.Request(url)
    # The header name has to be spelled 'User-Agent'; a misspelled name is
    # sent as an unrelated header and the default Python agent is used.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36')

    # Proxy section: pick one proxy at random for this request.
    proxies = ['223.199.25.235:9999', '183.166.139.189:9999', '223.199.19.229:9999']
    proxy = random.choice(proxies)
    # Only the 'http' scheme is mapped, so only plain-http requests are
    # routed through the proxy.
    proxy_support = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_support)

    # Open the prepared Request object (not the bare URL) so the header set
    # above is actually sent, and use this opener directly instead of
    # replacing the process-wide default opener on every call.
    with opener.open(req) as response:
        return response.read()
def _find_page_link(html, start):
    """Locate the next page link in *html* at or after index *start*.

    Returns:
        A ``(pagenum, page, end)`` tuple where *pagenum* is the stripped
        link text, *page* is the link target prefixed with ``http:`` and
        *end* is the index of the closing ``</a>``; ``None`` when any of
        the three markers cannot be found.
    """
    a = html.find('//jandan.net/ooxx/', start)
    if a == -1:
        return None
    b = html.find('">', a)  # first '">' after the link target
    if b == -1:
        return None
    c = html.find('</a>', b)  # first '</a>' after the link text
    if c == -1:
        return None
    return html[b + 2:c].strip(), 'http:' + html[a:b], c


def get_page(url, pages=2):
    """Collect page links from the page at *url* into ``pagedict``.

    The first matching link is always looked up, followed by up to *pages*
    further links, each search resuming where the previous link ended.
    Scanning stops early once no further link is found, so a short page no
    longer produces garbage entries.

    Args:
        url: Address of the page to scan.
        pages: Number of additional links to collect after the first one.

    Returns:
        None. Results are stored in the module-level ``pagedict`` as
        ``{link text: link URL}``.
    """
    html = url_open(url).decode('utf-8')
    cursor = 0
    # max() keeps the "first link is always attempted" behaviour even when
    # a non-positive page count is passed.
    for index in range(max(pages, 0) + 1):
        link = _find_page_link(html, cursor)
        if link is None:
            break
        pagenum, page, cursor = link
        pagedict.update({pagenum: page})
        if index == 0:
            # Progress output after the first link, kept from the original.
            print(pagedict)
def save_imgs(folder, img_addrs):
    """Download each image in *img_addrs* into the current working directory.

    Args:
        folder: Not used by this function; files are written to the
            current working directory (download_mm changes into the target
            folder before calling).
        img_addrs: Iterable of scheme-less image addresses; ``http:`` is
            prepended to each before downloading.

    Returns:
        None.
    """
    for each in img_addrs:
        img_url = 'http:' + each
        # The last path segment of the address becomes the local file name.
        filename = each.split('/')[-1]
        print(img_url)
        # Download before opening the file so that a failed request does
        # not leave an empty file behind.
        img = url_open(img_url)
        with open(filename, 'wb') as f:
            f.write(img)
def find_imgs(url):
    """Return the ``.jpg`` image addresses found in the page at *url*.

    The page is scanned for each ``img src=`` marker. When a ``.jpg`` occurs
    within the 255 characters following a marker, the text from just past
    the marker's opening quote through ``.jpg`` is collected.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of address strings in the order they appear in the page.
    """
    page = url_open(url).decode('utf-8')
    marker = 'img src='
    found = []

    start = page.find(marker)
    while start >= 0:
        end = page.find('.jpg', start, start + 255)
        if end == -1:
            # No jpg near this tag: resume just past the marker.
            resume = start + 9
        else:
            # Skip the marker and its opening quote (9 characters) and keep
            # the '.jpg' suffix (4 characters).
            found.append(page[start + 9:end + 4])
            resume = end
        # Look for the next 'img src=' after the resume point.
        start = page.find(marker, resume)

    return found
def download_mm(folder='OOXX', pages=10):
    """Download the images from the collected jandan.net/ooxx pages.

    Creates *folder* if needed, changes the working directory into it,
    collects page links via get_page, then downloads every image found on
    each collected page.

    Args:
        folder: Directory that receives the images.
        pages: Accepted for interface compatibility; it is not forwarded to
            get_page, which runs with its own default page count.

    Returns:
        None.
    """
    # exist_ok avoids a FileExistsError when the script is run a second time.
    os.makedirs(folder, exist_ok=True)
    # save_imgs writes to the current working directory, so move into folder.
    os.chdir(folder)

    url = "http://jandan.net/ooxx"
    get_page(url)
    print(pagedict)

    for page_url in pagedict.values():
        print(page_url)
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)
if __name__ == '__main__':
    # Run the scraper with its default folder and page count when this file
    # is executed as a script.
    download_mm()