动态爬取网页图片
纯源码
import urllib.request
import os
import random
import re
# Disabled legacy fetcher `url_open`, kept as an inert module-level string
# literal (it is never executed). The text inside picks a random proxy address
# and a random User-Agent, installs a global opener that uses them, and returns
# the response body decoded as UTF-8.
"""
def url_open(url):
ip_list=['14.116.213.100:8081','14.18.109.42:8081','47.107.128.69:888','47.108.155.96:80','183.7.29.244:9999','36.57.68.239:8888','171.15.65.120:8080'] #优化方向:动态获取IP
dynamic_ip=random.choice(ip_list)
user_agent_list = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60 ','Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36']
dynamic_user_agent = random.choice(user_agent_list) #变量名不能有-
proxy_support = urllib.request.ProxyHandler({'https':dynamic_ip})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
opener.addheaders=[('User-Agent',dynamic_user_agent)]
#req = urllib.request.Request(url)
#req.add_header('User-Agent',dynamic_user_agent)
response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')
#print(html)
return html
"""
# Browser User-Agent strings; one is chosen at random for every request.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60 ',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36',
)


def img_url_open(url):
    """Fetch *url* and return the raw response body.

    The request carries a User-Agent header picked at random from
    ``_USER_AGENTS``. The body is returned undecoded, so the same function
    serves both binary image downloads and HTML pages (callers decode the
    latter themselves).

    Args:
        url: Address to request; anything ``urllib.request.Request`` accepts.

    Returns:
        The complete response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` included).
        ValueError: If *url* is not a valid URL for ``urllib``.
    """
    request = urllib.request.Request(url)
    request.add_header('User-Agent', random.choice(_USER_AGENTS))
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(request) as response:
        return response.read()
# Disabled predecessor of page_name_moudle_upgrade, kept as an inert
# module-level string literal (it is never executed). The text inside collects
# the page ids found between '//jandan.net/girl/' and '=#comments' using
# str.find instead of a regular expression, then de-duplicates them.
"""
def page_num_moudle(url):
html = img_url_open(url).decode('utf-8')
html_index = []
a = html.find('//jandan.net/girl/') #优化方向:动态变化,此处最好优化为正则
while a != -1:
b = html.find('=#comments',a)
if b != -1:
html_index.append(html[a+18:b])
else:
b = a + 32
a = html.find('//jandan.net/girl/',b) #b是起始位置
return list(set(html_index)) #去重
#print(list(set(html_index)))
"""
def page_name_moudle_upgrade(url):
    """Return the unique page ids linked from the listing page at *url*.

    The page is downloaded with ``img_url_open`` and decoded as UTF-8 (the
    regular expression needs ``str``, not ``bytes``). Ids are taken from
    anchors of the form
    ``<a href="//jandan.net/girl/MjAyMTA4MDMtODg=#comments">``; only the part
    before the ``=`` is captured, e.g. ``MjAyMTA4MDMtODg``.

    Args:
        url: Address of the listing page.

    Returns:
        A list of distinct page-id strings in order of first appearance.

    NOTE(review): the pattern requires exactly one ``=`` directly before
    ``#comments`` and only word characters in the id; links shaped
    differently are not matched.
    """
    html = img_url_open(url).decode('utf-8')
    # The parentheses make findall() return just the id. The dot is escaped so
    # it matches a literal "." rather than any character.
    pattern = r'<a href="//jandan\.net/girl/(\w+)=#comments">'
    # dict.fromkeys() drops duplicates while keeping a stable order, whereas
    # set() would hand back the ids in arbitrary order.
    return list(dict.fromkeys(re.findall(pattern, html)))
# Captures the src value of an <img> tag up to its first ".jpg". Excluding
# quote characters keeps the match inside the attribute, so a ".jpg" that
# belongs to later markup is never glued onto an unrelated src.
_JPG_SRC_PATTERN = re.compile(r'img src=["\']([^"\']*?\.jpg)')


def find_img(page_num, url):
    """Collect the addresses of all ``.jpg`` images on the given pages.

    For every id in *page_num* the page ``url + id + '=#comments'`` is
    downloaded with ``img_url_open``, decoded as UTF-8 and scanned for
    ``img src`` attributes ending in ``.jpg``.

    Args:
        page_num: Iterable of page-id strings.
        url: Base address the page ids are appended to.

    Returns:
        A list of image URLs in page order, duplicates included. Sources that
        do not already start with ``http://`` or ``https://`` are prefixed
        with ``https:`` (the pages are expected to use protocol-relative
        ``//host/path.jpg`` sources).
    """
    img_address = []
    for page_id in page_num:
        page_url = url + page_id + '=#comments'
        html = img_url_open(page_url).decode('utf-8')
        for src in _JPG_SRC_PATTERN.findall(html):
            # Leave already-absolute URLs alone; prefixing them would yield
            # an unusable "https:https://..." address.
            if not src.startswith(('http://', 'https://')):
                src = 'https:' + src
            img_address.append(src)
    return img_address
def save_img(folder, img_address):
    """Download every image in *img_address* and write it to disk.

    Each file is named after the last path segment of its URL and is written
    relative to the current working directory.

    Args:
        folder: Not read by this function; kept so existing callers keep
            working. The caller is expected to have changed into the target
            directory already.
        img_address: Iterable of image URLs.

    Returns:
        None.
    """
    for image_url in img_address:
        filename = image_url.split('/')[-1]
        # Download before opening the file: opening in 'wb' mode creates and
        # truncates it, so a failed request would otherwise leave an empty
        # file behind.
        data = img_url_open(image_url)
        with open(filename, 'wb') as f:
            f.write(data)
def download_Mm(folder='FindMm'):
    """Download all images linked from the jandan.net "girl" listing.

    Creates *folder* if needed, changes the process working directory into it
    (the change stays in effect after this function returns), looks up the
    page ids on the listing page, gathers the image URLs from those pages and
    saves every image into the folder.

    Args:
        folder: Directory the images are saved into.

    Possible improvements noted by the original author: let the user choose
    how many pages to fetch, and allow saving to a chosen location.
    """
    # exist_ok=True lets the script run again without failing with
    # FileExistsError when the folder is left over from an earlier run.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    url = 'https://jandan.net/girl/'
    page_num = page_name_moudle_upgrade(url)
    img_address = find_img(page_num, url)
    save_img(folder, img_address)
# Script entry point: start the download only when this file is executed
# directly, not when it is imported as a module.
if '__main__' == __name__:
    download_Mm()
# Old debugging block for the main program. It is disabled: the code below
# lives in an inert string literal and is never executed. It builds one page
# URL by hand and runs find_img on the ids returned by page_num_moudle.
"""
url = 'http://jandan.net/girl/'
page_url = url + 'MjAyMTA3'+ 'MzAtOTE' + '=#comments'
#url_open(url)
#page_num_moudle(url)
find_img(page_num_moudle(url),url)
"""
# Debugging block for the regex-based page-id lookup. It is disabled: the code
# below lives in an inert string literal and is never executed. It calls
# page_name_moudle_upgrade on the listing URL.
'''
#正则调试块
url = 'http://jandan.net/girl/'
page_name_moudle_upgrade(url)
'''
# Debugging block for the image-download step on its own. It is disabled: the
# code below lives in an inert string literal and is never executed. It
# defines download_Mmm, which creates and enters the folder and then passes a
# hard-coded list of image URLs to save_img, skipping the page-scraping steps.
"""
def download_Mmm(folder='FindMm'):
#优化方向:用户输入页数
os.mkdir(folder) #优化方向:保持在指定的位置;可以覆盖原有的文件夹
os.chdir(folder)
img_address = ['https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxqnnzbj8j30qo0qo0u5.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxqhishjnj30u011itho.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxqbu0blbj30jg0t6jxt.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxq4ynt5gj31930u043r.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxpykzih1j30oe0zkdii.jpg', 'https://wx3.sinaimg.cn/mw600/006AfEgvgy1gsiur6vwvpj31u52vh7wi.jpg', 'https://wx2.sinaimg.cn/mw600/699a48a7ly1gsirhya8i8j20xc1dzdwq.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxpr64odoj30u011i46l.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxplh1re8j30iz0o342e.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxpelh43pj30u011i77s.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxp70i166j318y0u0n2q.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxp0fr9vrj30u00u0q81.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxou799s3j30uk0kdac2.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxonjl6ikj30u00u0wjr.jpg', 'https://wx1.sinaimg.cn/mw600/006AfEgvgy1gsira2qrr2j31og2iox6p.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxob0lxquj30u011j10m.jpg', 'https://wx4.sinaimg.cn/mw600/006AfEgvgy1gsimijh71dj322o340e82.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxo43n2trj30sg16odmm.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxnylainyj30u00u10ys.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxnrt3wukj30u0190465.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxnm5q7bbj31900u0451.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxnfoebuzj30u00zzth0.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxn9qx0fuj30u00u0n6a.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxvmh0ksvj31c00u0qe1.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxvfqcom6j30jg0t6tdd.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxv9eqpgij30u011i0x9.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxv3k745ij30u0190q8a.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxur0fmktj30ku0q176b.jpg', 
'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxudc76fnj30u00u0tc2.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxu6vo7ytj30u0190af8.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxu0yzdncj30jg0t6jvj.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxtuo9ofnj30u01900v0.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxtnrjtfwj318y0u078n.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxthwzp8cj60u011in2202.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxtc2hr2fj318g0tnn8u.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxt59a7vcj30u00u0aed.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxsyw95brj30u018zti5.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxss322wxj30u011iadf.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxsmdwlcaj30qo140dk3.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxs9y3iqdj30u011i40a.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxs408he0j30qo0xc0v8.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxrxev91gj30u011ijy2.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxrqkxwpaj30u0192gre.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxrjwgjqxj30u00k0t9l.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxrd0gqzzj30my0xcn4x.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxr6jvc8jj30u011hdiv.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxr0ft8y8j30u011iafc.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxqtrukqjj30ha0bit9q.jpg']
save_img(folder,img_address)
download_Mmm()
"""