python爬虫实例

动态爬取网页图片

纯源码

import urllib.request

import os

import random

import re

"""
def url_open(url):
   
   ip_list=['14.116.213.100:8081','14.18.109.42:8081','47.107.128.69:888','47.108.155.96:80','183.7.29.244:9999','36.57.68.239:8888','171.15.65.120:8080']  #优化方向:动态获取IP
   
   dynamic_ip=random.choice(ip_list)
   

   user_agent_list = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60 ','Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
                      'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36']

   dynamic_user_agent = random.choice(user_agent_list)       #变量名不能有-

   
   proxy_support = urllib.request.ProxyHandler({'https':dynamic_ip})
   
   opener = urllib.request.build_opener(proxy_support)

   urllib.request.install_opener(opener)
   
   opener.addheaders=[('User-Agent',dynamic_user_agent)]


   #req = urllib.request.Request(url)

   #req.add_header('User-Agent',dynamic_user_agent)
   
   response = urllib.request.urlopen(url)

   html = response.read().decode('utf-8')

   #print(html)

   return html
"""


# Pool of desktop-browser User-Agent strings; one is chosen at random for
# every request so consecutive requests do not all look identical.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60 ',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36',
)


def img_url_open(url):
    """Fetch *url* and return the raw response body.

    A randomly chosen User-Agent header is attached to the request.

    Args:
        url: Address to fetch (an HTML page or an image).

    Returns:
        The undecoded response body as ``bytes``; callers that need text
        must decode it themselves.

    Raises:
        urllib.error.URLError: Propagated from ``urllib.request.urlopen``
            when the request fails.
    """
    request = urllib.request.Request(url)
    request.add_header('User-Agent', random.choice(_USER_AGENTS))

    # The context manager closes the response even if read() raises,
    # so the underlying connection is never leaked.
    with urllib.request.urlopen(request) as response:
        return response.read()


"""
def page_num_moudle(url):       
   
    html = img_url_open(url).decode('utf-8')

    html_index = []

    a = html.find('//jandan.net/girl/')              #优化方向:动态变化,此处最好优化为正则

    while a != -1:
    
      b = html.find('=#comments',a)
       
      if b != -1:

          html_index.append(html[a+18:b])
          
      else:

          b = a + 32

      a = html.find('//jandan.net/girl/',b)      #b是起始位置
        

    return list(set(html_index))          #去重
   
    #print(list(set(html_index)))
"""




# Matches anchors such as
#   <a href="//jandan.net/girl/MjAyMTA4MDMtODg=#comments">
# and captures the token between "/girl/" and "=#comments".
_PAGE_LINK_RE = re.compile(r'<a href="//jandan\.net/girl/(\w+)=#comments">')


def page_name_moudle_upgrade(url):
    """Return the unique comment-page tokens linked from the page at *url*.

    Args:
        url: Address of the listing page to scan.

    Returns:
        A list of the captured page tokens (without the trailing
        ``=#comments``), de-duplicated and in the order they first appear
        in the page.
    """
    # The body arrives as bytes; it must be decoded before a str pattern
    # can be applied to it.
    html = img_url_open(url).decode('utf-8')

    tokens = _PAGE_LINK_RE.findall(html)

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # pages are later visited in a deterministic order (set() order is not).
    return list(dict.fromkeys(tokens))


# Captures the value of a quoted ``img src`` attribute up to and including
# the first ".jpg". The character class stops at the closing quote, so a
# match can never run past the attribute into later markup.
_IMG_SRC_RE = re.compile(r'''img src=["']([^"'\s>]+?\.jpg)''')


def _extract_img_urls(html, page_url):
    """Return absolute URLs for every ``.jpg`` ``img src`` found in *html*."""
    from urllib.parse import urljoin  # local import keeps the block self-contained

    urls = []
    for src in _IMG_SRC_RE.findall(html):
        if src.startswith('//'):
            # Protocol-relative address: force https.
            urls.append('https:' + src)
        else:
            # Absolute addresses come back unchanged; relative ones are
            # resolved against the page they were found on.
            urls.append(urljoin(page_url, src))
    return urls


def find_img(page_num, url):
    """Collect the addresses of all ``.jpg`` images on the given pages.

    Args:
        page_num: Iterable of page tokens; each page address is built as
            ``url + token + '=#comments'``.
        url: Base address the tokens are appended to.

    Returns:
        A list of image URLs in the order they were encountered. The list is
        empty when *page_num* is empty or no page contains a matching image.
    """
    img_address = []

    for token in page_num:
        page_url = url + token + '=#comments'
        html = img_url_open(page_url).decode('utf-8')
        img_address.extend(_extract_img_urls(html, page_url))

    return img_address
   


def save_img(folder, img_address):
    """Download every image in *img_address* into the current directory.

    Each file is named after the last path segment of its URL and is written
    relative to the current working directory.

    Args:
        folder: Not used by this function; kept so existing callers keep
            working. ``download_Mm`` changes into the target folder before
            calling, which is why the files still land there.
        img_address: Iterable of image URLs to download.
    """
    for each in img_address:
        filename = each.split('/')[-1]

        # Download first: if the request fails, no empty file is left behind.
        img = img_url_open(each)

        with open(filename, 'wb') as f:
            f.write(img)

      

def download_Mm(folder='FindMm'):
    """Download all images linked from the jandan.net "girl" listing.

    Creates *folder* if needed, changes the working directory into it, finds
    the comment pages linked from the listing page, collects the image
    addresses on those pages and saves every image.

    Args:
        folder: Directory the images are saved into. Note that the process
            working directory is changed to this folder and not restored.
    """
    # TODO: let the user choose how many pages to fetch.
    # TODO: allow saving to a user-specified location.
    # exist_ok=True lets the script be re-run without FileExistsError.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'https://jandan.net/girl/'

    page_num = page_name_moudle_upgrade(url)
    img_address = find_img(page_num, url)
    save_img(folder, img_address)
      

# Script entry point: start the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_Mm()



#主程序调试块old
"""
url = 'http://jandan.net/girl/'

page_url = url + 'MjAyMTA3'+ 'MzAtOTE' + '=#comments'

#url_open(url)

#page_num_moudle(url)

find_img(page_num_moudle(url),url)
"""

'''
#正则调试块

url = 'http://jandan.net/girl/'
page_name_moudle_upgrade(url)
'''

#单功能图片下载调试块 
"""                                    
def download_Mmm(folder='FindMm'):

   
                                                #优化方向:用户输入页数
   os.mkdir(folder)                         #优化方向:保持在指定的位置;可以覆盖原有的文件夹
    
   os.chdir(folder)

   img_address = ['https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxqnnzbj8j30qo0qo0u5.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxqhishjnj30u011itho.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxqbu0blbj30jg0t6jxt.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxq4ynt5gj31930u043r.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxpykzih1j30oe0zkdii.jpg', 'https://wx3.sinaimg.cn/mw600/006AfEgvgy1gsiur6vwvpj31u52vh7wi.jpg', 'https://wx2.sinaimg.cn/mw600/699a48a7ly1gsirhya8i8j20xc1dzdwq.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxpr64odoj30u011i46l.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxplh1re8j30iz0o342e.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxpelh43pj30u011i77s.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxp70i166j318y0u0n2q.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxp0fr9vrj30u00u0q81.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxou799s3j30uk0kdac2.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxonjl6ikj30u00u0wjr.jpg', 'https://wx1.sinaimg.cn/mw600/006AfEgvgy1gsira2qrr2j31og2iox6p.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxob0lxquj30u011j10m.jpg', 'https://wx4.sinaimg.cn/mw600/006AfEgvgy1gsimijh71dj322o340e82.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxo43n2trj30sg16odmm.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxnylainyj30u00u10ys.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxnrt3wukj30u0190465.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxnm5q7bbj31900u0451.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxnfoebuzj30u00zzth0.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxn9qx0fuj30u00u0n6a.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxvmh0ksvj31c00u0qe1.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxvfqcom6j30jg0t6tdd.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxv9eqpgij30u011i0x9.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxv3k745ij30u0190q8a.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxur0fmktj30ku0q176b.jpg', 
'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxudc76fnj30u00u0tc2.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxu6vo7ytj30u0190af8.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxu0yzdncj30jg0t6jvj.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxtuo9ofnj30u01900v0.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxtnrjtfwj318y0u078n.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxthwzp8cj60u011in2202.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxtc2hr2fj318g0tnn8u.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxt59a7vcj30u00u0aed.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxsyw95brj30u018zti5.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxss322wxj30u011iadf.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxsmdwlcaj30qo140dk3.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxs9y3iqdj30u011i40a.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxs408he0j30qo0xc0v8.jpg', 'https://wx1.sinaimg.cn/mw600/0076BSS5ly8gsxrxev91gj30u011ijy2.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxrqkxwpaj30u0192gre.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxrjwgjqxj30u00k0t9l.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxrd0gqzzj30my0xcn4x.jpg', 'https://wx4.sinaimg.cn/mw600/0076BSS5ly8gsxr6jvc8jj30u011hdiv.jpg', 'https://wx2.sinaimg.cn/mw600/0076BSS5ly8gsxr0ft8y8j30u011iafc.jpg', 'https://wx3.sinaimg.cn/mw600/0076BSS5ly8gsxqtrukqjj30ha0bit9q.jpg']
   
   save_img(folder,img_address)


download_Mmm()
"""


上述代码未做详细注释,也不够精简,请读者理性看待。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值