A Simple Website Image Scraper

The scraper's main building blocks: urllib.request.Request(url) builds a request object; add_header() attaches a spoofed request header; ssl._create_unverified_context() bypasses certificate verification; urllib.request.urlopen() sends the request; and urllib.request.urlretrieve() downloads a file straight to disk.
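A minimal sketch of that call sequence, separate from the full script below; the URL and file name here are placeholders, not taken from the script:

import ssl
import urllib.request

# Placeholder target; any page or image URL works the same way.
url = "https://example.com/some_image.jpg"

req = urllib.request.Request(url)                 # build the request object
req.add_header("User-Agent", "Mozilla/5.0")       # spoof a browser header
context = ssl._create_unverified_context()        # skip certificate checks
response = urllib.request.urlopen(req, context=context)  # send the request
data = response.read()                            # raw bytes of the response

# urlretrieve() wraps the fetch-and-save steps when only the file is needed.
urllib.request.urlretrieve(url, "local_copy.jpg")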

 

import os
import random
import re
import ssl
import urllib.request
def htmlmjlator(url):
    """Fetch the page at `url`, extract image URLs, and download them."""
    # Pool of desktop User-Agent strings; one is picked at random per
    # request so the scraper looks less like a bot.
    headers = [
        r"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6721.400 QQBrowser/10.2.2243.400",
        r"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        r"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        r"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        r"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    ]
    user_agent = random.choice(headers)
    req = urllib.request.Request(url)
    req.add_header("User-Agent", user_agent)
    # Skip certificate verification so HTTPS pages with odd certs still load.
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)
    # The page is mostly UTF-8, but some bytes are not; 'ignore' skips
    # anything that cannot be decoded.
    response_str = response.read().decode("utf-8", "ignore")
    # First pattern: pull the lazy-loaded image URL out of each product <span>.
    span_pattern = r'<span .*><img data-ks-lazyload="(.*?)" src=".*"></span>'
    part_match = re.compile(span_pattern)
    divstr = part_match.findall(response_str)
    print("-----------------", divstr)
    # Example matches:
    #   https://img.alicdn.com/imgextra/i1/13253094/TB2g_SeriOYBuNjSsD4XXbSkFXa_!!0-saturn_solar.jpg_250x250.jpg
    #   https://img.alicdn.com/imgextra/i2/108689489/TB2JOp7FuuSBuNjSsziXXbq8pXa_!!0-saturn_solar.jpg_250x250.jpg

    # Second pattern: the "TB..._!!0-saturn_solar.jpg" fragment names each image.
    name_pattern = r'TB(.*?)_!!0-saturn_solar.jpg'
    name_re = re.compile(name_pattern)
    lisimg = []
    try:
        for urlimg in divstr:
            num = len(lisimg)  # running count doubles as the local file name
            print(num)
            match = name_re.search(urlimg)
            print(match.group())
            lisimg.append(match.group())
            # Save as D:\pywork\image\0.jpg, 1.jpg, ... then clear the cache.
            urllib.request.urlretrieve(
                urlimg, os.path.join(r"D:\pywork\image", str(num) + ".jpg"))
            urllib.request.urlcleanup()
    except AttributeError:
        # search() returned None: the URL lacked the expected name fragment.
        print("no TB..._!!0-saturn_solar.jpg fragment in this URL")

# Taobao search-result pages; url2 is an alternate entry point (unused below).
url2 = r'https://uland.taobao.com/sem/tbsearch?refpid=mm_15891853_2192459_8654707&keyword=%E5%A5%B3%E8%A3%85&clk1=aeffe5fd082bfa0cc39160d0143b9007&upsid=aeffe5fd082bfa0cc39160d0143b9007'
url = r"https://re.taobao.com/search_ou?keyword=%E8%BF%9E%E8%A1%A3%E8%A3%99%E5%A5%B3%E5%A4%8F&catid=&refpid=mm_26632258_3504122_32538762&_input_charset=utf8&clk1=7cbc46adc4816ebd2ffbcbd02cfb9966"
if __name__ == "__main__":
    htmlmjlator(url)  # the function downloads as a side effect; no return value
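To see what the second pattern extracts, here is a standalone check against one of the sample URLs recorded in the comments above:

import re

sample = "https://img.alicdn.com/imgextra/i1/13253094/TB2g_SeriOYBuNjSsD4XXbSkFXa_!!0-saturn_solar.jpg_250x250.jpg"
m = re.search(r'TB(.*?)_!!0-saturn_solar.jpg', sample)
if m:
    print(m.group())  # TB2g_SeriOYBuNjSsD4XXbSkFXa_!!0-saturn_solar.jpg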

 
