# 爬虫主要的几个函数: urllib.request.Request(url) 构造一个请求, add_header() 添加请求头,
# ssl._create_unverified_context() 绕过证书校验, urllib.request.urlopen() 发送请求,
# urllib.request.urlretrieve() 下载文件
import json
from urllib import request
import urllib
import re
import ssl
import random
import os
def htmlmjlator(url):
    """Fetch a Taobao search page and download the product thumbnails on it.

    Sends an HTTP GET with a randomly chosen desktop User-Agent string (to
    avoid trivial bot blocking), scrapes the lazy-loaded ``<img>`` URLs from
    the HTML, and saves each image under ``D:\\pywork\\image`` as
    ``<index>.jpg``.

    :param url: page URL to scrape
    :return: list of the matched ``TB..._!!0-saturn_solar.jpg`` name
             fragments, one per successfully downloaded image
    """
    user_agents = [
        r"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6721.400 QQBrowser/10.2.2243.400",
        r"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        r"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        r"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        r"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    ]
    req = urllib.request.Request(url)
    req.add_header("User-Agent", random.choice(user_agents))
    # NOTE(review): _create_unverified_context disables TLS certificate
    # validation — acceptable for a scraping experiment, unsafe otherwise.
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)
    # The page is mostly UTF-8 but contains stray bytes; skip undecodable ones.
    html = response.read().decode("utf-8", "ignore")

    # Lazy-loaded thumbnails: the real image URL is in data-ks-lazyload.
    img_pattern = re.compile(r'<span .*><img data-ks-lazyload="(.*?)" src=".*"></span>')
    img_urls = img_pattern.findall(html)
    print("-----------------", img_urls)

    # Unique name fragment of each image URL ('.' escaped so it only matches
    # a literal dot, unlike the original pattern).
    name_pattern = re.compile(r'TB(.*?)_!!0-saturn_solar\.jpg')
    save_dir = r"D:\pywork\image"
    # Original crashed if the target folder was missing.
    os.makedirs(save_dir, exist_ok=True)

    names = []
    for img_url in img_urls:
        match = name_pattern.search(img_url)
        if match is None:
            # Original caught AttributeError around the WHOLE loop, so one
            # non-matching URL aborted every remaining download; skip just
            # this item instead.
            print("groupNodeor*.groupnotype")
            continue
        index = len(names)  # 0-based file index, as in the original
        print(index)
        print(match.group())
        names.append(match.group())
        # Original path was "D:\pywork\image\ <n>.jpg" — the backslash-space
        # put a leading space in every filename; os.path.join fixes that.
        urllib.request.urlretrieve(img_url, os.path.join(save_dir, str(index) + ".jpg"))
        urllib.request.urlcleanup()
    return names
# Taobao ad/search endpoints to scrape. Only `url` (dress search results)
# is actually passed to the scraper below; `url2` is kept as an alternative.
url = "https://re.taobao.com/search_ou?keyword=%E8%BF%9E%E8%A1%A3%E8%A3%99%E5%A5%B3%E5%A4%8F&catid=&refpid=mm_26632258_3504122_32538762&_input_charset=utf8&clk1=7cbc46adc4816ebd2ffbcbd02cfb9966"
url2 = "https://uland.taobao.com/sem/tbsearch?refpid=mm_15891853_2192459_8654707&keyword=%E5%A5%B3%E8%A3%85&clk1=aeffe5fd082bfa0cc39160d0143b9007&upsid=aeffe5fd082bfa0cc39160d0143b9007"

if __name__ == "__main__":
    # Run the scraper only when executed as a script, not on import.
    htmlstr = htmlmjlator(url)