声明:以下代码,Python版本3.6完美运行
思路介绍浏览器浏览分析地址变化规律
Python测试类获取网页内容,从而获取图片地址
Python测试类下载图片,保存成功则爬虫可以实现
某网站(难度:❤❤)
from urllib import request
import re
import os
from bs4 import BeautifulSoup
from urllib.error import HTTPError
'''遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载!'''
# Global settings; these could live in a config file, but are kept in this
# one file for the reader's convenience.
# Local directory where downloaded images are stored.
picpath = r'E:\Python_Doc\Images'
# Listing-page URL template for the target site (page number substituted via %s).
mm_url = "http://pic.yesky.com/c/6_20771_%s.shtml"
# The helper below creates the save folder if missing (single level only; it
# cannot create missing parent directories).
def setpath(name, base=None):
    """Return the directory ``base/name``, creating it if it does not exist.

    Args:
        name: Folder name to create under the base directory.
        base: Base directory; defaults to the module-level ``picpath``.
              (New optional parameter — backward compatible with all
              existing ``setpath(name)`` callers.)

    Returns:
        The path of the (possibly newly created) folder.

    Note:
        Only a single directory level is created (``os.mkdir``); the base
        directory itself must already exist.
    """
    if base is None:
        base = picpath
    path = os.path.join(base, name)
    if not os.path.isdir(path):
        os.mkdir(path)
    return path
# Fetch raw HTML content.
def get_html(url):
    """Download *url* and return the raw response body as bytes."""
    return request.urlopen(request.Request(url)).read()
# Save one image to disk.
def save_image(path, url):
    """Download the image at *url* and save it into directory *path*.

    The file name is taken from the last path component of the URL, with a
    ``.jpg`` suffix appended when missing.  The original code sliced the
    last 14 characters of the URL, which could contain ``/`` (producing an
    invalid file path) and always appended ``.jpg`` even when the URL
    already ended with it.

    Args:
        path: Existing directory to save into.
        url:  Direct image URL.
    """
    req = request.Request(url)
    # Context manager ensures the HTTP connection is closed deterministically
    # (the original never closed the response object).
    with request.urlopen(req) as resp:
        data = resp.read()
    name = os.path.basename(url) or url[-14:]
    if not name.lower().endswith('.jpg'):
        name += '.jpg'
    with open(os.path.join(path, name), 'wb') as fp:
        fp.write(data)
def do_task(path, url):
    """Crawl one listing page: collect article links, walk each article's
    numbered sub-pages, and save the first matching image per sub-page.

    Args:
        path: Directory to save images into.
        url:  Listing-page URL (one page of ``mm_url``).
    """
    html = get_html(url)
    # Article links look like http://pic.yesky.com/123/456.shtml.
    # Dots are escaped here; the original pattern left them as wildcards.
    article_re = r'(http://pic\.yesky\.com/\d+/\d+\.shtml)'
    urllist = re.findall(article_re, str(html))
    print(urllist)
    # Full-size image URL embedded in each article sub-page; compiled once
    # instead of re-built on every iteration.
    image_re = re.compile(
        r"http://dynamic-image\.yesky\.com/740x-/uploadImages/\S+\.jpg")
    for ur in urllist:
        # Sub-pages are numbered .../456_2.shtml, _3, ...; stop at first miss.
        for i in range(2, 100):
            url1 = ur[:-6] + "_" + str(i) + ".shtml"
            print(url1)
            try:
                html1 = get_html(url1)
                data = BeautifulSoup(html1, "lxml")
                image_list = image_re.findall(str(data))
                print(image_list[0])
                save_image(path, image_list[0])
            except (HTTPError, IndexError):
                # HTTPError: page number past the end (404); IndexError: no
                # image matched.  The original bare ``except:`` also swallowed
                # KeyboardInterrupt and genuine bugs.
                break
if __name__ == '__main__':
    # Folder (under picpath) that receives this site's downloads.
    site_folder = "YeSky"
    target_dir = setpath(site_folder)
    # Listing pages are numbered from 2 upward; process each independently.
    for page in range(2, 100):
        print("正在6_20771_%s" % page)
        do_task(target_dir, mm_url % page)
from urllib import request
import re
import os
'''
遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载!
'''
# Global settings; these could live in a config file, but are kept in this
# one file for the reader's convenience.
# Local directory where downloaded images are stored.
picpath = r'E:\Python_Doc\Images'
# 7160 site listing-page URL templates (page numbers substituted via %s).
mm_url = "http://www.7160.com/xingganmeinv/list_3_%s.html"
mm_url2 = "http://www.7160.com/meinv/%s/index_%s.html"
# The helper below creates the save folder if missing (single level only; it
# cannot create missing parent directories).
def setpath(name, base=None):
    """Return the directory ``base/name``, creating it if it does not exist.

    Args:
        name: Folder name to create under the base directory.
        base: Base directory; defaults to the module-level ``picpath``.
              (New optional parameter — backward compatible with all
              existing ``setpath(name)`` callers.)

    Returns:
        The path of the (possibly newly created) folder.

    Note:
        Only a single directory level is created (``os.mkdir``); the base
        directory itself must already exist.
    """
    if base is None:
        base = picpath
    path = os.path.join(base, name)
    if not os.path.isdir(path):
        os.mkdir(path)
    return path
def get_html(url):
    """Fetch *url* and return the response body as bytes."""
    return request.urlopen(request.Request(url)).read()
def get_image(path, url):
    """Download the image at *url* and save it into directory *path*.

    The file name is derived from the URL's last path component, with a
    ``.jpg`` suffix appended when missing.  The original code sliced the
    last 14 characters of the URL, which could contain ``/`` (producing an
    invalid file path) and always appended ``.jpg``.

    Args:
        path: Existing directory to save into.
        url:  Direct image URL.
    """
    # Context manager closes the HTTP connection (the original never did).
    with request.urlopen(request.Request(url)) as resp:
        data = resp.read()
    name = os.path.basename(url) or url[-14:]
    if not name.lower().endswith('.jpg'):
        name += '.jpg'
    with open(os.path.join(path, name), 'wb') as fp:
        fp.write(data)
def do_task(path, url):
    # NOTE(review): this block appears truncated/garbled by the copy-paste:
    #  - ``p`` is used on the next line before it is assigned (the assignment
    #    appears two lines later), so this raises NameError as written;
    #  - ``html2`` and ``i`` are never defined in this function.
    # The intact ``do_task`` earlier in the file suggests the intended shape
    # (fetch listing page, collect links, then scrape images per sub-page),
    # but the missing lines cannot be reconstructed from here — left as-is.
    html = get_html(url)
    get_list = re.findall(p, str(html))
    # Pattern for full-size images hosted on img.7160.com.
    p = r"http://img\.7160\.com/uploads/allimg/\d+/\S+\.jpg"
    image_list = re.findall(p, str(html2))
    get_image(path, image_list[0])
    print("正在下载页数:List_3_%s " % i)
3. 网站:http://www.263dm.com 263dm_spider.py
from urllib.request import urlopen
遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载!
# Global settings; these could live in a config file, but are kept in this
# one file for the reader's convenience.
picpath = r'E:\Python_Doc\Images'
mm_url = "http://www.263dm.com/html/ai/%s.html"
# Save-folder helper (creates the folder if missing; single level only, it
# cannot create missing parent directories).
path = os.path.join(picpath, name)
aa = urllib.request.Request(url)
html = urllib.request.urlopen(aa).read()
p = r"(http://www\S*/\d{4}\.html)"
return re.findall(p, str(html))
aa = urllib.request.Request(url)
html = urllib.request.urlopen(aa).read()
url_list = re.findall(p, str(html))
def save_image(url_ref, url, path):
headers = {"Referer": url_ref,
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko)Chrome/62.0.3202.94 Safari/537.36'}
content = requests.get(url, headers=headers)
if content.status_code == 200:
with open(path + "/" + str(time.time()) + '.jpg', 'wb') as f:
for i in range(10705, 9424, -1):