Full code:
# coding:utf-8
# Import the requests package and the regular-expression package re
import requests
import re
from datetime import datetime
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium import webdriver
import time
from bs4 import BeautifulSoup
# Scratch test of the ranking-link regex against sample URLs:
# htp='http://pic.weather.com.cn/images/cn/photo/2021/09/16/20210916100549696B7FD68719D9208B6E945E3B1E260B_xm.jpg'
# htp='https://bing.ioliu.cn/photo/FreshSalt_ZH-CN12818759319?force=ranking_1'
# rs=re.findall('.+?=ranking_1',htp)
# print(rs)
'''
# Archived version 1: fetch a page, regex-match image links, save each image
# Custom page-download function
def load_page(url):
    response = requests.get(url)
    data = response.content
    return data

# Custom function that saves the images found in a page
def get_image(html):
    # regx = r'http://[\S]*?\.jpg'   # image regex for .jpg links
    # regx = r'src="(.*?\.jpg)"'
    regx = r'http://[\S]+?\.png'     # image regex for .png links (dot escaped to match literally)
    pattern = re.compile(regx)       # compile the expression into a pattern
    print(pattern)
    get_images = re.findall(pattern, repr(html))  # match image links in the page (repr() turns the bytes into a searchable str)
    print(get_images)
    num = 1
    # Iterate over the matched links
    for img in get_images:
        image = load_page(img)  # download the image behind each link
        # Save the downloaded image into the target folder (note: saved with a .jpg suffix regardless of source)
        with open('./picture/image_%s.jpg' % num, 'wb') as fb:
            fb.write(image)
        print("Downloading image %s" % num)
        num = num + 1
    print("Download finished!")

if __name__ == '__main__':
    # Page to crawl
    url = 'http://p.weather.com.cn/'
    # url = 'https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E4%BA%BA'
    # url = 'http://p.weather.com.cn/2021/08/3490083.shtml'
    # Download the page content
    html = load_page(url)
    # Match the image links in the page and save the images to the folder
    get_image(html)
'''
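Stripped of the commented experiments, the archived version above is just fetch-then-match. Here is a minimal, self-contained sketch of that pattern; the helper names and output folder are illustrative assumptions, not part of the original script:

import os
import re
import requests

def fetch(url):
    # A User-Agent header keeps simple bot filters from rejecting the request.
    return requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10).content

def download_images(page_url, out_dir='./picture'):
    os.makedirs(out_dir, exist_ok=True)  # create the folder if missing
    html = fetch(page_url).decode('utf-8', errors='ignore')
    # Escape the dot so '.jpg' matches literally; '\S+?' keeps the match non-greedy.
    for i, img_url in enumerate(re.findall(r'https?://\S+?\.jpg', html), start=1):
        with open(os.path.join(out_dir, 'image_%d.jpg' % i), 'wb') as fb:
            fb.write(fetch(img_url))
        print('Downloading image %d' % i)

# download_images('http://p.weather.com.cn/')  # example target from the original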
'''
# Archived version 2: Bing wallpaper collection with headers and a save directory
import re
import urllib.request  # Python 2 used urllib2 instead
import urllib
import os

def load_page(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
               Chrome/74.0.3729.131 Safari/537.36'}
    response = requests.get(url, headers=headers)
    data = response.content
    return data

def get_image(html):
    'Note: find the image path by viewing the page source in the browser'
    # Use parentheses so the group is captured
    # regx = r'src="(.+?\.jpg)" pic_ext'        # images from a Tieba post
    # Bing page images
    regx = r'data-progressive.*?src="(.*?)"'    # Bing wallpaper collection
    # regx = r'http://[\S]*?\.jpg\?imageslim'   # Bing wallpaper collection (direct links)
    # regx = r'<h3>(.*?)</h3>'
    # regx = r'src="(.*?\.jpg)"'
    # sample: http://h2.ioliu.cn/bing/FalklandRockhoppers_ZH-CN5370686595_640x480.jpg?imageslim
    # reg = r'zoomfile="(.+?\.jpg)" '           # feng.com phone wallpapers
    # Filter with the regular expression
    pattern = re.compile(regx)
    get_images = re.findall(pattern, repr(html))  # match image links in the page
    try:
        # Save the images
        path = 'E:\\Temporary\\new'  # directory to save into
        if not os.path.isdir(path):
            os.makedirs(path)        # create the directory if it does not exist
        x = 0
        for imgurl in get_images:
            image = load_page(imgurl)  # download the image behind each link
            # os.path.join avoids the stray '\.' the first draft produced
            with open(os.path.join(path, 'image_%s.jpg' % x), 'wb') as fb:
                fb.write(image)
            print("Downloading image %s" % x)
            x = x + 1
        print("Download finished!")
    except Exception as e:
        print("Download error:", e)

if __name__ == '__main__':
    # html = getHtml("http://bbs.feng.com/read-htm-tid-10616371.html")  # feng.com phone wallpapers
    url = "https://bing.ioliu.cn/ranking"  # Bing wallpaper collection
    # url = "http://tieba.baidu.com/p/2460150866"  # images from a Tieba post
    html = load_page(url)
    get_image(html)
'''
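The '\.' bug fixed above deserves a note: in a normal string literal, '\.' is not a recognized escape, so Python keeps the backslash and the dot, and path + '\.image_%s.jpg' names the file '.image_0.jpg'. A tiny sketch of the difference (the folder name is the one from the script above):

import os

path = 'E:\\Temporary\\new'
# '\.' in a normal string literal is a backslash followed by a dot,
# so this names the file '.image_0.jpg' (and warns on newer Pythons):
bad = path + '\.image_%s.jpg' % 0
# Let the standard library insert the right separator instead:
good = os.path.join(path, 'image_%s.jpg' % 0)
print(bad)   # E:\Temporary\new\.image_0.jpg
print(good)  # E:\Temporary\new\image_0.jpg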
'''
# Archived version 3: rebuild direct download links from the Bing ranking page
import re, urllib.request
# request = urllib.request.urlopen('http://www.imooc.com/course/list')
# buf = request.read().decode('utf-8')
# listurl = re.findall(r'src="(.*?\.jpg)"', buf)
# print(listurl)

def load_page(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
               Chrome/74.0.3729.131 Safari/537.36'}
    response = requests.get(url, headers=headers)
    data = response.content
    return data

url = 'https://bing.ioliu.cn/ranking'
html = load_page(url)
# regx = r'class="mark".*?href=.*?=ranking_1'   # matches the photo-page links
regx = r'http://[\S]*?\.jpg\?imageslim'         # Bing wallpaper collection (direct links)
pattern = re.compile(regx)
print(pattern)
listurl = re.findall(pattern, repr(html))  # match links in the page
print(listurl)
# photo-page form: https://bing.ioliu.cn/photo/FreshSalt_ZH-CN12818759319?force=ranking_1
res = []
index = 0
for url in listurl:
    # pull the /photo/...=ranking_1 fragment and graft it onto the image host
    # (this step assumes listurl holds photo-page links, i.e. the commented class="mark" pattern)
    a = re.findall(r'\/photo\/.*?=ranking_1', url)
    res.append('http://h2.ioliu.cn/bing' + a[0])
print(res[2])
# sample image form: http://h2.ioliu.cn/bing/FalklandRockhoppers_ZH-CN5370686595_640x480.jpg?imageslim
for url in res:
    print(url)
    image = load_page(url)  # download the image behind each link
    with open('./picture/image_%s.jpg' % index, 'wb') as fb:
        fb.write(image)
    print("Downloading image %s" % index)
    index = index + 1
# the with-statement closes the file; no explicit fb.close() is needed
'''
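The key step in version 3 is rewriting a photo-page link into a direct image URL on h2.ioliu.cn. Judging from the sample URLs in the comments, the direct link is the photo slug plus a resolution suffix; a sketch of that mapping, with the URL shape inferred from those samples and therefore an assumption:

import re

photo_link = 'https://bing.ioliu.cn/photo/FreshSalt_ZH-CN12818759319?force=ranking_1'
# pull out the photo slug between '/photo/' and the query string
slug = re.findall(r'/photo/([^?]+)', photo_link)[0]
# attach the image host and a resolution suffix seen in the sample image URLs
image_url = 'http://h2.ioliu.cn/bing/%s_1920x1080.jpg?imageslim' % slug
print(image_url)
# -> http://h2.ioliu.cn/bing/FreshSalt_ZH-CN12818759319_1920x1080.jpg?imageslim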
'''
# Archived version 4: Baidu image album via requests plus a session cookie
def load_page(url):
    # Paste a fresh cookie from your own logged-in browser session here,
    # without the leading 'Cookie: ' name (session cookies expire quickly).
    cookie = '<your Baidu image-search cookie>'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'Cookie': cookie
    }
    response = requests.get(url, headers=headers)
    data = response.content
    return data

# Target page
# url = 'https://bing.ioliu.cn/'  # or 'https://bing.ioliu.cn/ranking'
url = 'https://image.baidu.com/search/albumsdetail?tn=albumsdetail&word=%E8%88%AA%E6%8B%8D%E5%9C%B0%E7%90%83%E7%B3%BB%E5%88%97&fr=albumslist&album_tab=%E8%AE%BE%E8%AE%A1%E7%B4%A0%E6%9D%90&album_id=312&rn=30'
html = load_page(url)
print(html)
# Image-address patterns tried:
# regx = r'http://[\S]*?1920x1080\.jpg\?imageslim'  # Bing wallpapers
# sample matches:
# 'https://pics6.baidu.com/feed/3b87e950352ac65ca4b6ab1b1633751892138a77.jpeg?token=46bcb0c0a40b6dd6f8f57420e76b3f14'
# 'https://pics4.baidu.com/feed/5bafa40f4bfbfbed7b46ef700926313faec31f86.png?token=4b3d3cbdea69f32693f8c3b5d8b4e933'
regx = r'https://[\S]*?\.jpeg|https://[\S]*?\.jpg|https://[\S]*?\.png'
pattern = re.compile(regx)
print(pattern)
listurl = re.findall(pattern, repr(html))  # match image links in the page
print(listurl)
index = 0
for url in listurl:
    image = load_page(url)  # download the image behind each link
    # a timestamp in the name avoids overwriting files across runs
    with open('./picture/image_%s_' % index + datetime.strftime(datetime.now(), '%H-%M-%S') + '.jpg', 'wb') as fb:
        fb.write(image)
    print("Downloading image %s" % index)
    index = index + 1
# the with-statement closes the file; no explicit fb.close() is needed
'''
'''
# Archived version 5: page through Baidu image search's acjson API
# page = input('How many pages to crawl: ')
page = 3  # int(page) + 1
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
pn = 1
keyword = "ppt目录"  # search keyword
# pn is the index of the first image to fetch; Baidu loads 30 images per scroll by default
for m in range(1, page):
    url = 'https://image.baidu.com/search/acjson?'
    param = {
        'tn': 'resultjson_com',
        'logid': '8846269338939606587',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '-1',
        'z': '',
        'ic': '',
        'hd': '',
        'latest': '',
        'copyright': '',
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'cg': 'girl',
        'pn': pn,  # index of the first image in this batch
        'rn': '30',
        'gsm': '1e',
    }
    page_text = requests.get(url=url, headers=header, params=param)
    page_text.encoding = 'utf-8'
    page_text = page_text.json()
    info_list = page_text['data']
    del info_list[-1]  # the last entry is an empty placeholder
    img_path_list = []
    for i in info_list:
        img_path_list.append(i['thumbURL'])
    index = 0
    for img_path in img_path_list:
        img_data = requests.get(url=img_path, headers=header).content
        print(img_path)
        with open('./picture/image_%s_' % index + datetime.strftime(datetime.now(), '%Y-%m-%d %H-%M') + '.jpg', 'wb') as fb:
            fb.write(img_data)
        print("Downloading image %s" % index)
        index = index + 1
    pn += 30  # advance by one full batch of rn=30 results
'''
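The useful core of version 5 is that Baidu's acjson endpoint returns JSON, so there is no HTML scraping at all, only paging with pn/rn. A trimmed sketch of one page of that request; the reduced parameter set is an assumption, and in practice the endpoint may demand more fields or a cookie:

import requests

def baidu_thumbs(keyword, pn=1, rn=30):
    # Minimal parameter set; the full script sends many more fields.
    params = {
        'tn': 'resultjson_com', 'ipn': 'rj', 'fp': 'result',
        'queryWord': keyword, 'word': keyword,
        'ie': 'utf-8', 'oe': 'utf-8',
        'pn': pn,   # index of the first result in this batch
        'rn': rn,   # batch size
    }
    headers = {'User-Agent': 'Mozilla/5.0'}
    data = requests.get('https://image.baidu.com/search/acjson',
                        headers=headers, params=params).json()
    # each entry carries a 'thumbURL' field; trailing entries may be empty
    return [d['thumbURL'] for d in data.get('data', []) if d.get('thumbURL')]

# for pn in range(1, 91, 30):  # three batches of 30
#     print(baidu_thumbs('ppt目录', pn))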
def web(url):
    opt = FirefoxOptions()  # or ChromeOptions() for Chrome
    opt.headless = False    # show the browser window
    driver = webdriver.Firefox(options=opt)  # instantiate the browser (webdriver.Chrome for Chrome)
    driver.set_window_size(400, 900)
    driver.get(url)  # load the page
    source = driver.page_source  # page source after JavaScript has run
    driver.quit()    # release the browser
    data = BeautifulSoup(source, 'html.parser')  # parse the page content
    return data
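Since web() already returns a BeautifulSoup tree, the image links can also be read from the <img> tags directly instead of regexing over repr() of the source. A sketch of that alternative; the tag and attribute names are standard HTML, not anything specific to the target site:

def img_links(soup, scheme='https://'):
    # collect src attributes that look like absolute image URLs
    links = []
    for img in soup.find_all('img'):
        src = img.get('src') or img.get('data-src')  # some sites lazy-load via data-src
        if src and src.startswith(scheme):
            links.append(src)
    return links

# soup = web('https://example.com')
# print(img_links(soup))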
def load_page(url):
    # headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
    # Chrome/74.0.3729.131 Safari/537.36'}
    # Paste a fresh cookie from your own logged-in browser session here,
    # without the leading 'Cookie: ' name (session cookies expire quickly).
    cookie = '<your cookie value>'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'Cookie': cookie
    }
    response = requests.get(url, headers=headers)
    data = response.content
    return data
if __name__ == "__main__":
    # Target page
    # url = 'https://bing.ioliu.cn/'  # or 'https://bing.ioliu.cn/ranking'
    url = 'https://mbd.baidu.com/newspage/data/landingsuper?context=%7B%22nid%22%3A%22news_10665880783857073972%22%7D&n_type=-1&p_from=-1'
    html = web(url)
    print(html)
    # Image-address patterns tried:
    # regx = r'http://[\S]*?1920x1080\.jpg\?imageslim'  # Bing wallpapers
    # sample matches:
    # 'https://pics6.baidu.com/feed/3b87e950352ac65ca4b6ab1b1633751892138a77.jpeg?token=46bcb0c0a40b6dd6f8f57420e76b3f14'
    # 'https://pics4.baidu.com/feed/5bafa40f4bfbfbed7b46ef700926313faec31f86.png?token=4b3d3cbdea69f32693f8c3b5d8b4e933'
    regx = r'https://[\S]*?\.jpeg|https://[\S]*?\.jpg|https://[\S]*?\.png'
    pattern = re.compile(regx)  # compile the expression into a pattern
    print(pattern)
    listurl = re.findall(pattern, repr(html))  # match image links in the page source
    print(listurl)
    index = 0
    for url in listurl:
        print(url)
        image = load_page(url)  # download the image behind each link
        # a timestamp in the name avoids overwriting files across runs
        with open('./picture/image_%s_' % index + datetime.strftime(datetime.now(), '%H-%M-%S') + '.jpg', 'wb') as fb:
            fb.write(image)
        print("Downloading image %s" % index)
        index = index + 1
    # the with-statement closes the file; no explicit fb.close() is needed
Another example:
# Crawl Douban's movie Top 250
# Title, rating, number of ratings
# Difficulty: ⭐⭐
# 1. Imports
# import re          import the re (regular expression) package
# import requests    import the requests package
#
# def aa():
#     rest = requests.get('https://movie.douban.com/top250')   2. Fetch the page
#     s = rest.content.decode()                                3. Get the page source
#     4. Write regexes to pull out the needed fields
#     ss = re.findall(r'<span class="title">(.*)</span>', s)                                 titles
#     ss1 = re.findall(r'<span class="rating_num" property="v:average">(.*)</span>', s)      ratings
#     ss2 = re.findall(r'<span>(\d*)人评价</span>', s)                                        rating counts
#     5. Filter out the irrelevant entries (alternate-title spans)
#     b = []
#     for i in range(len(ss)):
#         aa = re.findall(r' .*', ss[i])
#         if aa == []:
#             b.append(ss[i])   keep only the real titles
#     6. Write to a file
#     for i in range(len(b)):
#         print(b[i], ss1[i], ss2[i])
#         with open(r"C:\Users\陈嘉玉\Desktop\ex.txt", 'a+') as ff:
#             ff.writelines(b[i] + ' ' + ss1[i] + ' ' + ss2[i] + '\n')
# 7. Catch exceptions
# try:
#     aa()
#     print("Crawl finished")
# except Exception as c:
#     print("Crawl failed, error: " + str(c))
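As commented pseudocode the flow above is hard to test, so here is a runnable version of the same steps. A browser-like User-Agent is added because Douban tends to reject the default one, and the alternate-title filter keys on the '&nbsp;/' prefix those spans carry in the page source; both details are assumptions layered on the outline above:

import re
import requests

def douban_top(url='https://movie.douban.com/top250'):
    headers = {'User-Agent': 'Mozilla/5.0'}  # Douban rejects the default UA
    s = requests.get(url, headers=headers).content.decode()
    titles = re.findall(r'<span class="title">(.*)</span>', s)
    ratings = re.findall(r'<span class="rating_num" property="v:average">(.*)</span>', s)
    counts = re.findall(r'<span>(\d*)人评价</span>', s)
    # keep only primary titles; alternate titles are prefixed with '&nbsp;/'
    titles = [t for t in titles if not re.findall(r'&nbsp;.*', t)]
    for row in zip(titles, ratings, counts):
        print(*row)

try:
    douban_top()
    print("Crawl finished")
except Exception as c:
    print("Crawl failed, error: " + str(c))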
Latest debugged version:
# coding:utf-8
# Import the requests package and the regular-expression package re
import requests
import re
import urllib.request  # Python 2 used urllib2 instead
import urllib
import os
from datetime import datetime
# Scratch tests kept from earlier drafts:
# htp='http://pic.weather.com.cn/images/cn/photo/2021/09/16/20210916100549696B7FD68719D9208B6E945E3B1E260B_xm.jpg'
# htp='https://bing.ioliu.cn/photo/FreshSalt_ZH-CN12818759319?force=ranking_1'
# rs=re.findall('.+?=ranking_1',htp)
# print(rs)
# html = getHtml("http://bbs.feng.com/read-htm-tid-10616371.html")  # feng.com phone wallpapers
# url = "https://bing.ioliu.cn/ranking"  # Bing wallpaper collection
# url = "http://tieba.baidu.com/p/2460150866"  # images from a Tieba post
# html = load_page(url)
# get_image(html)
def get_image_a(html):
    'Note: find the image path by viewing the page source in the browser'
    # Use parentheses so the group is captured
    # regx = r'src="(.+?\.jpg)" pic_ext'        # images from a Tieba post
    # Bing page images
    regx = r'data-progressive.*?src="(.*?)"'    # Bing wallpaper collection
    # regx = r'http://[\S]*?\.jpg\?imageslim'   # Bing wallpaper collection (direct links)
    # regx = r'<h3>(.*?)</h3>'
    # regx = r'src="(.*?\.jpg)"'
    # sample: http://h2.ioliu.cn/bing/FalklandRockhoppers_ZH-CN5370686595_640x480.jpg?imageslim
    # reg = r'zoomfile="(.+?\.jpg)" '           # feng.com phone wallpapers
    # Filter with the regular expression
    pattern = re.compile(regx)
    get_images = re.findall(pattern, repr(html))  # match image links in the page
    try:
        # Save the images
        path = 'E:\\Temporary\\new'  # directory to save into
        if not os.path.isdir(path):
            os.makedirs(path)        # create the directory if it does not exist
        x = 0
        for imgurl in get_images:
            image = load_page_b(imgurl)  # download the image behind each link
            # Save the downloaded image into the target folder
            with open(os.path.join(path, 'image_%s.jpg' % x), 'wb') as fb:
                fb.write(image)
            print("Downloading image %s" % x)
            x = x + 1
        print("Download finished!")
    except Exception as e:
        print("Download error:", e)
def load_page_b(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
               Chrome/74.0.3729.131 Safari/537.36'}
    response = requests.get(url, headers=headers)
    data = response.content
    return data

def bing_ranking_demo():
    # Rebuild direct download links from the Bing ranking page
    url = 'https://bing.ioliu.cn/ranking'
    html = load_page_b(url)
    # regx = r'class="mark".*?href=.*?=ranking_1'   # matches the photo-page links
    regx = r'http://[\S]*?\.jpg\?imageslim'         # Bing wallpaper collection (direct links)
    pattern = re.compile(regx)
    print(pattern)
    listurl = re.findall(pattern, repr(html))  # match links in the page
    print(listurl)
    # photo-page form: https://bing.ioliu.cn/photo/FreshSalt_ZH-CN12818759319?force=ranking_1
    res = []
    index = 0
    for url in listurl:
        # pull the /photo/...=ranking_1 fragment and graft it onto the image host
        a = re.findall(r'\/photo\/.*?=ranking_1', url)
        res.append('http://h2.ioliu.cn/bing' + a[0])
    print(res[2])
    # sample image form: http://h2.ioliu.cn/bing/FalklandRockhoppers_ZH-CN5370686595_640x480.jpg?imageslim
    for url in res:
        print(url)
        image = load_page_b(url)  # download the image behind each link
        with open('./picture/image_%s.jpg' % index, 'wb') as fb:
            fb.write(image)
        print("Downloading image %s" % index)
        index = index + 1
def load_page_a(url):
    # Paste a fresh cookie from your own logged-in browser session here,
    # without the leading 'Cookie: ' name (session cookies expire quickly).
    cookie = '<your Baidu image-search cookie>'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'Cookie': cookie
    }
    response = requests.get(url, headers=headers)
    data = response.content
    return data

def baidu_wallpaper_demo():
    # Pull image URLs out of a Baidu image-search results page
    # url = 'https://bing.ioliu.cn/'  # or 'https://bing.ioliu.cn/ranking'
    url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&word=%E5%A3%81%E7%BA%B8'
    html = load_page_a(url)
    print(html)
    # Image-address patterns tried:
    # regx = r'http://[\S]*?1920x1080\.jpg\?imageslim'  # Bing wallpapers
    # sample matches:
    # 'https://pics6.baidu.com/feed/3b87e950352ac65ca4b6ab1b1633751892138a77.jpeg?token=46bcb0c0a40b6dd6f8f57420e76b3f14'
    # 'https://pics4.baidu.com/feed/5bafa40f4bfbfbed7b46ef700926313faec31f86.png?token=4b3d3cbdea69f32693f8c3b5d8b4e933'
    regx = r'https://[\S]*?\.jpeg|https://[\S]*?\.jpg|https://[\S]*?\.png'
    pattern = re.compile(regx)
    print(pattern)
    listurl = re.findall(pattern, repr(html))  # match image links in the page
    print(listurl)
    index = 0
    for url in listurl:
        image = load_page_a(url)  # download the image behind each link
        with open('./picture/image_%s_' % index + datetime.strftime(datetime.now(), '%H-%M-%S') + '.jpg', 'wb') as fb:
            fb.write(image)
        print("Downloading image %s" % index)
        index = index + 1
def baidu():
    page = input('How many pages to crawl: ')
    page = int(page) + 1
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }
    pn = 1
    keyword = "欧菲光"  # search keyword
    # pn is the index of the first image to fetch; Baidu loads 30 images per scroll by default
    for m in range(1, page):
        url = 'https://image.baidu.com/search/acjson?'
        param = {
            'tn': 'resultjson_com',
            'logid': '8846269338939606587',
            'ipn': 'rj',
            'ct': '201326592',
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': '2',
            'lm': '-1',
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': '-1',
            'z': '',
            'ic': '',
            'hd': '',
            'latest': '',
            'copyright': '',
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': '0',
            'istype': '2',
            'qc': '',
            'nc': '1',
            'fr': '',
            'expermode': '',
            'force': '',
            'cg': 'girl',
            'pn': pn,  # index of the first image in this batch
            'rn': '30',
            'gsm': '1e',
        }
        page_text = requests.get(url=url, headers=header, params=param)
        page_text.encoding = 'utf-8'
        page_text = page_text.json()
        info_list = page_text['data']
        del info_list[-1]  # the last entry is an empty placeholder
        img_path_list = []
        for i in info_list:
            img_path_list.append(i['thumbURL'])
        index = 0
        for img_path in img_path_list:
            img_data = requests.get(url=img_path, headers=header).content
            print(img_path)
            with open('./picture/image_%s_' % index + datetime.strftime(datetime.now(), '%H-%M-%S') + '.jpg', 'wb') as fb:
                fb.write(img_data)
            print("Downloading image %s" % index)
            index = index + 1
        pn += 30  # advance by one full batch of rn=30 results
# Custom function that saves the image behind a URL
def get_image(url):
    # Note: header values must not repeat the header name
    # ('Mozilla/5.0 ...', not 'user-agent: Mozilla/5.0 ...').
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.95 Safari/537.36',
        'Cookie': 'PHPSESSID=nlt27c16iqlrt17i9gl419kbou; Hm_lvt_50c89d9a9ca9d1cf2481053d73c36002=1686318725; keyword=%E5%A4%A7%E7%A7%A6%E5%B8%9D%E5%9B%BD; Hm_lpvt_50c89d9a9ca9d1cf2481053d73c36002=1686318800'
    }
    # Earlier drafts fetched the page first and regex-matched image links in it:
    # data = requests.get(url, headers=headers).content
    # get_images = re.findall(re.compile(r'src="(.*?\.jpg)"'), repr(data))
    num = 1
    img = url  # here the URL itself is treated as the image address
    img_path = requests.get(url=img, headers=headers).content
    print(img_path)
    image = img_path  # downloaded bytes
    # Save the downloaded image into the target folder
    with open('./picture/image_%s.jpg' % num, 'wb') as fb:
        fb.write(image)
    print("Downloading image %s" % num)
    # num = num + 1
    # print("Download finished!")
if __name__ == '__main__':
    url = 'https://www.makefont.com/index/font/preview/font_id/1773.html'
    get_image(url)
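One caveat with the final get_image(): the URL passed in __main__ is an HTML page, so its bytes would be written into a .jpg file as-is. A small guard on the response's Content-Type catches that; this helper is a suggested addition, not part of the original script:

import requests

def save_if_image(url, out_path, headers=None):
    resp = requests.get(url, headers=headers or {'User-Agent': 'Mozilla/5.0'})
    # only write the file when the server says it served an image
    if resp.headers.get('Content-Type', '').startswith('image/'):
        with open(out_path, 'wb') as fb:
            fb.write(resp.content)
        return True
    print('Not an image:', resp.headers.get('Content-Type'))
    return False

# save_if_image('https://www.makefont.com/index/font/preview/font_id/1773.html', './picture/image_1.jpg')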