# 手写的动态网页图片爬取脚本。遗留疑问:按理三页应抓到 90 张图片,实际数量却不足,原因尚未弄清。
import re
import requests
# AJAX endpoint that Baidu image search polls for successive result pages.
url = 'http://image.baidu.com/search/index'
# Request headers copied from a real browser session so the AJAX endpoint
# answers with JSON instead of an anti-bot page.  The Cookie/Referer values
# are session-specific and will eventually expire — refresh them from the
# browser's dev tools if requests start failing.
headers = {
'Accept':'text/plain, */*; q=0.01',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection':'keep-alive',
'Cookie':'BDqhfp=%E5%A7%9A%E6%98%8E%26%260-10-1undefined%26%260%26%261; BAIDUID=437781EFD36F19986A68243D59F201D4:FG=1; BDUSS=jhQT1Y5alZwckE2dmNocTFOOG9ydFdvaH5TWjFMdk5hc2llQVdkaC1jOFNmZUZiQVFBQUFBJCQAAAAAAAAAAAEAAAB1yiaEur0xNTgzNjkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABLwuVsS8LlbW; BIDUPSID=437781EFD36F19986A68243D59F201D4; PSTM=1539180099; UM_distinctid=167a777231447e-0e697c138ec3c1-335d4e7f-144000-167a7772315787; CNZZDATA1271442956=876411089-1545313908-%7C1545313908; Hm_lvt_f5df380d5163c1cc4823c8d33ec5fa49=1545316069; MCITY=-211%3A; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; userFrom=www.sogou.com; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; indexPageSugList=%5B%22%E5%A7%9A%E6%98%8E%22%2C%22%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1%22%5D; cleanHistoryStatus=0; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; uploadTime=1545902331022',
#'Host':'image.baidu.com',
'Referer':'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=000000&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%A7%9A%E6%98%8E&oq=%E5%A7%9A%E6%98%8E&rsp=-1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6776.400 QQBrowser/10.3.2577.400',
'X-Requested-With':'XMLHttpRequest'
}
def get_html(keyword, page_num):
    """Fetch `page_num` pages of Baidu image-search AJAX responses.

    Parameters
    ----------
    keyword : str
        Search term, passed as both ``queryWord`` and ``word``.
    page_num : int
        Number of 30-result pages to request.

    Returns
    -------
    list of str
        Raw response bodies (JSON-like text), one per page.
    """
    # Every query parameter except `pn` is identical from page to page,
    # so build the template once instead of re-creating the full dict
    # inside the loop (the original rebuilt all 30 keys per page).
    base_query = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': 0,
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': 1,
        'fr': '',
        'expermode': '',
        'force': '',
        'cg': 'star',
        'rn': 30,
        'gsm': '3c',
        '1526377465547': ''
    }
    htmls = []
    # NOTE(review): `pn` starts at 30 rather than 0, mirroring the site's
    # dynamic loading (the first 30 results ship with the static page) --
    # confirm that skipping offsets 0-29 is intended.
    for page in range(1, page_num + 1):
        query = dict(base_query, pn=30 * page)
        # timeout keeps a stalled connection from hanging the crawl forever
        response = requests.get(url, query, headers=headers, timeout=10)
        htmls.append(response.text)
    return htmls
def get_img(htmls, save_dir='C:/Users/Lenovo/Desktop/我的/'):
    """Extract thumbnail URLs from the fetched pages and save the images.

    Parameters
    ----------
    htmls : list of str
        Raw response texts as returned by ``get_html``.
    save_dir : str
        Directory (with trailing slash) where images are written as
        ``1.jpg``, ``2.jpg``, ...  Defaults to the original hard-coded path.
    """
    # Match the thumbURL field explicitly.  The original pattern
    # 'http://.*?\\.jpg' matched every URL field (thumbURL/middleURL/
    # hoverURL) and took [::3] to de-duplicate -- but results that lack a
    # field or use a non-.jpg URL break the stride-of-3 alignment, which is
    # why fewer than 30 images per page were recovered.
    reg = re.compile(r'"thumbURL"\s*:\s*"(https?://[^"]+?\.jpg)"')
    allimglist = []
    for html in htmls:
        allimglist += reg.findall(html)
    x = 1
    for img_url in allimglist:
        # Download once (the original fetched each image twice) and only
        # create the file when the body is non-empty, so invalid images no
        # longer leave stray empty .jpg files behind.
        content = requests.get(img_url, headers=headers, timeout=10).content
        if content:
            print('正在保存', img_url)
            with open('%s%s.jpg' % (save_dir, x), 'wb') as file:
                file.write(content)
            x += 1
        else:
            print('图片无效')
if __name__ == '__main__':
    # Interactive entry point: ask for a search keyword and a page count,
    # fetch the raw result pages, then save the images they reference.
    keyword = input('输入搜索内容')
    pages = int(input('输入图片页数'))
    get_img(get_html(keyword, pages))
# 以下为参考他人实现的动态网页爬取代码(非本人编写),保留用于对照。
# coding = utf-8
import urllib.request
import re
import requests
def getDatas(keyword, pages):
    """Query the Baidu image-search AJAX endpoint and collect result data.

    Parameters
    ----------
    keyword : str
        Search term, passed as both ``queryWord`` and ``word``.
    pages : int
        Number of 30-result pages to request.

    Returns
    -------
    list
        One entry per page: the parsed JSON response's ``data`` list
        (each element is a dict of image metadata; Baidu appends an
        empty ``{}`` sentinel at the end of each list).
    """
    url = 'https://image.baidu.com/search/index'
    # Only `pn` varies between pages; build the shared template once
    # instead of re-creating the full 30-key dict per iteration.
    base_query = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': 0,
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': 1,
        'fr': '',
        'rn': 30,
        'gsm': '1e',
        '1526377465547': ''
    }
    datalist = []
    # pn = 30, 60, ... : the first 30 results come with the static page,
    # so the AJAX offsets start at 30 (matching the other crawler above).
    for pn in range(30, 30 * pages + 30, 30):
        # timeout prevents a dead connection from hanging the whole run
        response = requests.get(url, params=dict(base_query, pn=pn), timeout=10)
        datalist.append(response.json().get('data'))
    return datalist
def getImg(datalist, path):
    """Download every thumbnail referenced in *datalist* into *path*.

    Parameters
    ----------
    datalist : list
        Pages of result dicts as returned by ``getDatas`` (each dict may
        carry a ``thumbURL`` key; Baidu's trailing ``{}`` sentinel has none).
    path : str
        Destination directory prefix (with trailing slash); files are
        named ``0.jpg``, ``1.jpg``, ...
    """
    x = 0
    for batch in datalist:          # one batch == one page's 'data' list
        for item in batch:          # renamed: original shadowed builtin `list`
            thumb = item.get('thumbURL')
            if thumb is not None:   # identity check, not `!= None`
                print('正在下载:%s' % thumb)
                urllib.request.urlretrieve(thumb, path + '%d.jpg' % x)
                x += 1
            else:
                print('图片链接不存在')
if __name__ == '__main__':
    # Interactive entry point: prompt for a keyword and page count, fetch
    # the result metadata, then download the thumbnails to a fixed folder.
    keyword = input("输入搜索内容:")
    pages = int(input('输入页数:'))
    getImg(getDatas(keyword, pages), 'C:/Users/Lenovo/Desktop/Pic/')