一、python请求要抓取的url页面
【%E7%BE%8E%E5%A5%B3】经过 URL 解码(urldecode)之后就是【美女】。可以看到,这是百度图片的 API 接口:http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=%E7%BE%8E%E5%A5%B3&cg=girl&rn=60&pn=60
pn 表示当前页的起始偏移量,每翻一页递增 60,也就是说后续各页依次为 pn=120、180、……
#!/usr/bin/python
# -*- coding:utf-8 -*-
import httplib
import urllib
import json
import urllib2
import re
import os
class BaiduImage(object):
    """Request successive Baidu image-search result pages and print each HTTP status.

    ``self.page`` is the value sent as the ``pn`` query parameter; it starts
    at 60 and grows by 60 after every request.
    """

    def __init__(self):
        """Print the start-up banner and reset the page offset."""
        super(BaiduImage, self).__init__()
        # Parenthesised form is valid for both the Python 2 print statement
        # and the Python 3 print function.
        print(u'图片获取中,CTRL+C 退出程序...')
        self.page = 60  # current page offset (``pn`` query parameter)

    def request(self):
        """Fetch result pages forever, printing the status code of each response.

        The loop has no exit condition; it runs until the process is
        interrupted (CTRL+C) or a network error propagates to the caller.
        """
        # Loop-invariant, so built once.
        # NOTE(review): 'test/html' looks like a typo for 'text/html'; it is
        # left unchanged so the request sent to the server stays the same.
        headers = {'User-Agent' :'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0','Content-type': 'test/html'}
        while True:
            request_url = '/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=%E7%BE%8E%E5%A5%B3&cg=girl&rn=60&pn=' + str(self.page)
            conn = httplib.HTTPConnection('image.baidu.com')
            try:
                conn.request('GET', request_url, headers=headers)
                r = conn.getresponse()
                print(r.status)
            finally:
                # One connection is opened per page; close it explicitly
                # instead of leaving the socket to the garbage collector.
                conn.close()
            self.page += 60
if __name__ == '__main__':
    # Script entry point: build the crawler, then start requesting pages.
    crawler = BaiduImage()
    crawler.request()
#!/usr/bin/python
# -*- coding:utf-8 -*-
import httplib
import urllib
import json
import urllib2
import re
import os
class BaiduImage(object):
    """Page through Baidu's image-search JSON API and save the images under ``./image``.

    ``self.page`` is the value sent as the ``pn`` query parameter; it starts
    at 60 and grows by 60 after every page.
    """

    # Captures the base name of a ``.jpg`` URL. Compiled once for the class
    # instead of once per downloaded image.
    _JPG_NAME = re.compile(r'.*/(.*?)\.jpg', re.S)

    def __init__(self):
        """Print the start-up banner, reset the offset and make sure ``./image`` exists."""
        super(BaiduImage, self).__init__()
        # Parenthesised form is valid for both the Python 2 print statement
        # and the Python 3 print function.
        print(u'图片获取中,CTRL+C 退出程序...')
        self.page = 60  # current page offset (``pn`` query parameter)
        if not os.path.exists(r'./image'):
            os.mkdir(r'./image')

    def request(self):
        """Fetch result pages one after another and download the images of each.

        The crawl stops when a page-level error occurs (the exception is
        printed, not raised) or when a page contains no images. Responses
        with a status other than 200 are skipped and the next page is tried.
        """
        # Loop-invariant, so built once.
        # NOTE(review): 'test/html' looks like a typo for 'text/html'; it is
        # left unchanged so the request sent to the server stays the same.
        headers = {'User-Agent' :'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0','Content-type': 'test/html'}
        try:
            while True:
                request_url = '/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=%E7%BE%8E%E5%A5%B3&cg=girl&rn=60&pn=' + str(self.page)
                conn = httplib.HTTPConnection('image.baidu.com')
                try:
                    conn.request('GET', request_url, headers=headers)
                    r = conn.getresponse()
                    # Read the body before the connection is closed.
                    body = r.read() if r.status == 200 else None
                finally:
                    # Close every connection, not only the last one, and only
                    # after it was actually created.
                    conn.close()
                if body is not None:
                    # Decode with the default codec, dropping bytes it cannot decode.
                    decode = json.loads(unicode(body, errors='ignore'))
                    imgs = decode.get('imgs')
                    if not imgs:
                        # Nothing left to download: stop instead of
                        # requesting empty pages forever.
                        break
                    self.download(imgs)
                self.page += 60
        except Exception as e:
            # Top-level boundary of the crawl: report the failure and stop.
            print(e)

    @classmethod
    def _file_name(cls, url):
        """Return ``image/<name>.jpg`` for *url*, or None when *url* has no ``.jpg`` name."""
        item = cls._JPG_NAME.findall(url)
        if not item:
            return None
        return 'image/' + item[0] + '.jpg'

    def download(self, data):
        """Download the ``objURL`` of every entry in *data* into ``image/``.

        data: iterable of dicts taken from the ``imgs`` list of the API reply.
        Entries without an ``objURL`` or whose URL is not a ``.jpg`` are
        skipped. A failure on one image is printed and the remaining images
        are still processed.
        """
        for d in data:
            # d['thumbURL'] is the thumbnail (size 200) and d['hoverURL'] the
            # size-360 version; 'objURL' is the one downloaded here.
            url = d.get('objURL')
            file_name = self._file_name(url) if url else None
            if file_name is None:
                # Decided before any network access, so unusable entries
                # cost nothing and no longer abort the crawl.
                continue
            try:
                response = urllib2.urlopen(url)
                try:
                    content = response.read()
                finally:
                    response.close()
                with open(file_name, 'wb') as f:
                    f.write(content)
            except (IOError, ValueError, httplib.HTTPException) as e:
                # Best effort per image: report the failure and go on.
                print(e)
if __name__ == '__main__':
    # Script entry point: build the crawler, then start the download loop.
    crawler = BaiduImage()
    crawler.request()