Baidu Image Crawler (Python): How to Crawl Baidu Images?

In this post I want to show how to crawl Baidu Images. This is also the crawler behind 搜搜gif (click through for the online GIF maker). The overall crawler framework is much the same as my earlier ones, but it also involves some image handling that cost me quite a bit of time, so please read it carefully; life as a programmer is not easy.

The code is attached below (note that it targets Python 2: it relies on urllib2, Queue and MySQLdb):

PS: If you don't know Python yet, go brush up on the basics first.

#coding:utf-8
"""
Created on 2015-9-17

@author: huangxie
"""
import time, math, os, re, urllib, urllib2, cookielib
from bs4 import BeautifulSoup
import time
import re
import uuid
import json
from threading import Thread
from Queue import Queue
import MySQLdb as mdb
import sys
import threading
import utils
import imitate_browser
from MySQLdb.constants.REFRESH import STATUS

reload(sys)
sys.setdefaultencoding('utf-8')

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

proxy = {u'http': u'222.39.64.13:8118'}

TOP_URL = "http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn}"
KEYWORD_URL = "https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd}"

"""

i_headers={'User-Agent':'Mozilla/5.0(WindowsNT6.1)AppleWebKit/537.11(KHTML,likeGecko)Chrome/23.0.1271.64Safari/537.11',

'Accept':'json;q=0.9,*/*;q=0.8',

'Accept-Charset':'utf-8;q=0.7,*;q=0.3',

'Accept-Encoding':'gzip',

'Connection':'close',

'Referer':None#注意如果依然不能抓取的话,这里可以设置抓取网站的host

}

"""

i_headers={'User-Agent':'Mozilla/5.0(WindowsNT6.1)AppleWebKit/537.36(KHTML,likeGecko)Chrome/31.0.1650.48'}

defGetDateString():

x=time.localtime(time.time())

foldername=str(x.__getattribute__("tm_year"))+"-"+str(x.__getattribute__("tm_mon"))+"-"+str(x.__getattribute__("tm_mday"))

returnfoldername

class BaiduImage(threading.Thread):

    def __init__(self):
        Thread.__init__(self)
        self.browser = imitate_browser.BrowserBase()
        self.chance = 0
        self.chance1 = 0
        self.request_queue = Queue()
        self.wait_ana_queue = Queue()
        #self.key_word_queue.put((("动态图", 0, 24)))
        self.count = 0
        self.mutex = threading.RLock()  # re-entrant lock, so the same thread can re-acquire a lock it already holds
        self.commit_count = 0
        self.ID = 500
        self.next_proxy_set = set()
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'sosogif', charset='utf8')
        self.dbconn.autocommit(False)
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')

    """
    def run(self):
        while True:
            self.get_pic()
    """

    def work(self, item):
        print "start thread", item
        while True:  # wait once more than MAX_REQUEST requests are pending
            self.get_pic()
            self.prepare_request()

    def format_keyword_url(self, keyword):
        return KEYWORD_URL.format(wd=keyword).encode('utf-8')

    def generateSeed(self, url):
        html = self.browser.openurl(url).read()
        if html:
            try:
                soup = BeautifulSoup(html)
                trs = soup.find('p', id='rs').find('table').find_all('tr')  # get all rows
                for tr in trs:
                    ths = tr.find_all('th')
                    for th in ths:
                        a = th.find_all('a')[0]
                        keyword = a.text.strip()
                        if "动态图" in keyword or "gif" in keyword:
                            print "keyword", keyword
                            self.dbcurr.execute('select id from info where word=%s', (keyword,))
                            y = self.dbcurr.fetchone()
                            if not y:
                                self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0)', (keyword,))
                                self.dbconn.commit()
            except:
                pass

    def prepare_request(self):
        self.lock()
        self.dbcurr.execute('select * from info where status=0')
        result = self.dbcurr.fetchone()
        if result:
            id, word, status, page_num, left_num, how_many = result
            self.request_queue.put((id, word, page_num))
            if page_num == 0 and left_num == 0 and how_many == 0:
                url = self.format_keyword_url(word)
                self.generateSeed(url)
                html = ""
                try:
                    url = self.format_top_url(word, page_num, 24)
                    html = self.browser.openurl(url).read()
                except Exception as err:
                    print "err", err
                    #pass
                if html != "":
                    how_many = self.how_many(html)
                    print "how_many", how_many
                    if how_many == None:
                        how_many = 0
                    t = math.ceil(how_many / 24 * 100)  # only the first 1/100 is needed
                    num = int(t)
                    for i in xrange(0, num - 1):
                        self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s)', (word, 0, i * 24, num - i, how_many))
            self.dbcurr.execute('update info SET status=1 WHERE id=%s', (id,))  # mark as visited
            self.dbconn.commit()
        self.unlock()

    def start_work(self, req_max):
        for item in xrange(req_max):
            t = threading.Thread(target=self.work, args=(item,))
            t.setDaemon(True)
            t.start()

    def lock(self):  # acquire the lock
        self.mutex.acquire()

    def unlock(self):  # release the lock
        self.mutex.release()

    def get_para(self, url, key):
        values = url.split('?')[-1]
        for key_value in values.split('&'):
            value = key_value.split('=')
            if value[0] == key:
                return value[1]
        return None

    def makeDateFolder(self, par, child):
        #self.lock()
        if os.path.isdir(par):
            path = par + '//' + GetDateString()
            newFolderName = path + '//' + child
            if not os.path.isdir(path):
                os.mkdir(path)
            if not os.path.isdir(newFolderName):
                os.mkdir(newFolderName)
            return newFolderName
        else:
            return par
        #self.unlock()

    def parse_json(self, data):
        ipdata = json.loads(data)
        try:
            if ipdata['imgs']:
                for n in ipdata['imgs']:  # each image entry in the response
                    if n['objURL']:
                        try:
                            proxy_support = urllib2.ProxyHandler(proxy)
                            opener = urllib2.build_opener(proxy_support)
                            urllib2.install_opener(opener)
                            #print "proxy", proxy
                            self.lock()
                            self.dbcurr.execute('select ID from pic_info where objURL=%s', (n['objURL'],))
                            y = self.dbcurr.fetchone()
                            #print "y=", y
                            if y:
                                print "database exist"
                                self.unlock()  # unlock before continue
                                continue
                            else:
                                real_extension = utils.get_extension(n['objURL'])
                                req = urllib2.Request(n['objURL'], headers=i_headers)
                                resp = urllib2.urlopen(req, None, 5)
                                dataimg = resp.read()
                                name = str(uuid.uuid1())
                                filename = ""
                                if len(real_extension) > 4:
                                    real_extension = ".gif"
                                real_extension = real_extension.lower()
                                if real_extension == ".gif":
                                    filename = self.makeDateFolder("E://sosogif", "d" + str(self.count % 60)) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
                                    self.count += 1
                                else:
                                    filename = self.makeDateFolder("E://sosogif", "o" + str(self.count % 20)) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
                                    self.count += 1
                                """
                                name = str(uuid.uuid1())
                                filename = ""
                                if len(real_extension) > 4:
                                    real_extension = ".gif"
                                filename = self.makeDateFolder("E://sosogif", "d" + str(self.count % 60)) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
                                self.count += 1
                                """
                                try:
                                    if not os.path.exists(filename):
                                        file_object = open(filename, 'w+b')
                                        file_object.write(dataimg)
                                        file_object.close()
                                        self.anaylis_info(n, filename, real_extension)  # save metadata to the database
                                    else:
                                        print "file exist"
                                except IOError, e1:
                                    print "e1=", e1
                                    pass
                                self.unlock()
                        except IOError, e2:
                            #print "e2=", e2
                            pass
                            self.chance1 += 1
        except Exception as parse_error:
            print "parse_error", parse_error
            pass

    def title_dealwith(self, title):
        #print "title", title
        # strip the <strong>...</strong> highlight tags Baidu wraps around the keyword in fromPageTitle
        a = title.find("<strong>")
        temp1 = title[0:a]
        b = title.find("</strong>")
        temp2 = title[a + 8:b]
        temp3 = title[b + 9:len(title)]
        return (temp1 + temp2 + temp3).strip()

    def anaylis_info(self, n, filename, real_extension):
        print "success."
        #if self.wait_ana_queue.qsize() != 0:
        #    n, filename, real_extension = self.wait.ana_queue.get()
        #self.lock()
        objURL = n['objURL']                # image URL
        fromURLHost = n['fromURLHost']      # source site
        width = n['width']                  # width
        height = n['height']                # height
        di = n['di']                        # unique identifier
        type = n['type']                    # format
        fromPageTitle = n['fromPageTitle']  # title of the source page
        keyword = self.title_dealwith(fromPageTitle)
        cs = n['cs']  # unknown
        os = n['os']  # unknown
        temp = time.time()
        x = time.localtime(float(temp))
        acTime = time.strftime("%Y-%m-%d %H:%M:%S", x)  # crawl time
        self.dbcurr.execute('select ID from pic_info where cs=%s', (cs,))
        y = self.dbcurr.fetchone()
        if not y:
            print 'add pic', filename
            self.commit_count += 1
            self.dbcurr.execute('INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', (objURL, fromURLHost, width, height, di, type, keyword, cs, os, acTime, filename, real_extension))
            if self.commit_count == 10:
                self.dbconn.commit()
                self.commit_count = 0
        #self.unlock()

    def format_top_url(self, word, pn, rn):
        url = TOP_URL.format(word=word, pn=pn, rn=rn).encode('utf-8')
        return url

    def how_many(self, data):
        try:
            ipdata = json.loads(data)
            if ipdata['displayNum'] > 0:
                how_many = ipdata['displayNum']
                return int(how_many)
            else:
                return 0
        except Exception as e:
            pass

    def get_pic(self):
        """
        word = "gif"
        pn = 0
        rn = 24
        if self.key_word_queue.qsize() != 0:
            word, pn, rn = self.key_word_queue.get()
        url = self.format_top_url(word, pn, rn)
        global proxy
        if url:
            try:
                html = ""
                try:
                    req = urllib2.Request(url, headers=i_headers)
                    response = urllib2.urlopen(req, None, 5)
                    #print "url", url
                    html = self.browser.openurl(url).read()
                except Exception as err:
                    print "err", err
                    #pass
                if html:
                    how_many = self.how_many(html)
                    #how_many = 10000
                    print "how_many", how_many
                    word = self.get_para(url, "word")
                    rn = int(self.get_para(url, "rn"))
                    t = math.ceil(how_many / rn)
                    num = int(t)
                    for item in xrange(0, num - 1):
        """
        try:
            global proxy
            print "size of queue", self.request_queue.qsize()
            if self.request_queue.qsize() != 0:
                id, word, page_num = self.request_queue.get()
                u = self.format_top_url(word, page_num, 24)
                self.lock()
                self.dbcurr.execute('update info SET status=1 WHERE id=%s', (id,))
                self.dbconn.commit()
                if self.chance > 0 or self.chance1 > 1:  # switch the proxy if either counter signals trouble
                    if self.ID % 100 == 0:
                        self.dbcurr.execute("select count(*) from proxy")
                        for r in self.dbcurr:
                            count = r[0]
                        if self.ID > count:
                            self.ID = 50
                    self.dbcurr.execute("select * from proxy where ID=%s", (self.ID,))
                    results = self.dbcurr.fetchall()
                    for r in results:
                        protocol = r[1]
                        ip = r[2]
                        port = r[3]
                        pro = (protocol, ip + ":" + port)
                        if pro not in self.next_proxy_set:
                            self.next_proxy_set.add(pro)
                    self.chance = 0
                    self.chance1 = 0
                    self.ID += 1
                self.unlock()
                proxy_support = urllib2.ProxyHandler(proxy)
                opener = urllib2.build_opener(proxy_support)
                urllib2.install_opener(opener)
                html = ""
                try:
                    req = urllib2.Request(u, headers=i_headers)
                    #print "u=", u
                    response = urllib2.urlopen(req, None, 5)
                    html = response.read()
                    if html:
                        #print "html", type(html)
                        self.parse_json(html)
                except Exception as ex1:
                    #print "error=", ex1
                    pass
                    self.chance += 1
                    if self.chance > 0 or self.chance1 > 1:
                        if len(self.next_proxy_set) > 0:
                            protocol, socket = self.next_proxy_set.pop()
                            proxy = {protocol: socket}
                            print "change proxy finished"
        except Exception as e:
            print "error1", e
            pass

if __name__ == '__main__':
    app = BaiduImage()
    app.start_work(80)
    #app.generateSeed()
    while 1:
        pass
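The script imports two helper modules, utils and imitate_browser, that are not shown in this post. As a rough idea of what they need to provide, here is a minimal sketch of my own assumed versions (not the originals): utils.get_extension returns the file extension of an image URL, and imitate_browser.BrowserBase exposes an openurl method that returns a file-like response opened with browser-like headers, so callers can do .read() on it.

# utils.py -- minimal assumed version: only get_extension() is used by the crawler
import urlparse, os

def get_extension(url):
    """Return the file extension of an image URL, e.g. '.gif'."""
    path = urlparse.urlparse(url).path   # drop query string and fragment
    ext = os.path.splitext(path)[1]
    return ext if ext else ".jpg"        # assumed fallback when the URL has no extension

# imitate_browser.py -- minimal assumed version: open a URL while pretending to be a browser
import urllib2

class BrowserBase(object):
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}

    def openurl(self, url, timeout=10):
        """Return a file-like response object."""
        req = urllib2.Request(url, headers=self.headers)
        return urllib2.urlopen(req, None, timeout)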

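For reference, the crawler assumes a MySQL database named sosogif with three tables: info (the keyword/page work queue), pic_info (metadata for every downloaded image) and proxy (a pool of HTTP proxies). The original schema is not included in the post; the DDL below is only my guess, reconstructed from the queries in the script, and is executed through MySQLdb so the whole example stays in Python.

# create_schema.py -- assumed schema for the 'sosogif' database, inferred from the queries above
import MySQLdb as mdb

SCHEMA = [
    """CREATE TABLE IF NOT EXISTS info (
        id INT AUTO_INCREMENT PRIMARY KEY,
        word VARCHAR(255),   -- search keyword
        status INT,          -- 0 = pending, 1 = visited
        page_num INT,        -- pn offset passed to TOP_URL
        left_num INT,
        how_many INT         -- displayNum reported by Baidu
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS pic_info (
        ID INT AUTO_INCREMENT PRIMARY KEY,
        objURL VARCHAR(1024), fromURLHost VARCHAR(255),
        width INT, height INT,
        di VARCHAR(64), type VARCHAR(16), keyword VARCHAR(255),
        cs VARCHAR(64), os VARCHAR(64),
        acTime DATETIME, filename VARCHAR(512), real_extension VARCHAR(16)
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS proxy (
        ID INT AUTO_INCREMENT PRIMARY KEY,
        protocol VARCHAR(16), ip VARCHAR(64), port VARCHAR(8)
    ) DEFAULT CHARSET=utf8""",
]

conn = mdb.connect('127.0.0.1', 'root', 'root', charset='utf8')
curr = conn.cursor()
curr.execute("CREATE DATABASE IF NOT EXISTS sosogif DEFAULT CHARSET utf8")
curr.execute("USE sosogif")
for ddl in SCHEMA:
    curr.execute(ddl)
conn.commit()
conn.close()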