业余时间用 Python 写的百度贴吧爬虫程序,算是学习 Python 编程的一个练习。
本程序可以针对给定的贴吧链接,把帖子楼主的发言或者图片爬取出来,目前主要功能为下载所有楼主发的图片。爬取楼主发言的功能仅支持屏幕输出,没有保存到本地文件,有兴趣的朋友可以进行补充。仅供学习,转载请标明出处。
tieba_spider.py
#coding:utf-8
import urllib2,re,time,threading
import DownQueue
# Browser-like User-Agent string sent with every page request.
user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/37.0.2062.120 Safari/537.36'
)

# Tieba thread address in "only the thread starter" view; the page number
# is appended after ``pn=``.
url = 'http://tieba.baidu.com/p/3271638607?see_lz=1&pn='

# HTTP headers used for page requests.
header = {'User-Agent': user_agent}

# Shared downloader thread that receives the picture URLs.
g_worker = DownQueue.down()
class Tieba_Spider(threading.Thread):
    """Crawl a Baidu Tieba thread page by page.

    The crawler first requests the base URL to read the page count and the
    title, then requests ``url + page_number`` for every page. Depending on
    ``type`` it either prints the post texts (0) or pushes the picture URLs
    it finds to the module-level downloader ``g_worker`` (1).
    """

    # NOTE: the parameter name ``type`` shadows the builtin, but it is part
    # of the constructor's public signature and is therefore kept.
    def __init__(self, url, type=0):
        """Store the crawl settings.

        url  -- thread address; the page number is appended to it.
        type -- 0 (default): print post texts to the screen;
                1: queue the pictures for download.
        """
        threading.Thread.__init__(self)
        self.url = url
        self.type = type
        self.num = 0     # page count, filled in by get_info()
        self.title = ''  # thread title, filled in by get_info()

    def run(self):
        """Thread entry point: run the whole crawl."""
        self.start_spider()

    def _fetch(self, url):
        """Return the page at *url* decoded from GBK, or None on URLError.

        On failure the error code and/or reason are printed first.
        """
        try:
            req = urllib2.Request(url, headers=header)
            response = urllib2.urlopen(req)
            try:
                return response.read().decode('gbk')
            finally:
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, 'code'):
                print('Error code : %s' % (e.code,))
            if hasattr(e, 'reason'):
                print('Reason : %s' % (e.reason,))
            return None

    def get_info(self):
        """Request the base URL and record the page count and the title.

        If the request fails, ``num`` and ``title`` keep their previous
        values.
        """
        htm = self._fetch(self.url)
        if htm is None:
            return
        self.num = self.get_page_num(htm)
        print('It has %d page' % self.num)
        self.title = self.get_title(htm)
        print('It\'s title is %s' % self.title)

    def start_spider(self):
        """Crawl every page and then tell the downloader no more work follows."""
        try:
            self.get_info()
            for i in range(1, self.num + 1):
                print('start :  %d' % i)
                htm = self._fetch(self.url + str(i))
                if htm is None:
                    # The error was already reported; go on with the next page.
                    continue
                if self.type == 0:
                    self.page_deal(htm)
                elif self.type == 1:
                    self.down_pic(htm)
        finally:
            # Raise the stop flag even when the crawl aborts with an
            # unexpected exception; the downloader's exit condition
            # depends on this flag.
            g_worker.set_flag(True)

    def get_page_num(self, htm):
        """Return the page count found in *htm*, or 0 if there is none."""
        # ``\d+`` (not ``\d*``): an empty span must not reach int('').
        match = re.search(r'<span class="red">(\d+)</span>', htm)
        if match:
            return int(match.group(1))
        return 0

    def get_title(self, htm):
        """Return the thread title found in *htm*, or '' if there is none."""
        match = re.search(
            r'class="core_title_txt(\s+)"(\s+)title="(.*?)"', htm)
        if match:
            return match.group(3)
        print('no match title')
        return ''

    def page_deal(self, htm):
        """Print the content of every post found in *htm*."""
        posts = re.findall(r'id="post_content_(.*?)">(.*?)</div>', htm)
        if not posts:
            print('no deal')
            return
        for it in posts:
            print('%s \n' % (it[1],))

    def down_pic(self, htm):
        """Push every picture URL found in *htm* to the downloader."""
        pictures = re.findall(
            r'<img class="BDE_Image" pic_type=(.*?)src="(.*?)"', htm)
        if not pictures:
            print('no deal')
            return
        for it in pictures:
            print('picture url : %s \n' % (it[1],))
            g_worker.push(it[1])
if __name__ == '__main__':
    # Second argument selects the mode: 1 downloads the pictures,
    # 0 prints the thread starter's posts on the screen.
    crawler = Tieba_Spider(url, 1)
    crawler.start()
    g_worker.start()
DownQueue.py
#coding:utf-8
import os
import Queue
import re
import threading
import time
import urllib2
class down(threading.Thread):
    """Worker thread that downloads queued picture URLs into SAVE_DIR.

    Producers call push() for every URL and set_flag(True) once no more
    URLs will follow. The thread ends when the stop flag is set and the
    queue is empty.
    """

    SAVE_DIR = './spider_pic'  # directory the pictures are written to

    def __init__(self):
        threading.Thread.__init__(self)
        self.queue = Queue.Queue(1000)
        # Counts wake-ups: one per pushed URL plus one when the stop flag
        # is raised.
        self.semaphore = threading.Semaphore(0)
        self.flag = False  # True once the producer has finished

    def push(self, obj):
        """Queue the URL *obj* and wake the worker."""
        self.queue.put(obj)
        self.semaphore.release()

    def set_flag(self, f):
        """Set the stop flag; a true value also wakes the worker.

        Without the wake-up the worker would stay blocked in acquire()
        when the flag is raised after the last URL has been handled or
        when no URL was ever pushed.
        """
        self.flag = f
        if f:
            self.semaphore.release()

    def run(self):
        """Download queued URLs until the flag is set and the queue is empty."""
        while True:
            self.semaphore.acquire()
            try:
                obj = self.queue.get_nowait()
            except Queue.Empty:
                # Woken by set_flag() rather than by push().
                if self.flag:
                    break
                continue
            self._download(obj)
            if self.queue.empty() and self.flag:
                break

    def _download(self, url):
        """Fetch *url* and save it under SAVE_DIR; report and skip failures.

        The file name is the part of *url* after its last '/'.
        """
        pic = re.search(r'.*/(.*)', url)
        if pic is None or not pic.group(1):
            print('skip, no file name in url : %s' % (url,))
            return
        name = pic.group(1)
        try:
            response = urllib2.urlopen(url)
            try:
                data = response.read()
            finally:
                response.close()
            print('dowing  %s' % (name,))
            if not os.path.isdir(self.SAVE_DIR):
                os.makedirs(self.SAVE_DIR)
            with open(os.path.join(self.SAVE_DIR, name), 'wb') as fd:
                fd.write(data)
        except (urllib2.URLError, IOError, OSError, ValueError) as e:
            # One bad picture must not stop the remaining downloads.
            print('download failed : %s (%s)' % (url, e))
Tieba_Spider 类为爬虫类,负责爬出楼主发言中的图片链接,并将其推入down类的队列中。down类的工作为下载图片。两个类均继承自threading.Thread。仅供学习,转载请标明出处。