This is an original post; please credit the source when reposting.
Single-threaded crawling:
Modules used: urllib2 and re.
# -*- coding: cp936 -*-
import urllib2
import re

def main():
    url = "http://www.baidu.com/"
    req = urllib2.Request(url)
    resp = urllib2.urlopen(req)
    respHtml = resp.read()
    # target to match: <a href="/duty/" name="tj_duty">使用百度前必读</a>
    ahn = r'<a\s+?href="/duty/"\s+?name="tj_duty">(?P<content>.+?)</a>'
    found = re.search(ahn, respHtml)
    print 'found=', found
    if found:
        a1 = found.group("content")
        print 'content', a1

if __name__ == '__main__':
    main()
(?P<name>...) defines a named group: pass that name to group() on the resulting match object to retrieve exactly the text the group matched.
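A tiny self-contained sketch of named groups (the pattern and sample string below are invented purely for illustration):

import re

m = re.search(r'(?P<scheme>\w+)://(?P<host>[^/]+)', 'http://www.baidu.com/duty/')
if m:
    print m.group('scheme')  # http
    print m.group('host')    # www.baidu.com
    print m.groupdict()      # {'scheme': 'http', 'host': 'www.baidu.com'}

groupdict() additionally returns every named group in one dictionary.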
Beautiful Soup is a Python library for extracting data from HTML and XML documents (see its official documentation). We can scrape the same data with Beautiful Soup; the underlying parser used here is the lxml library.
# -*- coding: cp936 -*-
from bs4 import BeautifulSoup
import urllib2

def main():
    url = "http://www.baidu.com/"
    req = urllib2.Request(url)
    resp = urllib2.urlopen(req)
    respHtml = resp.read()
    soup = BeautifulSoup(respHtml, 'lxml')  # parse with the lxml parser
    found = soup.find(href='/duty/')
    # found = soup.find(attrs={'name': 'tj_duty'})  # alternative lookup
    print 'found:', found
    if found:
        content = found.string
        print 'content:', content

if __name__ == '__main__':
    main()
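The commented-out line above is the second of the two lookup styles; both land on the same tag. A quick offline check (the HTML snippet is hard-coded here just for demonstration):

from bs4 import BeautifulSoup

html = '<a href="/duty/" name="tj_duty">使用百度前必读</a>'
soup = BeautifulSoup(html, 'lxml')
print soup.find(href='/duty/').string               # match by href attribute
print soup.find(attrs={'name': 'tj_duty'}).string   # match by name attribute

Note that the name attribute has to go through attrs, because find() reserves the name keyword for the tag name itself.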
To crawl more efficiently, multithreading is essential. Below is a simple multithreaded crawler:
# -*- coding: cp936 -*-
from Queue import Queue
from threading import Thread
import urllib2

num_threads = 2
q = Queue()
urls = ['http://www.baidu.com',
        'http://www.sina.com',
        'http://www.qq.com',
        ]
for url in urls:
    q.put(url)

def download(i, q):
    while True:
        print 'start download %s' % i
        url = q.get()  # blocks until an item is available
        print 'Downloading: %s' % url
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req)
        data = resp.read()
        filename = url.rpartition('/')[-1]  # e.g. 'www.baidu.com'
        with open(filename + '.html', 'wb') as outfile:
            outfile.write(data)
        print 'complete download %s:%s' % (url, i)
        q.task_done()

for i in range(num_threads):
    worker = Thread(target=download, args=(i, q))
    worker.setDaemon(True)  # daemon threads exit when the main thread does
    worker.start()

q.join()  # wait until every queued URL has been processed
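One design note: the workers above are daemon threads, so they are simply killed once the main thread returns from q.join(). An alternative sketch that shuts workers down explicitly with sentinel values (using None as the stop marker is this sketch's own convention, not part of the original code):

from Queue import Queue
from threading import Thread

def worker(q):
    while True:
        item = q.get()
        if item is None:   # sentinel: no more work
            q.task_done()
            break
        print 'processing %s' % item
        q.task_done()

q = Queue()
threads = [Thread(target=worker, args=(q,)) for _ in range(2)]
for t in threads:
    t.start()
for url in ['http://www.baidu.com', 'http://www.qq.com']:
    q.put(url)
for _ in threads:
    q.put(None)            # one sentinel per worker
q.join()
for t in threads:
    t.join()               # every worker has now exited cleanly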
A thread pool improves the efficiency of running tasks concurrently; there is an introduction to thread pools at http://www.cnblogs.com/tracylining/p/3471594.html. Now let's put one to use.
import datetime
import urllib2
import threadpool

url_list = ['http://www.baidu.com',
            'http://www.qq.com',
            'http://www.sina.com',
            ]

def download(url):
    try:
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req)
        data = resp.read()
        # the 'download' directory must exist beforehand
        with open('download/' + str(hash(url)), 'wb') as f:
            f.write(data)
        return url, 'success'
    except Exception:
        return url, 'failed'

def callback(request, result):
    # threadpool invokes callbacks as callback(request, result);
    # result is the (url, status) tuple returned by download()
    url, status = result
    print '%s download is %s' % (url, status)

def threadPoolDownload(poolsize, args):
    start = datetime.datetime.now()
    pool = threadpool.ThreadPool(poolsize)
    requests = threadpool.makeRequests(download, args, callback)
    [pool.putRequest(req) for req in requests]
    pool.wait()
    end = datetime.datetime.now()
    print "Start download : ", start
    print "End download : ", end
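The snippet above only defines threadPoolDownload and never calls it; a minimal driver might look like this (the pool size of 3 is an arbitrary choice for the example):

import os

if __name__ == '__main__':
    if not os.path.isdir('download'):
        os.mkdir('download')  # download() writes into this directory
    threadPoolDownload(3, url_list)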
Multithreaded concurrency can also be implemented with Stackless and Twisted. I have not yet studied Stackless microthreads; for Twisted, see my other posts. I have been busy lately, so that code will have to wait. When I get to it, I plan to compare these concurrency approaches against one another, so stay tuned.