看了麦子学院的视频,整理出糗事百科爬虫的面向过程和面向对象两个版本的代码,非常感谢胡明星老师的讲解!
面向过程源码如下:
# coding: utf-8
# Procedural crawler: downloads pages 1-34 of the qiushibaike.com "8hr"
# listing, extracts the text of every post and stores each post in its own
# text file.
import os
import re
from contextlib import closing

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request exposes the same Request / urlopen /
    # HTTPError / URLError names that this script uses.
    import urllib.request as urllib2

print("start")

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
headers = {'User-Agent': user_agent}

# Directory that receives one text file per extracted post.
path = '/Users/apple/Downloads/qiubai'

# The page body is processed as raw bytes, so the pattern is a bytes literal.
# It captures the text inside the <span> of every <div class="content">.
pattern = re.compile(b'<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)

for i in range(1, 35):
    # 1. Download the page source.
    url = 'http://www.qiushibaike.com/8hr/page/' + str(i) + '/?s=4975281'
    try:
        request = urllib2.Request(url=url, headers=headers)
        # closing() guarantees the connection is released on both Python 2
        # and Python 3.
        with closing(urllib2.urlopen(request)) as response:
            content = response.read()
    except urllib2.URLError as e:
        # HTTPError is a subclass of URLError, so this handles both. Skip the
        # page: there is no fresh content to parse for this iteration.
        print(e)
        continue

    # 2. Extract the wanted data from the downloaded page source.
    items = pattern.findall(content)
    if not items:
        continue

    # 3. Save the extracted data. The directory is created lazily, only once
    #    there is something to write.
    if not os.path.isdir(path):
        os.makedirs(path)
    for index, item in enumerate(items, 1):
        item = item.replace(b'<br/>', b'\n')
        # Name files by page and position; the post text itself may contain
        # newlines or '/' and may be too long to serve as a file name.
        file_path = os.path.join(path, 'page%02d_%03d.txt' % (i, index))
        with open(file_path, 'wb') as f:
            f.write(item)

print("end")
面向对象源码如下:
# coding: utf-8
import os
import re
from contextlib import closing

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request exposes the same Request / urlopen /
    # HTTPError / URLError names that this module uses.
    import urllib.request as urllib2


class spider(object):
    """Crawler that downloads qiushibaike.com listing pages and saves posts.

    Page bodies are handled as raw bytes from download to disk, so no
    assumption about the page encoding is made.
    """

    # Captures the text inside the <span> of every <div class="content">.
    # Compiled once here instead of on every analysis() call.
    _ITEM_PATTERN = re.compile(
        b'<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)

    # Directory that run() writes into.
    _SAVE_DIR = '/Users/apple/Downloads/qiubai'

    def __init__(self):
        """Set the URL template (``%s`` is the page index) and User-Agent."""
        self.url = 'http://www.qiushibaike.com/8hr/page/%s/?s=4975281'
        self.user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'

    def get_page(self, page_index):
        """Download the source of one listing page.

        :param page_index: page number substituted into ``self.url``.
        :return: the response body as bytes, or ``None`` when the request
            fails with ``URLError`` / ``HTTPError`` (the error is printed).
        """
        headers = {'User-Agent': self.user_agent}
        request = urllib2.Request(url=self.url % str(page_index),
                                  headers=headers)
        try:
            # closing() guarantees the connection is released on both
            # Python 2 and Python 3.
            with closing(urllib2.urlopen(request)) as response:
                return response.read()
        except urllib2.URLError as e:
            # HTTPError is a subclass of URLError, so this handles both.
            print(e)
            return None

    def analysis(self, content):
        """Extract the post bodies from a page source.

        :param content: page source as bytes, or ``None`` / empty when the
            download failed.
        :return: list of bytes objects, one per matched post; empty list
            when there is nothing to parse.
        """
        if not content:
            return []
        return self._ITEM_PATTERN.findall(content)

    def save(self, items, path, prefix=''):
        """Write every item to its own numbered ``.txt`` file under *path*.

        ``<br/>`` tags in an item are replaced by newlines before writing.
        Files are named ``<prefix>NNN.txt`` by position in *items*; the post
        text is not used as a name because it may contain newlines or ``/``
        and may be too long for a file name.

        :param items: iterable of bytes objects as returned by analysis().
        :param path: target directory; created if missing, but only when
            there is at least one item to write.
        :param prefix: optional file-name prefix that keeps the files of
            separate calls (e.g. separate pages) from overwriting each other.
        """
        if not items:
            return
        if not os.path.isdir(path):
            os.makedirs(path)
        for index, item in enumerate(items, 1):
            text = item.replace(b'<br/>', b'\n')
            file_path = os.path.join(path, '%s%03d.txt' % (prefix, index))
            with open(file_path, 'wb') as f:
                f.write(text)

    def run(self):
        """Crawl pages 1-34 and save all extracted posts."""
        print("start")
        for i in range(1, 35):
            content = self.get_page(i)
            items = self.analysis(content)
            self.save(items, self._SAVE_DIR, prefix='page%02d_' % i)
        print("end")


if __name__ == '__main__':
    # Use a separate name so the class is not rebound to its own instance.
    crawler = spider()
    crawler.run()