背景:虚拟机ubuntu16.04 爬取内涵段子文字,replace处理字符串
要求:根据用户指定要爬取的页数(page),将对应页面上的段子爬取下来并保存到文件。
源码如下:
1 # -*- coding:utf-8 -*-
2
3 import urllib2
4 import re
5
class Spider:
    """Interactive scraper that saves joke text from neihanpa.com list pages.

    Each call to ``loadPage`` downloads list page number ``self.page``,
    extracts every joke container from the HTML, strips a fixed set of tags
    and entities from it, and appends the result to ``duanzi.txt``.
    ``startWork`` drives this one page at a time, asking the user after each
    page whether to continue.
    """

    # Captures the inner HTML of every <div class="f18 mb20"> element.
    # re.S lets "." span newlines, because a joke usually covers several lines.
    _JOKE_PATTERN = re.compile(r'<div\sclass="f18 mb20">(.*?)</div>', re.S)

    # Markup removed from each joke: <br>/<br /> and <p>/</p> tags, plus the
    # &hellip; &ldquo; &rdquo; entities.  The semicolon is optional so the
    # whole entity is removed when it is present (stripping only "&hellip"
    # would leave a stray ";" in the saved text) while input that lacks the
    # semicolon is still handled.
    _NOISE_PATTERN = re.compile(r'<br\s*/?>|</?p>|&(?:hellip|ldquo|rdquo);?')

    def __init__(self):
        # Number of the list page that the next loadPage() call will fetch.
        self.page = 1
        # Loop flag for startWork(); set to False when the user enters "q".
        self.switch = True

    def loadPage(self):
        """Download list page ``self.page`` and hand its HTML to ``dealPage``.

        The response body is decoded as GBK and re-encoded as UTF-8 before
        it is processed.  Errors raised by ``urllib2`` or by the decoding
        step propagate to the caller.
        """
        url = "http://www.neihanpa.com/article/list_5_" + str(self.page) + ".html"
        headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            raw = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        html = raw.decode("gbk").encode("utf-8")
        self.dealPage(html)

    def _clean(self, content):
        """Return ``content`` with the tags/entities in _NOISE_PATTERN removed."""
        return self._NOISE_PATTERN.sub("", content)

    def dealPage(self, html):
        """Extract every joke from ``html``, clean it, and write it out.

        ``html`` is the page source as a string; each match of
        ``_JOKE_PATTERN`` is passed through ``_clean`` and then to
        ``writePage``.  A page without matches writes nothing.
        """
        for content in self._JOKE_PATTERN.findall(html):
            self.writePage(self._clean(content))

    def writePage(self, item):
        """Append ``item`` to ``duanzi.txt`` in the current working directory."""
        with open("duanzi.txt", "a") as f:
            f.write(item)

    def startWork(self):
        """Fetch pages one after another until the user enters ``q``.

        After each page is downloaded the user is prompted: pressing Enter
        (or typing anything other than ``q``) fetches the next page, while
        ``q`` stops the loop.
        """
        while self.switch:
            self.loadPage()
            # Single-argument parenthesised print behaves identically under
            # the Python 2 print statement.
            print("下载成功!")
            command = raw_input("如果继续爬去,请按回车,退出则按(q): ")
            if command == "q":
                self.switch = False
                print("谢谢使用!")
            self.page += 1
44
if __name__ == "__main__":
    # Script entry point: create a spider and run its interactive
    # download loop until the user chooses to quit.
    spider = Spider()
    spider.startWork()
程序运行结束后,打开 duanzi.txt 文件即可查看爬取到的段子。