In the previous post I covered how to crawl pages with urllib and save the data. But back then I filtered the data with regular expressions, which was not very robust. Here I'll record a more convenient way to parse the data: BeautifulSoup.
Install beautifulsoup4
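BeautifulSoup 4 is published on PyPI as the beautifulsoup4 package, so it can be installed with pip:

pip3 install beautifulsoup4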
Import the packages
import urllib.request
from bs4 import BeautifulSoup
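Before the full crawler, here is a minimal sketch of what BeautifulSoup gives us over regular expressions: it parses the HTML into a tree we can query by tag name and attributes. The HTML fragment below is made up purely for illustration:

from bs4 import BeautifulSoup

# A made-up HTML fragment, just to demonstrate the API
html = '<dl><dd class="content">first joke</dd><dd class="content">second joke</dd></dl>'
soup = BeautifulSoup(html, "html.parser")

# find_all returns every tag matching the given name and attribute filter
for dd in soup.find_all("dd", {"class": "content"}):
    print(dd.get_text())  # prints "first joke", then "second joke"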
Code implementation
#coding:utf-8
import urllib.request
from bs4 import BeautifulSoup

class Reptile(object):
    def start(self):
        pageCount = 10
        currentPage = 1
        while currentPage <= pageCount:
            url = "http://www.neihan.net/text_%d.html" % currentPage
            send_headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
            }
            print("Start crawling page %d..." % currentPage)
            # Send a browser-like User-Agent so the site does not reject the request
            request = urllib.request.Request(url, headers=send_headers)
            html = urllib.request.urlopen(request).read().decode("utf-8")
            info = BeautifulSoup(html, "html.parser")
            # Match all <dd class="content"> tags
            contents = info.find_all("dd", {"class": "content"})
            self.writeToFile(contents)
            print("Finished crawling page %d" % currentPage)
            currentPage += 1

    def writeToFile(self, tags):
        # Open the file once in append mode ("段子.txt" means "jokes.txt")
        with open("段子.txt", "a", encoding="utf-8") as file:
            for temp in tags:
                # get_text() extracts the text content of the tag
                file.write(temp.get_text())
                file.write("\n\n")

if __name__ == "__main__":
    reptile = Reptile()
    reptile.start()
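As a side note, the same attribute filter can also be expressed as a CSS selector via BeautifulSoup's select method; this is just an equivalent sketch, not something the crawler above requires:

# Equivalent to info.find_all("dd", {"class": "content"}),
# written as a CSS selector instead of a tag/attribute filter
contents = info.select("dd.content")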