Preface
In the post Python3 Crawler (V), we used urllib's basic functions together with regular expressions to crawl the information of all my CSDN blog posts.
Last time we studied BeautifulSoup, an excellent Python library, and we ought to put it to good use. So let's redo the CSDN post-crawling task, this time with BeautifulSoup4.
Since I changed my blog configuration and switched the homepage theme, we inspect the page under the new theme (the original post included a screenshot here).
As before, we first confirm which information to extract, along with the total number of post pages.
Analyzing the Page Source
The URL and request headers are set up the same way as before, so I won't repeat that here. The focus is on how to use BeautifulSoup4 to grab our target information; first, a look at the current page source.
Post-information module: (screenshot in the original post)
Pager module: (screenshot in the original post)
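The screenshots are gone here, but the class names targeted by the crawler below let us sketch the two modules. The markup in this snippet is an illustrative reconstruction inferred from those selectors, not the literal CSDN source; it just shows the shape BeautifulSoup will be navigating:

import re
from bs4 import BeautifulSoup

# Illustrative reconstruction of the two page modules, inferred from the
# selectors used later in this post -- not the literal CSDN markup.
SAMPLE_HTML = """
<div class="list_item article_item">
    <span class="link_title"><a href="/fly_yr/article/details/1">A post title</a></span>
    <span class="link_postdate">2016-08-04 10:23</span>
    <span class="link_view">阅读(392)</span>
    <span class="link_comments">评论(5)</span>
</div>
<div class="pagelist">
    <span>392条 共20页</span>
</div>
"""

soup = BeautifulSoup(SAMPLE_HTML, 'html5lib')
print(soup.find('div', "list_item article_item").find('span', "link_title").a.get_text())
print(soup.find('div', "pagelist").span.get_text())   # -> 392条 共20页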
Extracting the Total Page Count
def getPages(self):
    # Request the current list page with the prepared headers
    req = urllib.request.Request(url=self.url, headers=self.headers)
    page = urllib.request.urlopen(req)

    # Read the raw bytes, gunzip them if needed, then decode
    data = page.read()
    data = ungzip(data)
    data = data.decode('utf-8')

    # Parse the page and locate the pager block
    soup = BeautifulSoup(data, 'html5lib')
    tag = soup.find('div', "pagelist")
    pagesData = tag.span.get_text()          # e.g. "392条 共20页"

    # Pull the page count out of the "共20页" ("20 pages in total") part
    pagesNum = re.findall(re.compile(r'共(.*?)页'), pagesData)[0]
    return pagesNum
As the code above shows, once the page data is read in we build a BeautifulSoup object and call find to reach the text "392条 共20页" ("392 posts, 20 pages in total"). What we actually want is the number 20, so we then extract it with a regular expression.
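As a standalone check, here is that extraction step run on the pager text quoted above:

import re

pagesData = '392条 共20页'                      # what tag.span.get_text() returns
pagesNum = re.findall(r'共(.*?)页', pagesData)[0]
print(pagesNum)                                  # -> 20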
Extracting the Post Information
def readData(self):
    ret = []
    req = urllib.request.Request(url=self.url, headers=self.headers)
    res = urllib.request.urlopen(req)

    # Read, gunzip if needed, decode
    data = res.read()
    data = ungzip(data)
    data = data.decode('utf-8')

    soup = BeautifulSoup(data, "html5lib")

    # Every post lives in a <div class="list_item article_item"> block
    items = soup.find_all('div', "list_item article_item")
    for item in items:
        title = item.find('span', "link_title").a.get_text()
        link = item.find('span', "link_title").a.get('href')
        writeTime = item.find('span', "link_postdate").get_text()
        # link_view / link_comments render as "阅读(392)" / "评论(5)" (reads / comments),
        # so take the number between the parentheses
        readers = re.findall(re.compile(r'\((.*?)\)'), item.find('span', "link_view").get_text())[0]
        comments = re.findall(re.compile(r'\((.*?)\)'), item.find('span', "link_comments").get_text())[0]

        ret.append('Date: ' + writeTime + '\nTitle: ' + title
                   + '\nLink: http://blog.csdn.net' + link
                   + '\n' + 'Reads: ' + readers + '\tComments: ' + comments + '\n')
    return ret
As the code shows, we can pick out each element's information very simply with BeautifulSoup's own functions, without having to build complicated regular expressions over the whole page, which greatly simplifies the work.
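To make the per-field logic concrete, here is the same extraction run against a single hypothetical item, shaped like the markup sketched earlier (the values are placeholders):

import re
from bs4 import BeautifulSoup

item_html = ('<div class="list_item article_item">'
             '<span class="link_title"><a href="/fly_yr/article/details/1">A post title</a></span>'
             '<span class="link_postdate">2016-08-04 10:23</span>'
             '<span class="link_view">阅读(392)</span>'
             '<span class="link_comments">评论(5)</span></div>')

item = BeautifulSoup(item_html, 'html5lib').find('div', "list_item article_item")
title = item.find('span', "link_title").a.get_text()        # -> A post title
link = item.find('span', "link_title").a.get('href')        # -> /fly_yr/article/details/1
readers = re.findall(r'\((.*?)\)', item.find('span', "link_view").get_text())[0]       # -> 392
comments = re.findall(r'\((.*?)\)', item.find('span', "link_comments").get_text())[0]  # -> 5
print(title, link, readers, comments)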
The other steps are the same as before, so I won't go over them again; the complete code is given below:
import urllib.request, re, time, random, gzip
from bs4 import BeautifulSoup


def saveFile(data, i):
    # Write the posts of page i+1 into their own text file
    path = "E:\\projects\\Spider\\06_csdn2\\papers\\paper_" + str(i + 1) + ".txt"
    with open(path, 'wb') as file:
        page = 'Current page: ' + str(i + 1) + '\n'
        file.write(page.encode('gbk'))     # gbk so Chinese titles save correctly on Windows
        for d in data:
            d = str(d) + '\n'
            file.write(d.encode('gbk'))


def ungzip(data):
    # The server may answer with gzip-compressed bytes; decompress if so
    try:
        data = gzip.decompress(data)
    except OSError:
        print("Not compressed, nothing to decompress...")
    return data


class CSDNSpider:
    def __init__(self, pageIdx=1, url="http://blog.csdn.net/fly_yr/article/list/1"):
        # Default to the first page of the article list
        self.pageIdx = pageIdx
        self.url = url[0:url.rfind('/') + 1] + str(pageIdx)
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Host": "blog.csdn.net"
        }
    def getPages(self):
        # Fetch the current list page and decode it
        req = urllib.request.Request(url=self.url, headers=self.headers)
        page = urllib.request.urlopen(req)

        data = page.read()
        data = ungzip(data)
        data = data.decode('utf-8')

        # The pager block renders text like "392条 共20页"; keep only the page count
        soup = BeautifulSoup(data, 'html5lib')
        tag = soup.find('div', "pagelist")
        pagesData = tag.span.get_text()
        pagesNum = re.findall(re.compile(r'共(.*?)页'), pagesData)[0]
        return pagesNum

    def setPage(self, idx):
        # Replace the trailing page index in the list URL
        self.url = self.url[0:self.url.rfind('/') + 1] + str(idx)
    def readData(self):
        ret = []
        req = urllib.request.Request(url=self.url, headers=self.headers)
        res = urllib.request.urlopen(req)

        # Read, gunzip if needed, decode
        data = res.read()
        data = ungzip(data)
        data = data.decode('utf-8')

        soup = BeautifulSoup(data, "html5lib")

        # Every post lives in a <div class="list_item article_item"> block
        items = soup.find_all('div', "list_item article_item")
        for item in items:
            title = item.find('span', "link_title").a.get_text()
            link = item.find('span', "link_title").a.get('href')
            writeTime = item.find('span', "link_postdate").get_text()
            # link_view / link_comments render as "阅读(392)" / "评论(5)":
            # take the number between the parentheses
            readers = re.findall(re.compile(r'\((.*?)\)'), item.find('span', "link_view").get_text())[0]
            comments = re.findall(re.compile(r'\((.*?)\)'), item.find('span', "link_comments").get_text())[0]

            ret.append('Date: ' + writeTime + '\nTitle: ' + title
                       + '\nLink: http://blog.csdn.net' + link
                       + '\n' + 'Reads: ' + readers + '\tComments: ' + comments + '\n')
        return ret
cs = CSDNSpider()

pagesNum = int(cs.getPages())
print("Total pages:", pagesNum)

for idx in range(pagesNum):
    cs.setPage(idx + 1)          # list pages are 1-based (list/1 .. list/20)
    print("Current page:", idx + 1)

    papers = cs.readData()
    saveFile(papers, idx)
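For reference, given how readData formats each entry and how saveFile prepends the page header, every generated paper_N.txt should come out looking roughly like this (field values are placeholders):

    Current page: 1
    Date: 2016-08-04 10:23
    Title: (post title)
    Link: http://blog.csdn.net/fly_yr/article/details/(id)
    Reads: 392    Comments: 5
    ...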
Complete code on GitHub --- click me~~~