自学 Python 已有一段时间,终于成功写出了自己的第一个用 Python 抓取网站数据的程序(这里抓取的是自己博客的数据),特此记录。
from urllib import request
import re
class Spider():
    """Scrape article titles and read counts from a CSDN blog list page.

    The page at ``url`` is downloaded, every article block is cut out of the
    HTML with ``root_pattern``, and the title and read count inside each
    block are extracted with ``title_pattern`` and ``readNum_pattern``.
    ``go()`` runs the whole pipeline and prints the articles ranked by read
    count, highest first.
    """

    # The patterns are raw strings so that ``\s`` / ``\S`` reach the regex
    # engine verbatim instead of being treated as (invalid) string escapes.
    # Their values are identical to the non-raw originals.
    root_pattern = r'<div class="article-item-box csdn-tracking-statistics"([\s\S]*?)</div>'
    url = 'https://blog.csdn.net/LT11hka?spm=1000.2115.3001.5343'
    title_pattern = r'原创</span>([\s\S]*?)</a>'
    readNum_pattern = r'<img src="https://csdnimg.cn/release/blogv2/dist/pc/img/readCountWhite.png" alt="">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download the page at ``url`` and return it decoded as UTF-8 text.

        Raises whatever ``urllib.request.urlopen`` raises on network errors,
        and ``UnicodeDecodeError`` if the body is not valid UTF-8.
        """
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(self.url) as response:
            raw = response.read()
        return str(raw, encoding='utf-8')

    def __analysis(self, htmls):
        """Extract the raw regex matches for every article block in ``htmls``.

        Returns a list with one dict per article block:
        ``{'title': [...], 'num': [...]}``, where each value is the list
        returned by ``re.findall`` (empty when the pattern did not match).
        """
        return [
            {
                'title': re.findall(self.title_pattern, block),
                'num': re.findall(self.readNum_pattern, block),
            }
            for block in re.findall(self.root_pattern, htmls)
        ]

    def __refine_one(self, data):
        """Collapse one raw match dict into ``{'title': str, 'num': str}``.

        The first match of each field is used, with surrounding whitespace
        removed. A missing title becomes ``'无标题'`` and a missing read count
        becomes ``'0'``, so one incomplete article no longer aborts the run.
        """
        titles = data['title']
        nums = data['num']
        return {
            'title': titles[0].strip() if titles else '无标题',
            'num': nums[0].strip() if nums else '0',
        }

    def __refine(self, datas):
        """Return a lazy iterator of cleaned-up records for ``datas``.

        ``datas`` is the list produced by ``__analysis``; each item is
        converted with ``__refine_one``.
        """
        return map(self.__refine_one, datas)

    def __sort(self, datas):
        """Return a new list of records ordered by read count, highest first."""
        return sorted(datas, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, data):
        """Sort key: the record's read count as an integer.

        Raises ``ValueError`` if ``data['num']`` is not an integer literal.
        """
        return int(data['num'])

    def __show(self, datas):
        """Print one line per record: rank (1-based), title and read count."""
        for rank, data in enumerate(datas, start=1):
            print('排名:{} 博客:{} 阅读量:{}'.format(rank, data['title'], data['num']))

    def go(self):
        """Run the pipeline: fetch, extract, clean up, sort and print."""
        htmls = self.__fetch_content()
        datas = self.__analysis(htmls)
        datas = list(self.__refine(datas))
        datas = self.__sort(datas)
        self.__show(datas)
# Script entry: build the scraper and run it as soon as this file is executed.
# go() downloads the blog page, ranks the articles by read count and prints
# the ranking.
spider = Spider()
spider.go()
效果如下:
做一条有梦想的咸鱼!!!奥利给!!!