爬虫步骤:
1.目的
2.找到数据对象的网页
3.分析网页的结构,找到数据所在的标签
4.模拟HTTP请求,向服务器发送这个请求,获取到服务器返回给我们的HTML
5.利用正则表达式提取我们所需的数据(主播、人气)
1、目的:爬取虎牙王者荣耀主播人气排名
2、分析得到数据对象:
<li class="game-live-item" gid="2336">
<a href="https://www.huya.com/131499" class="video-info new-clickstat j_live-card " target="_blank" data-gid="2336" data-lp="1321051418" report="{"eid":"click/position","position":"wzry/0/1/1","game_id":"2336","ayyuid":"1321051418"}">
<img class="pic" data-original="//live-cover.msstatic.com/huyalive/22808102-2644054518-11356127683651043328-2642226292-10057-A-0-1/20200219155742.jpg?x-oss-process=image/resize,limit_0,m_fill,w_338,h_190/sharpen,80/format,jpg/interlace,1/quality,q_90" src="//live-cover.msstatic.com/huyalive/22808102-2644054518-11356127683651043328-2642226292-10057-A-0-1/20200219155742.jpg?x-oss-process=image/resize,limit_0,m_fill,w_338,h_190/sharpen,80/format,jpg/interlace,1/quality,q_90" data-default-img="338x190" alt="东辰-寒冰的直播" title="东辰-寒冰的直播">
<em class="tag tag-recommend">大神推荐</em>
<div class="item-mask"></div>
<i class="btn-link__hover_i"></i>
<p class="tag-right">
<!-- 手机开播 -->
<!-- 无损音质 || 蓝光 -->
<em class="tag-blue">蓝光4M</em>
</p>
</a>
<a href="https://www.huya.com/131499" class="title new-clickstat j_live-card" data-gid="2336" data-lp="1321051418" report="{"eid":"click/position","position":"wzry/0/1/1","game_id":"2336","ayyuid":"1321051418"}" title="【看我撩比心小姐姐会不会动心】" target="_blank">【看我撩比心小姐姐会不会动心】</a>
<span class="txt">
<span class="avatar fl">
<img data-original="https://huyaimg.msstatic.com/avatar/1001/14/01ae1488faf1d6d3cf301d2540de46_180_135.jpg" src="https://huyaimg.msstatic.com/avatar/1001/14/01ae1488faf1d6d3cf301d2540de46_180_135.jpg" data-default-img="84x84" alt="东辰-寒冰" title="东辰-寒冰">
<i class="nick" title="东辰-寒冰">东辰-寒冰</i>
</span>
<span class="num"><i class="num-icon"></i><i class="js-num">268.0万</i></span>
</span>
</li>
#原生爬虫
from urllib import request
import re
#F5启动程序debug
class Splider():
    """Scrape Huya's King of Glory (wzry) live page and print streamers
    ranked by popularity (descending).

    NOTE(review): "Splider" looks like a typo for "Spider"; the name is
    kept unchanged for backward compatibility with existing callers.
    """
    url = 'https://www.huya.com/g/wzry'
    # Non-greedy [\s\S]*? spans newlines; each match is one live-card <li>.
    root_pattern = r'<li class="game-live-item" gid="2336">([\s\S]*?)</li>'
    name_pattern = r'<i class="nick" title="[\s\S]*?">([\s\S]*?)</i>'
    ranking_pattern = r'<i class="js-num">([\s\S]*?)</i>'

    def __fetch_contents(self):
        """Download the page and return it decoded as a UTF-8 string."""
        r = request.urlopen(Splider.url)
        htmls = r.read()
        htmls = str(htmls, encoding='utf-8')  # bytes -> str
        return htmls

    def __analysis(self, htmls):
        """Extract raw (name, popularity) data from the HTML.

        Returns a list of dicts; each value is the raw ``re.findall``
        result list for that live-card block.
        """
        numbers = []
        first_contents = re.findall(Splider.root_pattern, htmls)
        for html in first_contents:
            name = re.findall(Splider.name_pattern, html)
            number = re.findall(Splider.ranking_pattern, html)
            numbers.append({'name': name, 'number': number})
        return numbers

    def __refine(self, numbers):
        """Reduce each entry's findall lists to a single cleaned value."""
        pick_first = lambda entry: {
            'name': entry['name'][0].strip(),  # drop surrounding whitespace
            'number': entry['number'][0]
        }
        return map(pick_first, numbers)

    def __sorting(self, numbers):
        """Return the entries sorted by popularity, highest first."""
        return sorted(numbers, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, number):
        """Sort key: numeric popularity, expanding the '万' (x10,000) suffix.

        Fixes two bugs in the original: the suffix check now inspects the
        popularity *string* (the old ``'万' in number`` tested the dict's
        keys, so the multiplier never fired), and the regex keeps the
        decimal part ('268.0' no longer truncates to 268).
        """
        text = number['number']
        matches = re.findall(r'\d+(?:\.\d+)?', text)
        num = float(matches[0]) if matches else 0.0  # tolerate malformed values
        if '万' in text:
            num *= 10000
        return num

    def __printall(self, numbers):
        """Print one 'rank / name / popularity' line per streamer."""
        for rank, entry in enumerate(numbers, start=1):
            print('rank: ' + str(rank) + ' ' + entry['name'] + ' ' + entry['number'])

    def go(self):
        """Entry point: fetch, parse, refine, sort, and print the ranking."""
        htmls = self.__fetch_contents()
        numbers = self.__analysis(htmls)
        deal_numbers = list(self.__refine(numbers))
        sort_numbers = self.__sorting(deal_numbers)
        self.__printall(sort_numbers)
# Guard the entry point so importing this module does not trigger a
# network request; running it as a script behaves exactly as before.
if __name__ == '__main__':
    sp = Splider()
    sp.go()
爬取的部分结果: