1 #样例:原生爬虫爬取虎牙的王者荣耀板块,进行主播人气排序
2 #拓展爬虫框架:BeautifulSoup,Scrapy
3 #爬虫、反爬虫、反反爬虫 ip容易被封,代理IP库
4 importre5 from urllib importrequest6 importssl7 #断点调试
class Spider():
    """Naive hand-rolled scraper: fetch Huya's "King of Glory" (王者荣耀)
    category page and print streamers ranked by popularity, highest first.

    Public interface: ``go()``. Everything else is private (name-mangled).
    """

    # Page to scrape.
    url = 'https://www.huya.com/g/wzry'
    # NOTE(review): these patterns look truncated — the HTML tags that should
    # delimit the capture groups appear to have been stripped from the file.
    # Values are kept as-is; TODO restore the original tag-delimited patterns.
    root_pattern = r'([\s\S]*?)'    # one per-streamer HTML snippet
    name_pattern = ''               # streamer name inside a snippet
    number_pattern = r'([\s\S]*?)'  # popularity value inside a snippet

    def __fetch_content(self):
        """Download the page and return it decoded as a UTF-8 string."""
        # Disable certificate verification (workaround for broken local SSL
        # setups; NOTE(review): insecure — acceptable only for this demo).
        ssl._create_default_https_context = ssl._create_unverified_context
        with request.urlopen(Spider.url) as response:
            htmls = response.read()  # raw bytes
        return str(htmls, encoding='utf-8')

    def __analysis(self, htmls):
        """Regex-extract raw match lists from the page.

        Returns a list of dicts shaped like
        ``{'name': ['Dae-心态'], 'number': ['473.4万']}`` — each value is the
        (possibly empty) ``re.findall`` result, unwrapped later by __refine.
        """
        root_html = re.findall(Spider.root_pattern, htmls)
        anchors = []
        for html in root_html:
            name = re.findall(Spider.name_pattern, html)
            number = re.findall(Spider.number_pattern, html)
            anchors.append({'name': name, 'number': number})
        return anchors

    def __refine(self, anchors):
        """Unwrap the single-element match lists and strip whitespace.

        Returns a lazy map of ``{'name': str, 'number': str}`` dicts.
        """
        refine = lambda anchor: {
            'name': anchor['name'][0].strip(),
            'number': anchor['number'][0]
        }
        return map(refine, anchors)

    def __sort(self, anchors):
        """Return anchors sorted by numeric popularity, highest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Sort key: convert '1,816.1万' / '473.4万' / '9527' to a float.

        Fixes the original bugs: the value was parsed before commas were
        removed (ValueError on '1,816.1万'), the comma branch parsed a
        hard-coded literal instead of the anchor's own value, and it
        skipped the ten-thousand ('万') multiplier.
        """
        text = anchor['number'].replace(',', '')  # drop thousands separators
        if '万' in text:
            return float(text.replace('万', '')) * 10000  # 万 == 10^4
        return float(text)

    def __show(self, anchors):
        """Print one 'rankN:name number' line per anchor, in given order."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank' + str(rank)
                  + ':' + anchor['name']
                  + ' ' + anchor['number'])

    def go(self):
        """Public entry point: fetch, parse, clean, sort, then display."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)
# Run the scraper only when executed as a script, not when imported
# (the original ran unconditionally at import time).
if __name__ == '__main__':
    spider = Spider()
    spider.go()
85 """
86
87 88 89 94 106 107 大仙来啦108 =============================================================================109 110 111 张大仙112 113 114 115 1,404.5万116 117 118119 """