1 #样例:原生爬虫爬取虎牙的王者荣耀板块,进行主播人气排序
2 #拓展爬虫框架:BeautifulSoup,Scrapy
3 #爬虫、反爬虫、反反爬虫 ip容易被封,代理IP库
4 importre5 from urllib importrequest6 importssl7 #断点调试
class Spider():
    """Naive hand-rolled scraper: fetch Huya's "King of Glory" (王者荣耀)
    category page and print streamers ranked by popularity, highest first.

    Public interface: ``go()``. Everything else is private (name-mangled).
    """

    # Page to scrape.
    url = 'https://www.huya.com/g/wzry'
    # NOTE(review): these patterns look truncated — the HTML tags that should
    # delimit the capture groups appear to have been stripped from the file.
    # Values are kept as-is; TODO restore the original tag-delimited patterns.
    root_pattern = r'([\s\S]*?)'    # one per-streamer HTML snippet
    name_pattern = ''               # streamer name inside a snippet
    number_pattern = r'([\s\S]*?)'  # popularity value inside a snippet

    def __fetch_content(self):
        """Download the page and return it decoded as a UTF-8 string."""
        # Disable certificate verification (workaround for broken local SSL
        # setups; NOTE(review): insecure — acceptable only for this demo).
        ssl._create_default_https_context = ssl._create_unverified_context
        with request.urlopen(Spider.url) as response:
            htmls = response.read()  # raw bytes
        return str(htmls, encoding='utf-8')

    def __analysis(self, htmls):
        """Regex-extract raw match lists from the page.

        Returns a list of dicts shaped like
        ``{'name': ['Dae-心态'], 'number': ['473.4万']}`` — each value is the
        (possibly empty) ``re.findall`` result, unwrapped later by __refine.
        """
        root_html = re.findall(Spider.root_pattern, htmls)
        anchors = []
        for html in root_html:
            name = re.findall(Spider.name_pattern, html)
            number = re.findall(Spider.number_pattern, html)
            anchors.append({'name': name, 'number': number})
        return anchors

    def __refine(self, anchors):
        """Unwrap the single-element match lists and strip whitespace.

        Returns a lazy map of ``{'name': str, 'number': str}`` dicts.
        """
        refine = lambda anchor: {
            'name': anchor['name'][0].strip(),
            'number': anchor['number'][0]
        }
        return map(refine, anchors)

    def __sort(self, anchors):
        """Return anchors sorted by numeric popularity, highest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Sort key: convert '1,816.1万' / '473.4万' / '9527' to a float.

        Fixes the original bugs: the value was parsed before commas were
        removed (ValueError on '1,816.1万'), the comma branch parsed a
        hard-coded literal instead of the anchor's own value, and it
        skipped the ten-thousand ('万') multiplier.
        """
        text = anchor['number'].replace(',', '')  # drop thousands separators
        if '万' in text:
            return float(text.replace('万', '')) * 10000  # 万 == 10^4
        return float(text)

    def __show(self, anchors):
        """Print one 'rankN:name number' line per anchor, in given order."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank' + str(rank)
                  + ':' + anchor['name']
                  + ' ' + anchor['number'])

    def go(self):
        """Public entry point: fetch, parse, clean, sort, then display."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)
# Run the scraper only when executed as a script, not when imported
# (the original ran unconditionally at import time).
if __name__ == '__main__':
    spider = Spider()
    spider.go()
85 """
86
87 88 89 94 106 107 大仙来啦108 =============================================================================109 110 111 张大仙112 113 114 115 1,404.5万116 117 118119 """