这个程序打算实现的功能是:从斗鱼的一个分类页面爬取各主播的名字及其对应的数字(人气/观看人数),再按该数字从高到低排序后输出。
获取HTML
def __getHTML(self):
    """Download the page at ``Spider.__url`` and return it as text.

    This excerpt is a method of ``Spider`` (see the complete program
    further down), shown here on its own.

    Returns:
        str: the response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: if the request fails.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    # The ``with`` block closes the HTTP response once the body is read.
    with request.urlopen(Spider.__url) as r:
        # read() hands back bytes
        html = r.read()
    # Convert the bytes to a string.
    html = str(html, encoding='utf-8')
    return html
在获取到的HTML里面获取所需信息
# Raw strings: the backslashes in \s and \S must reach the regex engine
# untouched; in a normal string literal they are invalid escape sequences.
# Captures everything between '<div class="mes">' and the next
# '<div class="impress-tag-list">' (one entry of the page).
__root_pattern = r'<div class="mes">([\s\S]*?)<div class="impress-tag-list">'
# Captures the text inside the "dy-name" span of one entry.
__name_pattern = r'<span class="dy-name ellipsis fl">([\s\S]*?)</span>'
# Captures the text inside the "dy-num" span of one entry (note the space
# before '>' in the page markup).
__number_pattern = r'<span class="dy-num fr" >([\s\S]*?)</span>'
#上面是所用到的正则表达式
#正则表达式匹配
def __analysis(self, html):
    """Extract the name and number of every entry found in ``html``.

    This excerpt is a method of ``Spider`` (see the complete program
    further down), shown here on its own.

    Args:
        html (str): the page source.

    Returns:
        list[dict]: one ``{'name': [...], 'number': [...]}`` dict per match
        of ``Spider.__root_pattern``; each value is the list returned by
        ``re.findall`` and may be empty when the span is missing.
    """
    anchors = []
    # Each ``block`` is the markup of a single entry; a separate loop name
    # keeps the ``html`` parameter from being overwritten.
    for block in re.findall(Spider.__root_pattern, html):
        name = re.findall(Spider.__name_pattern, block)
        number = re.findall(Spider.__number_pattern, block)
        anchors.append({'name': name, 'number': number})
    return anchors
数据排序
# 排序
def __sort(self, anchors):
    """Return a new list of ``anchors`` ordered from largest to smallest.

    The ordering key of each anchor is computed by ``self.__sort_seed``.
    The input list is not modified.

    Args:
        anchors (list): the entries to order.

    Returns:
        list: the entries in descending key order.
    """
    return sorted(anchors, key=self.__sort_seed, reverse=True)
# 排序方法
def __sort_seed(self, anchor):
    """Turn an anchor's displayed number into a float usable as a sort key.

    The first number in ``anchor['number'][0]`` is parsed, including a
    decimal part, and multiplied by 10000 when the text contains the
    character '万' (ten thousand).

    Args:
        anchor (dict): an entry whose ``'number'`` value is a list of strings.

    Returns:
        float: the parsed value, or 0.0 when the list is empty or its first
        string contains no digits.
    """
    numbers = anchor['number']
    if not numbers:
        return 0.0
    text = numbers[0]
    # \d+ with an optional fraction: keeps "1.5" whole and, via search(),
    # tolerates leading whitespace or other non-digit characters.
    match = re.search(r'\d+(?:\.\d+)?', text)
    if match is None:
        return 0.0
    number = float(match.group())
    if '万' in text:
        number *= 10000
    return number
完整程序
from urllib import request
import re
class Spider:
    """Scrape one Douyu listing page and print its entries, largest number first.

    The page is downloaded, cut into per-entry chunks with regular
    expressions, cleaned up, sorted by the number shown for each entry and
    printed as ``name----->number`` lines. ``run()`` is the only public
    method.
    """

    # Page to scrape.
    __url = 'https://www.douyu.com/g_How'
    # Raw strings so that \s and \S reach the regex engine unchanged.
    # One entry: everything between the "mes" div and the next tag-list div.
    __root_pattern = r'<div class="mes">([\s\S]*?)<div class="impress-tag-list">'
    # Text of the "dy-name" span inside one entry.
    __name_pattern = r'<span class="dy-name ellipsis fl">([\s\S]*?)</span>'
    # Text of the "dy-num" span inside one entry (the markup has a space
    # before '>').
    __number_pattern = r'<span class="dy-num fr" >([\s\S]*?)</span>'

    # Fetch the HTML
    def __getHTML(self):
        """Download ``__url`` and return the body decoded as UTF-8.

        Raises:
            urllib.error.URLError: if the request fails.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        # The ``with`` block closes the HTTP response after reading.
        with request.urlopen(Spider.__url) as response:
            raw = response.read()  # bytes
        return str(raw, encoding='utf-8')

    # Regular-expression matching
    def __analysis(self, html):
        """Return one ``{'name': [...], 'number': [...]}`` dict per entry.

        Each value is the list produced by ``re.findall`` on that entry's
        markup and may be empty if the corresponding span is missing.
        """
        anchors = []
        for block in re.findall(Spider.__root_pattern, html):
            name = re.findall(Spider.__name_pattern, block)
            number = re.findall(Spider.__number_pattern, block)
            anchors.append({'name': name, 'number': number})
        return anchors

    # Refine the data
    def __refine(self, anchors):
        """Return cleaned-up copies of the usable entries in ``anchors``.

        Entries lacking a name or a number are dropped, because the sorting
        and printing steps index the first element of both lists. The first
        name and number of each remaining entry are stripped of surrounding
        whitespace. The ``{'name': [...], 'number': [...]}`` shape is kept.
        """
        refined = []
        for anchor in anchors:
            if not anchor['name'] or not anchor['number']:
                continue
            refined.append({
                'name': [anchor['name'][0].strip()],
                'number': [anchor['number'][0].strip()],
            })
        return refined

    # Sorting
    def __sort(self, anchors):
        """Return a new list ordered by ``__sort_seed``, largest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    # Sort key
    def __sort_seed(self, anchor):
        """Parse ``anchor['number'][0]`` into a float sort key.

        A decimal part is honoured and the value is multiplied by 10000
        when the text contains '万' (ten thousand). Returns 0.0 when there
        is no number to parse.
        """
        numbers = anchor['number']
        if not numbers:
            return 0.0
        text = numbers[0]
        # search() skips leading non-digits; the optional group keeps the
        # fraction of values such as "1.5".
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            return 0.0
        number = float(match.group())
        if '万' in text:
            number *= 10000
        return number

    # Output
    def __print(self, anchors):
        """Print one ``name----->number`` line per anchor."""
        for anchor in anchors:
            print(anchor['name'][0] + '----->' + anchor['number'][0])

    # Entry point
    def run(self):
        """Fetch, parse, refine, sort and print the entries of the page."""
        html = self.__getHTML()
        anchors = self.__analysis(html)
        anchors = self.__refine(anchors)
        anchors = self.__sort(anchors)
        self.__print(anchors)
# Run the scrape only when this file is executed as a script, so that
# importing the module does not trigger a network request.
if __name__ == '__main__':
    spider = Spider()
    spider.run()
至此,我们把 Python 的一些基础知识(类、正则表达式、排序以及用 urllib 发起网络请求)综合运用了一遍。