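# A small tool for quickly checking that a regular expression is correct by running it against a web page; it can also be used to scrape some information along the way.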
from urllib import request
import re
import time
class Read_Msg():
    """Download a page and extract every match of a regular expression.

    Attributes:
        url: Address of the page to download.
        regular: Source text of the regular expression.
        head: HTTP request headers; carries a desktop-browser User-Agent.
        model: ``regular`` compiled with the ``re.M`` flag.
        sign: Separator placed between matches when they are printed.
    """

    def __init__(self, url: str, regular: str, sign: str) -> None:
        """Store the target URL, compile the pattern and prepare headers.

        Args:
            url: Address of the page to download.
            regular: Regular expression to search the page with.
            sign: Separator used by ``printmsg`` to join the matches.

        Raises:
            re.error: If ``regular`` is not a valid pattern.
        """
        self.url = url
        self.regular = regular
        self.head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        }
        self.model = re.compile(self.regular, re.M)
        self.sign = sign

    def gethtml(self) -> list:
        """Fetch ``self.url`` and return all matches of the pattern.

        Despite its name, this returns the ``findall`` result rather than
        the page text: a list of strings, or a list of tuples when the
        pattern has more than one capture group.

        Raises:
            urllib.error.URLError: If the page cannot be retrieved.
            UnicodeDecodeError: If the body cannot be decoded.
        """
        req = request.Request(self.url, headers=self.head)
        # The context manager closes the response even if read() fails.
        with request.urlopen(req) as response:
            raw = response.read()
            declared = response.headers.get_content_charset()
        # Honour the charset announced by the server; keep UTF-8 as the
        # default when none is announced or the name is not a known codec.
        try:
            html = raw.decode(declared or 'utf-8')
        except LookupError:
            html = raw.decode('utf-8')
        return self.model.findall(html)

    def printmsg(self) -> None:
        """Print the matches joined by ``self.sign``, framed by blank lines.

        Multi-group matches (tuples) are shown in their tuple form so that
        patterns with several capture groups do not break the join.
        """
        rendered = (
            match if isinstance(match, str) else str(match)
            for match in self.gethtml()
        )
        print("\n%s\n" % self.sign.join(rendered))
def main():
print("欢迎使用网页信息提取器!本提取器可用于贴吧,论坛等信息的快捷爬取及正则的测试\n")