该博文涵盖了如下内容:
1.正则表达式 re
2.url库 urllib
3.debug 方法
4.面向对象封装方法
1.正则表达式 re
2.url库 urllib
3.debug 方法
4.面向对象封装方法
#encoding=utf-8
'''
Learning Python regular expressions
url : http://docs.python.org/library/re.html
parse html url : http://www.boddie.org.uk/python/HTML.html
author : liuzheng
'''
import re
import urllib
# Analyse the JavaEye (iteye) blog channel.
class ParseHTML(object):
    '''
    Fetch an HTML page and print the anchor tags found in it.

    The anchors looked for have the shape
    <a href='URL' title=... target=...>TEXT</a> (matched case-insensitively).
    '''

    # Compiled once when the class is created instead of on every call.
    # Groups: (1) href URL, (2) title value, (3) target value, (4) link text.
    LINK_PATTERN = re.compile(
        r"<a href='([\w./:\\]+?)'[\s]*title=([^<>]+?)[\s]*target=([^<>]+?)>([^<>]+?)</a>",
        re.I)

    def __init__(self, url, encoding='utf-8'):
        '''
        url      -- address of the page to download and analyse.
        encoding -- charset used to decode the downloaded bytes
                    (defaults to UTF-8).
        '''
        self.url = url
        self.encoding = encoding

    def parse(self):
        '''
        Download self.url and print every matching anchor.

        Errors raised while opening or reading the URL are not caught
        here and propagate to the caller.
        '''
        # urlopen lives in urllib.request on Python 3 and in urllib on
        # Python 2; importing locally keeps this block usable on both.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen

        sock = urlopen(self.url)
        try:
            html = sock.read()
        finally:
            # Always release the connection, even when read() fails.
            sock.close()
        self.__puts(html)

    def find_links(self, html):
        '''
        Return a list of (href, title, target, text) tuples found in html.

        html may be text or raw bytes; bytes are decoded with
        self.encoding (UTF-8 when the attribute is absent), replacing
        undecodable sequences instead of raising.
        '''
        if isinstance(html, bytes):
            html = html.decode(getattr(self, 'encoding', 'utf-8'), 'replace')
        return self.LINK_PATTERN.findall(html)

    def __puts(self, html):
        '''
        Print the anchors matched in html, one per line, tab-separated.

        Printing the whole result list shows repr() of every element, so
        non-ASCII text comes out as escape sequences; printing each
        decoded string on its own shows the real characters.
        '''
        for link in self.find_links(html):
            print("%s\t%s\t%s\t%s" % link)
def regex_demo_lines():
    '''
    Build the output lines of the regular-expression demonstrations.

    Returns a list of strings in print order: the banner, then the
    match, split, findall and sub examples.
    '''
    lines = ["regular" + "* " * 30]

    # match: capture the three digit groups of a phone number
    phone = "800-820-8800"
    m = re.match(r"(\d{3})-(\d{3})-(\d{4})", phone)
    # Two spaces after the colon: the label already ends with one and
    # print adds another between the label and the tuple.
    lines.append("result :  %s" % (m.groups(),))

    # split: break the sentence on every non-word character
    lines.append("split : %s" % re.split(r'\W', 'Words, words, words.'))

    # findall: collect every word that ends in "ly"
    text = "He was carefully disguised but captured quickly by police."
    lines.append("findall :%s" % re.findall(r"\w+ly", text))

    # sub: replace each run of whitespace with "--"
    text = "hello world!"
    lines.append("sub:%s" % re.sub(r"\s+", "--", text))

    return lines


if __name__ == '__main__':
    # Download the blog channel page and print the links found in it.
    url = "http://www.iteye.com/blogs"
    p = ParseHTML(url)
    p.parse()
    # __debug__ is True unless the interpreter is started with -O.
    if __debug__:
        print("debuging is %s" % __debug__)
    for line in regex_demo_lines():
        print(line)