1、urlparse
#!D:\installedsoftware\python27\python.exe
# -*- coding:UTF-8 -*-
# Demo of the three core URL helpers: urlparse (split a URL into components),
# urlunparse (rebuild a URL from components) and urljoin (resolve a relative
# reference against a base URL).
#
# The shebang has to be the very first line to take effect, with the coding
# declaration right after it. Every print uses the single-argument call form
# so the script produces the same output on Python 2 and Python 3.
try:
    # Python 3 moved the urlparse module to urllib.parse.
    from urllib import parse as urlparse
except ImportError:
    # Python 2 ships it as a top-level module.
    import urlparse

print("用Google搜索python时地址栏中URL的解析结果")
parsedTuple = urlparse.urlparse(
    "http://www.google.com/search?hl=en&q=python&btnG=Google+Search")
print(parsedTuple)

# Reassemble individual components back into a URL.
print("反解析python文档页面的URL")
URLscheme = "http"
URLlocation = "www.python.org"
URLpath = "lib/module-urlparse.html"
modList = ("urllib", "urllib2",
           "httplib", "cgilib")
# The six-tuple is (scheme, netloc, path, params, query, fragment).
unparsedURL = urlparse.urlunparse(
    (URLscheme, URLlocation, URLpath, '', '', ''))
print("\t" + unparsedURL)

# Join the base URL with a new file name: the last path segment of the base
# is replaced by the relative reference.
print(" 利用拼接方式添加更多python文档页面的URL")
for mod in modList:
    newURL = urlparse.urljoin(unparsedURL,
                              "module-%s.html" % (mod))
    print("\t" + newURL)

# Join the base URL with a relative reference that contains a sub-path.
print("通过拼接子路径生成URL")
newURL = urlparse.urljoin(unparsedURL,
                          "module-urllib2/request-objects.html")
print("\t" + newURL)
执行结果如下:
用Google搜索python时地址栏中URL的解析结果
ParseResult(scheme='http', netloc='www.google.com', path='/search', params='', query='hl=en&q=python&btnG=Google+Search', fragment='')
反解析python文档页面的URL
http://www.python.org/lib/module-urlparse.html
利用拼接方式添加更多python文档页面的URL
http://www.python.org/lib/module-urllib.html
http://www.python.org/lib/module-urllib2.html
http://www.python.org/lib/module-httplib.html
http://www.python.org/lib/module-cgilib.html
通过拼接子路径生成URL
http://www.python.org/lib/module-urllib2/request-objects.html
2、urllib
# -*- coding:utf-8 -*-
# Fetch a remote page, then print the response headers, the final URL and the
# number of bytes read.
import urllib
from contextlib import closing

try:
    # Python 3: urlopen lives in urllib.request.
    from urllib.request import urlopen
except ImportError:
    # Python 2: same call the original script used.
    urlopen = urllib.urlopen

webURL = "http://www.qq.com"

# Open the remote page by URL. closing() makes sure the connection is
# released even if reading or printing raises.
with closing(urlopen(webURL)) as u:
    # Named "data" rather than "buffer" to avoid shadowing the Python 2 builtin.
    data = u.read()
    print(u.info())
    print("从%s读取了%d 字节数据. " % (u.geturl(), len(data)))
执行结果如下:
Server: squid/3.5.20
Date: Thu, 29 Dec 2016 07:54:14 GMT
Content-Type: text/html; charset=GB2312
Connection: close
Vary: Accept-Encoding
Vary: Accept-Encoding
Expires: Thu, 29 Dec 2016 07:55:14 GMT
Cache-Control: max-age=60
Vary: Accept-Encoding
Vary: Accept-Encoding
X-Cache: HIT from shenzhen.qq.com
从http://www.qq.com读取了250186 字节数据.
3、HTMLParser
示例1:
try:
    # Python 3 location of the parser.
    from html.parser import HTMLParser
except ImportError:
    # Python 2 location of the parser.
    from HTMLParser import HTMLParser


# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    """Parser that prints every start tag, end tag and text node it meets."""

    def handle_starttag(self, tag, attrs):
        """Print the name of an opening tag; *attrs* is not used."""
        print("Encountered a start tag: %s" % tag)

    def handle_endtag(self, tag):
        """Print the name of a closing tag."""
        print("Encountered an end tag : %s" % tag)

    def handle_data(self, data):
        """Print a run of text found between tags."""
        print("Encountered some data : %s" % data)


# instantiate the parser and feed it some HTML
parser = MyHTMLParser()
parser.feed('<html><head><title>Test</title></head>'
            '<body><h1>Parse me!</h1></body></html>')
# close() forces processing of anything still buffered after the last feed().
parser.close()
执行结果:
Encountered a start tag: html
Encountered a start tag: head
Encountered a start tag: title
Encountered some data : Test
Encountered an end tag : title
Encountered an end tag : head
Encountered a start tag: body
Encountered a start tag: h1
Encountered some data : Parse me!
Encountered an end tag : h1
Encountered an end tag : body
Encountered an end tag : html
示例2:
try:
    # Python 3 location of the parser.
    from html.parser import HTMLParser
except ImportError:
    # Python 2 location of the parser.
    from HTMLParser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Parser that collects link targets into ``self.links``.

    The ``href`` of every ``a`` tag and the ``src`` of every ``img`` tag are
    appended to ``links`` in document order. Both spellings of a tag are
    handled, ``<tag ...>`` and the self-closed ``<tag ... />``, so a plain
    ``<img src="...">`` is collected as well.
    """

    # Tag name -> name of the attribute that carries the link target.
    _LINK_ATTRS = {"a": "href", "img": "src"}

    def __init__(self):
        # Explicit base-class call (not super()) so this also works if the
        # Python 2 HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        self.links = []

    def _collect_links(self, tag, attrs):
        """Append the link attribute values of *tag*, if it is a link tag."""
        wanted = self._LINK_ATTRS.get(tag)
        if wanted is None:
            return
        # An empty attrs list simply contributes nothing.
        self.links.extend(value for name, value in attrs if name == wanted)

    def handle_starttag(self, tag, attrs):
        """Report an opening tag and record its link target, if any."""
        print("Encountered the beginning of a %s tag" % tag)
        self._collect_links(tag, attrs)

    def handle_startendtag(self, tag, attrs):
        """Report a self-closed tag and record its link target, if any."""
        print("\nEncountered the beginning of a %s tag\n" % tag)
        self._collect_links(tag, attrs)


if __name__ == "__main__":
    html_code = """ <a href="www.google.com"> google.com</a>
<img src='http://www.google.com/intl/zh-CN_ALL/images/logo.gif' />
<A Href="www.pythonclub.org"> PythonClub </a>
<A HREF = "www.sina.com.cn"> Sina </a> """
    hp = MyHTMLParser()
    hp.feed(html_code)
    hp.close()
    print(hp.links)
执行结果:
Encountered the beginning of a a tag
Encountered the beginning of a img tag
Encountered the beginning of a a tag
Encountered the beginning of a a tag
['www.google.com', 'http://www.google.com/intl/zh-CN_ALL/images/logo.gif', 'www.pythonclub.org', 'www.sina.com.cn']