1. 抓取一段网页
http://blog.csdn.net/zsuguangh/article/details/6226385
---------------------------------------------------------------------------------------------------------------------------------------------------------
#!/usr/bin/env python
# 1.py
# use UTF-8
# Python 3.3.0
# get code of given URL as html text string
# Python3 uses urllib.request.urlopen()
# instead of Python2's urllib.urlopen() or urllib2.urlopen()
# http://blog.csdn.net/zsuguangh/article/details/6226385
import urllib.request
# Fetch the page. urlopen() returns a context manager, so the "with" block
# closes the connection even if read() raises.
with urllib.request.urlopen("http://www.baidu.com") as fp:
    mybytes = fp.read()
# Note that Python 3 returns the HTML as bytes, not as a string; convert it
# to a string with decode(). The data is treated as UTF-8 here, which lets
# Chinese text be parsed and displayed correctly.
mystr = mybytes.decode("utf8")
print(mystr)
---------------------------------------------------------------------------------------------------------------------------------------------------------
2. 分析html的编码方式(其实就是字符串的分析)
---------------------------------------------------------------------------------------------------------------------------------------------------------
#!/usr/bin/env python
# 2.py
# use UTF-8
# Python 3.3.0
# get the code of a given URL as html text string
# Python3 uses urllib.request.urlopen()
# get the encoding used first
# tested with Python 3.1 with the Editra IDE
import urllib.request
def extract(text, sub1, sub2):
    """
    Extract the substring of text that lies between the first
    occurrence of sub1 and the first occurrence of sub2 after it.

    Returns an empty string when sub1 does not occur in text, so callers
    can use a plain truthiness check to detect "not found". When sub2
    does not occur after sub1, everything after sub1 is returned.
    """
    _, marker, tail = text.partition(sub1)
    if not marker:
        # sub1 is absent: report "nothing found" rather than returning an
        # unrelated prefix of text.
        return ""
    return tail.partition(sub2)[0]
# Open the URL and read the raw HTML. The "with" block closes the
# connection even if read() raises.
with urllib.request.urlopen("http://www.baidu.com") as fp:
    mybytes = fp.read()
# Find the encoding by searching the HTML for the text between 'charset='
# and the next '"'. The bytes are decoded as ASCII (non-ASCII bytes are
# dropped) for this search only, so the search runs over the page content
# rather than over the b'...' repr that str(bytes) would produce.
encoding = extract(mybytes.decode("ascii", errors="ignore").lower(), 'charset=', '"')
print('-'*50)
print( "Encoding type = %s" % encoding )
print('-'*50)
if encoding:
    # Note that Python 3 returns the HTML as bytes, not as a string;
    # convert it to a string using the encoding found above.
    mystr = mybytes.decode(encoding)
    print(mystr)
else:
    print("Encoding type not found!")
---------------------------------------------------------------------------------------------------------------------------------------------------------