def get_page_content(url, fallback_encoding='gb2312'):
    """Fetch *url* and return the page body.

    The raw bytes are passed to ``chardet.detect`` to guess the character
    set. This function targets Python 2.7 (``urllib.urlopen``).

    Args:
        url: Address of the page to download.
        fallback_encoding: Codec used when chardet cannot name an encoding,
            or names one Python has no codec for. Defaults to ``'gb2312'``,
            the codec this function historically applied to every
            non-UTF-8 page.

    Returns:
        The undecoded byte string when the page is detected as UTF-8
        (callers have always received raw bytes in that case); otherwise
        the body decoded to unicode text, with undecodable bytes dropped.
    """
    response = urllib.urlopen(url)
    try:
        raw = response.read()
    finally:
        # Release the connection even if the read fails part-way.
        response.close()

    # chardet reports None as the encoding when it cannot make a guess.
    detected = chardet.detect(raw)['encoding']

    # Case-insensitive so that any spelling of the codec name is recognised.
    if detected and detected.lower() == 'utf-8':
        return raw

    # Decode with the charset that was actually detected rather than assuming
    # GB2312 for everything, which garbles pages in any other encoding.
    try:
        return raw.decode(detected or fallback_encoding, 'ignore')
    except LookupError:
        # chardet named an encoding that this Python build has no codec for.
        return raw.decode(fallback_encoding, 'ignore')
其中需要用到的库是:chardet(第三方库,需要单独安装)和 urllib(Python 2 标准库)。
以上代码基于 Python 2.7!