首先安装 BeautifulSoup（bs4）：http://www.crummy.com/software/BeautifulSoup/
然后编写代码如下：
import urllib.request
import sys
from bs4 import BeautifulSoup
from html.parser import HTMLParser
# Script: download the Baidu home page and pretty-print its parsed HTML.

# Filesystem encoding of the local machine; looked up for reference only and
# not used below. Named so that it does not shadow the builtin `type`.
fs_encoding = sys.getfilesystemencoding()

request = urllib.request.Request("http://www.baidu.com")
# The context manager closes the HTTP connection once the body has been read.
with urllib.request.urlopen(request) as response:
    the_page = response.read()

# Tell BeautifulSoup the encoding of the page, and name the parser explicitly
# so the result does not depend on which optional parsers are installed.
parser = BeautifulSoup(the_page, "html.parser", from_encoding="utf-8")
print(parser.prettify(formatter="minimal"))
结果如下:
<!DOCTYPE doctype html>
<html>
<head>
<meta content="text/html;charset=utf-8" http-equiv="Content-Type">
<title>
百度一下,你就知道
</title>
<style>
html{overflow-y:auto}body{font:12px arial;text-align:center;background:#fff}body,p,form,ul,li{margin:0;padding:0;list-style:none}body,form,#fm{position:relative}td{text-align:left}img{border:0}a{color:#00c}a:active{color:#f60}#u{color:#999;padding:4px 10px 5px 0;text-align:right}#u a{margin:0 5px}#u .reg{margin:0}#m{width:680px;margin:0 auto;}#nv a,#nv b,.btn,#lk{font-size:14px}#fm{padding-left:90px;text-align:left}input{border:0;padding:0}#nv{height:19px;font-size:16px;margin:0 0 4px;text-align:left;text-indent:117px;}.s_ipt_wr{width:418px;height:30px;display:inline-block;margin-right:5px;background:url(http://s1.bdstatic.com/r/www/img/i-1.0.0.png) no-repeat -304px 0;border:1px solid #b6b6b6;border-color:#9a9a9a #cdcdcd #cdcdcd #9a9a9a;vertical-align:top}.s_ipt{width:405px;height:22px;font:16px/22px arial;margin:5px 0 0 7px;background:#fff;outline:none;-webkit-appearance:none}.s_btn{width:95px;height:32px;padding-top:2px\9;font-size:14px;background:#ddd url(http://s1.bdstatic.com/r/www/img/i-1.0.0.png);cursor:pointer}.s_btn_h{background-position:-100px 0}.s_btn_wr{width:97px;height:34px;display:inline-block;background:url(http://s1.bdstatic.com/r/www/img/i-1.0.0.png) no-repeat -202px 0;*position:relative;z-index:0;vertical-align:top}#lg img{vertical-align:top;margin-bottom:3px}#lk{margin:33px 0}#lk span{font:14px "宋体"}#lm{height:60px}#lh{margin:16px 0 5px;word-spacing:3px}.tools{position:absolute;top:-4px;*top:10px;right:-13px;}#mHolder{width:62px;position:relative;z-index:296;display:none}#mCon{height:18px;line-height:18px;position:absolute;cursor:pointer;padding:0 18px 0 0;background:url(http://s1.bdstatic.com/r/www/img/bg-1.0.0.gif) no-repeat right -134px;background-position:right -136px\9}#mCon span{color:#00c;cursor:default;display:block}#mCon .hw{text-decoration:underline;cursor:pointer}#mMenu a{width:100%;height:100%;display:block;line-height:22px;text-indent:6px;text-decoration:none;filter:none\9}#mMenu,#user ul{box-shadow:1px 1px 2px 
#ccc;-moz-box-shadow:1px 1px 2px #ccc;-webkit-box-shadow:1px 1px 2px #ccc;filter: progid:DXImageTransform.Microsoft.Shadow(Strength=2, Direction=135, Color="#cccccc")\9;}#mMenu{width:56px;border:1px solid #9b9b9b;list-style:none;position:absolute;right:7px;top:28px;display:none;background:#fff}#mMenu a:hover{background:#ebebeb}#mMenu .ln{height:1px;background:#ebebeb;overflow:hidden;font-size:1px;line-height:1px;margin-top:-1px}#cp,#cp a{color:#77c}#seth{display:none;behavior:url(#default#homepage)}#setf{display:none;}#sekj{margin-left:14px;}
</style>
<script type="text/javascript">
function h(obj){obj.style.behavior='url(#default#homepage)';var a = obj.setHomePage('http://www.baidu.com/');}
</script>
</meta>
</head>
<body>
<div id="u">
<a href="http://www.baidu.com/gaoji/preferences.html" name="tj_setting">
搜索设置
</a>
|
<a href="https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F" id="lb" name="tj_login" onclick="return false;">
登录
</a>
<a class="reg" href="https://passport.baidu.com/v2/?reg&regType=1&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F" name="tj_reg" target="_blank">
注册
</a>
</div>
<div id="m">
<p id="lg">
<img height="129" src="http://www.baidu.com/img/baidu_sylogo1.gif" usemap="#mp" width="270">
<map name="mp">
<area coords="40,25,230,95" href="http://hi.baidu.com/baidu/" shape="rect" target="_blank" title="点此进入 百度的空间">
</area>
</map>
</img>
</p>
<p id="nv">
<a href="http://news.baidu.com">
新 闻
</a>
<b>
网 页
</b>
<a href="http://tieba.baidu.com">
贴 吧
</a>
<a href="http://zhidao.baidu.com">
知 道
</a>
<a href="http://mp3.baidu.com">
MP3
</a>
<a href="http://image.baidu.com">
图 片
</a>
<a href="http://video.baidu.com">
视 频
</a>
<a href="http://map.baidu.com">
地 图
</a>
</p>
<div id="fm">
<form action="/s" name="f">
<span class="s_ipt_wr">
<input class="s_ipt" id="kw" maxlength="100" name="wd" type="text"/>
</span>
<input name="rsv_bp" type="hidden" value="0">
<input name="rsv_spt" type="hidden" value="3">
<span class="s_btn_wr">
<input class="s_btn" id="su" onmousedown="this.className='s_btn s_btn_h'" onmouseout="this.className='s_btn'" type="submit" value="百度一下"/>
</span>
</input>
</input>
</form>
<span class="tools">
<span id="mHolder">
<div id="mCon">
<span>
输入法
</span>
</div>
</span>
</span>
<ul id="mMenu">
<li>
<a href="#" name="ime_hw">
手写
</a>
</li>
<li>
<a href="#" name="ime_py">
拼音
</a>
</li>
<li class="ln">
</li>
<li>
<a href="#" name="ime_cl">
关闭
</a>
</li>
</ul>
</div>
<p id="lk">
<a href="http://baike.baidu.com">
百科
</a>
<a href="http://wenku.baidu.com">
文库
</a>
<a href="http://www.hao123.com">
hao123
</a>
<span>
|
<a href="/more/">
更多>>
</a>
</span>
</p>
<p id="lm">
</p>
<p>
<a href="http://utility.baidu.com/traf/click.php?id=215&url=http://www.baidu.com" id="seth" onclick="h(this)" onmousedown="return ns_c({'fm':'behs','tab':'homepage','pos':0})">
把百度设为主页
</a>
<a href="http://www.baidu.com/cache/sethelp/index.html" id="setf" onmousedown="return ns_c({'fm':'behs','tab':'favorites','pos':0})" target="_blank">
把百度设为主页
</a>
<span id="sekj">
<a href="http://www.baidu.com/search/baidukuijie_mp.html" onmousedown="return ns_c({'fm':'behs','tab':'kuaijie','pos':1})" target="_blank">
把百度添加到桌面
</a>
</span>
</p>
<p id="lh">
<a href="http://e.baidu.com/?refer=888">
加入百度推广
</a>
|
<a href="http://top.baidu.com">
搜索风云榜
</a>
|
<a href="http://home.baidu.com">
关于百度
</a>
|
<a href="http://ir.baidu.com">
About Baidu
</a>
</p>
<p id="cp">
©2012 Baidu
<a href="/duty/">
使用百度前必读
</a>
<a href="http://www.miibeian.gov.cn" target="_blank">
京ICP证030173号
</a>
<img src="http://www.baidu.com/cache/global/img/gs.gif"/>
</p>
</div>
</body>
<script>
var bds={se:{},comm : {ishome : 1,sid : "",user : "",username : "",sugHost : "http://suggestion.baidu.com/su",loginAction : []}}
</script>
<script src="http://s1.bdstatic.com/r/www/cache/global/js/home-1.1.js" type="text/javascript">
</script>
<script>
var bdUser = null;var w=window,d=document,n=navigator,k=d.f.wd,a=d.getElementById("nv").getElementsByTagName("a"),isIE=n.userAgent.indexOf("MSIE")!=-1&&!window.opera;for(var i=0;i<a.length;i++){a[i].onclick=function(){if(k.value.length>0){var o=this,h=o.href,q=encodeURIComponent(k.value);if(h.indexOf("q=")!=-1){o.href=h.replace(/q=[^&\x24]*/,"q="+q)}else{this.href+="?q="+q}}}};(function(){if(/q=([^&]+)/.test(location.search)){k.value=decodeURIComponent(RegExp["\x241"])}})();if(n.cookieEnabled&&!/sug?=0/.test(d.cookie)){bds.se.sug();};function addEV(o, e, f){if(w.attachEvent){o.attachEvent("on" + e, f);}else if(w.addEventListener){ o.addEventListener(e, f, false);}}function G(id){return d.getElementById(id);}function ns_c(q){var p = encodeURIComponent(window.document.location.href), sQ = '', sV = '', mu='', img = window["BD_PS_C" + (new Date()).getTime()] = new Image();for (v in q) {sV = q[v];sQ += v + "=" + sV + "&";} mu= "&mu=" + p ;img.src = "http://nsclick.baidu.com/v.gif?pid=201&pj=www&rsv_sid=&" + sQ + "path="+p+"&t="+new Date().getTime();return true;}if(/\bbdime=[12]/.test(d.cookie)){document.write('<script src=http://s1.bdstatic.com/r/www/cache/ime/js/openime-1.0.0.js><\/script>');}(function(){var u = G("u").getElementsByTagName("a"), nv = G("nv").getElementsByTagName("a"), lk = G("lk").getElementsByTagName("a"), un = "";var tj_nv = ["news","tieba","zhidao","mp3","img","video","map"];var tj_lk = ["baike","wenku","hao123","more"];un = bds.comm.user == "" ? "" : bds.comm.user;function _addTJ(obj){addEV(obj, "mousedown", function(e){var e = e || window.event;var target = e.target || e.srcElement;ns_c({'fm':'behs','tab':target.name||'tj_user','un':encodeURIComponent(un)});});}for(var i = 0; i < u.length; i++){_addTJ(u[i]);}for(var i = 0; i < nv.length; i++){nv[i].name = 'tj_' + tj_nv[i];_addTJ(nv[i]);}for(var i = 0; i < lk.length; i++){lk[i].name = 'tj_' + tj_lk[i];_addTJ(lk[i]);}})();addEV(w,"load",function(){k.focus()});w.onunload=function(){};
</script>
<script src="http://s1.bdstatic.com/r/www/cache/global/js/tangram-1.3.4c1.0.js" type="text/javascript">
</script>
<script src="http://s1.bdstatic.com/r/www/cache/user/js/u-1.3.1.js" type="text/javascript">
</script>
</html>
<!--254e1a7269462c92-->