我使用 Beautiful Soup 解析京东的商品页面,想把页面里的文本全部提取出来,但是打印的时候发现全是乱码。京东的页面是用 UTF-8 编码的,我把提取出的文本从 UTF-8 转码成 GBK 时却遇到了错误。
下面是代码,请指教。
#encoding=gbk
from bs4 import BeautifulSoup
from bs4 import NavigableString
from bs4 import Comment
from bs4 import Doctype
import urllib2
def walker(soup, indent):
    """Collect the non-blank text under ``soup`` as an indented outline.

    Every non-blank string that is a direct child of ``soup`` becomes one
    output line prefixed with ``indent``; strings nested inside child tags
    are prefixed with one extra tab per nesting level.

    Args:
        soup: A BeautifulSoup object or Tag whose children are walked.
        indent: Prefix (a byte string) put in front of each text line found
            directly under ``soup``.

    Returns:
        A UTF-8 encoded byte string with one line per text node, or ""
        when ``soup`` has no tag name or contains no non-blank text.
    """
    # Nodes without a tag name have nothing to descend into.
    if soup.name is None:
        return ""

    pieces = []
    for child in soup.children:
        if isinstance(child, NavigableString):
            # Strip the unicode value first: this skips whitespace-only
            # nodes of any length, not just one-character ones.
            stripped = unicode(child).strip()
            if stripped:
                # Append instead of overwriting, so text gathered from
                # earlier siblings and subtrees is not thrown away.
                pieces.append(indent + stripped.encode('utf-8') + "\n")
        else:
            pieces.append(walker(child, indent + "\t"))
    return "".join(pieces)
if __name__ == "__main__":
soup = BeautifulSoup( urllib2.urlopen("http://item.jd.com/1592573020.html").read())
doctypes=soup.findAll(text=lambda text: isinstance(text, Doctype))
[doctype.extract() for doctype in doctypes]
comments = soup.findAll(text=lambda text:isinstance(text, Comment))
[comment.extract() for comment in comments]
for script in soup("script"):
script.extract()
for noscript in soup("noscript"):
noscript.extract()
for style in soup("style"):
style.extract()
text=walker(soup, "")
print "text", text.decode('utf-8').encode('gbk') #这里会出错