遇到解码问题,搜索后找到解决方法。源码经过修改如下:
import re
from urllib.request import urlopen
myurl = 'http://www.purepen.com/hlm/'
myhost = myurl  # chapter links found further down are appended to this base URL


def _detect_charset(raw, default='gb18030'):
    """Return the codec name to use for decoding the HTML bytes *raw*.

    The first ``charset=...`` declaration in *raw* is used, matched
    case-insensitively and with an optional opening quote.  A declared
    ``gb2312`` is widened to ``gb18030`` (as this script has always done);
    any other declared name is returned lower-cased.  *default* is returned
    when the page declares no charset at all, so the caller never ends up
    without a codec name.
    """
    match = re.search(rb'charset=["\']?([a-z0-9_-]+)', raw, re.IGNORECASE)
    if match is None:
        return default
    declared = match.group(1).decode('ascii').lower()
    # gb18030 is a superset of gb2312, so it also decodes pages that are
    # labelled gb2312 but contain characters outside that set.
    return 'gb18030' if declared == 'gb2312' else declared


# Fetch the index page; the context manager closes the HTTP response.
with urlopen(myurl) as response:
    smsg = response.read()

# Choose the decoding charset from the page's own declaration, then decode.
charset = _detect_charset(smsg)
smsg = smsg.decode(charset)

# Open (and truncate) the output file only after the index page has been
# downloaded and decoded, so a failed fetch does not wipe an existing file.
# The handle is written to and closed by the code further down the script.
f = open('d:/mytemp/红楼梦.txt', 'w+', encoding='gb18030')
# NOTE(review): the statements from here down to the replace() calls appear to
# have been damaged when this script was copied out of a web page: several
# string literals are empty or broken across two lines, and some statements
# are fused onto a single line, so this section does not parse as Python.
# Presumably HTML tag markers inside the literals were stripped -- TODO
# restore them from the original script before running.
#
# What the visible code does: find the "第 一 回" marker in the decoded index
# page (smsg), slice a portion of the page starting at that marker into t,
# and build chapurl by appending a slice of t to myhost.
tmsg = smsg.find("
第 一 回")t = smsg[tmsg:smsg.find("",tmsg)]
")chapurl = myhost + t[tmsg + 9 : t.find("\"",tmsg + 12)]
#chapname = t[t.find("html\">")+6:t.find("
",t.find("html\">")+6)]+"\n"tmsg = t.find("")
t = t[tmsg+6:]
# Download the chapter page. It is always decoded as gb18030 here, regardless
# of the charset that was detected for the index page. chapname and content
# are then cut out of the page text with find() offsets; content starts 9
# characters after the 'size="3">' marker.
temp = urlopen(chapurl).read().decode('gb18030')
chapname = temp[temp.find("")+3:temp.find("")] + '\n'
")]content = temp[temp.find("size=\"3\">")+9:temp.find("")] + "\n"
# Strip tab characters, a second (damaged) marker, and the whitespace
# character in the final literal from the chapter text.
content = content.replace("\t","")
content = content.replace("
","")
content = content.replace(" ","")
# Re-flow the chapter text and write it to the output file: a line shorter
# than 33 characters gets a newline appended, while longer lines are written
# without one and therefore run straight into the following line.
# Presumably long lines are hard-wrapped paragraph text and short lines end
# a paragraph -- verify the 33-character threshold against the real pages.
# NOTE(review): indentation was lost in this file; this assumes every line is
# written and only the newline append is conditional -- confirm.
for line in content.split('\n'):
    if len(line) < 33:
        line += '\n'
    f.write(line)
f.close()