import urllib.request
from bs4 import BeautifulSoup
import io
import sys
# Re-wrap stdout as GB18030 so that printing scraped text containing characters
# the console's default codec cannot encode (e.g. '\xa9' under gbk) does not
# raise UnicodeEncodeError.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Table-of-contents page of the book; the chapter hrefs found on it are
# prefixed with the site root further down.
url = "http://www.shicimingju.com/book/sanguoyanyi.html"
# Browser-like User-Agent sent with every request (presumably so the site does
# not reject urllib's default agent -- not verifiable from this script).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
}


def _fetch_html(page_url):
    """Download *page_url* with the shared headers and return the body as text.

    The response is used as a context manager so the underlying connection is
    closed as soon as the body has been read, instead of being left open.

    Args:
        page_url: Absolute URL of the page to download.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    page_request = urllib.request.Request(url=page_url, headers=headers)
    with urllib.request.urlopen(page_request) as page_response:
        return page_response.read().decode("utf-8")


# Parse the table of contents and collect one <a> element per chapter.
soup = BeautifulSoup(_fetch_html(url), 'lxml')
ret = soup.select('.book-mulu >ul >li >a')

# Append every chapter (title immediately followed by its text) to one file.
with open('三国演义.txt', 'w', encoding='utf-8') as fp:
    for item in ret:
        url_1 = 'http://www.shicimingju.com' + item['href']
        # get_text() always returns a str, whereas .string is None when the
        # link holds more than one child node, which would make the
        # concatenation below fail with TypeError.
        title = item.get_text()
        print("正在爬取: %s" % title)
        soup_1 = BeautifulSoup(_fetch_html(url_1), 'lxml')
        # Raises IndexError if the chapter page has no .chapter_content node.
        string = soup_1.select('.chapter_content')[0].text
        fp.write(title + string)
        print("爬取结束:%s" % title)
报错:UnicodeEncodeError: 'gbk' codec can't encode character '\xa9' in position 30
原因:print 输出到控制台时,控制台默认的 gbk 编码无法表示字符 '\xa9'(©)。
解决:
import io
import sys
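# Change the encoding of standard output to GB18030 (a superset of gbk that can also encode '\xa9').
# Using utf-8 here instead makes Chinese text come out garbled on a gbk console.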
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
~~~