Python 获取 HTML 时,如果 URL 中存在中文,会抛出 UnicodeEncodeError 异常
参考 https://www.cnblogs.com/jessicaxu/p/7977277.html
UnicodeEncodeError: 'ascii' codec can't encode characters in position 19-24: ordinal not in range(128)
def getHtml(url, timeout=1):
    """Fetch ``url`` with ``urllib.request.urlopen`` and return the body as text.

    This is the version of the helper *before* the fix: the URL is handed to
    ``urllib`` unchanged, so an http(s) URL that contains non-ASCII characters
    (for example Chinese) fails with ``UnicodeEncodeError`` when the request
    line is encoded as ASCII.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``urlopen`` as its ``timeout``; defaults
            to 1, the value that used to be hard-coded.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        UnicodeEncodeError: If ``url`` contains non-ASCII characters.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    print("开始访问:" + url)
    # Without a browser-like User-Agent the request is rejected with
    # "HTTP Error 403: Forbidden" because the site blocks crawlers. The value
    # can be copied from a browser's developer tools (e.g. Firefox FireBug).
    # Alternative values:
    #   Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)
    #   Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the response even if read()/decode() raises.
    with urllib.request.urlopen(req, timeout=timeout) as page:
        # In Python 3 read() returns bytes, so the body is decoded explicitly.
        html = page.read().decode("utf-8")
    print("访问成功:" + url)
    return html
cUrl https://www.aa223.com/yousheng/list-诱惑短篇小说.html
开始访问:https://www.aa223.com/yousheng/list-诱惑短篇小说.html
Traceback (most recent call last):
File "E:\GitMulu\PythonCrawler\src\kxf\test\mp3\__init__.py", line 107, in <module>
htmlChild = getHtml(str(cUrl))
File "E:\GitMulu\PythonCrawler\src\kxf\test\utils\__init__.py", line 46, in getHtml
page = urllib.request.urlopen(req, timeout=1)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 543, in _open
'_open', req)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 1360, in https_open
context=self._context, check_hostname=self._check_hostname)
File "D:\Programs\Python\Python37\lib\urllib\request.py", line 1317, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "D:\Programs\Python\Python37\lib\http\client.py", line 1244, in request
self._send_request(method, url, body, headers, encode_chunked)
File "D:\Programs\Python\Python37\lib\http\client.py", line 1255, in _send_request
self.putrequest(method, url, **skips)
File "D:\Programs\Python\Python37\lib\http\client.py", line 1122, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 19-24: ordinal not in range(128)
对 URL 中的中文进行 URL 编码(percent-encoding)就可以解决,如下
def getHtml(url, timeout=10):
    """Fetch ``url`` with ``urllib.request.urlopen`` and return the body as text.

    The URL is first passed through ``getUrlCode`` so that Chinese characters
    are percent-encoded; sending them raw makes ``urlopen`` fail with
    ``UnicodeEncodeError`` when the request line is encoded as ASCII.

    Args:
        url: Address to fetch; may contain Chinese characters.
        timeout: Seconds passed to ``urlopen`` as its ``timeout``; defaults
            to 10, the value that used to be hard-coded.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    url = getUrlCode(url)
    # Without a browser-like User-Agent the request is rejected with
    # "HTTP Error 403: Forbidden" because the site blocks crawlers. The value
    # can be copied from a browser's developer tools (e.g. Firefox FireBug).
    # Alternative values:
    #   Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)
    #   Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the response even if read()/decode() raises.
    with urllib.request.urlopen(req, timeout=timeout) as page:
        # In Python 3 read() returns bytes, so the body is decoded explicitly.
        html = page.read().decode("utf-8")
    # Note: this prints the already percent-encoded URL.
    print("访问成功:" + url)
    return html
def getUrlCode(url):
    """Percent-encode every non-ASCII character in ``url``.

    ASCII characters, including URL delimiters and existing ``%XX`` escapes,
    are left exactly as they are, so an already-encoded URL is returned
    unchanged. Each run of non-ASCII characters (Chinese, full-width
    punctuation, accented letters, ...) is replaced by its UTF-8
    percent-encoding, e.g. ``哈哈`` becomes ``%E5%93%88%E5%93%88``.

    The pattern is applied to the whole string, so non-ASCII characters in
    the host name are percent-encoded as well (no IDNA conversion is done).

    Args:
        url: The URL text to encode.

    Returns:
        ``url`` with all non-ASCII characters percent-encoded.
    """
    print("getUrlCode " + url)
    # Match anything outside ASCII rather than only the CJK ideograph range:
    # any non-ASCII character left in the URL makes urlopen raise
    # UnicodeEncodeError when the request line is encoded as ASCII.
    for name1 in re.findall(r"[^\x00-\x7f]+", url):
        str2 = parse.quote(name1)  # e.g. 哈哈 -> %E5%93%88%E5%93%88
        print(str2)
        url = url.replace(name1, str2)
    print("getUrlCode =====" + url)
    return url