以下代码在ipython执行无报错,且有正确结果,但在pycharm执行就报错,错误代码见第二段
# coding=utf-8
import re
import urllib.request
def getHtml(url):
page = urllib.request.urlopen(url)
html = page.read()
html = html.decode('utf-8')
return html
def getImg(html):
reg = r'<p class="img_title">(.*)</p>'
img_title = re.compile(reg)
imglist = re.findall(img_title, html)
return imglist
url = "https://tieba.baidu.com"
html = getHtml(url)
imglist = getImg(html)
print(imglist)
/Users/zhu/python3/env/bin/python /Users/zhu/python3/the19/ex3.py
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 1318, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1026, in _send_output
self.send(msg)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 964, in send
self.connect()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1400, in connect
server_hostname=server_hostname)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/ssl.py", line 407, in wrap_socket
_context=self, _session=session)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/ssl.py", line 814, in __init__
self.do_handshake()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/ssl.py", line 1068, in do_handshake
self._sslobj.do_handshake()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/ssl.py", line 689, in do_handshake
self._sslobj.do_handshake()
ssl.SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:777)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/zhu/python3/the19/ex3.py", line 21, in <module>
html = getHtml(url)
File "/Users/zhu/python3/the19/ex3.py", line 7, in getHtml
page = urllib.request.urlopen(url)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 526, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 544, in _open
'_open', req)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 1361, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 1320, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:777)>
Process finished with exit code 1
因为爬虫对象是https链接,导入一个ssl模块就可以解决问题,最后代码改成如下:
# coding=utf-8
import re
import urllib.request
import ssl
def getHtml(url):
page = urllib.request.urlopen(url)
html = page.read()
html = html.decode('utf-8')
return html
def getImg(html):
reg = r'<p class="img_title">(.*)</p>'
img_title = re.compile(reg)
imglist = re.findall(img_title, html)
return imglist
ssl._create_default_https_context = ssl._create_unverified_context
url = "https://tieba.baidu.com"
html = getHtml(url)
imglist = getImg(html)
print(imglist)