python正则爬取糗事百科图片
刚学正则,最终没有成功,错误信息附后
import urllib.parse
import urllib.request
import re
import os
def handle_request(url, page):
url = url +str(page) + '/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
}
request = urllib.request.Request(url=url, headers=headers)
return request
def download_image(content):
pattern = re.compile(r'<img src="(.*?)\?imageView2/1/w/150/h/112" alt=.*?>',re.S)
lt = pattern.findall(content)
for image_src in lt:
image_src = 'https:'+image_src
dirname = 'qiutu'
if not os.path.exists(dirname):
os.mkdir(dirname)
filename = image_src.split('/')[-1]
filepath = dirname + '/' + filename
print("%s图片正在下载......" % filename)
urllib.request.urlretrieve(image_src, filepath)
print("%s图片下载结束......" % filename)
def main():
url = 'https://www.qiushibaike.com/8hr/page/'
start_page = int(input("请输入起始页码:"))
end_page = int(input("请输入结束页码:"))
for page in range(start_page,end_page +1):
print("第%s页开始下载......" % page)
request = handle_request(url, page)
content = urllib.request.urlopen(request).read().decode()
download_image(content)
print("第%s页下载结束......" % page)
print()
print()
if __name__ == '__main__':
main()
错误信息如下,求指点
请输入起始页码:1
请输入结束页码:1
第1页开始下载......
R8IQ90O6L6EQB0E9_hd.jpg图片正在下载......
R8IQ90O6L6EQB0E9_hd.jpg图片下载结束......
AJQ9NGEZP2R8DMOF_hd.jpg图片正在下载......
Traceback (most recent call last):
File "C:/Users/wxl69/PycharmProjects/Compass/0330/day3_5-zhengze.py", line 68, in <module>
main()
File "C:/Users/wxl69/PycharmProjects/Compass/0330/day3_5-zhengze.py", line 62, in main
download_image(content)
File "C:/Users/wxl69/PycharmProjects/Compass/0330/day3_5-zhengze.py", line 46, in download_image
urllib.request.urlretrieve(image_src, filepath)
File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 247, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 1362, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 1319, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1230, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1241, in _send_request
self.putrequest(method, url, **skips)
File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1092, in putrequest
self._validate_path(url)
File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1183, in _validate_path
raise InvalidURL(f"URL can't contain control characters. {url!r} "
http.client.InvalidURL: URL can't contain control characters. '/system/avtnew/3195/31951723/thumb/20180815155042.jpg?imageView2/1/w/50/h/50" alt="HGTGFHGH" />\n\n<span class="recmd-name">HGTG...</span>\n</a>\n</div>\n</div>\n</li>\n\n\n<!-- 相关推荐item -->\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<li class="item typs_video" id=\'qiushi_tag_123022151\'>\n<a class="recmd-left video" href="/article/123022151" rel="nofollow" target="_blank" οnclick="_hmt.push([\'_trackEvent\',\'web-list-video\',\'chick\'])">\n\n\n\n\n\n<img src="//qiubai-video-web.qiushibaike.com/AJQ9NGEZP2R8DMOF_hd.jpg' (found at least ' ')
Process finished with exit code 1