今爬取糗事百科图片----文科生的python自学之路(8)

python正则爬取糗事百科图片

刚学正则,最终没有成功,错误信息附后

# -*- coding: utf-8 -*- 
# coding:unicode_escape
# @Time : 2020/4/22 23:00
# @Author : Compass 
# @File : day3_5-zhengze.py


# 糗事百科
import urllib.parse
import urllib.request
import re
import os
#根据函数生成请求对象
def handle_request(url, page):
    url = url +str(page) + '/'
    # 测试是否成功构建地址
    # print(url)
    # 构建请求头
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    }
    # 构建请求request
    request = urllib.request.Request(url=url, headers=headers)
    return request

def download_image(content):
    # 每一页调用此函数,共17个
    # 根据内容下载图片,?分析网页代码?
    pattern = re.compile(r'<img src="(.*?)\?imageView2/1/w/150/h/112" alt=.*?>',re.S)
    lt = pattern.findall(content)
    # print(lt)
    # print(len(lt)) 此时的链接不完整 缺少http
    # 遍历列表,依次下载图片
    for image_src in lt:
        # 先处理image_src
        image_src = 'https:'+image_src
        # 发送请求,下载图片
        # 创建文件夹
        dirname = 'qiutu'
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        # 图片的名字  按照斜杆切割,用本名
        filename = image_src.split('/')[-1]
        filepath = dirname + '/' + filename
        print("%s图片正在下载......" % filename)
        urllib.request.urlretrieve(image_src, filepath)
        print("%s图片下载结束......" % filename)

def main():
    url = 'https://www.qiushibaike.com/8hr/page/'
    start_page = int(input("请输入起始页码:"))
    end_page = int(input("请输入结束页码:"))
    for page in range(start_page,end_page +1):
        # 打印下载信息
        print("第%s页开始下载......" % page)
        # 生成请求对象
        request = handle_request(url, page)
        # 发送请求对象,获取响应内容
        content = urllib.request.urlopen(request).read().decode()
        # 原编码为utf-8
        # 解析内容,根据content提取所有图片链接,下载图片,每一页都调取此函数
        download_image(content)
        print("第%s页下载结束......" % page)
        print()
        print()

if __name__ == '__main__':
    main()

错误信息如下,求指点

请输入起始页码:1
请输入结束页码:11页开始下载......
R8IQ90O6L6EQB0E9_hd.jpg图片正在下载......
R8IQ90O6L6EQB0E9_hd.jpg图片下载结束......
AJQ9NGEZP2R8DMOF_hd.jpg图片正在下载......
Traceback (most recent call last):
  File "C:/Users/wxl69/PycharmProjects/Compass/0330/day3_5-zhengze.py", line 68, in <module>
    main()
  File "C:/Users/wxl69/PycharmProjects/Compass/0330/day3_5-zhengze.py", line 62, in main
    download_image(content)
  File "C:/Users/wxl69/PycharmProjects/Compass/0330/day3_5-zhengze.py", line 46, in download_image
    urllib.request.urlretrieve(image_src, filepath)
  File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 247, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 525, in open
    response = self._open(req, data)
  File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 502, in _call_chain
    result = func(*args)
  File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 1362, in https_open
    return self.do_open(http.client.HTTPSConnection, req,
  File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 1319, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
  File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1230, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1241, in _send_request
    self.putrequest(method, url, **skips)
  File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1092, in putrequest
    self._validate_path(url)
  File "C:\Users\wxl69\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1183, in _validate_path
    raise InvalidURL(f"URL can't contain control characters. {url!r} "
http.client.InvalidURL: URL can't contain control characters. '/system/avtnew/3195/31951723/thumb/20180815155042.jpg?imageView2/1/w/50/h/50" alt="HGTGFHGH" />\n\n<span class="recmd-name">HGTG...</span>\n</a>\n</div>\n</div>\n</li>\n\n\n<!-- 相关推荐item -->\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<li class="item typs_video" id=\'qiushi_tag_123022151\'>\n<a class="recmd-left video" href="/article/123022151" rel="nofollow" target="_blank" οnclick="_hmt.push([\'_trackEvent\',\'web-list-video\',\'chick\'])">\n\n\n\n\n\n<img src="//qiubai-video-web.qiushibaike.com/AJQ9NGEZP2R8DMOF_hd.jpg' (found at least ' ')

Process finished with exit code 1
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值