day10 下载

最新推荐文章于 2020-10-31 16:41:36 发布

两条小鱼

最新推荐文章于 2020-10-31 16:41:36 发布

阅读量232

点赞数 1

分类专栏：笔记

本文链接：https://blog.csdn.net/weixin_40447206/article/details/81227025

版权

笔记专栏收录该内容

54 篇文章 0 订阅

订阅专栏

from urllib import request
import os
import re

def download_file(url, dest_dir):
    """Download *url* and save it inside *dest_dir*.

    The local file name is the text after the last ``/`` of *url*; the
    payload is streamed to disk in 4096-byte chunks so it is never held
    in memory as a whole.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.
        dest_dir: Existing directory the file is written into.

    Raises:
        Whatever ``request.urlopen`` or ``open`` raise (for example
        ``urllib.error.URLError`` or ``OSError``); nothing is caught here.
    """
    dst_fname = os.path.join(dest_dir, url.split('/')[-1])
    # Open the URL first so a failed request does not leave an empty file
    # behind, and use context managers so both the response and the output
    # file are closed even if the copy is interrupted.
    with request.urlopen(url) as resp, open(dst_fname, 'wb') as fobj:
        while True:
            data = resp.read(4096)
            if not data:
                break
            fobj.write(data)

def get_patt(fname, patt):
    """Collect the first match of *patt* on every line of file *fname*.

    The file is read as bytes and each line is decoded as UTF-8 on its
    own, so one badly encoded line does not abort the whole scan.

    Args:
        fname: Path of the file to scan.
        patt: Regular expression source string, compiled once up front.

    Returns:
        A list with the full matched text (``match.group()``) of the first
        hit on each matching line, in file order. Lines that are not valid
        UTF-8 are skipped; at most one match is taken per line.
    """
    patt_list = []
    cpatt = re.compile(patt)
    with open(fname, 'rb') as fobj:
        for raw_line in fobj:
            try:
                line = raw_line.decode('utf8')
            except UnicodeDecodeError:
                # Best effort: ignore lines in another encoding. Only the
                # decode error is caught so I/O errors and Ctrl-C still
                # propagate instead of being silently swallowed.
                continue
            m = cpatt.search(line)
            if m:
                patt_list.append(m.group())
    return patt_list

if __name__ == '__main__':
    # Create the download directory if needed. exist_ok=True keeps repeated
    # runs from failing; os._exists() is a private helper that looks up a
    # name inside the os module, not a filesystem path, so it cannot be used
    # as an existence check here.
    os.makedirs('/tmp/netease', exist_ok=True)
    # Fetch the front page, saved as /tmp/netease/index.html.
    download_file('http://sports.163.com/index.html', '/tmp/netease')
    # Raw string so \s and \. reach the regex engine unchanged instead of
    # being treated as (invalid) string escapes.
    url_patt = r'http://[^\s;)(:]+\.(png|jpeg|jpg)'
    url_list = get_patt('/tmp/netease/index.html', url_patt)
    # Download every image URL found in the page into the same directory.
    for img_url in url_list:
        download_file(img_url, '/tmp/netease')