Python3 获取网络图片并且保存到本地

最新推荐文章于 2024-08-19 22:21:41 发布

weixin_30949361

最新推荐文章于 2024-08-19 22:21:41 发布

阅读量1.8k

点赞数

文章标签： python

原文链接：http://www.cnblogs.com/lizm166/p/9680385.html

版权

Python3 获取网络图片并且保存到本地

import requests
from bs4 import BeautifulSoup
from urllib import request
import sys
import re
import os

def getNews(title,url,m):
    Hostreferer = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    req = request.Request(url)
    response = request.urlopen(req)
    #过滤非utf-8的网页新闻
    response = response.read().decode('utf-8',"ignore")
    soup = BeautifulSoup(response,'lxml')
    tag = soup.find('div',class_='article')    
    if tag == None:
        return 0#获取文章内容
    rep = re.compile("[\s+\.\!\/_,$%^*(+\"\']+|[+<>?、~*（）]+")
    title = rep.sub('',title)
    title = title.replace(':','：')
    filename = sys.path[0]+"/news/"+title+".txt"
    with open(filename,'w',encoding='utf8') as file_object:
        file_object.write("网址:"+url)
        file_object.write("\n")
        file_object.write(title)
        file_object.write(tag.get_text())

    i = 0
    for image in tag.find_all('div','img_wrapper'): 
        title_img = title +str(i)
        # 保存图片
        #判断目录是否存在
        if (os.path.exists(sys.path[0]+"/news/"+title)):
            pass
        else:
            #不存在，则新建目录
            os.mkdir(sys.path[0]+"/news/"+title)
        os.chdir(sys.path[0]+"/news/"+title)
        file_name = "http://news.***.com.cn/"+image.img.get('src').replace('//','')
        html = requests.get(file_name, headers=Hostreferer)
        # 图片不是文本文件，以二进制格式写入，所以是html.content
        title_img = title_img +".jpg"
        f = open(title_img, 'wb')
        f.write(html.content)
        f.close()
        i+=1

    print('成功爬取第', m,'个新闻',title)
    return 0