Python + pandas：按 Excel 某一列的内容检索图片，并从网上下载保存到本地

gwx113036

于 2023-10-26 03:57:17 发布

阅读量127

点赞数

文章标签： python pandas excel

本文链接：https://blog.csdn.net/gwx113036/article/details/134047216

版权

代码如下：


import requests # 爬虫必备
import time # 限制爬虫速度
import os # 新建指定存储文件夹
import pandas as pd  #读取EXCEL文件 
import re #re是正则表达式模块
import time
import sys
from tqdm import tqdm
import json
import requests
import openpyxl
from openpyxl import Workbook
from openpyxl.drawing.image import Image
from openpyxl import load_workbook

# Progress-bar state.
# NOTE(review): the __main__ section creates its own bar, so this
# module-level one looks redundant -- confirm before removing it.
n = 0
pbar = tqdm(total=100)


# Directory that receives one log file per keyword whose search failed.
log_path = os.getcwd() + "/FailLogs/"
# exist_ok=True replaces the separate os.path.exists() test, which could race
# with another process creating the directory between the check and the call.
os.makedirs(log_path, exist_ok=True)

def filterHtmlTag(htmlstr):
    """Strip HTML markup from ``htmlstr`` and return the remaining text.

    Processing order: line endings are normalised to ``\\n``; CDATA sections,
    ``<script>`` and ``<style>`` elements are removed; ``<br>`` and ``</p>``
    become newlines; every other tag and HTML comment is removed; spaces,
    tabs, form feeds and vertical tabs are deleted (including those inside
    the text itself); runs of newlines collapse to one; leading/trailing
    whitespace is trimmed; finally character entities are decoded with
    ``replaceCharEntity``.

    Args:
        htmlstr: The markup to clean.

    Returns:
        The cleaned text.
    """
    # Normalise line endings. Each step has to build on the previous result;
    # restarting from ``htmlstr`` would silently discard the earlier step.
    s = htmlstr.replace('\r\n', '\n')
    s = s.replace('\r', '\n')

    # Patterns (raw strings so the regex escapes are not treated as
    # invalid Python string escapes).
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)  # CDATA section
    re_script = re.compile(r'<\s*script[^>]*>[\S\s]*?<\s*/\s*script\s*>', re.I)  # script element
    re_style = re.compile(r'<\s*style[^>]*>[\S\s]*?<\s*/\s*style\s*>', re.I)  # style element
    re_br = re.compile(r'<br\s*?/??>', re.I)  # <br> -> newline
    re_p = re.compile(r'</p>', re.I)  # </p> -> newline
    re_h = re.compile(r'<[!|/]?\w+[^>]*>', re.I)  # any other tag
    re_comment = re.compile(r'<!--[^>]*-->')  # HTML comment
    re_hendstr = re.compile(r'^\s*|\s*$')  # leading / trailing whitespace
    re_lineblank = re.compile(r'[\t\f\v ]+')  # in-line blank characters
    re_linenum = re.compile(r'\n+')  # consecutive newlines

    # Apply them. The order matters: block-level removals first, then the
    # generic tag pattern, then whitespace clean-up.
    s = re_cdata.sub('', s)
    s = re_script.sub('', s)
    s = re_style.sub('', s)
    s = re_br.sub('\n', s)
    s = re_p.sub('\n', s)
    s = re_h.sub('', s)
    s = re_comment.sub('', s)
    s = re_lineblank.sub('', s)
    s = re_linenum.sub('\n', s)
    s = re_hendstr.sub('', s)

    # Decode character entities last so decoded '<' / '>' are not mistaken
    # for markup by the patterns above.
    s = replaceCharEntity(s)
    return s

# Entity name (or decimal code) -> replacement character.
_CHAR_ENTITIES = {
    'nbsp': ' ', '160': ' ',
    'lt': '<', '60': '<',
    'gt': '>', '62': '>',
    'amp': '&', '38': '&',
    'quot': '"', '34': '"',
}
# Matches ``&name;`` and ``&#code;``; the part between ``&``/``&#`` and ``;``
# is captured as ``name``.
_RE_CHAR_ENTITY = re.compile(r'&#?(?P<name>\w+);')


def replaceCharEntity(htmlStr):
    """Decode the HTML character entities found in ``htmlStr``.

    Known entities (``nbsp``, ``lt``, ``gt``, ``amp``, ``quot`` and their
    decimal codes) are replaced by the character they stand for; any other
    entity is replaced by the empty string.

    The text is decoded in a single left-to-right pass, so the output of one
    replacement is never decoded again: ``&amp;lt;`` yields ``&lt;``, not
    ``<``.

    Args:
        htmlStr: Text that may contain character entities.

    Returns:
        The text with every entity replaced.
    """
    def _decode(match):
        # Unknown entities are dropped rather than kept verbatim.
        return _CHAR_ENTITIES.get(match.group('name'), '')

    return _RE_CHAR_ENTITY.sub(_decode, htmlStr)

# Extract Chinese characters
def chinese(words):
    """Return only the characters of ``words`` in the range U+4E00..U+9FA5.

    The matching characters are kept in their original order and joined into
    a single string; everything else is dropped.
    """
    kept = [ch for ch in words if '\u4e00' <= ch <= '\u9fa5']
    return ''.join(kept)

# Baidu image search: fetch up to 10 results
def get_img_url(keyword):
    """Query the Baidu image-search JSON endpoint and return thumbnail URLs.

    The response body is passed through ``filterHtmlTag`` and parsed as JSON;
    the ``thumbURL`` of every entry under ``data`` is collected.

    Any failure (network error, unparsable body, unexpected structure) is
    treated as best-effort: the cleaned response text, if one was received,
    and the exception are written to ``<cwd>/FailLogs/<keyword>.txt`` and
    whatever URLs were collected so far (possibly none) are returned.

    Args:
        keyword: Search term; also used (sanitised) as the log file name.

    Returns:
        A list of thumbnail URL strings, empty on failure.
    """
    # Endpoint
    url = 'https://image.baidu.com/search/acjson?'
    # Browser-like request header
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'}
    # Query parameters sent with the request
    params = {
        'tn': 'resultjson_com',
        'logid': '6918515619491695441',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': f'{keyword}',
        'word': f'{keyword}',
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '-1',
        'z': '',
        'ic': '',
        'hd': '',
        'latest': '',
        'copyright': '',
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'cg': 'girl',
        'pn': 1,
        'rn': '10',
        'gsm': '1e',
    }

    # Collected thumbnail URLs
    img_url_list = []
    # Bound before the try so the failure log can always reference it, even
    # when the request itself raises before any text was received.
    pattern = ''
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=url, headers=headers, params=params, timeout=10)
        response.encoding = 'utf-8'
        pattern = filterHtmlTag(response.text)
        json_dict = json.loads(pattern)
        for item in json_dict['data']:
            # The original code deleted a trailing empty entry from the list;
            # skipping every entry without a thumbnail covers that case
            # without discarding a valid last item or failing on an empty list.
            if isinstance(item, dict) and item.get('thumbURL'):
                img_url_list.append(item['thumbURL'])
    except Exception as e:  # deliberate best-effort boundary: log and go on
        log_dir = os.getcwd() + "/FailLogs/"
        # Characters that are not allowed in file names would make the log
        # write itself fail, so replace them.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', keyword.lstrip()) or 'empty_keyword'
        try:
            os.makedirs(log_dir, exist_ok=True)
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent the response text.
            with open(log_dir + safe_name + ".txt", "w", encoding="utf-8") as txt_file:
                txt_file.write(pattern + "无法编译的原因========>" + repr(e))
        except OSError as log_err:
            print(f'写入失败日志出错: {log_err!r}')
    return img_url_list

    

# Download images to the local disk
def get_down_img(img_url_list, dirs):
    """Download every URL in ``img_url_list`` into ``<cwd>/<dirs>/``.

    Files are named ``<index>.jpg`` where ``index`` is the position of the
    URL in the list, so a failed download leaves a gap in the numbering.
    Failures are reported on stdout and do not stop the remaining downloads.

    Args:
        img_url_list: Iterable of image URLs.
        dirs: Name of the target directory, relative to the current
            working directory; created if missing.

    Returns:
        None.
    """
    target_dir = os.getcwd() + "/" + dirs
    # exist_ok=True avoids the check-then-create race and creates exactly the
    # directory the files are written to.
    os.makedirs(target_dir, exist_ok=True)
    # Browser-like request header; identical for every request, so built once.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'}
    for n, img_url in enumerate(img_url_list):
        img_path = target_dir + "/" + str(n) + '.jpg'
        try:
            # The timeout keeps one stalled download from blocking the rest.
            res = requests.get(url=img_url, headers=headers, timeout=10)
            if res.status_code == 200:
                with open(img_path, 'wb') as file:
                    file.write(res.content)
            else:
                print(f'下载文件失败: {img_path} (HTTP {res.status_code})')
        except Exception as e:  # best-effort: report and continue
            # Report the path that was being written; the handler must not
            # raise itself, or the real error would be hidden.
            print(f'下载文件失败: {img_path}')
            print(repr(e))

def main():
    """Search, download and embed one picture per product name.

    Reads the product-name column of ``Sheet1``, and for every row: searches
    images for the name, downloads the results into a directory named after
    it, and anchors ``0.jpg`` from that directory in column G of the same
    row. The result is saved as a new workbook; the source file is not
    modified.
    """
    source_file = '嗨购仓维护主图.xlsx'
    name_column = '品名（必填）'

    # Product names, kept in row order (duplicates included) so that the
    # position in the list identifies the worksheet row.
    df1 = pd.read_excel(source_file, sheet_name='Sheet1')
    names = df1[name_column].tolist()
    print("总共有" + str(len(names)) + "个词条")

    wb = openpyxl.load_workbook(source_file)
    # Use the same sheet pandas read from; the active sheet may be another one.
    ws = wb['Sheet1']
    ws.column_dimensions['G'].width = 50

    # One tick per row, so the bar reflects real progress.
    for i, raw_name in enumerate(tqdm(names)):
        # Skip empty cells instead of searching for the literal text "nan".
        if pd.isna(raw_name):
            continue
        # Trailing whitespace is stripped too because the keyword is used as
        # a directory name.
        keyword = str(raw_name).strip()
        if not keyword:
            continue

        img_url_list = get_img_url(keyword)
        if not img_url_list:
            print("关键词：" + keyword + "下载失败！")
            continue

        get_down_img(img_url_list, keyword)
        first_image = os.getcwd() + "/" + keyword + "/" + '0.jpg'
        # The first download can fail on its own; without the file there is
        # nothing to embed.
        if not os.path.exists(first_image):
            print("关键词：" + keyword + "下载失败！")
            continue

        img = Image(first_image)
        img.width = 150
        img.height = 100
        # Row 1 holds the header, so data row i lives in worksheet row i + 2.
        # Using the position directly keeps duplicate names on their own rows
        # and keeps the anchor and the row height in agreement.
        excel_row = i + 2
        ws.add_image(img, 'G' + str(excel_row))
        ws.row_dimensions[excel_row].height = 100

    wb.save("嗨购仓维护主图_图片版本.xlsx")
    print("全部词条下载完成！")


if __name__ == '__main__':
    main()

运行结果：