Python BeautifulSoup 实战演练

最新推荐文章于 2023-03-25 12:14:35 发布

古月随笔

最新推荐文章于 2023-03-25 12:14:35 发布

阅读量853

点赞数

分类专栏： Python 文章标签： python BeautifulSoup urlretrieve 图片下载

本文链接：https://blog.csdn.net/tigereg000/article/details/49490079

版权

Python 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

前几天接到一个比较有意思的需求：在天猫超市输入关键字，把搜索结果第一页的商品图片下载下来。基于这个需求，下面开始动手实现。

思路:

商品名称（即搜索关键字）从 Excel 文件中读取，然后逐个搜索并下载对应的商品图片。

excel读取方法:

#coding=utf-8
import xlrd
from xlutils.copy import copy

'''
从excel中读取要查找的商品名称，第一列默认为0
'''
def Read_Good_Name(xpath,col_index=None,sheet_index=0):
    """Read the product names to search for from one column of an .xls sheet.

    Row 0 is skipped (presumably a header row -- confirm against the
    workbook); every later row contributes one entry.

    Args:
        xpath: Path of the workbook, passed to ``xlrd.open_workbook``.
        col_index: Zero-based column to read. ``None`` (the default) means
            the first column, i.e. column 0.
        sheet_index: Zero-based index of the sheet to read. Defaults to the
            first sheet.

    Returns:
        A list with one value per data row. Values whose type is named
        ``unicode`` are encoded to UTF-8 byte strings, floats are truncated
        to an integer and rendered as a string, anything else is returned
        as xlrd produced it.
    """
    # int(None) raises TypeError, so the documented "first column" default
    # has to be resolved explicitly.
    col_index = 0 if col_index is None else int(col_index)
    # Open the .xls workbook, keeping the formatting information it carries.
    rb = xlrd.open_workbook(xpath,formatting_info=True)
    r_sheet = rb.sheet_by_index(int(sheet_index))

    names = []
    for row in range(1,r_sheet.nrows):
        cvalue = r_sheet.cell(row,col_index).value
        type_name = type(cvalue).__name__
        if type_name == 'unicode':
            cvalue = cvalue.encode('utf-8')
        elif type_name == 'float':
            # Numeric cells come back as floats; keep only the integer part.
            cvalue = str(int(cvalue))
        names.append(cvalue)
    return names

查找商品并保存图片（同时解决了中文字符乱码问题）：

#coding:utf-8
import requests
import os
from bs4 import BeautifulSoup
import uuid
import urllib
import excel_md
import sys
# Python 2-only workaround: reload the sys module and then switch the
# interpreter-wide default string encoding to UTF-8, so implicit conversions
# between byte strings and unicode strings of Chinese text use UTF-8.
# NOTE(review): `reload` is not a builtin on Python 3, so this script as
# written targets Python 2 -- confirm before porting.
reload(sys)
sys.setdefaultencoding('utf-8')
'''
按商品名称创建文件夹
'''
def mkdir(key):
    """Create (if needed) and return the picture folder for one product.

    The folder is ``<current working directory>/picture/<key>``. Missing
    parent directories (including ``picture`` itself) are created as well.

    Args:
        key: Product name; converted with ``str()`` before use.

    Returns:
        The path of the product folder.
    """
    folder = str(key)
    if not isinstance(folder, type(u'')):
        # Python 2 byte string: re-encode the name as GBK, as the original
        # code did. NOTE(review): GBK presumably matches the code page of
        # the Windows machine this was written for -- confirm.
        folder = folder.encode("gbk")
    # os.getcwd() is the current working directory, which is not
    # necessarily the directory that contains this script.
    new_path = os.path.join(os.getcwd(),'picture',folder)
    if not os.path.isdir(new_path):
        # makedirs also creates the intermediate 'picture' directory.
        os.makedirs(new_path)
    return new_path
'''
查找图片url，并保存到txt文件
'''

def search(key):
    """Search Tmall supermarket for ``key`` and save the image URLs found.

    The first result page is fetched and parsed; for every
    ``<div class="product-img">`` the ``data-ks-lazyload`` attribute of its
    ``<img>`` is turned into a URL. The URLs are written, one per line, to
    ``list_<key>_url.txt`` in the current working directory.

    Args:
        key: Product name used as the search keyword and in the file name.
    """
    # Search address
    search_url = 'https://list.tmall.com//search_product.htm?q=%s&user_id=725677994&type=p&cat=50514008&spm=1.1.a2227oh.d100&from=chaoshi..pc_1_searchbutton'%(key)

    session = requests.Session()
    req = session.get(search_url)
    # Parse the response body with BeautifulSoup
    soup = BeautifulSoup(req.text, "html.parser")
    good_pic = soup.find_all('div',attrs={'class':'product-img'})

    links = []
    for container in good_pic:
        img = container.find('img')
        if img is None:
            # No <img> inside this container: nothing to download.
            continue
        lazy_src = img.get('data-ks-lazyload')
        if not lazy_src:
            # Without this guard a missing attribute would be stringified
            # to "None" and end up in the list as a bare "http:".
            continue
        # Drop the last 12 characters of the attribute value.
        # NOTE(review): presumably a thumbnail-size suffix -- confirm.
        links.append("http:"+str(lazy_src)[:-12])

    # Byte-string keys (Python 2) are decoded so the file name is text.
    name = key.decode("utf-8") if isinstance(key, bytes) else key
    # 'with' closes the file even when a write fails.
    with open('list_%s_url.txt'%name,'w') as f1:
        for link in links:
            f1.write(link)
            f1.write("\n")


#生成一个文件名字符串
def generateFileName():
    """Return a freshly generated ``uuid1`` value rendered as a string."""
    unique_id = uuid.uuid1()
    return "%s" % unique_id

#根据文件名创建文件
def createFileWithFileName(localPathParam,fileName):
    """Make sure ``fileName`` exists inside ``localPathParam`` and return its path.

    An empty file is created when none exists yet; an existing file is left
    untouched.

    Args:
        localPathParam: Directory that should contain the file.
        fileName: Name of the file inside that directory.

    Returns:
        The full path of the file, whether or not it had to be created.
    """
    totalPath = os.path.join(localPathParam,fileName)
    if not os.path.exists(totalPath):
        # Append mode creates the file without truncating anything.
        with open(totalPath,'a+'):
            pass
    # Returned unconditionally: the original returned None for an existing
    # file, which left callers without a target path.
    return totalPath

#根据图片的地址，下载图片并保存在本地
def getAndSaveImg(imgUrl,path):
    """Download the image at ``imgUrl`` into directory ``path``.

    The image is stored under a newly generated ``<uuid>.jpg`` name. Empty
    or whitespace-only URLs are ignored.

    Args:
        imgUrl: Address of the image; surrounding whitespace is removed.
        path: Directory in which the image file is created.
    """
    # Lines read back from the URL list file still carry their trailing
    # newline, so strip before testing for emptiness and before downloading.
    imgUrl = str(imgUrl).strip()
    if not imgUrl:
        return
    fileName = generateFileName()+'.jpg'
    urllib.urlretrieve(imgUrl,createFileWithFileName(path,fileName))

if __name__ == "__main__":  # script entry point
    # Read the product names from the first column of the workbook.
    # Read_Good_Name is defined with (xpath, col_index), so it is called
    # with exactly those two arguments.
    Good_list = excel_md.Read_Good_Name("test.xls",0)
    for key in Good_list:
        # Look the product up; this writes its image URLs to a text file.
        search(key)
        # The target folder is the same for every image of this product,
        # so resolve it once instead of once per URL.
        save_dir = mkdir(key)
        # 'with' closes the URL list even if a download fails.
        with open("list_%s_url.txt"%(key).decode("utf-8"),'r') as file_object:
            for line in file_object:
                # Download and store the image
                getAndSaveImg(line,save_dir)