Python3实例：爬取淘宝商品列表

最新推荐文章于 2023-10-16 11:04:58 发布

A programming ape

最新推荐文章于 2023-10-16 11:04:58 发布

阅读量1k

点赞数

文章标签： Python

这个实例是从淘宝爬数据，原文是：http://www.cnblogs.com/nima/p/5324490.html
因为我比较关心的是网络这一块，所以对文章做了很多删改。侧重在理解request、cookie两个模块
至于如何把数据保存到excel，怎么排版，这些是完全没有意义的，不是正式生产环境，做得多么漂亮都没意义。

这次用了很多新的模块或概念：
图像相关的库Pillow，
下载地址点击打开链接
原文是把图片链接写到excel里面的，我改了之后就不再写入图片本身，只是把图片下载到本地。

Mozilla：
经常看到这个单词。原来是指Mozilla基金会，一个为支持和领导开源的Mozilla项目而设立的非营利组织。

cx_Freeze：打包工具，将python程序打包成exe文件。
下载地址：点击打开链接
打包的稍后再研究。

cookie的内容通过cookiejar模块来操作，代码里面使用的是子类MozillaCookieJar。
cookie详细讲解看这里点击打开链接
这个类跟父类只是保存和加载文件的方式有所不同。
看一下保存的内容，用\t分割。为方便查看，用""包着了。
cookie.domain："s.m.taobao.com"，访问的域名
initial_dot："False"，域名是否以“.”开始的，要做一些特殊处理。
cookie.path："/"，cookie生效的URL路径（不是本地文件目录）
secure："False"，是否只在HTTPS等安全连接下才发送该cookie
expires：""，过期时间（Unix时间戳）；为空表示这是会话cookie，没有设置过期时间
name："JSESSIONID"
value："770326E8F4997185C7DB2714D7569FF1"

request：
这不是新的模块，但是趁着这次详细了解了一番。

点击打开链接

下面说说具体的代码

1.这里去的是手机淘宝的页面，获得的信息居然是json来的。

2.代码的核心就是从网上下载网页。因为淘宝可能会有反爬虫措施，所以使用cookie、构造请求头（header）是很有必要的。尽量把自己伪装成一个浏览器。

3.把内容写到excel中。怎么写不重要了。反正没什么好看的。一个练习的例子，往往都是下载数据而已。

但是从数据到信息还差一步，这是一个商业秘密了。所以数据获得再多，不能转化为信息，最终还只是练习而已。

好吧，还是看看代码吧。

 
   [python]  
   view plaincopy
# -*- coding:utf-8 -*-  
import urllib.request, urllib.parse, http.cookiejar  
import os, time,re  
from PIL import Image  
import json  
from openpyxl import Workbook  
   
# List the files directly inside a folder whose names end with a given suffix.
def listfiles(rootdir, prefix='.xml'):
    """Return paths of files directly inside ``rootdir`` ending with ``prefix``.

    Only the top level of ``rootdir`` is inspected; sub-directories are not
    searched.  Despite its name, ``prefix`` is compared against the *end* of
    each file name, i.e. it is a suffix such as ``'.json'``.

    Args:
        rootdir: Directory to scan.
        prefix: File-name suffix to keep (default ``'.xml'``).

    Returns:
        A list of ``rootdir + '/' + filename`` strings.  The list is empty
        when nothing matches or when ``rootdir`` cannot be walked (for
        example because it does not exist), so callers can always iterate
        over the result.
    """
    # os.walk yields the top directory first; that single entry is all we
    # need.  For a missing/unreadable directory it yields nothing at all.
    top = next(os.walk(rootdir), None)
    if top is None:
        return []
    _, _, filenames = top
    return [rootdir + '/' + name for name in filenames if name.endswith(prefix)]
  
def writeexcel(path,dealcontent):
    """Write a table to a new .xlsx workbook and save it at ``path``.

    ``dealcontent`` is a list of rows, each row a list of cell values; row 0
    is treated as the header.  Cells are written to a sheet named ``'1'``
    inserted at index 0 of a fresh workbook.

    * For every cell except the last column of a data row, falsy values are
      skipped and spaces are stripped from string values.  Non-string values
      (e.g. numbers decoded from JSON) are written unchanged instead of
      raising ``AttributeError`` on ``.replace``.
    * The last column of each data row is written verbatim when it is not
      the empty string; a failure there is reported and the cell is skipped
      so one bad value does not abort the whole export.

    Args:
        path: Destination file name of the workbook.
        dealcontent: Rows to write, header row first.
    """
    workbook = Workbook()
    # openpyxl rows/columns are 1-based, hence the +1 offsets below.
    worksheet = workbook.create_sheet('1', 0)
    for i, row in enumerate(dealcontent):
        lastColumn = len(row) - 1
        for j, value in enumerate(row):
            if i != 0 and j == lastColumn:
                if value != '':
                    try:
                        worksheet.cell(row=i + 1, column=j + 1).value = value
                    except Exception as e:
                        # Best effort: keep going, but say what was dropped.
                        print('写入单元格失败：', i + 1, j + 1, e)
            elif value:
                if isinstance(value, str):
                    value = value.replace(' ', '')
                worksheet.cell(row=i + 1, column=j + 1).value = value
    workbook.save(path)
      
# Core of the script: download a URL with persistent cookies and browser-like headers.
def getHtml(url, myProxy='', postdata=None):
    """Fetch ``url`` and return the raw response body as bytes.

    Cookies are loaded from and saved back to ``cookie.txt`` in the current
    working directory (Mozilla/Netscape cookie-file format), so they persist
    between calls.  The opener built here is also installed as the global
    ``urllib.request`` opener.

    Args:
        url: Address to request.
        myProxy: Optional ``host:port`` of an HTTP proxy; empty for a direct
            connection.
        postdata: Optional mapping of form fields.  When non-empty it is
            URL-encoded and sent as the request body (POST); otherwise a
            plain GET is issued.

    Returns:
        The response body (``bytes``).

    Raises:
        urllib.error.URLError: (including ``HTTPError``) when the request
            fails; the cookie file is not rewritten in that case.
    """
    # Path of the cookie file.
    filename = 'cookie.txt'

    # Cookie jar bound to that file.
    cj = http.cookiejar.MozillaCookieJar(filename)

    # Reuse cookies from a previous run if the file exists.
    # ignore_discard=True also loads session cookies, ignore_expires=True
    # also loads cookies that have already expired.
    if os.path.exists(filename):
        cj.load(filename, ignore_discard=True, ignore_expires=True)

    # Handler that attaches/stores cookies on every request.
    cookieHandler = urllib.request.HTTPCookieProcessor(cj)
    if myProxy:
        # Route plain-http traffic through the given proxy.
        proxyHandler = urllib.request.ProxyHandler({'http':'http://'+myProxy})
        print('代理:'+myProxy+'启动')
        opener = urllib.request.build_opener(proxyHandler, cookieHandler)
    else:
        opener = urllib.request.build_opener(cookieHandler)

    # Default headers that make the request look like a mobile browser.
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'),
                         ('Referer',
                          'http://s.m.taobao.com'),
                         ('Host', 'h5.m.taobao.com')]

    # Install globally so later urllib.request.urlopen calls use this opener.
    urllib.request.install_opener(opener)

    # A non-empty form is URL-encoded and sent as the POST body.
    data = urllib.parse.urlencode(postdata).encode() if postdata else None

    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url, data) as response:
        html_bytes = response.read()

    # Persist cookies (including session/expired ones) for the next call.
    cj.save(ignore_discard=True, ignore_expires=True)
    return html_bytes
  
# Strip characters that are not allowed in Windows file names.
def validateTitle(title):
    """Return ``title`` with the characters / \\ : * ? " < > | removed."""
    illegalChars = re.compile(r"[\/\\\:\*\?\"\<\>\|]")
    return illegalChars.sub("", title)
  
# Create a folder, including any missing parent folders.
def makeFolder(path):
    """Create directory ``path`` (and missing parents) without raising.

    Prints a notice when the directory already exists.  Any other failure
    (permissions, a regular file in the way, ...) is reported with its real
    cause instead of being mislabelled as "already exists".

    Args:
        path: Directory to create.
    """
    try:
        os.makedirs(path)
    except FileExistsError:
        print('目录已经存在：'+path)
    except OSError as e:
        # Stay best-effort like before, but show what actually went wrong.
        print('无法创建目录：'+path, e)
   
# Print a short diagnosis for an exception raised while downloading.
def _printFetchError(e):
    """Report a download failure: HTTP status, unreachable host, or the raw error."""
    if hasattr(e, 'code'):
        print('页面不存在或时间太长.')
        print('Error code:', e.code)
    elif hasattr(e, 'reason'):
        print("无法到达主机.")
        print('Reason:  ', e.reason)
    else:
        print(e)


if __name__ == '__main__':
    # Script flow:
    #   1. POST the search form once per page and store each raw response
    #      as <dataDir>/<page>.json.
    #   2. Parse every stored .json file, collect one row per listed item and
    #      optionally download the item picture plus a 1/6-size thumbnail.
    #   3. Write all rows to an .xlsx workbook.

    # Output directories.
    dataDir = './data'
    imageDir = './image'
    makeFolder(dataDir)
    # Search parameters.
    keyword = r'卡包'
    orderType = 1  # 1: sales first, 2: price low to high, 3: price high to low, 4: seller credit, 5: default ranking
    pageNum = 10  # number of result pages to fetch
    waitSeconds = 4  # pause after each page request
    isGetImage = 1  # 1: download pictures, 2: do not download
    # Search form fields.
    postdata = {
        'event_submit_do_new_search_auction': 1,
        'search': '提交查询',
        '_input_charset': 'utf-8',
        'topSearch': 1,
        'atype': 'b',
        'searchfrom': 1,
        'action': 'home:redirect_app_action',
        'from': 1,
        'q': keyword,
        'sst': 1,
        'n': 20,
        'buying': 'buyitnow',
        'm': 'api4h5',
        'abtest': 16,
        'wlsort': 16,
        'style': 'list',
        'closeModues': 'nav,selecthot,onesearch',
    }
    # orderType -> value of the 'sort' field.  Type 3 used to be unreachable
    # because its branch tested "== 2" a second time.  Type 5 sends no
    # 'sort' field at all.
    sortKeys = {1: '_sale', 2: 'bid', 3: '_bid', 4: '_ratesum'}
    if orderType in sortKeys:
        postdata['sort'] = sortKeys[orderType]

    # Step 1: fetch every page.
    taobaoUrl = "http://s.m.taobao.com/search?"
    for page in range(0, pageNum):
        postdata['page'] = page
        try:
            content1 = getHtml(taobaoUrl, '', postdata)
            # The mobile site answers with JSON; keep the raw bytes.
            with open(dataDir + '/' + str(page) + '.json', 'wb') as file:
                file.write(content1)
        except Exception as e:
            _printFetchError(e)
        # Announce the pause before sleeping rather than after it.
        print('暂停'+str(waitSeconds)+'秒')
        time.sleep(waitSeconds)

    # Step 2: turn the stored responses into table rows.
    files = listfiles(dataDir, '.json')
    total = [['页数', '店名', '商品标题', '商品打折价', '发货地址', '评论数', '原价', '售出件数', '政策享受', '付款人数', '金币折扣','URL地址','图像URL','图像'],]
    for filename in files:
        try:
            with open(filename, 'rb') as doc:
                doccontent = doc.read().decode('utf-8', 'ignore')
            # All spaces and newlines are removed before parsing.
            product = doccontent.replace(' ', '').replace('\n', '')
            product = json.loads(product)
            onefile = product['listItem']
        except (OSError, ValueError, KeyError, TypeError):
            # Unreadable file, invalid JSON, or no 'listItem' entry.
            print('抓不到' + filename)
            continue
        for item in onefile:
            itemlist = [filename, item['nick'], item['title'], item['price'], item['location'], item['commentCount']]
            itemlist.append(item['originalPrice'])
            itemlist.append(item['sold'])
            itemlist.append(item['zkType'])
            itemlist.append(item['act'])
            itemlist.append(item['coinLimit'])
            itemlist.append('http:'+item['url'])
            # Ask for the 720x720 picture instead of the 60x60 one.
            picpath = item['pic_path'].replace('60x60', '720x720')
            itemlist.append(picpath)
            if isGetImage == 1:
                if not os.path.exists(imageDir):
                    makeFolder(imageDir)
                # Percent-encode the URL but keep the ':' of the scheme.
                url = urllib.parse.quote(picpath).replace('%3A', ':')
                urllib.request.urlcleanup()
                try:
                    with urllib.request.urlopen(url) as pic:
                        picdata = pic.read()
                    picno = time.strftime('%H%M%S', time.localtime())
                    filenamep = imageDir+'/'+picno+validateTitle(item['nick']+'-'+item['title'])
                    filenamepp = filenamep+'.jpeg'
                    sfilename = filenamep+'s.jpeg'
                    # Save the downloaded picture.
                    with open(filenamepp, 'wb') as filess:
                        filess.write(picdata)
                    # Re-open it as an image and save a thumbnail at 1/6 size.
                    with Image.open(filenamepp) as img:
                        w, h = img.size
                        size = w/6, h/6
                        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS
                        # is the same filter under its current name.
                        img.thumbnail(size, Image.LANCZOS)
                        img.save(sfilename, 'jpeg')
                    itemlist.append(sfilename)
                    print('抓到图片：'+sfilename)
                except Exception as e:
                    _printFetchError(e)
                    itemlist.append('')
            else:
                itemlist.append('')
            total.append(itemlist)

    # Step 3: export, unless only the header row is present.
    if len(total) > 1:
        writeexcel( keyword + '淘宝手机商品.xlsx', total)
    else:
        print('什么都抓不到')

A programming ape

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
Python3实例：爬取淘宝商品列表

这个实例是从淘宝爬数据，原文是：http://www.cnblogs.com/nima/p/5324490.html因为我比较关心的是网络这一块，所以对文章做了很多删改。侧重在理解request、cookie两个模块至于如何把数据保存到excel，怎么排版，这些是完全没有意义的，不是正式生产环境，做得多么漂亮都没意义。这次用了很多新的模块或概念：图像相关的库Pillow，下载地址点击打开链接原文的...
复制链接

扫一扫