【爬虫】shopXO 商城

最新推荐文章于 2023-07-04 11:00:02 发布

StudiousTiger

最新推荐文章于 2023-07-04 11:00:02 发布

阅读量904

点赞数 3

分类专栏：Tiger の爬虫 ｜ 文章标签：python、爬虫

本文链接：https://blog.csdn.net/m0_45067620/article/details/107841874

版权

Tiger の爬虫专栏收录该内容

4 篇文章 0 订阅

订阅专栏

爬取 shopXO 商城中数码产品类商品的“产品名称”、“价格”、“浏览数”、“销量”、“库存”五项信息。

# -*- coding: utf-8 -*-
#@Time:2020/7/18 17:11
#@Author:huxuehao
#@File:
#@Software:PyCharm
#@Theme:

import re   #正则表达式
import urllib.request,urllib.error   #指定url,获取网页数据
# import urllib
import xlwt     #进行Excel操作
import csv      #进行csv操作
from bs4 import BeautifulSoup   #网页解析

# Base URL of a goods detail page; the crawler appends "<number>.html" to it
# to build the address of each page it requests.
src="https://guest.shopxo.net/goods-"
# Alternative, non-existent address (presumably used to exercise the
# "cannot access" error path — confirm before re-enabling):
# src="www.huxuehaoshizuishuaide.hahahahahaha.com"
# Fetch a page and return its parsed source
def gethtml(url, timeout=30):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the crawl forever.

    Returns:
        A BeautifulSoup object built with the "html.parser" backend.

    Raises:
        urllib.error.URLError: If the page cannot be reached (this includes
            HTTP error statuses, raised as urllib.error.HTTPError).
        OSError: On lower-level socket failures such as a read timeout.
    """
    # Send a browser-like User-Agent header instead of urllib's default one.
    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 83.0.4103.116Safari / 537.36"
    }
    req = urllib.request.Request(url, headers=head)
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        # Bytes that are not valid UTF-8 are dropped rather than raising.
        html = response.read().decode("utf-8", "ignore")
    return BeautifulSoup(html, "html.parser")

# Regular expressions used to pull the individual fields out of the HTML of
# one goods block. re.DOTALL lets "." match line breaks as well, so a value
# that is wrapped across several lines in the page source is still captured.

# Product title: everything between the <h1> opening tag and the next "<!".
findName = re.compile(
    r'<h1 class="detail-title am-margin-bottom-xs">(.*?)<!',
    re.DOTALL,
)

# Price: captures (data-original-price attribute, text inside the <b> tag).
findMoney = re.compile(
    r'<b class="goods-price" data-original-price="(.*?)">(.*?)</b>',
    re.DOTALL,
)

# Counter values shown in "tm-count" spans.
findXiangQing = re.compile(
    r'</span><span class="tm-count">(.*?)</span></div>',
    re.DOTALL,
)

# Stock: value of the data-original-stock attribute.
findKuCun = re.compile(
    r'<span class="tb-hidden stock-tips">库存'
    r'<span class="stock" data-max-limit="0" data-min-limit="1" '
    r'data-original-stock="(.*?)" data',
    re.DOTALL,
)

# Text of the "cannot buy" notice paragraph.
findNokucun = re.compile(
    r'<p class="goods-not-buy-tips">(.*?)</p>',
    re.DOTALL,
)


# Rows collected by get_target_data(); each row is a list of five strings:
# [name, price, views, sales, stock].
datalist=[]

# Placeholder stored when a field cannot be found in the page.
_NO_INFO = "暂无信息"


def _strip_ws(text):
    """Return *text* with every newline and space character removed."""
    return text.replace("\n", "").replace(" ", "")


def _parse_goods(item):
    """Build one [name, price, views, sales, stock] row from a goods block.

    Args:
        item: HTML source of one goods block, as a string.

    Returns:
        A list of exactly five strings; fields that cannot be located are
        filled with the "no information" placeholder.
    """
    row = []

    name = re.findall(findName, item)
    row.append(_strip_ws(name[0]) if name else _NO_INFO)

    # Each price match is a (data-original-price, displayed text) tuple;
    # the attribute value is the one that gets stored.
    money = re.findall(findMoney, item)
    row.append(_strip_ws(money[0][0]) if money else _NO_INFO)

    # At least three counters are required before any are trusted; the
    # second one is stored first and the first one second, following the
    # views/sales column order.
    counts = re.findall(findXiangQing, item)
    if len(counts) < 3:
        row.extend([_NO_INFO, _NO_INFO])
    else:
        row.append(_strip_ws(counts[1]))
        row.append(_strip_ws(counts[0]))

    # A single "cannot buy" notice means the item is sold out: stock is 0.
    sold_out = re.findall(findNokucun, item)
    if len(sold_out) == 1:
        print(sold_out[0])
        row.append("0")
    else:
        stock = re.findall(findKuCun, item)
        # Always append a stock value so every row keeps five columns.
        row.append(_strip_ws(stock[0]) if stock else _NO_INFO)
    return row


def get_target_data():
    """Crawl goods pages 1 through 12 and append one row per item to datalist.

    Pages that cannot be fetched, or that report the resource as missing,
    are skipped with a message. After the crawl the number of collected rows
    and every row are printed.
    """
    for i in range(0, 12):  # page numbers 1..12
        urls = src + str(i + 1) + ".html"
        try:
            # Best effort: any failure to fetch a page only skips that page.
            soup = gethtml(urls)
        except Exception:
            print(urls, "不能访问！")
            continue
        # The site answers with this notice when the goods id does not exist.
        if "资源不存在或已被删除" in soup.text:
            print("第%d页的资源不存在或已被删除..." % (i + 1))
            continue
        for item in soup.find_all('div', class_="clearfix-right"):
            datalist.append(_parse_goods(str(item)))
        print("第%d页的资源爬取完毕..." % (i + 1))
    print("一共爬取了%d条有效数据" % len(datalist))
    # Show the 11th row as a sample only when that many rows exist;
    # indexing unconditionally raised IndexError on short crawls.
    if len(datalist) > 10:
        print(datalist[10])
    for i in datalist:
        print(i)
# Write the rows in list to a text file
def saveTxt(list):
    """Write every row of *list* to ``shopXO.txt``, one row per line.

    The fields of a row are converted with ``str`` and separated by commas,
    with no trailing comma at the end of the line. The file is created in
    the current working directory (overwriting any existing one) and is
    encoded as UTF-8.

    Args:
        list: Iterable of rows, each row an iterable of field values.
    """
    print("爬取的数据将保存在txt文件中..")
    # "with" guarantees the file is flushed and closed on every path.
    with open("shopXO.txt", "w", encoding="utf-8") as f:
        for row in list:
            # join puts the separator between fields only, never after
            # the last one.
            f.write(",".join(str(field) for field in row))
            f.write("\n")

# Dump the rows in list into an Excel workbook
def saveExcel(list):
    """Store every row of *list* in sheet 'shopXO' of ``shopXO.xls``.

    No header is written: the first sheet row is left empty and the data
    rows start on the second sheet row. The workbook is saved in the
    current working directory.

    Args:
        list: Sequence of rows, each row a sequence of cell values.
    """
    print("爬取的数据将保存在Excel文件中..")
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('shopXO', cell_overwrite_ok=True)
    # Sheet row 0 stays blank, so the first record lands on sheet row 1.
    for row_no, record in enumerate(list, start=1):
        for col_no, value in enumerate(record):
            worksheet.write(row_no, col_no, value)
    workbook.save("shopXO.xls")

# Export the rows in list to a CSV file
def saveCsv(list):
    """Write a header line followed by every row of *list* to ``shopXO.csv``.

    The file is created in the current working directory, encoded as UTF-8,
    with "," as the column delimiter.

    Args:
        list: Iterable of rows, each row an iterable of field values.
    """
    print("爬取的数据将保存在csv文件中..")
    header = ['产品名称','价格','浏览数','销量','库存']
    # newline='' leaves line-ending handling to the csv module.
    with open('shopXO.csv', 'w', encoding='utf-8', newline='') as csvfile:
        out = csv.writer(csvfile, delimiter=',')
        out.writerow(header)
        out.writerows(list)
# Script entry point: run the crawl, then print the completion message.
if __name__ == '__main__':
    get_target_data()
    # NOTE(review): all three save calls below are commented out, so nothing
    # is written to disk even though the final message reports the data as
    # saved. Enable the desired writer(s) to actually persist datalist.
    # saveCsv(datalist)
    # saveTxt(datalist)
    # saveExcel(datalist)
    print("爬取的数据已保存!")

StudiousTiger

关注

3
点赞
踩
5

收藏

觉得还不错? 一键收藏
0
评论
【爬虫】shopXO 商城

获取到shopXO商城中数码产品中商品的 “产品名称”,“价格”,“浏览数”,“销量”,“库存”#-*- coding = utf-8 -*-#@Time:2020/7/18 17:11#@Author:huxuehao#@File:#@Software:PyCharm#@Theme:import re #正则表达式import urllib.request,urllib.error #指定url,获取网页数据# import urllibimport xlwt #进行
复制链接

扫一扫

专栏目录