1. Get the shop's product list
import os
import re
import time
import random
import logging
import pathlib
import requests
from lxml import etree
from pymongo import MongoClient
# Manually set the shop ID to crawl and supply a valid login cookie
shopId = 9
cookie= 'cna=84DqFV4SPyYCATo9kTLfT6u+; t=aa42477f58c7f2322f00dfb5a1eb3ecc; _tb_token_=7e51fd1e5e1e7; cookie2=156089853e8f3a6eeb0f7920d1963fc3; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; tk_trace=1; dnk=pengjun%5Cu674E; tracknick=pengjun%5Cu674E; lid=pengjun%E6%9D%8E; lgc=pengjun%5Cu674E; enc=HPXzwVBtnTh2ZKD7IdgorhLo07qNH2rA9jqbXScJDYdMLIFeET66f7y07GgZfiMfpKBC%2BItvWd2MLhSwCstmeA%3D%3D; whl=-1%260%260%260; cq=ccp%3D1; pnm_cku822=; uc1=cookie15=VT5L2FSpMGV7TQ%3D%3D&lng=zh_CN&tag=8&cookie14=UoTbnxr%2FZWCkMg%3D%3D&pas=0&cookie21=UtASsssme%2BBq&cookie16=UIHiLt3xCS3yM2h4eKHS9lpEOw%3D%3D&existShop=false; uc3=vt3=F8dByucubcTH2VejRjM%3D&nk2=E6EQ1CLKS%2FnL&lg2=UtASsssmOIJ0bQ%3D%3D&id2=VWeT3jqq6jDz; uc4=nk4=0%40EbhmhLlrKdq9uf0H4%2BXCrErUl%2Fs%3D&id4=0%40V8Zo2exYFQXrRTZfaqzLU%2FcZ98M%3D; csg=782dbae8; _m_h5_tk=0e62215aaf3f40d577a1722c0b5b4c70_1572351656396; _m_h5_tk_enc=b33dd8a3ba2bab1dc8b527d33a608da6; isg=BGBg3lxDKRtzXZXAygvAZXuKMW6eeRQcyOEA4NpxLHsO1QD_gnkUwzbjbT1w5fwL; l=dBjfAG8qqFtnS6_ABOCwourza77OSIRAguPzaNbMi_5pC6L_yz7OkNeBAFp6VjWfTFLB4-YhSFe9-etkmdlXPWYD090orxDc.'
# Configure the logging output format
logging.basicConfig(level=logging.INFO,
format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
                    datefmt='%Y-%m-%d %H:%M:%S'  # %H:%M:%S instead of %T, which Windows' strftime lacks
)
logger = logging.getLogger(__name__)
# Configuration for each shop that can be crawled
shop_list = {
1 :{
"shop_name": "purcotton",
"list_url": "https://purcotton.tmall.com/i/asynSearch.htm?callback=jsonp693&mid=w-14440378953-0&wid=14440378953&path=/search.htm&search=y&pageNo={}",
"referer": "https://purcotton.tmall.com/search.htm"
},
2 :{
"shop_name": "miansen",
"list_url": "https://miansen.tmall.com/i/asynSearch.htm?callback=jsonp363&mid=w-16800593356-0&wid=16800593356&path=/search.htm&search=y&pageNo={}",
"referer": "https://miansen.tmall.com/search.htm"
},
3 :{
"shop_name": "zichu",
"list_url": "https://zichu.tmall.com/i/asynSearch.htm?callback=jsonp363&mid=w-14977327192-0&wid=14977327192&path=/search.htm&search=y&pageNo={}",
"referer": "https://zichu.tmall.com/search.htm"
},
4 :{
"shop_name": "babycaremy",
"list_url": "https://babycaremy.tmall.com/i/asynSearch.htm?callback=jsonp125&mid=w-14913709402-0&wid=14913709402&path=/search.htm&search=y&pageNo={}",
"referer": "https://babycaremy.tmall.com/search.htm"
},
5 :{
"shop_name": "jianrou",
"list_url": "https://jianrou.tmall.com/i/asynSearch.htm?callback=jsonp125&mid=w-16603479881-0&wid=16603479881&path=/search.htm&search=y&pageNo={}",
"referer": "https://jianrou.tmall.com/search.htm"
},
6 :{
"shop_name": "qingshenghuorh",
"list_url": "https://qingshenghuorh.tmall.com/i/asynSearch.htm?callback=jsonp116&mid=w-14896201470-0&wid=14896201470&path=/search.htm&search=y&pageNo={}",
"referer": "https://qingshenghuorh.tmall.com/search.htm"
},
7 :{
"shop_name": "shiyuejiejing",
"list_url": "https://shiyuejiejing.tmall.com/i/asynSearch.htm?callback=jsonp116&mid=w-14873232671-0&wid=14873232671&path=/search.htm&search=y&pageNo={}",
"referer": "https://shiyuejiejing.tmall.com/search.htm"
},
8 :{
"shop_name": "keyoubi",
"list_url": "https://keyoubi.tmall.com/i/asynSearch.htm?_ksTS=1572255788071_428&callback=jsonp429&mid=w-14866275144-0&wid=14866275144&path=/category.htm&scene=taobao_shop&search=y&pageNo={}",
"referer": "https://keyoubi.tmall.com/search.htm"
},
9 :{
"shop_name": "goodbaby",
"list_url": "https://goodbaby.tmall.com/i/asynSearch.htm?_ksTS=1572256036653_454&callback=jsonp455&mid=w-14920527991-0&wid=14920527991&path=/search.htm&search=y&pageNo={}",
"referer": "https://goodbaby.tmall.com/search.htm"
}
}
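# A hedged note: the mid/wid values in each list_url above are widget IDs
# specific to the shop; they are assumed to be copied from the asynSearch
# request visible in the browser's network panel on that shop's search page.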
# Build the request headers
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Referer' : shop_list[shopId]['referer'],
'Cookie' : cookie,
'accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'accept-encoding' : 'gzip, deflate, br',
'accept-language' : 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7'
}
# Set up the local directory where crawled pages are cached
now = time.time()
date_str = time.strftime('%Y-%m-%d', time.localtime(now))
shop_dir = os.path.join(r"D:\scrapy\double11", date_str, shop_list[shopId]['shop_name'])
if not os.path.exists(shop_dir):
    os.makedirs(shop_dir)
# Regex that extracts the HTML payload from the JSONP response
search_pattern = re.compile(r'jsonp\d*\(\"(.*?)\"\)\s*$', re.S)
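# A hedged note on the response shape: the asynSearch endpoint is assumed to
# wrap escaped HTML in a JSONP callback, roughly
#   jsonp693("<div class=\"J_TItems\" ...> ... </div>")
# so group(1) below captures the escaped HTML between the quotes.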
# Connect to MongoDB
m = MongoClient(host="172.16.250.238", port=27017)
test_db = m["test"]
db = test_db['tmallGoodsEntity']
# First mark all of this shop's products as disabled; every product seen during
# the crawl is re-enabled below, so delisted items stay filtered out
db.update_many({"shopId":shopId},{'$set': {"enabled":False}})
goods_list = []  # collects the IDs of the crawled products
# Pagination state; cap at 15 requests per run to avoid triggering anti-scraping
currentPage = 1
hasNext = True
requests_count = 15
# Character set used by Tmall's responses and the local cache files
charSet = "gbk"
# Create a session so cookies and connections are reused across requests
s = requests.Session()
while hasNext and requests_count>0:
logger.info("开始爬取第 {} 页".format(currentPage))
pageFilePath = shop_dir + "\page_{}.txt".format(currentPage)
pagePath = pathlib.Path(pageFilePath)
if pagePath.exists():
logger.info("使用本地文件。。。")
with open(pageFilePath, 'r', encoding=charSet) as f:
html_str = f.read()
else:
logger.info("发送网络请求。。。")
url = shop_list[shopId]['list_url'].format(currentPage)
time.sleep(1+random.randint(0,1))
        try:
            requests_count = requests_count - 1
            response = s.get(url, headers=headers)
        except BaseException as e:
            logger.error(e)
            break
else:
ret_str = response.content.decode(charSet,"ignore")
print(ret_str)
            # Strip the escaped quotes that would otherwise break the HTML parsing
format_str = ret_str.replace('=\\\"', '=')
format_str = format_str.replace('\\\" ', ' ')
format_str = format_str.replace('\\\">', '>')
search_ret = search_pattern.search(format_str)
if search_ret:
html_str = search_ret.group(1)
with open(pageFilePath, 'w', encoding=charSet) as f:
f.write(html_str)
if html_str:
        # Build the element tree
html = etree.HTML(html_str)
        # Pretty-print the parsed HTML when debugging
        # print(etree.tostring(html, pretty_print=True).decode('utf-8'))
        # Check whether a next page exists; the link's title attribute is the
        # page's literal text "下一页" ("next page"), so it must stay in Chinese
next_page = html.xpath('//p[@class="ui-page-s"]/a[@title="下一页"]')
if len(next_page) == 0:
hasNext = False
        # Parse the items on the page; the position() filter presumably skips
        # the trailing rows (e.g. recommendation widgets) at the bottom of the listing
items = html.xpath('//div[@class="J_TItems"]/div[(@class="item5line1" or @class="item4line1") and position()<last()-2]/dl[@class="item"]')
for item in items:
sku_id = int(item.xpath('./@data-id')[0])
img_url = item.xpath('./dt[@class="photo"]/a[@class="J_TGoldData"]/img/@data-ks-lazyload')[0]
item_name = item.xpath('./dd[@class="detail"]/a[@class="item-name"]/text()')[0].strip()
attr_item = item.xpath('./dd[@class="detail"]/div[@class="attribute"]')[0]
sku_price_str = attr_item.xpath('./div[@class="cprice-area"]/span[@class="c-price"]/text()')[0].strip()
            sale_list = attr_item.xpath('./div[@class="sale-area"]/span[@class="sale-num"]/text()')
            sale_count_str = sale_list[0].strip() if len(sale_list) > 0 else "0"
            rate_list = item.xpath('./dd[@class="rates"]/div/h4/a/span/text()')
            # "评价: " is the literal "Reviews: " prefix scraped from the page
            rate_count_str = rate_list[0].strip().replace("评价: ", "") if len(rate_list) > 0 else "0"
ret = db.find_one({'_id': sku_id})
if ret is None:
o = {}
o['_id'] = sku_id
o['imgUrl'] = img_url
o['title'] = item_name
o['price'] = float(sku_price_str)
o['totalSaleCount'] = int(sale_count_str)
o['rateCount'] = int(rate_count_str)
o['enabled'] = True
o['updateTime'] = int(now)
                # ID of the shop this product belongs to
o['shopId'] = shopId
db.insert_one(o)
else:
o = {'$set': {}}
o['$set']['imgUrl'] = img_url
o['$set']['title'] = item_name
o['$set']['price'] = float(sku_price_str)
o['$set']['totalSaleCount'] = int(sale_count_str)
o['$set']['rateCount'] = int(rate_count_str)
o['$set']['enabled'] = True
o['$set']['updateTime'] = int(now)
db.update_one({'_id': sku_id}, o)
goods_list.append(sku_id)
currentPage=currentPage+1
print(goods_list)
# Sample document; note the field types
"""
{
"_id": 554949386593,
"imgUrl": "//img.alicdn.com/bao/uploaded/i3/430490406/O1CN01MfS3gC1ErzMAMswd5_!!0-item_pic.jpg_180x180.jpg",
"title": "全棉时代擦脸巾洗脸巾女一次性洁面巾棉柔纯棉抽取式实惠盒装纸巾",
"price": 89.9,
"totalSaleCount": 359972,
"rateCount": 68780,
"enabled": true,
"updateTime": 1571987992,
"shopId": 1
}
"""
2. Get each product's detail data
import os
import re
import json
import time
import random
import logging
import pathlib
import requests
from pymongo import MongoClient
# Manually set the shop ID to crawl and supply a valid login cookie
shopId = 9
cookie= 'cna=84DqFV4SPyYCATo9kTLfT6u+; t=aa42477f58c7f2322f00dfb5a1eb3ecc; _tb_token_=7e51fd1e5e1e7; cookie2=156089853e8f3a6eeb0f7920d1963fc3; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; swfstore=309224; x=__ll%3D-1%26_ato%3D0; tk_trace=1; dnk=pengjun%5Cu674E; lid=pengjun%E6%9D%8E; enc=HPXzwVBtnTh2ZKD7IdgorhLo07qNH2rA9jqbXScJDYdMLIFeET66f7y07GgZfiMfpKBC%2BItvWd2MLhSwCstmeA%3D%3D; whl=-1%260%260%260; uc4=id4=0%40V8Zo2exYFQXrRTZfaqzNdlrgSjA%3D&nk4=0%40EbhmhLlrKdq9uf0H4%2BXEnmy5aLE%3D; csg=4f316563; cq=ccp%3D1; _m_h5_tk=4cf4dad64594442bb3992f537a85be38_1572430570733; _m_h5_tk_enc=1218f7723d8ca791cc4e6f3243b21d99; pnm_cku822=098%23E1hvmvvUvbpvUpCkvvvvvjiPRsqpAjnHRszy6jYHPmPUgjnhRLdZ0jY8n2F96jtbR4wCvvpvvUmmmphvLvo8Qpvj7uoQD70Oe169D7zhz8TxEcqhsj7QrEtApaLO%2BnezrmphQRAn3feAOHjhTWexRdIAcUmxfamK5eU%2Flw2hz2WKnpch6Ul1K3ktvpvIvvvvvhCvvvvvvvW9phvvQ9vvvACvpvQovvv2UhCv2jhvvvW9phvWwOyCvvOUvvVva68ivpvUvvmvnl1Bdo%2BCvpvVvmvvvhCv2QhvCvvvMMGtvpvhvvvvvv%3D%3D; l=dBjfAG8qqFtnSToyBOCNqQKXiCQOSIRAguSJGwSBi_5C868sG3QOkNtq1FJ6VjWftgLB4-YhSFe9-etkiepTY-fP97Rw_xDc.; isg=BOTkV8gOZe4hFZH87p88kefWteJ6bVjI5A0E_P4FcK9xqYRzJo3YdxoLaUEUcUA_'
# Configure the logging output format
logging.basicConfig(level=logging.INFO,
format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
                    datefmt='%Y-%m-%d %H:%M:%S'  # %H:%M:%S instead of %T, which Windows' strftime lacks
)
logger = logging.getLogger(__name__)
# Configuration for each shop that can be crawled
shop_list = {
1 :{ "shop_name": "purcotton" },
2 :{ "shop_name": "miansen" },
3 :{ "shop_name": "zichu" },
4 :{ "shop_name": "babycaremy" },
5 :{ "shop_name": "jianrou" },
6 :{ "shop_name": "qingshenghuorh" },
7: {"shop_name": "shiyuejiejing"},
8: {"shop_name": "keyoubi"},
9: {"shop_name": "goodbaby"}
}
# Build the request headers
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Cookie' : cookie,
'accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'accept-encoding' : 'gzip, deflate, br',
'accept-language' : 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7'
}
# Set up the local directory where detail responses are cached
now = time.time()
date_str = time.strftime('%Y-%m-%d', time.localtime(now))
shop_dir = os.path.join(r"D:\scrapy\double11", date_str, shop_list[shopId]['shop_name'])
if not os.path.exists(shop_dir):
    os.makedirs(shop_dir)
# Connect to MongoDB
m = MongoClient(host="172.16.250.238", port=27017)
test_db = m["test"]
db = test_db['tmallGoodsEntity']
# Fetch the shop's product list collected in step 1
goods_list = db.find({"shopId":shopId})
# Character set used by Tmall's responses and the local cache files
charSet = "gbk"
# Create a session so cookies and connections are reused across requests
s = requests.Session()
# Regexes: one extracts the JSON payload from the setMdskip callback, the other
# pulls the deposit discount out of the pre-sale marketing copy
search_pattern = re.compile(r'setMdskip\s\((.*?)\)$', re.S)
discount_pattern = re.compile(r'付定金立减(\d*)', re.S)
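# A hedged note on the response shape: the endpoint is assumed to reply with
#   setMdskip ({"defaultModel": {...}})
# and the deposit discount appears in page copy such as "付定金立减50"
# ("pay the deposit, save 50"), so discount_pattern keeps the Chinese literal.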
info_url = "https://mdskip.taobao.com/core/initItemDetail.htm?isUseInventoryCenter=false&cartEnable=false&service3C=false&isApparel=false&isSecKill=false&tmallBuySupport=true&isAreaSell=false&tryBeforeBuy=false&offlineShop=false&itemId={}&showShopProm=false&isPurchaseMallPage=false&isRegionLevel=false&household=false&sellerPreview=false&queryMemberRight=true&addressLevel=2&isForbidBuyItem=false&callback=setMdskip&timestamp={}"
referer_base= "https://detail.tmall.com/item.htm?id={}"
for goods in goods_list:
goods_id = goods["_id"]
logger.info("开始爬取商品: {}".format(goods_id))
itemFilePath = shop_dir + "\\info_{}.txt".format(goods_id)
pagePath = pathlib.Path(itemFilePath)
if pagePath.exists():
logger.info("使用本地文件。。。")
with open(itemFilePath, 'r', encoding=charSet) as f:
json_str = f.read()
else:
logger.info("发送网络请求。。。")
time.sleep(1+random.randint(0,3))
headers['Referer'] = referer_base.format(goods_id)
now = time.time()
url = info_url.format(goods_id, int(round(now * 1000)))
        try:
            response = s.get(url, headers=headers)
        except BaseException as e:
            logger.error(e)
            break
else:
ret_str = response.content.decode(charSet,"ignore")
print(ret_str)
search_ret = search_pattern.search(ret_str)
if search_ret:
json_str = search_ret.group(1)
with open(itemFilePath, 'w', encoding=charSet) as f:
f.write(json_str)
else:
break
if json_str:
        # Parse the payload into a JSON object
json_obj = json.loads(json_str)
        # Extract the fields of interest from the response
if 'sellCount' in json_obj['defaultModel']['sellCountDO']:
monthSaleCount = json_obj['defaultModel']['sellCountDO']['sellCount']
else:
monthSaleCount = '-'
totalQuantity = json_obj['defaultModel']['inventoryDO']['totalQuantity']
price_info= json_obj['defaultModel']['itemPriceResultDO']['priceInfo']
        if 'def' in price_info:
            def_info = price_info['def']
        else:
            # Fall back to an arbitrary SKU entry (the last key wins); start from
            # None so def_info is always bound even when price_info is empty
            def_info = None
            for key in price_info:
                def_info = price_info[key]
if def_info:
            # The item is on pre-sale when the wrtInfo block is present
if 'wrtInfo' in def_info:
wrt_info = def_info['wrtInfo']
                original_price = def_info['promotionList'][0]['price']
                pre_cash = wrt_info['price']     # deposit amount in cents (divided by 100 below)
                pre_count = wrt_info['groupUC']  # number of pre-sale orders placed
o = { '$set': {} }
preSaleDiscount = 0
if 'depositExpandTextBySku' in wrt_info:
search_discount = discount_pattern.search(wrt_info['depositExpandTextBySku'])
if search_discount:
preSaleDiscount = int(search_discount.group(1))
o['$set']['preSaleDiscount'] = preSaleDiscount
o['$set']['monthSaleCount'] = monthSaleCount
o['$set']['originalPrice'] = float(original_price)
o['$set']['preSaleCash'] = int(pre_cash)/100
o['$set']['preSaleCount'] = int(pre_count)
o['$set']['preSale'] = True
o['$set']['preSaleTotal'] = (goods["price"]-preSaleDiscount)*int(pre_count)
o['$set']['totalQuantity'] = int(totalQuantity)
o['$set']['updateTime'] = int(now)
db.update_one({'_id':int(goods_id)},o)
else:
original_price = def_info['price']
o = { '$set': {} }
o['$set']['monthSaleCount'] = monthSaleCount
o['$set']['originalPrice'] = float(original_price)
o['$set']['preSale'] = False
o['$set']['totalQuantity'] = int(totalQuantity)
o['$set']['updateTime'] = int(now)
db.update_one({'_id': int(goods_id)}, o)
else:
break
# Sample document; note the field types
"""
{
"_id": 20159694203,
"imgUrl": "//img.alicdn.com/bao/uploaded/i4/430490406/O1CN01BzX29B1ErzMxI791m_!!0-item_pic.jpg_180x180.jpg",
"title": "全棉时代 产妇一次性内裤女士纯棉孕妇产后月子待产用品旅行 25条",
"price": 96,
"totalSaleCount": 631868,
"rateCount": 40631,
"enabled": true,
"updateTime": 1571993200,
"shopId": 1,
"monthSaleCount": "2.5万+",
"originalPrice": 194,
"preSale": true,
"preSaleCash": 10,
"preSaleCount": 26135,
"preSaleTotal": 2508960,
"totalQuantity": 18423
}
"""
3. Update the shop statistics in the MySQL database
# encoding=utf-8
import logging
import mysql.connector  # install the mysql-connector-python package first
from pymongo import MongoClient
# Configure the logging output format
logging.basicConfig(level=logging.INFO,
format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
                    datefmt='%Y-%m-%d %H:%M:%S'  # %H:%M:%S instead of %T, which Windows' strftime lacks
)
logger = logging.getLogger(__name__)
# Manually set the shop ID to update
shopId = 9
# Connect to MongoDB
m = MongoClient(host="172.16.250.238", port=27017)
test_db = m["test"]
db = test_db['tmallGoodsEntity']
# Count the shop's products
goods_count = db.count_documents({"shopId":shopId})
logger.info("Total products in shop: " + str(goods_count))
# Count the shop's pre-sale products
presale_count = db.count_documents({"shopId":shopId,"preSale":True})
logger.info("Pre-sale products: " + str(presale_count))
presale_list = db.find({"shopId":shopId,"preSale":True})
# Sum the projected pre-sale order revenue
presale_bill_total = 0.0
for goods in presale_list:
    presale_bill_total = presale_bill_total + goods['preSaleTotal']
logger.info("Pre-sale revenue total: " + str(presale_bill_total))
# MySQL connection settings
config = {'host': '172.16.250.233',  # defaults to 127.0.0.1
          'user': 'root',
          'password': '123456',
          'port': 3306,              # 3306 is already the default
          'database': 'digitalx_winner',
          'charset': 'utf8'          # utf8 is already the default
          }
try:
    # connect() builds the connection from the config mapping
    cnn = mysql.connector.connect(**config)
except mysql.connector.Error as e:
    print('Failed to connect to the database!', str(e))
else:
    # runs only when the try block raised no exception
    print("Database connection established!")
    # Get a cursor
    cursor = cnn.cursor()
    # Run the update; presale_bill_total is stored in units of 10,000 yuan.
    # Note cursor.execute() returns None in mysql.connector, so its result is
    # not assigned; use cursor.rowcount for the number of affected rows.
    sql = 'update tbl_tmall_shop set goods_count = %s, presale_goods_count = %s, presale_bill_total = %s where id = %s'
    cursor.execute(sql, (goods_count, presale_count, int(presale_bill_total / 10000), shopId))
    # Commit the transaction
    cnn.commit()
    # Close the cursor
    cursor.close()
    # Close the connection
    cnn.close()
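The script assumes a tbl_tmall_shop table with the four columns referenced in
the UPDATE statement. A minimal sketch of a compatible schema (the column types
and this standalone setup code are assumptions, not taken from the source):
import mysql.connector

# Hypothetical schema matching the UPDATE above; column types are assumptions.
ddl = """
CREATE TABLE IF NOT EXISTS tbl_tmall_shop (
    id                  INT PRIMARY KEY,  -- matches shopId in MongoDB
    goods_count         INT,              -- total products in the shop
    presale_goods_count INT,              -- products currently flagged preSale
    presale_bill_total  INT               -- projected pre-sale revenue, in units of 10,000 yuan
)
"""
cnn = mysql.connector.connect(host='172.16.250.233', user='root',
                              password='123456', database='digitalx_winner')
cursor = cnn.cursor()
cursor.execute(ddl)
cnn.commit()
cursor.close()
cnn.close()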