随着反爬手段的推陈出新,使用爬虫框架被反爬的几率也越来越高了。于是我用最呆笨的方式试着爬取了一下,居然可以,在此做下记录。
# encoding=utf-8
import re
import os
import time
import json
import pathlib
import requests
import logging
import random
from pymongo import MongoClient
# Configure the log output style: timestamp, level, logger name, message and
# the file/line that emitted the record.
_LOG_FORMAT = '[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)'
_LOG_DATE_FORMAT = '%Y-%m-%d %T'
logging.basicConfig(
    format=_LOG_FORMAT,
    datefmt=_LOG_DATE_FORMAT,
    level=logging.INFO,
)
# Module-level logger used by the rest of the script.
logger = logging.getLogger(__name__)
# Change the three values below (itemId, base_url, cookie) before starting
# the crawler.
itemId = 582343703210
# Rating-list endpoint; the two placeholders are filled with the item id and
# the page number. The page parameter must be spelled "&currentPage": the
# previous text had "&curren" collapsed into the single character U+00A4,
# so the page number was never sent as a query parameter.
base_url = "https://rate.tmall.com/list_detail_rate.htm?itemId={}&spuId=1112124124&sellerId=430490406&order=3&currentPage={}"
# NOTE(review): this is a hard-coded login session cookie. Consider loading it
# from the environment or a local, untracked file instead of keeping it in
# source control.
cookie = 'cna=84DqFV4SPyYCATo9kTLfT6u+; t=aa42477f58c7f2322f00dfb5a1eb3ecc; _tb_token_=7e51fd1e5e1e7; cookie2=156089853e8f3a6eeb0f7920d1963fc3; _med=dw:1600&dh:900&pw:1600&ph:900&ist:0; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; swfstore=192694; tk_trace=1; _m_h5_tk=fefe9396dafd34344a5209f356fa5219_1568970404033; _m_h5_tk_enc=95eb1bc833877a790160e51ac2b95c60; enc=HPXzwVBtnTh2ZKD7IdgorhLo07qNH2rA9jqbXScJDYdMLIFeET66f7y07GgZfiMfpKBC%2BItvWd2MLhSwCstmeA%3D%3D; tt=login.tmall.com; res=scroll%3A990*6474-client%3A924*772-offset%3A924*6474-screen%3A1600*900; whl=-1%260%260%260; cq=ccp%3D1; pnm_cku822=098%23E1hv9vvUvbpvUvCkvvvvvjiPRFLy0jDCRscvzjthPmPwtjnmR2sw0jECnLdUzjYRiQhvCvvvpZptvpvhvvCvpvGCvvpvvPMMmphvLvm4vvvaaB4AdBDQbNLt%2Bu6XjC61D76Xj8TJEcqUz8gL%2BulAp57Q%2Bu6XeC69%2FX7rVBAKfvDrAEuK53rsxXxreEeKHkx%2FAWmK5dot2e%2FivpvUvvCCb2cDLs8EvpvVvpCmpYsZKphv8vvvphvvvvvvvvC2q9vv9OIvvhOVvvmCp9vvB9OvvUhKvvCVC9vv9ogCvpvVvUCvpvvv; l=cBjfAG8qqFtnScxbBOCwNQKXiCQT9IRAguSJGwSBi_5CB6LsfqQOk_LFbFp6cjWdtLLB4-YhSFe9-etki0ILNztqTtrR.; isg=BJSUQtNutcPQJiHsng_MgZdGZdKq_ej41L3UzC51G5-iGTRjVv6lZ_mbGVEk4fAv; dnk=pengjunlee1; uc1=tag=8&lng=zh_CN&cookie15=V32FPkk%2Fw0dUvg%3D%3D&cookie14=UoTaEC%2BQ14Cucg%3D%3D&cookie21=WqG3DMC9Eman&pas=0&existShop=false&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D; uc3=vt3=F8dByuK3QCZUfvQCEBM%3D&id2=UUphy%2FZ7Cdwys6i9Hw%3D%3D&nk2=E6EQ1CLKS1V0cko%3D&lg2=URm48syIIVrSKA%3D%3D; tracknick=pengjunlee1; lid=pengjunlee1; _l_g_=Ug%3D%3D; uc4=nk4=0%40EbhmhLlrKXZUZrdwvqX9l9vjDvaQPA%3D%3D&id4=0%40U2grEJGEPwwA5lpVHyA%2BGBdaEx29RDor; unb=2201414542619; lgc=pengjunlee1; cookie1=VvuEIF254OaIGYBx5CMIrukTXbnCoYs81mNl4ECxqes%3D; login=true; cookie17=UUphy%2FZ7Cdwys6i9Hw%3D%3D; _nk_=pengjunlee1; sg=196; csg=96ab7a84'
# The product detail page of the item is sent as the Referer header.
referer = 'https://detail.tmall.com/item.htm?id={}'.format(itemId)
# Headers sent with every rating-list request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    'Referer': referer,
    'Cookie': cookie,
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
}
# Create a dedicated folder per item to hold the crawled content.
fpath = 'D:\\scrapy\\new_tmall\\{}'.format(itemId)
# exist_ok=True replaces the separate "exists?" check, so there is no window
# between the check and the creation in which the call could fail because the
# directory appeared in the meantime.
os.makedirs(fpath, exist_ok=True)
# Captures the text between the parentheses of a ``jsonp<digits>( ... )``
# wrapper that ends the response; re.S lets the payload span several lines.
rate_pattern = re.compile(r'jsonp\d+\((.*?)\)$', flags=re.S)
# One HTTP session shared by all requests of this run.
session = requests.Session()
# Fetch at most 15 pages over the network per run, to avoid getting blocked.
requests_count = 15
# pageCount is 0 until a parsed page reports the last page number;
# crawling starts at page 1.
pageCount, currentPage = 0, 1
# Main crawl loop.
#
# For each page, starting at currentPage:
#   1. reuse the raw response cached on disk if it exists, otherwise wait a
#      randomised 5-10 seconds, request the page and cache the raw response;
#   2. extract the JSON payload from the jsonp wrapper;
#   3. insert every rating that is not yet in MongoDB (keyed by rating id);
#   4. on the first parsed page, read the total page count.
# The loop ends when the last page has been processed, when the per-run
# network request budget (requests_count) is used up, when a request fails,
# or when a response does not match the expected jsonp wrapper.

# Connect to MongoDB once for the whole run instead of once per page, and
# close the client when the loop ends, whatever the reason.
m = MongoClient(host="172.16.250.238", port=27017)
try:
    test_db = m["test"]
    db = test_db['tmallRateEntity']
    while (currentPage <= pageCount or pageCount == 0) and requests_count > 0:
        logger.info("开始爬取第 {} 页".format(currentPage))
        pagePath = pathlib.Path('D:\\scrapy\\new_tmall\\{}\\{}.txt'.format(itemId, currentPage))
        if pagePath.exists():
            logger.info("使用本地文件。。。")
            json_str = pagePath.read_text(encoding='utf-8')
            ret_str = rate_pattern.search(json_str)
        else:
            logger.info("发送网络请求。。。")
            url = base_url.format(itemId, currentPage)
            # Randomised pause before every network request.
            time.sleep(5 + random.randint(0, 5))
            # The attempt counts against the budget even if it fails.
            requests_count = requests_count - 1
            try:
                # A timeout keeps a stalled connection from hanging the run.
                response = session.get(url, headers=headers, timeout=30)
            except requests.RequestException:
                # Only network/HTTP errors are handled here; KeyboardInterrupt
                # and SystemExit propagate normally.
                logger.exception("request for page %s failed", currentPage)
                break
            # Decode once and reuse the text for both matching and caching.
            json_str = response.content.decode()
            ret_str = rate_pattern.search(json_str)
            if not ret_str:
                logger.warning("page %s: response does not match the jsonp pattern, stopping", currentPage)
                break
            # Cache only responses that matched the expected wrapper.
            pagePath.write_text(json_str, encoding='utf-8')
        if ret_str:
            json_obj = json.loads(ret_str.group(1))
            rate_list = json_obj['rateDetail']['rateList']
            # NOTE(review): this file is opened for appending but nothing is
            # ever written to it; kept as-is to preserve the existing side
            # effect (the file gets created). Confirm whether the ratings were
            # meant to be written here as well.
            with open('{}.txt'.format(itemId), 'a', encoding='utf-8') as f:
                for rate in rate_list:
                    o = {
                        'rateContent': rate['rateContent'],
                        'rateDate': rate['rateDate'],
                        '_id': rate['id'],
                        'goodsId': itemId,
                        'pics': rate['pics'],
                        'anony': rate['anony'],
                        'userVipLevel': rate['userVipLevel'],
                        'goldUser': rate['goldUser'],
                        'displayUserNick': rate['displayUserNick'],
                        'appendComment': rate['appendComment'],
                    }
                    # Skip ratings that were already stored by an earlier run.
                    ret = db.find_one({'_id': o['_id']})
                    if ret is None:
                        db.insert_one(o)
            if pageCount == 0:
                pageCount = json_obj['rateDetail']['paginator']['lastPage']
                logger.info("总共 {} 页".format(pageCount))
        currentPage = currentPage + 1
finally:
    m.close()