A Minimal Comment Crawler

As anti-scraping techniques keep evolving, scraping with a full-blown crawler framework is increasingly likely to get blocked, so I tried the most naive approach instead: plain HTTP requests in a loop. To my surprise it worked, so I'm recording it here.

# encoding=utf-8
import re
import os
import time
import json
import pathlib
import requests
import logging
import random
from pymongo import MongoClient


# Configure the log output format
logging.basicConfig(level=logging.INFO,
                    format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
                    datefmt='%Y-%m-%d %T'
                    )
logger = logging.getLogger(__name__)

# Update the three values below (itemId, base_url, cookie) before starting the crawler
itemId = 582343703210
base_url = "https://rate.tmall.com/list_detail_rate.htm?itemId={}&spuId=1112124124&sellerId=430490406&order=3&currentPage={}"
cookie= 'cna=84DqFV4SPyYCATo9kTLfT6u+; t=aa42477f58c7f2322f00dfb5a1eb3ecc; _tb_token_=7e51fd1e5e1e7; cookie2=156089853e8f3a6eeb0f7920d1963fc3; _med=dw:1600&dh:900&pw:1600&ph:900&ist:0; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; swfstore=192694; tk_trace=1; _m_h5_tk=fefe9396dafd34344a5209f356fa5219_1568970404033; _m_h5_tk_enc=95eb1bc833877a790160e51ac2b95c60; enc=HPXzwVBtnTh2ZKD7IdgorhLo07qNH2rA9jqbXScJDYdMLIFeET66f7y07GgZfiMfpKBC%2BItvWd2MLhSwCstmeA%3D%3D; tt=login.tmall.com; res=scroll%3A990*6474-client%3A924*772-offset%3A924*6474-screen%3A1600*900; whl=-1%260%260%260; cq=ccp%3D1; pnm_cku822=098%23E1hv9vvUvbpvUvCkvvvvvjiPRFLy0jDCRscvzjthPmPwtjnmR2sw0jECnLdUzjYRiQhvCvvvpZptvpvhvvCvpvGCvvpvvPMMmphvLvm4vvvaaB4AdBDQbNLt%2Bu6XjC61D76Xj8TJEcqUz8gL%2BulAp57Q%2Bu6XeC69%2FX7rVBAKfvDrAEuK53rsxXxreEeKHkx%2FAWmK5dot2e%2FivpvUvvCCb2cDLs8EvpvVvpCmpYsZKphv8vvvphvvvvvvvvC2q9vv9OIvvhOVvvmCp9vvB9OvvUhKvvCVC9vv9ogCvpvVvUCvpvvv; l=cBjfAG8qqFtnScxbBOCwNQKXiCQT9IRAguSJGwSBi_5CB6LsfqQOk_LFbFp6cjWdtLLB4-YhSFe9-etki0ILNztqTtrR.; isg=BJSUQtNutcPQJiHsng_MgZdGZdKq_ej41L3UzC51G5-iGTRjVv6lZ_mbGVEk4fAv; dnk=pengjunlee1; uc1=tag=8&lng=zh_CN&cookie15=V32FPkk%2Fw0dUvg%3D%3D&cookie14=UoTaEC%2BQ14Cucg%3D%3D&cookie21=WqG3DMC9Eman&pas=0&existShop=false&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D; uc3=vt3=F8dByuK3QCZUfvQCEBM%3D&id2=UUphy%2FZ7Cdwys6i9Hw%3D%3D&nk2=E6EQ1CLKS1V0cko%3D&lg2=URm48syIIVrSKA%3D%3D; tracknick=pengjunlee1; lid=pengjunlee1; _l_g_=Ug%3D%3D; uc4=nk4=0%40EbhmhLlrKXZUZrdwvqX9l9vjDvaQPA%3D%3D&id4=0%40U2grEJGEPwwA5lpVHyA%2BGBdaEx29RDor; unb=2201414542619; lgc=pengjunlee1; cookie1=VvuEIF254OaIGYBx5CMIrukTXbnCoYs81mNl4ECxqes%3D; login=true; cookie17=UUphy%2FZ7Cdwys6i9Hw%3D%3D; _nk_=pengjunlee1; sg=196; csg=96ab7a84'

referer = 'https://detail.tmall.com/item.htm?id={}'.format(itemId)
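# The rate endpoint usually rejects anonymous requests, so every request carries a logged-in Cookie and a matching Referer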
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    'Referer' : referer,
    'Cookie' : cookie,
    'accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding' : 'gzip, deflate, br',
    'accept-language' : 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7'
}

# Create a separate folder for each item to hold the scraped pages
fpath = 'D:\\scrapy\\new_tmall\\{}'.format(itemId)
if not os.path.exists(fpath):
    os.makedirs(fpath)

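# The rate endpoint returns a JSONP payload such as jsonp123({...}); capture the JSON text inside the parentheses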
rate_pattern = re.compile(r'jsonp\d+\((.*?)\)$', re.S)
session = requests.Session()
pageCount = 0
currentPage = 1
# Fetch at most 15 pages per run to reduce the risk of being blocked
requests_count = 15

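# pageCount is unknown (0) until the first response is parsed, so the first iteration always runs;
# the loop stops once all pages are fetched or the request budget is exhausted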
while (currentPage <= pageCount or pageCount == 0) and requests_count > 0:

    logger.info("开始爬取第 {} 页".format(currentPage))
    pagePath = pathlib.Path('D:\\scrapy\\new_tmall\\{}\\{}.txt'.format(itemId,currentPage))
    if pagePath.exists():
        logger.info("使用本地文件。。。")
        with open('D:\\scrapy\\new_tmall\\{}\\{}.txt'.format(itemId,currentPage), 'r', encoding='utf-8') as f:
            json_str = f.read()
        ret_str = rate_pattern.search(json_str)
    else:
        logger.info("发送网络请求。。。")
        url = base_url.format(itemId, currentPage)
        time.sleep(5+random.randint(0,5))
        try :
            requests_count = requests_count-1
            response = session.get(url, headers=headers)
        except requests.RequestException as e:
            logger.error("Request failed: %s", e)
            break
        else:
            json_str = response.content.decode()
            ret_str = rate_pattern.search(json_str)
            if ret_str:
                with open(pagePath, 'w', encoding='utf-8') as f:
                    f.write(json_str)
            else:
                break
    if ret_str:
        # Connect to MongoDB (reconnecting on every page is wasteful but harmless for a short run)
        m = MongoClient(host="172.16.250.238", port=27017)
        test_db = m["test"]
        db = test_db['tmallRateEntity']

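        # The captured JSON holds the reviews under rateDetail.rateList and paging info under rateDetail.paginator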
        json_obj = json.loads(ret_str.group(1))
        rate_list = json_obj['rateDetail']['rateList']
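        # Persist each new comment both to a per-item text file in the working directory and to MongoDB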
        with open('{}.txt'.format(itemId), 'a', encoding='utf-8') as f:
            for rate in rate_list:
                o = {}
                o['rateContent'] = rate['rateContent']
                o['rateDate'] = rate['rateDate']
                o['_id'] = rate['id']
                o['goodsId'] = itemId
                o['pics'] = rate['pics']
                o['anony'] = rate['anony']
                o['userVipLevel'] = rate['userVipLevel']
                o['goldUser'] = rate['goldUser']
                o['displayUserNick'] = rate['displayUserNick']
                o['appendComment'] = rate['appendComment']
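                # Deduplicate on the Tmall rate id so reruns do not insert duplicate documents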
                ret = db.find_one({'_id': o['_id']})
                if ret is None:
                    f.write(json.dumps(o, ensure_ascii=False) + '\n')
                    db.insert_one(o)
        if pageCount == 0:
            pageCount = json_obj['rateDetail']['paginator']['lastPage']
    logger.info("总共 {} 页".format(pageCount))
    currentPage = currentPage + 1
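
To sanity-check the results, the stored comments can be read back from MongoDB. Below is a minimal sketch, assuming the same host, database ("test") and collection ("tmallRateEntity") as in the script above:

from pymongo import MongoClient

m = MongoClient(host="172.16.250.238", port=27017)
db = m["test"]["tmallRateEntity"]

# Count the stored comments for this item and print a few of them
print(db.count_documents({'goodsId': 582343703210}))
for doc in db.find({'goodsId': 582343703210}).limit(5):
    print(doc['rateDate'], doc['rateContent'])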
