Scraping Baidu Tieba? A High-Pass-Rate Reverse-Engineering of Baidu's Rotation Captcha

Disclaimer
Everything in this article is provided for learning and discussion only and must not be used for any other purpose. No complete code is provided, and all captured packets, sensitive URLs and data endpoints have been desensitized. Commercial or illegal use is strictly prohibited; the author bears no responsibility for any consequences arising from such use!
This article may not be reproduced without permission, nor modified and redistributed. The author accepts no liability for any incident caused by using the techniques described here without authorization. If any rights are infringed, please contact the author and the content will be removed immediately!

Data Scraping

Scraping the Hot Forums

Hot Forum Scraping Analysis

First, open the site's home page and find the hot forums entry.
Click through to the hot forums page and inspect the request information.


After copying the request into Postman and tweaking the rn parameter, all of the data came back at once! So this part is easy; let's start writing code.
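As a quick sanity check outside Scrapy, the Postman experiment can be reproduced with requests. HOT_TIEBA_URL_EXAMPLE below is a placeholder for the sanitized endpoint, and the errno/data/forum_info layout is the same one the spider's parse() consumes later:

import requests

# Placeholder for the sanitized hot-forum endpoint
HOT_TIEBA_URL_EXAMPLE = "https://example.com/hottieba/list"

resp = requests.get(
    HOT_TIEBA_URL_EXAMPLE,
    params={"rn": 100},  # bump rn to pull back more records in one call
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=10,
)
data = resp.json()
if data.get("errno") == 0:
    print(len(data["data"]["forum_info"]), "forums returned")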

HotTiebaSpider
import json
from datetime import datetime

import pymongo
import scrapy

# Project-local pieces (DOMAINS, HOT_TIEBA_URL, HOT_TIEBA and HotTiebaSpiderItem)
# come from the project's constants/items modules and are kept sanitized here.


class HotTiebaSpider(scrapy.Spider):
    name = 'hotTieba'
    allowed_domains = DOMAINS
    start_urls = HOT_TIEBA_URL

    def __init__(self, mongo_uri, mongo_db, mongo_user, mongo_passwd):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.user = mongo_user
        self.passwd = mongo_passwd
        self.client = pymongo.MongoClient(self.mongo_uri, username=self.user, password=self.passwd)
        self.db = self.client[self.mongo_db]

    # Read the database connection info from settings.py
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
            mongo_user=crawler.settings.get('MONGO_USER'),
            mongo_passwd=crawler.settings.get('MONGO_PASSWORD')
        )

    def start_requests(self):
        self.db[HOT_TIEBA].drop()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        for url in self.start_urls:
            yield scrapy.Request(url=url, headers=headers)

    def parse(self, response):
        crawl_time = datetime.now()
        response_json = json.loads(response.body)
        if response_json['errno'] == 0:
            data = response_json['data']
            for forum_info in data['forum_info']:
                forum_info['crawl_time'] = crawl_time
                yield HotTiebaSpiderItem(forum_info)


hotTiebaItem
class HotTiebaSpiderItem(scrapy.Item):
    forum_id = scrapy.Field()
    forum_name = scrapy.Field()
    avatar = scrapy.Field()
    thread_count = scrapy.Field()
    member_count = scrapy.Field()
    crawl_time = scrapy.Field()
pipeline
class TiebaspiderPipeline(object):
    def __init__(self, mongo_uri, mongo_db, mongo_user, mongo_passwd):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.user = mongo_user
        self.passwd = mongo_passwd

        self.client = pymongo.MongoClient(self.mongo_uri, username=self.user, password=self.passwd)
        self.db = self.client[self.mongo_db]

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
            mongo_user=crawler.settings.get('MONGO_USER'),
            mongo_passwd=crawler.settings.get('MONGO_PASSWORD')
        )

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        # Save the item to MongoDB
        if isinstance(item, HotTiebaSpiderItem):
            self.db[HOT_TIEBA].insert_one(dict(item))
        elif isinstance(item, TiebaMessageSpiderItem):
            self.db[TIEBA_MASSAGE].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
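For reference, the spider and pipeline both read their MongoDB connection details from settings.py, and the pipeline has to be registered there as well. A minimal sketch (all values are placeholders, and the pipeline path is assumed from the tiebaspider package name used for the middleware later on):

# settings.py (excerpt) -- placeholder values, adjust for your environment
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = 'tieba'
MONGO_USER = 'user'
MONGO_PASSWORD = 'password'

ITEM_PIPELINES = {
    'tiebaspider.pipelines.TiebaspiderPipeline': 300,
}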
Launch script
import os

if __name__ == '__main__':
    os.system("scrapy crawl hotTieba")
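Shelling out to scrapy crawl works fine; if you would rather stay inside Python, an equivalent alternative is Scrapy's CrawlerProcess:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('hotTieba')  # spider name
    process.start()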

Scraping Posts

Post Scraping Analysis

Each forum is requested as url + f?kw=<forum name>. Open the page in the browser, inspect the HTML, locate the pieces you need, and after a bit of XPath processing you have the posts.
One thing to watch out for: when the page is requested directly, the thread data comes back wrapped inside HTML comments.
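To make the comment trick concrete, here is a minimal, self-contained sketch: the thread list arrives inside a <code> element whose content is commented out, and stripping the comment markers makes it parseable again (the element id matches the one parse() queries below; the markup is simplified):

from scrapy.selector import Selector

html = """
<code id="pagelet_html_frs-list/pagelet/thread_list"><!--

<ul id="thread_list"><li class="j_thread_list">post goes here</li></ul>
--></code>
"""
wrapped = Selector(text=html).xpath(
    "//code[@id='pagelet_html_frs-list/pagelet/thread_list']").extract()[0]
# Strip the comment markers so the inner markup becomes parseable again
unwrapped = wrapped.replace("<!--\n\n", "").replace("\n-->", "")
print(Selector(text=unwrapped).xpath("//ul[@id='thread_list']/li").extract())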
Analysis done, time to write the code!

TiebaMessageSpider
def get_message_result(crawl_time, top_li, is_top, tieba_name):
    # Skip ad blocks mixed into the thread list
    if len(top_li.xpath(".//div[contains(@class ,'mediago-ad')]")) > 0:
        return {}
    # Thread title and link
    title_a = top_li.xpath(".//div[contains(@class ,'threadlist_title')]/a")
    message_href = title_a.xpath("./@href").extract()[0]
    message_title = title_a.xpath("./@title").extract()[0]
    # Author link plus the data-field JSON that carries the user id and name
    author_a = top_li.xpath(".//div[contains(@class ,'threadlist_author')]//span[@class='frs-author-name-wrap']/a")
    author_href = author_a.xpath("./@href").extract()[0]
    author_data_field = author_a.xpath("./@data-field").extract()[0]
    author_json = json.loads(author_data_field)
    user_id = author_json['id']
    user_un = author_json['un']
    return {
        "message_href": message_href,
        "message_title": message_title,
        "author_href": author_href,
        "user_id": user_id,
        "user_un": user_un,
        "is_top": is_top,
        "tieba_name": tieba_name,
        "crawl_time": crawl_time
    }



class TiebaMessageSpider(scrapy.Spider):
    name = "message"
    allowed_domains = DOMAINS
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }

    def __init__(self, mongo_uri, mongo_db, mongo_user, mongo_passwd):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.user = mongo_user
        self.passwd = mongo_passwd
        self.client = pymongo.MongoClient(self.mongo_uri, username=self.user, password=self.passwd)
        self.db = self.client[self.mongo_db]
        hotTiebas = self.db[HOT_TIEBA]
        params = []
        for hotTieba in hotTiebas.find():
            param = {
                "kw": hotTieba['forum_name'],
                "ie": "utf8",
                "pn": 0
            }
            params.append(param)
        self.params = params
        self.start_url = START_URL
        self.client.close()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
            mongo_user=crawler.settings.get('MONGO_USER'),
            mongo_passwd=crawler.settings.get('MONGO_PASSWORD')
        )

    def start_requests(self):
        for param in self.params:
            param_encode = urlencode(param)
            url = self.start_url + param_encode
            yield scrapy.Request(url=url, headers=self.headers, meta=param)

    def parse(self, response):
        params = response.request.meta
        kw = params['kw']
        print('kw: ', kw)
        crawl_time = datetime.utcnow()
        results = response.xpath("//code[@id='pagelet_html_frs-list/pagelet/thread_list']").extract()
        if len(results) == 0:
            results = response.xpath("//div[@id='pagelet_frs-list/pagelet/thread_list']").extract()
        if len(results) == 0:
            results = response.xpath("//div[@id='content_wrap']").extract()
        for result in results:
            replace_result = result.replace("<!--\n\n", '').replace(
                "\n-->", '')
            result_selector = Selector(text=replace_result)
            top_lis = result_selector.xpath("//ul[@id='thread_top_list']/li")
            for top_li in top_lis:
                yield TiebaMessageSpiderItem(get_message_result(crawl_time, top_li, 1, kw))
            general_lis = result_selector.xpath("//ul[@id='thread_list']/li")
            for general_li in general_lis:
                print(general_li)
                yield TiebaMessageSpiderItem(get_message_result(crawl_time, general_li, 0, kw))

Item

class TiebaMessageSpiderItem(scrapy.Item):
    message_href = scrapy.Field()
    message_title = scrapy.Field()
    author_href = scrapy.Field()
    user_id = scrapy.Field()
    user_un = scrapy.Field()
    is_top = scrapy.Field()
    tieba_name = scrapy.Field()
    crawl_time = scrapy.Field()

Captcha Verification

After roughly ten minutes of post scraping, the rotation captcha check kicks in. Many thanks to the author who shared the recognition model; its accuracy is quite high.
Model link: https://blog.csdn.net/qq_44749634/article/details/125914114
The endpoints are called directly with requests; once verification passes, Selenium fetches the resulting response, which is handed to the Scrapy downloader middleware for processing.
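Before walking through each endpoint, here is how the pieces fit together. This is only an illustration of the call order of the methods shown below; the author's actual driver (request_verify, called from the middleware) is not included in the article, and the local image path and init JSON layout are assumptions:

# Illustration only: chaining the captcha endpoints in order
def verify_once(self):
    init_response = self.post_init()                    # init: send ak, receive tk / as
    self.beforeData = init_response.json()['data']      # assumed response layout
    style_response = self.post_style()                  # style: returns backstr + captchalist
    image_response = self.post_image(style_response)    # image: download the rotated picture
    png_path = self.IMAGE_LOCAL_PATH + self.backstr[0:5] + '.png'  # hypothetical local path
    return_data = self.post_log(png_path, image_response)          # log: predict angle, submit fs
    return self.log_is_success(return_data)             # status 1 = passed, 2 = another round needed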

Disabling Scrapy's redirect middleware

Once the server decides something about you looks suspicious, it redirects you to the verification page, so we have to handle the redirect ourselves.
Disable the redirect middleware in settings.py and register our custom downloader middleware instead.

DOWNLOADER_MIDDLEWARES = {
    'tiebaspider.middlewares.TiebaspiderDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': None
}

Captcha Handling Class

First, open the page and look at the requests it makes while loading.

After entering the page, an init request is fired whose key parameter is ak; ak can be taken from the redirect URL. The response contains tk and as, which will be needed later.
Let's simulate this request first.

    def post_init(self):
        params = {
            "refer": self.refer,
            "ak": self.ak,
            "ver": 1,
            "qrsign": ""
        }
        body = parse.urlencode(params)
        headers = {
            'Content-Type': self.URLENCODE_TYPE,
            'Referer': self.refer,
            'User-Agent': self.userAgent,
            'Cookies': self.cookies
        }
        return self.session.post(url=self.INIT_URL, headers=headers, data=body)
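The init response is where tk and as come from. Assuming the body is JSON with a data object (which matches how post_style reads self.beforeData below), grabbing them could look like this:

# Assumption: the init response carries tk and as under "data"
init_response = self.post_init()
self.beforeData = init_response.json()['data']   # later read as beforeData['tk'] / beforeData['as']
print('tk:', self.beforeData.get('tk'), 'as:', self.beforeData.get('as'))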

Next comes the style request, with parameters ak and tk; the response contains backstr and captchalist:

{
    "backstr": "",
    "captchalist": [
        {
            "id": "spin-0",
            "source": {
                "back": {
                    "path": ""
                }
            },
            "type": "spin"
        }
    ]
}
Simulating it:

    def post_style(self):
        headers = {
            'Content-Type': self.URLENCODE_TYPE,
            'Referer': self.DOMAINS,
            'User-Agent': self.userAgent,
            'Cookies': self.cookies
        }
        self.tk = self.beforeData['tk']
        self.as_data = self.beforeData['as']
        if self.backstr == '':
            # First round: plain style request
            params = {
                "refer": self.refer,
                "ak": self.ak,
                "tk": self.tk
            }
        else:
            # Refresh round: request a new captcha, carrying the previous backstr
            refresh = {"capId": "spin-0",
                       "backstr": self.backstr}
            params = {
                "refer": self.refer,
                "ak": self.ak,
                "tk": self.tk,
                "isios": 0,
                "type": 'spin',
                "refresh": refresh,
                "ver": 1
            }

            headers = {
                'Accept': 'application/json, text/plain, */*',
                'Content-Type': self.URLENCODE_TYPE,
                'Origin': self.DOMAINS,
                'Referer': self.refer,
                'User-Agent': self.userAgent,
                'Cookies': self.cookies
            }
        body = parse.urlencode(flat_dict(params))
        response = self.session.post(url=self.STYLE_URL, headers=headers, data=body)
        return response
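The flat_dict helper used above is not shown in the original. Since urlencode cannot serialize the nested refresh dict directly, a plausible implementation (purely my assumption) simply JSON-encodes any nested values before encoding:

import json

def flat_dict(params):
    # JSON-encode nested dicts/lists so urlencode() only sees plain strings
    return {
        key: json.dumps(value, separators=(',', ':')) if isinstance(value, (dict, list)) else value
        for key, value in params.items()
    }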

Fetching the captcha image from the image endpoint

    def post_image(self, response):
        # Parse the style response: remember backstr and pull the background image path
        style_data = response.json()['data']
        self.backstr = style_data['backstr']
        headers = {
            'Content-Type': self.URLENCODE_TYPE,
            'Referer': self.DOMAINS,
            'User-Agent': self.userAgent,
            'Cookies': self.cookies
        }
        captcha = style_data['captchalist'][0]
        element_source = captcha['source']
        element_path = element_source['back']['path']
        # Download the rotated captcha image
        response = self.session.get(url=element_path, headers=headers)
        return response

The log endpoint is the actual verification endpoint. as, tk and ak have all been obtained above, and fs is the encrypted verification payload. The only values that really matter for the check are acc_c and backstr; trace the call stack of the log request and it is easy to see the plaintext that goes into fs, and everything apart from those two fields can be hard-coded or left empty.
The encryption is simple: first derive a new key from as, then run two rounds of AES encryption with that key to obtain fs (a hedged sketch follows below).
The response of the log request contains dstk.
After that comes a second check: the c request performs the secondary verification using the dstk returned by log. Be careful though: the log and c requests must not be sent in the same session as the earlier requests, or the server responds with a security error.
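The exact key derivation and AES parameters have to be read out of the JS you see in the log call stack, so the snippet below is only a shape sketch of that double encryption: the MD5 key derivation, ECB mode, PKCS#7 padding and base64 output are my assumptions rather than the confirmed scheme, and encrypt_fs is a hypothetical stand-in for the encrypt_data method used later.

import base64
import hashlib
import json
from Crypto.Cipher import AES        # pycryptodome
from Crypto.Util.Padding import pad

def derive_key(as_data: str) -> bytes:
    # Assumption: a 16-byte key derived from "as"; the real derivation comes from the traced JS
    return hashlib.md5(as_data.encode('utf-8')).digest()

def aes_encrypt(key: bytes, plaintext: bytes) -> bytes:
    # Assumption: AES-ECB with PKCS#7 padding
    cipher = AES.new(key, AES.MODE_ECB)
    return cipher.encrypt(pad(plaintext, AES.block_size))

def encrypt_fs(as_data: str, rz_data: dict) -> str:
    key = derive_key(as_data)
    first = aes_encrypt(key, json.dumps(rz_data, separators=(',', ':')).encode('utf-8'))
    second = aes_encrypt(key, base64.b64encode(first))   # second pass over the first result
    return base64.b64encode(second).decode('utf-8')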
Simulating the c and log endpoints

    def post_log(self, png_path, response):
        # Save the downloaded captcha image to disk
        with open(png_path, 'wb') as file:
            file.write(response.content)
        image = Image.open(png_path)
        new_png_path = self.IMAGE_LOCAL_PATH + self.backstr[0:5] + '_new.png'
        ocr = my_ocr()
        image.save(new_png_path)  # save the captcha image as a new file
        # Predict the rotation angle with the model
        result = ocr.identification(new_png_path)
        e = round(int(result) / 360, 2)
        # Map the angle onto the slider track (290 and 52 presumably being track length and handle width)
        mv_length = int(result) / 360 * (290 - 52)
        print('predicted angle:', result, 'slide distance:', mv_length, 'rotation fraction:', e)
        self.updateFsJson(e)
        # Encrypt the verification payload into fs
        fs = self.encrypt_data(key=self.as_data, rzData=self.fs_json, backstr=self.backstr)
        params = {
            "refer": self.refer,
            "ak": self.ak,
            "as": self.as_data,
            "scene": '',
            "tk": self.tk,
            "ver": 1,
            "fs": fs,
            "cv": "submit",
            "typeid": "spin-0",
            "_": int(time.time()) * 1000
        }
        body = parse.urlencode(params)
        # The log (and later c) request runs in a fresh session inside request_log
        returnData = request_log(body, self.backstr, self.refer)
        return returnData

Once the returned data contains a URL, you can navigate to it directly. I use Selenium to issue that request, but all of the subsequent redirects have to stay in the same session, otherwise the verification will not carry over.

    def log_is_success(self, returnData):
        result = None
        print(returnData)
        logdata = returnData['logdata']
        beforeData = logdata['data']
        backstr = returnData['backstr']
        if 's' in beforeData.keys():
            # 's' carries the URL to jump to after submitting the answer
            self.refer = beforeData['s']['url']
            self.driver.get(beforeData['s']['url'])
            # response = self.request_session.get(beforeData['s']['url'], allow_redirects=False)
            url = self.driver.current_url
            # url = response.headers.get('Location')
            time.sleep(random.randint(5, 10))
            if 'wappass' not in url:
                print("success", url)
                status = 1
                content = self.driver.page_source
                # response = self.request_session.get(url, allow_redirects=False)
                # content=response.content
                result = content
                self.current_url = url
            else:
                # Still on the verification domain: start another round with the new ak
                print("continue", url)
                status = 2
                ak_now = self.ak
                matchObject = re.search(r'ak=(.*?)&', url, re.M | re.I)
                if matchObject:
                    ak_now = matchObject.group(1)
                self.init(url, ak=ak_now, userAgent=self.userAgent, cookies=self.cookies, kw='看贴')
                result = url
                self.current_url = url
        elif 'f' in beforeData.keys():
            status = 3
            print("security risk")
        else:
            status = 0

        self.beforeData = beforeData
        self.backstr = backstr
        return {"status": status, "result": result}

TiebaspiderDownloaderMiddleware

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        newresponse = HtmlResponse(url=response.url, body=response.body, encoding='utf-8')
        # A 302 to the wappass domain means we were redirected to the captcha page
        if response.status == 302 and 'wappass' in str(response.headers.get('Location'), 'utf-8'):
            redirect_url = str(response.headers.get('Location'), 'utf-8')
            refer = redirect_url
            ak = ""
            userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
            cookies = ''
            # Pull ak out of the redirect (wappass) URL
            matchObject = re.search(r'ak=(.*?)&', redirect_url, re.M | re.I)
            if matchObject:
                ak = matchObject.group(1)
            rotatingVerificationCode = RotatingVerificationCode(refer=refer, ak=ak, userAgent=userAgent,
                                                                cookies=cookies,
                                                                kw='看帖')

            # Keep solving captchas until we land on a page outside the wappass domain
            while True:
                rotatingVerificationCode.open_session()
                out = rotatingVerificationCode.request_verify()
                redirect_url = rotatingVerificationCode.current_url
                print('wappass' in redirect_url)
                rotatingVerificationCode.close_session()
                if 'wappass' not in redirect_url:
                    break
            print(request.url)
            # Hand the verified page back to the spider as a normal 200 response
            newresponse = HtmlResponse(url=request.url, body=out, request=request, encoding='utf-8',
                                       status=200)
        return newresponse

Summary

Reversing this captcha is really not that difficult; with a little effort it can be done. Finally, I hope you will give this post a like, a follow and some support!
