搜狗微信 长链接转微信链接

背景:搜狗微信下文章采集

搜索关键词在未登录情况下,默认最多只能获取10页文章,且dom中文章链接是有有效期的,本人测试有效期大概是2个小时左右。分析过程暂时不写,直接上代码。
import json
import random
import traceback
import re
import requests


class wx_spider(object):
    """Resolve a Sogou WeChat search-result link into the article URL.

    The flow implemented here is:

    1. Parse the ``uigs_para`` JavaScript object out of the search page HTML.
    2. Call three Sogou endpoints to collect the ``SUID``, ``JSESSIONID`` and
       ``SUV`` cookie values (only ``SNUID`` and ``SUV`` end up in the final
       ``Cookie`` header).
    3. Append the ``k``/``h`` parameters to the link, request it, and join the
       ``url += '...'`` fragments found in the response body.

    NOTE(review): every request is sent with ``verify=False`` (TLS certificate
    verification disabled), exactly as in the original code. Confirm whether
    that is still required before using this outside of experiments.
    """

    # Seconds to wait on a single HTTP request before giving up, so that a
    # stalled server cannot block the caller forever.
    REQUEST_TIMEOUT = 10

    # Desktop browser User-Agent strings; one is picked at random per cookie
    # handshake. Built once at class level instead of on every call.
    _PC_USER_AGENTS = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.3",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1",
        "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    )

    def __init__(self):
        pass

    def get_real_url(self, url, html_str):
        """Return the URL assembled from the response to a Sogou link.

        Args:
            url: The article link taken from the search results page.
            html_str: HTML of the search results page; it supplies the
                ``uigs_para`` data needed for the cookie handshake.

        Returns:
            The concatenated ``url += '...'`` fragments from the response with
            every ``@`` removed, or ``''`` if any step fails (the traceback is
            printed in that case).
        """
        real_url = ''
        try:
            # get_cookie() requires both the page HTML and the link; the link
            # is used as the Referer of one of the handshake requests.
            cookie = self.get_cookie(html_str, url)
            headers = {
                "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            }
            if cookie:
                # Do not send an empty Cookie header when the handshake failed.
                headers["Cookie"] = cookie
            signed_url = self._wx_get_k_h(url)

            with requests.get(url=signed_url, headers=headers, stream=True,
                              verify=False, timeout=self.REQUEST_TIMEOUT) as r:
                fragments = re.findall(r"url \+= '(.*?)'", r.text)
            real_url = ''.join(fragments).replace("@", "")
        except Exception:
            # Best-effort API: report the failure and return ''.
            print(traceback.format_exc())
        return real_url

    def get_pc_useragent(self):
        """Return one desktop User-Agent string chosen at random."""
        return random.choice(self._PC_USER_AGENTS)

    def get_cookie(self, html_str, content_url):
        """Build the ``Cookie`` header value needed to resolve a link.

        Args:
            html_str: HTML of the search results page.
            content_url: Link sent as the Referer of one handshake request.

        Returns:
            A string of the form ``"SNUID=...; SUV=..."``, or ``''`` when the
            page cannot be parsed or any request fails (the traceback is
            printed for unexpected errors).
        """
        try:
            uigs_para = self._wx_get_uigs_para(html_str)
            if not uigs_para:
                return ''
            headers = {'User-Agent': self.get_pc_useragent()}
            params = self._wx_get_cookie(uigs_para, headers, content_url)
            return "SNUID={}; SUV={}".format(params['SNUID'], params['SUV'])
        except Exception:
            print(traceback.format_exc())
            return ''

    def _wx_request_headers(self, accept, host, referer, snuid, user_agent):
        """Build the header set shared by the three cookie requests."""
        return {
            "Accept": accept,
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
            "Cookie": "SNUID={}".format(snuid),
            "Host": host,
            "Referer": referer,
            "User-Agent": user_agent,
        }

    def _wx_fetch_cookie_value(self, url, headers, name, params=None):
        """GET ``url`` and return cookie ``name`` from its Set-Cookie header.

        Raises:
            KeyError: The response has no ``Set-Cookie`` header.
            IndexError: The header does not contain ``name=...;``.
        """
        # Only the headers are needed; the ``with`` block closes the streamed
        # response so the connection is released instead of leaking.
        with requests.get(url, headers=headers, params=params, stream=True,
                          verify=False, timeout=self.REQUEST_TIMEOUT) as response:
            set_cookie = response.headers['Set-Cookie']
        return re.findall('{}=(.*?);'.format(name), set_cookie, re.S)[0]

    def _wx_get_cookie(self, uigs_para, headers, content_url):
        """Collect the cookie values issued by three Sogou endpoints.

        Args:
            uigs_para: Dict parsed from the page; must contain ``'snuid'``. It
                is also sent as the query string of the last request.
            headers: Dict whose ``'User-Agent'`` entry (default ``''``) is
                reused for every request.
            content_url: Referer for the ``wexinurlenc_sogou_profile.jsp``
                request.

        Returns:
            Dict with the keys ``SNUID``, ``SUID``, ``JSESSIONID`` and ``SUV``.

        Raises:
            KeyError, IndexError: A response lacks the expected cookie.
        """
        user_agent = headers.get('User-Agent', '')
        snuid = uigs_para['snuid']
        cookie_params = {"SNUID": snuid}

        cookie_params['SUID'] = self._wx_fetch_cookie_value(
            "https://www.sogou.com/sug/css/m3.min.v.7.css",
            self._wx_request_headers("text/css,*/*;q=0.1", "www.sogou.com",
                                     "https://weixin.sogou.com/", snuid, user_agent),
            'SUID',
        )

        cookie_params['JSESSIONID'] = self._wx_fetch_cookie_value(
            "https://weixin.sogou.com/websearch/wexinurlenc_sogou_profile.jsp",
            self._wx_request_headers("*/*", "weixin.sogou.com",
                                     content_url, snuid, user_agent),
            'JSESSIONID',
        )

        cookie_params['SUV'] = self._wx_fetch_cookie_value(
            "https://pb.sogou.com/pv.gif",
            self._wx_request_headers("image/webp,*/*", "pb.sogou.com",
                                     "https://weixin.sogou.com/", snuid, user_agent),
            'SUV',
            params=uigs_para,
        )
        return cookie_params

    def _wx_get_uigs_para(self, html_str):
        """Extract the ``uigs_para`` object from the search page HTML.

        Args:
            html_str: HTML of the search results page.

        Returns:
            The parsed dict with ``'right'`` set to ``'right0_0'`` and
            ``'exp_id'`` set to the page's ``exp_id`` minus its last
            character. Returns ``{}`` (after printing a page-error message)
            when the object or the ``exp_id`` assignment is missing, or when
            the object is not valid JSON.
        """
        para_match = re.search(r'var uigs_para = (.*?);', html_str, re.S)
        exp_id_match = re.search(r'uigs_para\.exp_id = "(.*?)";', html_str, re.S)
        if para_match is None or exp_id_match is None:
            print('页面错误')
            return {}

        # The page embeds a JavaScript ternary that is not valid JSON;
        # substitute a literal 0 before parsing.
        raw = para_match.group(1).replace('passportUserId ? "1" : "0"', '0')
        try:
            uigs_para = json.loads(raw)
        except ValueError:
            uigs_para = None
        if not isinstance(uigs_para, dict):
            print('页面错误')
            return {}

        uigs_para['right'] = 'right0_0'
        uigs_para['exp_id'] = exp_id_match.group(1)[:-1]
        return uigs_para

    def _wx_get_k_h(self, url):
        """Append the ``k`` and ``h`` query parameters to a Sogou link.

        ``k`` is a random integer in 1..100 and ``h`` is the single character
        of ``url`` located ``4 + 21 + k`` positions after the start of
        ``"url="`` (empty if the link is too short).

        The link is returned unchanged when it has no ``"url="`` part (there
        is nothing to sample ``h`` from) or already carries ``"&k="`` (signing
        twice would duplicate the parameters).
        """
        a = url.find("url=")
        if a == -1 or "&k=" in url:
            return url
        b = random.randint(1, 100)
        pos = a + 4 + 21 + b
        return url + "&k=" + str(b) + "&h=" + url[pos:pos + 1]

if __name__ == '__main__':
    # Demo entry point: resolve one link and show the result. Both
    # placeholders must be replaced with real data before running.
    content_url = '***'  # replace with the article link
    html_str = '***'  # DOM (HTML source) of the search results page
    s = wx_spider()
    # Print the resolved URL; previously the return value was discarded, so
    # a successful run produced no visible output.
    print(s.get_real_url(content_url, html_str))

  • 如有问题,欢迎大家在评论区指正。
  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 20
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 20
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值