背景:搜狗微信下文章采集
在未登录的情况下,按关键词搜索默认最多只能获取10页文章,且DOM中的文章链接有有效期,本人实测约为2个小时。分析过程暂时不写,直接上代码。
import json
import random
import traceback
import re
import requests
class wx_spider(object):
    """Resolve Sogou WeChat search-result links into their real target URLs.

    The flow implemented here is:

    1. Parse the ``uigs_para`` object embedded in the search-result HTML.
    2. Replay three requests against Sogou endpoints to collect the cookies
       ``SUID``, ``JSESSIONID`` and ``SUV`` (``SNUID`` comes from
       ``uigs_para``).
    3. Append ``k``/``h`` query parameters to the result link, fetch it with
       the ``SNUID``/``SUV`` cookie, and stitch the real URL together from
       the ``url += '...'`` fragments found in the response body.
    """

    # Seconds to wait on each HTTP request before giving up; without a
    # timeout a stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 10

    # Desktop browser User-Agent strings; one is picked at random per call.
    # Duplicates are kept on purpose so the selection weights are unchanged.
    _PC_USER_AGENTS = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.3",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1",
        "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    )

    def __init__(self):
        pass

    def get_real_url(self, url, html_str):
        """Return the real article URL behind a search-result link.

        Args:
            url: The result link taken from the search page. It is also used
                as the ``Referer`` while the cookies are being collected.
            html_str: HTML of the search-result page containing
                ``var uigs_para = ...``.

        Returns:
            The reconstructed URL, or ``''`` when the cookie could not be
            built, the request failed, or no ``url += '...'`` fragment was
            found. Failures are printed as tracebacks, never raised.
        """
        real_url = ''
        try:
            # get_cookie() needs both the page HTML and the link itself; it
            # must see the link before the k/h parameters are appended.
            cookie = self.get_cookie(html_str, url)
            if not cookie:
                # get_cookie() already reported why; an empty Cookie header
                # cannot authenticate the request, so skip it.
                return real_url
            headers = {
                "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                "Cookie": cookie
            }
            signed_url = self._wx_get_k_h(url)
            # NOTE(security): verify=False disables TLS certificate checks;
            # kept from the original code, enable verification if possible.
            with requests.get(url=signed_url, headers=headers, stream=True,
                              verify=False,
                              timeout=self.REQUEST_TIMEOUT) as response:
                body = response.text
            # The body carries the target split across several
            # "url += '...'" statements; join them and strip "@" characters.
            fragments = re.findall(r"url \+= '(.*?)'", body)
            real_url = ''.join(fragments).replace("@", "")
        except Exception:
            print(traceback.format_exc())
        return real_url

    def get_pc_useragent(self):
        """Return one randomly chosen desktop User-Agent string."""
        return random.choice(self._PC_USER_AGENTS)

    def get_cookie(self, html_str, content_url):
        """Build the ``Cookie`` header value needed to follow a result link.

        Args:
            html_str: HTML of the search-result page.
            content_url: The result link; sent as ``Referer`` by one of the
                cookie-collecting requests.

        Returns:
            ``"SNUID=...; SUV=..."`` on success, or ``''`` when the page has
            no ``uigs_para`` or any step fails (the traceback is printed).
        """
        try:
            headers = {'User-Agent': self.get_pc_useragent()}
            uigs_para = self._wx_get_uigs_para(html_str)
            if not uigs_para:
                return ''
            params = self._wx_get_cookie(uigs_para, headers, content_url)
            return "SNUID={}; SUV={}".format(params['SNUID'], params['SUV'])
        except Exception:
            print(traceback.format_exc())
            return ''

    def _wx_request_headers(self, accept, host, referer, snuid, user_agent):
        """Assemble the header set shared by the cookie-collecting requests."""
        return {
            "Accept": accept,
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
            "Cookie": "SNUID={}".format(snuid),
            "Host": host,
            "Referer": referer,
            "User-Agent": user_agent
        }

    def _wx_fetch_set_cookie(self, url, headers, params=None):
        """GET ``url`` and return its ``Set-Cookie`` response header.

        The body is never read, so the response is opened in streaming mode
        and closed explicitly by the ``with`` block to release the
        connection.

        Raises:
            KeyError: If the response has no ``Set-Cookie`` header.
        """
        # NOTE(security): verify=False disables TLS certificate checks;
        # kept from the original code.
        with requests.get(url, headers=headers, params=params, stream=True,
                          verify=False,
                          timeout=self.REQUEST_TIMEOUT) as response:
            return response.headers['Set-Cookie']

    def _wx_get_cookie(self, uigs_para, headers, content_url):
        """Collect the Sogou cookies by replaying three requests.

        Args:
            uigs_para: Dict from ``_wx_get_uigs_para``; must contain
                ``'snuid'`` and is sent as the query string of the last
                request.
            headers: Dict whose ``'User-Agent'`` entry (default ``''``) is
                reused for every request.
            content_url: Sent as ``Referer`` of the second request.

        Returns:
            Dict with the keys ``SNUID``, ``SUID``, ``JSESSIONID`` and
            ``SUV``.

        Raises:
            KeyError: If ``'snuid'`` is missing or a response lacks
                ``Set-Cookie``.
            IndexError: If an expected cookie is absent from ``Set-Cookie``.
        """
        user_agent = headers.get('User-Agent', '')
        snuid = uigs_para['snuid']
        cookie_params = {"SNUID": snuid}

        set_cookie = self._wx_fetch_set_cookie(
            "https://www.sogou.com/sug/css/m3.min.v.7.css",
            self._wx_request_headers("text/css,*/*;q=0.1", "www.sogou.com",
                                     "https://weixin.sogou.com/", snuid,
                                     user_agent))
        cookie_params['SUID'] = re.findall('SUID=(.*?);', set_cookie, re.S)[0]

        set_cookie = self._wx_fetch_set_cookie(
            "https://weixin.sogou.com/websearch/wexinurlenc_sogou_profile.jsp",
            self._wx_request_headers("*/*", "weixin.sogou.com", content_url,
                                     snuid, user_agent))
        cookie_params['JSESSIONID'] = re.findall(
            'JSESSIONID=(.*?);', set_cookie, re.S)[0]

        set_cookie = self._wx_fetch_set_cookie(
            "https://pb.sogou.com/pv.gif",
            self._wx_request_headers("image/webp,*/*", "pb.sogou.com",
                                     "https://weixin.sogou.com/", snuid,
                                     user_agent),
            params=uigs_para)
        cookie_params['SUV'] = re.findall('SUV=(.*?);', set_cookie, re.S)[0]
        return cookie_params

    def _wx_get_uigs_para(self, html_str):
        """Extract the ``uigs_para`` object from the search-result HTML.

        Args:
            html_str: HTML of the search-result page.

        Returns:
            The parsed dict with ``'right'`` set to ``'right0_0'`` and
            ``'exp_id'`` set to the page's ``uigs_para.exp_id`` value minus
            its last character; ``{}`` (after printing a message) when the
            page does not define ``uigs_para``.

        Raises:
            IndexError: If ``uigs_para.exp_id = "...";`` is missing.
            ValueError: If the extracted object is not valid JSON.
        """
        if 'var uigs_para = ' not in html_str:
            print('页面错误')
            return {}
        raw = re.findall(r'var uigs_para = (.*?);', html_str, re.S)[0]
        # The object embeds a JavaScript ternary, which is not JSON; replace
        # it with the constant 0 before parsing (no-op when it is absent).
        raw = raw.replace('passportUserId ? "1" : "0"', '0')
        uigs_para = json.loads(raw)
        exp_id = re.findall('uigs_para.exp_id = "(.*?)";', html_str, re.S)[0]
        uigs_para['right'] = 'right0_0'
        uigs_para['exp_id'] = exp_id[:-1]
        return uigs_para

    def _wx_get_k_h(self, url):
        """Append the ``k``/``h`` query parameters to a result link.

        ``k`` is a random integer in 1..100 and ``h`` is the single
        character of ``url`` at offset ``index("url=") + 4 + 21 + k`` (empty
        when that offset is past the end). The offset constants are taken
        as-is from the original code.

        Args:
            url: The result link.

        Returns:
            ``url`` with ``&k=<k>&h=<h>`` appended. The link is returned
            unchanged when it has no ``url=`` parameter (the offset would be
            meaningless) or already contains ``&k=`` (already processed).
        """
        anchor = url.find("url=")
        if anchor == -1 or "&k=" in url:
            return url
        k = int(random.random() * 100) + 1
        start = anchor + 4 + 21 + k
        return url + "&k=" + str(k) + "&h=" + url[start:start + 1]
if __name__ == '__main__':
    # Placeholders: substitute a real search-result link and the HTML of the
    # search-result page it was taken from before running.
    content_url, html_str = '***', '***'
    s = wx_spider()
    s.get_real_url(content_url, html_str)