1)网站采集都有时效性:一般请求都会把当时的时间戳作为变量参与混淆,生成相应的签名,这也是一种反爬手段。
下面的函数用于生成13位(毫秒级)的时间戳。
def diy_getTimeStamp12():
    """Return the current Unix time as a 13-digit millisecond timestamp.

    Despite the ``12`` in its name, the function returns a 13-digit integer:
    10 digits of whole seconds followed by 3 digits of milliseconds,
    e.g. ``1540281250399``.

    Returns:
        int: Milliseconds elapsed since the Unix epoch.
    """
    # Read the epoch clock once. Converting a naive local datetime back to
    # epoch seconds with time.mktime() is ambiguous during the hour that is
    # repeated when daylight-saving time ends.
    return int(time.time() * 1000)
2)网站的采集一般要用到登录网站的cookie或者匿名访问的cookie,代码如下:
def diy_get_cookie():
    """Run the external cookie helper and return its cookies as a dict.

    Executes ``tiktok_up_key.exe`` from ``cur_path`` with ``cookie_path`` as
    its single quoted argument and parses the text it writes to stdout.
    The output is iterated as a sequence of mappings that each carry a
    ``'name'`` and a ``'value'`` entry.

    Returns:
        dict: Cookie name mapped to cookie value.

    Raises:
        ValueError, SyntaxError: If the helper output is not a Python literal.
        KeyError, TypeError: If an entry lacks ``'name'`` / ``'value'``.
    """
    import ast  # local import so the block does not depend on file-level imports

    command = cur_path + '\\' + 'tiktok_up_key.exe "{}"'.format(cookie_path)
    # Close the pipe deterministically instead of leaving it to the GC.
    with os.popen(command) as pipe:
        raw_output = pipe.read()

    # SECURITY: this used to be eval() on the helper's output, which would
    # execute arbitrary code if the executable (or its output) were tampered
    # with. literal_eval accepts the same literal syntax without executing it.
    cookies = ast.literal_eval(raw_output.strip())
    return {cookie['name']: cookie['value'] for cookie in cookies}
3)还有的网站要生成fp,一般的代码如下:
def mergeFp(self):
    """Build a ``verify_<time>_<token>`` string.

    ``<time>`` is ``self.getTimeStamp12()`` passed through
    ``self.basen(..., 36)``. ``<token>`` is 36 characters long: positions 8,
    13, 18 and 23 are ``'_'``, position 14 is ``'4'`` and every other
    position is picked at random from the 62-symbol alphabet returned by
    ``self.split_str``.

    Returns:
        str: The assembled string.
    """
    alphabet = self.split_str(
        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    )
    alphabet_size = len(alphabet)

    # Positions whose character is fixed up front.
    slots = {8: '_', 13: '_', 18: '_', 23: '_', 14: '4'}

    timestamp = self.getTimeStamp12()
    time_part = self.basen(timestamp, 36)  # base 10 -> base 36

    for position in range(36):
        # A random number is drawn on every pass, fixed positions included,
        # so the sequence of RNG calls does not depend on the slot layout.
        draw = math.ceil(random.random() * alphabet_size)
        if position in slots:
            continue
        index = 0 | draw
        if position == 19:
            # Restrict position 19 to alphabet indices 8..11.
            index = 3 & index | 8
        # NOTE(review): ceil() yields 1..alphabet_size, so the clamp below
        # makes index 61 twice as likely and index 0 practically unreachable
        # - confirm this skew is intended.
        index = min(index, 61)
        slots[position] = alphabet[index]

    token = "".join(slots[position] for position in range(36))
    return "verify_" + str(time_part) + "_" + str(token)
4)网站采集还会用到代理IP,获取代理的代码一般如下:
def get_proxies(self):
    """Fetch proxy credentials from the account API and build a proxies dict.

    Requests ``/dianba/random_zhanghao`` on the configured API host. The JSON
    reply's ``data`` field is passed through ``AES_Decrypt`` and parsed as
    JSON; if that step fails the reply itself is used as the payload.

    Returns:
        dict | str: ``{"http": "http://<account>:<password>@<ip>:<port>"}``
        when every required field is present, otherwise an empty string
        (non-200 status, network error, malformed reply or missing field).
    """
    host_api = 'http://您的接口请求地址'
    url = host_api + '/dianba/random_zhanghao'
    try:
        # A timeout keeps a stalled API from blocking the caller forever.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return ''
        payload = response.json()
        try:
            payload = json.loads(AES_Decrypt(payload.get('data')))
        except Exception:
            # Deliberate best effort: AES_Decrypt is opaque here and may fail
            # in several ways; fall back to the undecrypted reply.
            pass

        token = payload.get('token')
        ip_account = payload.get('ip_account')
        ip_password = payload.get('ip_password')
        ip = payload.get('ip')
        host = payload.get('host')  # used as the port part of the proxy URL
        if all((token, ip_account, ip_password, ip, host)):
            # NOTE(review): only the "http" scheme is mapped, so requests to
            # https:// URLs would not use this proxy - confirm with callers.
            return {
                "http": 'http://{0}:{1}@{2}:{3}'.format(
                    ip_account, ip_password, ip, str(host)
                )
            }
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        # Network failure, non-JSON reply, or a payload that is not a mapping:
        # callers treat "" as "no proxy available".
        pass
    return ""
5)有的网站会为当前访问生成一个标记(有的叫 s_v_web_id)。当网站判定访问存在风险时,需要重新获取新的 s_v_web_id 才能正常访问;当IP被判定有风险时,则需要切换IP来解决。这些机制都是为了保障网站安全。
def get_s_v_web_id(self):
    """Return the ``s_v_web_id`` cookie value and store it on the instance.

    If ``s_v_web_id.json`` exists in the current working directory the
    cookies are read from it. Otherwise ``tiktok_up_key.exe`` (under
    ``cur_path``) is run with ``cookie_path`` as its argument, its stdout is
    parsed, and the resulting name -> value dict is also stored in
    ``self.cookies``.

    Returns:
        str: The value of the ``s_v_web_id`` cookie (also set as
        ``self.s_v_web_id``).

    Raises:
        KeyError: If no cookie named ``s_v_web_id`` is present.
        ValueError, SyntaxError: If the helper output is not a Python literal.
    """
    import ast  # local import so the block does not depend on file-level imports

    cache_file = 's_v_web_id.json'
    if os.path.exists(cache_file):
        with open(cache_file, 'r', encoding='utf-8') as f:
            cookies = json.load(f)
        cookie_map = {cookie['name']: cookie['value'] for cookie in cookies}
    else:
        command = cur_path + '\\' + 'tiktok_up_key.exe "{}"'.format(cookie_path)
        # Close the pipe deterministically instead of leaving it to the GC.
        with os.popen(command) as pipe:
            raw_output = pipe.read()
        # SECURITY: this used to be eval() on the helper's output;
        # literal_eval parses the same literal without executing code.
        cookies = ast.literal_eval(raw_output.strip())
        cookie_map = {cookie['name']: cookie['value'] for cookie in cookies}
        self.cookies = cookie_map

    self.s_v_web_id = cookie_map['s_v_web_id']
    return self.s_v_web_id
6)为了大家共同学习,给大家一段我写的实操代码,如下:
def up_app_aity(self):
    """Poll the task API for product ids, fetch each app detail and upload it.

    Runs up to 15000 iterations. Each iteration:

    1. Sleeps ``self.detail_sleep_time`` seconds and asks the task endpoint
       for the next product (``product_id`` plus ``h5`` / ``app`` flags).
    2. Terminates the process with exit status 1 when the returned product id
       is shorter than 7 characters.
    3. Skips the product when the ``h5`` flag is ``"0"``.
    4. When the ``app`` flag is ``"0"``, requests the app detail (one retry);
       after two failures, or when the reply contains ``is_risky``, pauses
       and moves on to the next iteration.
    5. Strips bulky keys from the result and posts it to the save endpoint.

    Raises:
        SystemExit: When the task endpoint signals there is nothing left.
    """
    stripped_keys = ('page_style', 'fallback_page', 'extra', 'log_pb')

    def pause_after_block():
        """Wait before the next attempt, rebooting the router if configured."""
        if DouYinConfig.if_reboot_router == "1":
            path = os.path.join(os.getcwd(), 'selenium_chrome', 'rebootRouterNow.exe')
            anti = " ".join([path, ' '])
            # Announce the pause before it starts; the message now matches
            # the 120 seconds actually slept.
            print("睡眠120秒,从新启动路由器")
            with os.popen(anti) as pipe:
                pipe.read()
            time.sleep(120)
        else:
            time.sleep(1800)

    def request_app_detail(target_url, request_headers, body):
        """POST the detail request and return the decoded JSON reply."""
        response = requests.request(
            "POST", target_url, headers=request_headers, data=body, timeout=5
        )
        return json.loads(response.text)

    for _ in range(0, 15000):
        p_url = 'https://网站请求接口/uploadGoodsDetail?type=app'
        print(f"睡眠{self.detail_sleep_time}秒请求产品详情")
        time.sleep(self.detail_sleep_time)
        res = requests.post(p_url, json={}).text
        res_data = json.loads(res)
        product_id = str(res_data["data"]["product_id"])
        res_h5 = str(res_data["data"]["h5"])
        res_app = str(res_data["data"]["app"])
        print(res_data)

        if len(product_id) < 7:
            print("今天数据已经采集完毕,睡眠6秒会结束")
            time.sleep(6)
            # Same effect as the former exit(1), without relying on the
            # interactive `exit` helper injected by the site module.
            raise SystemExit(1)

        verifyFp = self.s_v_web_id
        print("---------------verifyFp-->" + verifyFp)
        msToken = self.get_ms_token()
        print(msToken)

        # h5 variant of the request; its URL is only printed.
        api_url = "is_h5=1&is_native_h5=1&verifyFp={}&msToken={}"
        api_post_data = "use_new_price=1&is_h5=1&bff_type=2&is_in_app=0&origin_type=&promotion_ids={}&meta_param=&source_page=&request_additions=&isFromVideo=false"
        fapi_url = api_url.format(verifyFp, msToken)
        fapi_post_data = api_post_data.format(product_id)
        xb = get_tiiktok_anti2(fapi_url, fapi_post_data)
        url = "https://采集网址/aweme/v2/shop/promotion/pack/?" + xb
        print(url)

        headers = {
            'Host': 'ecom.ecombdapi.com',
            'Connection': 'keep-alive',
            'Content-Length': '550',
            'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
            'Accept': 'application/json, text/plain, */*',
            'Content-Type': 'application/x-www-form-urlencoded',
            'sec-ch-ua-mobile': '?0',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0',
            'sec-ch-ua-platform': '"Windows"',
            'Origin': 'https://网址',
            'Sec-Fetch-Site': 'cross-site',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Dest': 'empty',
            'Referer': 'https://网址/',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'cookie': f'msToken={msToken}',
        }

        # Products whose h5 flag is "0" are skipped entirely.
        if str(res_h5) == "0":
            continue
        h5 = {}

        print("睡眠1秒请求app产品详情")
        api_url = "is_h5=1&is_native_h5=1&verifyFp={}&msToken={}"
        api_post_data = "promotion_ids={}&use_new_price=1&isFromVideo=false&use_new_price=1&bff_type=2&is_in_app=1"
        fapi_url = api_url.format(verifyFp, msToken)
        fapi_post_data = api_post_data.format(product_id)
        xb = get_tiiktok_anti2(fapi_url, fapi_post_data)
        url = "https://采集域名/aweme/v2/shop/promotion/pack/?" + xb

        # Default to an empty result, mirroring h5. Previously app_data was
        # unbound (NameError) on the first pass, or silently reused the
        # previous product's data, whenever the app flag was not "0".
        app_data = {}
        if str(res_app) == "0":
            print("开始app请求--->")
            try:
                app_data = request_app_detail(url, headers, fapi_post_data)
            except Exception:
                print("开始app请求超时--->")
                rnd_sleep = random.randint(1, 2)
                print(f"采集睡眠{rnd_sleep}秒后重新请求 ")
                time.sleep(rnd_sleep)
                try:
                    app_data = request_app_detail(url, headers, fapi_post_data)
                except Exception:
                    print(f"------------------------>产品id:{product_id} 两次请求超时会主动退出 ")
                    pause_after_block()
                    continue
            print("app请求结束--->")

        goods_dict = {"h5": h5, "app": app_data}

        if 'is_risky' in goods_dict["app"]:
            print(f"------------------------>产品id:{product_id} 因风控所以主动退出 ")
            pause_after_block()
            continue

        # Drop keys that are not wanted in the uploaded payload.
        for section in ("h5", "app"):
            for key in stripped_keys:
                if key in goods_dict[section]:
                    del goods_dict[section][key]

        print(f"------------------------>产品id:{product_id} 返回json开始")
        json_str = json.dumps(goods_dict)
        print(json_str)
        p_url = f'https://您的数据保存接口?product_id={product_id}&type=app'
        # NOTE(review): goods_dict always holds the "h5" and "app" keys, so
        # this guard can never fire - confirm whether app_data was meant.
        if not goods_dict:
            print(Fore.GREEN + '----------------未获取远端数据----------------')
            break
        res = requests.post(p_url, json=goods_dict).text
        print(Fore.GREEN + '----------------返回调用结果----------------')
        print(res)
        print(f"------------------------>产品id:{product_id} 返回json结束")

        rnd_sleep = random.randint(1, 3)
        print(f"------------------------>产品id:{product_id} 采集睡眠{rnd_sleep}秒")
        time.sleep(rnd_sleep)
以上内容百分百原创,欢迎小伙伴和我一起讨论学习,大家共同成长。
特殊声明:不能用学到的本领对任何网站进行攻击,要遵守中国乃至世界的互联网安全法规,还要树立正确的人生观和世界观。欢迎大家一起学习,为中国的互联网安全助力。