def get_num(url):
response = requests.get(url, headers=headers).text
result = re.search(
r’&md5sum=(.)&sign=(.)&rtcs_flag=(.)&rtcs_ver=(.?)“.rsign":"(.?)”,', response, re.M | re.I) # 寻找参数
reader = {
“md5sum”: result.group(1),
“sign”: result.group(2),
“rtcs_flag”: result.group(3),
“rtcs_ver”: result.group(4),
“width”: 176,
“type”: “org”,
“rsign”: result.group(5)
}
result_page = re.findall(
r’merge":"(.?)".?“page”😦.*?)}', response) # 获取每页的标签
doc_url = “https://wkretype.bdimg.com/retype/merge/” + url[29:-5] # 网页的前缀
n = 0
for i in range(len(result_page)): # 最大同时一次爬取10页
if i % 10 is 0:
doc_range = ‘_’.join([k for k, v in result_page[n:i]])
reader[‘pn’] = n + 1
reader[‘r