import base64
import re
from functools import lru_cache
from io import BytesIO

import requests
from fontTools.ttLib import TTFont
from lxml import etree
# Fetch the listing page; the obfuscation font is embedded in it as a
# base64 payload.
session = requests.session()
response = session.get(url='https://bj.58.com/chuzu/')

# Raw string: the pattern contains ``\)``, which is an invalid escape
# sequence in a normal string literal.
font_match = re.search(r"charset=utf-8;base64,(.*?)'\)", response.text)
if font_match is None:
    # Same exception type the former ``findall(...)[0]`` raised, but with a
    # message that says what went wrong.
    raise IndexError('no base64-embedded font found in the response body')
bs64_str = font_match.group(1)
@lru_cache(maxsize=8)
def _load_cmap(b64_font):
    """Return the code point -> glyph name mapping of a base64-encoded font.

    The result is cached per payload so the font is parsed once rather than
    on every call to ``get_page_show_ret``.
    """
    font = TTFont(BytesIO(base64.decodebytes(b64_font.encode())))
    # Only the first cmap subtable is consulted.
    return font['cmap'].tables[0].cmap


def get_page_show_ret(string):
    """Decode font-obfuscated characters in *string*.

    Each character whose code point appears in the cmap of the font held in
    the module-level ``bs64_str`` is replaced by a number derived from its
    glyph name: the integer formed by the name's last two characters, minus
    one. All other characters are kept unchanged.

    Args:
        string: Text scraped from the page.

    Returns:
        The decoded text as a ``str``.

    Raises:
        ValueError: If a mapped glyph name does not end in two digits.
    """
    cmap = _load_cmap(bs64_str)
    pieces = []
    for char in string:
        code_point = ord(char)
        if code_point in cmap:
            glyph_name = cmap[code_point]
            # NOTE(review): presumably glyph names look like "glyph00007"
            # and stand for the digit 6 -- confirm against the live font.
            pieces.append(str(int(glyph_name[-2:]) - 1))
        else:
            pieces.append(char)
    return ''.join(pieces)
# Run every scraped value through get_page_show_ret() to decode it.
# Build the element tree that the XPath query below runs against.
# NOTE(review): assumes the prices come from the same response the font was
# extracted from -- confirm.
html = etree.HTML(response.text)

# Keep only the first ten text nodes matched by the XPath expression.
monery = html.xpath('//div/ul/li/div[3]/div[2]/b/text()')[:10]

# Decode each obfuscated text node.
price = [get_page_show_ret(raw_text) for raw_text in monery]