# Target site: http://www.porters.vip/confusion/food.html
# Detailed walkthrough: https://blog.csdn.net/BigBoy_Coder/article/details/104748253
import re

import requests
from parsel import Selector

# Seconds to wait on each HTTP request before giving up; without a timeout
# `requests` may block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10


def _fetch_text(url):
    """Return the decoded body of *url*; raise on timeout or HTTP error status."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def _build_num_mapping(svg_text, css_text):
    """Map each obfuscated CSS class name to the character it displays.

    The SVG holds rows of characters (``<text ... y="Y">chars</text>``) and
    the stylesheet holds one rule per class with a negative background
    offset (``background: -Xpx -Ypx``).  A class shows the character at
    column ``X // font_size`` of the first row whose ``y`` is greater than
    the rule's Y offset.

    Args:
        svg_text: Source of the SVG sprite.
        css_text: Source of the stylesheet that positions the sprite.

    Returns:
        dict mapping class name -> single-character string (an empty string
        when the column lies beyond the end of the row).

    Raises:
        ValueError: if the SVG declares no ``font-size`` in its ``<style>``
            element, or if CSS rules exist but the SVG has no text rows.
    """
    # Row baseline (y) -> characters on that row; a later duplicate y wins.
    rows = {
        int(y): chars
        for y, chars in re.findall(
            r'<text x=".*?" y="(.*?)">(.*?)</text>', svg_text
        )
    }
    # (class name, x offset, y offset) for every sprite rule.
    css_rules = re.findall(
        r'\.(\w{6}) \{\s+background: -(.*?)px -(.*?)px;\s+\}', css_text
    )

    size_match = re.search(
        r'<style>.*?font-size:(\d+)px.*?</style>', svg_text
    )
    if size_match is None:
        raise ValueError('SVG does not declare a font-size in its <style>')
    font_size = int(size_match.group(1))

    if css_rules and not rows:
        raise ValueError('SVG contains no <text> rows to decode against')

    # Sort so the lookup does not depend on the order of rows in the SVG.
    row_ys = sorted(rows)
    mapping = {}
    for class_name, x_offset, y_offset in css_rules:
        y_value = int(y_offset)
        # First row below the offset; when none is, fall back to the last
        # row (same fallback the original loop had).
        row_y = next((y for y in row_ys if y > y_value), row_ys[-1])
        # Each glyph is `font_size` px wide, so the x offset selects a column.
        column = int(x_offset) // font_size
        mapping[class_name] = rows[row_y][column:column + 1]
    return mapping


def getSvgMapping():
    """Download the sprite SVG and its stylesheet and build the class mapping.

    Returns:
        dict mapping obfuscated class name -> displayed character, e.g.
        ``{'vhk08k': '0', 'vhk6zl': '1', ...}``.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error.
        ValueError: if the downloaded files cannot be decoded
            (see ``_build_num_mapping``).
    """
    svg_url = 'http://www.porters.vip/confusion/font/food.svg'
    css_url = 'http://www.porters.vip/confusion/css/food.css'
    svg_text = _fetch_text(svg_url)
    css_text = _fetch_text(css_url)
    return _build_num_mapping(svg_text, css_text)


def spider():
    """Fetch the food page, decode its obfuscated digits and print each shop.

    Prints one line per ``col details`` block: title, the score fields joined
    by spaces, address, characteristic and phone number.
    """
    url = 'http://www.porters.vip/confusion/food.html'
    page = _fetch_text(url)

    # Replace every placeholder tag in the page with the character it renders.
    for class_name, digit in getSvgMapping().items():
        page = page.replace(f'<d class="{class_name}"></d>', digit)

    score_paths = (
        './div[2]/span[2]/text()',
        './div[2]/span[3]/text()',
        './div[2]/span[4]/span[1]/text()',
        './div[2]/span[4]/span[2]/text()',
        './div[2]/span[4]/span[3]/text()',
    )

    html = Selector(page)
    for block in html.xpath('//div[@class="col details"]'):
        # A missing title node yields None; treat it as an empty title.
        title = (block.xpath('./div[1]/text()').extract_first() or '').strip()
        score = []
        for path in score_paths:
            score += block.xpath(path).extract()
        address = ''.join(block.xpath('./div[3]/span/text()').extract())
        characteristic = block.xpath('./div[4]/span/text()').extract_first()
        phonenum = ''.join(block.xpath('./div[5]//text()').extract())
        print(title, ' '.join(score), address, characteristic, phonenum)


# Script entry point: the scraper runs as soon as this module is loaded.
spider()