# Target site: http://www.porters.vip/confusion/food.html
# Detailed walkthrough: https://blog.csdn.net/BigBoy_Coder/article/details/104748253
import requests
from parsel import Selector
import re
def _fetch_text(url, timeout):
    """Download *url* and return its body as text, raising on HTTP errors."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def getSvgMapping(svg_text=None, css_text=None, timeout=10):
    """Map each obfuscated CSS class name to the character it displays.

    Every class in the stylesheet carries a ``background: -Xpx -Ypx`` rule,
    and the SVG holds rows of characters in ``<text ... y="...">`` elements.
    The Y offset selects the row (the first row whose ``y`` is strictly
    greater than the offset) and ``X // font_size`` selects the character
    inside that row.

    Args:
        svg_text: SVG source to decode. Downloaded from the site when None.
        css_text: Stylesheet source to decode. Downloaded from the site when
            None.
        timeout: Seconds to wait for each download before giving up.

    Returns:
        A dict of ``{class_name: character}``. The character is an empty
        string when the computed index lies past the end of the row.

    Raises:
        ValueError: If the SVG declares no ``font-size`` inside ``<style>``,
            or contains no ``<text>`` rows while classes need decoding.
        requests.RequestException: If a download fails, times out, or
            returns an HTTP error status.
    """
    svg_url = 'http://www.porters.vip/confusion/font/food.svg'
    css_url = 'http://www.porters.vip/confusion/css/food.css'
    if svg_text is None:
        svg_text = _fetch_text(svg_url, timeout)
    if css_text is None:
        css_text = _fetch_text(css_url, timeout)

    # (y, characters) per SVG row, e.g. ('38', '1546691364...').
    svg_list = re.findall(r'<text x=".*?" y="(.*?)">(.*?)</text>', svg_text)
    # (class, x offset, y offset) per rule, e.g. ('vhk08k', '274', '141').
    css_list = re.findall(
        r'\.(\w{6}) \{\s+background: -(.*?)px -(.*?)px;\s+\}', css_text)

    size_match = re.search(
        r'<style>.*?font-size:(\d+)px.*?</style>', svg_text)
    if size_match is None:
        raise ValueError('no font-size declaration found in the SVG <style>')
    font_size = int(size_match.group(1))

    rows = {int(y): chars for y, chars in svg_list}
    if css_list and not rows:
        raise ValueError('no <text> rows found in the SVG')
    # Sort so the lookup does not depend on the order of rows in the file.
    ordered_ys = sorted(rows)

    num_dict = {}
    for class_name, x_offset, y_offset in css_list:
        y_value = int(y_offset)
        # First row below the offset; fall back to the bottom-most row when
        # the offset is beyond every row.
        row_y = next((y for y in ordered_ys if y_value < y), ordered_ys[-1])
        # Each glyph is font_size pixels wide, so this is the glyph's index.
        index = int(x_offset) // font_size
        num_dict[class_name] = rows[row_y][index:index + 1]
    return num_dict
def spider():
    """Fetch the food listing page, decode its obfuscated digits and print it.

    The raw HTML contains empty ``<d class="..."></d>`` placeholders; each is
    replaced with the character returned for that class by
    ``getSvgMapping()`` before the page is parsed. One line is printed per
    ``div.col.details`` entry: title, the score fields (review count,
    per-capita price, taste, environment, service), address, speciality and
    phone number.

    Raises:
        requests.RequestException: If the page download fails, times out, or
            returns an HTTP error status.
    """
    url = 'http://www.porters.vip/confusion/food.html'
    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get(url, timeout=10)
    page.raise_for_status()
    resp = page.text

    num_dict = getSvgMapping()
    # Example mapping: {'vhk08k': '0', 'vhk6zl': '1', 'vhk0ao': '1', ...}
    # Substitute the placeholders across the whole document before parsing.
    for class_name, char in num_dict.items():
        resp = resp.replace(f'<d class="{class_name}"></d>', char)

    html = Selector(resp)
    score_paths = (
        './div[2]/span[2]/text()',
        './div[2]/span[3]/text()',
        './div[2]/span[4]/span[1]/text()',
        './div[2]/span[4]/span[2]/text()',
        './div[2]/span[4]/span[3]/text()',
    )
    for item in html.xpath('//div[@class="col details"]'):
        # default='' avoids calling .strip() on None when the title is missing.
        title = item.xpath('./div[1]/text()').extract_first(default='').strip()
        score = []
        for path in score_paths:
            score += item.xpath(path).extract()
        address = ''.join(item.xpath('./div[3]/span/text()').extract())
        characteristic = item.xpath('./div[4]/span/text()').extract_first()
        phonenum = ''.join(item.xpath('./div[5]//text()').extract())
        print(title, ' '.join(score), address, characteristic, phonenum)
# Entry point: run the scraper. There is no __main__ guard, so this call also
# executes whenever the module is imported.
spider()