# 目标网站:http://www.porters.vip/confusion/flight.html#
# css法详解可到崔佬的https://cuiqingcai.com/8678.html查看,在此以表感谢,学习了。
import requests
import re
from parsel import Selector
# css法
def spider():
    """Print the displayed digits of every ticket price on the demo page (CSS selectors).

    Each price is an ``<em class="rel">`` element. Its first ``<b>`` tag
    carries the base digits in ``<i>`` children and a ``width`` that, divided
    by 16px per digit, gives the number of digits shown. Every further ``<b>``
    tag carries one digit plus a ``left`` offset; the offset divided by 16
    is the (usually negative) list index of the base digit it overwrites.

    One list of digit strings is printed per price. Nothing is returned.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an error status.
        ValueError: a ``<b>`` tag lacks a parsable ``width``/``left`` value.
        IndexError: an offset points outside the base digit list.
    """
    url = 'http://www.porters.vip/confusion/flight.html'
    # Anchor on the property name and capture digits only, so a trailing ';'
    # or a different property order cannot leak extra text into the capture.
    width_re = re.compile(r'(?:^|;)\s*width\s*:\s*(\d+)px')
    left_re = re.compile(r'(?:^|;)\s*left\s*:\s*(-?\d+)px')

    # The timeout keeps the script from hanging on a dead connection;
    # raise_for_status() stops an error page from being parsed as data.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()

    sel = Selector(resp.text)
    for element in sel.css('em.rel'):
        # CSS queries are evaluated relative to the selected node, so the
        # <b> selectors can be used directly without re-parsing their HTML.
        b_tags = element.css('b')
        if not b_tags:
            continue
        first_b, *cover_tags = b_tags

        width = width_re.search(first_b.css('b::attr("style")').get() or '')
        if width is None:
            raise ValueError('first <b> tag has no parsable width in its style')
        # 16px per digit: the width tells how many base digits are visible.
        number = int(width.group(1)) // 16
        base_price = first_b.css('i::text').getall()[:number]

        for cover in cover_tags:
            left = left_re.search(cover.css('b::attr("style")').get() or '')
            if left is None:
                raise ValueError('<b> tag has no parsable left offset in its style')
            # Offsets are negative, so the index counts from the end of the list.
            index = int(int(left.group(1)) / 16)
            base_price[index] = cover.css('b::text').get()
        print(base_price)
# spider()
# xpath法一:
def spider1():
    """Print the displayed digits of every ticket price on the demo page (XPath, variant 1).

    This variant serialises each ``<b>`` child of an ``<em class="rel">``
    element and re-parses it as its own small document, which is why the
    absolute ``//`` paths below only ever see that one tag.

    The first ``<b>`` supplies the base digits (``<i>`` texts) and a ``width``
    that, divided by 16px per digit, gives the number of digits shown. Each
    further ``<b>`` supplies one digit and a ``left`` offset; the offset
    divided by 16 is the (usually negative) list index it overwrites.

    One list of digit strings is printed per price. Nothing is returned.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an error status.
        ValueError: a ``<b>`` tag lacks a parsable ``width``/``left`` value.
        IndexError: an offset points outside the base digit list.
    """
    url = 'http://www.porters.vip/confusion/flight.html'
    # Anchor on the property name and capture digits only, so a trailing ';'
    # or a different property order cannot leak extra text into the capture.
    width_re = re.compile(r'(?:^|;)\s*width\s*:\s*(\d+)px')
    left_re = re.compile(r'(?:^|;)\s*left\s*:\s*(-?\d+)px')

    # The timeout keeps the script from hanging on a dead connection;
    # raise_for_status() stops an error page from being parsed as data.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()

    sel = Selector(resp.text)
    for element in sel.xpath('//em[@class="rel"]'):
        fragments = element.xpath('./b').extract()
        if not fragments:
            continue
        first_html, *cover_html = fragments

        b1 = Selector(first_html)
        width = width_re.search(b1.xpath('//@style').extract_first() or '')
        if width is None:
            raise ValueError('first <b> tag has no parsable width in its style')
        # 16px per digit: the width tells how many base digits are visible.
        number = int(width.group(1)) // 16
        base_price = b1.xpath('//i/text()').extract()[:number]

        for html in cover_html:
            eb = Selector(html)
            left = left_re.search(eb.xpath('//@style').extract_first() or '')
            if left is None:
                raise ValueError('<b> tag has no parsable left offset in its style')
            # Offsets are negative, so the index counts from the end of the list.
            index = int(int(left.group(1)) / 16)
            base_price[index] = eb.xpath('//text()').extract_first()
        print(base_price)
# spider1()
# xpath法二:
def spider2():
    """Print the displayed digits of every ticket price on the demo page (XPath, variant 2).

    This variant stays inside the parsed document and uses XPath relative to
    each ``<em class="rel">`` element: ``./b[1]`` is the base tag and
    ``./b[1]/following-sibling::b`` are the tags that overwrite its digits.

    The base tag supplies the digits (``<i>`` texts) and a ``width`` that,
    divided by 16px per digit, gives the number of digits shown. Each sibling
    ``<b>`` supplies one digit and a ``left`` offset; the offset divided by 16
    is the (usually negative) list index it overwrites.

    One list of digit strings is printed per price. Nothing is returned.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an error status.
        ValueError: a ``<b>`` tag lacks a parsable ``width``/``left`` value.
        IndexError: an offset points outside the base digit list.
    """
    url = 'http://www.porters.vip/confusion/flight.html'
    # Anchor on the property name and capture digits only, so a trailing ';'
    # or a different property order cannot leak extra text into the capture.
    width_re = re.compile(r'(?:^|;)\s*width\s*:\s*(\d+)px')
    left_re = re.compile(r'(?:^|;)\s*left\s*:\s*(-?\d+)px')

    # The timeout keeps the script from hanging on a dead connection;
    # raise_for_status() stops an error page from being parsed as data.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()

    sel = Selector(resp.text)
    for element in sel.xpath('//em[@class="rel"]'):
        first_b = element.xpath('./b[1]')
        if not first_b:
            continue

        width = width_re.search(first_b.xpath('./@style').extract_first() or '')
        if width is None:
            raise ValueError('first <b> tag has no parsable width in its style')
        # 16px per digit: the width tells how many base digits are visible.
        number = int(width.group(1)) // 16
        base_price = first_b.xpath('./i/text()').extract()[:number]

        # Restrict the axis to <b> siblings so unrelated sibling elements are
        # never treated as price digits.
        for cover in element.xpath('./b[1]/following-sibling::b'):
            left = left_re.search(cover.xpath('./@style').extract_first() or '')
            if left is None:
                raise ValueError('<b> tag has no parsable left offset in its style')
            # Offsets are negative, so the index counts from the end of the list.
            index = int(int(left.group(1)) / 16)
            base_price[index] = cover.xpath('.//text()').extract_first()
        print(base_price)
# Run the scraper only when the file is executed as a script, so importing
# this module does not trigger a network request.
if __name__ == "__main__":
    spider2()