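# Use XPath: in the browser's page-source view, right-click an element and copy its XPath to extract that element.
# Store the scraped data in an Excel file.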
# (1) Fetch the page source
# (2) Parse the server's response with etree.HTML
# (3) Print the results
# Parsing the page source yields the data we want
import xlrd
from lxml import etree
import urllib.request
from xlutils.copy import copy
# url = 'http://s.lvmama.com/ticket/H9K310000P1?keyword=%E6%A1%82%E6%9E%97&tabType=route350#list'
def create_url(index):
    """Build the search-results URL for result page number *index*.

    The page number is spliced between a fixed URL prefix and a fixed
    query-string/fragment suffix.
    """
    prefix = 'http://s.lvmama.com/ticket/H9K310000P'
    suffix = '?keyword=%E6%A1%82%E6%9E%97&tabType=route350#list'
    return f"{prefix}{index}{suffix}"
def mappend(rows, cols, values, filename='excel_test.xls'):
    """Write one value into a cell of the first sheet of an existing .xls file.

    On every call the workbook is read from disk, copied into a writable
    workbook, updated, and saved back to the same path.

    Args:
        rows: Zero-based row index of the target cell.
        cols: Zero-based column index of the target cell.
        values: Value to store in the cell.
        filename: Path of the workbook to update. It is opened before it is
            written, so it must already exist. Defaults to 'excel_test.xls'.
    """
    # Open with formatting_info=True (presumably so the copy keeps the
    # existing cell formatting -- confirm against the xlrd documentation).
    source_book = xlrd.open_workbook(filename, formatting_info=True)
    # Convert the read-only xlrd workbook into a writable xlwt workbook.
    writable_book = copy(wb=source_book)
    sheet = writable_book.get_sheet(0)  # the sheet to modify
    print(rows, cols)
    # Cell indices are zero-based, so the row needs no +1 adjustment.
    sheet.write(rows, cols, values)
    writable_book.save(filename)
def helper(s):
    """Return *s* cleaned up for storage in a cell.

    Leading and trailing whitespace is trimmed first; then every remaining
    line feed, horizontal tab and carriage return is removed. Interior
    spaces are left untouched.
    """
    trimmed = s.strip()
    # Delete '\n', '\t' and '\r' in a single pass.
    return trimmed.translate(str.maketrans('', '', '\n\t\r'))
# Spreadsheet columns in order: (label, XPath relative to one product <div>).
# The position in this tuple is the column index written to the workbook.
_FIELD_XPATHS = (
    ("name", "./div[1]/div[2]/h3/a/@title"),
    ("level", "./div[1]/div[2]/h3/span[2]/text()"),
    ("address", "./div[1]/div[2]/dl[1]/dd/text()"),
    ("time", "./div[1]/div[2]/dl[2]/dd/div/text()"),
    ("topic", "./div[1]/div[2]/dl[3]/dd/text()"),
    ("introduce", "./div[1]/div[2]/dl[4]/dd/div/text()"),
    ("picture", "./div[1]/div[1]/a/img/@src"),
    ("score", "./div[1]/div[3]/ul/li[1]/b/text()"),
    ("price", "./div[1]/div[3]/div/em/text()"),
    ("detail", "./div[1]/div[1]/a/@href"),
)

# Order in which the raw XPath results are echoed to stdout for each product.
_PRINT_ORDER = (
    "name", "address", "level", "time", "topic",
    "introduce", "picture", "score", "price", "detail",
)


def get_info(url, i, page_size=8):
    """Scrape one search-results page and write each product to the workbook.

    The page at *url* is downloaded with a browser-like User-Agent, parsed
    with lxml, and every product <div> found is written as one spreadsheet
    row via ``mappend``: one cell per entry of ``_FIELD_XPATHS``. A field
    whose XPath matches nothing is written as an empty string; otherwise
    the first match is cleaned with ``helper`` and written.

    Args:
        url: Address of the results page to fetch.
        i: 1-based page number; page ``i`` starts at row
            ``(i - 1) * page_size``.
        page_size: Number of rows reserved per page. Defaults to 8.
            NOTE(review): this assumes a page holds at most ``page_size``
            products -- if it holds more, its extra rows overlap the next
            page's rows. Confirm against the site.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
    }
    # Build a customised request so the server sees a browser User-Agent.
    request = urllib.request.Request(url=url, headers=headers)
    # Read the body inside `with` so the connection is always closed.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')

    # Parse the downloaded HTML; xpath() returns a list of matches.
    tree = etree.HTML(content)
    products = tree.xpath("//div[@class='product-item product-ticket searchTicket clearfix']")

    first_row = (i - 1) * page_size
    for offset, div in enumerate(products):
        row = first_row + offset
        matches = {}
        for col, (label, path) in enumerate(_FIELD_XPATHS):
            found = div.xpath(path)
            matches[label] = found
            mappend(row, col, helper(found[0]) if found else "")
        # Echo the raw match lists once all cells of the row are written.
        for label in _PRINT_ORDER:
            print(matches[label])
if __name__ == '__main__':
    # Scrape result pages 1 through 9, one after another.
    for page in range(1, 10):
        get_info(create_url(page), page)