# 2019数学建模 萧山机场航班数据爬取代码
#爬取萧山机场航班数据
import urllib.request
import re
import xlwt
# Module-level accumulators filled by getdata(); each receives one entry
# (the list of regex matches) per scraped page.
hangban_list, arrive_list, zong_list = [], [], []
# Request each arrivals page and extract the flight records from its HTML.
def getdata(pages=range(1, 41), timeout=30):
    """Scrape the airport arrivals listing and collect the matches per page.

    For every page number in ``pages`` the page HTML is downloaded and three
    regular expressions are applied to it.  The per-page match lists are
    appended to the module-level ``hangban_list``, ``arrive_list`` and
    ``zong_list`` accumulators.

    Args:
        pages: Iterable of page numbers to fetch.  Defaults to pages 1-40,
            the range that was previously hard-coded.
        timeout: Socket timeout in seconds for each request, so that a
            stalled connection raises instead of hanging indefinitely.

    Returns:
        The module-level ``zong_list``: one entry per fetched page, each
        entry being a list of 4-tuples of captured strings (one tuple per
        matched timetable item).  Because the list is module-level, results
        accumulate across repeated calls.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.
        UnicodeDecodeError: If a page is not valid UTF-8.
    """
    # Compile the patterns once; they do not depend on the page being parsed.
    hangban_re = re.compile(
        r'<div class="timetable_item clearfix">.*?<p>(.*?)</p>',
        re.S,
    )
    arrive_re = re.compile(
        r'<div class="timetable_item clearfix">.*?'
        r'<div class="time fl">(.*?)</div>.*?'
        r'<div class="end fr"><span>.*?</span></div>',
        re.S,
    )
    zong_re = re.compile(
        r'<div class="timetable_item clearfix">.*?<p>(.*?)</p>.*?'
        r'<div class="number fl">(.*?)</div>.*?'
        r'<div class="company fl">(.*?)</div>.*?'
        r'<div class="time fl">(.*?)</div>.*?'
        r'<div class="end fr"><span>.*?</span></div>',
        re.S,
    )

    for page in pages:
        url = 'http://www.hzairport.com/flight/arrive/p/{}.html'.format(page)
        # The context manager closes the HTTP response even if read/decode fails.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            html = response.read().decode('utf-8')

        # Record this page's matches in the shared accumulators.
        hangban_list.append(hangban_re.findall(html))
        arrive_list.append(arrive_re.findall(html))
        zong_list.append(zong_re.findall(html))
    return zong_list
#打印子信息集
# print(hangban_list)
# print(arrive_list)
# print(zong_list)
# print(len(hangban_list),len(arrive_list),len(zong_list))
# Save the scraped records to an Excel (.xls) workbook.
def excel_write(items, filename='test12020.xls'):
    """Write flight records to a single-sheet ``.xls`` workbook.

    Row 0 holds a bold header; every record then occupies one row, in the
    order the pages and records appear in ``items``.

    Args:
        items: Sequence of pages, each page being a sequence of records
            (the structure returned by ``getdata``).  Only as many leading
            fields as there are header columns are written; a shorter record
            simply leaves its trailing cells empty.
        filename: Path of the workbook to create.  Defaults to the name that
            was previously hard-coded.
    """
    head_data = ['航班号', '机型', '航空公司', '到达时间']
    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('fana')

    # Build the bold header style once instead of once per header cell.
    header_style = xlwt.easyxf('font:bold on')
    for col, title in enumerate(head_data):
        ws.write(0, col, title, header_style)

    row_index = 1  # row 0 is the header
    for page in items:
        for record in page:
            for col, value in enumerate(record[:len(head_data)]):
                ws.write(row_index, col, value)
            row_index += 1
    wb.save(filename)
# Script driver: scrape the arrival pages, then export the collected records.
# NOTE(review): these statements run at import time as well; consider an
# `if __name__ == "__main__":` guard if this module is ever imported.
items = getdata()
excel_write(items)