On Problem C of the 2019 CUMCM (National Mathematical Modeling Contest)
I have been meaning to write something for a while, and I finally found time to summarize the data scraping we did for this year's national contest.
During the contest we first looked at the baffling Problem B, stared at it in complete confusion, and decisively switched problems.
Then we read Problem C and figured the data for the airport taxi problem might be relatively easy to find, so we started hunting for leads.
We decided to first check whether we could get taxi counts (competition among taxis affects waiting time) and flight counts at the airport (which drive passenger numbers). On Zhihu we found a site that posts real-time counts of taxis entering and leaving Zhengzhou airport, though it only clicked for me the next day, when I actually started scraping the taxi data.
Now, back to the point.
Scraping the taxi data
This one was simple to implement, because the page contains only a few lines of text and essentially no structure.
Since the site shows the data for the past half hour, I set the scraper to run once every half hour, and then I just let the program run for a day…
#!/usr/bin/env python
# encoding: utf-8
"""
@version:
@author: Zengjc
@site: http://
@file: taxi.py
@time: 2019/9/14 16:44
"""
import requests
import time
from pyquery import PyQuery as pq
headers = {
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}
url = 'http://www.whalebj.com/xzjc/default.aspx'
while True:
    now = time.localtime()
    # fire on the hour and on the half hour (minute 0 or 30, second 0)
    if now.tm_min in (0, 30) and now.tm_sec == 0:
        print(now.tm_hour, now.tm_min, now.tm_sec)
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        html = response.text
        doc = pq(html)
        box = doc.find('.content_Case')  # the span that holds the taxi counts
        string = str(box.text()).replace('\n', '')
        with open('taxi.txt', 'a') as f:  # append one line per half-hour sample
            f.write(string)
            f.write('\n')
        print(string)
        time.sleep(61)  # step past this minute so the same sample isn't taken twice
    else:
        time.sleep(0.5)  # idle between checks
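One thing I would add in hindsight: the loop above dies on the first network hiccup, which is fatal for a run that is supposed to last a whole day. A minimal retry wrapper, as a sketch (the retry count and timeout are my own choices, not something from the original script; it reuses the requests and time imports above):

def fetch(url, headers, retries=3):
    # try a few times before giving up on this half-hour sample
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.encoding = 'utf-8'
            return response.text
        except requests.RequestException as e:
            print('fetch failed (%d/%d): %s' % (attempt + 1, retries, e))
            time.sleep(5)
    return None  # caller should skip this sample

Swapping the bare requests.get call in the loop for fetch(url, headers), and skipping the write when it returns None, keeps one bad half hour from ending the whole day's collection.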
Scraping the flight counts
Next up: the number of flights at the airport.
I searched for quite a while and found no official channel for flight counts, so I had to fall back on scraping Ctrip.
Imports and headers:
#!/usr/bin/env python
# encoding: utf-8
"""
@version:
@author: Zengjc
@site: http://
@file: airplane.py
@time: 2019/9/14 9:49
"""
import requests
import re
from pyquery import PyQuery as pq
from collections import Counter
# these headers were auto-generated by an online tool, to get past basic anti-scraping checks
headers = {
    'authority': 'flights.ctrip.com',
    'cache-control': 'max-age=0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'sec-fetch-site': 'none',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': '_abtest_userid=0edabc4e-ec22-49c7-a2c2-64cef28c4110; _RF1=106.224.215.171; _RSG=N4aI.8vitx41WkP5W8rKwB; _RDG=2821b70647ca8f2b322ee8343f1c812626; _RGUID=4ef7b392-3e4a-4117-be94-d4880566e801; MKT_Pagesource=PC; gad_city=5f904035b521a7120222cd876d0ea3c5; _bfa=1.1562835846177.790rz.1.1562835846177.1568424671723.2.8; _bfs=1.7; _jzqco=^%^7C^%^7C^%^7C^%^7C^%^7C1.1672377718.1568424673914.1568424736730.1568424744000.1568424736730.1568424744000.0.0.0.11.11; __zpspc=9.1.1568424673.1568424744.11^%^234^%^7C^%^7C^%^7C^%^7C^%^7C^%^23; appFloatCnt=10',
}
The first page
The schedule page lists many destinations, and the detailed data sits behind a link for each one. On this page we grab the detail-page links for every flight touching Zhengzhou over the next few days and write them to a file; that is the writeURL function.
# collect the detail-page links
def writeURL(url, writetxt):
    print('writeURL start')
    response = requests.get(url, headers=headers)
    response.encoding = 'gbk'
    html = response.text
    doc = pq(html)
    box = doc.find('.m a')  # the anchors that carry the detail links
    # string surgery: turn every <a href="link">route</a> into a 'link~route' line
    a = str(box).replace('<a href="', '').replace('</a>', '').replace('">', '~').replace(' ', '')
    with open(writetxt, 'w', encoding='utf-8') as f:
        f.write(str(a).strip())
    print('writeURL end')
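As the replace chain shows, each line of url.txt ends up in the form link~route, which is exactly what writeTime splits on below. The string surgery works, but it breaks as soon as Ctrip adds an attribute to those anchors. A sturdier sketch of the same extraction using pyquery's own accessors (same '.m a' selector, and it assumes the requests/pq imports and headers dict defined above):

def writeURL2(url, writetxt):
    response = requests.get(url, headers=headers)
    response.encoding = 'gbk'
    doc = pq(response.text)
    with open(writetxt, 'w', encoding='utf-8') as f:
        for a in doc.find('.m a').items():  # iterate the anchors one by one
            f.write('%s~%s\n' % (a.attr('href'), a.text()))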
The second page
Because I handled departures and arrivals separately, the code is a bit messy; bear with me.
# scrape the flight details
def writeTime(readtxt):
    readfile = open(readtxt, 'r', encoding='utf-8')  # the file of detail links
    writeDepartFile = open('DepartTime.txt', 'w', encoding='gbk')  # departure times
    writeArriveFile = open('ArriveTime.txt', 'w', encoding='gbk')  # arrival times
    print('writeTime start')
    for line in readfile:
        line = line.strip().split('~')
        if line[0] != '':
            # line[0] is the link, line[1] is the route (origin and destination)
            response = requests.get(line[0], headers=headers)
            response.encoding = 'utf-8'
            html = response.text
            doc = pq(html)
            box = doc.find('#flt1')
            depart = box.find('.depart .time').text()  # departure times
            arrive = box.find('.arrive .time').text()  # landing times
            depart = depart.split(' ')
            arrive = arrive.split(' ')
            for i in range(len(depart)):
                print(line[1], depart[i], arrive[i])
                string = str(line[1]) + ' ' + str(depart[i]) + ' ' + str(arrive[i])
                if line[1].find('郑州') == 0:  # route starts with Zhengzhou: a departure
                    t = depart[i].split(':')
                    # append the hourly bin the departure falls into (起飞时间所属区间)
                    string = string + ' 起飞时间所属区间:' + str(int(t[0]) % 24) + '-' + str((int(t[0]) + 1) % 24)
                    writeDepartFile.write(string)  # write to the departures file
                    writeDepartFile.write('\n')
                else:  # otherwise the flight lands in Zhengzhou: an arrival
                    t = arrive[i].split(':')
                    # append the hourly bin the landing falls into (落地时间所属区间)
                    string = string + ' 落地时间所属区间:' + str(int(t[0]) % 24) + '-' + str((int(t[0]) + 1) % 24)
                    writeArriveFile.write(string)
                    writeArriveFile.write('\n')
                print(string)
    readfile.close()
    writeDepartFile.close()
    writeArriveFile.close()
    print('writeTime end')
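For reference, each line that lands in DepartTime.txt should come out shaped roughly like 郑州深圳 14:05 16:40 起飞时间所属区间:14-15 (illustrative, not real data): the route text is whatever the schedule-page anchor said, followed by the two times and the hourly bin tag.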
Then comes the overall driver:
if __name__ == '__main__':
    URL = 'https://flights.ctrip.com/schedule/cgo..html'  # the schedule page to scrape
    txt = 'url.txt'
    # writeURL(URL, txt)  # step 1: collect the detail-page links
    # writeTime(txt)      # step 2: scrape the flight details
    mofify('DepartTime.txt')  # process the Zhengzhou departure data
    mofify('ArriveTime.txt')  # process the Zhengzhou arrival data
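One gap in the listing: mofify (presumably a typo for modify) is called here but never defined in the post. Judging from the otherwise unused Counter import and the 所属区间 tag that writeTime appends to every line, it most likely tallies flights per hourly bin. A guess at what it might have lookedked like, with those assumptions made explicit:

def mofify(readtxt):
    # my reconstruction, not the author's code: count flights per hourly bin,
    # keying on the '区间:' tag that writeTime writes at the end of each line
    counter = Counter()
    with open(readtxt, 'r', encoding='gbk') as f:
        for line in f:
            if '区间:' in line:
                counter[line.strip().split('区间:')[-1]] += 1
    # print the bins in hour order, e.g. '7-8 12' meaning 12 flights between 7 and 8
    for interval, count in sorted(counter.items(), key=lambda kv: int(kv[0].split('-')[0])):
        print(interval, count)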
Summary
Scraping the data this time turned out to be fairly easy; the real difficulty of this problem was running the simulation. I don't think our team handled that part well: we crammed a bit of the simulation software in a hurry, threw a simulation together, and submitted it.