先登录这个网址获取航班列表
http://www.variflight.com/sitemap.html?AE71649A58c77=
然后随便点击一个航班,获取其 Cookie。由于这个网站封 IP 和验证码都比较厉害,记得
在通过验证以后,把 Request Header 的全部内容复制到代码中 requests 请求的
headers 里。如果有代理 IP,最好用代理 IP 获取数据;如果没有,可以用自己的手机热点代替:程序被中断时,先打开手机的飞行模式再关闭,然后让电脑重新连接热点,这样就切换到了一个新的 IP。如此重复,这个网站的数据就可以全部抓取下来了。
结果
程序里的 Redis 哈希有两个字段:一个用来存储已爬取到的航班编号,另一个用来存储该航班编号在数组中的位置,方便在程序中断后重新启动时,接着上一次的位置继续获取数据。
// A highlighted block
import requests
import json
from bs4 import BeautifulSoup as bs
import re
import threading
from queue import Queue
import collections
import redis
import os,signal
# Process id of this script, captured once so that worker threads can signal
# the whole process (see the os.kill(pid, ...) calls in myThread.mian).
pid =os.getpid ()
# NOTE: this rebinds the name `redis` from the imported module to a client
# instance; after this line the module itself is no longer reachable through
# that name. No host/port is given, so the library's defaults apply.
redis = redis.Redis (decode_responses=True, password="****")
# Pool of "ip:port" strings that init_queen() copies into buffer_keys.
ip_key = ['60.170.152.46:38888', '111.177.192.26:3256','125.122.52.15:8088','47.107.128.69:888','47.92.234.75:80']
# Site root, prepended to the relative hrefs found on the sitemap page.
url_base = 'http://www.variflight.com'
# Sitemap page that lists the per-flight pages.
url ='http://www.variflight.com/sitemap.html?AE71649A58c77='
# Print the id of the main process.
print ('主进程id :', os.getpid ())
# Number of myThread workers started by the __main__ block.
num_of_threads =10
# Bounded deque with exactly one slot per entry of ip_key.
buffer_keys = collections.deque(maxlen=len(ip_key))
# Request headers sent with the sitemap request (requests.get(url, headers=headers)).
# The values, including the Cookie, are hard-coded; per the notes at the top of
# this file they are copied from a browser session after passing the site's
# verification, so they presumably need refreshing when that session expires.
headers={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9 ',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
'Cache-Control': 'no-cache',
'Cookie': '''PHPSESSID=et83gh4r9gnrlollsl3lmnfh20; vaptchaNetway=1; ASPSESSIONIDCCQACTCD=OBFOOGLBHOCDNLIICCKGLFCP; ASPSESSIONIDSACTARSB=AKPPJPFCGIAHJENADBCOHAIA; ASPSESSIONIDQCDQATTD=IPJAKPFCMONGAIBDOKHDHLBL; ASPSESSIONIDACTBCTDC=JMNBKPFCAIGHEBEOAKDOLLBK; Hm_lvt_d1f759cd744b691c20c25f874cadc061=1625052094,1625121572; ASPSESSIONIDSCDQDRSD=CHDGGIADEIHELFNICNGMEACA; ASPSESSIONIDCCQDBTCD=GIDGGIADNEJCNPOLCLKPFHKD; ASPSESSIONIDQACSDSQB=OAEGGIADHBLFFBHJADECGNAP; ASPSESSIONIDQCBQCRTC=JNHDCBLDJAFKOPBGOOKALLAA; ASPSESSIONIDQACTDSRA=GHIDCBLDBOGLMFMIICEJKILD; ASPSESSIONIDAATBDSCD=JLIDCBLDGFPNOFEDEICNDGPG; ASPSESSIONIDQCCRDQTD=DPOIOJFACOFMLONNBKENPNGM; ASPSESSIONIDQCBTBQTB=ADOIOJFADCONDFDPHIGDCLIM; ASPSESSIONIDCARDCTDD=KOFKOJFALGIBFPJNKIMIJHKK; authCode=ffd66e69d01fee073453c62715cf0b07; fnumHistory=%5B%7B%22fnum%22%3A%22CZ3474%22%7D%2C%7B%22fnum%22%3A%22CA3954%22%7D%2C%7B%22fnum%22%3A%22CA3681%22%7D%2C%7B%22fnum%22%3A%22CA4432%22%7D%2C%7B%22fnum%22%3A%22CA3879%22%7D%2C%7B%22fnum%22%3A%22CA1101%22%7D%2C%7B%22fnum%22%3A%223U2011%22%7D%2C%7B%22fnum%22%3A%223U2013%22%7D%5D; vaptchaNetwayTime=1625395534387; salt=60e1914ed231e; Hm_lpvt_d1f759cd744b691c20c25f874cadc061=1625395544''',
'Host': 'www.variflight.com',
'Pragma': 'no-cache',
'Proxy-Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
}
# Request headers sent with every per-flight page request
# (requests.get(url_fr, headers=headers1) in myThread.mian). Unlike `headers`,
# this set also carries a Referer. The Cookie is a hard-coded browser session
# value; presumably it must be refreshed once the site invalidates it.
headers1={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Cookie': 'PHPSESSID=et83gh4r9gnrlollsl3lmnfh20; vaptchaNetway=1; ASPSESSIONIDCCQACTCD=OBFOOGLBHOCDNLIICCKGLFCP; ASPSESSIONIDSACTARSB=AKPPJPFCGIAHJENADBCOHAIA; ASPSESSIONIDQCDQATTD=IPJAKPFCMONGAIBDOKHDHLBL; ASPSESSIONIDACTBCTDC=JMNBKPFCAIGHEBEOAKDOLLBK; Hm_lvt_d1f759cd744b691c20c25f874cadc061=1625052094,1625121572; ASPSESSIONIDSCDQDRSD=CHDGGIADEIHELFNICNGMEACA; ASPSESSIONIDCCQDBTCD=GIDGGIADNEJCNPOLCLKPFHKD; ASPSESSIONIDQACSDSQB=OAEGGIADHBLFFBHJADECGNAP; ASPSESSIONIDQCBQCRTC=JNHDCBLDJAFKOPBGOOKALLAA; ASPSESSIONIDQACTDSRA=GHIDCBLDBOGLMFMIICEJKILD; ASPSESSIONIDAATBDSCD=JLIDCBLDGFPNOFEDEICNDGPG; ASPSESSIONIDQCCRDQTD=DPOIOJFACOFMLONNBKENPNGM; ASPSESSIONIDQCBTBQTB=ADOIOJFADCONDFDPHIGDCLIM; ASPSESSIONIDCARDCTDD=KOFKOJFALGIBFPJNKIMIJHKK; ASPSESSIONIDSCCQAQTB=DHIOJCABKBPLCHCIDJJCPBHP; ASPSESSIONIDSCBRCRSC=ALIOJCABFPFHBOOJBNKGGAIA; ASPSESSIONIDAARCDTDD=LDOOJCABBPFKMPCFGADDHOGK; ASPSESSIONIDQADSBQTA=CNABGLKBJGELBNHKAJGDCNAO; ASPSESSIONIDSCBQDRSD=KBCBGLKBHABKJDBNCJPMFIDC; ASPSESSIONIDCCQCASCD=DBCBGLKBCHNCEHNMEAPMOBLN; fnumHistory=%5B%7B%22fnum%22%3A%223U8837%22%7D%2C%7B%22fnum%22%3A%223U8513%22%7D%2C%7B%22fnum%22%3A%22CZ3937%22%7D%2C%7B%22fnum%22%3A%223U8758%22%7D%2C%7B%22fnum%22%3A%223U5103%22%7D%2C%7B%22fnum%22%3A%223U8411%22%7D%2C%7B%22fnum%22%3A%223U5082%22%7D%2C%7B%22fnum%22%3A%223U5048%22%7D%5D; vaptchaSpareCh=1; salt=60e40d13446a7; midsalt=60e40d13634b9; authCode=18373f4fddf15b5400dbb02ec7ecef6b; vaptchaNetwayTime=1625560654940; Hm_lpvt_d1f759cd744b691c20c25f874cadc061=1625560658',
'Host': 'www.variflight.com',
'Pragma': 'no-cache',
'Proxy-Connection': 'keep-alive',
'Referer': 'http://www.variflight.com/flight/fnum/CA4432.html?AE71649A58c77&fdate=20210703',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
# Download the sitemap page and collect the absolute URL of every link found
# inside the element whose class is "list".
r = requests.get(url, headers=headers)
soup = bs(r.text, 'lxml')
list_container = soup.find(class_='list')
if list_container is None:
    # find() returns None when nothing matches. That happens when the server
    # answers with something other than the sitemap (for example a
    # verification or block page), so fail with a message that says so
    # instead of an opaque AttributeError on None.
    raise RuntimeError(
        'sitemap page has no element with class "list" '
        '(HTTP status {}); the cookie in `headers` may have expired or the '
        'IP may be blocked'.format(r.status_code)
    )
list_a = list_container.find_all('a')
# hrefs on the page are site-relative, so prefix them with the site root.
list_url_fnum = [url_base + a.attrs['href'] for a in list_a]
print(list_url_fnum)
def init_queen():
    """Fill ``buffer_keys`` with every entry of ``ip_key`` and print the result.

    ``buffer_keys`` is a deque bounded to ``len(ip_key)``, so after this call
    it holds exactly the entries of ``ip_key`` in their original order.
    """
    buffer_keys.extend(ip_key)
    print('当前可供使用的高德密钥:', buffer_keys)
# Maps a flight number to the position of its URL within list_url_fnum[1:].
# Filled (and returned) by get_index(); entries are never removed.
airnumdict = {}


def get_index(urls=None):
    """Build the flight-number -> position lookup and return it.

    The first URL is skipped; every following URL is matched against the
    per-flight page pattern and the captured flight number is stored in the
    module-level ``airnumdict`` together with its position in the sliced
    list (so the second URL has position 0).

    Args:
        urls: Sequence of URL strings to index. Defaults to the module-level
            ``list_url_fnum`` so existing zero-argument callers are unaffected.

    Returns:
        The shared ``airnumdict`` dictionary (the same object on every call).
    """
    if urls is None:
        urls = list_url_fnum
    # Compile once instead of once per URL.
    flight_pattern = re.compile('http://www.variflight.com/flight/fnum/(.*?).html.*', re.S)
    for position, flight_url in enumerate(urls[1:]):
        matches = flight_pattern.findall(str(flight_url))
        if not matches:
            # Not a per-flight page: skip it rather than crash on matches[0].
            # Positions still count skipped entries, so indices stay aligned
            # with the URL list.
            continue
        airnumdict[matches[0]] = position
    return airnumdict
def load_data_from_dict(o, *keys):
    """Look up a value by following ``keys`` through nested dictionaries.

    Args:
        o: The object to start from, normally a dict.
        *keys: Keys to follow, outermost first.

    Returns:
        The value stored under the last key, or None when no keys are given,
        when any object reached along the way is falsy, or when the object
        reached for the last key is not a dict. A non-dict object met before
        the last key is carried forward unchanged (which then yields None).
    """
    if not keys:
        return None
    *parents, leaf = keys
    current = o
    for name in parents:
        if not current:
            return None
        if isinstance(current, dict):
            current = current.get(name)
    if not current or not isinstance(current, dict):
        return None
    return current.get(leaf)
def get_proxy(max_attempts=None):
    """Fetch a proxy from the pool service and return it once it is verified.

    A candidate is requested from the pool endpoint and then checked by
    calling http://httpbin.org/ip through it: the candidate is accepted when
    the first address reported in ``origin`` equals the candidate's own IP.
    Rejected or failing candidates are discarded and a new one is requested.

    The retry is a loop rather than recursion, so a long run of bad proxies
    can no longer exhaust the interpreter's recursion limit.

    Args:
        max_attempts: Maximum number of candidates to try. ``None`` (the
            default) retries until one works, as before.

    Returns:
        The verified proxy as an "ip:port" string.

    Raises:
        RuntimeError: If ``max_attempts`` candidates were tried without
            success.
    """
    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        # Ask the proxy pool for a candidate.
        pool_response = requests.get("http://*****/get")
        proxy = load_data_from_dict(json.loads(pool_response.text), "proxy")
        print(proxy)
        if not isinstance(proxy, str) or not proxy:
            # The pool returned no usable candidate; ask again.
            continue
        try:
            check = requests.get(
                'http://httpbin.org/ip',
                proxies={'http': 'http://' + proxy, 'https': 'https://' + proxy},
                timeout=5,
            )
            origin = load_data_from_dict(json.loads(check.text), 'origin')
        except (requests.RequestException, ValueError):
            # Connection problem, timeout, or a non-JSON reply: treat the
            # candidate as dead and try the next one.
            continue
        if isinstance(origin, str) and origin.split(',')[0] == proxy.split(':')[0]:
            print(proxy)
            return proxy
    raise RuntimeError(
        'no working proxy found after {} attempts'.format(max_attempts)
    )
class myThread (threading.Thread):
    """Worker thread that takes flight codes from a shared queue and scrapes them.

    Each queue item is a list whose first element is a flight number. For each
    item the worker downloads the flight page, extracts the schedule fields
    with regular expressions and appends them as CSV rows.

    Progress is kept in the Redis hash "flight:num1": field 1 holds the flight
    number last handled, field 2 holds the list position of the flight being
    fetched.
    """

    # Patterns applied to the downloaded flight page, compiled once.
    _DPLAN_RE = re.compile('<span class="w150" dplan=\\"(.*?)\\">', re.S)
    _APLAN_RE = re.compile('<span aplan=\\"(.*?)\\" class="w150">', re.S)
    _AREA_RE = re.compile('<span class="w150">(.*?)</span>', re.S)
    _FLIGHT_RE = re.compile('http://www.variflight.com/flight/fnum/(.*?).html.*', re.S)
    _BLOCKED_RE = re.compile('<html><body><p>{"msg":\\"(.*?)\\"}</p></body></html>', re.S)

    def __init__(self, threadID, city_queue, proxy):
        """Store the worker id, the shared work queue and the proxy label."""
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.city_queue = city_queue
        self.proxy = proxy
        # Gate used by pause()/restart(); "set" means the worker may proceed.
        self.singal = threading.Event()
        self.singal.set()

    def run(self):
        """Process queue items until the queue is empty."""
        from queue import Empty

        while True:
            # Block here while the worker is paused.
            self.singal.wait()
            try:
                # get_nowait() avoids the empty()/get() race in which another
                # worker takes the last item and this one blocks forever.
                code = self.city_queue.get_nowait()
            except Empty:
                break
            print(code)
            # Use this worker's own proxy, not a module-level name.
            self.mian(code, self.proxy)

    def _append_log(self, text):
        """Write ``text`` to ``self.log_ctrl`` when such a control is attached."""
        log_ctrl = getattr(self, 'log_ctrl', None)
        if log_ctrl is not None:
            log_ctrl.AppendText(text)

    def pause(self):
        """Make the worker stop before taking its next queue item."""
        self._append_log("pause\n")
        self.singal.clear()

    def restart(self):
        """Let a paused worker continue."""
        self._append_log("continues\n")
        self.singal.set()

    def write_fun(self, line):
        """Append ``line`` to the CSV file named after the module-level ``fdata``."""
        with open('飞行{}.csv'.format(fdata), 'a') as f:
            f.write(line)

    def get_location(self, address, i):
        """Geocode ``address`` with the Amap REST API.

        Args:
            address: The address to look up; converted with ``str()``.
            i: Unused; kept so existing callers keep working.

        Returns:
            The ``location`` value of the first geocode result, or None when
            the response contains no geocodes.
        """
        # Fixed part of the API URL (everything before the question mark).
        url = 'https://restapi.amap.com/v3/geocode/geo'
        params = {'key': '***',
                  'address': str(address)}
        res = requests.get(url, params)
        # The response body is JSON; parse it into a dict.
        jd = json.loads(res.text)
        geocodes = load_data_from_dict(jd, 'geocodes')
        if not geocodes:
            return None
        return load_data_from_dict(geocodes[0], 'location')

    def _abort_blocked(self, code, proxy, reason, note=''):
        """Record the current flight in Redis, then terminate the whole process."""
        print(str(proxy) + " " + str(self.threadID) + " " + reason + note + '密钥已经用尽,程序退出...')
        redis.hset("flight:num1", 1, code[0])
        os.kill(pid, signal.SIGHUP)
        exit(0)

    def mian(self, code, proxy):
        """Fetch one flight page and append its schedule rows to the CSV file.

        The fetch is attempted in up to six passes. A pass does nothing unless
        Redis field 1 holds a flight number and the position of ``code[0]`` is
        greater than the position of that stored flight.

        Args:
            code: List whose first element is the flight number.
            proxy: Label of the proxy in use; only used in log output because
                the request itself is sent without a ``proxies`` argument.
        """
        print("*" * 100)
        # get_index() returns the same shared dict on every call, so one
        # lookup table serves all passes.
        index_map = get_index()
        for index in range(6):
            flight = redis.hget("flight:num1", 1)
            if not flight:
                continue
            h = index_map[flight]
            i = index_map[code[0]]
            if i <= h:
                continue
            url_fr = 'http://www.variflight.com/flight/fnum/{0}.html?AE71649A58c77&fdate={1}'.format(code[0], fdata)
            r1 = requests.get(url_fr, headers=headers1)
            soup = bs(r1.text, 'lxml')
            dplandata = []
            try:
                page = str(soup)
                dplandata = self._DPLAN_RE.findall(page)
                aplandata = self._APLAN_RE.findall(page)
                araedata = self._AREA_RE.findall(page)
                flightdata = self._FLIGHT_RE.findall(str(url_fr))
                badipdata = self._BLOCKED_RE.findall(page)
                redis.hset("flight:num1", 2, i)
                if badipdata and badipdata[0] == "IP blocked":
                    print('无效的密钥!!!!!!!!!!!!!,重新切换密钥进行爬取')
                    try:
                        self._abort_blocked(code, proxy, badipdata[0])
                    except Exception:
                        self._abort_blocked(code, proxy, badipdata[0], '异常')
                if len(dplandata) > 1:
                    # Each row uses two consecutive "w150" spans, and the
                    # spans of consecutive rows start three entries apart.
                    j = 0
                    for row, value in enumerate(dplandata):
                        line = ','.join([
                            str(flightdata[0]), str(dplandata[row]), str(aplandata[row]),
                            str(araedata[j]), str(araedata[j + 1]),
                        ]) + '\n'
                        j = (row + 1) * 3
                        print(str(h) + " " + str(row) + " " + line)
                        self.write_fun(line)
                elif len(dplandata) == 1:
                    line = ','.join([
                        str(flightdata[0]), str(dplandata[0]), str(aplandata[0]),
                        str(araedata[0]), str(araedata[1]),
                    ]) + '\n'
                    print(str(h) + " " + str(i) + " " + line)
                    self.write_fun(line)
                else:
                    print(str(h) + " " + str(i) + " " + str(flightdata) + "无数据: " + str(dplandata) + " " + str(
                        badipdata))
                # Mark this flight as handled so the remaining passes skip it
                # instead of refetching the page and appending duplicate rows.
                # NOTE(review): the original source lost its indentation; this
                # statement is assumed to apply to every outcome - confirm.
                redis.hset("flight:num1", 1, code[0])
            except Exception:
                # Exception (not a bare except) so that the SystemExit raised
                # by exit(0) is not swallowed here.
                print("异常: " + str(dplandata))
if __name__ == '__main__':
    # Flight date used in the page URL and in the CSV file name.
    fdata = 20210708
    init_queen()
    get_index()
    # The proxy lookup (get_proxy()) is disabled and requests are sent
    # directly. `proxy` must still exist because it is handed to every worker
    # and concatenated into log lines, so use an empty label.
    proxy = ''
    city_queue = Queue()
    # Field 2 holds the list position saved by a previous run. It is missing
    # on the very first run, in which case start from the beginning.
    num = redis.hget("flight:num1", 2)
    start = int(num) if num is not None else 0
    flight_pattern = re.compile('http://www.variflight.com/flight/fnum/(.*?).html.*', re.S)
    for flight_url in list_url_fnum[start:]:
        flightdata = flight_pattern.findall(str(flight_url))
        if flightdata:
            # Queue items are the findall() result lists; workers read item[0].
            city_queue.put(flightdata)
    threads = [myThread(n, city_queue, proxy) for n in range(num_of_threads)]
    for worker in threads:
        worker.start()