variflight 多线程爬虫获取所有航班信息,绕过封锁ip

先登录这个网址获取航班列表
http://www.variflight.com/sitemap.html?AE71649A58c77=
在这里插入图片描述
然后随便点击一个航班,获取其 Cookie。由于这个网站封 IP、验证码都比较厉害,记得先完成验证,
再把 Request Headers 的全部内容放到代码中 requests 请求的
headers 里。如果有代理 IP,最好用代理 IP 获取数据;如果没有,可以用自己的手机热点代替:程序被中断后,先打开手机的飞行模式再关闭,然后让电脑重新连接热点,这样就切换到了一个新的 IP。重复这个过程,这个网站的数据就可以随意拿下来了。
在这里插入图片描述
结果
在这里插入图片描述
程序里的 redis 保存了两个值:一个用来存储已爬取到的航班编号,另一个用来存储该航班编号在数组中的位置,方便程序中断后重新启动时接着上一次的位置继续获取数据。

// A highlighted block
import requests
import json
from bs4 import BeautifulSoup as bs
import re
import threading
from queue import Queue
import collections
import redis
import os,signal
# Id of this process; worker threads use it to signal the whole program.
pid = os.getpid()
# NOTE: this rebinds the name `redis` from the imported module to a client
# instance; the rest of the file calls `redis.hget` / `redis.hset` on it.
redis = redis.Redis(password="****", decode_responses=True)


# Candidate proxy endpoints in "host:port" form.
ip_key = [
    '60.170.152.46:38888',
    '111.177.192.26:3256',
    '125.122.52.15:8088',
    '47.107.128.69:888',
    '47.92.234.75:80',
]
# Site root, prepended to the relative links found in the sitemap.
url_base = 'http://www.variflight.com'
# Sitemap page that lists every flight number.
url = 'http://www.variflight.com/sitemap.html?AE71649A58c77='
print('主进程id :', pid)

# Number of worker threads started by the script.
num_of_threads = 10


# Bounded buffer with room for exactly one entry per configured endpoint.
buffer_keys = collections.deque(maxlen=len(ip_key))

# Request headers sent with the sitemap request (see the module-level
# `requests.get(url, headers=headers)` call). They were copied from a browser
# session; the Cookie value is session-specific and presumably has to be
# refreshed after passing the site's captcha -- TODO confirm.
headers={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9  ',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
'Cache-Control': 'no-cache',
'Cookie': '''PHPSESSID=et83gh4r9gnrlollsl3lmnfh20; vaptchaNetway=1; ASPSESSIONIDCCQACTCD=OBFOOGLBHOCDNLIICCKGLFCP; ASPSESSIONIDSACTARSB=AKPPJPFCGIAHJENADBCOHAIA; ASPSESSIONIDQCDQATTD=IPJAKPFCMONGAIBDOKHDHLBL; ASPSESSIONIDACTBCTDC=JMNBKPFCAIGHEBEOAKDOLLBK; Hm_lvt_d1f759cd744b691c20c25f874cadc061=1625052094,1625121572; ASPSESSIONIDSCDQDRSD=CHDGGIADEIHELFNICNGMEACA; ASPSESSIONIDCCQDBTCD=GIDGGIADNEJCNPOLCLKPFHKD; ASPSESSIONIDQACSDSQB=OAEGGIADHBLFFBHJADECGNAP; ASPSESSIONIDQCBQCRTC=JNHDCBLDJAFKOPBGOOKALLAA; ASPSESSIONIDQACTDSRA=GHIDCBLDBOGLMFMIICEJKILD; ASPSESSIONIDAATBDSCD=JLIDCBLDGFPNOFEDEICNDGPG; ASPSESSIONIDQCCRDQTD=DPOIOJFACOFMLONNBKENPNGM; ASPSESSIONIDQCBTBQTB=ADOIOJFADCONDFDPHIGDCLIM; ASPSESSIONIDCARDCTDD=KOFKOJFALGIBFPJNKIMIJHKK; authCode=ffd66e69d01fee073453c62715cf0b07; fnumHistory=%5B%7B%22fnum%22%3A%22CZ3474%22%7D%2C%7B%22fnum%22%3A%22CA3954%22%7D%2C%7B%22fnum%22%3A%22CA3681%22%7D%2C%7B%22fnum%22%3A%22CA4432%22%7D%2C%7B%22fnum%22%3A%22CA3879%22%7D%2C%7B%22fnum%22%3A%22CA1101%22%7D%2C%7B%22fnum%22%3A%223U2011%22%7D%2C%7B%22fnum%22%3A%223U2013%22%7D%5D; vaptchaNetwayTime=1625395534387; salt=60e1914ed231e; Hm_lpvt_d1f759cd744b691c20c25f874cadc061=1625395544''',
'Host': 'www.variflight.com',
'Pragma': 'no-cache',
'Proxy-Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
}


# Request headers sent with every per-flight page request (used by
# `myThread.mian`). Compared with `headers` they carry a different Cookie and
# User-Agent and add a Referer pointing at a flight detail page.
headers1={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Cookie': 'PHPSESSID=et83gh4r9gnrlollsl3lmnfh20; vaptchaNetway=1; ASPSESSIONIDCCQACTCD=OBFOOGLBHOCDNLIICCKGLFCP; ASPSESSIONIDSACTARSB=AKPPJPFCGIAHJENADBCOHAIA; ASPSESSIONIDQCDQATTD=IPJAKPFCMONGAIBDOKHDHLBL; ASPSESSIONIDACTBCTDC=JMNBKPFCAIGHEBEOAKDOLLBK; Hm_lvt_d1f759cd744b691c20c25f874cadc061=1625052094,1625121572; ASPSESSIONIDSCDQDRSD=CHDGGIADEIHELFNICNGMEACA; ASPSESSIONIDCCQDBTCD=GIDGGIADNEJCNPOLCLKPFHKD; ASPSESSIONIDQACSDSQB=OAEGGIADHBLFFBHJADECGNAP; ASPSESSIONIDQCBQCRTC=JNHDCBLDJAFKOPBGOOKALLAA; ASPSESSIONIDQACTDSRA=GHIDCBLDBOGLMFMIICEJKILD; ASPSESSIONIDAATBDSCD=JLIDCBLDGFPNOFEDEICNDGPG; ASPSESSIONIDQCCRDQTD=DPOIOJFACOFMLONNBKENPNGM; ASPSESSIONIDQCBTBQTB=ADOIOJFADCONDFDPHIGDCLIM; ASPSESSIONIDCARDCTDD=KOFKOJFALGIBFPJNKIMIJHKK; ASPSESSIONIDSCCQAQTB=DHIOJCABKBPLCHCIDJJCPBHP; ASPSESSIONIDSCBRCRSC=ALIOJCABFPFHBOOJBNKGGAIA; ASPSESSIONIDAARCDTDD=LDOOJCABBPFKMPCFGADDHOGK; ASPSESSIONIDQADSBQTA=CNABGLKBJGELBNHKAJGDCNAO; ASPSESSIONIDSCBQDRSD=KBCBGLKBHABKJDBNCJPMFIDC; ASPSESSIONIDCCQCASCD=DBCBGLKBCHNCEHNMEAPMOBLN; fnumHistory=%5B%7B%22fnum%22%3A%223U8837%22%7D%2C%7B%22fnum%22%3A%223U8513%22%7D%2C%7B%22fnum%22%3A%22CZ3937%22%7D%2C%7B%22fnum%22%3A%223U8758%22%7D%2C%7B%22fnum%22%3A%223U5103%22%7D%2C%7B%22fnum%22%3A%223U8411%22%7D%2C%7B%22fnum%22%3A%223U5082%22%7D%2C%7B%22fnum%22%3A%223U5048%22%7D%5D; vaptchaSpareCh=1; salt=60e40d13446a7; midsalt=60e40d13634b9; authCode=18373f4fddf15b5400dbb02ec7ecef6b; vaptchaNetwayTime=1625560654940; Hm_lpvt_d1f759cd744b691c20c25f874cadc061=1625560658',
'Host': 'www.variflight.com',
'Pragma': 'no-cache',
'Proxy-Connection': 'keep-alive',
'Referer': 'http://www.variflight.com/flight/fnum/CA4432.html?AE71649A58c77&fdate=20210703',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

# Download the sitemap and collect the absolute URL of every listed flight page.
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, headers=headers, timeout=30)
soup = bs(r.text, 'lxml')
_list_container = soup.find(class_='list')
if _list_container is None:
    # Without the class="list" element there is nothing to parse; fail with a
    # clear message instead of an AttributeError on None.
    raise RuntimeError(
        'Sitemap page has no element with class "list" (HTTP status {}); '
        'the request was probably blocked or the Cookie has expired.'.format(r.status_code)
    )
# href=True skips anchors that carry no link target.
list_a = _list_container.find_all('a', href=True)
list_url_fnum = [url_base + a['href'] for a in list_a]
print(list_url_fnum)

def init_queen():
    """Copy every entry of ``ip_key`` into ``buffer_keys`` and print the buffer."""
    # extend() appends the items one by one, in order, exactly like an
    # explicit append loop would.
    buffer_keys.extend(ip_key)
    print('当前可供使用的高德密钥:', buffer_keys)

# Maps flight number -> position of its URL within list_url_fnum[1:].
airnumdict = {}
# Extracts the flight number from a flight-page URL; compiled once.
_FLIGHT_URL_RE = re.compile(r'http://www\.variflight\.com/flight/fnum/(.*?)\.html', re.S)


def get_index():
    """Fill and return ``airnumdict`` from the sitemap URLs.

    Every URL in ``list_url_fnum`` except the first is inspected. URLs that
    are not flight pages are skipped, so a stray link no longer raises
    ``IndexError``; the position counter still advances for them.

    Returns:
        dict: the module-level ``airnumdict`` (flight number -> position).
    """
    for i, url_fr in enumerate(list_url_fnum[1:]):
        match = _FLIGHT_URL_RE.search(str(url_fr))
        if match is None:
            continue
        airnumdict[match.group(1)] = i
    return airnumdict

def load_data_from_dict(o, *keys):
    """Walk nested dicts along ``keys`` and return the value found at the end.

    Returns ``None`` when no keys are given, when any level on the path is
    falsy or not a dict, or when the final key is absent.
    """
    if not keys:
        return None
    current = o
    for key in keys:
        # Each level must be a non-empty dict to be able to descend further.
        if not current or not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def get_proxy():
    """Return a proxy address ("host:port") that verifiably masks our IP.

    A candidate is requested from the proxy-pool service, then checked by
    fetching http://httpbin.org/ip through it: the candidate is accepted only
    when the reported origin equals the proxy host. Rejected or unreachable
    candidates are discarded and a new one is requested.

    The retry is a loop rather than recursion, so a long run of bad proxies
    cannot exhaust the call stack, and only request/parse errors are treated
    as "bad proxy" (Ctrl-C is no longer swallowed).

    Returns:
        str: the first candidate that passes the check.

    Raises:
        requests.RequestException: if the proxy-pool service itself cannot be
            reached.
        ValueError: if the proxy-pool response is not valid JSON.
    """
    check_url = 'http://httpbin.org/ip'
    while True:
        # Ask the pool for a candidate proxy.
        a = requests.get("http://*****/get")
        proxy = load_data_from_dict(json.loads(a.text), "proxy")
        print(proxy)
        if not proxy:
            # The pool returned no usable address; ask again.
            continue
        try:
            r1 = requests.get(
                check_url,
                proxies={'http': 'http://' + proxy, 'https': 'https://' + proxy},
                timeout=5,
            )
            origin = load_data_from_dict(json.loads(r1.text), 'origin')
        except (requests.RequestException, ValueError):
            # Dead proxy or a non-JSON answer: try the next candidate.
            continue
        # httpbin may report several comma-separated addresses; compare the
        # first one with the proxy host.
        if isinstance(origin, str) and origin.split(',')[0] == proxy.split(':')[0]:
            print(proxy)
            return proxy



class myThread(threading.Thread):
    """Worker thread that scrapes flight pages for codes taken from a shared queue.

    Each queue item is the list produced by the flight-number regex, so the
    flight number itself is ``code[0]``. Progress is recorded in the Redis
    hash ``flight:num1``: field ``1`` holds the last handled flight number and
    field ``2`` the index of the flight currently being handled.

    NOTE(review): the checkpoint is a single value shared by all workers, so
    with several threads finishing out of order a flight can be skipped; this
    design is kept as it was.
    """

    # Patterns applied to the fetched flight page; compiled once per class.
    _DPLAN_RE = re.compile('<span class="w150" dplan=\\"(.*?)\\">', re.S)
    _APLAN_RE = re.compile('<span aplan=\\"(.*?)\\" class="w150">', re.S)
    _ARAE_RE = re.compile('<span class="w150">(.*?)</span>', re.S)
    _FLIGHTNUM_RE = re.compile('http://www.variflight.com/flight/fnum/(.*?).html.*', re.S)
    _BADIP_RE = re.compile('<html><body><p>{"msg":\\"(.*?)\\"}</p></body></html>', re.S)

    def __init__(self, threadID, city_queue, proxy):
        """Store the worker id, the shared work queue and the proxy label."""
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.city_queue = city_queue
        self.proxy = proxy
        # Pause flag: while set the worker runs, while cleared it waits.
        self.singal = threading.Event()
        self.singal.set()

    def run(self):
        """Process queue items until the queue is empty."""
        from queue import Empty

        while True:
            # Honour pause()/restart() between two work items.
            self.singal.wait()
            try:
                # get_nowait() avoids blocking forever when another worker
                # takes the last item between an empty() check and get().
                code = self.city_queue.get_nowait()
            except Empty:
                break
            print(code)
            # Use this worker's own proxy rather than a module-level global.
            self.mian(code, self.proxy)

    def _append_log(self, text):
        """Send ``text`` to ``self.log_ctrl`` if one was attached, else to stdout."""
        log_ctrl = getattr(self, 'log_ctrl', None)
        if log_ctrl is not None:
            log_ctrl.AppendText(text)
        else:
            print(text, end='')

    def pause(self):
        """Make the worker wait before taking its next queue item."""
        self._append_log("pause\n")
        self.singal.clear()

    def restart(self):
        """Let a paused worker continue."""
        self._append_log("continues\n")
        self.singal.set()

    def write_fun(self, line):
        """Append ``line`` to the CSV file named after the global ``fdata`` date."""
        # The with-block closes the file; no explicit close() is needed.
        with open('飞行{}.csv'.format(fdata), 'a') as f:
            f.write(line)

    def get_location(self, address, i):
        """Geocode ``address`` through the AMap REST API.

        Args:
            address: value to look up; converted with ``str()``.
            i: unused, kept for interface compatibility.

        Returns:
            The ``location`` field of the first geocode result, or ``None``
            when the response holds no results.
        """
        # Fixed part of the API URL (everything before the question mark).
        url = 'https://restapi.amap.com/v3/geocode/geo'
        params = {'key': '***',
                  'address': str(address)}
        res = requests.get(url, params)

        # The response body is JSON; decode it into a dict.
        jd = json.loads(res.text)
        geocodes = load_data_from_dict(jd, 'geocodes')
        if not geocodes:
            return None
        return load_data_from_dict(geocodes[0], 'location')

    def _abort_blocked(self, code, proxy, message):
        """Record the checkpoint and stop the whole program after an IP block."""
        print('无效的密钥!!!!!!!!!!!!!,重新切换密钥进行爬取')
        # str() keeps the report working when no proxy (None) is configured.
        print(str(proxy) + " " + str(self.threadID) + " " + message + '密钥已经用尽,程序退出...')
        # NOTE(review): this stores the blocked flight itself as the last
        # handled one, as the original did -- confirm that it is not skipped
        # when the program is resumed.
        redis.hset("flight:num1", 1, code[0])
        # SIGHUP does not exist on Windows; fall back to SIGTERM there.
        os.kill(pid, getattr(signal, 'SIGHUP', signal.SIGTERM))
        raise SystemExit(0)

    def mian(self, code, proxy):
        """Fetch one flight page, append its rows to the CSV and update the checkpoint.

        Args:
            code: regex result list whose first item is the flight number.
            proxy: proxy label, only used in the "IP blocked" report.

        The page is tried up to six times. A flight whose index is not beyond
        the Redis checkpoint is skipped; with no checkpoint stored, every
        flight counts as pending. When the site answers "IP blocked" the
        checkpoint is saved and the whole process is terminated.
        """
        print("*" * 100)
        # Build the flight -> index map once instead of on every attempt.
        index_by_flight = get_index()
        i = index_by_flight.get(code[0])
        if i is None:
            print(str(code[0]) + " is not in the flight index, skipping")
            return

        for _ in range(6):
            flight = redis.hget("flight:num1", 1)
            # A missing or unknown checkpoint means nothing has been handled yet.
            h = index_by_flight.get(flight, -1) if flight else -1
            if i <= h:
                continue

            dplandata = []
            try:
                url_fr = 'http://www.variflight.com/flight/fnum/{0}.html?AE71649A58c77&fdate={1}'.format(code[0], fdata)
                r1 = requests.get(url_fr, headers=headers1, timeout=30)
                page = str(bs(r1.text, 'lxml'))

                dplandata = self._DPLAN_RE.findall(page)
                aplandata = self._APLAN_RE.findall(page)
                araedata = self._ARAE_RE.findall(page)
                flightdata = self._FLIGHTNUM_RE.findall(str(url_fr))
                badipdata = self._BADIP_RE.findall(page)
                redis.hset("flight:num1", 2, i)

                if badipdata and badipdata[0] == "IP blocked":
                    self._abort_blocked(code, proxy, badipdata[0])

                if dplandata:
                    for row in range(len(dplandata)):
                        # Each row consumes three class="w150" spans, of which
                        # the first two are written out.
                        j = row * 3
                        line = str(flightdata[0]) + ',' + str(dplandata[row]) + ',' + str(aplandata[row]) + ',' + str(
                            araedata[j]) + ',' + str(araedata[j + 1]) + '\n'
                        print(str(h) + " " + str(i) + " " + line)
                        self.write_fun(line)
                else:
                    print(str(h) + " " + str(i) + " " + str(flightdata) + "无数据: " + str(dplandata) + " " + str(
                        badipdata))
                redis.hset("flight:num1", 1, code[0])
            except Exception as exc:
                # Network or parse failure: report it and use the next attempt.
                print("异常: " + str(dplandata) + " " + repr(exc))
            else:
                # Done. Returning here prevents a second fetch (and duplicate
                # CSV rows) if another thread moves the checkpoint meanwhile.
                return






if __name__ == '__main__':
    # Flight date (YYYYMMDD) used in the request URL and the CSV file name.
    fdata = 20210708
    init_queen()
    get_index()
    # No proxy is fetched here (requests go out directly), but the name must
    # exist: it is handed to every worker and printed when the IP is blocked.
    proxy = ''
    city_queue = Queue()
    # Resume from the position saved in Redis; start at 0 when nothing is saved.
    num = redis.hget("flight:num1", 2)
    start = int(num) if num else 0
    flightnum = re.compile('http://www.variflight.com/flight/fnum/(.*?).html.*', re.S)
    for url_fr in list_url_fnum[start:]:
        flightdata = flightnum.findall(str(url_fr))
        # Only flight-page URLs yield a match; everything else is ignored.
        if len(flightdata) > 0:
            city_queue.put(flightdata)
    threads = [myThread(i, city_queue, proxy) for i in range(num_of_threads)]
    for worker in threads:
        worker.start()







  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值