Python 爬取航班信息

接上篇:
Python爬虫练习

这里做了简单的优化,网址什么的老规矩隐藏掉。
目前不是完全体。

缺少部分:

  • 异常场景处理
  • 该网站做了反爬处理,需要使用代理池,这个后续有时间再搞吧
import time
import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import pytesseract
import logging
import xlwt

from flyCodeList import flyCodes
from flyInfo import flyInfo


class flyspider:
    """Scraper for a flight-status website.

    Fetches one page per flight number, parses it with BeautifulSoup,
    decodes the image-rendered timetable cells with pytesseract OCR, and
    exports the collected rows to an Excel workbook via xlwt.

    Note: the target URLs and Host/Referer headers are redacted ('...')
    in the published source, so the script is not runnable as-is.
    """

    # Excel column titles, in output order (runtime strings kept verbatim).
    _COLUMN_TITLES = ('序号', '航空公司', '航班号', '日期', '计划起飞', '实际起飞',
                      '出发地', '计划到达', '实际到达', '到达地', '准点率', '是否为空')

    def __init__(self):
        """Prepare the browser-like request headers shared by all requests."""
        # The original also built an Accept-Encoding value but never sent it;
        # that unused local is dropped (requests sets its own Accept-Encoding).
        self.headers = {
            'Accept': 'text/html;application/xhtml+xml;application/xml;q=0.9;image/avif;image/webp;image/apng;*/*;q=0.8;application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN;zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': '...',
            'Referer': '...',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
        }
        # Site cookies captured from the page request (see spiderRun) and
        # replayed on the image downloads.
        self.cookies = ''

    def ocrWebImg(self, url):
        """Download the image at *url* and return its OCR'd text.

        Returns an empty string if the download or the OCR step fails
        (best-effort: the failure is logged, never raised).
        """
        ocr_text = ''
        img_resp = None  # fixed: was unbound in 'finally' when the GET raised
        try:
            img_resp = requests.get(url + '/get', headers=self.headers,
                                    cookies=self.cookies)
            image = Image.open(BytesIO(img_resp.content))
            ocr_text = pytesseract.image_to_string(image)
        except Exception as e:  # narrowed from BaseException
            logging.info(e)
        finally:
            if img_resp is not None:
                img_resp.close()
        return ocr_text

    def spiderRun(self, flyCode, flyDate):
        """Scrape one flight page and return a populated flyInfo record.

        flyCode: flight number string, e.g. 'KN5977'.
        flyDate: date string appended to the query URL.

        On a non-200 response the returned flyInfo carries only the
        flight code and date (other fields keep their defaults).
        """
        # URLs are redacted ('...') in the published source.
        url = '...' + flyCode + '.html?AE71649A58c77&fdate=' + flyDate
        base_url = "..."

        r_get = requests.get(url + '/get', headers=self.headers)
        # Keep the site cookies for the follow-up image downloads.
        self.cookies = r_get.cookies

        flyInfoTmp = flyInfo()
        flyInfoTmp.fly_code = flyCode
        flyInfoTmp.fly_date = flyDate

        if r_get.status_code != 200:
            print(r_get.status_code)
            r_get.close()  # fixed: original leaked the connection here
            return flyInfoTmp

        try:
            r_get.encoding = 'utf-8'
            soup = BeautifulSoup(r_get.text, 'html.parser')
            # A 'p.t' element marks an error/empty page; skip parsing then.
            if soup.find('p', class_='t') is None:
                # One div per flight row in the result list.
                items = soup.find_all('div', class_="li_com")
                if items:
                    flyInfoTmp.is_empty = 'N'
                    # NOTE(review): fields are overwritten on every iteration,
                    # so only the last row survives. This matches the original
                    # code but looks unintended for multi-row results; confirm.
                    for item in items:
                        imgs = item.find_all('img')
                        spans = item.find_all('span')  # hoisted: was re-queried per field
                        # Airline name is stored in the logo's 'align' attribute.
                        flyInfoTmp.fly_company = imgs[0].attrs['align']
                        # Scheduled departure time.
                        flyInfoTmp.plan_start_time = spans[1].text.strip()
                        # Actual departure time (rendered as an image -> OCR).
                        flyInfoTmp.real_start_time_ocr = self.ocrWebImg(base_url + imgs[1]['src'])
                        # Departure airport.
                        flyInfoTmp.start_address = spans[3].text.strip()
                        # Scheduled arrival time.
                        flyInfoTmp.plan_end_time = spans[4].text.strip()
                        # Actual arrival time (image -> OCR).
                        flyInfoTmp.real_end_time_ocr = self.ocrWebImg(base_url + imgs[2]['src'])
                        # Arrival airport.
                        flyInfoTmp.end_address = spans[6].text.strip()
                        # On-time performance (image -> OCR).
                        flyInfoTmp.time_performance = self.ocrWebImg(base_url + imgs[3]['src'])
        finally:
            r_get.close()
        return flyInfoTmp

    def _write_row(self, sheet, row, res):
        """Write one flyInfo record into worksheet row *row*, '' for None."""
        values = (str(row), res.fly_company, res.fly_code, res.fly_date,
                  res.plan_start_time, res.real_start_time_ocr,
                  res.start_address, res.plan_end_time,
                  res.real_end_time_ocr, res.end_address,
                  res.time_performance, res.is_empty)
        for col, value in enumerate(values):
            sheet.write(row, col, value if value is not None else '')

    def start(self, datastr):
        """Entry point: scrape flight data for *datastr* and export to Excel.

        datastr: date string for the query. When it is None or the flight
        list is unavailable/empty, nothing is scraped or written (matches
        the original behavior).
        """
        flyCodeList = flyCodes.getflyList()
        # Default to today's date; overridden by datastr below.
        flyDate = time.strftime('%Y-%m-%d', time.localtime(time.time()))

        if datastr is not None and flyCodeList:
            flyDate = datastr

            # Build the workbook and write the header row.
            wb = xlwt.Workbook()
            wbSheet = wb.add_sheet(flyDate)
            for col, title in enumerate(self._COLUMN_TITLES):
                wbSheet.write(0, col, title)
            wb.save('D://FLYTMP.xls')

            # TODO: iterate flyCodeList once the site's anti-scraping is
            # handled (proxy pool); a single hard-coded flight for now.
            res = self.spiderRun('KN5977', flyDate)
            print(res)
            self._write_row(wbSheet, 1, res)
            wb.save('D://FLYTMP.xls')



if __name__ == "__main__":
    flyutil = flyspider()
    flyutil.start('20201014')


  • 1
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
要使用 Python 简单爬取航班信息,可以使用第三方库 BeautifulSoup 和 requests 来实现。首先安装这两个库:

```
pip install beautifulsoup4
pip install requests
```

接下来,用 requests 发送 HTTP GET 请求获取航班信息网页的 HTML 内容:

```python
import requests

# 发送GET请求获取航班信息网页的HTML内容
response = requests.get("https://example.com/flight_information")
# 获取HTML内容
html_content = response.text
```

然后,用 BeautifulSoup 解析 HTML 内容,从中提取想要的航班信息。可以使用 find 或 find_all 方法查找特定的 HTML 元素。例如,若航班信息位于 `<table>` 标签内的 `<tr>`/`<td>` 中,可以这样提取:

```python
from bs4 import BeautifulSoup

# 创建BeautifulSoup对象
soup = BeautifulSoup(html_content, "html.parser")

# 查找包含航班信息的<table>标签
flight_table = soup.find("table")

# 遍历<table>标签中的每个<tr>标签
for row in flight_table.find_all("tr"):
    # 查找<tr>标签中的每个<td>标签
    columns = row.find_all("td")
    # 提取航班信息,例如第一列为航班号
    flight_number = columns[0].text
```

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值