[Python] 解析Pcap三个Python库(Dpkt Scapy Pyshark)应用实例

说明

如果要处理pcap文件,python有仨库比较有名

  • scapy 速度慢 资料最多
  • pyshark 速度适中 需要系统安装了wireshark
  • dpkt 速度快 资料偏少 不够全

Dpkt输出每个包的链路层、IP层信息及传输层协议名

(如果有传输层的话)
包含以下信息。

时间戳 Timestamp
二层 MAC (source, destination)
IP(source, destination)
len
ttl
DF
MF
offset
protocol

# coding:utf-8
"""
逐个packet输出长度,info等信息
https://dpkt.readthedocs.io/en/latest/print_packets.html
"""
import dpkt
import datetime
import socket
from dpkt.compat import compat_ord
from dpkt.ip import get_ip_proto_name


def mac_addr(address):
    r"""Render a raw MAC address as colon-separated, two-digit lowercase hex.

       Args:
           address: raw MAC address to format (e.g. '\x01\x02\x03\x04\x05\x06');
               every element is passed through dpkt's compat_ord before formatting
       Returns:
           str: Printable/readable MAC address
    """
    octets = []
    for raw_byte in address:
        octets.append('%02x' % compat_ord(raw_byte))
    return ':'.join(octets)


def inet_to_str(inet):
    """Format a packed network address as a printable IP string.

        Args:
            inet (inet struct): packed inet network address
        Returns:
            str: Printable/readable IP address
    """
    # IPv4 is attempted first; a ValueError from that attempt triggers the
    # IPv6 interpretation instead.
    try:
        printable = socket.inet_ntop(socket.AF_INET, inet)
    except ValueError:
        printable = socket.inet_ntop(socket.AF_INET6, inet)
    return printable


def print_packets(pcap):
    """Print link-layer and IP header information about each packet in a pcap.

       For every packet the timestamp and Ethernet addresses/ethertype are
       printed. Frames whose payload is not a dpkt IPv4 packet are reported
       and skipped; for the rest the IP addresses, length, TTL, fragment
       flags/offset and the protocol name are printed.

       Args:
           pcap: dpkt pcap reader object (dpkt.pcap.Reader), iterated as
               (timestamp, buf) pairs
    """
    # For each packet in the pcap process the contents
    for timestamp, buf in pcap:

        # Print out the timestamp in UTC. datetime.utcfromtimestamp() is
        # deprecated since Python 3.12, so convert with an explicit UTC tzinfo
        # and then strip it so the printed text carries no '+00:00' suffix.
        utc_time = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
        print('Timestamp: ', str(utc_time.replace(tzinfo=None)))

        # Unpack the Ethernet frame (mac src/dst, ethertype)
        eth = dpkt.ethernet.Ethernet(buf)
        print('Ethernet Frame: ', mac_addr(eth.src), mac_addr(eth.dst), eth.type)

        # Make sure the Ethernet data contains an IP packet
        if not isinstance(eth.data, dpkt.ip.IP):
            print('Non IP Packet type not supported %s\n' % eth.data.__class__.__name__)
            continue

        # Now unpack the data within the Ethernet frame (the IP packet)
        # Pulling out src, dst, length, fragment info, TTL, and Protocol
        ip = eth.data

        # Pull out fragment information (flags and offset all packed into off field, so use bitmasks)
        do_not_fragment = bool(ip.off & dpkt.ip.IP_DF)
        more_fragments = bool(ip.off & dpkt.ip.IP_MF)
        fragment_offset = ip.off & dpkt.ip.IP_OFFMASK
        protocol = get_ip_proto_name(ip.p)
        # Print out the info
        print('IP: %s -> %s   (len=%d ttl=%d DF=%d MF=%d offset=%d protocol=%s)\n' % \
              (inet_to_str(ip.src), inet_to_str(ip.dst), ip.len, ip.ttl, do_not_fragment, more_fragments,
               fragment_offset, protocol))


if __name__ == '__main__':
    # The capture is opened in binary mode and handed to dpkt's pcap reader,
    # which print_packets() then iterates packet by packet.
    with open('pcap/2021_11_02_Idle.pcap', 'rb') as capture_file:
        print_packets(dpkt.pcap.Reader(capture_file))

Pyshark Or Wireshark

如果只是想要所有packet的[src_IP, dst_IP, src_MAC, dst_MAC, Protocol, TimeStamp, Info]等信息,可以直接打开Wireshark导出

  • 首先确定需要导出的Column名
  • 然后直接导出为CSV文件即可
    毕竟能不写代码肯定是好的!

这个操作也可以用Pyshark库完成。

import asyncio
import os
from tqdm import tqdm, trange
import pyshark
import pandas as pd

# Path of the capture file to convert.
# NOTE(review): nothing in the visible snippet passes this to pcap2csv() —
# confirm how the script is actually invoked.
filePath = 'pcap/nestcamPOWER_3.pcap'


def pcap2csv(filePath):
    """Export timestamp, protocol and length of every packet in a pcap to CSV.

    The capture is read through pyshark in summary mode, each summary line is
    split on single spaces, and three of the resulting fields are written to
    a CSV with the columns TimeStamp, Protocol and Length. The output path is
    derived from the input path by replacing "pcap/" with "result/" and then
    every remaining "pcap" with "csv".

    Args:
        filePath: path of the pcap file to convert.
    """
    print(filePath)
    # ProactorEventLoop only exists on Windows; fall back to the platform's
    # default event loop elsewhere so the function is not Windows-only.
    loop_factory = getattr(asyncio, "ProactorEventLoop", asyncio.new_event_loop)
    loop = loop_factory()
    asyncio.set_event_loop(loop)
    cap = pyshark.FileCapture(filePath, only_summaries=True, eventloop=loop)
    try:
        # Pre-loading is slow, comparable to Wireshark opening the same file.
        cap.load_packets()
        packetAmount = len(cap)
        data3 = []
        # Because everything was pre-loaded, this loop itself is fast.
        processBar = tqdm(cap, desc="Pcap Progress Bar ", total=packetAmount)
        for packet in processBar:
            pItem = str(packet).split(" ")
            # 1 - timestamp (s)
            # 4 - highest protocol
            # 5 - length
            # The three fields form ONE row, so they are appended as a list
            # (the original call was missing the opening bracket).
            data3.append([pItem[1], pItem[4], pItem[5]])
    finally:
        # Always release the capture, even if loading or parsing fails.
        cap.close()

    dataframe = pd.DataFrame(columns=["TimeStamp", "Protocol", "Length"], data=data3)
    # Name of the CSV file to write.
    csvName = filePath.replace("pcap/", "result/").replace("pcap", "csv")
    print(csvName)
    # Create the output directory if needed so to_csv() does not fail on a
    # missing "result/" folder.
    outDir = os.path.dirname(csvName)
    if outDir:
        os.makedirs(outDir, exist_ok=True)
    dataframe.to_csv(csvName, index=False, sep=',')

使用 PyShark 和 scapy 从 pcap 文件中读取字段并填充 CSV

Usage: pcap2csv --pcap <input pcap file> --csv <output csv file>

pcap 中的每个数据包都呈现到 csv 文件的一行中。要提取的特定项目以及它们在 csv 中的呈现顺序在脚本的“render_csv_row”函数中进行了硬编码。另请注意,csv 中的分隔符是“|”字符,而不是逗号。

此脚本同时使用 PyShark (https://kiminewt.github.io/pyshark/) 和 Scapy 来完成其工作。PyShark是因为我们希望利用tshark强大的协议解码能力来生成CSV的“文本描述”字段(如“标准查询0xf3de A www.cisco.com”,“Client Hello”等),而Scapy则因为同时我们希望访问数据包的“有效载荷”部分(PyShark似乎无法提供这一点)。

#!/usr/bin/env python3

"""pcap2csv
Script to extract specific pieces of information from a pcap file and
render into a csv file.

Usage: <program name> --pcap <input pcap file> --csv <output csv file>

Each packet in the pcap is rendered into one row of the csv file.
The specific items to extract, and the order in which they are rendered
in the csv are hard-coded in the script, in the 'render_csv_row' function.
Also note that the separators in the csv are '|' characters, not commas.

This script uses *both* PyShark (https://kiminewt.github.io/pyshark/) and
Scapy to do its work. PyShark because we want to leverage tshark's powerful
protocol decoding ability to generate the "textual description" field of
the CSV, and Scapy because at the same time we want to access the "payload"
portion of the packet (PyShark seems to be unable to provide this).
"""

import argparse
import os.path
import sys

import pyshark
from scapy.utils import RawPcapReader
from scapy.layers.l2 import Ether
from scapy.layers.inet import IP, UDP, TCP

#--------------------------------------------------

def render_csv_row(pkt_sh, pkt_sc, fh_csv):
    """Write one packet entry into the CSV file.

    Only Ethernet + IPv4 packets carrying UDP or TCP are written; anything
    else is reported on stdout and skipped.

    pkt_sh is the PyShark representation of the packet

    pkt_sc is a 'bytes' representation of the packet as returned from
    scapy's RawPcapReader

    fh_csv is the csv file handle

    Returns True when a row was written, False when the packet was ignored.
    """
    ether_pkt_sc = Ether(pkt_sc)
    # Frames that scapy dissects as 802.3 carry no 'type' field, so read it
    # defensively instead of letting attribute access raise. Also require an
    # actual IP layer so the lookup below cannot fail.
    if getattr(ether_pkt_sc, 'type', None) != 0x800 or not ether_pkt_sc.haslayer(IP):
        print('Ignoring non-IP packet')
        return False

    ip_pkt_sc = ether_pkt_sc[IP]       # <<<< Assuming Ethernet + IPv4 here
    proto = ip_pkt_sc.fields['proto']
    if proto == 17:
        l4_layer, l4_proto_name = UDP, 'UDP'
    elif proto == 6:
        l4_layer, l4_proto_name = TCP, 'TCP'
    else:
        # Currently not handling packets that are not UDP or TCP
        print('Ignoring non-UDP/TCP packet')
        return False

    # The IP header can announce UDP/TCP without scapy having dissected that
    # layer (e.g. a non-first IP fragment or a truncated packet); indexing
    # the layer would raise in that case, so skip the packet instead.
    if not ip_pkt_sc.haslayer(l4_layer):
        print('Ignoring {} packet without a decodable L4 header'.format(l4_proto_name))
        return False

    l4_pkt_sc = ip_pkt_sc[l4_layer]
    l4_payload_bytes = bytes(l4_pkt_sc.payload)
    l4_sport = l4_pkt_sc.sport
    l4_dport = l4_pkt_sc.dport

    # Each line of the CSV has this format
    fmt = '{0}|{1}|{2}({3})|{4}|{5}:{6}|{7}:{8}|{9}|{10}'
    #       |   |   |   |    |   |   |   |   |   |   |
    #       |   |   |   |    |   |   |   |   |   |   o-> {10} L4 payload hexdump
    #       |   |   |   |    |   |   |   |   |   o-----> {9}  total pkt length
    #       |   |   |   |    |   |   |   |   o---------> {8}  dst port
    #       |   |   |   |    |   |   |   o-------------> {7}  dst ip address
    #       |   |   |   |    |   |   o-----------------> {6}  src port
    #       |   |   |   |    |   o---------------------> {5}  src ip address
    #       |   |   |   |    o-------------------------> {4}  text description
    #       |   |   |   o------------------------------> {3}  L4 protocol
    #       |   |   o----------------------------------> {2}  highest protocol
    #       |   o--------------------------------------> {1}  time
    #       o------------------------------------------> {0}  frame number

    # Example:
    # 1|0.0|DNS(UDP)|Standard query 0xf3de A www.cisco.com|192.168.1.116:57922|1.1.1.1:53|73|f3de010000010000000000000377777705636973636f03636f6d0000010001

    print(fmt.format(pkt_sh.no,               # {0}
                     pkt_sh.time,             # {1}
                     pkt_sh.protocol,         # {2}
                     l4_proto_name,           # {3}
                     pkt_sh.info,             # {4}
                     pkt_sh.source,           # {5}
                     l4_sport,                # {6}
                     pkt_sh.destination,      # {7}
                     l4_dport,                # {8}
                     pkt_sh.length,           # {9}
                     l4_payload_bytes.hex()), # {10}
          file=fh_csv)

    return True
    #--------------------------------------------------

def pcap2csv(in_pcap, out_csv):
    """Main entry function called from main to process the pcap and
    generate the csv file.

    in_pcap = name of the input pcap file (guaranteed to exist)
    out_csv = name of the output csv file (will be created)

    This function walks over each packet in the pcap file, and for
    each packet invokes the render_csv_row() function to write one row
    of the csv. A summary of read/ignored packet counts is printed at
    the end.
    """

    # Open the pcap file with PyShark in "summary-only" mode, since this
    # is the mode where the brief textual description of the packet (e.g.
    # "Standard query 0xf3de A www.cisco.com", "Client Hello" etc.) are
    # made available.
    pcap_pyshark = pyshark.FileCapture(in_pcap, only_summaries=True)

    frame_num = 0
    ignored_packets = 0
    try:
        pcap_pyshark.load_packets()
        pcap_pyshark.reset()

        with open(out_csv, 'w') as fh_csv:
            # Open the pcap file with scapy's RawPcapReader, and iterate over
            # each packet. In each iteration get the PyShark packet as well,
            # and then call render_csv_row() with both representations to
            # generate the CSV row.
            for (pkt_scapy, _) in RawPcapReader(in_pcap):
                try:
                    pkt_pyshark = pcap_pyshark.next_packet()
                    frame_num += 1
                    if not render_csv_row(pkt_pyshark, pkt_scapy, fh_csv):
                        ignored_packets += 1
                except StopIteration:
                    # Shouldn't happen because the RawPcapReader iterator
                    # should also exit before this happens.
                    break
    finally:
        # Release the PyShark capture on every exit path; previously it was
        # never closed.
        pcap_pyshark.close()

    print('{} packets read, {} packets not written to CSV'.
          format(frame_num, ignored_packets))
#--------------------------------------------------

def command_line_args():
    """Parse the two mandatory command line options, --pcap and --csv.

    Returns the argparse namespace produced by parsing the process arguments.
    """
    # (flag, metavar shown in the usage text, help string)
    option_specs = (
        ('--pcap', '<input pcap file>', 'pcap file to parse'),
        ('--csv', '<output csv file>', 'csv file to create'),
    )

    parser = argparse.ArgumentParser()
    for flag, placeholder, description in option_specs:
        parser.add_argument(flag, metavar=placeholder,
                            help=description, required=True)
    return parser.parse_args()
#--------------------------------------------------

def main():
    """Validate the command line paths, then convert the pcap to csv.

    Exits with status -1 (after a message on stderr) when the input pcap is
    missing or when the output csv already exists.
    """
    args = command_line_args()
    pcap_path = args.pcap
    csv_path = args.csv

    # Guard 1: the input capture must be present.
    if not os.path.exists(pcap_path):
        problem = 'Input pcap file "{}" does not exist'.format(pcap_path)
        print(problem, file=sys.stderr)
        sys.exit(-1)

    # Guard 2: an existing output file is never overwritten.
    if os.path.exists(csv_path):
        problem = ('Output csv file "{}" already exists, '
                   'won\'t overwrite'.format(csv_path))
        print(problem, file=sys.stderr)
        sys.exit(-1)

    pcap2csv(pcap_path, csv_path)
#--------------------------------------------------

# Run main() only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
  • 11
    点赞
  • 73
    收藏
    觉得还不错? 一键收藏
  • 6
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值