hsbench: add Hyperscan benchmarker

The hsbench tool provides an easy way to measure Hyperscan's
performance for a particular set of patterns and corpus of data
to be scanned.
Matthew Barr
2016-12-14 15:26:01 +11:00
parent 06cde4c94d
commit f626276271
26 changed files with 3145 additions and 1 deletion


@@ -0,0 +1,58 @@
#!/usr/bin/python

'''
A module to construct corpora databases for the Hyperscan benchmarker
(hsbench).

After construction, simply add blocks with the add_chunk() method, then call
finish() when you're done.
'''

import os.path

try:
    from sqlite3 import dbapi2 as sqlite
except ImportError:
    from pysqlite2 import dbapi2 as sqlite

class CorpusBuilder:
    SCHEMA = '''
    CREATE TABLE chunk (
        id integer primary key,
        stream_id integer not null,
        data blob
    );
    '''

    def __init__(self, outfile):
        if os.path.exists(outfile):
            raise RuntimeError("Database '%s' already exists" % outfile)
        self.outfile = outfile
        self.db = sqlite.connect(self.outfile)
        self.db.executescript(CorpusBuilder.SCHEMA)
        self.current_chunk_id = 0

    def add_chunk(self, stream_id, data):
        chunk_id = self.current_chunk_id
        c = self.db.cursor()
        q = 'insert into chunk (id, stream_id, data) values (?, ?, ?)'
        c.execute(q, (chunk_id, stream_id, sqlite.Binary(data)))
        self.current_chunk_id += 1
        return chunk_id

    def finish(self):
        self.db.commit()
        c = self.db.cursor()
        q = 'create index chunk_stream_id_idx on chunk(stream_id)'
        c.execute(q)
        c = self.db.cursor()
        q = 'vacuum'
        c.execute(q)
        c = self.db.cursor()
        q = 'analyze'
        c.execute(q)
        self.db.commit()
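
For illustration, a minimal sketch of how this builder is meant to be used, following the docstring above; the database path and chunk contents here are made up:

from CorpusBuilder import CorpusBuilder

builder = CorpusBuilder('example.db')    # hypothetical output path
builder.add_chunk(0, 'first block of stream 0')
builder.add_chunk(0, 'second block of stream 0')
builder.add_chunk(1, 'a block in a different stream')
builder.finish()    # commits, builds the stream_id index, vacuums and analyzes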


@@ -0,0 +1,68 @@
#!/usr/bin/python

'''
This script creates a Hyperscan benchmarking corpus database from a supplied
group of Project Gutenberg texts.
'''

import sys, getopt, os.path
import gutenberg.acquire, gutenberg.cleanup, gutenberg.query
from CorpusBuilder import CorpusBuilder

stream_id = 0
stream_bytes = 0

def addBlocks(builder, block_size, stream_size, text_id, text):
    global stream_id
    global stream_bytes
    print "text", text_id, "len", len(text)
    i = 0
    while i < len(text):
        chunk = text[i:min(len(text), i + block_size)]
        builder.add_chunk(stream_id, chunk)
        i += block_size
        stream_bytes += len(chunk)
        if stream_bytes >= stream_size:
            stream_id += 1
            stream_bytes = 0
    print "Text", text_id, ": added", i/block_size, "blocks of", block_size, "bytes."

def buildCorpus(outFN, block_size, stream_size, text_ids):
    if len(text_ids) == 0:
        print >> sys.stderr, "Must provide at least one input ID"
        sys.exit(-1)

    builder = CorpusBuilder(outFN)
    total_bytes = 0

    # Reset the module-level stream counters used by addBlocks().
    global stream_id, stream_bytes
    stream_id = 0
    stream_bytes = 0

    for text_id in text_ids:
        text_id = int(text_id)
        text = gutenberg.acquire.load_etext(text_id)
        text = gutenberg.cleanup.strip_headers(text).strip()
        addBlocks(builder, block_size, stream_size, text_id, text)
        total_bytes += len(text)

    builder.finish()
    print "Total:", total_bytes, "bytes."

def usage(exeName):
    errmsg = "Usage: %s -o <output file> -b <block size> -s <max stream size> <gutenberg text id>..."
    errmsg = errmsg % exeName
    print >> sys.stderr, errmsg
    sys.exit(-1)

if __name__ == '__main__':
    opts, args = getopt.getopt(sys.argv[1:], 'o:b:s:')
    opts = dict(opts)
    requiredKeys = [ '-o', '-b', '-s' ]
    for k in requiredKeys:
        if not opts.has_key(k):
            usage(os.path.basename(sys.argv[0]))
    buildCorpus(opts['-o'], int(opts['-b']), int(opts['-s']), args)
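
To make the block/stream policy in addBlocks() concrete, here is a small self-contained sketch with made-up sizes; it mirrors the loop above without the gutenberg dependency:

text = "x" * 10000    # a hypothetical 10,000-byte text
block_size = 3000
stream_size = 6000

stream_id = 0
stream_bytes = 0
blocks = []
i = 0
while i < len(text):
    chunk = text[i:i + block_size]
    blocks.append((stream_id, len(chunk)))
    i += block_size
    stream_bytes += len(chunk)
    if stream_bytes >= stream_size:
        stream_id += 1
        stream_bytes = 0

print blocks    # [(0, 3000), (0, 3000), (1, 3000), (1, 1000)]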


@@ -0,0 +1,53 @@
#!/usr/bin/python

'''
Simple script to take a file full of lines of text and push them into a
Hyperscan benchmarking corpus database, one block per line.
'''

import sys, getopt, os.path
from CorpusBuilder import CorpusBuilder

def lineCorpus(inFN, outFN):
    '''
    Read lines from file name @inFN and write them as blocks to a new db with
    name @outFN.
    '''
    if not os.path.exists(inFN):
        print >> sys.stderr, "Input file '%s' does not exist. Exiting." % inFN
        sys.exit(-1)

    lines = open(inFN).readlines()
    if len(lines) == 0:
        print >> sys.stderr, "Input file contained no lines. Exiting."
        sys.exit(0)

    builder = CorpusBuilder(outFN)

    # write a single stream to contain everything
    streamId = 0
    for l in lines:
        builder.add_chunk(streamId, l.rstrip())

    builder.finish()

def usage(exeName):
    errmsg = "Usage: %s -i <input file> -o <output file>"
    errmsg = errmsg % exeName
    print >> sys.stderr, errmsg
    sys.exit(-1)

if __name__ == '__main__':
    args = getopt.getopt(sys.argv[1:], 'i:o:c:')
    args = dict(args[0])
    requiredKeys = [ '-i', '-o' ]
    for k in requiredKeys:
        if not args.has_key(k):
            usage(os.path.basename(sys.argv[0]))
    fnArgs = tuple([args[k] for k in requiredKeys])
    lineCorpus(*fnArgs)


@@ -0,0 +1,301 @@
#!/usr/bin/env python

'''
Script to convert a pcap file containing UDP and TCP packets to a corpus file.
'''

import sys, getopt, pprint, os
from sqlite3 import dbapi2 as sqlite
import pcap
from optparse import OptionParser
from socket import AF_INET, IPPROTO_UDP, IPPROTO_TCP, inet_ntop, ntohs, ntohl, inet_ntoa
import struct
from CorpusBuilder import CorpusBuilder

ETHERTYPE_IP = 0x0800       # IP protocol
ETHERTYPE_ARP = 0x0806      # Addr. resolution protocol
ETHERTYPE_REVARP = 0x8035   # reverse Addr. resolution protocol
ETHERTYPE_VLAN = 0x8100     # IEEE 802.1Q VLAN tagging
ETHERTYPE_IPV6 = 0x86dd     # IPv6

#
# A dictionary of active TCP streams
#
tcp_streams = {}

#
# A dictionary of UDP streams
#
udp_streams = {}

#
# Current stream id
#
cur_stream_id = 0

def usage(exeName):
    errmsg = "Usage: %s -i <pcap-file> -o <sqlite-file>"
    errmsg = errmsg % exeName
    print >> sys.stderr, errmsg
    sys.exit(-1)

class FiveTuple(object):
    def __init__(self, protocol, src_addr, src_port, dst_addr, dst_port):
        self.protocol = protocol
        self.src_addr = src_addr
        self.src_port = src_port
        self.dst_addr = dst_addr
        self.dst_port = dst_port

    def __str__(self):
        return "%d,%s,%d,%s,%d" % (self.protocol, self.src_addr, self.src_port, self.dst_addr, self.dst_port)

class UdpSegment:
    """Definition of a UDP segment.
    """
    def __init__(self, five_tuple, header, payload):
        self.five_tuple = five_tuple
        self.udp_header = header
        self.udp_payload = payload

class TcpSegment:
    """Definition of a TCP segment.
    """
    def __init__(self, five_tuple, header, payload):
        self.five_tuple = five_tuple
        self.tcp_header = header
        self.tcp_payload = payload
        self.tcp_sequence_number, self.tcp_acknowledgement_number = struct.unpack('!LL', header[4:12])

    def opt_isset_FIN(self):
        opts = ord(self.tcp_header[13]) & 0x3F
        return (opts & 0x01)

    def opt_isset_SYN(self):
        opts = ord(self.tcp_header[13]) & 0x3F
        return (opts & 0x02)

    def get_sequence_number(self):
        return self.tcp_sequence_number

    def __cmp__(self, other):
        return cmp(self.tcp_sequence_number, other.tcp_sequence_number)

class TcpStream:
    """Definition of a TCP stream.
    """
    TCP_STREAM_ACTIVE = 0x1
    TCP_STREAM_CLOSED = 0x02

    def __init__(self, five_tuple):
        self.five_tuple = five_tuple
        self.initial_sequence_number = 0
        self.segments = []

    def reset_stream(self):
        self.segments = []
        self.initial_sequence_number = 0

    def set_initial_sequence_number(self, sequence_number):
        self.initial_sequence_number = sequence_number

    def append_segment(self, tcp_segment):
        if len(self.segments) == 0:
            self.set_initial_sequence_number(tcp_segment.get_sequence_number())
        self.segments.append(tcp_segment)

    def get_segments_sorted(self):
        return sorted(self.segments)

class UdpStream:
    """A container for UDP packets that share the same 5-tuple.
    """
    def __init__(self, five_tuple):
        self.five_tuple = five_tuple
        self.segments = []

    def append_segment(self, udp_segment):
        self.segments.append(udp_segment)

def newStream(five_tuple):
    '''
    Create a new stream for the 5-tuple passed in and return its ID.
    '''
    global cur_stream_id
    stream_id = cur_stream_id
    cur_stream_id += 1
    return stream_id
def process_tcp_segment(builder, segment):
    """Process a TCP segment: check for the SYN and FIN flags and, if they
    are set, modify the associated stream.
    """
    segment_id = str(segment.five_tuple)
    if segment_id in tcp_streams:
        m_tcp_stream = tcp_streams[segment_id]
        m_tcp_stream.append_segment(segment)
    else:
        m_tcp_stream = TcpStream(segment.five_tuple)
        m_tcp_stream.append_segment(segment)
        tcp_streams[segment_id] = m_tcp_stream

    if segment.opt_isset_SYN():
        m_tcp_stream.segments = []

    if segment.opt_isset_FIN():
        #
        # Finished with the stream - add the segments in the
        # stream to the db, allowing the stream to be reused.
        #
        db_add_tcp_stream_segments(builder, m_tcp_stream)
        del tcp_streams[segment_id]

def process_udp_segment(builder, segment):
    """Process a UDP segment. Given the connectionless nature of the UDP
    protocol, we simply accumulate the segment for later processing
    when all the packets have been read.
    """
    segment_id = str(segment.five_tuple)
    if segment_id in udp_streams:
        m_udp_stream = udp_streams[segment_id]
        m_udp_stream.append_segment(segment)
    else:
        m_udp_stream = UdpStream(segment.five_tuple)
        m_udp_stream.append_segment(segment)
        udp_streams[segment_id] = m_udp_stream

def db_add_tcp_stream_segments(builder, tcp_stream):
    """Add the contents of a TCP stream to the database.
    """
    tcp_segments = tcp_stream.get_segments_sorted()
    last_sequence_num = 0
    streamID = None

    for tcp_segment in tcp_segments:
        if (len(tcp_segment.tcp_payload) > 0) and (tcp_segment.tcp_sequence_number > last_sequence_num):
            #
            # Segment with an actual payload - add it to the stream's
            # list of chunks.
            #
            # Note: delay creating the stream until we have a viable chunk to
            # commit to it.
            #
            if streamID is None:
                streamID = newStream(tcp_stream.five_tuple)
            builder.add_chunk(streamID, tcp_segment.tcp_payload)
            last_sequence_num = tcp_segment.tcp_sequence_number

def db_add_udp_stream_segments(builder, udp_stream):
    """Add the contents of a UDP stream to the database. Since UDP is
    connectionless, a UDP stream object is really just an accumulation
    of all the packets associated with a given 5-tuple.
    """
    udp_segments = udp_stream.segments
    streamID = None
    for udp_segment in udp_segments:
        if len(udp_segment.udp_payload) > 0:
            if streamID is None:
                streamID = newStream(udp_stream.five_tuple)
            builder.add_chunk(streamID, udp_segment.udp_payload)
def enchunk_pcap(pcapFN, sqliteFN):
    """Read the contents of a pcap file with name @pcapFN and produce
    a sqlite db with name @sqliteFN. It will contain chunks of data
    from TCP and UDP streams.
    """
    if not os.path.exists(pcapFN):
        print >> sys.stderr, "Input file '%s' does not exist. Exiting." % pcapFN
        sys.exit(-1)

    builder = CorpusBuilder(sqliteFN)

    #
    # Read in the contents of the pcap file, adding stream segments as found
    #
    pkt_cnt = 0
    ip_pkt_cnt = 0
    unsupported_ip_protocol_cnt = 0

    pcap_ref = pcap.pcap(pcapFN)
    done = False
    while not done:
        try:
            ts, packet = pcap_ref.next()
        except:
            # end of the capture (or a read error): stop reading packets
            break

        pkt_cnt += 1

        linkLayerType = struct.unpack('!H', packet[(pcap_ref.dloff - 2):pcap_ref.dloff])[0]
        if linkLayerType != ETHERTYPE_IP:
            #
            # We're only interested in IP packets
            #
            continue

        ip_pkt_cnt += 1

        ip_pkt_total_len = struct.unpack('!H', packet[pcap_ref.dloff + 2:pcap_ref.dloff + 4])[0]
        ip_pkt = packet[pcap_ref.dloff:pcap_ref.dloff + ip_pkt_total_len]

        pkt_protocol = struct.unpack('B', ip_pkt[9])[0]
        if (pkt_protocol != IPPROTO_UDP) and (pkt_protocol != IPPROTO_TCP):
            #
            # We're only interested in UDP and TCP packets at the moment
            #
            continue

        pkt_src_addr = inet_ntoa(ip_pkt[12:16])
        pkt_dst_addr = inet_ntoa(ip_pkt[16:20])

        ip_hdr_len_offset = (ord(ip_pkt[0]) & 0x0f) * 4
        ip_payload = ip_pkt[ip_hdr_len_offset:len(ip_pkt)]

        pkt_src_port, pkt_dst_port = struct.unpack('!HH', ip_payload[0:4])

        five_tuple = FiveTuple(pkt_protocol, pkt_src_addr, pkt_src_port, pkt_dst_addr, pkt_dst_port)
        five_tuple_id = str(five_tuple)

        if pkt_protocol == IPPROTO_UDP:
            udp_payload_len = struct.unpack('!H', ip_payload[4:6])[0] - 8
            udp_header = ip_payload[0:8]
            udp_payload = ip_payload[8:len(ip_payload)]
            udp_segment = UdpSegment(five_tuple, udp_header, udp_payload)
            process_udp_segment(builder, udp_segment)
        elif pkt_protocol == IPPROTO_TCP:
            tcp_hdr_len = (ord(ip_payload[12]) >> 4) * 4
            tcp_header = ip_payload[0:tcp_hdr_len]
            tcp_payload = ip_payload[tcp_hdr_len:len(ip_payload)]
            segment = TcpSegment(five_tuple, tcp_header, tcp_payload)
            process_tcp_segment(builder, segment)

    #
    # Having read the contents of the pcap, we fill the database with any
    # remaining TCP and UDP segments
    #
    for tcp_stream in tcp_streams.itervalues():
        db_add_tcp_stream_segments(builder, tcp_stream)

    for udp_stream in udp_streams.itervalues():
        db_add_udp_stream_segments(builder, udp_stream)

    #
    # We've finished with the database
    #
    builder.finish()

if __name__ == '__main__':
    args = getopt.getopt(sys.argv[1:], 'i:o:')
    args = dict(args[0])
    requiredKeys = [ '-i', '-o' ]
    for k in requiredKeys:
        if not args.has_key(k):
            usage(os.path.basename(sys.argv[0]))
    fnArgs = tuple([args[k] for k in requiredKeys])
    enchunk_pcap(*fnArgs)
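
Once a corpus database has been produced by any of these scripts, its contents can be inspected with ordinary sqlite queries against the chunk table defined in CorpusBuilder above; a small sketch, with a placeholder database path:

import sqlite3

db = sqlite3.connect('corpus.db')    # placeholder path for a generated corpus
c = db.cursor()
c.execute('select stream_id, count(*), sum(length(data)) '
          'from chunk group by stream_id order by stream_id')
for stream_id, n_chunks, n_bytes in c:
    print "stream", stream_id, ":", n_chunks, "chunks,", n_bytes, "bytes"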