#!/usr/bin/env python3.3
'''
/*
 * Copyright (c) 2015, Charles McLouth
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

auditconverter - This Python script converts audit logs for ingestion into the
Helix Threat Detection analytics engine.

It converts all input files to be encoded as utf-8.
It converts all structured log formats to standard P4AUDIT format.
It splits output files into 2GB chunks.
It optionally anonymizes user, client, host, and depot file values.
It optionally compresses output files.

See auditconverter -h for usage.

$Id: //guest/cmclouth/projects/auditconverter/src/auditconverter.py#32 $
*/
'''
import logging
import sys
import os
import argparse
import gzip
import sqlite3
import datetime
#import operator
import re

numbers_re = re.compile(r'(\d+)')

scriptversion = "1.0"
scriptname = os.path.basename(sys.argv[0])

# standard format:   '%s %s@%s %s %s %s#%s'
# structured format: '6,,,%s,,,%s,%s,,%s,,,,%s,%s,%s'
# (self.f_date, self.f_user, self.f_client, self.f_host, self.f_action, self.f_file, self.f_rev)
P4AUDIT_RECORDFORMAT = '%s %s@%s %s %s %s#%s'
STRUCTURED_RECORDFORMAT = '6,,,%s,,,%s,%s,,%s,,,,%s,%s,%s'
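
# Example of one record (hypothetical values) rendered in each of the two
# formats above:
#   P4AUDIT:    2015/03/04 10:12:01 jsmith@jsmith-ws 10.1.2.3 sync //depot/main/foo.c#3
#   Structured: 6,,,2015/03/04 10:12:01,,,jsmith,jsmith-ws,,10.1.2.3,,,,sync,//depot/main/foo.c,3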


def isDEBUG(record):
    return record.levelname == 'DEBUG'


def isINFO(record):
    return record.levelname == 'INFO'


def isWARN(record):
    return record.levelname == 'WARNING'


def isERROR(record):
    return record.levelname in ['ERROR', 'CRITICAL']


class AuditException(Exception):
    errorMessages = [
        '',  # 0 : no error
        'File:%s; Line:%d; Cannot determine codec.',  # 1 : errCode, fileName, lineNo, lineBin
        'File:%s; Line:%d; Ignoring empty line.',  # 2 : errCode, fileName, lineNo
        'File:%s; Line:%d; Is not a recognized structured audit log record format.',  # 3 : errCode, fileName, lineNo, lineUTF8
        'File:%s; Line:%d; user field is empty.',  # 4 : errCode, fileName, lineNo, aRecord, lineBin
        ]

    def __str__(self):
        #(errCode, lineBin, lineUTF8, linesRead, fileName) = self.args
        errCode = self.args[0]
        fileName = self.args[1]
        if fileName is None:
            fileName = 'unknown'
        lineNo = self.args[2]
        if lineNo is None:
            lineNo = -1
        if errCode == 0:
            return AuditException.errorMessages[errCode]
        elif errCode >= 1 and errCode <= 4:
            return AuditException.errorMessages[errCode] % (fileName, lineNo)
        return ''


class UTF8Converter():
    def __init__(self):
        self.codecs = ['cp1252', 'cp1250', 'cp1251', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
                       'cp1257', 'cp1258', 'latin_1', 'iso8859_2', 'iso8859_3', 'iso8859_4',
                       'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8', 'iso8859_9',
                       'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16',
                       'cp037', 'cp424', 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
                       'cp852', 'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
                       'cp863', 'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
                       'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1140', 'cp65001', 'shift_jis',
                       'shift_jis_2004', 'shift_jisx0213', 'euc_jp', 'euc_jis_2004',
                       'euc_jisx0213', 'euc_kr', 'ascii', 'big5', 'big5hkscs', 'gb2312', 'gbk',
                       'gb18030', 'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
                       'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
                       'johab', 'koi8_r', 'koi8_u', 'mac_cyrillic', 'mac_greek', 'mac_iceland',
                       'mac_latin2', 'mac_roman', 'mac_turkish', 'ptcp154', 'utf_32',
                       'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be', 'utf_16_le', 'utf_7',
                       'utf_8_sig']
        self.codecMap = {'utf_8': 0}
        self.codecSorted = ['utf_8']
        self.counter = 0

    def convert(self, lineBin):
        self.counter += 1
        lineEnc = None
        lineUTF8 = None
        codecName = None
        # first try the codecs that have already succeeded, most frequent first
        iCodec = 0
        while codecName is None and iCodec < len(self.codecSorted):
            try:
                lineEnc = lineBin.decode(encoding=self.codecSorted[iCodec], errors='strict')
                codecName = self.codecSorted[iCodec]
            except UnicodeError:
                iCodec += 1
                codecName = None
        # fall back to the full candidate list
        iCodec = 0
        while codecName is None and iCodec < len(self.codecs):
            if self.codecs[iCodec] in self.codecSorted:
                iCodec += 1
                continue
            try:
                lineEnc = lineBin.decode(encoding=self.codecs[iCodec], errors='strict')
                codecName = self.codecs[iCodec]
            except UnicodeError:
                iCodec += 1
                codecName = None
        if codecName is not None:
            #logger.debug('codec: %s' % codecName)
            if codecName == 'utf_8':
                lineUTF8 = lineEnc
            else:
                lineUTF8 = lineEnc.encode(encoding='utf_8').decode(encoding='utf_8')
            # increment use count
            if codecName in self.codecMap:
                self.codecMap[codecName] += 1
            else:
                self.codecMap[codecName] = 1
            # re-sort every 1000 iterations
            if len(self.codecMap) > 1 and self.counter % 1000 == 0:
                self.codecSorted = sorted(list(self.codecMap.keys()),
                                          key=lambda k: self.codecMap[k], reverse=True)
                offset = 0
                while offset < len(self.codecs):
                    if self.codecs[offset] in self.codecMap:
                        del(self.codecs[offset])
                    else:
                        offset += 1
        return (codecName, lineUTF8)
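
# Usage sketch (hypothetical input). The converter tries utf_8 first, then any
# codec that has already succeeded (most frequent first), then the full
# candidate list, and returns the winning codec name plus the utf-8 text:
#
#   conv = UTF8Converter()
#   (codecName, text) = conv.convert(b'caf\xe9\n')
#   # b'\xe9' is not valid utf_8, so this falls through to cp1252 -> 'café\n'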


class Anonymizer():
    def __init__(self, database):
        self.hdbc = None
        self.hstmt = None
        self.__unames = {'user': 0, 'client': 0, 'host': 0, 'project': 0}
        self.__userMap = {}
        self.__clientMap = {}
        self.__hostMap = {}
        self.__projectMap = {}
        self.__dbVersion = 3
        self.__userRMap = {}
        self.__clientRMap = {}
        self.__hostRMap = {}
        self.__projectRMap = {}
        if database is not None:
            # does the database exist
            bInitDatabase = not os.path.isfile(database)
            self.hdbc = sqlite3.connect(database)
            self.hdbc.isolation_level = None
            self.hstmt = self.hdbc.cursor()
            if bInitDatabase:
                # initialize blank database
                self.__initDatabase(self.hdbc)
            else:
                self.__migrateDatabase()
            # load maps from database
            # counters
            logger.debug('load from db: counters')
            sSql = 'SELECT n, v FROM counters'
            self.hstmt.execute(sSql)
            r = self.hstmt.fetchone()
            while r is not None:
                self.__unames[r[0]] = r[1]
                r = self.hstmt.fetchone()
            # userMap
            logger.debug('load from db: userMap')
            sSql = 'SELECT n, v FROM userMap'
            self.hstmt.execute(sSql)
            r = self.hstmt.fetchone()
            while r is not None:
                self.__userMap[r[0]] = r[1]
                self.__userRMap[r[1]] = r[0]
                r = self.hstmt.fetchone()
            # clientMap
            logger.debug('load from db: clientMap')
            sSql = 'SELECT n, v FROM clientMap'
            self.hstmt.execute(sSql)
            r = self.hstmt.fetchone()
            while r is not None:
                self.__clientMap[r[0]] = r[1]
                self.__clientRMap[r[1]] = r[0]
                r = self.hstmt.fetchone()
            # hostMap
            logger.debug('load from db: hostMap')
            sSql = 'SELECT n, v FROM hostMap'
            self.hstmt.execute(sSql)
            r = self.hstmt.fetchone()
            while r is not None:
                self.__hostMap[r[0]] = r[1]
                self.__hostRMap[r[1]] = r[0]
                r = self.hstmt.fetchone()
            # projectMap
            logger.debug('load from db: projectMap')
            sSql = 'SELECT n, v FROM projectMap'
            self.hstmt.execute(sSql)
            r = self.hstmt.fetchone()
            while r is not None:
                self.__projectMap[r[0]] = r[1]
                self.__projectRMap[r[1]] = r[0]
                r = self.hstmt.fetchone()

    def close(self):
        if self.hstmt is not None:
            self.hstmt.close()
        if self.hdbc is not None:
            self.hdbc.close()

    def __initDatabase(self, hdbc):
        sSql = 'CREATE TABLE counters (n TEXT, v INTEGER);'
        hdbc.execute(sSql)
        sSql = 'CREATE UNIQUE INDEX counters_uix ON counters (n)'
        hdbc.execute(sSql)
        sSql = "INSERT INTO counters (n,v) VALUES('user', 0)"
        hdbc.execute(sSql)
        sSql = "INSERT INTO counters (n,v) VALUES('client', 0)"
        hdbc.execute(sSql)
        sSql = "INSERT INTO counters (n,v) VALUES('host', 0)"
        hdbc.execute(sSql)
        sSql = "INSERT INTO counters (n,v) VALUES('project', 0)"
        hdbc.execute(sSql)
        sSql = "INSERT INTO counters (n,v) VALUES('dbversion', %d)" % self.__dbVersion
        hdbc.execute(sSql)
        sSql = 'CREATE TABLE userMap (n TEXT, v TEXT)'
        hdbc.execute(sSql)
        sSql = 'CREATE UNIQUE INDEX userMap_uix ON userMap (n)'
        hdbc.execute(sSql)
        sSql = 'CREATE TABLE clientMap (n TEXT, v TEXT)'
        hdbc.execute(sSql)
        sSql = 'CREATE UNIQUE INDEX clientMap_uix ON clientMap (n)'
        hdbc.execute(sSql)
        sSql = 'CREATE TABLE hostMap (n TEXT, v TEXT)'
        hdbc.execute(sSql)
        sSql = 'CREATE UNIQUE INDEX hostMap_uix ON hostMap (n)'
        hdbc.execute(sSql)
        sSql = 'CREATE TABLE projectMap (n TEXT, v TEXT)'
        hdbc.execute(sSql)
        sSql = 'CREATE UNIQUE INDEX projectMap_uix ON projectMap (n)'
        hdbc.execute(sSql)

    def getMappedUser(self, k):
        # always get from the map
        return self.__userMap.get(k)

    def getReverseMappedUser(self, k):
        # always get from the map
        return self.__userRMap.get(k)

    def __putMappedUser(self, k, v):
        # always put to the map
        self.__userMap[k] = v
        self.__userRMap[v] = k
        sSql = "INSERT INTO userMap (n, v) VALUES('%s', '%s')" % (k, v)
        self.hdbc.execute(sSql)

    def getMappedClient(self, k):
        # always get from the map
        return self.__clientMap.get(k)

    def getReverseMappedClient(self, k):
        # always get from the map
        return self.__clientRMap.get(k)

    def __putMappedClient(self, k, v):
        # always put to the map
        self.__clientMap[k] = v
        self.__clientRMap[v] = k
        sSql = "INSERT INTO clientMap (n, v) VALUES('%s', '%s')" % (k, v)
        self.hdbc.execute(sSql)

    def getMappedHost(self, k):
        # always get from the map
        return self.__hostMap.get(k)

    def getReverseMappedHost(self, k):
        # always get from the map
        return self.__hostRMap.get(k)

    def __putMappedHost(self, k, v):
        # always put to the map
        self.__hostMap[k] = v
        self.__hostRMap[v] = k
        sSql = "INSERT INTO hostMap (n, v) VALUES('%s', '%s')" % (k, v)
        self.hdbc.execute(sSql)

    def getMappedProject(self, k):
        # always get from the map
        return self.__projectMap.get(k)

    def getReverseMappedProject(self, k):
        # always get from the map
        return self.__projectRMap.get(k)

    def __putMappedProject(self, k, v):
        # always put to the map
        self.__projectMap[k] = v
        self.__projectRMap[v] = k
        sSql = "INSERT INTO projectMap (n, v) VALUES('%s', '%s')" % (k, v)
        self.hdbc.execute(sSql)

    def __anonymize_user(self, data):
        result = self.getMappedUser(data)
        if result is None:
            result = self.__genUniqueName('user', 'User')
            self.__putMappedUser(data, result)
        return result

    def __anonymize_client(self, data):
        result = self.getMappedClient(data)
        if result is None:
            result = self.__genUniqueName('client', 'Client')
            self.__putMappedClient(data, result)
        return result

    def __anonymize_host(self, data):
        result = self.getMappedHost(data)
        if result is None:
            result = self.__genUniqueName('host', '1.1.1.')
            self.__putMappedHost(data, result)
        return result

    def __anonymize_depotfile(self, data):
        # convert depotfile to project
        parts = data.split('/')
        # ignore the filename
        del(parts[-1])
        if len(parts) > 5:
            del(parts[5:])
        projectNameIn = '/'.join(parts)
        result = self.getMappedProject(projectNameIn)
        if result is None:
            # The format for the Project name must look like a file and path
            # //Project%d/file.ext
            result = '//%s/file.ext' % self.__genUniqueName('project', 'Project')
            self.__putMappedProject(projectNameIn, result)
        return result
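
    # Worked example for __anonymize_depotfile (hypothetical path): for
    # data = '//depot/proj/comp/src/foo.c', split('/') yields
    # ['', '', 'depot', 'proj', 'comp', 'src', 'foo.c']; dropping the filename
    # and truncating to five elements leaves ['', '', 'depot', 'proj', 'comp'],
    # so the project key is '//depot/proj/comp', mapped to e.g. '//Project1/file.ext'.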

    def __genUniqueName(self, classType, prefix):
        self.__unames[classType] += 1
        sSql = "UPDATE counters SET v=%d WHERE n='%s'" % (self.__unames[classType], classType)
        self.hdbc.execute(sSql)
        return prefix + str(self.__unames[classType])

    def __migrateDatabase(self):
        sSql = "SELECT n, v FROM counters WHERE n = 'dbversion'"
        self.hstmt.execute(sSql)
        r = self.hstmt.fetchone()
        dbVersion = 0
        if r is not None:
            dbVersion = r[1]
        while dbVersion < self.__dbVersion:
            logger.debug('Migrate database from version:%d to version: %d' % (dbVersion, dbVersion+1))
            if dbVersion == 0:
                # from 0 to 1 we add the dbversion to the counter table
                sSql = "INSERT INTO counters (n,v) VALUES('dbversion', %d)" % (dbVersion+1)
                self.hdbc.execute(sSql)
            elif dbVersion == 1:
                # from 1 to 2 we change the convention of the host
                # from Host%d to 1.1.1.%d
                hostMap = {}
                # hostMap
                sSql = 'SELECT n, v FROM hostMap'
                self.hstmt.execute(sSql)
                r = self.hstmt.fetchone()
                while r is not None:
                    logger.debug('fetching: %s' % str(r))
                    k = r[0]
                    vOld = r[1]
                    v = '1.1.1.' + vOld[4:]
                    hostMap[k] = v
                    r = self.hstmt.fetchone()
                sSql = 'DELETE FROM hostMap'
                self.hdbc.execute(sSql)
                for (k, v) in hostMap.items():
                    logger.debug('inserting: (%s,%s)' % (k, v))
                    sSql = "INSERT INTO hostMap (n, v) VALUES('%s', '%s')" % (k, v)
                    self.hdbc.execute(sSql)
                sSql = "UPDATE counters SET v = %d WHERE n = 'dbversion'" % (dbVersion+1)
                self.hdbc.execute(sSql)
            elif dbVersion == 2:
                # from 2 to 3 we change the convention of the Project
                # from Project%d to //Project%d/file.ext
                projectMap = {}
                # projectMap
                sSql = 'SELECT n, v FROM projectMap'
                self.hstmt.execute(sSql)
                r = self.hstmt.fetchone()
                while r is not None:
                    logger.debug('fetching: %s' % str(r))
                    k = r[0]
                    vOld = r[1]
                    v = '//%s/file.ext' % vOld
                    projectMap[k] = v
                    r = self.hstmt.fetchone()
                sSql = 'DELETE FROM projectMap'
                self.hdbc.execute(sSql)
                for (k, v) in projectMap.items():
                    logger.debug('inserting: (%s,%s)' % (k, v))
                    sSql = "INSERT INTO projectMap (n, v) VALUES('%s', '%s')" % (k, v)
                    self.hdbc.execute(sSql)
                sSql = "UPDATE counters SET v = %d WHERE n = 'dbversion'" % (dbVersion+1)
                self.hdbc.execute(sSql)
            dbVersion += 1

    def anonymizeRecord(self, aRecordIn):
        bUser = True
        bClient = True
        bHost = True
        bDepotFile = True
        aRecordOut = AuditLogLine()
        aRecordOut.f_date = aRecordIn.f_date
        aRecordOut.f_user = aRecordIn.f_user
        aRecordOut.f_client = aRecordIn.f_client
        aRecordOut.f_host = aRecordIn.f_host
        aRecordOut.f_action = aRecordIn.f_action
        aRecordOut.f_file = aRecordIn.f_file
        aRecordOut.f_rev = aRecordIn.f_rev
        if bUser:
            aRecordOut.f_user = self.__anonymize_user(aRecordIn.f_user)
        if bClient:
            aRecordOut.f_client = self.__anonymize_client(aRecordIn.f_client)
        if bHost:
            aRecordOut.f_host = self.__anonymize_host(aRecordIn.f_host)
        if bDepotFile:
            aRecordOut.f_file = self.__anonymize_depotfile(aRecordIn.f_file)
        return aRecordOut
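
    # Before/after sketch for anonymizeRecord (hypothetical values, fresh
    # database): an input record
    #   2015/03/04 10:12:01 jsmith@jsmith-ws 10.1.2.3 sync //depot/proj/comp/src/foo.c#3
    # comes back with user 'User1', client 'Client1', host '1.1.1.1', and file
    # '//Project1/file.ext'; date, action, and revision pass through unchanged.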

    def dump(self, outputFile):
        fWrite = open(outputFile, mode='wt', encoding='utf_8')
        fWrite.write('This file contains the mappings between the real user and project IDs in your\n'
                     'log files, and the anonymized identifiers used in the analytics.\n\n'
                     'This report is organized into the following lists:\n\n'
                     '* Mapping from real user name to anonymized user identifier\n'
                     '* Mapping from real client name to anonymized client identifier\n'
                     '* Mapping from real host/ip name to anonymized host/ip identifier\n'
                     '* Mapping from real project name to anonymized project identifier\n')
        # by User ID
        fWrite.write('\n### User ID = Anonymized Identifier\n\n')
        sSql = 'SELECT n, v FROM userMap ORDER BY n'
        self.hstmt.execute(sSql)
        r = self.hstmt.fetchone()
        while r is not None:
            n = r[0]
            if n is None or len(n) < 1:
                n = '(none)'
            v = r[1]
            if v is None or len(v) < 1:
                v = '(none)'
            fWrite.write('    %s = %s\n' % (n, v))
            r = self.hstmt.fetchone()
        # by Client ID
        fWrite.write('\n### Client ID = Anonymized Identifier\n\n')
        sSql = 'SELECT n, v FROM clientMap ORDER BY n'
        self.hstmt.execute(sSql)
        r = self.hstmt.fetchone()
        while r is not None:
            n = r[0]
            if n is None or len(n) < 1:
                n = '(none)'
            v = r[1]
            if v is None or len(v) < 1:
                v = '(none)'
            fWrite.write('    %s = %s\n' % (n, v))
            r = self.hstmt.fetchone()
        # by Host ID
        fWrite.write('\n### Host ID = Anonymized Identifier\n\n')
        sSql = 'SELECT n, v FROM hostMap ORDER BY n'
        self.hstmt.execute(sSql)
        r = self.hstmt.fetchone()
        while r is not None:
            n = r[0]
            if n is None or len(n) < 1:
                n = '(none)'
            v = r[1]
            if v is None or len(v) < 1:
                v = '(none)'
            fWrite.write('    %s = %s\n' % (n, v))
            r = self.hstmt.fetchone()
        # by Project ID
        fWrite.write('\n### Project ID = Anonymized Identifier\n\n')
        sSql = 'SELECT n, v FROM projectMap ORDER BY n'
        self.hstmt.execute(sSql)
        r = self.hstmt.fetchone()
        while r is not None:
            n = r[0]
            if n is None or len(n) < 1:
                n = '(none)'
            v = r[1]
            if v is None or len(v) < 1:
                v = '(none)'
            fWrite.write('    %s = %s\n' % (n, v))
            r = self.hstmt.fetchone()
        fWrite.close()

    def load(self, inputFile):
        fRead = open(inputFile, mode='rt', encoding='utf_8')
        sSql = "DELETE FROM userMap"
        self.hdbc.execute(sSql)
        sSql = "DELETE FROM clientMap"
        self.hdbc.execute(sSql)
        sSql = "DELETE FROM hostMap"
        self.hdbc.execute(sSql)
        sSql = "DELETE FROM projectMap"
        self.hdbc.execute(sSql)
        mapFunction = None
        mapName = None
        lineCount = 0
        rowCount = 0
        for line in fRead:
            lineCount += 1
            op = line[0:3]
            if op == '###':
                if mapName is not None:
                    logger.info('loaded %d records into table %s' % (rowCount, mapName))
                if line.startswith('### User ID'):
                    mapFunction = self.__putMappedUser
                    mapName = 'User'
                elif line.startswith('### Client ID'):
                    mapFunction = self.__putMappedClient
                    mapName = 'Client'
                elif line.startswith('### Host ID'):
                    mapFunction = self.__putMappedHost
                    mapName = 'Host'
                elif line.startswith('### Project ID'):
                    mapFunction = self.__putMappedProject
                    mapName = 'Project'
                else:
                    mapFunction = None
                    mapName = None
                rowCount = 0
                if mapName is not None:
                    logger.info('Processing %s records' % mapName)
            elif op == '   ' and mapFunction is not None:
                parts = line.strip().split('=', 1)
                n = parts[0].strip()
                v = parts[1].strip()
                mapFunction(n, v)
                rowCount += 1
                if rowCount % 100 == 0:
                    logger.debug('%s:rowcount:%d' % (mapName, rowCount))
        if mapName is not None:
            logger.info('loaded %d records into table %s' % (rowCount, mapName))
        self.__unames = {'user': len(self.__userMap),
                         'client': len(self.__clientMap),
                         'host': len(self.__hostMap),
                         'project': len(self.__projectMap)}
        for (n, v) in self.__unames.items():
            sSql = "UPDATE counters SET v=%d WHERE n='%s'" % (v, n)
            self.hdbc.execute(sSql)
#        fWrite.write('\n### User ID = Anonymized Identifier\n\n')
#        fWrite.write('\n### Client ID = Anonymized Identifier\n\n')
#        fWrite.write('\n### Host ID = Anonymized Identifier\n\n')
#        fWrite.write('\n### Project ID = Anonymized Identifier\n\n')
        fRead.close()

    def debugMaps(self):
        logger.debug('len(self.__userMap):%d' % len(self.__userMap))
        logger.debug('len(self.__clientMap):%d' % len(self.__clientMap))
        logger.debug('len(self.__hostMap):%d' % len(self.__hostMap))
        logger.debug('len(self.__projectMap):%d' % len(self.__projectMap))


def fileCompareKey(value):
    parts = numbers_re.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts


def sortFiles(fileListIn):
    return sorted(fileListIn, key=fileCompareKey)
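
# Natural-sort sketch (hypothetical names): fileCompareKey('audit-10.log')
# returns ['audit-', 10, '.log'], so sortFiles(['audit-10.log', 'audit-2.log'])
# yields ['audit-2.log', 'audit-10.log'] rather than the lexicographic order,
# which would put 'audit-10.log' first.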


def processInputParams(pargs=sys.argv):
    '''
    process commandline arguments and run function
    '''
    gParser = argparse.ArgumentParser()
    gParser.description = (
        "This Python script converts audit logs for ingestion into the Helix Threat Detection analytics engine.\n"
        "It converts all input files to be encoded as utf-8 and converts all structured log formats to standard P4AUDIT format. "
        "Optionally it will anonymize the data in the output log files and compress them.")
    gParser.add_argument('-V', '--version', action='version', version='%(prog)s ' + scriptversion)
    gParser.add_argument('-i', '--input', dest='input', metavar='input',
                         help='a directory of audit logs to convert.')
    gParser.add_argument('-o', '--output', dest='output', metavar='output',
                         help='a directory to write converted log files to.')
    gParser.add_argument('-c', '--compress', dest='compress', action='store_true',
                         help='compress output log files with gzip compatible compression.')
    gParser.add_argument('-d', '--database', dest='database', metavar='database',
                         help='path to anonymization database file.')
    gParser.add_argument('-a', '--anonymization-map', dest='anonymizationmap', metavar='anonymization-map',
                         help='output file of anonymization-map.')
    gParser.add_argument('-C', '--case-sensitive', dest='casesensitive', action='store_true', default=False,
                         help='format as case-sensitive.')
    gParser.add_argument('-s', '--size', dest='maxSize', metavar='maxSize',
                         type=int, default=2048,
                         help='Maximum size of output files in MB (default: 2048). Specify zero (0) for no limit.')
#    gParser.add_argument('-m', '--ipmap', dest='ipmap', action='store_true',
#                         help='generate a map of ip addresses to user/client.')
    gParser.add_argument('-v', '--novalidate', dest='validate', action='store_false', default=True,
                         help='Do not validate the data of audit records.')
    gParser.add_argument('-f', '--format', dest='logformat', metavar='logformat',
                         type=int, default=0,
                         help='Output record format: 0 (zero) for P4AUDIT, 1 (one) for Structured Audit Log.')
    args = gParser.parse_args(pargs)
    # must have i or a or both
    if (not hasattr(args, 'input') or args.input is None) \
            and (not hasattr(args, 'anonymizationmap') or args.anonymizationmap is None):
        gParser.print_help()
        gParser.error('input (-i) or anonymization-map (-a) is required.')
    # if i then o
    if (hasattr(args, 'input') and args.input is not None) \
            and hasattr(args, 'output') and args.output is None:
        gParser.print_help()
        gParser.error('output (-o) is required.')
    # if a then d
    if (hasattr(args, 'anonymizationmap') and args.anonymizationmap is not None) \
            and hasattr(args, 'database') and args.database is None:
        gParser.print_help()
        gParser.error('database (-d) is required.')
    # validate inputs
    if hasattr(args, 'input') and args.input is not None:
        if not os.path.isdir(args.input):
            gParser.print_help()
            gParser.error("invalid input directory (-i) '%s'" % (args.input))
    if hasattr(args, 'output') and args.output is not None:
        if not os.path.isdir(args.output):
            gParser.print_help()
            gParser.error("invalid output directory (-o) '%s'" % (args.output))
    return args
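
# Example invocations (hypothetical paths):
#   auditconverter.py -i /p4/logs/audit -o /tmp/converted
#   auditconverter.py -i /p4/logs/audit -o /tmp/converted -c -d /tmp/anon.db -a /tmp/anonmap.txt
# The first re-encodes and normalizes only; the second also anonymizes the
# records, writes the anonymization map report, and gzips the output.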


class AuditFileIO(object):
    '''
    Abstraction for reading/writing to/from a file
    '''
    def __init__(self, fileName, forRead=True, caseSensitive=False, utf8converter=None, maxWriteSize=None):
        self.fileName = fileName
        self.isCompressed = False
        self.linesRead = 0
        self.linesIgnored = 0
        self.bytesRead = 0
        self.linesWritten = 0
        self.bytesWritten = 0
        self.bytesWrittenPart = 0
        self.forRead = forRead
        self.firstLineRead = False
        self.caseSensitive = caseSensitive
        self.utf8converter = utf8converter
        self.structuredFormat = False
        self.maxWriteSize = maxWriteSize
        self.splitfiles = False
        self.parts = 0
        if fileName.endswith('.gz'):
            self.isCompressed = True
        if self.isCompressed:
            if self.forRead:
                self.f = gzip.open(self.fileName, 'rb')
            else:
                self.f = gzip.open(self.fileName, 'wb')
        else:
            if self.forRead:
                self.f = open(fileName, mode='r+b')
            else:
                self.f = open(self.fileName, 'w+b')
        self.splitfiles = self.maxWriteSize is not None and self.maxWriteSize > 0 and not self.isCompressed

    def readRecord(self, bValidateRecord=True):
        if not self.forRead:
            return None
        lineBin = self.f.readline()
        if lineBin is not None and len(lineBin) > 0:
            self.linesRead += 1
            self.bytesRead += len(lineBin)
        else:
            return None
        (codecName, lineUTF8) = self.utf8converter.convert(lineBin)
        if codecName is None:
            raise AuditException(1, self.fileName, self.linesRead, lineBin)
        elif codecName != 'utf_8':
            logger.debug('file: %s - converted line %d from %s to utf_8'
                         % (self.fileName, self.linesRead, codecName))
        lineUTF8 = lineUTF8.rstrip()
        if len(lineUTF8) < 1:
            self.linesIgnored += 1
            raise AuditException(2, self.fileName, self.linesRead)
        if not self.caseSensitive:
            # convert all data to lowercase
            lineUTF8 = lineUTF8.lower()
        if not self.firstLineRead and len(lineUTF8) > 0:
            self.firstLineRead = True
            # test format
            self.structuredFormat = len(lineUTF8.split(',')) > 2
        aRecord = AuditLogLine()
        try:
            aRecord.setLine(lineUTF8, self.structuredFormat)
            if bValidateRecord:
                aRecord.validateRecord()
        except AuditException as e:
            errCode = e.args[0]
            if errCode == 1:
                raise AuditException(errCode, self.fileName, self.linesRead, lineBin)
            elif errCode == 2:
                raise AuditException(errCode, self.fileName, self.linesRead)
            elif errCode == 3:
                raise AuditException(errCode, self.fileName, self.linesRead, lineUTF8)
            elif errCode == 4:
                raise AuditException(errCode, self.fileName, self.linesRead, aRecord, lineBin)
            else:
                raise e
        return aRecord

    def writeLine(self, lineUTF8):
        if self.forRead:
            return 0
        # add \n
        lineUTF8 = lineUTF8 + '\n'
        lineBin = lineUTF8.encode(encoding='utf_8', errors='strict')
        lineBytes = len(lineBin)
        # Does this exceed the max filesize?
        if self.splitfiles and lineBytes + self.bytesWrittenPart > self.maxWriteSize:
            # close the file
            self.f.close()
            # rename the file
            if self.isCompressed:
                newFileName = "%s.%03d.%s" % (self.fileName[0:len(self.fileName)-3], self.parts, 'gz')
            else:
                newFileName = "%s.%03d" % (self.fileName, self.parts)
            os.replace(self.fileName, newFileName)
            self.parts += 1
            self.bytesWrittenPart = 0
            # open new file
            if os.path.isfile(self.fileName):
                logger.error("rollover file still exists.")
            if self.isCompressed:
                self.f = gzip.open(self.fileName, 'wb')
            else:
                self.f = open(self.fileName, 'w+b')
        bytesWritten = self.f.write(lineBin)
        if bytesWritten > 0:
            self.linesWritten += 1
            self.bytesWritten += bytesWritten
            self.bytesWrittenPart += bytesWritten
        return bytesWritten

    def close(self):
        # close the file
        fReturn = self.f.close()
        if self.splitfiles and self.parts > 0:
            # rename the final part
            if self.isCompressed:
                newFileName = "%s.%03d.%s" % (self.fileName[0:len(self.fileName)-3], self.parts, 'gz')
            else:
                newFileName = "%s.%03d" % (self.fileName, self.parts)
            os.replace(self.fileName, newFileName)
            self.parts += 1
            self.bytesWrittenPart = 0
        self.fileName = None
        self.isCompressed = False
        self.linesRead = 0
        self.linesIgnored = 0
        self.bytesRead = 0
        self.linesWritten = 0
        self.bytesWritten = 0
        self.bytesWrittenPart = 0
        self.forRead = None
        self.firstLineRead = None
        self.caseSensitive = None
        self.structuredFormat = False
        self.parts = 0
        self.splitfiles = False
        return fReturn
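
# Rollover naming sketch (hypothetical output name): with splitting enabled
# (uncompressed output and maxWriteSize > 0), 'audit.log.utf8' is renamed to
# 'audit.log.utf8.000' once the next line would exceed maxWriteSize, a fresh
# 'audit.log.utf8' is opened, and close() renames the final chunk, so a run
# with two rollovers ends as 'audit.log.utf8.000', '.001', and '.002'.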


class AuditLogLine(object):
    '''
    Audit log record
    standard format: '%s %s@%s %s %s %s#%s'
    structured format: '6,,,%s,,,%s,%s,,%s,,,,%s,%s,%s'
    (self.f_date, self.f_user, self.f_client, self.f_host, self.f_action, self.f_file, self.f_rev)
    '''
    def __init__(self, rowformat='%s %s@%s %s %s %s#%s'):
        self.f_date = ''
        self.f_user = ''
        self.f_client = ''
        self.f_host = ''
        self.f_action = ''
        self.f_file = ''
        self.f_rev = ''
        self.rowformat = rowformat

    def setLine(self, data, structuredLog=False):
        self.f_date = ''
        self.f_user = ''
        self.f_client = ''
        self.f_host = ''
        self.f_action = ''
        self.f_file = ''
        self.f_rev = ''
        # f_date f_user@f_client f_host f_action f_file#f_rev
        fields = []
        if data is None or len(data) < 1:
            # file and line# set by caller
            raise AuditException(2, None, None)
        elif structuredLog:
            ieventtype = 0
            idate = 3
            iuser = 6
            iclient = 7
            ihost = 9
            iaction = 13
            ifile = 14
            irev = 15
            fields = data.split(',')
            if len(fields) > irev and fields[ieventtype] == '6':
                dateParts = fields[idate].split(' ')
                self.f_date = "%s %s" % (dateParts[0], dateParts[1])
                self.f_user = fields[iuser]
                self.f_client = fields[iclient]
                self.f_host = fields[ihost]
                self.f_action = fields[iaction]
                self.f_file = fields[ifile]
                # strip the revision specifier
                if fields[irev][0] == '#':
                    self.f_rev = fields[irev][1:]
                else:
                    self.f_rev = fields[irev]
            else:
                # file and line# set by caller
                raise AuditException(3, None, None, data)
        else:
            offset = 0
            for field in data.split(maxsplit=5):
                if offset == 2:
                    for f in field.split('@', maxsplit=1):
                        fields.append(f)
                elif offset == 5:
                    for f in field.split('#', maxsplit=1):
                        fields.append(f)
                    # found a weird case where no # revision field. add empty string
                    if len(fields) < 8:
                        fields.append('')
                else:
                    fields.append(field)
                offset += 1
            self.f_date = "%s %s" % (fields[0], fields[1])
            self.f_user = fields[2]
            self.f_client = fields[3]
            self.f_host = fields[4]
            self.f_action = fields[5]
            self.f_file = fields[6]
            self.f_rev = fields[7]
        return True

    def getLine(self, rowformat=None):
        if rowformat is not None:
            self.rowformat = rowformat
        return self.__str__()

    def __str__(self):
        # f_date f_user@f_client f_host f_action f_file#f_rev
        return self.rowformat % (self.f_date, self.f_user, self.f_client, self.f_host,
                                 self.f_action, self.f_file, self.f_rev)

    def validateRecord(self):
        if self.f_user is None or len(self.f_user) < 1:
            # file and line# set by caller
            sError = '%s\n' % str(self)
            raise AuditException(4, None, None, self, sError.encode(encoding='utf_8', errors='strict'))
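
# Parsing sketch (hypothetical record):
#   line = '2015/03/04 10:12:01 jsmith@jsmith-ws 10.1.2.3 sync //depot/main/foo.c#3'
#   rec = AuditLogLine()
#   rec.setLine(line)
# leaves rec.f_user == 'jsmith', rec.f_client == 'jsmith-ws', rec.f_rev == '3',
# and rec.getLine(STRUCTURED_RECORDFORMAT) re-emits the record in structured form.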


class HostMRU(object):
    '''
    MRU Host/User combination used for fixing problem records.
    '''
    def __init__(self):
        self.hostMap = {}
        self.hostusermap = {}

    def addRecord(self, aRecord):
        if aRecord is None or aRecord.f_host is None or len(aRecord.f_host) < 1 or aRecord.f_host == 'unknown':
            return False
        self.hostMap[aRecord.f_host] = (aRecord.f_user, aRecord.f_client)
        if aRecord.f_host not in self.hostusermap:
            self.hostusermap[aRecord.f_host] = {}
        self.hostusermap[aRecord.f_host][aRecord.f_user] = self.hostusermap[aRecord.f_host].get(aRecord.f_user, 0) + 1
        return True

    def fixRecord(self, aRecord):
        if aRecord is None or aRecord.f_host is None or len(aRecord.f_host) < 1 or aRecord.f_host == 'unknown':
            return False
        (user, client) = self.hostMap.get(aRecord.f_host, (None, None))
        if aRecord.f_client is None or len(aRecord.f_client) < 1 or aRecord.f_client == 'unknown':
            aRecord.f_client = client
        if aRecord.f_user is None or len(aRecord.f_user) < 1 or aRecord.f_user == 'unknown':
            aRecord.f_user = user
        if aRecord.f_user is None or len(aRecord.f_user) < 1 or aRecord.f_user == 'unknown':
            return False
        return True
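
# Fix-up sketch (hypothetical records): once addRecord() has seen
# 'jsmith@jsmith-ws' from host 10.1.2.3, a later record from 10.1.2.3 whose
# user or client is empty or 'unknown' gets those fields filled in by
# fixRecord(); if the host was never seen, fixRecord() returns False and the
# caller falls back to its error handling.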


if __name__ == '__main__':
    logger = logging.getLogger(scriptname)
    logger.propagate = False
    logger.setLevel(logging.INFO)
    debugHandler = logging.StreamHandler()
    #debugHandler = logging.FileHandler('mergetracker.out')
    debugHandler.setLevel(logging.DEBUG)
    debugHandler.setFormatter(logging.Formatter('%(levelname)s:%(filename)s:%(lineno)d:%(funcName)s:%(message)s'))
    debugHandler.addFilter(isDEBUG)
    logger.addHandler(debugHandler)
    infoHandler = logging.StreamHandler(sys.stdout)
    infoHandler.setLevel(logging.INFO)
    infoHandler.setFormatter(logging.Formatter('%(message)s'))
    infoHandler.addFilter(isINFO)
    logger.addHandler(infoHandler)
    warnHandler = logging.StreamHandler()
    warnHandler.setLevel(logging.WARN)
    warnHandler.setFormatter(logging.Formatter('%(message)s'))
    warnHandler.addFilter(isWARN)
    logger.addHandler(warnHandler)
    errorHandler = logging.StreamHandler()
    errorHandler.setLevel(logging.ERROR)
    errorHandler.setFormatter(logging.Formatter('%(message)s'))
    errorHandler.addFilter(isERROR)
    logger.addHandler(errorHandler)

    args = processInputParams(sys.argv[1:])
    anonymizer = Anonymizer(args.database)
    if args.logformat == 1:
        outputRecordFormat = STRUCTURED_RECORDFORMAT
    else:
        outputRecordFormat = P4AUDIT_RECORDFORMAT
    recordFixer = HostMRU()
    if hasattr(args, 'input') and args.input is not None:
        fileList = []
        for fileName in os.listdir(args.input):
            if not os.path.isfile(os.path.join(args.input, fileName)):
                continue
            fileList.append(fileName)
        if len(fileList) < 1:
            logger.info('No files to process in -i %s' % (args.input))
        # sort filenames
        sortedFiles = sortFiles(fileList)
        fileList.clear()
        for fileName in sortedFiles:
            fileList.append(os.path.join(args.input, fileName))
        utf8converter = UTF8Converter()
        #maxWriteSize = 10485760
        maxWriteSize = 1048576 * args.maxSize
        if maxWriteSize < 1:
            maxWriteSize = None
        fError = None
        for fileOffset in range(0, len(fileList)):
            fileName = fileList[fileOffset]
            fRead = None
            fWrite = None
            charsWritten = 0
            fileNameOut = None
            dtStart = datetime.datetime.now()
            try:
                fileNameOut = os.path.basename(fileName)
                if fileName.endswith('.gz'):
                    baseName = os.path.basename(fileName)
                    fileNameOut = baseName[0:len(baseName)-3]
                fileNameOut += '.utf8'
                if args.database:
                    fileNameOut += '.anonymized'
                if args.compress:
                    fileNameOut += '.gz'
                fileNameOut = os.path.join(args.output, fileNameOut)
                fRead = AuditFileIO(fileName, True, args.casesensitive, utf8converter, None)
                fWrite = AuditFileIO(fileNameOut, False, args.casesensitive, utf8converter, maxWriteSize)
                if fWrite.splitfiles:
                    logger.info('Processing file: %s (%d of %d) converting to: %s (splitfile enabled)'
                                % (fRead.fileName, fileOffset+1, len(fileList), fWrite.fileName))
                    logger.debug('splitfiles enabled with maxSize of %d bytes' % fWrite.maxWriteSize)
                else:
                    logger.info('Processing file: %s (%d of %d) converting to: %s'
                                % (fRead.fileName, fileOffset+1, len(fileList), fWrite.fileName))
                while True:
                    try:
                        try:
                            aRecord = fRead.readRecord(args.validate)
                            # record passed validation so add it to the MRU
                            recordFixer.addRecord(aRecord)
                        except AuditException as e:
                            errCode = e.args[0]
                            if errCode != 4:
                                raise e
                            aRecord = e.args[3]
                            # record failed validation; try data in the MRU
                            if not recordFixer.fixRecord(aRecord):
                                raise e
                            aRecord.validateRecord()
                        # eof
                        if aRecord is None:
                            break
                        # anonymize
                        if args.database:
                            # Helix Threat Detection is case-insensitive for files. If we are
                            # anonymizing the data then force case-insensitive
                            aRecord.f_client = aRecord.f_client.lower()
                            aRecord.f_file = aRecord.f_file.lower()
                            aRecord.f_user = aRecord.f_user.lower()
                            aRecord = anonymizer.anonymizeRecord(aRecord)
                        # write output
                        fWrite.writeLine(aRecord.getLine(outputRecordFormat))
                    except AuditException as e:
                        errCode = e.args[0]
                        fileName = e.args[1]
                        lineNo = e.args[2]
                        logger.error(str(e))
                        if errCode in [1, 3, 4]:
                            # write bad line to error file
                            if fError is None:
                                fileNameErr = os.path.join(args.output, "auditconverter.err")
                                fError = open(fileNameErr, 'a+b')
                                fError.write('\n'.encode(encoding='utf_8', errors='strict'))
                            if errCode == 1:
                                lineBin = e.args[3]
                            elif errCode == 3:
                                lineBin = ("%s\n" % (e.args[3])).encode(encoding='utf_8', errors='strict')
                            elif errCode == 4:
                                lineBin = e.args[4]
                            prefix = 'file=%s;line=%d;err=%d;:' % (fileName, lineNo, errCode)
                            fError.write(prefix.encode(encoding='utf_8', errors='strict'))
                            fError.write(lineBin)
#                            if errCode == 4:
#                                # dump recordFixer
#                                for k in sorted(recordFixer.hostMap.keys(), key=fileCompareKey):
#                                    sDebug = 'MRU;%s:%s\n' % (k, recordFixer.hostMap.get(k))
#                                    fError.write(sDebug.encode(encoding='utf_8', errors='strict'))
#                                raise e
                if fRead.linesRead != fWrite.linesWritten:
                    logger.warning('Lines read: %d; Lines written: %d' % (fRead.linesRead, fWrite.linesWritten))
                dtStop = datetime.datetime.now()
                seconds = (dtStop - dtStart).seconds
                if seconds == 0:
                    seconds = 1
                if fWrite.splitfiles and fWrite.parts > 0:
                    logger.info('Completed: %s (%d of %d) converting to: %s (%d splitparts) with %d lines in %d seconds (%d lines/second.)'
                                % (fRead.fileName, fileOffset+1, len(fileList), fWrite.fileName, fWrite.parts,
                                   fRead.linesRead, seconds, int(fRead.linesRead/seconds)))
                else:
                    logger.info('Completed: %s (%d of %d) converting to: %s with %d lines in %d seconds (%d lines/second.)'
                                % (fRead.fileName, fileOffset+1, len(fileList), fWrite.fileName,
                                   fRead.linesRead, seconds, int(fRead.linesRead/seconds)))
            except:
                logger.exception('unknown exception processing input file: %s' % (fileName))
            finally:
                if fRead is not None:
                    fRead.close()
                if fWrite is not None:
                    fWrite.close()
        anonymizer.debugMaps()
        if fError is not None:
            fError.close()
    if hasattr(args, 'anonymizationmap') and args.anonymizationmap is not None:
#        if args.load:
#            logger.info('load database from anonymization map: %s' % args.anonymizationmap)
#            anonymizer.load(args.anonymizationmap)
#        else:
        logger.info('writing anonymization map to: %s' % args.anonymizationmap)
        anonymizer.dump(args.anonymizationmap)
    anonymizer.close()