/*
 * Copyright 1995, 2003 Perforce Software. All rights reserved.
 *
 * This file is part of Perforce - the FAST SCM System.
 */

#include "validate.h"

/*
 * ValidateCharSet
 */

CharSetValid::~CharSetValid()
{
}

/*
 * A new validator starts out between characters: no continuation bytes
 * are pending and no lead-byte constraint is active.
 */
CharSetUTF8Valid::CharSetUTF8Valid()
    : followcnt( 0 ),
      magic( 0 )
{
}

/*
 * Reset - discard any partially consumed multi-byte character so the
 * next byte handed to Valid() is treated as the start of a character.
 */
void
CharSetUTF8Valid::Reset()
{
    magic = 0;
    followcnt = 0;
}

/*
 * Per-byte classification table, indexed by the byte value.
 *
 * Bits common to all entries:
 *
 *	0x40	byte may begin a character (single byte or lead byte)
 *	0x80	byte is a continuation byte of a multi-byte sequence
 *	0x07	(lead bytes) count of continuation bytes that must follow
 *
 * Lead bytes additionally carry a constraint code in the 0x38 bits that
 * Valid() applies to the first continuation byte only:
 *
 *	0x30	lead 0xe0
 *	0x20	lead 0xf0
 *	0x10	lead 0xf4
 *	0x08	lead 0xed (UTF-16 surrogate range)
 *
 * Continuation bytes carry their sub-range in the 0x30 bits so those
 * constraints can be tested:
 *
 *	0x30	0x80 - 0x8f
 *	0x10	0x90 - 0x9f
 *	0x00	0xa0 - 0xbf
 *
 * An entry of 0 marks a byte that can never appear.
 */
unsigned char
CharSetUTF8Valid::validmap[256] = {
    // 0x00 - 0x7f : single byte characters
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,

    // 0x80 - 0x8f : continuation bytes, low range
    0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0,
    0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0,

    // 0x90 - 0x9f : continuation bytes, middle range
    0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
    0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,

    // 0xa0 - 0xbf : continuation bytes, high range
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

    // 0xc0, 0xc1 : illegal
    0, 0,

    // 0xc2 - 0xdf : lead byte, one continuation byte follows
    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
    0x41, 0x41, 0x41, 0x41, 0x41, 0x41,

    // 0xe0 - 0xef : lead byte, two follow ( 0xe0 and 0xed are magical )
    0x72, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42,
    0x42, 0x42, 0x42, 0x42, 0x42, 0x4a, 0x42, 0x42,

    // 0xf0 - 0xf4 : lead byte, three follow ( 0xf0 and 0xf4 are magical )
    0x63, 0x43, 0x43, 0x43, 0x53,

    // 0xf5 - 0xff : illegal
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/*
 * Valid - check the next len bytes of a UTF-8 stream.
 *
 * The validator keeps its position inside a multi-byte character in
 * followcnt/magic between calls, so a stream may be fed in pieces; use
 * Reset() to start over.
 *
 * Returns:
 *
 *	0	not valid
 *	1	valid
 *	3	valid so far (following bytes needed to complete a
 *		multi-byte char)
 *
 * If retp is non-null it is updated as follows: each time a byte is
 * examined as the start of a character, *retp is pointed at that byte;
 * when 1 is returned, *retp is pointed just past the last byte
 * consumed.  On a 0 or 3 return *retp therefore holds the start of the
 * most recent character begun during this call, and is left untouched
 * if no character was begun during this call.
 *
 * State is not reset when 0 is returned.
 */
int
CharSetUTF8Valid::Valid( const char *buf, int len, const char **retp )
{
    // Names for the validmap bit fields used below.
    enum {
        kFollowMask = 0x07,	// lead: number of continuation bytes
        kMagicMask  = 0x38,	// lead: constraint on first continuation
        kRangeLow   = 0x10,	// continuation: set for 0x80 - 0x9f
        kRangeHigh  = 0x20,	// continuation: set for 0x80 - 0x8f
        kStart      = 0x40,	// may begin a character
        kContinue   = 0x80,	// is a continuation byte

        kLeadED     = 0x08,	// constraint codes, by lead byte
        kLeadF4     = 0x10,
        kLeadF0     = 0x20,
        kLeadE0     = 0x30
    };

    for( ; len > 0; --len, ++buf )
    {
        const int flags = validmap[ *buf & 0xff ];

        if( !followcnt )
        {
            // Between characters: this byte must start a new one.

            if( retp )
                *retp = buf;

            if( !( flags & kStart ) )
                return 0;

            followcnt = flags & kFollowMask;
            magic = flags & kMagicMask;
            continue;
        }

        // Inside a multi-byte character: only continuation bytes allowed.

        if( !( flags & kContinue ) )
            return 0;

        --followcnt;

        if( !magic )
            continue;

        // The lead byte restricts which range the first continuation
        // byte may come from; check that restriction exactly once.

        const bool lowBit = ( flags & kRangeLow ) != 0;
        const bool highBit = ( flags & kRangeHigh ) != 0;
        bool reject = false;

        if( magic == kLeadF4 )
            reject = !highBit;			// need 0x80 - 0x8f
        else if( magic == kLeadF0 )
            reject = highBit;			// need 0x90 - 0xbf
        else if( magic == kLeadE0 )
            reject = lowBit;			// need 0xa0 - 0xbf
        else if( magic == kLeadED )
            reject = !( lowBit || highBit );	// need 0x80 - 0x9f

        if( reject )
            return 0;

        magic = 0;
    }

    // Ran out of input part way through a character.

    if( followcnt )
        return 3;

    if( retp )
        *retp = buf;

    return 1;
}