/* * Copyright 2005 Perforce Software. All rights reserved. * */ /* * unicvt.cc - Character set conversion code between unicode encodings * which are never needed inside the p4d server. In a * seporate file to keep server size low. */ #include <stdhdrs.h> #include <strbuf.h> #include "i18napi.h" #include "charcvt.h" #include "charman.h" #include "validate.h" CharSetCvtUTF8UTF8::CharSetCvtUTF8UTF8( int dir, int f ) : validator( NULL ), flags( f ) { // do not validate from the server if( ( flags & UTF8_VALID_CHECK ) && ( direction = dir ) == -1 ) validator = new CharSetUTF8Valid; } CharSetCvtUTF8UTF8::~CharSetCvtUTF8UTF8() { delete validator; } /* * direction controls BOM prepending * * 0 means never prepend BOM * 1 means prepend BOM * -1 means only prepend BOM in the reverse direction * * This allows us to always eliminate the BOM when going into the * server while prepending the BOM when going to the client * and still use the same code to always suppress a BOM */ CharSetCvt * CharSetCvtUTF8UTF8::Clone() { return new CharSetCvtUTF8UTF8( direction, flags ); } CharSetCvt * CharSetCvtUTF8UTF8::ReverseCvt() { return new CharSetCvtUTF8UTF8( -direction, flags ); } int CharSetCvtUTF8UTF8::Cvt(const char **sourcestart, const char *sourceend, char **targetstart, char *targetend) { int slen = sourceend - *sourcestart; int tlen = targetend - *targetstart; if( checkBOM && slen >= 1 && (**sourcestart & 0xff) == 0xef ) { if( slen < 3 ) { lasterr = PARTIALCHAR; return 0; } if( ((*sourcestart)[1] & 0xff) == 0xbb && ((*sourcestart)[2] & 0xff) == 0xbf ) { *sourcestart += 3; slen -= 3; } } if( checkBOM && ( flags & UTF8_WRITE_BOM ) && direction == 1 ) { // write a BOM to target if( tlen < 3 ) { lasterr = PARTIALCHAR; return 0; } **targetstart = 0xef; *++*targetstart = 0xbb; *++*targetstart = 0xbf; ++*targetstart; tlen -= 3; } checkBOM = 0; if( tlen < slen ) slen = tlen; if( validator ) { const char *ep, *sp; sp = (const char *)*sourcestart; switch ( validator->Valid( sp, slen, &ep ) ) { case 0: // eventually not valid lasterr = NOMAPPING; slen = ep - sp; validator->Reset(); break; case 3: // valid to a partial char lasterr = PARTIALCHAR; slen = ep - sp; validator->Reset(); break; case 1: // valid, do nothing break; } // sigh... need to count newlines... while ( sp < ep ) { if ( !( sp = (const char *)memchr( sp, '\n', ep - sp ) ) ) break; ++linecnt; ++sp; } } memcpy( (void *)*targetstart, (void *)*sourcestart, slen ); *sourcestart += slen; *targetstart += slen; return 0; } CharSetCvt * CharSetCvtUTF832::Clone() { return new CharSetCvtUTF832( invert, bom ); } CharSetCvt * CharSetCvtUTF328::Clone() { return new CharSetCvtUTF328( invert, bom ); } CharSetCvt * CharSetCvtUTF832::ReverseCvt() { return new CharSetCvtUTF328( invert, bom ); } CharSetCvt * CharSetCvtUTF328::ReverseCvt() { return new CharSetCvtUTF832( invert, bom ); } int CharSetCvtUTF832::Cvt(const char **sourcestart, const char *sourceend, char **targetstart, char *targetend) { unsigned int v; if( checkBOM && bom ) { if( 4+*targetstart >= targetend ) { lasterr = PARTIALCHAR; return 0; } if( fileinvert ) { **targetstart = 0xff; *++*targetstart = 0xfe; *++*targetstart = 0; *++*targetstart = 0; } else { **targetstart = 0; *++*targetstart = 0; *++*targetstart = 0xfe; *++*targetstart = 0xff; } ++*targetstart; } while( *sourcestart < sourceend && *targetstart < targetend - 1 ) { v = **sourcestart & 0xff; int l; if ( v & 0x80 ) { l = bytesFromUTF8[v]; if ( l + *sourcestart >= sourceend ) { lasterr = PARTIALCHAR; return 0; } switch( l ) { default: lasterr = NOMAPPING; return 0; case 3: // surrogates are needed - check for more target space if( *targetstart > targetend - 4 ) { lasterr = PARTIALCHAR; return 0; } v <<= 6; v += 0xff & *++*sourcestart; // fall through... case 2: v <<= 6; v += 0xff & *++*sourcestart; // fall through... case 1: v <<= 6; v += 0xff & *++*sourcestart; v -= offsetsFromUTF8[l]; # ifdef STRICT_UTF8 if( v < minimumFromUTF8[l] ) { // illegal over long UTF8 sequence lasterr = NOMAPPING; *sourcestart -= l; return 0; } #endif // at this point v is a unicode position if( checkBOM && v == 0xfeff ) { checkBOM = 0; ++*sourcestart; continue; } // note fall through } } checkBOM = 0; # ifdef STRICT_UTF8 // check for invalid unicode positions if( ( v & 0x1ff800 ) == 0xd800 || ( v >= 0xfdd0 && v <= 0xfdef ) ) { lasterr = NOMAPPING; *sourcestart -= l; return 0; } # endif ++charcnt; if( v == '\n' ) { ++linecnt; charcnt = 0; } if( fileinvert ) { **targetstart = v & 0xff; *++*targetstart = (v >> 8) & 0xff; *++*targetstart = (v >> 16) & 0xff; *++*targetstart = (v >> 24) & 0xff; } else { **targetstart = (v >> 24) & 0xff; *++*targetstart = (v >> 16) & 0xff; *++*targetstart = (v >> 8) & 0xff; *++*targetstart = v & 0xff; } ++*targetstart; ++*sourcestart; } if( *sourcestart < sourceend && *targetstart < targetend ) lasterr = PARTIALCHAR; return 0; } int CharSetCvtUTF328::Cvt(const char **sourcestart, const char *sourceend, char **targetstart, char *targetend) { unsigned int v; while( 3+*sourcestart < sourceend && *targetstart < targetend ) { if( fileinvert ) { v = **sourcestart & 0xff; v |= (*++*sourcestart & 0xff) << 8; v |= (*++*sourcestart & 0xff) << 16; v |= (*++*sourcestart & 0xff) << 24; } else { v = (**sourcestart & 0xff) << 24; v |= (*++*sourcestart & 0xff) << 16; v |= (*++*sourcestart & 0xff) << 8; v |= *++*sourcestart & 0xff; } ++*sourcestart; if( checkBOM ) { checkBOM = 0; switch( v ) { case 0xfffe0000: fileinvert ^= 1; // fall through... case 0xfeff: // suppress BOM continue; } } // emit UTF8 of v if( ( v & 0x1ff800 ) == 0xd800 || ( v >= 0xfdd0 && v <= 0xfdef ) ) { lasterr = NOMAPPING; *sourcestart -= 2; if( v >= 0x10000 ) *sourcestart -= 2; return 0; } if( v >= 0x10000 ) { // Extended multilingual plane - 4 byte UTF8 if (3 + *targetstart >= targetend) { lasterr = PARTIALCHAR; *sourcestart -= 4; // 4 because of UTF 16 surrogates return 0; } **targetstart = 0xf0 | (v >> 18); *++*targetstart = 0x80 | ((v >> 12) & 0x3f); *++*targetstart = 0x80 | ((v >> 6) & 0x3f); *++*targetstart = 0x80 | (v & 0x3f); } else if( v >= 0x800 ) { if (2 + *targetstart >= targetend) { lasterr = PARTIALCHAR; *sourcestart -= 2; return 0; } **targetstart = 0xe0 | (v >> 12); *++*targetstart = 0x80 | ((v >> 6) & 0x3f); *++*targetstart = 0x80 | (v & 0x3f); } else if( v >= 0x80 ) { if (1 + *targetstart >= targetend) { lasterr = PARTIALCHAR; *sourcestart -= 2; return 0; } **targetstart = 0xc0 | (v >> 6); *++*targetstart = 0x80 | (v & 0x3f); } else **targetstart = v; ++*targetstart; } if( *sourcestart < sourceend && *targetstart < targetend ) lasterr = PARTIALCHAR; ++charcnt; if( v == '\n' ) { ++linecnt; charcnt = 0; } return 0; }
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#1 | 18760 | rlranft |
Populate -o //guest/perforce_software/p4/... //guest/rlranft/p4/.... |
||
//guest/perforce_software/p4/2014-2/i18n/unicvt.cc | |||||
#1 | 15903 | Matt Attaway | Everything should be happy now between the Workshop and the depot paths | ||
//guest/perforce_software/p4/2014_2/i18n/unicvt.cc | |||||
#1 | 15901 | Matt Attaway | Clean up code to fit modern Workshop naming standards | ||
//guest/perforce_software/p4/2014.2/i18n/unicvt.cc | |||||
#1 | 12189 | Matt Attaway | Initial (and much belated) drop of 2014.2 p4 source code |