/* * FileSys::CheckType() - look at the file and see if it is binary, etc */ # include <stdhdrs.h> # include <charman.h> # include <i18napi.h> # include <charcvt.h> # include <validate.h> # include <debug.h> # include <tunable.h> # define BestFiletype(t) (FileSysType)(t) # ifdef OS_VMS # include <unistd.h> # include <dirent.h> # endif # include <error.h> # include <strbuf.h> # include "filesys.h" # include "fileio.h" FileSysType FileSys::CheckType( int scan ) { if( scan < 0 || p4tunable.IsSet( P4TUNE_FILESYS_BINARYSCAN ) ) { // How far to look in a file for binary characters scan = p4tunable.Get( P4TUNE_FILESYS_BINARYSCAN ); } // Stat & check for missing, special int fsf = Stat(); if( ( fsf & FSF_SYMLINK ) ) return FST_SYMLINK; if( !( fsf & FSF_EXISTS ) ) return FST_MISSING; if( ( fsf & FSF_DIRECTORY ) ) return FST_DIRECTORY; if( ( fsf & FSF_SPECIAL ) ) return FST_SPECIAL; // Remember if it is executable. int execbits = fsf & FSF_EXECUTABLE; # if defined ( OS_MACOSX ) if( fsf & FSF_EMPTY ) { // !data + resource == apple FileIOApple f; f.Set( Name() ); if( f.HasResourceFork() ) return execbits ? FST_XAPPLEFILE : FST_APPLEFILE; return BestFiletype( FST_EMPTY ); } # else if( fsf & FSF_EMPTY ) return FST_EMPTY; # endif // otherwise, we need to read the file to test for ubinary // Open file to read some Error e; Open( FOM_READ, &e ); if( e.Test() ) return BestFiletype( FST_CANTTELL ); // Read some StrFixed fileBuf( scan ); char *buf = fileBuf.Text(); int len = Read( buf, fileBuf.Length(), &e ); char *p = buf; int l = len; Close( &e ); if( e.Test() || !l ) return BestFiletype( FST_EMPTY ); // Look for binary chars. int highbit = 0; int controlchar = 0; int zero = 0; for( ; l--; p++ ) { highbit |= 0x80 & *p; zero |= !*p; controlchar |= isAcntrl( p ) && !isAspace( p ); } // But text with just %PDF- is still binary (yuk) static unsigned char pdfMagic[] = { '%', 'P', 'D', 'F', '-' }; if( len < 5 || memcmp( buf, pdfMagic, sizeof( pdfMagic ) ) ) { CharSetCvt *cvt; int rettype = FST_TEXT; // Always look for a utf8 bom... int utf8bomPresent = !memcmp( buf, "\xef\xbb\xbf", 3 ); if( utf8bomPresent ) { if( controlchar ) goto somebinary; CharSetUTF8Valid utf8test; if( utf8test.Valid( buf, len ) ) rettype = FST_UTF8; } // is there an UTF16 BOM at the start if( ( *(unsigned short *)buf == 0xfeff || *(unsigned short *)buf == 0xfffe ) && ((unsigned short *)buf)[1] != 0 ) // second word of zero means UTF-32 { // might be utf16... rettype = FST_UTF16; content_charSet = CharSetCvt::UTF_16; goto like16; } switch( (CharSetCvt::CharSet)content_charSet ) { case CharSetCvt::UTF_8: case CharSetCvt::UTF_8_BOM: if( controlchar ) goto somebinary; if( highbit && utf8bomPresent ) { rettype = FST_UNICODE; } else if( highbit ) { // run special UTF_8 validator... CharSetUTF8Valid utf8test; if( utf8test.Valid( buf, len ) ) rettype = FST_UNICODE; } // else? should we do unicode if no highbit? break; case CharSetCvt::UTF_32: case CharSetCvt::UTF_32_LE: case CharSetCvt::UTF_32_BE: case CharSetCvt::UTF_32_LE_BOM: case CharSetCvt::UTF_32_BE_BOM: case CharSetCvt::UTF_32_BOM: if( !zero && !highbit && !controlchar || utf8bomPresent ) break; rettype = FST_UNICODE; // is there a BOM at the start if( *(unsigned long *)buf == 0xfeff || *(unsigned long *)buf == 0xfffe0000 ) break; // is there a UTF16 BOM at start... consider binary... if( *(unsigned short *)buf == 0xfeff || *(unsigned short *)buf == 0xfffe ) goto somebinary; goto like16; case CharSetCvt::UTF_16: case CharSetCvt::UTF_16_LE: case CharSetCvt::UTF_16_BE: case CharSetCvt::UTF_16_LE_BOM: case CharSetCvt::UTF_16_BE_BOM: case CharSetCvt::UTF_16_BOM: if( !zero && !highbit && !controlchar || utf8bomPresent ) break; rettype = FST_UNICODE; // is there a BOM at the start if( *(unsigned short *)buf == 0xfeff || *(unsigned short *)buf == 0xfffe ) { // second word of zero means UTF-32 if( ((unsigned short *)buf)[1] == 0 ) goto somebinary; break; } // is there a UTF32 BOM at start... consider binary... if( *(unsigned short *)buf == 0 ) goto somebinary; like16: cvt = CharSetCvt::FindCvt((CharSetCvt::CharSet)content_charSet, CharSetCvt::UTF_8); if( cvt ) { StrFixed tbuf( scan * 2 ); cvt->ResetErr(); const char *ss = buf; p = tbuf.Text(); if( cvt->Cvt( &ss, buf + len, &p, tbuf.Text() + tbuf.Length() ) != 0 || cvt->LastErr() == CharSetCvt::NOMAPPING ) { // it does not convert... consider it binary... delete cvt; goto somebinary; } delete cvt; // it did convert... see if it looks like utf8 text... // we think it is text if there are more than // 1 space character every 40 characters or so... int cnt = 0, ccnt = 0; CharStepUTF8 step( tbuf.Text() ); while( step.Ptr() < p ) { if( isAspace( step.Ptr() ) ) ++cnt; ++ccnt; step.Next(); } if( 40 * cnt < ccnt ) goto somebinary; } break; case CharSetCvt::NOCONV: // non-unicode mode goes here... if( controlchar ) goto somebinary; break; default: // most 8-bit charsets go here... if( controlchar ) goto somebinary; if( utf8bomPresent ) break; if( highbit ) { // Found a high bit and a charset is set... cvt = CharSetCvt::FindCvt((CharSetCvt::CharSet)content_charSet, CharSetCvt::UTF_8); if( cvt ) { StrFixed tbuf( scan * 3 ); cvt->ResetErr(); const char *ss = buf; p = tbuf.Text(); if( cvt->Cvt( &ss, buf + len, &p, tbuf.Text() + tbuf.Length() ) == 0 && cvt->LastErr() != CharSetCvt::NOMAPPING ) { // it converts... consider it unicode... rettype = FST_UNICODE; } delete cvt; } } } if( rettype == FST_UNICODE ) { if( highbit ) { CharSetUTF8Valid utf8test; if( utf8test.Valid( buf, len ) ) rettype = FST_UTF8; } if( rettype == FST_UNICODE && !p4tunable.Get( P4TUNE_FILESYS_DETECTUNICODE ) ) rettype = FST_TEXT; } if( execbits ) rettype |= FST_M_EXEC; return BestFiletype( (FileSysType)rettype ); } somebinary: // It's binary. Let's see if it is a known compressed type. // Yuk -- what a list! static unsigned char gifMagic[] = { 'G', 'I', 'F' }; static unsigned char jpgMagic[] = { 0377, 0330, 0377, 0356 }; static unsigned char jpegMagic[] = { 0377, 0330, 0377, 0340 }; static unsigned char exifMagic[] = { 0377, 0330, 0377, 0341 }; static unsigned char gzipMagic[] = { 037, 0213 }; static unsigned char pkzipMagic[] = { 'P', 'K', 03, 04 }; static unsigned char compaMagic[] = { 0377, 037 }; static unsigned char comprMagic[] = { 037, 0235 }; if( !execbits && len >= 5 && ( !memcmp( buf, gifMagic, sizeof( gifMagic ) ) || !memcmp( buf, jpgMagic, sizeof( jpgMagic ) ) || !memcmp( buf, jpegMagic, sizeof( jpegMagic ) ) || !memcmp( buf, exifMagic, sizeof( exifMagic ) ) || !memcmp( buf, gzipMagic, sizeof( gzipMagic ) ) || !memcmp( buf, pkzipMagic, sizeof( pkzipMagic ) ) || !memcmp( buf, compaMagic, sizeof( compaMagic ) ) || !memcmp( buf, comprMagic, sizeof( comprMagic ) ) ) ) return FST_CBINARY; # if defined ( OS_MACOSX ) { // binary data + resource == apple FileIOApple f; f.Set( Name() ); if( f.HasResourceFork() ) return execbits ? FST_XAPPLEFILE : FST_APPLEFILE; } # endif return execbits ? FST_XBINARY : FST_BINARY; }