package com.perforce.common.asset; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; import com.perforce.svn.parser.Content; public class ScanArchive { private static Logger logger = LoggerFactory.getLogger(ScanArchive.class); /** * Unicode file type detection by scanning content in blocks. * * @param content * @return * @throws IOException */ public static ContentType detectType(Content content) throws Exception { ContentType type = ContentType.UNKNOWN; // ICU4J charsetDetector to find all matches ContentStream contentStream = ContentStreamFactory.scanContentStream( content, 1048576L); BufferedInputStream bufContent = new BufferedInputStream( (InputStream) contentStream); CharsetDetector charsetDetector = new CharsetDetector(); charsetDetector.setText(bufContent); CharsetMatch cm; try { cm = charsetDetector.detect(); } catch (ArrayIndexOutOfBoundsException e) { cm = null; } // Set confidence (smaller data sets are harder to spot) int minConfidenceLevel = 30; if (content.getLength() < 128) minConfidenceLevel = 8; // Get detected types int confidence = 0; if (cm != null) { confidence = cm.getConfidence(); if (confidence > minConfidenceLevel) type = ContentType.parse(cm.getName()); else type = ContentType.P4_BINARY; if (logger.isTraceEnabled()) { logger.trace("icu4j detected:" + cm.getName() + " conf%:" + confidence); } } else { // for unknown or unparseable types type = ContentType.P4_BINARY; } // Check for plain text if (contentStream.isText()) { type = ContentType.P4_TEXT; } content.setDetectedType(type); if (logger.isTraceEnabled()) { logger.trace("setDetectedType:" + type + " (p4type:" + type.getP4Type().toString() + " isText:" + contentStream.isText() + ")"); } // Clean up and return type contentStream.close(); return type; } }
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#5 | 13920 | Paul Allen | copy part 2 (no errors) | ||
#4 | 12476 | Paul Allen |
CVS: Swapped UTF16 downgrade to BINARY. When Translation is disabled, downgrade UTF16 to BINARY. |
||
#3 | 12469 | Paul Allen | CVS: When Translation is disabled downgrade UTF16 as RAW-TEXT | ||
#2 | 12222 | Paul Allen |
New non-translate mode for high-ascii files. If the P4_TRANSLATE mode is disabled, text files with high-ascii characters are given the new type RAW-TEXT. Raw types are not translated and the content is used as-is (except UTF16/32). Translation is enabled by default: com.p4convert.p4.translate=true. The com.p4convert.p4.translate=false mode is intended for use with non-unicode Perforce servers in a Windows-only client environment. |
||
#1 | 9807 | Paul Allen | Initial import of p4-convert (from change 894340) |