package com.perforce.common.asset;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import com.perforce.config.CFG;
import com.perforce.config.Config;
import com.perforce.svn.parser.Content;

/**
 * Detects the content type of an archive entry by running the ICU4J
 * {@link CharsetDetector} over a scan stream of its content.
 */
public class ScanArchive {

    private static final Logger logger = LoggerFactory.getLogger(ScanArchive.class);

    /**
     * Second argument handed to {@code ContentStreamFactory.scanContentStream}.
     * NOTE(review): presumably the maximum number of bytes to scan (1 MiB) --
     * confirm against the factory.
     */
    private static final long SCAN_SIZE = 1048576L;

    /** Content shorter than this many units (per {@code Content.getLength()}) is "small". */
    private static final int SMALL_CONTENT_LENGTH = 128;

    /** Confidence a match must exceed for normally sized content. */
    private static final int MIN_CONFIDENCE = 30;

    /** Lower bar for small content, where charsets are harder to spot. */
    private static final int MIN_CONFIDENCE_SMALL = 8;

    /**
     * Unicode file type detection by scanning content.
     *
     * <p>The detected type is stored on {@code content} via
     * {@code setDetectedType} and also returned. The scan stream is always
     * closed, including when detection fails with an exception.
     *
     * @param content the content to classify
     * @return the detected type: {@code P4_TEXT} when the scan stream reports
     *         plain text; otherwise the type parsed from the ICU4J match (or
     *         its downgraded form when translation is disabled), or
     *         {@code P4_BINARY} when there is no sufficiently confident match
     * @throws Exception if the scan stream cannot be opened, read or closed
     *         (for example an {@link IOException} from the detector reading
     *         the stream), or if any collaborator fails
     */
    public static ContentType detectType(Content content) throws Exception {
        ContentStream contentStream = ContentStreamFactory.scanContentStream(
                content, SCAN_SIZE);
        try {
            // ICU4J charsetDetector needs a stream supporting mark/reset
            BufferedInputStream bufContent = new BufferedInputStream(
                    (InputStream) contentStream);
            CharsetDetector charsetDetector = new CharsetDetector();
            charsetDetector.setText(bufContent);

            CharsetMatch cm;
            try {
                cm = charsetDetector.detect();
            } catch (ArrayIndexOutOfBoundsException e) {
                // Deliberately treated as "no match"; the content then falls
                // through to the binary/text classification below.
                cm = null;
            }

            // Set confidence (smaller data sets are harder to spot)
            int minConfidenceLevel = (content.getLength() < SMALL_CONTENT_LENGTH)
                    ? MIN_CONFIDENCE_SMALL
                    : MIN_CONFIDENCE;

            // Unknown, unparseable or low-confidence content is binary
            ContentType type = ContentType.P4_BINARY;
            if (cm != null) {
                int confidence = cm.getConfidence();
                if (confidence > minConfidenceLevel) {
                    type = ContentType.parse(cm.getName());
                    if (!(Boolean) Config.get(CFG.P4_TRANSLATE)) {
                        type = downgradeUntranslated(type);
                    }
                }
                if (logger.isTraceEnabled()) {
                    logger.trace("icu4j detected:{} conf%:{}", cm.getName(), confidence);
                }
            }

            // Plain text overrides whatever the charset detector decided
            if (contentStream.isText()) {
                type = ContentType.P4_TEXT;
            }

            content.setDetectedType(type);
            if (logger.isTraceEnabled()) {
                logger.trace("setDetectedType:{} (p4type:{} isText:{})", type,
                        type.getP4Type().toString(), contentStream.isText());
            }
            return type;
        } finally {
            // Release the scan stream on every path, not just on success
            contentStream.close();
        }
    }

    /**
     * Maps a detected type to the type used when translation is disabled:
     * UTF-16/UTF-32 variants become {@code P4_BINARY}, everything else
     * becomes {@code P4_RAW}.
     */
    private static ContentType downgradeUntranslated(ContentType type) {
        switch (type) {
        case UTF_16LE:
        case UTF_16BE:
        case UTF_32LE:
        case UTF_32BE:
            return ContentType.P4_BINARY;
        default:
            return ContentType.P4_RAW;
        }
    }
}
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#1 | 13876 | Paul Allen | Rename/move file(s) | ||
//guest/paul_allen/p4convert-maven/src/com/perforce/common/asset/ScanArchive.java | |||||
#1 | 13873 | Paul Allen | Branching using p4convert-maven | ||
//guest/perforce_software/p4convert/src/com/perforce/common/asset/ScanArchive.java | |||||
#4 | 12476 | Paul Allen |
CVS: Swapped UTF16 downgrade to BINARY. When Translation is disabled downgrade UTF16 to BINARY |
||
#3 | 12469 | Paul Allen | CVS: When Translation is disabled downgrade UTF16 as RAW-TEXT | ||
#2 | 12222 | Paul Allen |
New non-translate mode for high-ascii files. If the P4_TRANSLATE mode is disabled then text files with high-ascii characters are given the new type RAW-TEXT. Raw types are not translated and the content is used as-is. (exception UTF16/32) The default translation configuration is enabled: com.p4convert.p4.translate=true The com.p4convert.p4.translate=false mode is intended for use with non-unicode Perforce servers in a Windows only client environment. |
||
#1 | 9807 | Paul Allen | Initial import of p4-convert (from change 894340) |