package com.perforce.common.asset;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import com.perforce.svn.parser.Content;
public class ScanArchive {
private static Logger logger = LoggerFactory.getLogger(ScanArchive.class);
/**
* Unicode file type detection by scanning content in blocks.
*
* @param content
* @return
* @throws IOException
*/
public static ContentType detectType(Content content) throws Exception {
ContentType type = ContentType.UNKNOWN;
// ICU4J charsetDetector to find all matches
ContentStream contentStream = ContentStreamFactory.scanContentStream(
content, 1048576L);
BufferedInputStream bufContent = new BufferedInputStream(
(InputStream) contentStream);
CharsetDetector charsetDetector = new CharsetDetector();
charsetDetector.setText(bufContent);
CharsetMatch cm;
try {
cm = charsetDetector.detect();
} catch (ArrayIndexOutOfBoundsException e) {
cm = null;
}
// Set confidence (smaller data sets are harder to spot)
int minConfidenceLevel = 30;
if (content.getLength() < 128)
minConfidenceLevel = 8;
// Get detected types
int confidence = 0;
if (cm != null) {
confidence = cm.getConfidence();
if (confidence > minConfidenceLevel)
type = ContentType.parse(cm.getName());
else
type = ContentType.P4_BINARY;
if (logger.isTraceEnabled()) {
logger.trace("icu4j detected:" + cm.getName() + " conf%:"
+ confidence);
}
} else {
// for unknown or unparseable types
type = ContentType.P4_BINARY;
}
// Check for plain text
if (contentStream.isText()) {
type = ContentType.P4_TEXT;
}
content.setDetectedType(type);
if (logger.isTraceEnabled()) {
logger.trace("setDetectedType:" + type + " (p4type:"
+ type.getP4Type().toString() + " isText:"
+ contentStream.isText() + ")");
}
// Clean up and return type
contentStream.close();
return type;
}
}
# |
Change |
User |
Description |
Committed |
|
#1
|
10021 |
Norman Morse |
Create dev branch of p4convert |
|
|
//guest/perforce_software/p4convert/src/com/perforce/common/asset/ScanArchive.java |
#1
|
9807 |
Paul Allen |
Initial import of p4-convert (from change 894340) |
|
|