ScanArchive.java #1

package com.perforce.common.asset;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import com.perforce.config.CFG;
import com.perforce.config.Config;
import com.perforce.svn.parser.Content;

/**
 * Detects the {@link ContentType} of a {@link Content} item by running the
 * ICU4J {@link CharsetDetector} over a scanning content stream.
 */
public class ScanArchive {

	private static final Logger logger = LoggerFactory
			.getLogger(ScanArchive.class);

	/**
	 * Size argument handed to ContentStreamFactory.scanContentStream
	 * (1048576 = 2^20). NOTE(review): presumably a byte limit for the scan;
	 * confirm against ContentStreamFactory.
	 */
	private static final long SCAN_SIZE = 1048576L;

	/** Confidence a charset match must exceed for normal sized content. */
	private static final int DEFAULT_MIN_CONFIDENCE = 30;

	/** Relaxed confidence threshold used for very small content. */
	private static final int SMALL_CONTENT_MIN_CONFIDENCE = 8;

	/** Content whose length is below this value uses the relaxed threshold. */
	private static final int SMALL_CONTENT_LENGTH = 128;

	/**
	 * Unicode file type detection by scanning content in blocks.
	 * <p>
	 * The detected type is stored on the content via
	 * {@code content.setDetectedType(...)} and also returned. The scan stream
	 * is always closed, even when detection fails with an exception.
	 * 
	 * @param content
	 *            the content to scan
	 * @return the detected type: the parsed ICU4J charset when the match
	 *         confidence exceeds the threshold (downgraded to binary/raw when
	 *         translation is disabled), {@code P4_BINARY} when there is no
	 *         sufficiently confident match, or {@code P4_TEXT} whenever the
	 *         scan stream reports plain text
	 * @throws Exception
	 *             if the content cannot be read or the configuration lookup
	 *             fails
	 */
	public static ContentType detectType(Content content) throws Exception {
		ContentStream contentStream = ContentStreamFactory.scanContentStream(
				content, SCAN_SIZE);
		try {
			// ICU4J charsetDetector to find the best match
			BufferedInputStream bufContent = new BufferedInputStream(
					(InputStream) contentStream);
			CharsetMatch cm = detectCharset(bufContent);

			// Set confidence (smaller data sets are harder to spot)
			int minConfidenceLevel = DEFAULT_MIN_CONFIDENCE;
			if (content.getLength() < SMALL_CONTENT_LENGTH) {
				minConfidenceLevel = SMALL_CONTENT_MIN_CONFIDENCE;
			}

			// Unknown, unparseable or low confidence content is binary
			ContentType type = ContentType.P4_BINARY;
			if (cm != null) {
				int confidence = cm.getConfidence();
				if (confidence > minConfidenceLevel) {
					type = ContentType.parse(cm.getName());

					// If translation is disabled, use RAW for unicode files.
					if (!(Boolean) Config.get(CFG.P4_TRANSLATE)) {
						type = downgradeUntranslated(type);
					}
				}
				logger.trace("icu4j detected:{} conf%:{}", cm.getName(),
						confidence);
			}

			// Plain text reported by the scan stream overrides the ICU4J
			// result. Read once, after the detector has consumed the stream.
			boolean plainText = contentStream.isText();
			if (plainText) {
				type = ContentType.P4_TEXT;
			}

			content.setDetectedType(type);

			if (logger.isTraceEnabled()) {
				logger.trace("setDetectedType:{} (p4type:{} isText:{})", type,
						type.getP4Type(), plainText);
			}

			return type;
		} finally {
			// Clean up on both the success and the failure path
			contentStream.close();
		}
	}

	/**
	 * Runs the ICU4J detector over the given stream.
	 * 
	 * @param in
	 *            stream to analyse; must support mark/reset as required by
	 *            {@link CharsetDetector#setText(InputStream)}
	 * @return the best charset match, or {@code null} if the detector found
	 *         none or failed with an ArrayIndexOutOfBoundsException
	 * @throws IOException
	 *             if the stream cannot be read
	 */
	private static CharsetMatch detectCharset(InputStream in)
			throws IOException {
		CharsetDetector charsetDetector = new CharsetDetector();
		charsetDetector.setText(in);
		try {
			return charsetDetector.detect();
		} catch (ArrayIndexOutOfBoundsException e) {
			// The detector can fail on some inputs; treat that the same as
			// "no match" rather than aborting the conversion.
			logger.debug("icu4j charset detection failed; no match used", e);
			return null;
		}
	}

	/**
	 * Maps a detected type to the type used when translation is disabled:
	 * UTF-16/UTF-32 variants become {@code P4_BINARY}, everything else
	 * becomes {@code P4_RAW}.
	 * 
	 * @param detected
	 *            the type parsed from the ICU4J charset name
	 * @return the downgraded type
	 */
	private static ContentType downgradeUntranslated(ContentType detected) {
		switch (detected) {
		case UTF_16LE:
		case UTF_16BE:
		case UTF_32LE:
		case UTF_32BE:
			return ContentType.P4_BINARY;

		default:
			return ContentType.P4_RAW;
		}
	}
}

#	Change	User	Description
#1	13876	Paul Allen	Rename/move file(s)
//guest/paul_allen/p4convert-maven/src/com/perforce/common/asset/ScanArchive.java
#1	13873	Paul Allen	Branching using p4convert-maven
//guest/perforce_software/p4convert/src/com/perforce/common/asset/ScanArchive.java
#4	12476	Paul Allen	CVS: Swapped UTF16 downgrade to BINARY. When Translation is disabled downgrade UTF16 to BINARY
#3	12469	Paul Allen	CVS: When Translation is disabled downgrade UTF16 as RAW-TEXT
#2	12222	Paul Allen	New non-translate mode for high-ascii files. If the P4_TRANSLATE mode is disabled then text files with high-ascii characters are given the new type RAW-TEXT. Raw types are not translated and the content is used as-is. (exception UTF16/32) The default translation configuration is enabled: com.p4convert.p4.translate=true The com.p4convert.p4.translate=false mode is intended for use with non-unicode Perforce servers in a Windows only client environment.
#1	9807	Paul Allen	Initial import of p4-convert (from change 894340)