Detect a file’s encoding
How to auto-detect a file’s encoding
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
/**
 * Guesses the character encoding of a file by trial decoding: a candidate
 * charset is accepted only if the <em>entire</em> file decodes without a
 * malformed-input or unmappable-character error.
 *
 * <p>This is a validity check, not a statistical detector. A charset in which
 * every byte sequence is legal (for example ISO-8859-1) accepts any file, so
 * callers should list the strictest candidates first.
 */
public class CharsetDetectorUtils {

    /** Number of bytes pulled from the file per decoding step. */
    private static final int BUFFER_SIZE = 512;

    /**
     * Returns the first charset, in the order given, under which the whole
     * file decodes cleanly.
     *
     * @param f        file to inspect
     * @param charsets candidate charset names, tried in array order
     * @return the first matching charset, or {@code null} if none matches or
     *         the file cannot be read
     * @throws java.nio.charset.IllegalCharsetNameException if a name is illegal
     * @throws java.nio.charset.UnsupportedCharsetException if a name is not
     *         supported by this JVM
     */
    public static Charset detectCharset(File f, String[] charsets) {
        for (String charsetName : charsets) {
            Charset match = detectCharset(f, Charset.forName(charsetName));
            if (match != null) {
                return match;
            }
        }
        return null;
    }

    /**
     * Checks whether the whole file is valid in the given charset.
     *
     * <p>An empty file contains no invalid sequence and therefore matches any
     * charset.
     *
     * @param f       file to inspect
     * @param charset candidate charset
     * @return {@code charset} if every byte of the file decodes cleanly,
     *         otherwise {@code null}; also {@code null} if the file cannot be
     *         opened or read, or if an argument is {@code null}
     */
    public static Charset detectCharset(File f, Charset charset) {
        // try-with-resources closes the stream on every path, including failures.
        try (BufferedInputStream input = new BufferedInputStream(new FileInputStream(f))) {
            // A fresh decoder reports (rather than replaces) bad input by default.
            return decodesCleanly(input, charset.newDecoder()) ? charset : null;
        } catch (Exception e) {
            // Deliberate best effort: an unreadable file or a null argument means
            // "not detected", matching the long-standing contract of this method.
            return null;
        }
    }

    /**
     * Streams {@code input} through {@code decoder} and reports whether all of
     * it decoded without error.
     */
    private static boolean decodesCleanly(BufferedInputStream input, CharsetDecoder decoder)
            throws IOException {
        ByteBuffer in = ByteBuffer.allocate(BUFFER_SIZE);
        CharBuffer out = CharBuffer.allocate(BUFFER_SIZE);
        decoder.reset();

        int read;
        // Read only into the free part of the buffer so that bytes carried over
        // from the previous round (an incomplete multi-byte sequence) are kept.
        while ((read = input.read(in.array(), in.position(), in.remaining())) != -1) {
            in.position(in.position() + read);
            in.flip();
            if (!decodeAvailable(decoder, in, out, false)) {
                return false;
            }
            // Move any undecoded tail to the front; the next read completes it.
            in.compact();
        }

        // End of file: leftover bytes now count as a truncated sequence.
        in.flip();
        if (!decodeAvailable(decoder, in, out, true)) {
            return false;
        }
        CoderResult flushed;
        do {
            flushed = decoder.flush(out);
            out.clear(); // decoded characters are not needed, only validity
        } while (flushed.isOverflow());
        return !flushed.isError();
    }

    /**
     * Decodes everything currently in {@code in}, discarding the characters
     * produced.
     *
     * @return {@code false} as soon as the decoder reports an error
     */
    private static boolean decodeAvailable(CharsetDecoder decoder, ByteBuffer in,
            CharBuffer out, boolean endOfInput) {
        while (true) {
            CoderResult result = decoder.decode(in, out, endOfInput);
            out.clear(); // decoded characters are not needed, only validity
            if (result.isError()) {
                return false;
            }
            if (result.isUnderflow()) {
                return true;
            }
            // Overflow: output buffer was full; it is cleared, so decode again.
        }
    }
}
Example :
String[] charsetsToBeTested = {"UTF-8", "MS932"};
File f = new File(csvPath);
Charset charset = CharsetDetectorUtils.detectCharset(f, charsetsToBeTested);
File f = new File(csvPath);
Charset charset = CharsetDetectorUtils.detectCharset(f, charsetsToBeTested);
No comments:
Post a Comment