Detect a file’s encoding
How to auto-detect a file’s encoding
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;

/**
 * Guesses the encoding of a file by strictly decoding its whole content with
 * each candidate charset and keeping the first candidate that decodes without
 * any malformed or unmappable byte sequence.
 *
 * <p>This is a validity check, not a statistical detector: several charsets
 * may accept the same bytes, so the order of the candidates decides the result.
 */
public class CharsetDetectorUtils {

    /** Size, in bytes and chars, of the work buffers used while decoding. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Returns the first charset in {@code charsets} that decodes the whole file
     * without error.
     *
     * @param f        the file to inspect
     * @param charsets candidate charset names, tried in array order
     * @return the first matching charset, or {@code null} when no candidate
     *         decodes the file (or the file cannot be read)
     * @throws java.nio.charset.IllegalCharsetNameException if a name is illegal
     * @throws java.nio.charset.UnsupportedCharsetException if a name is not
     *         supported by this JVM
     */
    public static Charset detectCharset(File f, String[] charsets) {
        for (String charsetName : charsets) {
            Charset detected = detectCharset(f, Charset.forName(charsetName));
            if (detected != null) {
                return detected;
            }
        }
        return null;
    }

    /**
     * Checks whether the complete content of {@code f} is valid in
     * {@code charset}.
     *
     * <p>An empty file contains no invalid sequence and is therefore accepted.
     * This method never throws: any failure to open or read the file yields
     * {@code null}.
     *
     * @param f       the file to inspect
     * @param charset the charset to validate the file against
     * @return {@code charset} when every byte of the file decodes cleanly,
     *         {@code null} otherwise
     */
    public static Charset detectCharset(File f, Charset charset) {
        // try-with-resources closes the stream even when reading fails midway.
        try (InputStream input = new BufferedInputStream(new FileInputStream(f))) {
            // A fresh decoder reports malformed/unmappable input by default
            // instead of silently replacing it, which is what validation needs.
            decodeStrictly(input, charset.newDecoder());
            return charset;
        } catch (CharacterCodingException e) {
            // The content is not valid in this charset.
            return null;
        } catch (Exception e) {
            // Best-effort contract: unreadable file or null argument means
            // "not detected" rather than an exception for the caller.
            return null;
        }
    }

    /**
     * Streams {@code input} through {@code decoder} until end of stream,
     * discarding the decoded characters.
     *
     * @throws CharacterCodingException if the stream contains a sequence that
     *         is malformed or unmappable, including one truncated at the end
     * @throws IOException if reading from the stream fails
     */
    private static void decodeStrictly(InputStream input, CharsetDecoder decoder)
            throws IOException {
        ByteBuffer bytes = ByteBuffer.allocate(BUFFER_SIZE);
        CharBuffer chars = CharBuffer.allocate(BUFFER_SIZE);
        decoder.reset();

        int read;
        while ((read = input.read(bytes.array(), bytes.position(), bytes.remaining())) != -1) {
            // Expose only the bytes actually read, never stale buffer content.
            bytes.position(bytes.position() + read);
            bytes.flip();
            decodeAvailable(decoder, bytes, chars, false);
            // Keep an incomplete trailing multi-byte sequence for the next read.
            bytes.compact();
        }

        // Signal end of input so a dangling partial sequence is reported.
        bytes.flip();
        decodeAvailable(decoder, bytes, chars, true);

        CoderResult flushed;
        do {
            chars.clear();
            flushed = decoder.flush(chars);
        } while (flushed.isOverflow());
    }

    /**
     * Decodes everything currently decodable from {@code bytes}, reusing
     * {@code chars} as a throw-away sink whenever it fills up.
     *
     * @throws CharacterCodingException on malformed or unmappable input
     */
    private static void decodeAvailable(CharsetDecoder decoder, ByteBuffer bytes,
            CharBuffer chars, boolean endOfInput) throws CharacterCodingException {
        CoderResult result;
        do {
            chars.clear();
            result = decoder.decode(bytes, chars, endOfInput);
            if (result.isError()) {
                result.throwException();
            }
        } while (result.isOverflow());
    }
}
Example:
String[] charsetsToBeTested = {"UTF-8", "MS932"};
File f = new File(csvPath);
Charset charset = CharsetDetectorUtils.detectCharset(f, charsetsToBeTested);
File f = new File(csvPath);
Charset charset = CharsetDetectorUtils.detectCharset(f, charsetsToBeTested);
No comments:
Post a Comment