UnicodeReader.java 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. package com.yihu.ehr.util.encode;
  2. /**
  3. * UTF-8 encoding utility class; works around the BOM (byte order mark) problem
  4. * Created by HZY on 2015/8/12.
  5. */
  6. import java.io.*;
  7. /**
  8. * Generic unicode textreader, which will use BOM mark to identify the encoding
  9. * to be used. If BOM is not found then use a given default or system encoding.
  10. */
  11. public class UnicodeReader extends Reader {
  12. PushbackInputStream internalIn;
  13. InputStreamReader internalIn2 = null;
  14. String defaultEnc;
  15. private static final int BOM_SIZE = 4;
  16. /**
  17. *
  18. * @param in
  19. * inputstream to be read
  20. * @param defaultEnc
  21. * default encoding if stream does not have BOM marker. Give NULL
  22. * to use system-level default.
  23. */
  24. public UnicodeReader(InputStream in, String defaultEnc) {
  25. internalIn = new PushbackInputStream(in, BOM_SIZE);
  26. this.defaultEnc = defaultEnc;
  27. }
  28. public String getDefaultEncoding() {
  29. return defaultEnc;
  30. }
  31. /**
  32. * Get stream encoding or NULL if stream is uninitialized. Call init() or
  33. * read() method to initialize it.
  34. */
  35. public String getEncoding() {
  36. if (internalIn2 == null)
  37. return null;
  38. return internalIn2.getEncoding();
  39. }
  40. /**
  41. * Read-ahead four bytes and check for BOM marks. Extra bytes are unread
  42. * back to the stream, only BOM bytes are skipped.
  43. */
  44. protected void init() throws IOException {
  45. if (internalIn2 != null)
  46. return;
  47. String encoding;
  48. byte bom[] = new byte[BOM_SIZE];
  49. int n, unread;
  50. n = internalIn.read(bom, 0, bom.length);
  51. if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
  52. && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
  53. encoding = "UTF-32BE";
  54. unread = n - 4;
  55. } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
  56. && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
  57. encoding = "UTF-32LE";
  58. unread = n - 4;
  59. } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
  60. && (bom[2] == (byte) 0xBF)) {
  61. encoding = "UTF-8";
  62. unread = n - 3;
  63. } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
  64. encoding = "UTF-16BE";
  65. unread = n - 2;
  66. } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
  67. encoding = "UTF-16LE";
  68. unread = n - 2;
  69. } else {
  70. // Unicode BOM mark not found, unread all bytes
  71. encoding = defaultEnc;
  72. unread = n;
  73. }
  74. // System.out.println("read=" + n + ", unread=" + unread);
  75. if (unread > 0)
  76. internalIn.unread(bom, (n - unread), unread);
  77. // Use given encoding
  78. if (encoding == null) {
  79. internalIn2 = new InputStreamReader(internalIn);
  80. } else {
  81. internalIn2 = new InputStreamReader(internalIn, encoding);
  82. }
  83. }
  84. public void close() throws IOException {
  85. init();
  86. internalIn2.close();
  87. }
  88. public int read(char[] cbuf, int off, int len) throws IOException {
  89. init();
  90. return internalIn2.read(cbuf, off, len);
  91. }
  92. }