Skip to content

Commit

Permalink
编码识别改进
Browse files Browse the repository at this point in the history
  • Loading branch information
liyujiang-gzu committed Aug 7, 2020
1 parent e13fbf2 commit d70cfd9
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 14 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@
implementation 'com.github.gzu-liyujiang:CJKCharsetDetector:latest.version'
}
```
```java
Charset charset = CJKCharsetDetector.detect(new FileInputStream(file));
String str = new String(bytes, charset.name());
if (CJKCharsetDetector.inWrongEncoding(str)) {
System.err.println("File was loaded using wrong encoding: " + charset.name());
}
```
## License

```text
Expand Down
4 changes: 4 additions & 0 deletions library/ASCII.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ASCII (American Standard Code for Information Interchange) is a computer character encoding based on the Latin alphabet, which is mainly used to display modern English and other Western European languages.

Powered by gzu-liyujiang
2020/8/7
8 changes: 8 additions & 0 deletions library/UTF-8-BOM.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
KOI8-R представляет собой кодирование 8 - битного текста на славянском языке серии KOI-8 для использования на русском и болгарском языках. 

 до того, как Unicode не стал популярным, KOI8-R был наиболее широко используемым русским кодом, который даже выше стандарта ISO-8859-5. 

我爱中国 我愛中國 中国を愛しています Я люблю китай I love China.

Powered by 貴州穿青人李裕江
2020年8月7日
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ private void guessCharset(InputStream inputStream, int language) throws Exceptio
if (isAscii) {
alreadyFound = true;
if (DEBUG) {
System.out.println("Is ASCII");
System.out.println("ASCII first: true");
}
probableCharset = "ASCII";
return;
Expand All @@ -129,18 +129,23 @@ private void guessCharset(InputStream inputStream, int language) throws Exceptio
if (DEBUG) {
System.out.println("Probable charsets: " + Arrays.toString(probableCharsets));
}
// Start with the first probable charset, then filter through the remaining candidates
probableCharset = probableCharsets[0];
for (String itCharset : probableCharsets) {
if (!itCharset.startsWith("UTF") && !itCharset.startsWith("GB18030")) {
// 可能有多个字符集的情况,范围比较大的UTF系列及GB18030优先级靠后
// [UTF-16LE, Big5, GB18030, UTF-16BE]
// [GB18030, Shift_JIS, UTF-16BE]
// “UTF-16LE、UTF-16BE、GB18030”这几种范围比较大,目前并不常用,优先级靠后
// [UTF-16BE, Big5, GB18030]
// [UTF-16LE, Big5, GB18030, UTF-16BE]
// [GB18030, Shift_JIS, UTF-16BE]
if (!(itCharset.startsWith("UTF-16") || itCharset.startsWith("GB18030"))) {
probableCharset = itCharset;
break;
}
}
}
if ("nomatch".equals(probableCharset)) {
if (DEBUG) {
System.out.println("Charset no match");
}
throw new Exception("no match");
}
alreadyFound = false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import org.junit.Assert;
import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
Expand All @@ -38,19 +39,24 @@ public class JUnitTest {
CJKCharsetDetector.DEBUG = true;
}

@Test
public final void detectASCII() {
    // Run the shared helper against the "ASCII" fixture and require it to report success.
    final boolean detected = guessCharset("ASCII");
    Assert.assertTrue(detected);
}

@Test
public final void detectUTF8() {
    // Run the shared helper against the "UTF-8" fixture and require it to report success.
    final boolean detected = guessCharset("UTF-8");
    Assert.assertTrue(detected);
}

@Test
public final void detectGBK() {
Assert.assertTrue(guessCharset("GBK"));
public final void detectUTF8WithBOM() {
Assert.assertTrue(guessCharset("UTF-8-BOM", "UTF-8"));
}

@Test
public final void detectBig5() {
Assert.assertTrue(guessCharset("Big5"));
public final void detectGBK() {
Assert.assertTrue(guessCharset("GBK"));
}

@Test
Expand All @@ -63,6 +69,11 @@ public final void detectGB18030() {
Assert.assertTrue(guessCharset("GB18030"));
}

@Test
public final void detectBig5() {
    // Run the shared helper against the "Big5" fixture and require it to report success.
    final boolean detected = guessCharset("Big5");
    Assert.assertTrue(detected);
}

@Test
public final void detectShiftJIS() {
Assert.assertTrue(guessCharset("Shift_JIS"));
Expand All @@ -75,22 +86,28 @@ public final void detectEUCKR() {

@Test
public final void detectKOI8R() {
    // NOTE: KOI8-R input has been observed to be detected as Shift_JIS;
    // it is unclear whether the sample file itself is unreliable.
    final boolean detected = guessCharset("KOI8-R");
    Assert.assertTrue(detected);
}

/**
 * Convenience overload that forwards {@code charsetName} as both arguments
 * of the two-argument {@code guessCharset(fileName, charsetName)} overload.
 *
 * @param charsetName name used for both the fixture file name and the charset name
 * @return the result of the two-argument overload
 */
private static boolean guessCharset(String charsetName) {
    final boolean outcome = guessCharset(charsetName, charsetName);
    return outcome;
}

private static boolean guessCharset(String fileName, String charsetName) {
System.out.println("---------------------------------");
try {
System.out.println("Origin charset: " + charsetName);
File file = new File(System.getProperty("user.dir"), charsetName + ".txt");
File file = new File(System.getProperty("user.dir"), fileName + ".txt");
System.out.println("Target file: " + file);
Charset charset = CJKCharsetDetector.detect(new FileInputStream(file));
assert charset != null;
System.out.println("Detect charset: " + charset);
byte[] bytes = readBytes(new FileInputStream(file));
System.out.println("Bytes length: " + bytes.length);
//Charset charset = CJKCharsetDetector.detect(new FileInputStream(file));
Charset charset = CJKCharsetDetector.detect(new ByteArrayInputStream(bytes));
assert charset != null;
System.out.println("Detect charset: " + charset);
String str = new String(bytes, charset.name());
System.out.println("Display text: " + str);
System.out.println("Display text: \n**********\n" + str.trim() + "\n**********");
if (CJKCharsetDetector.inWrongEncoding(str)) {
System.err.println("File was loaded in the wrong encoding: " + charset.name());
return false;
Expand Down

0 comments on commit d70cfd9

Please sign in to comment.