Skip to content
This repository has been archived by the owner on Sep 2, 2021. It is now read-only.

Commit

Permalink
Correct UTF8 detection which was being detected as ISO-8859-1 (#612)
Browse files Browse the repository at this point in the history
* Correct UTF8 detection which was being detected as ISO-8859-1

* File is read only once now in mac

* minor change

* Added back functions which are used in windows

* minor change

* minor change

* minor change

* Added some comments related to BOM

* Added some comments related to BOM
  • Loading branch information
saurabh95 authored and nethip committed Jul 3, 2017
1 parent d1b1c3e commit 14a0758
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 33 deletions.
79 changes: 50 additions & 29 deletions appshell/appshell_extensions_mac.mm
Original file line number Diff line number Diff line change
Expand Up @@ -654,46 +654,62 @@ int32 ReadFile(ExtensionString filename, ExtensionString& encoding, std::string&
if (encoding == "utf8") {
encoding = "UTF-8";
}
NSString* path = [NSString stringWithUTF8String:filename.c_str()];

NSStringEncoding enc;
int32 error = NO_ERROR;

try {
std::ifstream file(filename.c_str());
std::stringstream ss;
ss << file.rdbuf();
contents = ss.str();
std::string detectedCharSet;

NSString* fileContents = nil;
if (encoding == "UTF-8") {
enc = NSUTF8StringEncoding;
NSError* NSerror = nil;
fileContents = [NSString stringWithContentsOfFile:path encoding:enc error:&NSerror];
}

if (fileContents)
{
contents = [fileContents UTF8String];
// Check whether the file starts with a UTF-8 BOM;
// if it does, set preserveBOM to true.
// Note: this re-reads the first 3 bytes of the file
// solely to check for the BOM.
CheckForUTF8BOM(filename, preserveBOM);
return NO_ERROR;
} else {
try {
if (encoding == "UTF-8") {
CharSetDetect ICUDetector;
ICUDetector(contents.c_str(), contents.size(), detectedCharSet);
}
else {
detectedCharSet = encoding;
}
if (detectedCharSet == "UTF-16LE" || detectedCharSet == "UTF-16BE") {
return ERR_UNSUPPORTED_UTF16_ENCODING;
}
if (detectedCharSet != "UTF-8") {
try {
std::ifstream file(filename.c_str());
std::stringstream ss;
ss << file.rdbuf();
contents = ss.str();
std::string detectedCharSet;
try {
if (encoding == "UTF-8") {
CharSetDetect ICUDetector;
ICUDetector(contents.c_str(), contents.size(), detectedCharSet);
}
else {
detectedCharSet = encoding;
}
if (detectedCharSet == "UTF-16LE" || detectedCharSet == "UTF-16BE") {
return ERR_UNSUPPORTED_UTF16_ENCODING;
}
if (!detectedCharSet.empty()) {
std::transform(detectedCharSet.begin(), detectedCharSet.end(), detectedCharSet.begin(), ::toupper);
DecodeContents(contents, detectedCharSet);
encoding = detectedCharSet;
} catch (...) {
error = ERR_DECODE_FILE_FAILED;
}
}
else {
CheckAndRemoveUTF8BOM(contents, preserveBOM);
else {
error = ERR_UNSUPPORTED_ENCODING;
}
} catch (...) {
error = ERR_UNSUPPORTED_ENCODING;
}
} catch (...) {
error = ERR_UNSUPPORTED_ENCODING;
error = ERR_CANT_READ;
}
} catch (...) {
error = ERR_CANT_READ;
}

return error;
}
return error;}

int32 WriteFile(ExtensionString filename, std::string contents, ExtensionString encoding, bool preserveBOM)
{
Expand All @@ -711,13 +727,18 @@ int32 WriteFile(ExtensionString filename, std::string contents, ExtensionString
error = ERR_ENCODE_FILE_FAILED;
}
} else if (encoding == "UTF-8" && preserveBOM) {
// The file originally contained BOM chars
// so we prepend BOM chars
contents = UTF8_BOM + contents;
}

try {
std::ofstream file;
file.open (filenameStr);
file << contents;
if (file.fail()) {
error = ERR_CANT_WRITE;
}
file.close();
} catch (...) {
return ERR_CANT_WRITE;
Expand Down
24 changes: 22 additions & 2 deletions appshell/appshell_extensions_platform.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "appshell/appshell_extensions_platform.h"
#include <unicode/ucsdet.h>
#include <unicode/ucnv.h>
#include <fstream>

#define UTF8_BOM "\xEF\xBB\xBF"

Expand Down Expand Up @@ -92,9 +93,28 @@ void DecodeContents(std::string &contents, const std::string& encoding) {
#endif

// Strips a leading UTF-8 byte-order mark from `contents`, if one is present.
//
// contents    - raw file bytes; the first three bytes are erased in place
//               when they equal UTF8_BOM.
// preserveBOM - set to true when a BOM was found and removed; left untouched
//               otherwise.
void CheckAndRemoveUTF8BOM(std::string& contents, bool& preserveBOM) {
    // compare() clamps the range to the string's length, so inputs shorter
    // than three bytes simply fail the comparison.
    if (contents.compare(0, 3, UTF8_BOM) == 0) {
        // Record the BOM before stripping it; testing after the erase would
        // only match a file that started with two consecutive BOMs.
        preserveBOM = true;
        contents.erase(0, 3);
    }
}

// Best-effort probe for a UTF-8 byte-order mark at the start of a file.
//
// filename    - path of the file to inspect.
// preserveBOM - set to true when the file's first three bytes are
//               0xEF 0xBB 0xBF; left untouched when the file cannot be read,
//               is shorter than three bytes, or starts with anything else.
void CheckForUTF8BOM(const std::string& filename, bool& preserveBOM) {
    try {
        std::ifstream input(filename.c_str());
        const int bomBytes[3] = { 0xef, 0xbb, 0xbf };
        for (int i = 0; i < 3; ++i) {
            // get() yields EOF (a negative value) on failure, which can
            // never equal a BOM byte, so a short file exits here.
            if (!input.good() || input.get() != bomBytes[i]) {
                return;
            }
        }
        preserveBOM = true;
    }
    catch (...) {
        // Intentionally swallowed: a failed probe leaves preserveBOM as-is.
    }
}

2 changes: 2 additions & 0 deletions appshell/appshell_extensions_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ void DecodeContents(std::string &contents, const std::string& encoding);

void CheckAndRemoveUTF8BOM(std::string& contents, bool& preserveBOM);

void CheckForUTF8BOM(const std::string& filename, bool& preserveBOM);

// Native extension code. These are implemented in appshell_extensions_mac.mm
// and appshell_extensions_win.cpp
int32 OpenLiveBrowser(ExtensionString argURL, bool enableRemoteDebugging);
Expand Down
10 changes: 8 additions & 2 deletions appshell/appshell_extensions_win.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -985,7 +985,7 @@ int32 ReadFile(ExtensionString filename, ExtensionString& encoding, std::string&
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
detectedCharSet = conv.to_bytes(encoding);
}
if (detectedCharSet == "UTF-16LE" || detectedCharSet == "UTF-16LE") {
if (detectedCharSet == "UTF-16LE" || detectedCharSet == "UTF-16BE") {
return ERR_UNSUPPORTED_UTF16_ENCODING;
}
std::transform(detectedCharSet.begin(), detectedCharSet.end(), detectedCharSet.begin(), ::toupper);
Expand All @@ -1012,7 +1012,10 @@ int32 ReadFile(ExtensionString filename, ExtensionString& encoding, std::string&
}
}
if (encoding == L"UTF-8") {
CheckAndRemoveUTF8BOM(contents, preserveBOM);
// If the file starts with BOM chars, then
// we set preserveBOM to true, so that
// while writing we can prepend the BOM
CheckAndRemoveUTF8BOM(contents, preserveBOM);
}
}
else {
Expand Down Expand Up @@ -1080,6 +1083,9 @@ int32 WriteFile(ExtensionString filename, std::string contents, ExtensionString
error = ERR_ENCODE_FILE_FAILED;
}
}
// We check if the file originally contained BOM chars
// if yes, then we prepend BOM chars to file
// Currently BOM is supported only for UTF-8
if (encoding == L"UTF-8" && preserveBOM) {
contents = UTF8_BOM + contents;
}
Expand Down

0 comments on commit 14a0758

Please sign in to comment.