Read Unicode Utf-8 File into Wstring

How to read a UTF-16 file into a wstring using wfstream

It's because the BOM has to be written/read in binary mode, whereas the text itself is written/read in text mode.

You can use something like this to close and reopen the file, or else do it manually. Otherwise you might have to use C++11 or WinAPI. The idea is to read/write the BOM in binary mode and then read/write the rest of the file in text mode. It works that way. I've tested it. Otherwise you're going to have to do conversions.

#include <iostream>
#include <vector>
#include <fstream>

/// Thin wrapper around std::basic_fstream that remembers the path it was
/// opened with, so the same file can be closed and reopened in another mode.
///
/// The class follows the Rule of Zero: it declares no destructor, so the
/// compiler-generated move operations stay available (the wrapped stream
/// already closes itself on destruction).
///
/// @tparam T      Character type of the underlying stream.
/// @tparam Traits Character traits forwarded to std::basic_fstream.
template<typename T, typename Traits = std::char_traits<T>>
class ModFStream
{
private:
    std::string filepath;                  // path reused by setmode()
    std::basic_fstream<T, Traits> stream;  // the wrapped stream
    std::ios_base::openmode mode{};        // mode of the most recent open request

public:
    /// Creates a wrapper with no path and an unopened stream.
    ModFStream() = default;

    /// Opens FilePath with the given mode and remembers both.
    ModFStream(const std::string &FilePath, std::ios_base::openmode mode) : filepath(FilePath), stream(FilePath, mode), mode(mode) {}

    /// @return The wrapped stream, for operations this wrapper does not expose.
    inline std::basic_fstream<T, Traits>& get() {return stream;}

    /// Closes the file (if it is open) and reopens the remembered path with
    /// the new mode. The read/write position therefore restarts wherever the
    /// new mode places it; failure to open is reported through the stream's
    /// failbit, exactly as std::basic_fstream::open does.
    ///
    /// @param mode Open mode to use for the reopened file.
    void setmode(std::ios::openmode mode)
    {
        // Closing a stream that is not open would only raise failbit.
        if (stream.is_open())
            stream.close();

        stream.open(filepath, mode);
        this->mode = mode;  // keep the stored mode in sync with the stream
    }

    /// Forwards formatted output to the wrapped stream.
    template<typename U>
    ModFStream& operator << (const U& other)
    {
        stream << other;
        return *this;
    }

    /// Forwards formatted input to the wrapped stream.
    template<typename U>
    ModFStream& operator >> (U& other)
    {
        stream >> other;
        return *this;
    }
};

// Demo: write a two-unit BOM followed by a wide string, then read both back.
int main()
{
    // Two BOM code units; the array's trailing L'\0' is not written because
    // the stream inserter treats the array as a C string.
    wchar_t bom[] = L"\xFF\xFE";
    std::wstring str = L"Chào";

    // 1) Write the BOM in binary mode.
    ModFStream<wchar_t> stream("C:/Users/Brandon/Desktop/UTF16Test.txt", std::ios::out | std::ios::binary);
    stream << bom;

    // 2) Reopen in text mode for the payload. `app` is required here: opening
    //    with `out` alone truncates the file and would discard the BOM that
    //    was just written.
    stream.setmode(std::ios::out | std::ios::app);
    stream << str;

    // 3) Read the BOM back in binary mode.
    str.clear();
    stream.setmode(std::ios::in | std::ios::binary);
    stream >> bom[0] >> bom[1];

    // 4) Reopen in text mode for the payload. Reopening rewinds to the start
    //    of the file, so the two BOM units must be skipped again; otherwise
    //    they end up at the front of `str`.
    //    NOTE(review): assumes each BOM unit occupies one stream character
    //    under the stream's locale - confirm on the target toolchain.
    stream.setmode(std::ios::in);
    stream.get().ignore(2);
    stream >> str;

    std::wcout << str;
}

You could write a WinAPI fstream simulator I guess..

#include <cctype>
#include <cstdlib>
#include <cwctype>
#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>
#include <windows.h>

namespace win
{
// Compile-time predicate: `value` is true only when T is exactly wchar_t
// (cv-qualified variants and every other character type yield false).
template<typename T>
struct is_wide_char : std::integral_constant<bool, std::is_same<T, wchar_t>::value> {};

/// Bit flags describing how a file should be opened. Each enumerator owns a
/// distinct bit, so values can be combined with the operators defined below.
enum class open_mode
{
    app = 1L << 0,
    ate = 1L << 1,
    bin = 1L << 2,
    in = 1L << 3,
    out = 1L << 4,
    trunc = 1L << 5
};

/// Reference point for a seek request: beginning, current position, or end.
enum class seek_dir
{
    beg = 1L << 0,
    cur = 1L << 1,
    end = 1L << 2
};

// Bitwise operators for open_mode. A scoped enum has no implicit conversion
// to its underlying type, so each operator round-trips through int.
inline constexpr open_mode operator & (open_mode a, open_mode b) {return open_mode(static_cast<int>(a) & static_cast<int>(b));}
inline constexpr open_mode operator | (open_mode a, open_mode b) {return open_mode(static_cast<int>(a) | static_cast<int>(b));}
inline constexpr open_mode operator ^ (open_mode a, open_mode b) {return open_mode(static_cast<int>(a) ^ static_cast<int>(b));}
inline constexpr open_mode operator~(open_mode a) {return open_mode(~static_cast<int>(a));}

// Compound assignments return a mutable reference to the left operand, like
// the built-in operators, so expressions such as (a |= b) &= c are valid.
inline open_mode& operator |= (open_mode& a, open_mode b) {return a = a | b;}
inline open_mode& operator &= (open_mode& a, open_mode b) {return a = a & b;}
inline open_mode& operator ^= (open_mode& a, open_mode b) {return a = a ^ b;}

/// Helper for input that is already wide: a plain copy.
inline std::wstring to_wide_string_impl(const wchar_t* str)
{
    return std::wstring(str);
}

/// Helper for narrow input: converts with std::mbstowcs, which interprets the
/// bytes according to the current C locale.
///
/// @throws std::range_error if the input is not a valid multibyte sequence.
inline std::wstring to_wide_string_impl(const char* str)
{
    // First pass only measures; mbstowcs reports failure as (size_t)-1, which
    // must not be mistaken for a length.
    const std::size_t length = std::mbstowcs(nullptr, str, 0);
    if (length == static_cast<std::size_t>(-1))
        throw std::range_error("to_wide_string: invalid multibyte sequence");

    std::wstring wide(length, L'\0');
    if (length != 0)
        std::mbstowcs(&wide[0], str, length);
    return wide;
}

/// Converts a null-terminated narrow or wide C string to std::wstring.
///
/// Dispatch happens through overloads so that only the branch matching T is
/// ever compiled; a runtime `if` would also instantiate the other branch and
/// fail to compile for T = char.
///
/// @param str Null-terminated string; a null pointer yields an empty result.
/// @return The wide-character equivalent of str.
/// @throws std::range_error if narrow input cannot be converted.
template<typename T>
std::wstring to_wide_string(const T* str)
{
    if (str == nullptr)
        return std::wstring();

    return to_wide_string_impl(str);
}

template<typename T>
class WinFStream
{
private:
open_mode mode;
HANDLE hFile;
bool binary_mode = false;

public:
WinFStream(const T* FilePath, open_mode mode = open_mode::in | open_mode::out) : mode(mode), hFile(nullptr), binary_mode(false)
{
unsigned int open_flags = 0;

if (static_cast<int>(mode & open_mode::bin))
{
binary_mode = true;
}

if (static_cast<int>(mode & open_mode::in))
{
open_flags |= GENERIC_READ;
}
else if (static_cast<int>(mode & open_mode::app))
{
open_flags |= FILE_APPEND_DATA;
}

if (static_cast<int>(mode & open_mode::out))
{
open_flags |= GENERIC_WRITE;
}

std::wstring path = to_wide_string(FilePath);
hFile = CreateFileW(path.c_str(), open_flags, 0, nullptr, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, nullptr);

if (static_cast<int>(mode & open_mode::ate))
{
SetFilePointer(hFile, 0, nullptr, FILE_END);
}
}

~WinFStream() {CloseHandle(hFile); hFile = nullptr;}

inline std::size_t seekg(std::size_t pos, seek_dir from)
{
return SetFilePointer(hFile, pos, nullptr, static_cast<int>(from) - 1);
}

inline std::size_t tellg()
{
return GetFileSize(hFile, nullptr);
}

void close()
{
CloseHandle(hFile);
hFile = nullptr;
}

template<typename U>
inline std::size_t write(const U* str, std::size_t size)
{
long unsigned int bytes_written = 0;
WriteFile(hFile, &str[0], size * sizeof(U), &bytes_written, nullptr);
return bytes_written;
}

template<typename U>
inline std::size_t read(U* str, std::size_t size)
{
long unsigned int bytes_read = 0;
ReadFile(hFile, &str[0], size * sizeof(U), &bytes_read, nullptr);
return bytes_read;
}

template<typename U>
WinFStream& operator << (const U &other)
{
this->write(&other, 1);
return *this;
}

template<typename U, std::size_t size>
WinFStream& operator << (U (&str)[size])
{
this->write(&str[0], size);
return *this;
}

template<typename U, typename Traits = std::char_traits<U>>
WinFStream& operator << (const std::basic_string<U, Traits>& str)
{
this->write(str.c_str(), str.size());
return *this;
}

template<typename U>
WinFStream& operator >> (U &other)
{
this->read(&other, 1);
return *this;
}

template<typename U, std::size_t size>
WinFStream& operator >> (U (&str)[size])
{
this->read(&str[0], size);
return *this;
}

template<typename U, typename Traits = std::char_traits<U>>
WinFStream& operator >> (std::basic_string<U, Traits>& str)
{
unsigned int i = 0;
std::vector<U> buffer(512, 0);

while(true)
{
long unsigned int bytes_read = 0;
bool result = ReadFile(hFile, &buffer[i], sizeof(U), &bytes_read, nullptr);

if (std::isspace(buffer[i]) || buffer[i] == '\r' || buffer[i] == '\n')
break;

++i;

if (bytes_read != sizeof(U) || !result)
break;
}

str.append(buffer.begin(), buffer.begin() + i);
return *this;
}
};

// Convenience aliases: W takes wide-character paths, A takes narrow ones.
using WinFStreamW = WinFStream<wchar_t>;
using WinFStreamA = WinFStream<char>;

}


using namespace win;

int main()
{
unsigned char bom[2] = {0XFF, 0xFE};
std::wstring str = L"Chào";

WinFStreamW File(L"C:/Users/Brandon/Desktop/UTF16Test.txt");
File << bom;
File << str;


File.seekg(0, win::seek_dir::beg);

std::wstring str2;
File>>bom;
File>>str2;

std::wcout<<str2;
}

I know, it's dirty and doesn't work the exact same as fstream but it was worth my time "trying" to simulate it..

But again, my operator << and >> aren't "equivalent" to std::fstream's..

You're probably better off just using CreateFileW, ReadFile, WriteFile or re-opening the file in text mode after writing the bom in binary mode..

Read Unicode UTF-32 file into wstring

You may want to use a classic C++ pattern of allocating wifstream on the stack instead of the heap (new):

// Heap-allocated through a raw pointer; it must be deleted by hand later:
std::wifstream* file = new std::wifstream(name, ifstream::binary);
// Stack-allocated alternative; the stream is destroyed when it goes out of scope:
std::wifstream file(name, ifstream::binary);

For the codecvt part, I'd try with std::codecvt_utf16<char32_t>.

P.S. Note that wchar_t can have different sizes (16 bits, 32 bits) on different platforms. So it may be better for you to use std::u16string for UTF-16 and std::u32string for UTF-32.

Can I use wstring to read, parse, and emit utf-8?


Is wstring able to do utf-8?

C++ has standard functions (wstring_convert) that are able to convert between wstring and UTF-8 strings. There are also standard functions in both C and C++ (wcstombs, mbstowcs) that may be able to do the same with C wide strings if your system has an appropriate locale. Most POSIX-ish systems do; Windows-based ones normally don't (they have non-standard facilities for that). That's about all wstring and UTF-8 have to do with each other.

Is utf-8 even the thing I should be concerned with?

It depends. If you are living in 1980, or don't do any programming, then probably not. If you don't do any character-level processing, and only shuffle entire strings, you should also be fine. Just use char-based strings and don't worry about any fancy characters. It all should work more or less automatically.

If you do need character-level stuff (substrings, search, ...) you probably do need to be aware of UTF-8. It's probably wise to do all internal processing with either wchar_t or char32_t, and convert from or to UTF-8 upon I/O. (I would just say "use wchar_t" but alas, on Windows wchar_t is broken. You may still be able to get away with it, but no promises.)

If I have a u8"string" or L"string" which contains characters from multiple languages, how would I write this to file using only the C standard IO library?

You cannot do much about u8"string" in C. In C++, they are normal char-based strings and can be written as any other string, and do the right thing. (You may have to jump through some hoops on Windows, see _setmode and _O_U8TEXT docs). This is, however, of minor importance. You normally don't need to have any fancy characters in string literals. All user-facing strings should be loaded from files.

With wchar_t based strings, you may or may not be able to output UTF-8 directly, depending on your OS and compiler. You can always convert to UTF-8 and output that.

If you are willing to use third-party libraries, consider using http://utfcpp.sourceforge.net/

Also read:
http://utf8everywhere.org
http://www.joelonsoftware.com/articles/Unicode.html

Confused about C++'s std::wstring, UTF-16, UTF-8 and displaying strings in a windows GUI

Windows from NT4 onwards is based on Unicode encoded strings, yes. Early versions were based on UCS-2, which is the predecessor of UTF-16, and thus does not support all of the characters that UTF-16 does. Later versions are based on UTF-16. Not all OSes are based on UTF-16/UCS-2, though. *nix systems, for instance, are based on UTF-8 instead.

UTF-8 is a very good choice for storing data persistently. It is a universally supported encoding in all Unicode environments, and it is a good balance between data size and loss-less data compatibility.

Yes, you would have to parse the XML, extract the necessary information from it, and decode and transform it into something the UI can use.



Related Topics



Leave a reply



Submit