How to read a UTF-16 file into a wstring using wfstream
It's because the BOM has to be written/read in binary whereas the text is just done in text mode..
You can use something like this to close and reopen the file, or else do it manually. Otherwise you might have to use C++11 or the WinAPI. The idea is to read/write the BOM in binary mode and then read/write the rest of the file in text mode. It works that way; I've tested it. Otherwise you're going to have to do the conversions yourself.
#include <iostream>
#include <vector>
#include <fstream>
/// Thin wrapper around std::basic_fstream that remembers the file path so the
/// same file can be closed and reopened with a different open mode (for
/// example binary mode for a BOM and text mode for the payload).
///
/// @tparam T      Character type of the underlying stream.
/// @tparam Traits Character traits of the underlying stream.
template<typename T, typename Traits = std::char_traits<T>>
class ModFStream
{
private:
    std::string filepath;
    std::basic_fstream<T, Traits> stream;
    std::ios_base::openmode mode;   // mode the stream was most recently opened with

public:
    /// Creates a wrapper with no associated file; the stream is not open.
    ModFStream() : stream(), mode() {}

    /// Opens FilePath with the given mode and remembers both for later reopening.
    ModFStream(const std::string &FilePath, std::ios_base::openmode mode) : filepath(FilePath), stream(FilePath, mode), mode(mode) {}

    /// Direct access to the wrapped stream (for seek, ignore, state checks, ...).
    inline std::basic_fstream<T, Traits>& get() {return stream;}

    /// Closes the file and reopens the remembered path with a new mode.
    ///
    /// Reopening starts again at the beginning of the file, and a mode of
    /// `out` without `app` or `in` truncates the existing contents.
    /// On failure the wrapped stream's failbit is set.
    void setmode(std::ios::openmode mode)
    {
        if (stream.is_open())
            stream.close();
        // Drop stale eof/fail flags so they cannot leak into the reopened stream.
        stream.clear();
        stream.open(filepath, mode);
        // Keep the stored mode in sync with what the stream is actually using.
        this->mode = mode;
    }

    /// Forwards a formatted insertion to the wrapped stream.
    template<typename U>
    ModFStream& operator << (const U& other)
    {
        stream << other;
        return *this;
    }

    /// Accepts stream manipulators such as std::endl and std::flush, which the
    /// generic overload cannot deduce because they are function templates.
    ModFStream& operator << (std::basic_ostream<T, Traits>& (*manip)(std::basic_ostream<T, Traits>&))
    {
        stream << manip;
        return *this;
    }

    /// Forwards a formatted extraction to the wrapped stream.
    template<typename U>
    ModFStream& operator >> (U& other)
    {
        stream >> other;
        return *this;
    }
};
// Demo: write a two-character BOM in binary mode, append the text in text
// mode, then read both back through ModFStream.
int main()
{
    wchar_t bom[] = L"\xFF\xFE";
    std::wstring str = L"Chào";

    // Write the BOM first, in binary mode.
    ModFStream<wchar_t> stream("C:/Users/Brandon/Desktop/UTF16Test.txt", std::ios::out | std::ios::binary);
    stream << bom;

    // Reopen in text mode for the payload. `app` is required here: reopening
    // with `out` alone truncates the file and would discard the BOM above.
    stream.setmode(std::ios::out | std::ios::app);
    stream << str;
    str.clear();

    // Read the BOM back in binary mode.
    stream.setmode(std::ios::in | std::ios::binary);
    stream >> bom[0] >> bom[1];

    // Reopening rewinds to the start of the file, so skip the two BOM
    // characters again before reading the text in text mode.
    stream.setmode(std::ios::in);
    stream.get().ignore(2);
    stream >> str;
    std::wcout<<str;
}
You could write a WinAPI fstream simulator I guess..
#include <cstdlib>
#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>
#include <windows.h>
namespace win
{
/// Compile-time trait: `value` is true only when the character type is
/// exactly `wchar_t`, and false for every other type.
template<typename CharT>
struct is_wide_char : std::integral_constant<bool, false> {};

/// Specialization selecting the `true` case for `wchar_t`.
template<>
struct is_wide_char<wchar_t> : std::integral_constant<bool, true> {};
/// File open flags. Every enumerator occupies its own bit so that values can
/// be combined and tested with bitwise operators.
enum class open_mode
{
    app   = 0x01,
    ate   = 0x02,
    bin   = 0x04,
    in    = 0x08,
    out   = 0x10,
    trunc = 0x20
};
/// Reference point for a seek: beginning of the file, current position, or
/// end of the file. Each enumerator occupies its own bit.
enum class seek_dir
{
    beg = 0x01,
    cur = 0x02,
    end = 0x04
};
// Bitwise operators for open_mode. A scoped enum has none by default, so each
// one converts to int, applies the operation, and converts back.

/// Flags present in both operands.
inline constexpr open_mode operator & (open_mode lhs, open_mode rhs)
{
    return static_cast<open_mode>(static_cast<int>(lhs) & static_cast<int>(rhs));
}

/// Flags present in either operand.
inline constexpr open_mode operator | (open_mode lhs, open_mode rhs)
{
    return static_cast<open_mode>(static_cast<int>(lhs) | static_cast<int>(rhs));
}

/// Flags present in exactly one operand.
inline constexpr open_mode operator ^ (open_mode lhs, open_mode rhs)
{
    return static_cast<open_mode>(static_cast<int>(lhs) ^ static_cast<int>(rhs));
}

/// Bitwise complement of the operand.
inline constexpr open_mode operator~(open_mode value)
{
    return static_cast<open_mode>(~static_cast<int>(value));
}

/// Adds `flags` to `target` and returns a const reference to `target`.
inline const open_mode& operator |= (open_mode& target, open_mode flags)
{
    target = target | flags;
    return target;
}

/// Keeps only the bits of `target` that are also in `flags`.
inline const open_mode& operator &= (open_mode& target, open_mode flags)
{
    target = target & flags;
    return target;
}

/// Toggles the bits of `flags` in `target`.
inline const open_mode& operator ^= (open_mode& target, open_mode flags)
{
    target = target ^ flags;
    return target;
}
/// Overload for text that is already wide: copies it unchanged.
/// A null pointer yields an empty string.
inline std::wstring to_wide_string_impl(const wchar_t* str)
{
    return str ? std::wstring(str) : std::wstring();
}

/// Overload for narrow text: converts it with std::mbstowcs, i.e. according
/// to the current C locale. A null pointer yields an empty string.
/// @throws std::invalid_argument if `str` is not a valid multibyte sequence.
inline std::wstring to_wide_string_impl(const char* str)
{
    if (!str)
        return std::wstring();

    // First pass: ask for the required length without writing anything.
    const std::size_t length = std::mbstowcs(nullptr, str, 0);
    if (length == static_cast<std::size_t>(-1))
        throw std::invalid_argument("to_wide_string: invalid multibyte sequence");

    std::wstring wide(length, L'\0');
    if (length != 0)
        std::mbstowcs(&wide[0], str, length);
    return wide;
}

/// Converts a null-terminated `char` or `wchar_t` string to std::wstring.
///
/// Dispatch is done by overload resolution rather than a runtime `if`, so
/// only the branch matching T is compiled; with a runtime `if` the wide
/// branch would also be instantiated for `char` and fail to compile.
///
/// @param str Null-terminated input string, or null.
/// @return The wide string; empty when `str` is null.
/// @throws std::invalid_argument for an invalid narrow multibyte sequence.
template<typename T>
std::wstring to_wide_string(const T* str)
{
    return to_wide_string_impl(str);
}
template<typename T>
class WinFStream
{
private:
open_mode mode;
HANDLE hFile;
bool binary_mode = false;
public:
WinFStream(const T* FilePath, open_mode mode = open_mode::in | open_mode::out) : mode(mode), hFile(nullptr), binary_mode(false)
{
unsigned int open_flags = 0;
if (static_cast<int>(mode & open_mode::bin))
{
binary_mode = true;
}
if (static_cast<int>(mode & open_mode::in))
{
open_flags |= GENERIC_READ;
}
else if (static_cast<int>(mode & open_mode::app))
{
open_flags |= FILE_APPEND_DATA;
}
if (static_cast<int>(mode & open_mode::out))
{
open_flags |= GENERIC_WRITE;
}
std::wstring path = to_wide_string(FilePath);
hFile = CreateFileW(path.c_str(), open_flags, 0, nullptr, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, nullptr);
if (static_cast<int>(mode & open_mode::ate))
{
SetFilePointer(hFile, 0, nullptr, FILE_END);
}
}
~WinFStream() {CloseHandle(hFile); hFile = nullptr;}
inline std::size_t seekg(std::size_t pos, seek_dir from)
{
return SetFilePointer(hFile, pos, nullptr, static_cast<int>(from) - 1);
}
inline std::size_t tellg()
{
return GetFileSize(hFile, nullptr);
}
void close()
{
CloseHandle(hFile);
hFile = nullptr;
}
template<typename U>
inline std::size_t write(const U* str, std::size_t size)
{
long unsigned int bytes_written = 0;
WriteFile(hFile, &str[0], size * sizeof(U), &bytes_written, nullptr);
return bytes_written;
}
template<typename U>
inline std::size_t read(U* str, std::size_t size)
{
long unsigned int bytes_read = 0;
ReadFile(hFile, &str[0], size * sizeof(U), &bytes_read, nullptr);
return bytes_read;
}
template<typename U>
WinFStream& operator << (const U &other)
{
this->write(&other, 1);
return *this;
}
template<typename U, std::size_t size>
WinFStream& operator << (U (&str)[size])
{
this->write(&str[0], size);
return *this;
}
template<typename U, typename Traits = std::char_traits<U>>
WinFStream& operator << (const std::basic_string<U, Traits>& str)
{
this->write(str.c_str(), str.size());
return *this;
}
template<typename U>
WinFStream& operator >> (U &other)
{
this->read(&other, 1);
return *this;
}
template<typename U, std::size_t size>
WinFStream& operator >> (U (&str)[size])
{
this->read(&str[0], size);
return *this;
}
template<typename U, typename Traits = std::char_traits<U>>
WinFStream& operator >> (std::basic_string<U, Traits>& str)
{
unsigned int i = 0;
std::vector<U> buffer(512, 0);
while(true)
{
long unsigned int bytes_read = 0;
bool result = ReadFile(hFile, &buffer[i], sizeof(U), &bytes_read, nullptr);
if (std::isspace(buffer[i]) || buffer[i] == '\r' || buffer[i] == '\n')
break;
++i;
if (bytes_read != sizeof(U) || !result)
break;
}
str.append(buffer.begin(), buffer.begin() + i);
return *this;
}
};
// Convenience aliases: the W variant takes wide-character paths, the A
// variant takes narrow-character paths.
using WinFStreamW = WinFStream<wchar_t>;
using WinFStreamA = WinFStream<char>;
}
using namespace win;
int main()
{
unsigned char bom[2] = {0XFF, 0xFE};
std::wstring str = L"Chào";
WinFStreamW File(L"C:/Users/Brandon/Desktop/UTF16Test.txt");
File << bom;
File << str;
File.seekg(0, win::seek_dir::beg);
std::wstring str2;
File>>bom;
File>>str2;
std::wcout<<str2;
}
I know, it's dirty and doesn't work exactly the same as fstream, but it was worth my time "trying" to simulate it. But again, my operator << and >> aren't "equivalent" to std::fstream's. You're probably better off just using CreateFileW, ReadFile and WriteFile, or re-opening the file in text mode after writing the BOM in binary mode.
Read Unicode UTF-32 file into wstring
You may want to use the classic C++ pattern of allocating the wifstream on the stack instead of on the heap with `new`; that is, replace the first line below with the second:
std::wifstream* file = new std::wifstream(name, ifstream::binary);
std::wifstream file(name, ifstream::binary);
For the codecvt part, I'd try std::codecvt_utf16<char32_t>.
P.S. Note that wchar_t can have different sizes (16 bits, 32 bits) on different platforms. So it may be better to use std::u16string for UTF-16 and std::u32string for UTF-32.
Can I use wstring to read, parse, and emit utf-8?
Is wstring able to do utf-8?
C++ has standard facilities (wstring_convert) that can convert between wstring and UTF-8 strings. There are also standard functions in both C and C++ (wcstombs, mbstowcs) that may be able to do the same with C wide strings if your system has an appropriate locale. Most POSIX-ish systems do; Windows-based ones normally don't (they have non-standard facilities for that). That's about all wstring and UTF-8 have to do with each other.
Is utf-8 even the thing I should be concerned with?
It depends. If you are living in 1980, or don't do any programming, then probably not. If you don't do any character-level processing and only shuffle entire strings around, you should also be fine. Just use char-based strings and don't worry about any fancy characters. It should all work more or less automatically.
If you do need character-level stuff (substrings, search, ...) you probably do need to be aware of UTF-8. It's probably wise to do all internal processing with either wchar_t or char32_t, and convert from or to UTF-8 upon I/O. (I would just say "use wchar_t" but alas, on Windows wchar_t is broken. You may still be able to get away with it, but no promises.)
If I have a u8"string" or L"string" which contains characters from multiple languages, how would I write this to file using only the C standard IO library?
You cannot do much about u8"string" in C. In C++, such literals are normal char-based strings and can be written like any other string, and will do the right thing. (You may have to jump through some hoops on Windows; see the _setmode and _O_U8TEXT docs.) This is, however, of minor importance. You normally don't need to have any fancy characters in string literals. All user-facing strings should be loaded from files.
With wchar_t based strings, you may or may not be able to output UTF-8 directly, depending on your OS and compiler. You can always convert to UTF-8 and output that.
If you are willing to use third-party libraries, consider using http://utfcpp.sourceforge.net/
Also read:
http://utf8everywhere.org
http://www.joelonsoftware.com/articles/Unicode.html
Confused about C++'s std::wstring, UTF-16, UTF-8 and displaying strings in a windows GUI
Windows from NT4 onwards is based on Unicode encoded strings, yes. Early versions were based on UCS-2, which is the predecessor of UTF-16, and thus does not support all of the characters that UTF-16 does. Later versions are based on UTF-16. Not all OSes are based on UTF-16/UCS-2, though. *nix systems, for instance, are based on UTF-8 instead.
UTF-8 is a very good choice for storing data persistently. It is a universally supported encoding in all Unicode environments, and it is a good balance between data size and loss-less data compatibility.
Yes, you would have to parse the XML, extract the necessary information from it, and decode and transform it into something the UI can use.
Related Topics
Statically Linking System Libraries, Libc, Pthreads, to Aid in Debugging
Openprocess/Readprocessmemory/Writeprocessmemory/Closehandle Equivalent
Make Shared_Ptr Not Use Delete
How to Initialize Std::Array<T, N> Elegantly If T Is Not Default Constructible
In Which Order Should Floats Be Added to Get the Most Precise Result
How to Use C++ Classes with Ctypes
Std::Lock_Guard or Std::Scoped_Lock
C++: Simple Return Value from Std::Thread
Making Std::Vector Allocate Aligned Memory
Why Isn't Malloc Filling Up Memory
Pthread Condition Variables Not Signalling Even Though Set to Pthread_Process_Shared
Selecting a Member Function Using Different Enable_If Conditions
Store Functions with Different Signatures in a Map
When to Use the Brace-Enclosed Initializer
Send and Receive a File in Socket Programming in Linux with C/C++ (Gcc/G++)