author: madmaxoft@gmail.com <madmaxoft@gmail.com@0a769ca7-a7f5-676a-18bf-c427514a06d6> 2012-08-28 23:59:49 +0200
committer: madmaxoft@gmail.com <madmaxoft@gmail.com@0a769ca7-a7f5-676a-18bf-c427514a06d6> 2012-08-28 23:59:49 +0200
commit: 1651fcd9807f1033a5fc76e60cb17922e687a7b1 (patch)
tree: c9b82453023b52b05a61bcfd4c882ab5f6b60e23 /source/StringUtils.cpp
parent: Fixed a few warnings (diff)
download: cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar
cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar.gz
cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar.bz2
cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar.lz
cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar.xz
cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar.zst
cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.zip
1 files changed, 159 insertions, 0 deletions
diff --git a/source/StringUtils.cpp b/source/StringUtils.cpp
index e3eb95a6e..2ec68550b 100644
--- a/source/StringUtils.cpp
+++ b/source/StringUtils.cpp
@@ -306,3 +306,162 @@ AString & RawBEToUTF8(short * a_RawData, int a_NumShorts, AString & a_UTF8)
 
 
 
+// UTF-8 conversion code adapted from:
+//  http://stackoverflow.com/questions/2867123/convert-utf-16-to-utf-8-under-windows-and-linux-in-c
+
+#define UNI_MAX_BMP         0x0000FFFF
+#define UNI_MAX_UTF16       0x0010FFFF
+#define UNI_MAX_UTF32       0x7FFFFFFF
+#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
+#define UNI_SUR_HIGH_START  0xD800
+#define UNI_SUR_HIGH_END    0xDBFF
+#define UNI_SUR_LOW_START   0xDC00
+#define UNI_SUR_LOW_END     0xDFFF
+
+
+
+
+
+static const char trailingBytesForUTF8[256] =
+{
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
+};
+
+
+
+
+
+static const unsigned int offsetsFromUTF8[6] =
+{
+	0x00000000UL, 0x00003080UL, 0x000E2080UL, 
+	0x03C82080UL, 0xFA082080UL, 0x82082080UL
+};
+
+
+
+
+
+static bool isLegalUTF8(const unsigned char * source, int length)
+{
+	unsigned char a;
+	const unsigned char * srcptr = source + length;
+	switch (length)
+	{
+		default: return false;
+		// Everything else falls through when "true"...
+		case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+		case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+		case 2:
+		{
+			if ((a = (*--srcptr)) > 0xBF) return false;
+			switch (*source)
+			{
+				// no fall-through in this inner switch
+				case 0xE0: if (a < 0xA0) return false; break;
+				case 0xED: if (a > 0x9F) return false; break;
+				case 0xF0: if (a < 0x90) return false; break;
+				case 0xF4: if (a > 0x8F) return false; break;
+				default:   if (a < 0x80) return false;
+			}
+		}
+		case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+	}
+	if (*source > 0xF4) return false;
+	return true;
+}
+
+
+
+
+
+AString & UTF8ToRawBEUTF16(const char * a_UTF8, size_t a_UTF8Length, AString & a_UTF16)
+{
+	a_UTF16.clear();
+	a_UTF16.reserve(a_UTF8Length * 3);
+
+	const unsigned char * source    = (const unsigned char*)a_UTF8;
+	const unsigned char * sourceEnd = source + a_UTF8Length;
+	const int halfShift  = 10;  // used for shifting by 10 bits
+	const unsigned int halfBase = 0x0010000UL;
+	const unsigned int halfMask = 0x3FFUL;
+
+	while (source < sourceEnd)
+	{
+		unsigned int ch = 0;
+		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
+		if (source + extraBytesToRead >= sourceEnd)
+		{
+			return a_UTF16;
+		}
+		// Do this check whether lenient or strict
+		if (!isLegalUTF8(source, extraBytesToRead + 1))
+		{
+			return a_UTF16;
+			break;
+		}
+		
+		// The cases all fall through. See "Note A" below.
+		switch (extraBytesToRead)
+		{
+			case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+			case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+			case 3: ch += *source++; ch <<= 6;
+			case 2: ch += *source++; ch <<= 6;
+			case 1: ch += *source++; ch <<= 6;
+			case 0: ch += *source++;
+		}
+		ch -= offsetsFromUTF8[extraBytesToRead];
+
+		if (ch <= UNI_MAX_BMP)
+		{
+			// Target is a character <= 0xFFFF
+			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
+			{
+				// UTF-16 surrogate values are illegal in UTF-32
+				ch = ' ';
+			}
+			unsigned short v = htons((unsigned short)ch);
+			a_UTF16.append((const char *)&v, 2);
+		}
+		else if (ch > UNI_MAX_UTF16)
+		{
+			// Invalid value, replace with a space
+			unsigned short v = htons(' ');
+			a_UTF16.append((const char *)&v, 2);
+		}
+		else
+		{
+			// target is a character in range 0xFFFF - 0x10FFFF.
+			ch -= halfBase;
+			unsigned short v1 = htons((ch >> halfShift) + UNI_SUR_HIGH_START);
+			unsigned short v2 = htons((ch & halfMask) + UNI_SUR_LOW_START);
+			a_UTF16.append((const char *)&v1, 2);
+			a_UTF16.append((const char *)&v2, 2);
+		}
+	}
+	return a_UTF16;
+}
+
+/* ---------------------------------------------------------------------
+
+    Note A.
+    The fall-through switches in UTF-8 reading code save a
+    temp variable, some decrements & conditionals.  The switches
+    are equivalent to the following loop:
+    {
+        int tmpBytesToRead = extraBytesToRead+1;
+        do {
+        ch += *source++;
+        --tmpBytesToRead;
+        if (tmpBytesToRead) ch <<= 6;
+        } while (tmpBytesToRead > 0);
+    }
+
+   --------------------------------------------------------------------- */
+\ No newline at end of file
author	madmaxoft@gmail.com <madmaxoft@gmail.com@0a769ca7-a7f5-676a-18bf-c427514a06d6>	2012-08-28 23:59:49 +0200
committer	madmaxoft@gmail.com <madmaxoft@gmail.com@0a769ca7-a7f5-676a-18bf-c427514a06d6>	2012-08-28 23:59:49 +0200
commit	1651fcd9807f1033a5fc76e60cb17922e687a7b1 (patch)
tree	c9b82453023b52b05a61bcfd4c882ab5f6b60e23 /source/StringUtils.cpp
parent	Fixed a few warnings (diff)
download	cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar.gz cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar.bz2 cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar.lz cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar.xz cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.tar.zst cuberite-1651fcd9807f1033a5fc76e60cb17922e687a7b1.zip