diff --git a/src/common/Text/TextConvert.cpp b/src/common/Text/TextConvert.cpp --- a/src/common/Text/TextConvert.cpp +++ b/src/common/Text/TextConvert.cpp @@ -1,512 +1,552 @@ #include "lgi/common/Lgi.h" #include "lgi/common/TextConvert.h" #include "lgi/common/Mime.h" #include "lgi/common/Base64.h" #include "lgi/common/Charset.h" // return true if there are any characters with the 0x80 bit set bool Is8Bit(const char *Text) { if (!Text) return false; while (*Text) { if (*Text & 0x80) return true; Text++; } return false; } char ConvHexToBin(char c) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c + 10 - 'a'; if (c >= 'A' && c <= 'F') return c + 10 - 'A'; return 0; } char *DecodeBase64Str(char *Str, ssize_t Len) { if (Str) { ssize_t B64Len = (Len < 0) ? strlen(Str) : Len; ssize_t BinLen = BufferLen_64ToBin(B64Len); char *s = new char[BinLen+1]; if (s) { ssize_t Converted = ConvertBase64ToBinary((uchar*)s, BinLen, Str, B64Len); s[Converted] = 0; DeleteArray(Str); Str = s; } } return Str; } LString LDecodeBase64Str(LString Str) { LString r; ssize_t BinLen = BufferLen_64ToBin(Str.Length()); if (Str && r.Length(BinLen) > 0) { ssize_t Converted = ConvertBase64ToBinary((uchar*)r.Get(), r.Length(), Str.Get(), Str.Length()); if (Converted >= 0) r.Get()[Converted] = 0; else r.Empty(); } return r; } char *DecodeQuotedPrintableStr(char *Str, ssize_t Len) { if (Str) { if (Len < 0) Len = strlen(Str); uchar *s = new uchar[Len+1]; if (s) { char *Out = (char*) s; char *Text = Str; for (int i=0; i Output) { if (!Str) - return NULL; + return; LStringPipe p(256); + bool prevWasEncoded = false; for (char *s = Str; *s; ) { char *e = s; - bool Decode = 0, Descape = 0; + bool Decode = 0, Descape = 0, nonWhitespace = false; while (*e) { if ( (Decode = (e[0] == '=' && e[1] == '?')) || (Descape = (e[0] == '\\')) ) { - // Emit characters between 's' and 'e' - if (e > s) + // Emit characters between 's' and 'e'? + if (e > s && !(!nonWhitespace && prevWasEncoded)) p.Write(s, e - s); + + /* + nonWhitespace=0, prevWasEncoded=0 = emit + nonWhitespace=1, prevWasEncoded=0 = emit + nonWhitespace=0, prevWasEncoded=1 = skip + nonWhitespace=1, prevWasEncoded=1 = emit + */ break; } + + if (!strchr(WhiteSpace, *e)) + nonWhitespace = true; + e++; } + prevWasEncoded = false; if (Decode) { // is there a word remaining bool Encoded = false; char *Start = e + 2; char *First = strchr(Start, '?'); char *Second = First ? strchr(First + 1, '?') : NULL; char *End = Second ? strstr(Second + 1, "?=") : NULL; if (End) { LString Cp(Start, First - Start); int Type = CONTENT_NONE; bool StripUnderscores = false; if (ToUpper(First[1]) == 'B') { // Base64 encoding Type = CONTENT_BASE64; } else if (ToUpper(First[1]) == 'Q') { // Quoted printable Type = CONTENT_QUOTED_PRINTABLE; StripUnderscores = true; } if (Type != CONTENT_NONE) { Second++; LString Block(Second, End - Second); if (Block) { switch (Type) { case CONTENT_BASE64: Block = LDecodeBase64Str(Block); break; case CONTENT_QUOTED_PRINTABLE: Block = LDecodeQuotedPrintableStr(Block); break; } if (StripUnderscores) { for (char *i=Block; *i; i++) { if (*i == '_') *i = ' '; } } if (Cp && !_stricmp(Cp, "utf-8")) { p.Write(Block); } else { auto Inst = LCharsetSystem::Inst(); LString Detect = Inst && Inst->DetectCharset ? Inst->DetectCharset(Block) : LString(); LAutoString Utf8((char*)LNewConvertCp("utf-8", Block, Detect ? Detect : Cp, Block.Length())); if (Utf8) { if (LIsUtf8(Utf8)) p.Write((uchar*)Utf8.Get(), strlen(Utf8)); } else { p.Write(Block); } } } s = End + 2; if (*s == '\n') { s++; while (*s && strchr(WhiteSpace, *s)) s++; } Encoded = true; } } - if (!Encoded) + if (Encoded) + { + prevWasEncoded = true; + } + else { // Encoding error, just emit the raw string and exit. size_t Len = strlen(s); p.Write((uchar*) s, Len); break; } } else if (Descape) { // Un-escape the string... e++; if (*e) p.Write(e, 1); else break; s = e + 1; } else { // Last segment of string... LAssert(*e == 0); if (e > s) p.Write(s, e - s); break; } } - DeleteArray(Str); - return p.NewStr(); + Output(p); } +char *DecodeRfc2047(char *Str) +{ + DecodeRfc2047_Impl( Str, + [&Str](LStringPipe &p) + { + DeleteArray(Str); + Str = p.NewStr(); + }); + + return Str; +} + +LString LDecodeRfc2047(LString Str) +{ + DecodeRfc2047_Impl( Str, + [&Str](LStringPipe &p) + { + Str = p.NewLStr(); + }); + + return Str; +} + + #define MIME_MAX_LINE 76 static void EncodeRfc2047_Impl( char *Input, size_t Length, const char *InCharset, LString::Array *OutCharsets, ssize_t LineLength, std::function Process) { if (!Input) return; if (!InCharset) InCharset = "utf-8"; LStringPipe p(256); if (Is8Bit(Input)) { // pick an encoding bool Base64 = false; const char *DestCs = "utf-8"; char *Buf = NULL; if (!OutCharsets || (OutCharsets && OutCharsets->Length() == 0) || Stristr(InCharset, "utf") != NULL) { if (!Stricmp(InCharset, "utf-8")) { auto DetectedCs = LUnicodeToCharset(Input, Length, OutCharsets); if (DetectedCs) Buf = (char*)LNewConvertCp(DestCs = DetectedCs, Input, InCharset, Length); } else { // Not utf-8 so convert to that first... LAutoString utf8((char*)LNewConvertCp("utf-8", Input, InCharset)); if (utf8) { auto DetectCs = LUnicodeToCharset(utf8, Strlen(utf8.Get()), OutCharsets); if (DetectCs) Buf = (char*)LNewConvertCp(DestCs = DetectCs, Input, InCharset, Length); } } } else { for (auto Cs: *OutCharsets) { if (Buf = (char*)LNewConvertCp(DestCs = Cs, Input, InCharset, Length)) break; } if (!Buf || !DestCs) { // Fall back to utf-8 Buf = (char*)LNewConvertCp(DestCs = "utf-8", Input, InCharset, Length); } } if (!Buf) { // Fall back to copy... Buf = NewStr(Input, Length); if (InCharset) DestCs = InCharset; } int Chars = 0; for (unsigned i=0; Buf && Buf[i]; i++) { if (Buf[i] & 0x80) Chars++; } if ( Length > 0 && ((double)Chars/Length) > 0.4 ) { Base64 = true; } if (Buf) { // encode the word char Prefix[64]; int Ch = sprintf_s(Prefix, sizeof(Prefix), "=?%s?%c?", DestCs, Base64 ? 'B' : 'Q'); p.Write(Prefix, Ch); LineLength += Ch; if (Base64) { // Base64 size_t InLen = strlen(Buf); // int EstBytes = BufferLen_BinTo64(InLen); char Temp[512]; ssize_t Bytes = ConvertBinaryToBase64(Temp, sizeof(Temp), (uchar*)Buf, InLen); p.Push(Temp, Bytes); } else { // Quoted printable for (char *w = Buf; *w; w++) { if (*w == ' ') { if (LineLength > MIME_MAX_LINE - 3) { p.Print("?=\r\n\t%s", Prefix); LineLength = 1 + strlen(Prefix); } p.Write((char*)"_", 1); LineLength++; } else if (*w & 0x80 || *w == '_' || *w == '?' || *w == '=') { if (LineLength > MIME_MAX_LINE - 5) { p.Print("?=\r\n\t%s", Prefix); LineLength = 1 + strlen(Prefix); } char Temp[16]; Ch = sprintf_s(Temp, sizeof(Temp), "=%2.2X", (uchar)*w); p.Write(Temp, Ch); LineLength += Ch; } else { if (LineLength > MIME_MAX_LINE - 3) { p.Print("?=\r\n\t%s", Prefix); LineLength = 1 + strlen(Prefix); } p.Write(w, 1); LineLength++; } } } p.Push("?="); DeleteArray(Buf); } Process(Input, p); } else { bool RecodeNewLines = false; for (char *s = Input; *s; s++) { if (*s == '\n' && (s == Input || s[-1] != '\r')) { RecodeNewLines = true; break; } } if (RecodeNewLines) { for (char *s = Input; *s; s++) { if (*s == '\r') ; else if (*s == '\n') p.Write("\r\n", 2); else p.Write(s, 1); } Process(Input, p); } } // It's not an error to not call 'OutputStr', in that case // the input is passed through to the output unchanged. } // Old heap string encode method (will eventually remove this...) char *EncodeRfc2047(char *Str, const char *Charset, LString::Array *CharsetPrefs, ssize_t LineLength) { char *Out = Str; EncodeRfc2047_Impl( Str, Strlen(Str), Charset, CharsetPrefs, LineLength, [&Out](auto s, auto &pipe) { DeleteArray(s); Out = pipe.NewStr(); }); return Out; } // New LString encode method LString LEncodeRfc2047(LString Str, const char *Charset, LString::Array *CharsetPrefs, ssize_t LineLength) { EncodeRfc2047_Impl( Str.Get(), Str.Length(), Charset, CharsetPrefs, LineLength, [&Str](auto s, auto &pipe) { Str = pipe.NewLStr(); }); return Str; } diff --git a/test/UnitTests/src/StringTests.cpp b/test/UnitTests/src/StringTests.cpp --- a/test/UnitTests/src/StringTests.cpp +++ b/test/UnitTests/src/StringTests.cpp @@ -1,76 +1,91 @@ -#include "lgi/common/Lgi.h" +#include "lgi/common/Lgi.h" #include "lgi/common/TextConvert.h" #include "UnitTests.h" class PrivLStringTests { public: }; LStringTests::LStringTests() : UnitTest("LStringTests") { d = new PrivLStringTests; } LStringTests::~LStringTests() { DeleteObj(d); } bool LStringTests::Run() { /* Things to test: bool Is8Bit(const char *Text); [[deprecated]] char *DecodeBase64Str(char *Str, ssize_t Len = -1); LString LDecodeBase64Str(LString Str); [[deprecated]] char *DecodeQuotedPrintableStr(char *Str, ssize_t Len = -1); LString LDecodeQuotedPrintableStr(LString Str); [[deprecated]] char *DecodeRfc2047(char *Str); LString LDecodeRfc2047(LString Str); LString LEncodeRfc2047(LString Str, const char *Charset, List *CharsetPrefs, ssize_t LineLength = 0); */ const char *EncodeResult1 = "=?iso-8859-1?Q?Beytullah_Gen=E7?="; const char *EncodeResult2 = "=?iso-8859-9?Q?Beytullah_Gen=E7?="; LString::Array CharsetPrefs; - const char *Rfc2047Input = "Beytullah Gen"; - const char *UtfInput = "Beytullah Genç"; + const char *Rfc2047Input = "Beytullah Genç"; + const char *UtfInput = "Beytullah Genç"; const char *Charset = "windows-1252"; // No prefered charset testing: LAutoString result1( EncodeRfc2047(NewStr(Rfc2047Input), Charset) ); if (Stricmp(result1.Get(), EncodeResult1)) return FAIL(_FL, "EncodeRfc2047"); LAutoString decode1( DecodeRfc2047(NewStr(result1)) ); if (Strcmp(UtfInput, decode1.Get())) return FAIL(_FL, "DecodeRfc2047"); LString result2 = LEncodeRfc2047(Rfc2047Input, Charset); if (Stricmp(result2.Get(), EncodeResult1)) return FAIL(_FL, "LEncodeRfc2047"); LAutoString decode2( DecodeRfc2047(NewStr(result2)) ); if (Strcmp(UtfInput, decode2.Get())) return FAIL(_FL, "DecodeRfc2047"); // Redo tests with a charset preference set: CharsetPrefs.Add("iso-8859-9"); LAutoString result3( EncodeRfc2047(NewStr(Rfc2047Input), Charset, &CharsetPrefs)); if (Stricmp(result3.Get(), EncodeResult2)) return FAIL(_FL, "EncodeRfc2047"); LString result4 = LEncodeRfc2047(Rfc2047Input, Charset, &CharsetPrefs); if (Stricmp(result4.Get(), EncodeResult2)) return FAIL(_FL, "EncodeRfc2047"); + // Quoted printable decode test: + auto input5 = "=?UTF-8?q?=D0=92=D0=B0=D0=BC_=D0=BF=D1=80=D0=B8=D1=88=D0=BB=D0=BE_=D0=BD?= =?UTF-8?q?=D0=BE=D0=B2=D0=BE=D0=B5_=D1=81=D0=BE=D0=BE=D0=B1=D1=89=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B5?="; + auto result5 = LDecodeRfc2047(input5); + auto encodeResult5 = L"Вам пришло новое сообщение"; + LAutoWString decode5( Utf8ToWide(result5) ); + if (Stricmp(encodeResult5, decode5.Get())) + return FAIL(_FL, "LDecodeRfc2047"); + + // Mixing encoded and plain words: + auto input6 = "test =?UTF-8?q?=D0=BE=D0=B2=D0=BE=D0=B5?= of words =?UTF-8?q?=D0=BE=D0=B2=D0=BE=D0=B5?="; + auto result6 = LDecodeRfc2047(input6); + auto encodeResult6 = L"test овое of words овое"; + LAutoWString decode6( Utf8ToWide(result6) ); + if (Stricmp(encodeResult6, decode6.Get())) + return FAIL(_FL, "LDecodeRfc2047"); return true; }