Detect UTF-8 in SRT files

Also, remove unused code in utf8Converter.cpp and remove "sfDefault" from the SourceFormat enum: its meaning was platform-dependent and no longer made sense, as we now default to UTF-8 on non-Windows platforms. On Windows, processing falls back to the active code page (ACP) to keep backwards compatibility.
justdan96 · Feb 8, 2020 · 0035efb · 0035efb · abakum · Feb 8, 2020
1 parent 355151c
commit 0035efb
Show file tree

Hide file tree

Showing 6 changed files with 41 additions and 50 deletions.
diff --git a/tsMuxer/convertUTF.cpp b/tsMuxer/convertUTF.cpp
@@ -402,6 +402,24 @@ Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd)
     return isLegalUTF8(source, length);
 }
 
+Boolean isLegalUTF8String(const UTF8* string, int length)
+{
+    /* Like isLegalUTF8Sequence() above, but checks a whole buffer: returns true only if all `length` bytes starting at `string` split into consecutive legal UTF-8 sequences. A buffer with no bytes to check yields true. */
+    const auto stringEnd = string + length; /* one past the last byte to check */
+    while (string < stringEnd)
+    {
+        const auto seqLength = trailingBytesForUTF8[*string] + 1; /* table entry for the first byte, plus that byte itself */
+        const auto seqEnd = string + seqLength;
+        /* Reject 5- and 6-byte sequences (not valid UTF-8, as the comment for the trailing bytes array notes), sequences that would run past the end of the buffer, and sequences isLegalUTF8() rejects. */
+        if (seqLength >= 5 || seqEnd > stringEnd || !isLegalUTF8(string, seqLength))
+        {
+            return false;
+        }
+        string = seqEnd; /* continue with the byte following this sequence */
+    }
+    return true;
+}
+
 /* --------------------------------------------------------------------- */
 
 ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart, const UTF8* sourceEnd, UTF16** targetStart,

diff --git a/tsMuxer/convertUTF.h b/tsMuxer/convertUTF.h
@@ -141,6 +141,8 @@ extern "C"
 
     Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd);
 
+    Boolean isLegalUTF8String(const UTF8* string, int length);
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/tsMuxer/srtStreamReader.cpp b/tsMuxer/srtStreamReader.cpp
@@ -3,6 +3,7 @@
 
 #include <string>
 
+#include "convertUTF.h"
 #include "matroskaParser.h"
 #include "memory.h"
 #include "vodCoreException.h"
@@ -54,7 +55,7 @@ bool SRTStreamReader::detectSrcFormat(uint8_t* dataStart, int len, int& prefixLe
     if (len < 4)
         return false;
     // detect UTF-8/UTF-16/UTF-32 format
-    if (dataStart[0] == 0xEF && dataStart[1] == 0xBB && dataStart[2] == 0xBF)
+    if ((dataStart[0] == 0xEF && dataStart[1] == 0xBB && dataStart[2] == 0xBF) || isLegalUTF8String(dataStart, len))
     {
         m_charSize = 1;
         m_srcFormat = UtfConverter::sfUTF8;
@@ -97,9 +98,10 @@ bool SRTStreamReader::detectSrcFormat(uint8_t* dataStart, int len, int& prefixLe
     else
     {
 #ifdef _WIN32
+        LTRACE(LT_INFO, 2, "Failed to auto-detect SRT encoding : falling back to the active code page");
         m_srcFormat = UtfConverter::sfANSI;  // default value for win32
 #else
-        // m_srcFormat = UtfConverter::sfDefault;
+        LTRACE(LT_INFO, 2, "Failed to auto-detect SRT encoding : falling back to UTF-8");
         m_srcFormat = UtfConverter::sfUTF8;
 #endif
     }

diff --git a/tsMuxer/utf8Converter.cpp b/tsMuxer/utf8Converter.cpp
@@ -71,7 +71,7 @@ std::wstring toWideString(uint8_t* start, size_t widesize, SourceFormat srcForma
 {
     if (widesize == 0)
         return L"";
-#if defined(_WIN32)
+#ifdef _WIN32
     else if (srcFormat == sfANSI)
     {
         wchar_t* widestringnative = new wchar_t[widesize + 1];
@@ -81,39 +81,6 @@ std::wstring toWideString(uint8_t* start, size_t widesize, SourceFormat srcForma
         delete[] widestringnative;
         return resultstring;
     }
-#elif __linux__ == 1
-        /*
-        else if (srcFormat == sfDefault)
-        {
-                if (cd == 0)
-                {
-                        sourceEncoding = nl_langinfo(CODESET);
-                        LTRACE(LT_INFO, 2, "Default text encoding: " << sourceEncoding.c_str());
-                        if (sizeof(wchar_t) == 4)
-                                cd = iconv_open("UTF-32", sourceEncoding.c_str());
-                        else
-                                cd = iconv_open("UTF-16", sourceEncoding.c_str());
-                        if (cd == (iconv_t)-1)
-                                THROW(ERR_COMMON, "Can't initialize iconv library for source encoding " <<
-        sourceEncoding);
-                }
-
-                wchar_t* widestringnative = new wchar_t[widesize+1];
-                memset(widestringnative, 0, sizeof(wchar_t) * (widesize+1));
-                size_t outLen = (widesize+1) * sizeof(wchar_t);
-                size_t __widesize = widesize;
-                char* src = (char*) start;
-                char* dst = (char*) widestringnative;
-                size_t wlen = iconv(cd, &src, &__widesize, &dst, &outLen);
-                if (wlen == -1)
-                        THROW(ERR_COMMON, "Can't convert source string '" << start << "'to unicode");
-                widestringnative[widesize] = 0;
-
-            std::wstring resultstring(widestringnative);
-            delete [] widestringnative;
-            return resultstring;
-        }
-        */
 #endif
     else if (sizeof(wchar_t) == 2)
     {

diff --git a/tsMuxer/utf8Converter.h b/tsMuxer/utf8Converter.h
@@ -10,8 +10,7 @@ namespace UtfConverter
 enum SourceFormat
 {
     sfUnknown,
-    sfANSI,
-    sfDefault,
+    sfANSI,  // currently active code page (CP_ACP). used only on Windows.
     sfUTF8,
     sfUTF16le,
     sfUTF16be,

diff --git a/tsMuxer/vod_common.h b/tsMuxer/vod_common.h
@@ -11,18 +11,21 @@
 
 #if 1
 extern bool sLastMsg;
-#define LTRACE(level, errIndex, msg)           \
-    {                                          \
-        if (errIndex & 2)                      \
-        {                                      \
-            if (level <= LT_WARN)              \
-                std::cerr << msg << std::endl; \
-            else if (level == LT_INFO)         \
-                std::cout << msg << std::endl; \
-            if (level <= LT_INFO)              \
-                sLastMsg = true;               \
-        }                                      \
-    }
+#define LTRACE(level, errIndex, msg)               \
+    do /* do { } while (0): the expansion is one statement that takes a trailing ';' */ \
+    {                                              \
+        {                                          \
+            if (errIndex & 2) /* nothing happens unless bit 1 of errIndex is set */ \
+            {                                      \
+                if (level <= LT_WARN) /* levels up to LT_WARN are written to std::cerr */ \
+                    std::cerr << msg << std::endl; \
+                else if (level == LT_INFO) /* LT_INFO is written to std::cout */ \
+                    std::cout << msg << std::endl; \
+                if (level <= LT_INFO) /* sLastMsg is set for every level up to LT_INFO */ \
+                    sLastMsg = true;               \
+            }                                      \
+        }                                          \
+    } while (0)
 class Process
 {
    public:
-Original file line number
+Diff line change
@@ Expand Up / @@ -141,6 +141,8 @@ extern "C" @@
         Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd);
+        Boolean isLegalUTF8String(const UTF8* string, int length);
     #ifdef __cplusplus
     }
     #endif
@@ Expand Down @@