Merge pull request #14020 from jordan-woyak/string-util-cleanups

StringUtil: Cleanups and add some character encoding conversion unit tests.
This commit is contained in:
JosJuice 2025-11-08 17:36:54 +01:00 committed by GitHub
commit 958db7c78c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 64 additions and 43 deletions

View File

@ -12,9 +12,7 @@
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
#include <iomanip> #include <iomanip>
#include <istream>
#include <iterator> #include <iterator>
#include <limits.h>
#include <locale> #include <locale>
#include <sstream> #include <sstream>
#include <string> #include <string>
@ -25,18 +23,18 @@
#include <fmt/ranges.h> #include <fmt/ranges.h>
#include "Common/CommonFuncs.h" #include "Common/CommonFuncs.h"
#include "Common/CommonPaths.h"
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
#include "Common/Logging/Log.h" #include "Common/Logging/Log.h"
#include "Common/Swap.h"
#ifdef _WIN32 #ifdef _WIN32
#include <Windows.h> #include <Windows.h>
#include <shellapi.h> #include <shellapi.h>
constexpr u32 CODEPAGE_SHIFT_JIS = 932; constexpr u32 CODEPAGE_SHIFT_JIS = 932;
constexpr u32 CODEPAGE_WINDOWS_1252 = 1252; constexpr u32 CODEPAGE_WINDOWS_1252 = 1252;
#include "Common/Swap.h"
#else #else
#include <errno.h> #include <cerrno>
#include <iconv.h> #include <iconv.h>
#include <locale.h> #include <locale.h>
#endif #endif
@ -85,8 +83,6 @@ std::string HexDump(const u8* data, size_t size)
bool CharArrayFromFormatV(char* out, int outsize, const char* format, va_list args) bool CharArrayFromFormatV(char* out, int outsize, const char* format, va_list args)
{ {
int writtenCount;
#ifdef _WIN32 #ifdef _WIN32
// You would think *printf are simple, right? Iterate on each character, // You would think *printf are simple, right? Iterate on each character,
// if it's a format specifier handle it properly, etc. // if it's a format specifier handle it properly, etc.
@ -114,28 +110,26 @@ bool CharArrayFromFormatV(char* out, int outsize, const char* format, va_list ar
static _locale_t c_locale = nullptr; static _locale_t c_locale = nullptr;
if (!c_locale) if (!c_locale)
c_locale = _create_locale(LC_ALL, "C"); c_locale = _create_locale(LC_ALL, "C");
writtenCount = _vsnprintf_l(out, outsize, format, c_locale, args); const int written_count = _vsnprintf_l(out, outsize, format, c_locale, args);
#else #else
#if !defined(ANDROID) && !defined(__HAIKU__) && !defined(__OpenBSD__) && !defined(__NetBSD__) #if !defined(ANDROID) && !defined(__HAIKU__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
locale_t previousLocale = uselocale(GetCLocale()); locale_t previousLocale = uselocale(GetCLocale());
#endif #endif
writtenCount = vsnprintf(out, outsize, format, args); const int written_count = vsnprintf(out, outsize, format, args);
#if !defined(ANDROID) && !defined(__HAIKU__) && !defined(__OpenBSD__) && !defined(__NetBSD__) #if !defined(ANDROID) && !defined(__HAIKU__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
uselocale(previousLocale); uselocale(previousLocale);
#endif #endif
#endif #endif
if (writtenCount > 0 && writtenCount < outsize) if (written_count > 0 && written_count < outsize)
{ {
out[writtenCount] = '\0'; out[written_count] = '\0';
return true; return true;
} }
else
{
out[outsize - 1] = '\0'; out[outsize - 1] = '\0';
return false; return false;
} }
}
std::string StringFromFormat(const char* format, ...) std::string StringFromFormat(const char* format, ...)
{ {
@ -182,7 +176,7 @@ std::string ArrayToString(const u8* data, u32 size, int line_len, bool spaces)
std::ostringstream oss; std::ostringstream oss;
oss << std::setfill('0') << std::hex; oss << std::setfill('0') << std::hex;
for (int line = 0; size; ++data, --size) for (int line = 0; size != 0; ++data, --size)
{ {
oss << std::setw(2) << static_cast<int>(*data); oss << std::setw(2) << static_cast<int>(*data);
@ -205,7 +199,7 @@ static std::string_view StripEnclosingChars(std::string_view str, T chars)
if (str.npos != s) if (str.npos != s)
return str.substr(s, str.find_last_not_of(chars) - s + 1); return str.substr(s, str.find_last_not_of(chars) - s + 1);
else
return ""; return "";
} }
@ -227,7 +221,7 @@ std::string_view StripQuotes(std::string_view s)
{ {
if (!s.empty() && '\"' == s[0] && '\"' == *s.rbegin()) if (!s.empty() && '\"' == s[0] && '\"' == *s.rbegin())
return s.substr(1, s.size() - 2); return s.substr(1, s.size() - 2);
else
return s; return s;
} }
@ -305,10 +299,12 @@ bool SplitPath(std::string_view full_path, std::string* path, std::string* filen
if (full_path.empty()) if (full_path.empty())
return false; return false;
size_t dir_end = full_path.find_last_of("/" size_t dir_end = full_path.find_last_of(
// Windows needs the : included for something like just "C:" to be considered a directory // Windows needs the : included for something like just "C:" to be considered a directory
#ifdef _WIN32 #ifdef _WIN32
":" "/:"
#else
'/'
#endif #endif
); );
if (std::string::npos == dir_end) if (std::string::npos == dir_end)
@ -351,7 +347,8 @@ std::string WithUnifiedPathSeparators(std::string path)
std::string PathToFileName(std::string_view path) std::string PathToFileName(std::string_view path)
{ {
std::string file_name, extension; std::string file_name;
std::string extension;
SplitPath(path, nullptr, &file_name, &extension); SplitPath(path, nullptr, &file_name, &extension);
return file_name + extension; return file_name + extension;
} }
@ -485,14 +482,15 @@ std::string UTF16BEToUTF8(const char16_t* str, size_t max_size)
#else #else
template <typename T> template <typename T>
std::string CodeTo(const char* tocode, const char* fromcode, std::basic_string_view<T> input) static std::string CodeTo(const char* tocode, const char* fromcode, std::basic_string_view<T> input)
{ {
std::string result; std::string result;
iconv_t const conv_desc = iconv_open(tocode, fromcode); auto* const conv_desc = iconv_open(tocode, fromcode);
if ((iconv_t)-1 == conv_desc) if ((iconv_t)-1 == conv_desc)
{ {
ERROR_LOG_FMT(COMMON, "Iconv initialization failure [{}]: {}", fromcode, strerror(errno)); ERROR_LOG_FMT(COMMON, "Iconv initialization failure [{}]: {}", fromcode,
Common::LastStrerrorString());
} }
else else
{ {
@ -502,9 +500,9 @@ std::string CodeTo(const char* tocode, const char* fromcode, std::basic_string_v
std::string out_buffer; std::string out_buffer;
out_buffer.resize(out_buffer_size); out_buffer.resize(out_buffer_size);
auto src_buffer = input.data(); auto* src_buffer = input.data();
size_t src_bytes = in_bytes; size_t src_bytes = in_bytes;
auto dst_buffer = out_buffer.data(); auto* dst_buffer = out_buffer.data();
size_t dst_bytes = out_buffer.size(); size_t dst_bytes = out_buffer.size();
while (src_bytes != 0) while (src_bytes != 0)
@ -525,7 +523,7 @@ std::string CodeTo(const char* tocode, const char* fromcode, std::basic_string_v
} }
else else
{ {
ERROR_LOG_FMT(COMMON, "iconv failure [{}]: {}", fromcode, strerror(errno)); ERROR_LOG_FMT(COMMON, "iconv failure [{}]: {}", fromcode, Common::LastStrerrorString());
break; break;
} }
} }
@ -541,7 +539,7 @@ std::string CodeTo(const char* tocode, const char* fromcode, std::basic_string_v
} }
template <typename T> template <typename T>
std::string CodeToUTF8(const char* fromcode, std::basic_string_view<T> input) static std::string CodeToUTF8(const char* fromcode, std::basic_string_view<T> input)
{ {
return CodeTo("UTF-8", fromcode, input); return CodeTo("UTF-8", fromcode, input);
} }
@ -566,11 +564,9 @@ std::string UTF8ToSHIFTJIS(std::string_view input)
std::string WStringToUTF8(std::wstring_view input) std::string WStringToUTF8(std::wstring_view input)
{ {
using codecvt = std::conditional_t<sizeof(wchar_t) == 2, std::codecvt_utf8_utf16<wchar_t>, // Note: Without LE iconv expects a BOM.
std::codecvt_utf8<wchar_t>>; // The "WCHAR_T" code would be appropriate, but it's apparently not in every iconv implementation.
return CodeToUTF8((sizeof(wchar_t) == 2) ? "UTF-16LE" : "UTF-32LE", input);
std::wstring_convert<codecvt, wchar_t> converter;
return converter.to_bytes(input.data(), input.data() + input.size());
} }
std::string UTF16BEToUTF8(const char16_t* str, size_t max_size) std::string UTF16BEToUTF8(const char16_t* str, size_t max_size)

View File

@ -201,13 +201,13 @@ std::string PathToFileName(std::string_view path);
void StringPopBackIf(std::string* s, char c); void StringPopBackIf(std::string* s, char c);
size_t StringUTF8CodePointCount(std::string_view str); size_t StringUTF8CodePointCount(std::string_view str);
std::string CP1252ToUTF8(std::string_view str); std::string CP1252ToUTF8(std::string_view input);
std::string SHIFTJISToUTF8(std::string_view str); std::string SHIFTJISToUTF8(std::string_view input);
std::string UTF8ToSHIFTJIS(std::string_view str); std::string UTF8ToSHIFTJIS(std::string_view input);
std::string WStringToUTF8(std::wstring_view str); std::string WStringToUTF8(std::wstring_view input);
std::string UTF16BEToUTF8(const char16_t* str, size_t max_size); // Stops at \0 std::string UTF16BEToUTF8(const char16_t* str, size_t max_size); // Stops at \0
std::string UTF16ToUTF8(std::u16string_view str); std::string UTF16ToUTF8(std::u16string_view input);
std::u16string UTF8ToUTF16(std::string_view str); std::u16string UTF8ToUTF16(std::string_view input);
#ifdef _WIN32 #ifdef _WIN32
@ -311,7 +311,7 @@ std::string GetEscapedHtml(std::string html);
void ToLower(std::string* str); void ToLower(std::string* str);
void ToUpper(std::string* str); void ToUpper(std::string* str);
bool CaseInsensitiveEquals(std::string_view a, std::string_view b); bool CaseInsensitiveEquals(std::string_view a, std::string_view b);
bool CaseInsensitiveContains(std::string_view a, std::string_view b); bool CaseInsensitiveContains(std::string_view haystack, std::string_view needle);
// 'std::less'-like comparison function object type for case-insensitive strings. // 'std::less'-like comparison function object type for case-insensitive strings.
struct CaseInsensitiveLess struct CaseInsensitiveLess

View File

@ -2,10 +2,12 @@
// SPDX-License-Identifier: GPL-2.0-or-later // SPDX-License-Identifier: GPL-2.0-or-later
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <string> #include <string>
#include <vector> #include <vector>
#include "Common/StringUtil.h" #include "Common/StringUtil.h"
#include "Common/Swap.h"
TEST(StringUtil, StringPopBackIf) TEST(StringUtil, StringPopBackIf)
{ {
@ -256,3 +258,26 @@ TEST(StringUtil, CaseInsensitiveContains_OverlappingMatches)
EXPECT_TRUE(Common::CaseInsensitiveContains("aaaaaa", "aa")); EXPECT_TRUE(Common::CaseInsensitiveContains("aaaaaa", "aa"));
EXPECT_TRUE(Common::CaseInsensitiveContains("ababababa", "bABa")); EXPECT_TRUE(Common::CaseInsensitiveContains("ababababa", "bABa"));
} }
// Exercises each character encoding conversion helper with a short string and checks the
// converted result against the expected literal. The dolphin emoji (U+1F42C) lies outside the
// Basic Multilingual Plane, so the UTF-16 cases also cover a surrogate pair.
TEST(StringUtil, CharacterEncodingConversion)
{
  // wstring
  EXPECT_EQ(WStringToUTF8(L"hello 🐬"), "hello 🐬");

  // UTF-16
  EXPECT_EQ(UTF16ToUTF8(u"hello 🐬"), "hello 🐬");
  EXPECT_EQ(UTF8ToUTF16("hello 🐬"), u"hello 🐬");

  // UTF-16BE
  // Build the input by running every code unit of a native UTF-16 literal through
  // Common::swap16. The loop also visits the terminating 0, which is expected to stay 0.
  // NOTE(review): this yields big-endian data only on a little-endian host - confirm that is the
  // only configuration the test suite runs on.
  char16_t utf16be_str[] = u"hello 🐬";
  for (auto& c : utf16be_str)
    c = Common::swap16(c);
  // Pass the real element count (terminator included) rather than an arbitrary larger bound, so
  // the call never claims more elements than the array actually holds.
  EXPECT_EQ(UTF16BEToUTF8(utf16be_str, std::size(utf16be_str)), "hello 🐬");

  // Shift JIS
  EXPECT_EQ(SHIFTJISToUTF8("\x83\x43\x83\x8b\x83\x4a"), "イルカ");
  EXPECT_EQ(UTF8ToSHIFTJIS("イルカ"), "\x83\x43\x83\x8b\x83\x4a");

  // CP1252
  EXPECT_EQ(CP1252ToUTF8("hello \xa5"), "hello ¥");
}