Merge pull request #14020 from jordan-woyak/string-util-cleanups

StringUtil: Cleanups and add some character encoding conversion unit tests.
This commit is contained in:
JosJuice 2025-11-08 17:36:54 +01:00 committed by GitHub
commit 958db7c78c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 64 additions and 43 deletions

View File

@ -12,9 +12,7 @@
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <istream>
#include <iterator>
#include <limits.h>
#include <locale>
#include <sstream>
#include <string>
@ -25,18 +23,18 @@
#include <fmt/ranges.h>
#include "Common/CommonFuncs.h"
#include "Common/CommonPaths.h"
#include "Common/CommonTypes.h"
#include "Common/Logging/Log.h"
#include "Common/Swap.h"
#ifdef _WIN32
#include <Windows.h>
#include <shellapi.h>
constexpr u32 CODEPAGE_SHIFT_JIS = 932;
constexpr u32 CODEPAGE_WINDOWS_1252 = 1252;
#include "Common/Swap.h"
#else
#include <errno.h>
#include <cerrno>
#include <iconv.h>
#include <locale.h>
#endif
@ -85,8 +83,6 @@ std::string HexDump(const u8* data, size_t size)
bool CharArrayFromFormatV(char* out, int outsize, const char* format, va_list args)
{
int writtenCount;
#ifdef _WIN32
// You would think *printf are simple, right? Iterate on each character,
// if it's a format specifier handle it properly, etc.
@ -114,28 +110,26 @@ bool CharArrayFromFormatV(char* out, int outsize, const char* format, va_list ar
static _locale_t c_locale = nullptr;
if (!c_locale)
c_locale = _create_locale(LC_ALL, "C");
writtenCount = _vsnprintf_l(out, outsize, format, c_locale, args);
const int written_count = _vsnprintf_l(out, outsize, format, c_locale, args);
#else
#if !defined(ANDROID) && !defined(__HAIKU__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
locale_t previousLocale = uselocale(GetCLocale());
#endif
writtenCount = vsnprintf(out, outsize, format, args);
const int written_count = vsnprintf(out, outsize, format, args);
#if !defined(ANDROID) && !defined(__HAIKU__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
uselocale(previousLocale);
#endif
#endif
if (writtenCount > 0 && writtenCount < outsize)
if (written_count > 0 && written_count < outsize)
{
out[writtenCount] = '\0';
out[written_count] = '\0';
return true;
}
else
{
out[outsize - 1] = '\0';
return false;
}
}
std::string StringFromFormat(const char* format, ...)
{
@ -182,7 +176,7 @@ std::string ArrayToString(const u8* data, u32 size, int line_len, bool spaces)
std::ostringstream oss;
oss << std::setfill('0') << std::hex;
for (int line = 0; size; ++data, --size)
for (int line = 0; size != 0; ++data, --size)
{
oss << std::setw(2) << static_cast<int>(*data);
@ -205,7 +199,7 @@ static std::string_view StripEnclosingChars(std::string_view str, T chars)
if (str.npos != s)
return str.substr(s, str.find_last_not_of(chars) - s + 1);
else
return "";
}
@ -227,7 +221,7 @@ std::string_view StripQuotes(std::string_view s)
{
if (!s.empty() && '\"' == s[0] && '\"' == *s.rbegin())
return s.substr(1, s.size() - 2);
else
return s;
}
@ -305,10 +299,12 @@ bool SplitPath(std::string_view full_path, std::string* path, std::string* filen
if (full_path.empty())
return false;
size_t dir_end = full_path.find_last_of("/"
size_t dir_end = full_path.find_last_of(
// Windows needs the : included for something like just "C:" to be considered a directory
#ifdef _WIN32
":"
"/:"
#else
'/'
#endif
);
if (std::string::npos == dir_end)
@ -351,7 +347,8 @@ std::string WithUnifiedPathSeparators(std::string path)
// Returns the file-name portion of `path` with its extension appended.
//
// The directory component is not requested (nullptr is passed for it). The result of SplitPath()
// is not checked; NOTE(review): presumably both strings stay empty when SplitPath() rejects the
// input (e.g. an empty path), yielding an empty return value - confirm against SplitPath().
std::string PathToFileName(std::string_view path)
{
  std::string file_name;
  std::string extension;
  SplitPath(path, nullptr, &file_name, &extension);
  return file_name + extension;
}
@ -485,14 +482,15 @@ std::string UTF16BEToUTF8(const char16_t* str, size_t max_size)
#else
template <typename T>
std::string CodeTo(const char* tocode, const char* fromcode, std::basic_string_view<T> input)
static std::string CodeTo(const char* tocode, const char* fromcode, std::basic_string_view<T> input)
{
std::string result;
iconv_t const conv_desc = iconv_open(tocode, fromcode);
auto* const conv_desc = iconv_open(tocode, fromcode);
if ((iconv_t)-1 == conv_desc)
{
ERROR_LOG_FMT(COMMON, "Iconv initialization failure [{}]: {}", fromcode, strerror(errno));
ERROR_LOG_FMT(COMMON, "Iconv initialization failure [{}]: {}", fromcode,
Common::LastStrerrorString());
}
else
{
@ -502,9 +500,9 @@ std::string CodeTo(const char* tocode, const char* fromcode, std::basic_string_v
std::string out_buffer;
out_buffer.resize(out_buffer_size);
auto src_buffer = input.data();
auto* src_buffer = input.data();
size_t src_bytes = in_bytes;
auto dst_buffer = out_buffer.data();
auto* dst_buffer = out_buffer.data();
size_t dst_bytes = out_buffer.size();
while (src_bytes != 0)
@ -525,7 +523,7 @@ std::string CodeTo(const char* tocode, const char* fromcode, std::basic_string_v
}
else
{
ERROR_LOG_FMT(COMMON, "iconv failure [{}]: {}", fromcode, strerror(errno));
ERROR_LOG_FMT(COMMON, "iconv failure [{}]: {}", fromcode, Common::LastStrerrorString());
break;
}
}
@ -541,7 +539,7 @@ std::string CodeTo(const char* tocode, const char* fromcode, std::basic_string_v
}
// Converts `input`, encoded as `fromcode`, to a UTF-8 std::string.
//
// Thin wrapper that forwards to CodeTo() with "UTF-8" as the target encoding; `fromcode` is the
// encoding name CodeTo() hands to iconv_open(). Kept file-local (static), like CodeTo().
template <typename T>
static std::string CodeToUTF8(const char* fromcode, std::basic_string_view<T> input)
{
  return CodeTo("UTF-8", fromcode, input);
}
@ -566,11 +564,9 @@ std::string UTF8ToSHIFTJIS(std::string_view input)
// Converts a wide string to UTF-8 through the iconv-based CodeToUTF8() helper.
//
// The source encoding is chosen from the width of wchar_t: UTF-16LE when wchar_t is 2 bytes,
// UTF-32LE otherwise.
std::string WStringToUTF8(std::wstring_view input)
{
  // Note: Without the explicit "LE" suffix iconv expects the input to start with a BOM.
  // The "WCHAR_T" code would be the natural choice, but it is apparently not available in every
  // iconv implementation.
  // NOTE(review): hard-coding "LE" assumes a little-endian host - confirm for supported targets.
  return CodeToUTF8((sizeof(wchar_t) == 2) ? "UTF-16LE" : "UTF-32LE", input);
}
std::string UTF16BEToUTF8(const char16_t* str, size_t max_size)

View File

@ -201,13 +201,13 @@ std::string PathToFileName(std::string_view path);
void StringPopBackIf(std::string* s, char c);
size_t StringUTF8CodePointCount(std::string_view str);
std::string CP1252ToUTF8(std::string_view str);
std::string SHIFTJISToUTF8(std::string_view str);
std::string UTF8ToSHIFTJIS(std::string_view str);
std::string WStringToUTF8(std::wstring_view str);
std::string CP1252ToUTF8(std::string_view input);
std::string SHIFTJISToUTF8(std::string_view input);
std::string UTF8ToSHIFTJIS(std::string_view input);
std::string WStringToUTF8(std::wstring_view input);
std::string UTF16BEToUTF8(const char16_t* str, size_t max_size); // Stops at \0
std::string UTF16ToUTF8(std::u16string_view str);
std::u16string UTF8ToUTF16(std::string_view str);
std::string UTF16ToUTF8(std::u16string_view input);
std::u16string UTF8ToUTF16(std::string_view input);
#ifdef _WIN32
@ -311,7 +311,7 @@ std::string GetEscapedHtml(std::string html);
void ToLower(std::string* str);
void ToUpper(std::string* str);
bool CaseInsensitiveEquals(std::string_view a, std::string_view b);
bool CaseInsensitiveContains(std::string_view a, std::string_view b);
bool CaseInsensitiveContains(std::string_view haystack, std::string_view needle);
// 'std::less'-like comparison function object type for case-insensitive strings.
struct CaseInsensitiveLess

View File

@ -2,10 +2,12 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include "Common/StringUtil.h"
#include "Common/Swap.h"
TEST(StringUtil, StringPopBackIf)
{
@ -256,3 +258,26 @@ TEST(StringUtil, CaseInsensitiveContains_OverlappingMatches)
EXPECT_TRUE(Common::CaseInsensitiveContains("aaaaaa", "aa"));
EXPECT_TRUE(Common::CaseInsensitiveContains("ababababa", "bABa"));
}
// Runs each character encoding conversion helper on a short sample string and checks the result.
TEST(StringUtil, CharacterEncodingConversion)
{
  // Expected UTF-8 text shared by the wide-string and UTF-16 cases.
  const std::string expected_utf8 = "hello 🐬";

  // Wide string -> UTF-8.
  EXPECT_EQ(WStringToUTF8(L"hello 🐬"), expected_utf8);

  // UTF-16 <-> UTF-8, in both directions.
  EXPECT_EQ(UTF16ToUTF8(u"hello 🐬"), expected_utf8);
  EXPECT_EQ(UTF8ToUTF16(expected_utf8), u"hello 🐬");

  // UTF-16BE: byte-swap every code unit of the literal (the terminator included) before decoding.
  char16_t big_endian_units[] = u"hello 🐬";
  for (char16_t& unit : big_endian_units)
  {
    unit = Common::swap16(unit);
  }
  EXPECT_EQ(UTF16BEToUTF8(big_endian_units, 99), expected_utf8);

  // Shift JIS <-> UTF-8, in both directions.
  const std::string dolphin_shift_jis = "\x83\x43\x83\x8b\x83\x4a";
  const std::string dolphin_utf8 = "イルカ";
  EXPECT_EQ(SHIFTJISToUTF8(dolphin_shift_jis), dolphin_utf8);
  EXPECT_EQ(UTF8ToSHIFTJIS(dolphin_utf8), dolphin_shift_jis);

  // CP1252 -> UTF-8: byte 0xA5 is expected to come out as the yen sign.
  EXPECT_EQ(CP1252ToUTF8("hello \xa5"), "hello ¥");
}