forked from chinosk/gkms-local
Split Generic Text (#1)
* Split generic text
* Add split table; change the split flag to `__split__`
* Add split flag check
* Fix target
This commit is contained in:
parent
6ddf4212d4
commit
0e1ad6959b
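Context for the diff below, as a minimal sketch (not part of the commit): generic translation entries opt into split matching by carrying the `[__split__]` flag on both the key and the value, either in `generic.json`, in `generic.split.json`, or in any `*.split.json` file under `genericTrans`. At load time such entries are routed into the new `genericSplitText` table with the flag stripped, presumably so split entries can coexist with plain entries in the same file. The snippet mirrors that routing; the JSON entry strings are hypothetical.

```cpp
// Sketch of the split-flag routing added to LoadJsonDataToMap.
// The JSON content below is hypothetical example data.
#include <iostream>
#include <string>
#include <unordered_map>
#include <nlohmann/json.hpp>

int main() {
    const std::string splitTextPrefix = "[__split__]";
    const auto fileData = nlohmann::ordered_json::parse(R"({
        "Produce": "Produce",
        "[__split__]Vocal": "[__split__]Vocal (translated)"
    })");

    std::unordered_map<std::string, std::string> genericText;      // plain entries
    std::unordered_map<std::string, std::string> genericSplitText; // split entries, flag stripped

    for (auto& item : fileData.items()) {
        const auto& key = item.key();
        const std::string value = item.value();
        if (key.starts_with(splitTextPrefix) && value.starts_with(splitTextPrefix)) {
            genericSplitText[key.substr(splitTextPrefix.size())] = value.substr(splitTextPrefix.size());
        } else {
            genericText[key] = value;
        }
    }

    std::cout << genericText.size() << " plain, "
              << genericSplitText.size() << " split entries\n"; // 1 plain, 1 split
}
```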
@@ -11,6 +11,9 @@
 #include <thread>
 #include <regex>
 #include <ranges>
+#include <string>
+#include <cctype>
+#include <algorithm>
 #include "BaseDefine.h"
 
 
@@ -18,16 +21,63 @@ namespace GakumasLocal::Local {
     std::unordered_map<std::string, std::string> i18nData{};
     std::unordered_map<std::string, std::string> i18nDumpData{};
     std::unordered_map<std::string, std::string> genericText{};
+    std::unordered_map<std::string, std::string> genericSplitText{};
     std::vector<std::string> genericTextDumpData{};
+    std::vector<std::string> genericSplittedDumpData{};
+    std::vector<std::string> genericOrigTextDumpData{};
     std::unordered_set<std::string> translatedText{};
     int genericDumpFileIndex = 0;
+    const std::string splitTextPrefix = "[__split__]";
 
     std::filesystem::path GetBasePath() {
         return Plugin::GetInstance().GetHookInstaller()->localizationFilesDir;
     }
 
+    std::string trim(const std::string& str) {
+        auto is_not_space = [](char ch) { return !std::isspace(ch); };
+        auto start = std::ranges::find_if(str, is_not_space);
+        auto end = std::ranges::find_if(str | std::views::reverse, is_not_space).base();
+
+        if (start < end) {
+            return {start, end};
+        }
+        return "";
+    }
+
+    std::string findInMapIgnoreSpace(const std::string& key, const std::unordered_map<std::string, std::string>& searchMap) {
+        auto is_space = [](char ch) { return std::isspace(ch); };
+        auto front = std::ranges::find_if_not(key, is_space);
+        auto back = std::ranges::find_if_not(key | std::views::reverse, is_space).base();
+
+        std::string prefix(key.begin(), front);
+        std::string suffix(back, key.end());
+
+        std::string trimmedKey = trim(key);
+        if (auto it = searchMap.find(trimmedKey); it != searchMap.end()) {
+            return prefix + it->second + suffix;
+        }
+        else {
+            return "";
+        }
+    }
+
+    enum class DumpStrStat {
+        DEFAULT = 0,
+        SPLITTABLE_ORIG = 1,
+        SPLITTED = 2
+    };
+
+    enum class SplitTagsTranslationStat {
+        NO_TRANS,
+        PART_TRANS,
+        FULL_TRANS,
+        NO_SPLIT,
+        NO_SPLIT_AND_EMPTY
+    };
+
     void LoadJsonDataToMap(const std::filesystem::path& filePath, std::unordered_map<std::string, std::string>& dict,
-                           const bool insertToTranslated = false, const bool needClearDict = true) {
+                           const bool insertToTranslated = false, const bool needClearDict = true,
+                           const bool needCheckSplitPrefix = false) {
         if (!exists(filePath)) return;
         try {
             if (needClearDict) {
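The `trim`/`findInMapIgnoreSpace` pair added above is what makes split lookups whitespace-tolerant: the lookup matches on the trimmed key, then reattaches the segment's original leading and trailing whitespace around the translated value. A small self-contained illustration follows; the two helpers are copied from the hunk above and the map contents are hypothetical.

```cpp
// Illustration of the whitespace-preserving lookup; map contents are hypothetical.
#include <algorithm>
#include <cctype>
#include <iostream>
#include <ranges>
#include <string>
#include <unordered_map>

std::string trim(const std::string& str) {
    auto is_not_space = [](char ch) { return !std::isspace(ch); };
    auto start = std::ranges::find_if(str, is_not_space);
    auto end = std::ranges::find_if(str | std::views::reverse, is_not_space).base();
    if (start < end) {
        return {start, end};
    }
    return "";
}

std::string findInMapIgnoreSpace(const std::string& key, const std::unordered_map<std::string, std::string>& searchMap) {
    auto is_space = [](char ch) { return std::isspace(ch); };
    auto front = std::ranges::find_if_not(key, is_space);
    auto back = std::ranges::find_if_not(key | std::views::reverse, is_space).base();
    std::string prefix(key.begin(), front);   // leading whitespace of the segment
    std::string suffix(back, key.end());      // trailing whitespace of the segment
    if (auto it = searchMap.find(trim(key)); it != searchMap.end()) {
        return prefix + it->second + suffix;  // translation keeps the surrounding spaces
    }
    return "";
}

int main() {
    const std::unordered_map<std::string, std::string> splitTable = {{"Vocal", "VOCAL-translated"}};
    std::cout << '[' << findInMapIgnoreSpace(" Vocal ", splitTable) << "]\n"; // [ VOCAL-translated ]
    std::cout << '[' << findInMapIgnoreSpace("Dance", splitTable)   << "]\n"; // [] -> treated as untranslated
}
```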
@@ -44,8 +94,16 @@ namespace GakumasLocal::Local {
             for (auto& i : fileData.items()) {
                 const auto& key = i.key();
                 const std::string value = i.value();
-                if (insertToTranslated) translatedText.emplace(value);
-                dict[key] = value;
+                if (needCheckSplitPrefix && key.starts_with(splitTextPrefix) && value.starts_with(splitTextPrefix)) {
+                    static const auto splitTextPrefixLength = splitTextPrefix.size();
+                    const auto splitValue = value.substr(splitTextPrefixLength);
+                    genericSplitText[key.substr(splitTextPrefixLength)] = splitValue;
+                    if (insertToTranslated) translatedText.emplace(splitValue);
+                }
+                else {
+                    dict[key] = value;
+                    if (insertToTranslated) translatedText.emplace(value);
+                }
             }
         }
         catch (std::exception& e) {
@@ -84,7 +142,7 @@ namespace GakumasLocal::Local {
     }
 
     void DumpVectorDataToJson(const std::filesystem::path& dumpBasePath, const std::filesystem::path& fileName,
-                              const std::vector<std::string>& vec) {
+                              const std::vector<std::string>& vec, const std::string& prefix = "") {
         const auto dumpFilePath = dumpBasePath / fileName;
         try {
             if (!is_directory(dumpBasePath)) {
@@ -101,7 +159,12 @@ namespace GakumasLocal::Local {
             dumpLrcFile.close();
             auto fileData = nlohmann::ordered_json::parse(fileContent);
             for (const auto& i : vec) {
-                fileData[i] = i;
+                if (!prefix.empty()) {
+                    fileData[prefix + i] = prefix + i;
+                }
+                else {
+                    fileData[i] = i;
+                }
             }
             const auto newStr = fileData.dump(4, 32, false);
             std::ofstream dumpWriteLrcFile(dumpFilePath, std::ofstream::out);
@@ -199,9 +262,91 @@ namespace GakumasLocal::Local {
         return ret;
     }
 
+    SplitTagsTranslationStat GetSplitTagsTranslationFull(const std::string& origTextIn, std::string* newText, std::vector<std::string>& unTransResultRet) {
+        // static const std::u16string splitFlags = u"0123456789++--%%【】.";
+        static const std::unordered_set<char16_t> splitFlags = {u'0', u'1', u'2', u'3', u'4', u'5',
+                                                                u'6', u'7', u'8', u'9', u'+', u'+',
+                                                                u'-', u'-', u'%', u'%', u'【', u'】',
+                                                                u'.', u':', u':', u'×'};
+
+        const auto origText = Misc::ToUTF16(origTextIn);
+        bool isInTag = false;
+        std::vector<std::string> waitingReplaceTexts{};
+
+        std::u16string currentWaitingReplaceText;
+
+#define checkCurrentWaitingReplaceTextAndClear() \
+        if (!currentWaitingReplaceText.empty()) { \
+            waitingReplaceTexts.push_back(Misc::ToUTF8(currentWaitingReplaceText)); \
+            currentWaitingReplaceText.clear(); }
+
+        for (char16_t currChar : origText) {
+            if (currChar == u'<') {
+                isInTag = true;
+            }
+            if (currChar == u'>') {
+                isInTag = false;
+                checkCurrentWaitingReplaceTextAndClear()
+                continue;
+            }
+            if (isInTag) {
+                checkCurrentWaitingReplaceTextAndClear()
+                continue;
+            }
+
+            if (!splitFlags.contains(currChar)) {
+                currentWaitingReplaceText.push_back(currChar);
+            }
+            else {
+                checkCurrentWaitingReplaceTextAndClear()
+            }
+        }
+        if (waitingReplaceTexts.empty()) {
+            if (currentWaitingReplaceText.empty()) {
+                return SplitTagsTranslationStat::NO_SPLIT_AND_EMPTY;
+            }
+            else {
+                return SplitTagsTranslationStat::NO_SPLIT;
+            }
+        }
+        checkCurrentWaitingReplaceTextAndClear()
+
+        *newText = origTextIn;
+        SplitTagsTranslationStat ret;
+        bool hasTrans = false;
+        bool hasNotTrans = false;
+        if (!waitingReplaceTexts.empty()) {
+            for (const auto& i : waitingReplaceTexts) {
+                const auto searchResult = findInMapIgnoreSpace(i, genericSplitText);
+                if (!searchResult.empty()) {
+                    ReplaceString(newText, i, searchResult);
+                    hasTrans = true;
+                }
+                else {
+                    unTransResultRet.emplace_back(trim(i));
+                    hasNotTrans = true;
+                }
+            }
+            if (hasTrans && hasNotTrans) {
+                ret = SplitTagsTranslationStat::PART_TRANS;
+            }
+            else if (hasTrans && !hasNotTrans) {
+                ret = SplitTagsTranslationStat::FULL_TRANS;
+            }
+            else {
+                ret = SplitTagsTranslationStat::NO_TRANS;
+            }
+        }
+        else {
+            ret = SplitTagsTranslationStat::NO_TRANS;
+        }
+        return ret;
+    }
+
     void LoadData() {
         static auto localizationFile = GetBasePath() / "local-files" / "localization.json";
         static auto genericFile = GetBasePath() / "local-files" / "generic.json";
+        static auto genericSplitFile = GetBasePath() / "local-files" / "generic.split.json";
         static auto genericDir = GetBasePath() / "local-files" / "genericTrans";
 
         if (!std::filesystem::is_regular_file(localizationFile)) {
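How `GetSplitTagsTranslationFull` decides what to look up: rich-text tags (`<...>`) and the characters in `splitFlags` (digits, plus, minus, percent, 【】, '.', ':', ×) act as cut points, and only the plain-text fragments between them are checked against `genericSplitText`. Below is a stripped-down, self-contained version of just that segmentation loop, with a hypothetical input string; it is a sketch, not the project function itself. Fragments that miss the table are collected (trimmed) into `unTransResultRet`, which later drives the PART_TRANS/NO_TRANS result and the dump logic.

```cpp
// Simplified re-expression of the splitting loop: skip <...> tags, cut at
// splitFlags characters, keep the remaining text fragments. Input is hypothetical.
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
    static const std::unordered_set<char16_t> splitFlags = {
        u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9',
        u'+', u'-', u'%', u'【', u'】', u'.', u':', u'×'};

    const std::u16string origText = u"<b>Vocal</b>+Dance:Lv.10";
    std::vector<std::u16string> segments;
    std::u16string current;
    bool isInTag = false;

    auto flush = [&] {
        if (!current.empty()) { segments.push_back(current); current.clear(); }
    };

    for (char16_t c : origText) {
        if (c == u'<') isInTag = true;
        if (c == u'>') { isInTag = false; flush(); continue; }
        if (isInTag)   { flush(); continue; }
        if (!splitFlags.contains(c)) current.push_back(c);
        else flush();
    }
    flush();

    std::cout << segments.size() << " segments\n"; // 3: "Vocal", "Dance", "Lv"
}
```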
@@ -211,13 +356,20 @@ namespace GakumasLocal::Local {
         LoadJsonDataToMap(localizationFile, i18nData, true);
         Log::InfoFmt("%ld localization items loaded.", i18nData.size());
 
-        LoadJsonDataToMap(genericFile, genericText, true);
+        LoadJsonDataToMap(genericFile, genericText, true, true, true);
+        genericSplitText.clear();
+        LoadJsonDataToMap(genericSplitFile, genericSplitText, true, true, true);
         if (std::filesystem::exists(genericDir) || std::filesystem::is_directory(genericDir)) {
             for (const auto& entry : std::filesystem::recursive_directory_iterator(genericDir)) {
                 if (std::filesystem::is_regular_file(entry.path())) {
-                    const auto currFile = entry.path();
+                    const auto& currFile = entry.path();
                     if (to_lower(currFile.extension().string()) == ".json") {
-                        LoadJsonDataToMap(currFile, genericText, true, false);
+                        if (currFile.filename().string().ends_with(".split.json")) { // split text file
+                            LoadJsonDataToMap(currFile, genericSplitText, true, false, true);
+                        }
+                        else {
+                            LoadJsonDataToMap(currFile, genericText, true, false, true);
+                        }
                     }
                 }
             }
@@ -285,29 +437,47 @@ namespace GakumasLocal::Local {
         return false;
     }
 
-    std::string GetDumpGenericFileName() {
-        if (genericDumpFileIndex == 0) return "generic.json";
-        return Log::StringFormat("generic_%d.json", genericDumpFileIndex);
+    std::string GetDumpGenericFileName(DumpStrStat stat = DumpStrStat::DEFAULT) {
+        if (stat == DumpStrStat::SPLITTABLE_ORIG) {
+            if (genericDumpFileIndex == 0) return "generic_orig.json";
+            return Log::StringFormat("generic_orig_%d.json", genericDumpFileIndex);
+        }
+        else {
+            if (genericDumpFileIndex == 0) return "generic.json";
+            return Log::StringFormat("generic_%d.json", genericDumpFileIndex);
+        }
     }
 
     bool inDumpGeneric = false;
-    void DumpGenericText(const std::string& origText) {
+    void DumpGenericText(const std::string& origText, DumpStrStat stat = DumpStrStat::DEFAULT) {
         if (translatedText.contains(origText)) return;
 
-        if (std::find(genericTextDumpData.begin(), genericTextDumpData.end(), origText) != genericTextDumpData.end()) {
+        std::array<std::reference_wrapper<std::vector<std::string>>, 3> targets = {
+            genericTextDumpData,
+            genericOrigTextDumpData,
+            genericSplittedDumpData
+        };
+
+        auto& appendTarget = targets[static_cast<int>(stat)].get();
+
+        if (std::find(appendTarget.begin(), appendTarget.end(), origText) != appendTarget.end()) {
             return;
         }
         if (IsPureStringValue(origText)) return;
 
-        genericTextDumpData.push_back(origText);
+        appendTarget.push_back(origText);
         static auto dumpBasePath = GetBasePath() / "dump-files";
 
        if (inDumpGeneric) return;
        inDumpGeneric = true;
        std::thread([](){
            std::this_thread::sleep_for(std::chrono::seconds(5));
-            DumpVectorDataToJson(dumpBasePath, GetDumpGenericFileName(), genericTextDumpData);
+            DumpVectorDataToJson(dumpBasePath, GetDumpGenericFileName(DumpStrStat::DEFAULT), genericTextDumpData);
+            DumpVectorDataToJson(dumpBasePath, GetDumpGenericFileName(DumpStrStat::SPLITTABLE_ORIG), genericOrigTextDumpData);
+            DumpVectorDataToJson(dumpBasePath, GetDumpGenericFileName(DumpStrStat::SPLITTED), genericSplittedDumpData, splitTextPrefix);
            genericTextDumpData.clear();
+            genericSplittedDumpData.clear();
+            genericOrigTextDumpData.clear();
            inDumpGeneric = false;
        }).detach();
    }
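Dump routing after this hunk, as a sketch of my reading of the diff: `DEFAULT` and `SPLITTED` entries land in `generic.json` (or `generic_N.json` once the index is raised), `SPLITTABLE_ORIG` entries in `generic_orig[_N].json`, and the `SPLITTED` vector is written through `DumpVectorDataToJson` with the `[__split__]` prefix so the dumped keys can be translated and loaded straight back into the split table. The helper name below is hypothetical and `genericDumpFileIndex == 0` is assumed.

```cpp
// Sketch mirroring GetDumpGenericFileName with genericDumpFileIndex == 0.
#include <iostream>
#include <string>

enum class DumpStrStat { DEFAULT = 0, SPLITTABLE_ORIG = 1, SPLITTED = 2 };

std::string DumpFileNameFor(DumpStrStat stat) {            // hypothetical helper
    if (stat == DumpStrStat::SPLITTABLE_ORIG) return "generic_orig.json";
    return "generic.json";
}

int main() {
    std::cout << DumpFileNameFor(DumpStrStat::DEFAULT) << '\n';         // generic.json
    std::cout << DumpFileNameFor(DumpStrStat::SPLITTABLE_ORIG) << '\n'; // generic_orig.json
    // SPLITTED shares generic.json, but its entries are written with the "[__split__]" prefix.
    std::cout << DumpFileNameFor(DumpStrStat::SPLITTED) << '\n';        // generic.json
}
```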
@@ -318,25 +488,50 @@ namespace GakumasLocal::Local {
             return true;
         }
 
+        auto ret = false;
+
         std::vector<std::string> unTransResultRet;
-        if (GetSplitTagsTranslation(origText, newStr, unTransResultRet)) {
-            return true;
+        const auto splitTransStat = GetSplitTagsTranslationFull(origText, newStr, unTransResultRet);
+        switch (splitTransStat) {
+            case SplitTagsTranslationStat::FULL_TRANS: {
+                return true;
+            } break;
+
+            case SplitTagsTranslationStat::NO_SPLIT_AND_EMPTY: {
+                return false;
+            } break;
+
+            case SplitTagsTranslationStat::NO_SPLIT: {
+                ret = false;
+            } break;
+
+            case SplitTagsTranslationStat::NO_TRANS: {
+                ret = false;
+            } break;
+
+            case SplitTagsTranslationStat::PART_TRANS: {
+                ret = true;
+            } break;
         }
 
         if (!Config::dumpText) {
-            return false;
+            return ret;
         }
 
-        if (unTransResultRet.empty()) {
+        if (unTransResultRet.empty() || (splitTransStat == SplitTagsTranslationStat::NO_SPLIT)) {
             DumpGenericText(origText);
         }
         else {
             for (const auto& i : unTransResultRet) {
-                DumpGenericText(i);
+                DumpGenericText(i, DumpStrStat::SPLITTED);
             }
+            // If there is only one untranslated part and it equals the original text, do not dump it to the original-text file
+            //if (unTransResultRet.size() != 1 || unTransResultRet[0] != origText) {
+                DumpGenericText(origText, DumpStrStat::SPLITTABLE_ORIG);
+            //}
         }
 
-        return false;
+        return ret;
     }
 
     std::string ChangeDumpTextIndex(int changeValue) {
@@ -1471,6 +1471,25 @@ public:
         }
     }
 
+    [[nodiscard]] auto ToWString() const -> std::u16string {
+#if WINDOWS_MODE
+        if (IsBadReadPtr(this, sizeof(String))) return {};
+        if (IsBadReadPtr(m_firstChar, m_stringLength)) return {};
+#endif
+        if (!this) return {};
+        try {
+            // using convert_typeX = std::codecvt_utf8<wchar_t>;
+            // std::wstring_convert<convert_typeX> converterX;
+            // return converterX.to_bytes(m_firstChar);
+            return {chars};
+        }
+        catch (std::exception& e) {
+            std::cout << "String Invoke Error\n";
+            GakumasLocal::Log::ErrorFmt("String Invoke Error: %s", e.what());
+            return {};
+        }
+    }
+
     auto operator=(const std::string& newString) const -> String* { return New(newString); }
 
     auto operator==(const std::wstring& newString) const -> bool { return Equals(newString); }