Split Generic Text (#1)
* split generic text * Add split table, change split flag to `__split__` * add split flag check * fix target
This commit is contained in:
parent
6ddf4212d4
commit
0e1ad6959b
|
@ -11,6 +11,9 @@
|
|||
#include <thread>
|
||||
#include <regex>
|
||||
#include <ranges>
|
||||
#include <string>
|
||||
#include <cctype>
|
||||
#include <algorithm>
|
||||
#include "BaseDefine.h"
|
||||
|
||||
|
||||
|
@ -18,16 +21,63 @@ namespace GakumasLocal::Local {
|
|||
// Translation tables: source text -> translated text.
std::unordered_map<std::string, std::string> i18nData{};
std::unordered_map<std::string, std::string> i18nDumpData{};
std::unordered_map<std::string, std::string> genericText{};
std::unordered_map<std::string, std::string> genericSplitText{};
// Pending dump entries, flushed to JSON files by a delayed background thread.
std::vector<std::string> genericTextDumpData{};
std::vector<std::string> genericSplittedDumpData{};
std::vector<std::string> genericOrigTextDumpData{};
// Values that already have a translation; used to skip re-dumping them.
std::unordered_set<std::string> translatedText{};
// Suffix index for rotating dump file names (0 => no suffix).
int genericDumpFileIndex = 0;
// Marker prefixing keys/values that belong to the "split" translation table.
const std::string splitTextPrefix = "[__split__]";
|
||||
|
||||
std::filesystem::path GetBasePath() {
|
||||
return Plugin::GetInstance().GetHookInstaller()->localizationFilesDir;
|
||||
}
|
||||
|
||||
// Returns a copy of `str` with leading and trailing whitespace removed.
// An all-whitespace (or empty) input yields "".
std::string trim(const std::string& str) {
    // std::isspace has undefined behavior for negative char values
    // (e.g. UTF-8 continuation bytes); take the argument as unsigned char.
    auto is_not_space = [](unsigned char ch) { return !std::isspace(ch); };
    auto start = std::find_if(str.begin(), str.end(), is_not_space);
    auto end = std::find_if(str.rbegin(), str.rend(), is_not_space).base();

    // start >= end exactly when no non-space character exists.
    if (start < end) {
        return {start, end};
    }
    return "";
}
|
||||
|
||||
// Looks up `key` in `searchMap` ignoring surrounding whitespace.
// On a hit, the key's original leading/trailing whitespace is re-applied
// around the mapped value; on a miss, returns "".
std::string findInMapIgnoreSpace(const std::string& key, const std::unordered_map<std::string, std::string>& searchMap) {
    // unsigned char parameter avoids UB in std::isspace for negative chars.
    auto is_space = [](unsigned char ch) { return std::isspace(ch) != 0; };
    auto front = std::find_if_not(key.begin(), key.end(), is_space);
    auto back = std::find_if_not(key.rbegin(), key.rend(), is_space).base();

    // Reuse the iterators instead of a second whitespace scan via trim();
    // for an all-whitespace key, front is past back, so the core is "".
    std::string trimmedKey = (front < back) ? std::string(front, back) : std::string{};
    if (auto it = searchMap.find(trimmedKey); it != searchMap.end()) {
        std::string prefix(key.begin(), front);
        std::string suffix(back, key.end());
        return prefix + it->second + suffix;
    }
    return "";
}
|
||||
|
||||
// Category of a dumped string; the explicit values are load-bearing —
// DumpGenericText uses them to index its array of dump containers.
enum class DumpStrStat {
    DEFAULT = 0,          // ordinary generic text
    SPLITTABLE_ORIG = 1,  // original text that produced split fragments
    SPLITTED = 2          // an individual split-out fragment
};
|
||||
|
||||
// Outcome of attempting a split-fragment translation of a string.
enum class SplitTagsTranslationStat {
    NO_TRANS,            // fragments found, none translated
    PART_TRANS,          // some fragments translated, some not
    FULL_TRANS,          // every fragment translated
    NO_SPLIT,            // text did not split into fragments
    NO_SPLIT_AND_EMPTY   // no fragments and no collected text at all
};
|
||||
|
||||
void LoadJsonDataToMap(const std::filesystem::path& filePath, std::unordered_map<std::string, std::string>& dict,
|
||||
const bool insertToTranslated = false, const bool needClearDict = true) {
|
||||
const bool insertToTranslated = false, const bool needClearDict = true,
|
||||
const bool needCheckSplitPrefix = false) {
|
||||
if (!exists(filePath)) return;
|
||||
try {
|
||||
if (needClearDict) {
|
||||
|
@ -44,8 +94,16 @@ namespace GakumasLocal::Local {
|
|||
for (auto& i : fileData.items()) {
|
||||
const auto& key = i.key();
|
||||
const std::string value = i.value();
|
||||
if (insertToTranslated) translatedText.emplace(value);
|
||||
dict[key] = value;
|
||||
if (needCheckSplitPrefix && key.starts_with(splitTextPrefix) && value.starts_with(splitTextPrefix)) {
|
||||
static const auto splitTextPrefixLength = splitTextPrefix.size();
|
||||
const auto splitValue = value.substr(splitTextPrefixLength);
|
||||
genericSplitText[key.substr(splitTextPrefixLength)] = splitValue;
|
||||
if (insertToTranslated) translatedText.emplace(splitValue);
|
||||
}
|
||||
else {
|
||||
dict[key] = value;
|
||||
if (insertToTranslated) translatedText.emplace(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (std::exception& e) {
|
||||
|
@ -84,7 +142,7 @@ namespace GakumasLocal::Local {
|
|||
}
|
||||
|
||||
void DumpVectorDataToJson(const std::filesystem::path& dumpBasePath, const std::filesystem::path& fileName,
|
||||
const std::vector<std::string>& vec) {
|
||||
const std::vector<std::string>& vec, const std::string& prefix = "") {
|
||||
const auto dumpFilePath = dumpBasePath / fileName;
|
||||
try {
|
||||
if (!is_directory(dumpBasePath)) {
|
||||
|
@ -101,7 +159,12 @@ namespace GakumasLocal::Local {
|
|||
dumpLrcFile.close();
|
||||
auto fileData = nlohmann::ordered_json::parse(fileContent);
|
||||
for (const auto& i : vec) {
|
||||
fileData[i] = i;
|
||||
if (!prefix.empty()) {
|
||||
fileData[prefix + i] = prefix + i;
|
||||
}
|
||||
else {
|
||||
fileData[i] = i;
|
||||
}
|
||||
}
|
||||
const auto newStr = fileData.dump(4, 32, false);
|
||||
std::ofstream dumpWriteLrcFile(dumpFilePath, std::ofstream::out);
|
||||
|
@ -199,9 +262,91 @@ namespace GakumasLocal::Local {
|
|||
return ret;
|
||||
}
|
||||
|
||||
// Splits `origTextIn` into translatable fragments — delimited by rich-text
// tags (<...>) and by the characters in `splitFlags` — translates each
// fragment via genericSplitText, and writes the substituted text to *newText.
// Untranslated fragments (trimmed) are appended to unTransResultRet.
// Returns a SplitTagsTranslationStat describing how much was translated.
SplitTagsTranslationStat GetSplitTagsTranslationFull(const std::string& origTextIn, std::string* newText, std::vector<std::string>& unTransResultRet) {
    // Delimiter characters (digits plus math/format marks); duplicates from
    // the original initializer removed — a set stores each once anyway.
    static const std::unordered_set<char16_t> splitFlags = {
        u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9',
        u'+', u'-', u'%', u'【', u'】', u'.', u':', u'×'};

    const auto origText = Misc::ToUTF16(origTextIn);
    bool isInTag = false;
    std::vector<std::string> waitingReplaceTexts{};
    std::u16string currentWaitingReplaceText;

    // Flush the fragment collected so far (if any) into waitingReplaceTexts.
    // Replaces the original statement-swallowing function-like macro.
    auto flushCurrentFragment = [&]() {
        if (!currentWaitingReplaceText.empty()) {
            waitingReplaceTexts.push_back(Misc::ToUTF8(currentWaitingReplaceText));
            currentWaitingReplaceText.clear();
        }
    };

    for (char16_t currChar : origText) {
        if (currChar == u'<') {
            isInTag = true;
        }
        if (currChar == u'>') {
            isInTag = false;
            flushCurrentFragment();
            continue;
        }
        if (isInTag) {
            flushCurrentFragment();
            continue;
        }

        if (!splitFlags.contains(currChar)) {
            currentWaitingReplaceText.push_back(currChar);
        }
        else {
            flushCurrentFragment();
        }
    }
    if (waitingReplaceTexts.empty()) {
        // Nothing was split off; report whether any text was collected at all.
        return currentWaitingReplaceText.empty()
                   ? SplitTagsTranslationStat::NO_SPLIT_AND_EMPTY
                   : SplitTagsTranslationStat::NO_SPLIT;
    }
    flushCurrentFragment();

    *newText = origTextIn;
    bool hasTrans = false;
    bool hasNotTrans = false;
    // waitingReplaceTexts is guaranteed non-empty here (checked above), so
    // the original dead "else NO_TRANS" branch for the empty case is removed.
    for (const auto& fragment : waitingReplaceTexts) {
        const auto searchResult = findInMapIgnoreSpace(fragment, genericSplitText);
        if (!searchResult.empty()) {
            ReplaceString(newText, fragment, searchResult);
            hasTrans = true;
        }
        else {
            unTransResultRet.emplace_back(trim(fragment));
            hasNotTrans = true;
        }
    }
    if (hasTrans && hasNotTrans) {
        return SplitTagsTranslationStat::PART_TRANS;
    }
    if (hasTrans) {
        return SplitTagsTranslationStat::FULL_TRANS;
    }
    return SplitTagsTranslationStat::NO_TRANS;
}
|
||||
|
||||
void LoadData() {
|
||||
static auto localizationFile = GetBasePath() / "local-files" / "localization.json";
|
||||
static auto genericFile = GetBasePath() / "local-files" / "generic.json";
|
||||
static auto genericSplitFile = GetBasePath() / "local-files" / "generic.split.json";
|
||||
static auto genericDir = GetBasePath() / "local-files" / "genericTrans";
|
||||
|
||||
if (!std::filesystem::is_regular_file(localizationFile)) {
|
||||
|
@ -211,13 +356,20 @@ namespace GakumasLocal::Local {
|
|||
LoadJsonDataToMap(localizationFile, i18nData, true);
|
||||
Log::InfoFmt("%ld localization items loaded.", i18nData.size());
|
||||
|
||||
LoadJsonDataToMap(genericFile, genericText, true);
|
||||
LoadJsonDataToMap(genericFile, genericText, true, true, true);
|
||||
genericSplitText.clear();
|
||||
LoadJsonDataToMap(genericSplitFile, genericSplitText, true, true, true);
|
||||
if (std::filesystem::exists(genericDir) || std::filesystem::is_directory(genericDir)) {
|
||||
for (const auto& entry : std::filesystem::recursive_directory_iterator(genericDir)) {
|
||||
if (std::filesystem::is_regular_file(entry.path())) {
|
||||
const auto currFile = entry.path();
|
||||
const auto& currFile = entry.path();
|
||||
if (to_lower(currFile.extension().string()) == ".json") {
|
||||
LoadJsonDataToMap(currFile, genericText, true, false);
|
||||
if (currFile.filename().string().ends_with(".split.json")) { // split text file
|
||||
LoadJsonDataToMap(currFile, genericSplitText, true, false, true);
|
||||
}
|
||||
else {
|
||||
LoadJsonDataToMap(currFile, genericText, true, false, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -285,29 +437,47 @@ namespace GakumasLocal::Local {
|
|||
return false;
|
||||
}
|
||||
|
||||
std::string GetDumpGenericFileName() {
|
||||
if (genericDumpFileIndex == 0) return "generic.json";
|
||||
return Log::StringFormat("generic_%d.json", genericDumpFileIndex);
|
||||
std::string GetDumpGenericFileName(DumpStrStat stat = DumpStrStat::DEFAULT) {
|
||||
if (stat == DumpStrStat::SPLITTABLE_ORIG) {
|
||||
if (genericDumpFileIndex == 0) return "generic_orig.json";
|
||||
return Log::StringFormat("generic_orig_%d.json", genericDumpFileIndex);
|
||||
}
|
||||
else {
|
||||
if (genericDumpFileIndex == 0) return "generic.json";
|
||||
return Log::StringFormat("generic_%d.json", genericDumpFileIndex);
|
||||
}
|
||||
}
|
||||
|
||||
bool inDumpGeneric = false;
|
||||
void DumpGenericText(const std::string& origText) {
|
||||
void DumpGenericText(const std::string& origText, DumpStrStat stat = DumpStrStat::DEFAULT) {
|
||||
if (translatedText.contains(origText)) return;
|
||||
|
||||
if (std::find(genericTextDumpData.begin(), genericTextDumpData.end(), origText) != genericTextDumpData.end()) {
|
||||
std::array<std::reference_wrapper<std::vector<std::string>>, 3> targets = {
|
||||
genericTextDumpData,
|
||||
genericOrigTextDumpData,
|
||||
genericSplittedDumpData
|
||||
};
|
||||
|
||||
auto& appendTarget = targets[static_cast<int>(stat)].get();
|
||||
|
||||
if (std::find(appendTarget.begin(), appendTarget.end(), origText) != appendTarget.end()) {
|
||||
return;
|
||||
}
|
||||
if (IsPureStringValue(origText)) return;
|
||||
|
||||
genericTextDumpData.push_back(origText);
|
||||
appendTarget.push_back(origText);
|
||||
static auto dumpBasePath = GetBasePath() / "dump-files";
|
||||
|
||||
if (inDumpGeneric) return;
|
||||
inDumpGeneric = true;
|
||||
std::thread([](){
|
||||
std::this_thread::sleep_for(std::chrono::seconds(5));
|
||||
DumpVectorDataToJson(dumpBasePath, GetDumpGenericFileName(), genericTextDumpData);
|
||||
DumpVectorDataToJson(dumpBasePath, GetDumpGenericFileName(DumpStrStat::DEFAULT), genericTextDumpData);
|
||||
DumpVectorDataToJson(dumpBasePath, GetDumpGenericFileName(DumpStrStat::SPLITTABLE_ORIG), genericOrigTextDumpData);
|
||||
DumpVectorDataToJson(dumpBasePath, GetDumpGenericFileName(DumpStrStat::SPLITTED), genericSplittedDumpData, splitTextPrefix);
|
||||
genericTextDumpData.clear();
|
||||
genericSplittedDumpData.clear();
|
||||
genericOrigTextDumpData.clear();
|
||||
inDumpGeneric = false;
|
||||
}).detach();
|
||||
}
|
||||
|
@ -318,25 +488,50 @@ namespace GakumasLocal::Local {
|
|||
return true;
|
||||
}
|
||||
|
||||
auto ret = false;
|
||||
|
||||
std::vector<std::string> unTransResultRet;
|
||||
if (GetSplitTagsTranslation(origText, newStr, unTransResultRet)) {
|
||||
return true;
|
||||
const auto splitTransStat = GetSplitTagsTranslationFull(origText, newStr, unTransResultRet);
|
||||
switch (splitTransStat) {
|
||||
case SplitTagsTranslationStat::FULL_TRANS: {
|
||||
return true;
|
||||
} break;
|
||||
|
||||
case SplitTagsTranslationStat::NO_SPLIT_AND_EMPTY: {
|
||||
return false;
|
||||
} break;
|
||||
|
||||
case SplitTagsTranslationStat::NO_SPLIT: {
|
||||
ret = false;
|
||||
} break;
|
||||
|
||||
case SplitTagsTranslationStat::NO_TRANS: {
|
||||
ret = false;
|
||||
} break;
|
||||
|
||||
case SplitTagsTranslationStat::PART_TRANS: {
|
||||
ret = true;
|
||||
} break;
|
||||
}
|
||||
|
||||
if (!Config::dumpText) {
|
||||
return false;
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (unTransResultRet.empty()) {
|
||||
if (unTransResultRet.empty() || (splitTransStat == SplitTagsTranslationStat::NO_SPLIT)) {
|
||||
DumpGenericText(origText);
|
||||
}
|
||||
else {
|
||||
for (const auto& i : unTransResultRet) {
|
||||
DumpGenericText(i);
|
||||
DumpGenericText(i, DumpStrStat::SPLITTED);
|
||||
}
|
||||
// 若未翻译部分长度为1,且未翻译文本等于原文本,则不 dump 到原文本文件
|
||||
//if (unTransResultRet.size() != 1 || unTransResultRet[0] != origText) {
|
||||
DumpGenericText(origText, DumpStrStat::SPLITTABLE_ORIG);
|
||||
//}
|
||||
}
|
||||
|
||||
return false;
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string ChangeDumpTextIndex(int changeValue) {
|
||||
|
|
|
@ -1471,6 +1471,25 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
[[nodiscard]] auto ToWString() const -> std::u16string {
|
||||
#if WINDOWS_MODE
|
||||
if (IsBadReadPtr(this, sizeof(String))) return {};
|
||||
if (IsBadReadPtr(m_firstChar, m_stringLength)) return {};
|
||||
#endif
|
||||
if (!this) return {};
|
||||
try {
|
||||
// using convert_typeX = std::codecvt_utf8<wchar_t>;
|
||||
// std::wstring_convert<convert_typeX> converterX;
|
||||
// return converterX.to_bytes(m_firstChar);
|
||||
return {chars};
|
||||
}
|
||||
catch (std::exception& e) {
|
||||
std::cout << "String Invoke Error\n";
|
||||
GakumasLocal::Log::ErrorFmt("String Invoke Error: %s", e.what());
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
// "Assignment" from std::string: allocates a brand-new managed String via
// New() and returns it — it does NOT mutate this instance (unusual for an
// operator=; presumably deliberate for the runtime's immutable strings).
auto operator=(const std::string& newString) const -> String* {
    return New(newString);
}
|
||||
|
||||
// Equality against a std::wstring, delegated to Equals().
auto operator==(const std::wstring& newString) const -> bool {
    return Equals(newString);
}
|
||||
|
|
Loading…
Reference in New Issue