From 3d7b8cd5517b47afbd8d3ff3d04e3218d128a565 Mon Sep 17 00:00:00 2001 From: simakover Date: Mon, 10 Apr 2023 05:54:39 +0500 Subject: [PATCH] fix to Cyrillic letter filtering (#837) --- .../main/java/exh/smartsearch/SmartSearchEngine.kt | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/app/src/main/java/exh/smartsearch/SmartSearchEngine.kt b/app/src/main/java/exh/smartsearch/SmartSearchEngine.kt index 981a0a9b5..a85b09ea2 100644 --- a/app/src/main/java/exh/smartsearch/SmartSearchEngine.kt +++ b/app/src/main/java/exh/smartsearch/SmartSearchEngine.kt @@ -107,8 +107,18 @@ class SmartSearchEngine( cleanedTitle = removeTextInBrackets(preTitle, false) } + // Strip chapter reference RU + cleanedTitle = cleanedTitle.replace(chapterRefCyrillicRegexp, " ").trim() + // Strip non-special characters - cleanedTitle = cleanedTitle.replace(titleRegex, " ") + val cleanedTitleEng = cleanedTitle.replace(titleRegex, " ") + + // Do not strip foreign language letters if cleanedTitle is too short + if (cleanedTitleEng.length <= 5) { + cleanedTitle = cleanedTitle.replace(titleCyrillicRegex, " ") + } else { + cleanedTitle = cleanedTitleEng + } // Strip splitters and consecutive spaces cleanedTitle = cleanedTitle.trim().replace(" - ", " ").replace(consecutiveSpacesRegex, " ").trim() @@ -167,7 +177,9 @@ class SmartSearchEngine( const val MIN_NORMAL_ELIGIBLE_THRESHOLD = 0.4 private val titleRegex = Regex("[^a-zA-Z0-9- ]") + private val titleCyrillicRegex = Regex("[^\\p{L}0-9- ]") private val consecutiveSpacesRegex = Regex(" +") + private val chapterRefCyrillicRegexp = Regex("""((- часть|- глава) \d*)""") } }